author    Arne Schwabe <arne@rfc2549.org>  2014-04-23 09:56:37 +0200
committer Arne Schwabe <arne@rfc2549.org>  2014-04-23 09:56:37 +0200
commit    e436c963f0976b885a7db04681344779e26dd3b5
tree      240663106f32e02e1c34080656f4ef21a2e1776e /main/openssl/crypto
parent    6a99715a9b072fa249e79c98cd9f03991f0f1219
Update OpenSSL to 1.0.1g and statically link OpenVPN with it
Diffstat (limited to 'main/openssl/crypto')
-rw-r--r--main/openssl/crypto/Android.mk559
-rw-r--r--main/openssl/crypto/LPdir_win32.c30
-rw-r--r--main/openssl/crypto/aes/aes.h5
-rw-r--r--main/openssl/crypto/aes/aes_core.c12
-rw-r--r--main/openssl/crypto/aes/aes_misc.c21
-rw-r--r--main/openssl/crypto/aes/asm/aes-586.S3239
-rw-r--r--[-rwxr-xr-x]main/openssl/crypto/aes/asm/aes-586.pl20
-rw-r--r--main/openssl/crypto/aes/asm/aes-armv4.pl182
-rw-r--r--main/openssl/crypto/aes/asm/aes-armv4.s177
-rw-r--r--main/openssl/crypto/aes/asm/aes-mips.S1337
-rw-r--r--main/openssl/crypto/aes/asm/aes-mips.pl1611
-rw-r--r--main/openssl/crypto/aes/asm/aes-parisc.pl1022
-rw-r--r--main/openssl/crypto/aes/asm/aes-ppc.pl444
-rw-r--r--main/openssl/crypto/aes/asm/aes-s390x.pl1054
-rwxr-xr-xmain/openssl/crypto/aes/asm/aes-sparcv9.pl3
-rw-r--r--main/openssl/crypto/aes/asm/aes-x86_64.S2541
-rwxr-xr-xmain/openssl/crypto/aes/asm/aes-x86_64.pl48
-rw-r--r--main/openssl/crypto/aes/asm/aesni-sha1-x86_64.S1396
-rw-r--r--main/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl1250
-rw-r--r--main/openssl/crypto/aes/asm/aesni-x86.S2143
-rw-r--r--main/openssl/crypto/aes/asm/aesni-x86.pl2189
-rw-r--r--main/openssl/crypto/aes/asm/aesni-x86_64.S2535
-rw-r--r--main/openssl/crypto/aes/asm/aesni-x86_64.pl3069
-rw-r--r--main/openssl/crypto/aes/asm/bsaes-x86_64.S2498
-rw-r--r--main/openssl/crypto/aes/asm/bsaes-x86_64.pl3108
-rw-r--r--main/openssl/crypto/aes/asm/vpaes-x86.S661
-rw-r--r--main/openssl/crypto/aes/asm/vpaes-x86.pl903
-rw-r--r--main/openssl/crypto/aes/asm/vpaes-x86_64.S828
-rw-r--r--main/openssl/crypto/aes/asm/vpaes-x86_64.pl1207
-rw-r--r--main/openssl/crypto/arm_arch.h51
-rw-r--r--main/openssl/crypto/armv4cpuid.S154
-rw-r--r--main/openssl/crypto/asn1/a_d2i_fp.c54
-rw-r--r--main/openssl/crypto/asn1/a_digest.c6
-rw-r--r--main/openssl/crypto/asn1/a_int.c6
-rw-r--r--main/openssl/crypto/asn1/a_sign.c111
-rw-r--r--main/openssl/crypto/asn1/a_strex.c1
-rw-r--r--main/openssl/crypto/asn1/a_verify.c83
-rw-r--r--main/openssl/crypto/asn1/ameth_lib.c12
-rw-r--r--main/openssl/crypto/asn1/asn1.h8
-rw-r--r--main/openssl/crypto/asn1/asn1_err.c5
-rw-r--r--main/openssl/crypto/asn1/asn1_locl.h11
-rw-r--r--main/openssl/crypto/asn1/asn_mime.c23
-rw-r--r--main/openssl/crypto/asn1/n_pkey.c38
-rw-r--r--main/openssl/crypto/asn1/p5_pbev2.c143
-rw-r--r--main/openssl/crypto/asn1/t_crl.c3
-rw-r--r--main/openssl/crypto/asn1/t_x509.c55
-rw-r--r--main/openssl/crypto/asn1/tasn_prn.c12
-rw-r--r--main/openssl/crypto/asn1/x_algor.c14
-rw-r--r--main/openssl/crypto/asn1/x_name.c3
-rw-r--r--main/openssl/crypto/asn1/x_pubkey.c14
-rw-r--r--main/openssl/crypto/bf/asm/bf-586.S896
-rw-r--r--main/openssl/crypto/bf/bf_skey.c8
-rw-r--r--main/openssl/crypto/bf/blowfish.h4
-rw-r--r--main/openssl/crypto/bio/b_sock.c13
-rw-r--r--main/openssl/crypto/bio/bf_buff.c15
-rw-r--r--main/openssl/crypto/bio/bio.h79
-rw-r--r--main/openssl/crypto/bio/bio_err.c3
-rw-r--r--main/openssl/crypto/bio/bio_lib.c28
-rw-r--r--main/openssl/crypto/bio/bss_bio.c18
-rw-r--r--main/openssl/crypto/bio/bss_dgram.c1073
-rw-r--r--main/openssl/crypto/bn/asm/armv4-gf2m.S213
-rw-r--r--main/openssl/crypto/bn/asm/armv4-gf2m.pl278
-rw-r--r--main/openssl/crypto/bn/asm/armv4-mont.pl23
-rw-r--r--main/openssl/crypto/bn/asm/armv4-mont.s20
-rw-r--r--main/openssl/crypto/bn/asm/bn-586.S1384
-rw-r--r--main/openssl/crypto/bn/asm/bn-mips.S2175
-rw-r--r--main/openssl/crypto/bn/asm/co-586.S1254
-rw-r--r--main/openssl/crypto/bn/asm/ia64-mont.pl851
-rw-r--r--main/openssl/crypto/bn/asm/ia64.S2
-rw-r--r--main/openssl/crypto/bn/asm/mips-mont.S284
-rw-r--r--main/openssl/crypto/bn/asm/mips-mont.pl426
-rw-r--r--main/openssl/crypto/bn/asm/mips.pl2583
-rw-r--r--main/openssl/crypto/bn/asm/modexp512-x86_64.S1773
-rw-r--r--main/openssl/crypto/bn/asm/modexp512-x86_64.pl1497
-rw-r--r--main/openssl/crypto/bn/asm/parisc-mont.pl995
-rw-r--r--main/openssl/crypto/bn/asm/ppc-mont.pl107
-rw-r--r--main/openssl/crypto/bn/asm/ppc.pl45
-rw-r--r--main/openssl/crypto/bn/asm/ppc64-mont.pl338
-rw-r--r--main/openssl/crypto/bn/asm/s390x-gf2m.pl221
-rw-r--r--main/openssl/crypto/bn/asm/s390x-mont.pl102
-rw-r--r--main/openssl/crypto/bn/asm/x86-gf2m.S335
-rw-r--r--main/openssl/crypto/bn/asm/x86-gf2m.pl313
-rw-r--r--main/openssl/crypto/bn/asm/x86-mont.S338
-rwxr-xr-xmain/openssl/crypto/bn/asm/x86-mont.pl4
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-gcc.c2
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-gf2m.S291
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-gf2m.pl390
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-mont.S1374
-rwxr-xr-xmain/openssl/crypto/bn/asm/x86_64-mont.pl1489
-rw-r--r--main/openssl/crypto/bn/asm/x86_64-mont5.S784
-rwxr-xr-xmain/openssl/crypto/bn/asm/x86_64-mont5.pl1071
-rw-r--r--main/openssl/crypto/bn/bn.h21
-rw-r--r--main/openssl/crypto/bn/bn_blind.c37
-rw-r--r--main/openssl/crypto/bn/bn_div.c274
-rw-r--r--main/openssl/crypto/bn/bn_err.c2
-rw-r--r--main/openssl/crypto/bn/bn_exp.c240
-rw-r--r--main/openssl/crypto/bn/bn_gcd.c1
-rw-r--r--main/openssl/crypto/bn/bn_gf2m.c114
-rw-r--r--main/openssl/crypto/bn/bn_lcl.h30
-rw-r--r--main/openssl/crypto/bn/bn_lib.c19
-rw-r--r--main/openssl/crypto/bn/bn_mont.c116
-rw-r--r--main/openssl/crypto/bn/bn_nist.c387
-rw-r--r--main/openssl/crypto/bn/bn_print.c19
-rw-r--r--main/openssl/crypto/bn/bn_rand.c70
-rw-r--r--main/openssl/crypto/bn/bn_shift.c27
-rw-r--r--main/openssl/crypto/bn/bn_word.c25
-rw-r--r--main/openssl/crypto/bn/bntest.c8
-rw-r--r--main/openssl/crypto/buffer/buf_str.c119
-rw-r--r--main/openssl/crypto/buffer/buffer.c79
-rw-r--r--main/openssl/crypto/buffer/buffer.h2
-rw-r--r--main/openssl/crypto/cmac/cm_ameth.c97
-rw-r--r--main/openssl/crypto/cmac/cm_pmeth.c224
-rw-r--r--main/openssl/crypto/cmac/cmac.c308
-rw-r--r--main/openssl/crypto/cmac/cmac.h82
-rw-r--r--main/openssl/crypto/cms/cms.h501
-rw-r--r--main/openssl/crypto/cms/cms_asn1.c389
-rw-r--r--main/openssl/crypto/cms/cms_att.c195
-rw-r--r--main/openssl/crypto/cms/cms_cd.c136
-rw-r--r--main/openssl/crypto/cms/cms_dd.c148
-rw-r--r--main/openssl/crypto/cms/cms_enc.c294
-rw-r--r--main/openssl/crypto/cms/cms_env.c876
-rw-r--r--main/openssl/crypto/cms/cms_err.c245
-rw-r--r--main/openssl/crypto/cms/cms_ess.c420
-rw-r--r--main/openssl/crypto/cms/cms_io.c133
-rw-r--r--main/openssl/crypto/cms/cms_lcl.h473
-rw-r--r--main/openssl/crypto/cms/cms_lib.c624
-rw-r--r--main/openssl/crypto/cms/cms_pwri.c454
-rw-r--r--main/openssl/crypto/cms/cms_sd.c985
-rw-r--r--main/openssl/crypto/cms/cms_smime.c850
-rw-r--r--main/openssl/crypto/comp/c_rle.c4
-rw-r--r--main/openssl/crypto/conf/conf_mall.c1
-rw-r--r--main/openssl/crypto/cpt_err.c4
-rw-r--r--main/openssl/crypto/cryptlib.c56
-rw-r--r--main/openssl/crypto/cryptlib.h4
-rw-r--r--main/openssl/crypto/crypto.h40
-rw-r--r--main/openssl/crypto/des/asm/crypt586.S879
-rw-r--r--main/openssl/crypto/des/asm/des-586.S1837
-rw-r--r--main/openssl/crypto/des/des.h3
-rw-r--r--main/openssl/crypto/des/set_key.c8
-rw-r--r--main/openssl/crypto/des/str2key.c2
-rw-r--r--main/openssl/crypto/dh/dh.h20
-rw-r--r--main/openssl/crypto/dh/dh_ameth.c1
-rw-r--r--main/openssl/crypto/dh/dh_err.c7
-rw-r--r--main/openssl/crypto/dh/dh_gen.c17
-rw-r--r--main/openssl/crypto/dh/dh_key.c33
-rw-r--r--main/openssl/crypto/dh/dh_lib.c15
-rw-r--r--main/openssl/crypto/dsa/dsa.h30
-rw-r--r--main/openssl/crypto/dsa/dsa_ameth.c47
-rw-r--r--main/openssl/crypto/dsa/dsa_asn1.c40
-rw-r--r--main/openssl/crypto/dsa/dsa_err.c8
-rw-r--r--main/openssl/crypto/dsa/dsa_gen.c35
-rw-r--r--main/openssl/crypto/dsa/dsa_key.c16
-rw-r--r--main/openssl/crypto/dsa/dsa_lib.c22
-rw-r--r--main/openssl/crypto/dsa/dsa_locl.h1
-rw-r--r--main/openssl/crypto/dsa/dsa_ossl.c44
-rw-r--r--main/openssl/crypto/dsa/dsa_pmeth.c6
-rw-r--r--main/openssl/crypto/dsa/dsa_sign.c57
-rw-r--r--main/openssl/crypto/dsa/dsa_vrf.c29
-rw-r--r--main/openssl/crypto/dso/dso_dlfcn.c3
-rw-r--r--main/openssl/crypto/dso/dso_lib.c8
-rw-r--r--main/openssl/crypto/ec/ec.h108
-rw-r--r--main/openssl/crypto/ec/ec2_mult.c4
-rw-r--r--main/openssl/crypto/ec/ec2_oct.c407
-rw-r--r--main/openssl/crypto/ec/ec2_smpl.c353
-rw-r--r--main/openssl/crypto/ec/ec_ameth.c3
-rw-r--r--main/openssl/crypto/ec/ec_asn1.c30
-rw-r--r--main/openssl/crypto/ec/ec_curve.c197
-rw-r--r--main/openssl/crypto/ec/ec_cvt.c28
-rw-r--r--main/openssl/crypto/ec/ec_err.c20
-rw-r--r--main/openssl/crypto/ec/ec_key.c127
-rw-r--r--main/openssl/crypto/ec/ec_lcl.h56
-rw-r--r--main/openssl/crypto/ec/ec_lib.c90
-rw-r--r--main/openssl/crypto/ec/ec_oct.c199
-rw-r--r--main/openssl/crypto/ec/ec_pmeth.c3
-rw-r--r--main/openssl/crypto/ec/eck_prn.c3
-rw-r--r--main/openssl/crypto/ec/ecp_mont.c13
-rw-r--r--main/openssl/crypto/ec/ecp_nist.c13
-rw-r--r--main/openssl/crypto/ec/ecp_oct.c433
-rw-r--r--main/openssl/crypto/ec/ecp_smpl.c379
-rw-r--r--main/openssl/crypto/ec/ectest.c343
-rw-r--r--main/openssl/crypto/ecdh/ecdh.h2
-rw-r--r--main/openssl/crypto/ecdh/ecdhtest.c6
-rw-r--r--main/openssl/crypto/ecdh/ech_err.c4
-rw-r--r--main/openssl/crypto/ecdh/ech_key.c3
-rw-r--r--main/openssl/crypto/ecdh/ech_lib.c31
-rw-r--r--main/openssl/crypto/ecdh/ech_locl.h8
-rw-r--r--main/openssl/crypto/ecdh/ech_ossl.c2
-rw-r--r--main/openssl/crypto/ecdsa/ecdsa.h3
-rw-r--r--main/openssl/crypto/ecdsa/ecdsatest.c89
-rw-r--r--main/openssl/crypto/ecdsa/ecs_err.c5
-rw-r--r--main/openssl/crypto/ecdsa/ecs_lib.c32
-rw-r--r--main/openssl/crypto/ecdsa/ecs_locl.h13
-rw-r--r--main/openssl/crypto/ecdsa/ecs_ossl.c43
-rw-r--r--main/openssl/crypto/ecdsa/ecs_sign.c10
-rw-r--r--main/openssl/crypto/engine/eng_all.c9
-rw-r--r--main/openssl/crypto/engine/eng_cryptodev.c71
-rw-r--r--main/openssl/crypto/engine/eng_dyn.c5
-rw-r--r--main/openssl/crypto/engine/eng_fat.c3
-rw-r--r--main/openssl/crypto/engine/engine.h9
-rw-r--r--main/openssl/crypto/engine/tb_asnmth.c246
-rw-r--r--main/openssl/crypto/engine/tb_pkmeth.c167
-rw-r--r--main/openssl/crypto/err/err.c13
-rw-r--r--main/openssl/crypto/err/err.h3
-rw-r--r--main/openssl/crypto/err/err_all.c12
-rw-r--r--main/openssl/crypto/evp/bio_md.c11
-rw-r--r--main/openssl/crypto/evp/bio_ok.c103
-rw-r--r--main/openssl/crypto/evp/c_allc.c18
-rw-r--r--main/openssl/crypto/evp/digest.c34
-rw-r--r--main/openssl/crypto/evp/e_aes.c1280
-rw-r--r--main/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c581
-rw-r--r--main/openssl/crypto/evp/e_des3.c9
-rw-r--r--main/openssl/crypto/evp/e_null.c4
-rw-r--r--main/openssl/crypto/evp/e_rc2.c3
-rw-r--r--main/openssl/crypto/evp/e_rc4.c1
-rw-r--r--main/openssl/crypto/evp/e_rc4_hmac_md5.c298
-rw-r--r--main/openssl/crypto/evp/evp.h102
-rw-r--r--main/openssl/crypto/evp/evp_cnf.c125
-rw-r--r--main/openssl/crypto/evp/evp_enc.c95
-rw-r--r--main/openssl/crypto/evp/evp_err.c26
-rw-r--r--main/openssl/crypto/evp/evp_key.c27
-rw-r--r--main/openssl/crypto/evp/evp_lib.c4
-rw-r--r--main/openssl/crypto/evp/evp_locl.h40
-rw-r--r--main/openssl/crypto/evp/evp_pbe.c5
-rw-r--r--main/openssl/crypto/evp/evptests.txt13
-rw-r--r--main/openssl/crypto/evp/m_dss.c4
-rw-r--r--main/openssl/crypto/evp/m_dss1.c5
-rw-r--r--main/openssl/crypto/evp/m_ecdsa.c3
-rw-r--r--main/openssl/crypto/evp/m_md4.c2
-rw-r--r--main/openssl/crypto/evp/m_md5.c1
-rw-r--r--main/openssl/crypto/evp/m_mdc2.c2
-rw-r--r--main/openssl/crypto/evp/m_ripemd.c1
-rw-r--r--main/openssl/crypto/evp/m_sha1.c7
-rw-r--r--main/openssl/crypto/evp/m_wp.c1
-rw-r--r--main/openssl/crypto/evp/names.c5
-rw-r--r--main/openssl/crypto/evp/p5_crpt.c33
-rw-r--r--main/openssl/crypto/evp/p5_crpt2.c121
-rw-r--r--main/openssl/crypto/evp/p_lib.c6
-rw-r--r--main/openssl/crypto/evp/p_open.c3
-rw-r--r--main/openssl/crypto/evp/p_seal.c3
-rw-r--r--main/openssl/crypto/evp/p_sign.c10
-rw-r--r--main/openssl/crypto/evp/p_verify.c10
-rw-r--r--main/openssl/crypto/evp/pmeth_gn.c5
-rw-r--r--main/openssl/crypto/evp/pmeth_lib.c55
-rw-r--r--main/openssl/crypto/hmac/hm_ameth.c2
-rw-r--r--main/openssl/crypto/hmac/hm_pmeth.c14
-rw-r--r--main/openssl/crypto/hmac/hmac.c37
-rw-r--r--main/openssl/crypto/ia64cpuid.S2
-rw-r--r--main/openssl/crypto/md4/md4.h3
-rw-r--r--main/openssl/crypto/md4/md4_dgst.c36
-rw-r--r--main/openssl/crypto/md4/md4_locl.h8
-rw-r--r--main/openssl/crypto/md5/asm/md5-586.S679
-rw-r--r--main/openssl/crypto/md5/asm/md5-x86_64.S668
-rwxr-xr-xmain/openssl/crypto/md5/asm/md5-x86_64.pl3
-rw-r--r--main/openssl/crypto/md5/md5.h3
-rw-r--r--main/openssl/crypto/md5/md5_dgst.c3
-rw-r--r--main/openssl/crypto/md5/md5_locl.h8
-rw-r--r--main/openssl/crypto/mdc2/mdc2.h3
-rw-r--r--main/openssl/crypto/mdc2/mdc2dgst.c3
-rw-r--r--main/openssl/crypto/mem.c8
-rw-r--r--main/openssl/crypto/modes/asm/ghash-alpha.pl460
-rw-r--r--main/openssl/crypto/modes/asm/ghash-armv4.S408
-rw-r--r--main/openssl/crypto/modes/asm/ghash-armv4.pl429
-rwxr-xr-xmain/openssl/crypto/modes/asm/ghash-ia64.pl463
-rw-r--r--main/openssl/crypto/modes/asm/ghash-parisc.pl731
-rw-r--r--main/openssl/crypto/modes/asm/ghash-s390x.pl262
-rw-r--r--main/openssl/crypto/modes/asm/ghash-sparcv9.pl330
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86.S728
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86.pl1342
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86_64.S1026
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86_64.pl806
-rw-r--r--main/openssl/crypto/modes/cbc128.c35
-rw-r--r--main/openssl/crypto/modes/ccm128.c441
-rw-r--r--main/openssl/crypto/modes/cfb128.c11
-rw-r--r--main/openssl/crypto/modes/ctr128.c92
-rw-r--r--main/openssl/crypto/modes/gcm128.c1817
-rw-r--r--main/openssl/crypto/modes/modes_lcl.h128
-rw-r--r--main/openssl/crypto/modes/ofb128.c11
-rw-r--r--main/openssl/crypto/modes/xts128.c187
-rw-r--r--main/openssl/crypto/o_init.c82
-rw-r--r--main/openssl/crypto/objects/o_names.c2
-rw-r--r--main/openssl/crypto/objects/obj_dat.h136
-rw-r--r--main/openssl/crypto/objects/obj_mac.h142
-rw-r--r--main/openssl/crypto/objects/obj_mac.num27
-rw-r--r--main/openssl/crypto/objects/obj_xref.c9
-rw-r--r--main/openssl/crypto/objects/obj_xref.h2
-rw-r--r--main/openssl/crypto/objects/obj_xref.txt4
-rw-r--r--main/openssl/crypto/objects/objects.txt41
-rw-r--r--main/openssl/crypto/ocsp/ocsp_lib.c3
-rw-r--r--main/openssl/crypto/ocsp/ocsp_vfy.c10
-rw-r--r--main/openssl/crypto/opensslconf-32.h316
-rw-r--r--main/openssl/crypto/opensslconf-64.h316
-rw-r--r--main/openssl/crypto/opensslconf-static-32.h316
-rw-r--r--main/openssl/crypto/opensslconf-static-64.h316
-rw-r--r--main/openssl/crypto/opensslconf-static-trusty.h448
-rw-r--r--main/openssl/crypto/opensslconf-static.h6
-rw-r--r--main/openssl/crypto/opensslconf-trusty.h448
-rw-r--r--main/openssl/crypto/opensslconf.h256
-rw-r--r--main/openssl/crypto/opensslv.h6
-rw-r--r--main/openssl/crypto/ossl_typ.h2
-rw-r--r--main/openssl/crypto/pariscid.pl225
-rw-r--r--main/openssl/crypto/pem/pem_all.c161
-rw-r--r--main/openssl/crypto/pem/pem_info.c1
-rw-r--r--main/openssl/crypto/pem/pem_lib.c27
-rw-r--r--main/openssl/crypto/pem/pem_seal.c6
-rw-r--r--main/openssl/crypto/pem/pvkfmt.c58
-rw-r--r--main/openssl/crypto/perlasm/cbc.pl2
-rwxr-xr-xmain/openssl/crypto/perlasm/ppc-xlate.pl13
-rwxr-xr-xmain/openssl/crypto/perlasm/x86_64-xlate.pl221
-rw-r--r--main/openssl/crypto/perlasm/x86asm.pl55
-rw-r--r--main/openssl/crypto/perlasm/x86gas.pl34
-rw-r--r--main/openssl/crypto/perlasm/x86masm.pl20
-rw-r--r--main/openssl/crypto/perlasm/x86nasm.pl15
-rw-r--r--main/openssl/crypto/pkcs12/p12_crt.c7
-rw-r--r--main/openssl/crypto/pkcs12/p12_decr.c9
-rw-r--r--main/openssl/crypto/pkcs12/p12_key.c40
-rw-r--r--main/openssl/crypto/pkcs12/p12_kiss.c2
-rw-r--r--main/openssl/crypto/pkcs12/p12_mutl.c12
-rw-r--r--main/openssl/crypto/pkcs7/pk7_doit.c101
-rw-r--r--main/openssl/crypto/pkcs7/pk7_smime.c25
-rwxr-xr-xmain/openssl/crypto/ppccpuid.pl48
-rw-r--r--main/openssl/crypto/rand/md_rand.c55
-rw-r--r--main/openssl/crypto/rand/rand.h10
-rw-r--r--main/openssl/crypto/rand/rand_err.c7
-rw-r--r--main/openssl/crypto/rand/rand_lib.c130
-rw-r--r--main/openssl/crypto/rand/rand_unix.c108
-rw-r--r--main/openssl/crypto/rand/randfile.c4
-rw-r--r--main/openssl/crypto/rc2/rc2.h4
-rw-r--r--main/openssl/crypto/rc2/rc2_skey.c8
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-586.pl162
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-md5-x86_64.S1259
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-md5-x86_64.pl632
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-parisc.pl314
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-s390x.pl47
-rw-r--r--main/openssl/crypto/rc4/asm/rc4-x86_64.S615
-rw-r--r--[-rwxr-xr-x]main/openssl/crypto/rc4/asm/rc4-x86_64.pl295
-rw-r--r--main/openssl/crypto/rc4/rc4.h1
-rw-r--r--main/openssl/crypto/rc4/rc4_skey.c36
-rw-r--r--main/openssl/crypto/rc4/rc4_utl.c62
-rw-r--r--main/openssl/crypto/rc4/rc4test.c6
-rw-r--r--main/openssl/crypto/ripemd/ripemd.h3
-rw-r--r--main/openssl/crypto/ripemd/rmd_dgst.c33
-rw-r--r--main/openssl/crypto/ripemd/rmd_locl.h10
-rw-r--r--main/openssl/crypto/rsa/rsa.h81
-rw-r--r--main/openssl/crypto/rsa/rsa_ameth.c351
-rw-r--r--main/openssl/crypto/rsa/rsa_asn1.c10
-rw-r--r--main/openssl/crypto/rsa/rsa_chk.c6
-rw-r--r--main/openssl/crypto/rsa/rsa_crpt.c257
-rw-r--r--main/openssl/crypto/rsa/rsa_eay.c86
-rw-r--r--main/openssl/crypto/rsa/rsa_err.c21
-rw-r--r--main/openssl/crypto/rsa/rsa_gen.c15
-rw-r--r--main/openssl/crypto/rsa/rsa_lib.c172
-rw-r--r--main/openssl/crypto/rsa/rsa_oaep.c8
-rw-r--r--main/openssl/crypto/rsa/rsa_pmeth.c156
-rw-r--r--main/openssl/crypto/rsa/rsa_pss.c81
-rw-r--r--main/openssl/crypto/rsa/rsa_sign.c33
-rw-r--r--main/openssl/crypto/s390xcap.c12
-rw-r--r--main/openssl/crypto/s390xcpuid.S17
-rw-r--r--main/openssl/crypto/sha/asm/sha1-586.S1380
-rw-r--r--main/openssl/crypto/sha/asm/sha1-586.pl1107
-rw-r--r--main/openssl/crypto/sha/asm/sha1-alpha.pl322
-rw-r--r--main/openssl/crypto/sha/asm/sha1-armv4-large.pl41
-rw-r--r--main/openssl/crypto/sha/asm/sha1-armv4-large.s209
-rw-r--r--main/openssl/crypto/sha/asm/sha1-ia64.pl193
-rw-r--r--main/openssl/crypto/sha/asm/sha1-mips.S1664
-rw-r--r--main/openssl/crypto/sha/asm/sha1-mips.pl354
-rw-r--r--main/openssl/crypto/sha/asm/sha1-parisc.pl260
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha1-ppc.pl83
-rw-r--r--main/openssl/crypto/sha/asm/sha1-s390x.pl50
-rw-r--r--main/openssl/crypto/sha/asm/sha1-sparcv9a.pl2
-rw-r--r--main/openssl/crypto/sha/asm/sha1-x86_64.S2486
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha1-x86_64.pl1188
-rw-r--r--main/openssl/crypto/sha/asm/sha256-586.S258
-rw-r--r--main/openssl/crypto/sha/asm/sha256-586.pl54
-rw-r--r--main/openssl/crypto/sha/asm/sha256-armv4.pl55
-rw-r--r--main/openssl/crypto/sha/asm/sha256-armv4.s858
-rw-r--r--main/openssl/crypto/sha/asm/sha256-mips.S1998
-rw-r--r--main/openssl/crypto/sha/asm/sha256-x86_64.S1778
-rw-r--r--main/openssl/crypto/sha/asm/sha512-586.S563
-rw-r--r--main/openssl/crypto/sha/asm/sha512-586.pl18
-rw-r--r--main/openssl/crypto/sha/asm/sha512-armv4.pl357
-rw-r--r--main/openssl/crypto/sha/asm/sha512-armv4.s1635
-rw-r--r--main/openssl/crypto/sha/asm/sha512-mips.pl455
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha512-parisc.pl793
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha512-ppc.pl114
-rw-r--r--main/openssl/crypto/sha/asm/sha512-s390x.pl63
-rw-r--r--main/openssl/crypto/sha/asm/sha512-sparcv9.pl6
-rw-r--r--main/openssl/crypto/sha/asm/sha512-x86_64.S1802
-rwxr-xr-xmain/openssl/crypto/sha/asm/sha512-x86_64.pl89
-rw-r--r--main/openssl/crypto/sha/sha.h14
-rw-r--r--main/openssl/crypto/sha/sha1_one.c2
-rw-r--r--main/openssl/crypto/sha/sha1dgst.c1
-rw-r--r--main/openssl/crypto/sha/sha256.c10
-rw-r--r--main/openssl/crypto/sha/sha512.c63
-rw-r--r--main/openssl/crypto/sha/sha_dgst.c1
-rw-r--r--main/openssl/crypto/sha/sha_locl.h46
-rw-r--r--main/openssl/crypto/sparccpuid.S6
-rw-r--r--main/openssl/crypto/sparcv9cap.c4
-rw-r--r--main/openssl/crypto/srp/srp.h172
-rw-r--r--main/openssl/crypto/srp/srp_grps.h517
-rw-r--r--main/openssl/crypto/srp/srp_lcl.h83
-rw-r--r--main/openssl/crypto/srp/srp_lib.c361
-rw-r--r--main/openssl/crypto/srp/srp_vfy.c658
-rw-r--r--main/openssl/crypto/stack/safestack.h138
-rw-r--r--main/openssl/crypto/symhacks.h32
-rw-r--r--main/openssl/crypto/ui/ui.h2
-rw-r--r--main/openssl/crypto/ui/ui_openssl.c6
-rw-r--r--main/openssl/crypto/x509/x509.h11
-rw-r--r--main/openssl/crypto/x509/x509_cmp.c34
-rw-r--r--main/openssl/crypto/x509/x509_lu.c2
-rw-r--r--main/openssl/crypto/x509/x509_vfy.c22
-rw-r--r--main/openssl/crypto/x509/x509type.c32
-rw-r--r--main/openssl/crypto/x509/x_all.c21
-rw-r--r--main/openssl/crypto/x509v3/v3_addr.c125
-rw-r--r--main/openssl/crypto/x509v3/v3_asid.c63
-rw-r--r--main/openssl/crypto/x509v3/v3_pci.c2
-rw-r--r--main/openssl/crypto/x509v3/v3_pcia.c2
-rw-r--r--main/openssl/crypto/x509v3/v3_purp.c4
-rw-r--r--main/openssl/crypto/x509v3/v3_skey.c3
-rw-r--r--main/openssl/crypto/x509v3/v3_utl.c2
-rw-r--r--main/openssl/crypto/x86_64cpuid.S234
-rw-r--r--main/openssl/crypto/x86_64cpuid.pl88
-rw-r--r--main/openssl/crypto/x86cpuid.S334
-rw-r--r--main/openssl/crypto/x86cpuid.pl84
423 files changed, 115106 insertions, 5525 deletions
diff --git a/main/openssl/crypto/Android.mk b/main/openssl/crypto/Android.mk
deleted file mode 100644
index 70aef35d..00000000
--- a/main/openssl/crypto/Android.mk
+++ /dev/null
@@ -1,559 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-
-arm_cflags := -DOPENSSL_BN_ASM_MONT -DAES_ASM -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
-arm_src_files := \
- aes/asm/aes-armv4.s \
- bn/asm/armv4-mont.s \
- sha/asm/sha1-armv4-large.s \
- sha/asm/sha256-armv4.s \
- sha/asm/sha512-armv4.s
-non_arm_src_files := aes/aes_core.c
-
-local_src_files := \
- cryptlib.c \
- mem.c \
- mem_clr.c \
- mem_dbg.c \
- cversion.c \
- ex_data.c \
- cpt_err.c \
- ebcdic.c \
- uid.c \
- o_time.c \
- o_str.c \
- o_dir.c \
- aes/aes_cbc.c \
- aes/aes_cfb.c \
- aes/aes_ctr.c \
- aes/aes_ecb.c \
- aes/aes_misc.c \
- aes/aes_ofb.c \
- aes/aes_wrap.c \
- asn1/a_bitstr.c \
- asn1/a_bool.c \
- asn1/a_bytes.c \
- asn1/a_d2i_fp.c \
- asn1/a_digest.c \
- asn1/a_dup.c \
- asn1/a_enum.c \
- asn1/a_gentm.c \
- asn1/a_i2d_fp.c \
- asn1/a_int.c \
- asn1/a_mbstr.c \
- asn1/a_object.c \
- asn1/a_octet.c \
- asn1/a_print.c \
- asn1/a_set.c \
- asn1/a_sign.c \
- asn1/a_strex.c \
- asn1/a_strnid.c \
- asn1/a_time.c \
- asn1/a_type.c \
- asn1/a_utctm.c \
- asn1/a_utf8.c \
- asn1/a_verify.c \
- asn1/ameth_lib.c \
- asn1/asn1_err.c \
- asn1/asn1_gen.c \
- asn1/asn1_lib.c \
- asn1/asn1_par.c \
- asn1/asn_mime.c \
- asn1/asn_moid.c \
- asn1/asn_pack.c \
- asn1/bio_asn1.c \
- asn1/bio_ndef.c \
- asn1/d2i_pr.c \
- asn1/d2i_pu.c \
- asn1/evp_asn1.c \
- asn1/f_enum.c \
- asn1/f_int.c \
- asn1/f_string.c \
- asn1/i2d_pr.c \
- asn1/i2d_pu.c \
- asn1/n_pkey.c \
- asn1/nsseq.c \
- asn1/p5_pbe.c \
- asn1/p5_pbev2.c \
- asn1/p8_pkey.c \
- asn1/t_bitst.c \
- asn1/t_crl.c \
- asn1/t_pkey.c \
- asn1/t_req.c \
- asn1/t_spki.c \
- asn1/t_x509.c \
- asn1/t_x509a.c \
- asn1/tasn_dec.c \
- asn1/tasn_enc.c \
- asn1/tasn_fre.c \
- asn1/tasn_new.c \
- asn1/tasn_prn.c \
- asn1/tasn_typ.c \
- asn1/tasn_utl.c \
- asn1/x_algor.c \
- asn1/x_attrib.c \
- asn1/x_bignum.c \
- asn1/x_crl.c \
- asn1/x_exten.c \
- asn1/x_info.c \
- asn1/x_long.c \
- asn1/x_name.c \
- asn1/x_nx509.c \
- asn1/x_pkey.c \
- asn1/x_pubkey.c \
- asn1/x_req.c \
- asn1/x_sig.c \
- asn1/x_spki.c \
- asn1/x_val.c \
- asn1/x_x509.c \
- asn1/x_x509a.c \
- bf/bf_cfb64.c \
- bf/bf_ecb.c \
- bf/bf_enc.c \
- bf/bf_ofb64.c \
- bf/bf_skey.c \
- bio/b_dump.c \
- bio/b_print.c \
- bio/b_sock.c \
- bio/bf_buff.c \
- bio/bf_nbio.c \
- bio/bf_null.c \
- bio/bio_cb.c \
- bio/bio_err.c \
- bio/bio_lib.c \
- bio/bss_acpt.c \
- bio/bss_bio.c \
- bio/bss_conn.c \
- bio/bss_dgram.c \
- bio/bss_fd.c \
- bio/bss_file.c \
- bio/bss_log.c \
- bio/bss_mem.c \
- bio/bss_null.c \
- bio/bss_sock.c \
- bn/bn_add.c \
- bn/bn_asm.c \
- bn/bn_blind.c \
- bn/bn_const.c \
- bn/bn_ctx.c \
- bn/bn_div.c \
- bn/bn_err.c \
- bn/bn_exp.c \
- bn/bn_exp2.c \
- bn/bn_gcd.c \
- bn/bn_gf2m.c \
- bn/bn_kron.c \
- bn/bn_lib.c \
- bn/bn_mod.c \
- bn/bn_mont.c \
- bn/bn_mpi.c \
- bn/bn_mul.c \
- bn/bn_nist.c \
- bn/bn_prime.c \
- bn/bn_print.c \
- bn/bn_rand.c \
- bn/bn_recp.c \
- bn/bn_shift.c \
- bn/bn_sqr.c \
- bn/bn_sqrt.c \
- bn/bn_word.c \
- buffer/buf_err.c \
- buffer/buffer.c \
- comp/c_rle.c \
- comp/c_zlib.c \
- comp/comp_err.c \
- comp/comp_lib.c \
- conf/conf_api.c \
- conf/conf_def.c \
- conf/conf_err.c \
- conf/conf_lib.c \
- conf/conf_mall.c \
- conf/conf_mod.c \
- conf/conf_sap.c \
- des/cbc_cksm.c \
- des/cbc_enc.c \
- des/cfb64ede.c \
- des/cfb64enc.c \
- des/cfb_enc.c \
- des/des_enc.c \
- des/des_old.c \
- des/des_old2.c \
- des/ecb3_enc.c \
- des/ecb_enc.c \
- des/ede_cbcm_enc.c \
- des/enc_read.c \
- des/enc_writ.c \
- des/fcrypt.c \
- des/fcrypt_b.c \
- des/ofb64ede.c \
- des/ofb64enc.c \
- des/ofb_enc.c \
- des/pcbc_enc.c \
- des/qud_cksm.c \
- des/rand_key.c \
- des/read2pwd.c \
- des/rpc_enc.c \
- des/set_key.c \
- des/str2key.c \
- des/xcbc_enc.c \
- dh/dh_ameth.c \
- dh/dh_asn1.c \
- dh/dh_check.c \
- dh/dh_depr.c \
- dh/dh_err.c \
- dh/dh_gen.c \
- dh/dh_key.c \
- dh/dh_lib.c \
- dh/dh_pmeth.c \
- dsa/dsa_ameth.c \
- dsa/dsa_asn1.c \
- dsa/dsa_depr.c \
- dsa/dsa_err.c \
- dsa/dsa_gen.c \
- dsa/dsa_key.c \
- dsa/dsa_lib.c \
- dsa/dsa_ossl.c \
- dsa/dsa_pmeth.c \
- dsa/dsa_prn.c \
- dsa/dsa_sign.c \
- dsa/dsa_vrf.c \
- dso/dso_dl.c \
- dso/dso_dlfcn.c \
- dso/dso_err.c \
- dso/dso_lib.c \
- dso/dso_null.c \
- dso/dso_openssl.c \
- ec/ec2_mult.c \
- ec/ec2_smpl.c \
- ec/ec_ameth.c \
- ec/ec_asn1.c \
- ec/ec_check.c \
- ec/ec_curve.c \
- ec/ec_cvt.c \
- ec/ec_err.c \
- ec/ec_key.c \
- ec/ec_lib.c \
- ec/ec_mult.c \
- ec/ec_pmeth.c \
- ec/ec_print.c \
- ec/eck_prn.c \
- ec/ecp_mont.c \
- ec/ecp_nist.c \
- ec/ecp_smpl.c \
- ecdh/ech_err.c \
- ecdh/ech_key.c \
- ecdh/ech_lib.c \
- ecdh/ech_ossl.c \
- ecdsa/ecs_asn1.c \
- ecdsa/ecs_err.c \
- ecdsa/ecs_lib.c \
- ecdsa/ecs_ossl.c \
- ecdsa/ecs_sign.c \
- ecdsa/ecs_vrf.c \
- err/err.c \
- err/err_all.c \
- err/err_prn.c \
- evp/bio_b64.c \
- evp/bio_enc.c \
- evp/bio_md.c \
- evp/bio_ok.c \
- evp/c_all.c \
- evp/c_allc.c \
- evp/c_alld.c \
- evp/digest.c \
- evp/e_aes.c \
- evp/e_bf.c \
- evp/e_des.c \
- evp/e_des3.c \
- evp/e_null.c \
- evp/e_old.c \
- evp/e_rc2.c \
- evp/e_rc4.c \
- evp/e_rc5.c \
- evp/e_xcbc_d.c \
- evp/encode.c \
- evp/evp_acnf.c \
- evp/evp_enc.c \
- evp/evp_err.c \
- evp/evp_key.c \
- evp/evp_lib.c \
- evp/evp_pbe.c \
- evp/evp_pkey.c \
- evp/m_dss.c \
- evp/m_dss1.c \
- evp/m_ecdsa.c \
- evp/m_md4.c \
- evp/m_md5.c \
- evp/m_mdc2.c \
- evp/m_null.c \
- evp/m_ripemd.c \
- evp/m_sha1.c \
- evp/m_sigver.c \
- evp/m_wp.c \
- evp/names.c \
- evp/p5_crpt.c \
- evp/p5_crpt2.c \
- evp/p_dec.c \
- evp/p_enc.c \
- evp/p_lib.c \
- evp/p_open.c \
- evp/p_seal.c \
- evp/p_sign.c \
- evp/p_verify.c \
- evp/pmeth_fn.c \
- evp/pmeth_gn.c \
- evp/pmeth_lib.c \
- hmac/hm_ameth.c \
- hmac/hm_pmeth.c \
- hmac/hmac.c \
- krb5/krb5_asn.c \
- lhash/lh_stats.c \
- lhash/lhash.c \
- md4/md4_dgst.c \
- md4/md4_one.c \
- md5/md5_dgst.c \
- md5/md5_one.c \
- modes/cbc128.c \
- modes/cfb128.c \
- modes/ctr128.c \
- modes/ofb128.c \
- objects/o_names.c \
- objects/obj_dat.c \
- objects/obj_err.c \
- objects/obj_lib.c \
- objects/obj_xref.c \
- ocsp/ocsp_asn.c \
- ocsp/ocsp_cl.c \
- ocsp/ocsp_err.c \
- ocsp/ocsp_ext.c \
- ocsp/ocsp_ht.c \
- ocsp/ocsp_lib.c \
- ocsp/ocsp_prn.c \
- ocsp/ocsp_srv.c \
- ocsp/ocsp_vfy.c \
- pem/pem_all.c \
- pem/pem_err.c \
- pem/pem_info.c \
- pem/pem_lib.c \
- pem/pem_oth.c \
- pem/pem_pk8.c \
- pem/pem_pkey.c \
- pem/pem_seal.c \
- pem/pem_sign.c \
- pem/pem_x509.c \
- pem/pem_xaux.c \
- pem/pvkfmt.c \
- pkcs12/p12_add.c \
- pkcs12/p12_asn.c \
- pkcs12/p12_attr.c \
- pkcs12/p12_crpt.c \
- pkcs12/p12_crt.c \
- pkcs12/p12_decr.c \
- pkcs12/p12_init.c \
- pkcs12/p12_key.c \
- pkcs12/p12_kiss.c \
- pkcs12/p12_mutl.c \
- pkcs12/p12_npas.c \
- pkcs12/p12_p8d.c \
- pkcs12/p12_p8e.c \
- pkcs12/p12_utl.c \
- pkcs12/pk12err.c \
- pkcs7/pk7_asn1.c \
- pkcs7/pk7_attr.c \
- pkcs7/pk7_doit.c \
- pkcs7/pk7_lib.c \
- pkcs7/pk7_mime.c \
- pkcs7/pk7_smime.c \
- pkcs7/pkcs7err.c \
- rand/md_rand.c \
- rand/rand_egd.c \
- rand/rand_err.c \
- rand/rand_lib.c \
- rand/rand_unix.c \
- rand/randfile.c \
- rc2/rc2_cbc.c \
- rc2/rc2_ecb.c \
- rc2/rc2_skey.c \
- rc2/rc2cfb64.c \
- rc2/rc2ofb64.c \
- rc4/rc4_enc.c \
- rc4/rc4_skey.c \
- ripemd/rmd_dgst.c \
- ripemd/rmd_one.c \
- rsa/rsa_ameth.c \
- rsa/rsa_asn1.c \
- rsa/rsa_chk.c \
- rsa/rsa_eay.c \
- rsa/rsa_err.c \
- rsa/rsa_gen.c \
- rsa/rsa_lib.c \
- rsa/rsa_none.c \
- rsa/rsa_null.c \
- rsa/rsa_oaep.c \
- rsa/rsa_pk1.c \
- rsa/rsa_pmeth.c \
- rsa/rsa_prn.c \
- rsa/rsa_pss.c \
- rsa/rsa_saos.c \
- rsa/rsa_sign.c \
- rsa/rsa_ssl.c \
- rsa/rsa_x931.c \
- sha/sha1_one.c \
- sha/sha1dgst.c \
- sha/sha256.c \
- sha/sha512.c \
- sha/sha_dgst.c \
- stack/stack.c \
- ts/ts_err.c \
- txt_db/txt_db.c \
- ui/ui_compat.c \
- ui/ui_err.c \
- ui/ui_lib.c \
- ui/ui_openssl.c \
- ui/ui_util.c \
- x509/by_dir.c \
- x509/by_file.c \
- x509/x509_att.c \
- x509/x509_cmp.c \
- x509/x509_d2.c \
- x509/x509_def.c \
- x509/x509_err.c \
- x509/x509_ext.c \
- x509/x509_lu.c \
- x509/x509_obj.c \
- x509/x509_r2x.c \
- x509/x509_req.c \
- x509/x509_set.c \
- x509/x509_trs.c \
- x509/x509_txt.c \
- x509/x509_v3.c \
- x509/x509_vfy.c \
- x509/x509_vpm.c \
- x509/x509cset.c \
- x509/x509name.c \
- x509/x509rset.c \
- x509/x509spki.c \
- x509/x509type.c \
- x509/x_all.c \
- x509v3/pcy_cache.c \
- x509v3/pcy_data.c \
- x509v3/pcy_lib.c \
- x509v3/pcy_map.c \
- x509v3/pcy_node.c \
- x509v3/pcy_tree.c \
- x509v3/v3_akey.c \
- x509v3/v3_akeya.c \
- x509v3/v3_alt.c \
- x509v3/v3_bcons.c \
- x509v3/v3_bitst.c \
- x509v3/v3_conf.c \
- x509v3/v3_cpols.c \
- x509v3/v3_crld.c \
- x509v3/v3_enum.c \
- x509v3/v3_extku.c \
- x509v3/v3_genn.c \
- x509v3/v3_ia5.c \
- x509v3/v3_info.c \
- x509v3/v3_int.c \
- x509v3/v3_lib.c \
- x509v3/v3_ncons.c \
- x509v3/v3_ocsp.c \
- x509v3/v3_pci.c \
- x509v3/v3_pcia.c \
- x509v3/v3_pcons.c \
- x509v3/v3_pku.c \
- x509v3/v3_pmaps.c \
- x509v3/v3_prn.c \
- x509v3/v3_purp.c \
- x509v3/v3_skey.c \
- x509v3/v3_sxnet.c \
- x509v3/v3_utl.c \
- x509v3/v3err.c
-
-local_c_includes := \
- openssl \
- openssl/crypto/asn1 \
- openssl/crypto/evp \
- openssl/include \
- openssl/include/openssl \
- zlib
-
-local_c_flags := -DNO_WINDOWS_BRAINDEATH
-
-#######################################
-# target static library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-
-ifneq ($(TARGET_ARCH),x86)
-LOCAL_NDK_VERSION := 5
-LOCAL_SDK_VERSION := 9
-endif
-
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags)
-LOCAL_C_INCLUDES += $(local_c_includes)
-ifeq ($(TARGET_ARCH),arm)
- LOCAL_SRC_FILES += $(arm_src_files)
- LOCAL_CFLAGS += $(arm_cflags)
-else
- LOCAL_SRC_FILES += $(non_arm_src_files)
-endif
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto_static
-include $(BUILD_STATIC_LIBRARY)
-
-#######################################
-# target shared library
-include $(CLEAR_VARS)
-include $(LOCAL_PATH)/../android-config.mk
-
-#ifneq ($(TARGET_ARCH),x86)
-#LOCAL_NDK_VERSION := 5
-#LOCAL_SDK_VERSION := 9
-## Use the NDK prebuilt libz and libdl.
-LOCAL_LDFLAGS += -lz -ldl
-#else
-#LOCAL_SHARED_LIBRARIES += libz libdl
-#endif
-
-LOCAL_SRC_FILES += $(local_src_files)
-LOCAL_CFLAGS += $(local_c_flags)
-LOCAL_C_INCLUDES += $(local_c_includes)
-ifeq ($(TARGET_ARCH),arm)
- LOCAL_SRC_FILES += $(arm_src_files)
- LOCAL_CFLAGS += $(arm_cflags)
-else
- LOCAL_SRC_FILES += $(non_arm_src_files)
-endif
-LOCAL_MODULE_TAGS := optional
-LOCAL_MODULE:= libcrypto
-include $(BUILD_SHARED_LIBRARY)
-
-#######################################
-# Host shared library
-#include $(CLEAR_VARS)
-##include $(LOCAL_PATH)/../android-config.mk
-#LOCAL_SRC_FILES += $(local_src_files)
-#LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
-#LOCAL_C_INCLUDES += $(local_c_includes)
-#LOCAL_SRC_FILES += $(non_arm_src_files)
-#LOCAL_STATIC_LIBRARIES += libz
-#LOCAL_LDLIBS += -ldl
-#LOCAL_MODULE_TAGS := optional
-#LOCAL_MODULE:= libcrypto
-#include $(BUILD_HOST_SHARED_LIBRARY)
-
-########################################
-# host static library, which is used by some SDK tools.
-
-#include $(CLEAR_VARS)
-#include $(LOCAL_PATH)/../android-config.mk
-#LOCAL_SRC_FILES += $(local_src_files)
-#LOCAL_CFLAGS += $(local_c_flags) -DPURIFY
-#LOCAL_C_INCLUDES += $(local_c_includes)
-#LOCAL_SRC_FILES += $(non_arm_src_files)
-#LOCAL_STATIC_LIBRARIES += libz
-#LOCAL_LDLIBS += -ldl
-#LOCAL_MODULE_TAGS := optional
-#LOCAL_MODULE:= libcrypto_static
-#include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/main/openssl/crypto/LPdir_win32.c b/main/openssl/crypto/LPdir_win32.c
new file mode 100644
index 00000000..e39872da
--- /dev/null
+++ b/main/openssl/crypto/LPdir_win32.c
@@ -0,0 +1,30 @@
+/* $LP: LPlib/source/LPdir_win32.c,v 1.3 2004/08/26 13:36:05 _cvs_levitte Exp $ */
+/*
+ * Copyright (c) 2004, Richard Levitte <richard@levitte.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define LP_SYS_WIN32
+#define LP_MULTIBYTE_AVAILABLE
+#include "LPdir_win.c"
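The new LPdir_win32.c is a thin platform shim: it sets two feature macros and then textually includes the shared implementation in LPdir_win.c, so one directory-reading source file serves several platforms. Below is a minimal standalone sketch of that configure-by-macro-then-include idiom; all names are hypothetical, and the shared body is inlined as a macro-guarded block so the sketch compiles on its own (the real wrapper #includes LPdir_win.c instead).

#include <stdio.h>

/* The "wrapper" part: pick the platform before pulling in shared code. */
#define LP_SYS_WIN32
#define LP_MULTIBYTE_AVAILABLE

/* Stand-in for the shared LPdir_win.c body: it specializes itself on
 * whatever macros the including wrapper defined above. */
#ifdef LP_SYS_WIN32
static const char *lp_platform(void) { return "win32"; }
#else
static const char *lp_platform(void) { return "generic"; }
#endif

int main(void)
{
    printf("directory code compiled for: %s\n", lp_platform());
#ifdef LP_MULTIBYTE_AVAILABLE
    printf("multibyte filename support: on\n");
#endif
    return 0;
}

Keeping the platform knobs next to the file name means adding a platform is a matter of adding one small wrapper file rather than touching the shared directory-walking code.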
diff --git a/main/openssl/crypto/aes/aes.h b/main/openssl/crypto/aes/aes.h
index d2c99730..031abf01 100644
--- a/main/openssl/crypto/aes/aes.h
+++ b/main/openssl/crypto/aes/aes.h
@@ -90,6 +90,11 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key);
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key);
+
void AES_encrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key);
void AES_decrypt(const unsigned char *in, unsigned char *out,
diff --git a/main/openssl/crypto/aes/aes_core.c b/main/openssl/crypto/aes/aes_core.c
index a7ec54f4..8f5210ac 100644
--- a/main/openssl/crypto/aes/aes_core.c
+++ b/main/openssl/crypto/aes/aes_core.c
@@ -625,7 +625,7 @@ static const u32 rcon[] = {
/**
* Expand the cipher key into the encryption key schedule.
*/
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -726,7 +726,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
/**
* Expand the cipher key into the decryption key schedule.
*/
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -734,7 +734,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
u32 temp;
/* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
+ status = private_AES_set_encrypt_key(userKey, bits, key);
if (status < 0)
return status;
@@ -1201,7 +1201,7 @@ static const u32 rcon[] = {
/**
* Expand the cipher key into the encryption key schedule.
*/
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
int i = 0;
@@ -1301,7 +1301,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
/**
* Expand the cipher key into the decryption key schedule.
*/
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
AES_KEY *key) {
u32 *rk;
@@ -1309,7 +1309,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
u32 temp;
/* first, start with an encryption schedule */
- status = AES_set_encrypt_key(userKey, bits, key);
+ status = private_AES_set_encrypt_key(userKey, bits, key);
if (status < 0)
return status;
diff --git a/main/openssl/crypto/aes/aes_misc.c b/main/openssl/crypto/aes/aes_misc.c
index 4fead1b4..f083488e 100644
--- a/main/openssl/crypto/aes/aes_misc.c
+++ b/main/openssl/crypto/aes/aes_misc.c
@@ -50,6 +50,7 @@
*/
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
#include <openssl/aes.h>
#include "aes_locl.h"
@@ -62,3 +63,23 @@ const char *AES_options(void) {
return "aes(partial)";
#endif
}
+
+/* FIPS wrapper functions to block low level AES calls in FIPS mode */
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(AES);
+#endif
+ return private_AES_set_encrypt_key(userKey, bits, key);
+ }
+
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+ AES_KEY *key)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(AES);
+#endif
+ return private_AES_set_decrypt_key(userKey, bits, key);
+ }
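The aes_misc.c hunk completes the renaming seen above in aes.h and aes_core.c: the real key-schedule routines become private_AES_set_encrypt_key()/private_AES_set_decrypt_key(), while the public names turn into thin wrappers that call fips_cipher_abort(AES) when OPENSSL_FIPS is defined (to block low-level AES calls in FIPS mode, per the in-tree comment) and otherwise just forward. Callers are unaffected, as in this minimal sketch against the public API; the 16-byte vectors are arbitrary, error handling is abbreviated, and it links with -lcrypto.

#include <stdio.h>
#include <string.h>
#include <openssl/aes.h>

int main(void)
{
    /* Exactly 16 bytes each: a 128-bit key and one AES block. */
    static const unsigned char key128[16] = "0123456789abcdef";
    unsigned char in[AES_BLOCK_SIZE] = "hello, aes block";
    unsigned char out[AES_BLOCK_SIZE], back[AES_BLOCK_SIZE];
    AES_KEY enc, dec;

    /* Public entry points; after this change they pass through the
     * FIPS wrappers before reaching the private_ implementations. */
    if (AES_set_encrypt_key(key128, 128, &enc) != 0 ||
        AES_set_decrypt_key(key128, 128, &dec) != 0) {
        fprintf(stderr, "key schedule failed\n");
        return 1;
    }

    AES_encrypt(in, out, &enc);    /* one raw 16-byte block */
    AES_decrypt(out, back, &dec);

    printf("round trip %s\n",
           memcmp(in, back, AES_BLOCK_SIZE) == 0 ? "ok" : "FAILED");
    return 0;
}

Because the wrappers preserve the original signatures and return convention (0 on success, negative on bad arguments), existing code linked against AES_set_encrypt_key keeps working unchanged.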
diff --git a/main/openssl/crypto/aes/asm/aes-586.S b/main/openssl/crypto/aes/asm/aes-586.S
new file mode 100644
index 00000000..20c0238f
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-586.S
@@ -0,0 +1,3239 @@
+.file "aes-586.s"
+.text
+.type _x86_AES_encrypt_compact,@function
+.align 16
+_x86_AES_encrypt_compact:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+.align 16
+.L000loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ch,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $8,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $24,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%ecx
+ roll $24,%ecx
+ xorl %esi,%ecx
+ rorl $16,%ebp
+ xorl %ebp,%ecx
+ rorl $8,%ebp
+ xorl %ebp,%ecx
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %edx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%edx
+ roll $24,%edx
+ xorl %esi,%edx
+ rorl $16,%ebp
+ xorl %ebp,%edx
+ rorl $8,%ebp
+ xorl %ebp,%edx
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %eax,%ebp
+ xorl %edi,%esi
+ xorl %esi,%eax
+ roll $24,%eax
+ xorl %esi,%eax
+ rorl $16,%ebp
+ xorl %ebp,%eax
+ rorl $8,%ebp
+ xorl %ebp,%eax
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%edi
+ subl %ebp,%esi
+ andl $4278124286,%edi
+ andl $454761243,%esi
+ movl %ebx,%ebp
+ xorl %edi,%esi
+ xorl %esi,%ebx
+ roll $24,%ebx
+ xorl %esi,%ebx
+ rorl $16,%ebp
+ xorl %ebp,%ebx
+ rorl $8,%ebp
+ xorl %ebp,%ebx
+ movl 20(%esp),%edi
+ movl 28(%esp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L000loop
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ch,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $8,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $24,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+
+ xorl 16(%edi),%eax
+ xorl 20(%edi),%ebx
+ xorl 24(%edi),%ecx
+ xorl 28(%edi),%edx
+ ret
+.size _x86_AES_encrypt_compact,.-_x86_AES_encrypt_compact
+.type _sse_AES_encrypt_compact,@function
+.align 16
+_sse_AES_encrypt_compact:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl $454761243,%eax
+ movl %eax,8(%esp)
+ movl %eax,12(%esp)
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+.align 16
+.L001loop:
+ pshufw $8,%mm0,%mm1
+ pshufw $13,%mm4,%mm5
+ movd %mm1,%eax
+ movd %mm5,%ebx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ pshufw $13,%mm0,%mm2
+ movzbl %ah,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ shll $8,%edx
+ shrl $16,%eax
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ pshufw $8,%mm4,%mm6
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm0
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ movd %mm2,%eax
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ movd %mm6,%ebx
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm1
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ shrl $16,%ebx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ shrl $16,%eax
+ punpckldq %mm1,%mm0
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ andl $255,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $16,%eax
+ orl %eax,%edx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm4
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ orl %ebx,%edx
+ movd %edx,%mm5
+ punpckldq %mm5,%mm4
+ addl $16,%edi
+ cmpl 24(%esp),%edi
+ ja .L002out
+ movq 8(%esp),%mm2
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ movq %mm0,%mm1
+ movq %mm4,%mm5
+ pcmpgtb %mm0,%mm3
+ pcmpgtb %mm4,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ pshufw $177,%mm0,%mm2
+ pshufw $177,%mm4,%mm6
+ paddb %mm0,%mm0
+ paddb %mm4,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pshufw $177,%mm2,%mm3
+ pshufw $177,%mm6,%mm7
+ pxor %mm0,%mm1
+ pxor %mm4,%mm5
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq %mm3,%mm2
+ movq %mm7,%mm6
+ pslld $8,%mm3
+ pslld $8,%mm7
+ psrld $24,%mm2
+ psrld $24,%mm6
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ movq (%edi),%mm2
+ movq 8(%edi),%mm6
+ psrld $8,%mm1
+ psrld $8,%mm5
+ movl -128(%ebp),%eax
+ pslld $24,%mm3
+ pslld $24,%mm7
+ movl -64(%ebp),%ebx
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movl (%ebp),%ecx
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movl 64(%ebp),%edx
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ jmp .L001loop
+.align 16
+.L002out:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ ret
+.size _sse_AES_encrypt_compact,.-_sse_AES_encrypt_compact
+.type _x86_AES_encrypt,@function
+.align 16
+_x86_AES_encrypt:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+.align 16
+.L003loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %bh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movl (%ebp,%esi,8),%esi
+ movzbl %ch,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movl (%ebp,%esi,8),%esi
+ movzbl %dh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movzbl %bh,%edi
+ xorl 1(%ebp,%edi,8),%esi
+
+ movl 20(%esp),%edi
+ movl (%ebp,%edx,8),%edx
+ movzbl %ah,%eax
+ xorl 3(%ebp,%eax,8),%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ xorl 2(%ebp,%ebx,8),%edx
+ movl 8(%esp),%ebx
+ xorl 1(%ebp,%ecx,8),%edx
+ movl %esi,%ecx
+
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L003loop
+ movl %eax,%esi
+ andl $255,%esi
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %bh,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ shrl $16,%ebx
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %ch,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $24,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ shrl $24,%ecx
+ movl 2(%ebp,%esi,8),%esi
+ andl $255,%esi
+ movzbl %dh,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $65280,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edx
+ andl $255,%edi
+ movl (%ebp,%edi,8),%edi
+ andl $16711680,%edi
+ xorl %edi,%esi
+ movzbl %bh,%edi
+ movl 2(%ebp,%edi,8),%edi
+ andl $4278190080,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movl 2(%ebp,%edx,8),%edx
+ andl $255,%edx
+ movzbl %ah,%eax
+ movl (%ebp,%eax,8),%eax
+ andl $65280,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ andl $255,%ebx
+ movl (%ebp,%ebx,8),%ebx
+ andl $16711680,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ movl 2(%ebp,%ecx,8),%ecx
+ andl $4278190080,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ ret
+.align 64
+.LAES_Te:
+.long 2774754246,2774754246
+.long 2222750968,2222750968
+.long 2574743534,2574743534
+.long 2373680118,2373680118
+.long 234025727,234025727
+.long 3177933782,3177933782
+.long 2976870366,2976870366
+.long 1422247313,1422247313
+.long 1345335392,1345335392
+.long 50397442,50397442
+.long 2842126286,2842126286
+.long 2099981142,2099981142
+.long 436141799,436141799
+.long 1658312629,1658312629
+.long 3870010189,3870010189
+.long 2591454956,2591454956
+.long 1170918031,1170918031
+.long 2642575903,2642575903
+.long 1086966153,1086966153
+.long 2273148410,2273148410
+.long 368769775,368769775
+.long 3948501426,3948501426
+.long 3376891790,3376891790
+.long 200339707,200339707
+.long 3970805057,3970805057
+.long 1742001331,1742001331
+.long 4255294047,4255294047
+.long 3937382213,3937382213
+.long 3214711843,3214711843
+.long 4154762323,4154762323
+.long 2524082916,2524082916
+.long 1539358875,1539358875
+.long 3266819957,3266819957
+.long 486407649,486407649
+.long 2928907069,2928907069
+.long 1780885068,1780885068
+.long 1513502316,1513502316
+.long 1094664062,1094664062
+.long 49805301,49805301
+.long 1338821763,1338821763
+.long 1546925160,1546925160
+.long 4104496465,4104496465
+.long 887481809,887481809
+.long 150073849,150073849
+.long 2473685474,2473685474
+.long 1943591083,1943591083
+.long 1395732834,1395732834
+.long 1058346282,1058346282
+.long 201589768,201589768
+.long 1388824469,1388824469
+.long 1696801606,1696801606
+.long 1589887901,1589887901
+.long 672667696,672667696
+.long 2711000631,2711000631
+.long 251987210,251987210
+.long 3046808111,3046808111
+.long 151455502,151455502
+.long 907153956,907153956
+.long 2608889883,2608889883
+.long 1038279391,1038279391
+.long 652995533,652995533
+.long 1764173646,1764173646
+.long 3451040383,3451040383
+.long 2675275242,2675275242
+.long 453576978,453576978
+.long 2659418909,2659418909
+.long 1949051992,1949051992
+.long 773462580,773462580
+.long 756751158,756751158
+.long 2993581788,2993581788
+.long 3998898868,3998898868
+.long 4221608027,4221608027
+.long 4132590244,4132590244
+.long 1295727478,1295727478
+.long 1641469623,1641469623
+.long 3467883389,3467883389
+.long 2066295122,2066295122
+.long 1055122397,1055122397
+.long 1898917726,1898917726
+.long 2542044179,2542044179
+.long 4115878822,4115878822
+.long 1758581177,1758581177
+.long 0,0
+.long 753790401,753790401
+.long 1612718144,1612718144
+.long 536673507,536673507
+.long 3367088505,3367088505
+.long 3982187446,3982187446
+.long 3194645204,3194645204
+.long 1187761037,1187761037
+.long 3653156455,3653156455
+.long 1262041458,1262041458
+.long 3729410708,3729410708
+.long 3561770136,3561770136
+.long 3898103984,3898103984
+.long 1255133061,1255133061
+.long 1808847035,1808847035
+.long 720367557,720367557
+.long 3853167183,3853167183
+.long 385612781,385612781
+.long 3309519750,3309519750
+.long 3612167578,3612167578
+.long 1429418854,1429418854
+.long 2491778321,2491778321
+.long 3477423498,3477423498
+.long 284817897,284817897
+.long 100794884,100794884
+.long 2172616702,2172616702
+.long 4031795360,4031795360
+.long 1144798328,1144798328
+.long 3131023141,3131023141
+.long 3819481163,3819481163
+.long 4082192802,4082192802
+.long 4272137053,4272137053
+.long 3225436288,3225436288
+.long 2324664069,2324664069
+.long 2912064063,2912064063
+.long 3164445985,3164445985
+.long 1211644016,1211644016
+.long 83228145,83228145
+.long 3753688163,3753688163
+.long 3249976951,3249976951
+.long 1977277103,1977277103
+.long 1663115586,1663115586
+.long 806359072,806359072
+.long 452984805,452984805
+.long 250868733,250868733
+.long 1842533055,1842533055
+.long 1288555905,1288555905
+.long 336333848,336333848
+.long 890442534,890442534
+.long 804056259,804056259
+.long 3781124030,3781124030
+.long 2727843637,2727843637
+.long 3427026056,3427026056
+.long 957814574,957814574
+.long 1472513171,1472513171
+.long 4071073621,4071073621
+.long 2189328124,2189328124
+.long 1195195770,1195195770
+.long 2892260552,2892260552
+.long 3881655738,3881655738
+.long 723065138,723065138
+.long 2507371494,2507371494
+.long 2690670784,2690670784
+.long 2558624025,2558624025
+.long 3511635870,3511635870
+.long 2145180835,2145180835
+.long 1713513028,1713513028
+.long 2116692564,2116692564
+.long 2878378043,2878378043
+.long 2206763019,2206763019
+.long 3393603212,3393603212
+.long 703524551,703524551
+.long 3552098411,3552098411
+.long 1007948840,1007948840
+.long 2044649127,2044649127
+.long 3797835452,3797835452
+.long 487262998,487262998
+.long 1994120109,1994120109
+.long 1004593371,1004593371
+.long 1446130276,1446130276
+.long 1312438900,1312438900
+.long 503974420,503974420
+.long 3679013266,3679013266
+.long 168166924,168166924
+.long 1814307912,1814307912
+.long 3831258296,3831258296
+.long 1573044895,1573044895
+.long 1859376061,1859376061
+.long 4021070915,4021070915
+.long 2791465668,2791465668
+.long 2828112185,2828112185
+.long 2761266481,2761266481
+.long 937747667,937747667
+.long 2339994098,2339994098
+.long 854058965,854058965
+.long 1137232011,1137232011
+.long 1496790894,1496790894
+.long 3077402074,3077402074
+.long 2358086913,2358086913
+.long 1691735473,1691735473
+.long 3528347292,3528347292
+.long 3769215305,3769215305
+.long 3027004632,3027004632
+.long 4199962284,4199962284
+.long 133494003,133494003
+.long 636152527,636152527
+.long 2942657994,2942657994
+.long 2390391540,2390391540
+.long 3920539207,3920539207
+.long 403179536,403179536
+.long 3585784431,3585784431
+.long 2289596656,2289596656
+.long 1864705354,1864705354
+.long 1915629148,1915629148
+.long 605822008,605822008
+.long 4054230615,4054230615
+.long 3350508659,3350508659
+.long 1371981463,1371981463
+.long 602466507,602466507
+.long 2094914977,2094914977
+.long 2624877800,2624877800
+.long 555687742,555687742
+.long 3712699286,3712699286
+.long 3703422305,3703422305
+.long 2257292045,2257292045
+.long 2240449039,2240449039
+.long 2423288032,2423288032
+.long 1111375484,1111375484
+.long 3300242801,3300242801
+.long 2858837708,2858837708
+.long 3628615824,3628615824
+.long 84083462,84083462
+.long 32962295,32962295
+.long 302911004,302911004
+.long 2741068226,2741068226
+.long 1597322602,1597322602
+.long 4183250862,4183250862
+.long 3501832553,3501832553
+.long 2441512471,2441512471
+.long 1489093017,1489093017
+.long 656219450,656219450
+.long 3114180135,3114180135
+.long 954327513,954327513
+.long 335083755,335083755
+.long 3013122091,3013122091
+.long 856756514,856756514
+.long 3144247762,3144247762
+.long 1893325225,1893325225
+.long 2307821063,2307821063
+.long 2811532339,2811532339
+.long 3063651117,3063651117
+.long 572399164,572399164
+.long 2458355477,2458355477
+.long 552200649,552200649
+.long 1238290055,1238290055
+.long 4283782570,4283782570
+.long 2015897680,2015897680
+.long 2061492133,2061492133
+.long 2408352771,2408352771
+.long 4171342169,4171342169
+.long 2156497161,2156497161
+.long 386731290,386731290
+.long 3669999461,3669999461
+.long 837215959,837215959
+.long 3326231172,3326231172
+.long 3093850320,3093850320
+.long 3275833730,3275833730
+.long 2962856233,2962856233
+.long 1999449434,1999449434
+.long 286199582,286199582
+.long 3417354363,3417354363
+.long 4233385128,4233385128
+.long 3602627437,3602627437
+.long 974525996,974525996
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.byte 99,124,119,123,242,107,111,197
+.byte 48,1,103,43,254,215,171,118
+.byte 202,130,201,125,250,89,71,240
+.byte 173,212,162,175,156,164,114,192
+.byte 183,253,147,38,54,63,247,204
+.byte 52,165,229,241,113,216,49,21
+.byte 4,199,35,195,24,150,5,154
+.byte 7,18,128,226,235,39,178,117
+.byte 9,131,44,26,27,110,90,160
+.byte 82,59,214,179,41,227,47,132
+.byte 83,209,0,237,32,252,177,91
+.byte 106,203,190,57,74,76,88,207
+.byte 208,239,170,251,67,77,51,133
+.byte 69,249,2,127,80,60,159,168
+.byte 81,163,64,143,146,157,56,245
+.byte 188,182,218,33,16,255,243,210
+.byte 205,12,19,236,95,151,68,23
+.byte 196,167,126,61,100,93,25,115
+.byte 96,129,79,220,34,42,144,136
+.byte 70,238,184,20,222,94,11,219
+.byte 224,50,58,10,73,6,36,92
+.byte 194,211,172,98,145,149,228,121
+.byte 231,200,55,109,141,213,78,169
+.byte 108,86,244,234,101,122,174,8
+.byte 186,120,37,46,28,166,180,198
+.byte 232,221,116,31,75,189,139,138
+.byte 112,62,181,102,72,3,246,14
+.byte 97,53,87,185,134,193,29,158
+.byte 225,248,152,17,105,217,142,148
+.byte 155,30,135,233,206,85,40,223
+.byte 140,161,137,13,191,230,66,104
+.byte 65,153,45,15,176,84,187,22
+.long 1,2,4,8
+.long 16,32,64,128
+.long 27,54,0,0
+.long 0,0,0,0
+.size _x86_AES_encrypt,.-_x86_AES_encrypt
+.globl AES_encrypt
+.type AES_encrypt,@function
+.align 16
+AES_encrypt:
+.L_AES_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 28(%esp),%edi
+ movl %esp,%eax
+ subl $36,%esp
+ andl $-64,%esp
+ leal -127(%edi),%ebx
+ subl %esp,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esp
+ addl $4,%esp
+ movl %eax,28(%esp)
+ call .L004pic_point
+.L004pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L004pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ leal .LAES_Te-.L004pic_point(%ebp),%ebp
+ leal 764(%esp),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ btl $25,(%eax)
+ jnc .L005x86
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ call _sse_AES_encrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L005x86:
+ movl %ebp,24(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ call _x86_AES_encrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_encrypt,.-.L_AES_encrypt_begin
+.type _x86_AES_decrypt_compact,@function
+.align 16
+_x86_AES_decrypt_compact:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+.align 16
+.L006loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ shrl $24,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%eax
+ subl %edi,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%eax,%eax,1),%ebx
+ subl %edi,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ecx,%eax
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %ecx,%ebx
+ roll $8,%ecx
+ xorl %esi,%ebp
+ xorl %eax,%ecx
+ xorl %ebp,%eax
+ roll $24,%eax
+ xorl %ebx,%ecx
+ xorl %ebp,%ebx
+ roll $16,%ebx
+ xorl %ebp,%ecx
+ roll $8,%ebp
+ xorl %eax,%ecx
+ xorl %ebx,%ecx
+ movl 4(%esp),%eax
+ xorl %ebp,%ecx
+ movl %ecx,12(%esp)
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebx
+ subl %edi,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %edx,%ebx
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %edx,%ecx
+ roll $8,%edx
+ xorl %esi,%ebp
+ xorl %ebx,%edx
+ xorl %ebp,%ebx
+ roll $24,%ebx
+ xorl %ecx,%edx
+ xorl %ebp,%ecx
+ roll $16,%ecx
+ xorl %ebp,%edx
+ roll $8,%ebp
+ xorl %ebx,%edx
+ xorl %ecx,%edx
+ movl 8(%esp),%ebx
+ xorl %ebp,%edx
+ movl %edx,16(%esp)
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%eax,%eax,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%edx
+ subl %edi,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %eax,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %eax,%edx
+ roll $8,%eax
+ xorl %esi,%ebp
+ xorl %ecx,%eax
+ xorl %ebp,%ecx
+ roll $24,%ecx
+ xorl %edx,%eax
+ xorl %ebp,%edx
+ roll $16,%edx
+ xorl %ebp,%eax
+ roll $8,%ebp
+ xorl %ecx,%eax
+ xorl %edx,%eax
+ xorl %ebp,%eax
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ebx,%ebx,1),%ecx
+ subl %edi,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%ecx,%ecx,1),%edx
+ subl %edi,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %ebx,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%edi
+ shrl $7,%edi
+ leal (%edx,%edx,1),%ebp
+ subl %edi,%esi
+ andl $4278124286,%ebp
+ andl $454761243,%esi
+ xorl %ebx,%edx
+ roll $8,%ebx
+ xorl %esi,%ebp
+ xorl %ecx,%ebx
+ xorl %ebp,%ecx
+ roll $24,%ecx
+ xorl %edx,%ebx
+ xorl %ebp,%edx
+ roll $16,%edx
+ xorl %ebp,%ebx
+ roll $8,%ebp
+ xorl %ecx,%ebx
+ xorl %edx,%ebx
+ movl 12(%esp),%ecx
+ xorl %ebp,%ebx
+ movl 16(%esp),%edx
+ movl 20(%esp),%edi
+ movl 28(%esp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L006loop
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl -128(%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl -128(%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ xorl 16(%edi),%eax
+ xorl 20(%edi),%ebx
+ xorl 24(%edi),%ecx
+ xorl 28(%edi),%edx
+ ret
+.size _x86_AES_decrypt_compact,.-_x86_AES_decrypt_compact
+.type _sse_AES_decrypt_compact,@function
+.align 16
+_sse_AES_decrypt_compact:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+ movl $454761243,%eax
+ movl %eax,8(%esp)
+ movl %eax,12(%esp)
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+.align 16
+.L007loop:
+ pshufw $12,%mm0,%mm1
+ movd %mm1,%eax
+ pshufw $9,%mm4,%mm5
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%ecx
+ movd %mm5,%ebx
+ movzbl %ah,%edx
+ movzbl -128(%ebp,%edx,1),%edx
+ shll $8,%edx
+ pshufw $6,%mm0,%mm2
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%ecx
+ shrl $16,%eax
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ pshufw $3,%mm4,%mm6
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%ecx
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ movd %ecx,%mm0
+ movzbl %al,%esi
+ movd %mm2,%eax
+ movzbl -128(%ebp,%esi,1),%ecx
+ shll $16,%ecx
+ movzbl %bl,%esi
+ movd %mm6,%ebx
+ movzbl -128(%ebp,%esi,1),%esi
+ orl %esi,%ecx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ orl %esi,%edx
+ movzbl %bl,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%edx
+ movd %edx,%mm1
+ movzbl %ah,%esi
+ movzbl -128(%ebp,%esi,1),%edx
+ shll $8,%edx
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $24,%esi
+ orl %esi,%edx
+ shrl $16,%ebx
+ punpckldq %mm1,%mm0
+ movzbl %bh,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $8,%esi
+ orl %esi,%ecx
+ andl $255,%ebx
+ movzbl -128(%ebp,%ebx,1),%ebx
+ orl %ebx,%edx
+ movzbl %al,%esi
+ movzbl -128(%ebp,%esi,1),%esi
+ shll $16,%esi
+ orl %esi,%edx
+ movd %edx,%mm4
+ movzbl %ah,%eax
+ movzbl -128(%ebp,%eax,1),%eax
+ shll $24,%eax
+ orl %eax,%ecx
+ movd %ecx,%mm5
+ punpckldq %mm5,%mm4
+ addl $16,%edi
+ cmpl 24(%esp),%edi
+ ja .L008out
+ movq %mm0,%mm3
+ movq %mm4,%mm7
+ pshufw $228,%mm0,%mm2
+ pshufw $228,%mm4,%mm6
+ movq %mm0,%mm1
+ movq %mm4,%mm5
+ pshufw $177,%mm0,%mm0
+ pshufw $177,%mm4,%mm4
+ pslld $8,%mm2
+ pslld $8,%mm6
+ psrld $8,%mm3
+ psrld $8,%mm7
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pslld $16,%mm2
+ pslld $16,%mm6
+ psrld $16,%mm3
+ psrld $16,%mm7
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movq 8(%esp),%mm3
+ pxor %mm2,%mm2
+ pxor %mm6,%mm6
+ pcmpgtb %mm1,%mm2
+ pcmpgtb %mm5,%mm6
+ pand %mm3,%mm2
+ pand %mm3,%mm6
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm2,%mm1
+ pxor %mm6,%mm5
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ movq %mm1,%mm2
+ movq %mm5,%mm6
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pslld $24,%mm3
+ pslld $24,%mm7
+ psrld $8,%mm2
+ psrld $8,%mm6
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ movq 8(%esp),%mm2
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ pcmpgtb %mm1,%mm3
+ pcmpgtb %mm5,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm3,%mm1
+ pxor %mm7,%mm5
+ pshufw $177,%mm1,%mm3
+ pshufw $177,%mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm3,%mm3
+ pxor %mm7,%mm7
+ pcmpgtb %mm1,%mm3
+ pcmpgtb %mm5,%mm7
+ pand %mm2,%mm3
+ pand %mm2,%mm7
+ paddb %mm1,%mm1
+ paddb %mm5,%mm5
+ pxor %mm3,%mm1
+ pxor %mm7,%mm5
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movq %mm1,%mm3
+ movq %mm5,%mm7
+ pshufw $177,%mm1,%mm2
+ pshufw $177,%mm5,%mm6
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ pslld $8,%mm1
+ pslld $8,%mm5
+ psrld $8,%mm3
+ psrld $8,%mm7
+ movq (%edi),%mm2
+ movq 8(%edi),%mm6
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ movl -128(%ebp),%eax
+ pslld $16,%mm1
+ pslld $16,%mm5
+ movl -64(%ebp),%ebx
+ psrld $16,%mm3
+ psrld $16,%mm7
+ movl (%ebp),%ecx
+ pxor %mm1,%mm0
+ pxor %mm5,%mm4
+ movl 64(%ebp),%edx
+ pxor %mm3,%mm0
+ pxor %mm7,%mm4
+ pxor %mm2,%mm0
+ pxor %mm6,%mm4
+ jmp .L007loop
+.align 16
+.L008out:
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ ret
+.size _sse_AES_decrypt_compact,.-_sse_AES_decrypt_compact
+.type _x86_AES_decrypt,@function
+.align 16
+_x86_AES_decrypt:
+ movl %edi,20(%esp)
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,24(%esp)
+.align 16
+.L009loop:
+ movl %eax,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %dh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,4(%esp)
+
+ movl %ebx,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %ah,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+ movl %esi,8(%esp)
+
+ movl %ecx,%esi
+ andl $255,%esi
+ movl (%ebp,%esi,8),%esi
+ movzbl %bh,%edi
+ xorl 3(%ebp,%edi,8),%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ xorl 2(%ebp,%edi,8),%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ xorl 1(%ebp,%edi,8),%esi
+
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movl (%ebp,%edx,8),%edx
+ movzbl %ch,%ecx
+ xorl 3(%ebp,%ecx,8),%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ xorl 2(%ebp,%ebx,8),%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ xorl 1(%ebp,%eax,8),%edx
+ movl 4(%esp),%eax
+
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ cmpl 24(%esp),%edi
+ movl %edi,20(%esp)
+ jb .L009loop
+ leal 2176(%ebp),%ebp
+ movl -128(%ebp),%edi
+ movl -96(%ebp),%esi
+ movl -64(%ebp),%edi
+ movl -32(%ebp),%esi
+ movl (%ebp),%edi
+ movl 32(%ebp),%esi
+ movl 64(%ebp),%edi
+ movl 96(%ebp),%esi
+ leal -128(%ebp),%ebp
+ movl %eax,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %dh,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ebx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,4(%esp)
+ movl %ebx,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %ah,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %ecx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl %esi,8(%esp)
+ movl %ecx,%esi
+ andl $255,%esi
+ movzbl (%ebp,%esi,1),%esi
+ movzbl %bh,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $8,%edi
+ xorl %edi,%esi
+ movl %eax,%edi
+ shrl $16,%edi
+ andl $255,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $16,%edi
+ xorl %edi,%esi
+ movl %edx,%edi
+ shrl $24,%edi
+ movzbl (%ebp,%edi,1),%edi
+ shll $24,%edi
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ andl $255,%edx
+ movzbl (%ebp,%edx,1),%edx
+ movzbl %ch,%ecx
+ movzbl (%ebp,%ecx,1),%ecx
+ shll $8,%ecx
+ xorl %ecx,%edx
+ movl %esi,%ecx
+ shrl $16,%ebx
+ andl $255,%ebx
+ movzbl (%ebp,%ebx,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%edx
+ movl 8(%esp),%ebx
+ shrl $24,%eax
+ movzbl (%ebp,%eax,1),%eax
+ shll $24,%eax
+ xorl %eax,%edx
+ movl 4(%esp),%eax
+ leal -2048(%ebp),%ebp
+ addl $16,%edi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ ret
+.align 64
+.LAES_Td:
+.long 1353184337,1353184337
+.long 1399144830,1399144830
+.long 3282310938,3282310938
+.long 2522752826,2522752826
+.long 3412831035,3412831035
+.long 4047871263,4047871263
+.long 2874735276,2874735276
+.long 2466505547,2466505547
+.long 1442459680,1442459680
+.long 4134368941,4134368941
+.long 2440481928,2440481928
+.long 625738485,625738485
+.long 4242007375,4242007375
+.long 3620416197,3620416197
+.long 2151953702,2151953702
+.long 2409849525,2409849525
+.long 1230680542,1230680542
+.long 1729870373,1729870373
+.long 2551114309,2551114309
+.long 3787521629,3787521629
+.long 41234371,41234371
+.long 317738113,317738113
+.long 2744600205,2744600205
+.long 3338261355,3338261355
+.long 3881799427,3881799427
+.long 2510066197,2510066197
+.long 3950669247,3950669247
+.long 3663286933,3663286933
+.long 763608788,763608788
+.long 3542185048,3542185048
+.long 694804553,694804553
+.long 1154009486,1154009486
+.long 1787413109,1787413109
+.long 2021232372,2021232372
+.long 1799248025,1799248025
+.long 3715217703,3715217703
+.long 3058688446,3058688446
+.long 397248752,397248752
+.long 1722556617,1722556617
+.long 3023752829,3023752829
+.long 407560035,407560035
+.long 2184256229,2184256229
+.long 1613975959,1613975959
+.long 1165972322,1165972322
+.long 3765920945,3765920945
+.long 2226023355,2226023355
+.long 480281086,480281086
+.long 2485848313,2485848313
+.long 1483229296,1483229296
+.long 436028815,436028815
+.long 2272059028,2272059028
+.long 3086515026,3086515026
+.long 601060267,601060267
+.long 3791801202,3791801202
+.long 1468997603,1468997603
+.long 715871590,715871590
+.long 120122290,120122290
+.long 63092015,63092015
+.long 2591802758,2591802758
+.long 2768779219,2768779219
+.long 4068943920,4068943920
+.long 2997206819,2997206819
+.long 3127509762,3127509762
+.long 1552029421,1552029421
+.long 723308426,723308426
+.long 2461301159,2461301159
+.long 4042393587,4042393587
+.long 2715969870,2715969870
+.long 3455375973,3455375973
+.long 3586000134,3586000134
+.long 526529745,526529745
+.long 2331944644,2331944644
+.long 2639474228,2639474228
+.long 2689987490,2689987490
+.long 853641733,853641733
+.long 1978398372,1978398372
+.long 971801355,971801355
+.long 2867814464,2867814464
+.long 111112542,111112542
+.long 1360031421,1360031421
+.long 4186579262,4186579262
+.long 1023860118,1023860118
+.long 2919579357,2919579357
+.long 1186850381,1186850381
+.long 3045938321,3045938321
+.long 90031217,90031217
+.long 1876166148,1876166148
+.long 4279586912,4279586912
+.long 620468249,620468249
+.long 2548678102,2548678102
+.long 3426959497,3426959497
+.long 2006899047,2006899047
+.long 3175278768,3175278768
+.long 2290845959,2290845959
+.long 945494503,945494503
+.long 3689859193,3689859193
+.long 1191869601,1191869601
+.long 3910091388,3910091388
+.long 3374220536,3374220536
+.long 0,0
+.long 2206629897,2206629897
+.long 1223502642,1223502642
+.long 2893025566,2893025566
+.long 1316117100,1316117100
+.long 4227796733,4227796733
+.long 1446544655,1446544655
+.long 517320253,517320253
+.long 658058550,658058550
+.long 1691946762,1691946762
+.long 564550760,564550760
+.long 3511966619,3511966619
+.long 976107044,976107044
+.long 2976320012,2976320012
+.long 266819475,266819475
+.long 3533106868,3533106868
+.long 2660342555,2660342555
+.long 1338359936,1338359936
+.long 2720062561,2720062561
+.long 1766553434,1766553434
+.long 370807324,370807324
+.long 179999714,179999714
+.long 3844776128,3844776128
+.long 1138762300,1138762300
+.long 488053522,488053522
+.long 185403662,185403662
+.long 2915535858,2915535858
+.long 3114841645,3114841645
+.long 3366526484,3366526484
+.long 2233069911,2233069911
+.long 1275557295,1275557295
+.long 3151862254,3151862254
+.long 4250959779,4250959779
+.long 2670068215,2670068215
+.long 3170202204,3170202204
+.long 3309004356,3309004356
+.long 880737115,880737115
+.long 1982415755,1982415755
+.long 3703972811,3703972811
+.long 1761406390,1761406390
+.long 1676797112,1676797112
+.long 3403428311,3403428311
+.long 277177154,277177154
+.long 1076008723,1076008723
+.long 538035844,538035844
+.long 2099530373,2099530373
+.long 4164795346,4164795346
+.long 288553390,288553390
+.long 1839278535,1839278535
+.long 1261411869,1261411869
+.long 4080055004,4080055004
+.long 3964831245,3964831245
+.long 3504587127,3504587127
+.long 1813426987,1813426987
+.long 2579067049,2579067049
+.long 4199060497,4199060497
+.long 577038663,577038663
+.long 3297574056,3297574056
+.long 440397984,440397984
+.long 3626794326,3626794326
+.long 4019204898,4019204898
+.long 3343796615,3343796615
+.long 3251714265,3251714265
+.long 4272081548,4272081548
+.long 906744984,906744984
+.long 3481400742,3481400742
+.long 685669029,685669029
+.long 646887386,646887386
+.long 2764025151,2764025151
+.long 3835509292,3835509292
+.long 227702864,227702864
+.long 2613862250,2613862250
+.long 1648787028,1648787028
+.long 3256061430,3256061430
+.long 3904428176,3904428176
+.long 1593260334,1593260334
+.long 4121936770,4121936770
+.long 3196083615,3196083615
+.long 2090061929,2090061929
+.long 2838353263,2838353263
+.long 3004310991,3004310991
+.long 999926984,999926984
+.long 2809993232,2809993232
+.long 1852021992,1852021992
+.long 2075868123,2075868123
+.long 158869197,158869197
+.long 4095236462,4095236462
+.long 28809964,28809964
+.long 2828685187,2828685187
+.long 1701746150,1701746150
+.long 2129067946,2129067946
+.long 147831841,147831841
+.long 3873969647,3873969647
+.long 3650873274,3650873274
+.long 3459673930,3459673930
+.long 3557400554,3557400554
+.long 3598495785,3598495785
+.long 2947720241,2947720241
+.long 824393514,824393514
+.long 815048134,815048134
+.long 3227951669,3227951669
+.long 935087732,935087732
+.long 2798289660,2798289660
+.long 2966458592,2966458592
+.long 366520115,366520115
+.long 1251476721,1251476721
+.long 4158319681,4158319681
+.long 240176511,240176511
+.long 804688151,804688151
+.long 2379631990,2379631990
+.long 1303441219,1303441219
+.long 1414376140,1414376140
+.long 3741619940,3741619940
+.long 3820343710,3820343710
+.long 461924940,461924940
+.long 3089050817,3089050817
+.long 2136040774,2136040774
+.long 82468509,82468509
+.long 1563790337,1563790337
+.long 1937016826,1937016826
+.long 776014843,776014843
+.long 1511876531,1511876531
+.long 1389550482,1389550482
+.long 861278441,861278441
+.long 323475053,323475053
+.long 2355222426,2355222426
+.long 2047648055,2047648055
+.long 2383738969,2383738969
+.long 2302415851,2302415851
+.long 3995576782,3995576782
+.long 902390199,902390199
+.long 3991215329,3991215329
+.long 1018251130,1018251130
+.long 1507840668,1507840668
+.long 1064563285,1064563285
+.long 2043548696,2043548696
+.long 3208103795,3208103795
+.long 3939366739,3939366739
+.long 1537932639,1537932639
+.long 342834655,342834655
+.long 2262516856,2262516856
+.long 2180231114,2180231114
+.long 1053059257,1053059257
+.long 741614648,741614648
+.long 1598071746,1598071746
+.long 1925389590,1925389590
+.long 203809468,203809468
+.long 2336832552,2336832552
+.long 1100287487,1100287487
+.long 1895934009,1895934009
+.long 3736275976,3736275976
+.long 2632234200,2632234200
+.long 2428589668,2428589668
+.long 1636092795,1636092795
+.long 1890988757,1890988757
+.long 1952214088,1952214088
+.long 1113045200,1113045200
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.byte 82,9,106,213,48,54,165,56
+.byte 191,64,163,158,129,243,215,251
+.byte 124,227,57,130,155,47,255,135
+.byte 52,142,67,68,196,222,233,203
+.byte 84,123,148,50,166,194,35,61
+.byte 238,76,149,11,66,250,195,78
+.byte 8,46,161,102,40,217,36,178
+.byte 118,91,162,73,109,139,209,37
+.byte 114,248,246,100,134,104,152,22
+.byte 212,164,92,204,93,101,182,146
+.byte 108,112,72,80,253,237,185,218
+.byte 94,21,70,87,167,141,157,132
+.byte 144,216,171,0,140,188,211,10
+.byte 247,228,88,5,184,179,69,6
+.byte 208,44,30,143,202,63,15,2
+.byte 193,175,189,3,1,19,138,107
+.byte 58,145,17,65,79,103,220,234
+.byte 151,242,207,206,240,180,230,115
+.byte 150,172,116,34,231,173,53,133
+.byte 226,249,55,232,28,117,223,110
+.byte 71,241,26,113,29,41,197,137
+.byte 111,183,98,14,170,24,190,27
+.byte 252,86,62,75,198,210,121,32
+.byte 154,219,192,254,120,205,90,244
+.byte 31,221,168,51,136,7,199,49
+.byte 177,18,16,89,39,128,236,95
+.byte 96,81,127,169,25,181,74,13
+.byte 45,229,122,159,147,201,156,239
+.byte 160,224,59,77,174,42,245,176
+.byte 200,235,187,60,131,83,153,97
+.byte 23,43,4,126,186,119,214,38
+.byte 225,105,20,99,85,33,12,125
+.size _x86_AES_decrypt,.-_x86_AES_decrypt
+.globl AES_decrypt
+.type AES_decrypt,@function
+.align 16
+AES_decrypt:
+.L_AES_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 28(%esp),%edi
+ movl %esp,%eax
+ subl $36,%esp
+ andl $-64,%esp
+ leal -127(%edi),%ebx
+ subl %esp,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esp
+ addl $4,%esp
+ movl %eax,28(%esp)
+ call .L010pic_point
+.L010pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L010pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ leal .LAES_Td-.L010pic_point(%ebp),%ebp
+ leal 764(%esp),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ btl $25,(%eax)
+ jnc .L011x86
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ call _sse_AES_decrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L011x86:
+ movl %ebp,24(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ call _x86_AES_decrypt_compact
+ movl 28(%esp),%esp
+ movl 24(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_decrypt,.-.L_AES_decrypt_begin
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,@function
+.align 16
+AES_cbc_encrypt:
+.L_AES_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ecx
+ cmpl $0,%ecx
+ je .L012drop_out
+ call .L013pic_point
+.L013pic_point:
+ popl %ebp
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L013pic_point](%ebp),%eax
+ movl OPENSSL_ia32cap_P@GOT(%eax),%eax
+ cmpl $0,40(%esp)
+ leal .LAES_Te-.L013pic_point(%ebp),%ebp
+ jne .L014picked_te
+ leal .LAES_Td-.LAES_Te(%ebp),%ebp
+.L014picked_te:
+ pushfl
+ cld
+ cmpl $512,%ecx
+ jb .L015slow_way
+ testl $15,%ecx
+ jnz .L015slow_way
+ btl $28,(%eax)
+ jc .L015slow_way
+ leal -324(%esp),%esi
+ andl $-64,%esi
+ movl %ebp,%eax
+ leal 2304(%ebp),%ebx
+ movl %esi,%edx
+ andl $4095,%eax
+ andl $4095,%ebx
+ andl $4095,%edx
+ cmpl %ebx,%edx
+ jb .L016tbl_break_out
+ subl %ebx,%edx
+ subl %edx,%esi
+ jmp .L017tbl_ok
+.align 4
+.L016tbl_break_out:
+ subl %eax,%edx
+ andl $4095,%edx
+ addl $384,%edx
+ subl %edx,%esi
+.align 4
+.L017tbl_ok:
+ leal 24(%esp),%edx
+ xchgl %esi,%esp
+ addl $4,%esp
+ movl %ebp,24(%esp)
+ movl %esi,28(%esp)
+ movl (%edx),%eax
+ movl 4(%edx),%ebx
+ movl 12(%edx),%edi
+ movl 16(%edx),%esi
+ movl 20(%edx),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edi,44(%esp)
+ movl %esi,48(%esp)
+ movl $0,316(%esp)
+ movl %edi,%ebx
+ movl $61,%ecx
+ subl %ebp,%ebx
+ movl %edi,%esi
+ andl $4095,%ebx
+ leal 76(%esp),%edi
+ cmpl $2304,%ebx
+ jb .L018do_copy
+ cmpl $3852,%ebx
+ jb .L019skip_copy
+.align 4
+.L018do_copy:
+ movl %edi,44(%esp)
+.long 2784229001
+.L019skip_copy:
+ movl $16,%edi
+.align 4
+.L020prefetch_tbl:
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%esi
+ leal 128(%ebp),%ebp
+ subl $1,%edi
+ jnz .L020prefetch_tbl
+ subl $2048,%ebp
+ movl 32(%esp),%esi
+ movl 48(%esp),%edi
+ cmpl $0,%edx
+ je .L021fast_decrypt
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+.align 16
+.L022fast_enc_loop:
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ xorl (%esi),%eax
+ xorl 4(%esi),%ebx
+ xorl 8(%esi),%ecx
+ xorl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_encrypt
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ leal 16(%esi),%esi
+ movl 40(%esp),%ecx
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L022fast_enc_loop
+ movl 48(%esp),%esi
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ cmpl $0,316(%esp)
+ movl 44(%esp),%edi
+ je .L023skip_ezero
+ movl $60,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2884892297
+.L023skip_ezero:
+ movl 28(%esp),%esp
+ popfl
+.L012drop_out:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L021fast_decrypt:
+ cmpl 36(%esp),%esi
+ je .L024fast_dec_in_place
+ movl %edi,52(%esp)
+.align 4
+.align 16
+.L025fast_dec_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt
+ movl 52(%esp),%edi
+ movl 40(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl 36(%esp),%edi
+ movl 32(%esp),%esi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ movl %esi,52(%esp)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edi
+ movl %edi,36(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L025fast_dec_loop
+ movl 52(%esp),%edi
+ movl 48(%esp),%esi
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ jmp .L026fast_dec_out
+.align 16
+.L024fast_dec_in_place:
+.L027fast_dec_in_place_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ leal 60(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt
+ movl 48(%esp),%edi
+ movl 36(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ leal 16(%esi),%esi
+ movl %esi,36(%esp)
+ leal 60(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 32(%esp),%esi
+ movl 40(%esp),%ecx
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ subl $16,%ecx
+ movl %ecx,40(%esp)
+ jnz .L027fast_dec_in_place_loop
+.align 4
+.L026fast_dec_out:
+ cmpl $0,316(%esp)
+ movl 44(%esp),%edi
+ je .L028skip_dzero
+ movl $60,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2884892297
+.L028skip_dzero:
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L015slow_way:
+ movl (%eax),%eax
+ movl 36(%esp),%edi
+ leal -80(%esp),%esi
+ andl $-64,%esi
+ leal -143(%edi),%ebx
+ subl %esi,%ebx
+ negl %ebx
+ andl $960,%ebx
+ subl %ebx,%esi
+ leal 768(%esi),%ebx
+ subl %ebp,%ebx
+ andl $768,%ebx
+ leal 2176(%ebp,%ebx,1),%ebp
+ leal 24(%esp),%edx
+ xchgl %esi,%esp
+ addl $4,%esp
+ movl %ebp,24(%esp)
+ movl %esi,28(%esp)
+ movl %eax,52(%esp)
+ movl (%edx),%eax
+ movl 4(%edx),%ebx
+ movl 16(%edx),%esi
+ movl 20(%edx),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edi,44(%esp)
+ movl %esi,48(%esp)
+ movl %esi,%edi
+ movl %eax,%esi
+ cmpl $0,%edx
+ je .L029slow_decrypt
+ cmpl $16,%ecx
+ movl %ebx,%edx
+ jb .L030slow_enc_tail
+ btl $25,52(%esp)
+ jnc .L031slow_enc_x86
+ movq (%edi),%mm0
+ movq 8(%edi),%mm4
+.align 16
+.L032slow_enc_loop_sse:
+ pxor (%esi),%mm0
+ pxor 8(%esi),%mm4
+ movl 44(%esp),%edi
+ call _sse_AES_encrypt_compact
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl 40(%esp),%ecx
+ movq %mm0,(%edi)
+ movq %mm4,8(%edi)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ cmpl $16,%ecx
+ movl %ecx,40(%esp)
+ jae .L032slow_enc_loop_sse
+ testl $15,%ecx
+ jnz .L030slow_enc_tail
+ movl 48(%esp),%esi
+ movq %mm0,(%esi)
+ movq %mm4,8(%esi)
+ emms
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L031slow_enc_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+.align 4
+.L033slow_enc_loop_x86:
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ xorl (%esi),%eax
+ xorl 4(%esi),%ebx
+ xorl 8(%esi),%ecx
+ xorl 12(%esi),%edx
+ movl 44(%esp),%edi
+ call _x86_AES_encrypt_compact
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ leal 16(%edi),%edx
+ movl %edx,36(%esp)
+ subl $16,%ecx
+ cmpl $16,%ecx
+ movl %ecx,40(%esp)
+ jae .L033slow_enc_loop_x86
+ testl $15,%ecx
+ jnz .L030slow_enc_tail
+ movl 48(%esp),%esi
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L030slow_enc_tail:
+ emms
+ movl %edx,%edi
+ movl $16,%ebx
+ subl %ecx,%ebx
+ cmpl %esi,%edi
+ je .L034enc_in_place
+.align 4
+.long 2767451785
+ jmp .L035enc_skip_in_place
+.L034enc_in_place:
+ leal (%edi,%ecx,1),%edi
+.L035enc_skip_in_place:
+ movl %ebx,%ecx
+ xorl %eax,%eax
+.align 4
+.long 2868115081
+ movl 48(%esp),%edi
+ movl %edx,%esi
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl $16,40(%esp)
+ jmp .L033slow_enc_loop_x86
+.align 16
+.L029slow_decrypt:
+ btl $25,52(%esp)
+ jnc .L036slow_dec_loop_x86
+.align 4
+.L037slow_dec_loop_sse:
+ movq (%esi),%mm0
+ movq 8(%esi),%mm4
+ movl 44(%esp),%edi
+ call _sse_AES_decrypt_compact
+ movl 32(%esp),%esi
+ leal 60(%esp),%eax
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl 48(%esp),%edi
+ movq (%esi),%mm1
+ movq 8(%esi),%mm5
+ pxor (%edi),%mm0
+ pxor 8(%edi),%mm4
+ movq %mm1,(%edi)
+ movq %mm5,8(%edi)
+ subl $16,%ecx
+ jc .L038slow_dec_partial_sse
+ movq %mm0,(%ebx)
+ movq %mm4,8(%ebx)
+ leal 16(%ebx),%ebx
+ movl %ebx,36(%esp)
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ movl %ecx,40(%esp)
+ jnz .L037slow_dec_loop_sse
+ emms
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L038slow_dec_partial_sse:
+ movq %mm0,(%eax)
+ movq %mm4,8(%eax)
+ emms
+ addl $16,%ecx
+ movl %ebx,%edi
+ movl %eax,%esi
+.align 4
+.long 2767451785
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L036slow_dec_loop_x86:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ leal 60(%esp),%edi
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 44(%esp),%edi
+ call _x86_AES_decrypt_compact
+ movl 48(%esp),%edi
+ movl 40(%esp),%esi
+ xorl (%edi),%eax
+ xorl 4(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 12(%edi),%edx
+ subl $16,%esi
+ jc .L039slow_dec_partial_x86
+ movl %esi,40(%esp)
+ movl 36(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ leal 16(%esi),%esi
+ movl %esi,36(%esp)
+ leal 60(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 32(%esp),%esi
+ leal 16(%esi),%esi
+ movl %esi,32(%esp)
+ jnz .L036slow_dec_loop_x86
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ pushfl
+.align 16
+.L039slow_dec_partial_x86:
+ leal 60(%esp),%esi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 32(%esp),%esi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 40(%esp),%ecx
+ movl 36(%esp),%edi
+ leal 60(%esp),%esi
+.align 4
+.long 2767451785
+ movl 28(%esp),%esp
+ popfl
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size AES_cbc_encrypt,.-.L_AES_cbc_encrypt_begin
+.type _x86_AES_set_encrypt_key,@function
+.align 16
+_x86_AES_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%esi
+ movl 32(%esp),%edi
+ testl $-1,%esi
+ jz .L040badpointer
+ testl $-1,%edi
+ jz .L040badpointer
+ call .L041pic_point
+.L041pic_point:
+ popl %ebp
+ leal .LAES_Te-.L041pic_point(%ebp),%ebp
+ leal 2176(%ebp),%ebp
+ movl -128(%ebp),%eax
+ movl -96(%ebp),%ebx
+ movl -64(%ebp),%ecx
+ movl -32(%ebp),%edx
+ movl (%ebp),%eax
+ movl 32(%ebp),%ebx
+ movl 64(%ebp),%ecx
+ movl 96(%ebp),%edx
+ movl 28(%esp),%ecx
+ cmpl $128,%ecx
+ je .L04210rounds
+ cmpl $192,%ecx
+ je .L04312rounds
+ cmpl $256,%ecx
+ je .L04414rounds
+ movl $-2,%eax
+ jmp .L045exit
+.L04210rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ xorl %ecx,%ecx
+ jmp .L04610shortcut
+.align 4
+.L04710loop:
+ movl (%edi),%eax
+ movl 12(%edi),%edx
+.L04610shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,16(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,20(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,24(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,28(%edi)
+ incl %ecx
+ addl $16,%edi
+ cmpl $10,%ecx
+ jl .L04710loop
+ movl $10,80(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L04312rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 16(%esi),%ecx
+ movl 20(%esi),%edx
+ movl %ecx,16(%edi)
+ movl %edx,20(%edi)
+ xorl %ecx,%ecx
+ jmp .L04812shortcut
+.align 4
+.L04912loop:
+ movl (%edi),%eax
+ movl 20(%edi),%edx
+.L04812shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,24(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,28(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,32(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,36(%edi)
+ cmpl $7,%ecx
+ je .L05012break
+ incl %ecx
+ xorl 16(%edi),%eax
+ movl %eax,40(%edi)
+ xorl 20(%edi),%eax
+ movl %eax,44(%edi)
+ addl $24,%edi
+ jmp .L04912loop
+.L05012break:
+ movl $12,72(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L04414rounds:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ movl %eax,16(%edi)
+ movl %ebx,20(%edi)
+ movl %ecx,24(%edi)
+ movl %edx,28(%edi)
+ xorl %ecx,%ecx
+ jmp .L05114shortcut
+.align 4
+.L05214loop:
+ movl 28(%edi),%edx
+.L05114shortcut:
+ movl (%edi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+ xorl 896(%ebp,%ecx,4),%eax
+ movl %eax,32(%edi)
+ xorl 4(%edi),%eax
+ movl %eax,36(%edi)
+ xorl 8(%edi),%eax
+ movl %eax,40(%edi)
+ xorl 12(%edi),%eax
+ movl %eax,44(%edi)
+ cmpl $6,%ecx
+ je .L05314break
+ incl %ecx
+ movl %eax,%edx
+ movl 16(%edi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+ movzbl -128(%ebp,%esi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+ movl %eax,48(%edi)
+ xorl 20(%edi),%eax
+ movl %eax,52(%edi)
+ xorl 24(%edi),%eax
+ movl %eax,56(%edi)
+ xorl 28(%edi),%eax
+ movl %eax,60(%edi)
+ addl $32,%edi
+ jmp .L05214loop
+.L05314break:
+ movl $14,48(%edi)
+ xorl %eax,%eax
+ jmp .L045exit
+.L040badpointer:
+ movl $-1,%eax
+.L045exit:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
+.align 16
+private_AES_set_encrypt_key:
+.L_private_AES_set_encrypt_key_begin:
+ call _x86_AES_set_encrypt_key
+ ret
+.size private_AES_set_encrypt_key,.-.L_private_AES_set_encrypt_key_begin
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
+.align 16
+private_AES_set_decrypt_key:
+.L_private_AES_set_decrypt_key_begin:
+ call _x86_AES_set_encrypt_key
+ cmpl $0,%eax
+ je .L054proceed
+ ret
+.L054proceed:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%esi
+ movl 240(%esi),%ecx
+ leal (,%ecx,4),%ecx
+ leal (%esi,%ecx,4),%edi
+.align 4
+.L055invert:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl (%edi),%ecx
+ movl 4(%edi),%edx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl %ecx,(%esi)
+ movl %edx,4(%esi)
+ movl 8(%esi),%eax
+ movl 12(%esi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl %eax,8(%edi)
+ movl %ebx,12(%edi)
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ addl $16,%esi
+ subl $16,%edi
+ cmpl %edi,%esi
+ jne .L055invert
+ movl 28(%esp),%edi
+ movl 240(%edi),%esi
+ leal -2(%esi,%esi,1),%esi
+ leal (%edi,%esi,8),%esi
+ movl %esi,28(%esp)
+ movl 16(%edi),%eax
+.align 4
+.L056permute:
+ addl $16,%edi
+ movl %eax,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %eax,%ebx
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ xorl %eax,%ecx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ roll $8,%eax
+ xorl %esi,%edx
+ movl 4(%edi),%ebp
+ xorl %ebx,%eax
+ xorl %edx,%ebx
+ xorl %ecx,%eax
+ roll $24,%ebx
+ xorl %edx,%ecx
+ xorl %edx,%eax
+ roll $16,%ecx
+ xorl %ebx,%eax
+ roll $8,%edx
+ xorl %ecx,%eax
+ movl %ebp,%ebx
+ xorl %edx,%eax
+ movl %eax,(%edi)
+ movl %ebx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ xorl %ecx,%esi
+ movl %esi,%ecx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %ebx,%ecx
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ xorl %ebx,%edx
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ roll $8,%ebx
+ xorl %esi,%eax
+ movl 8(%edi),%ebp
+ xorl %ecx,%ebx
+ xorl %eax,%ecx
+ xorl %edx,%ebx
+ roll $24,%ecx
+ xorl %eax,%edx
+ xorl %eax,%ebx
+ roll $16,%edx
+ xorl %ecx,%ebx
+ roll $8,%eax
+ xorl %edx,%ebx
+ movl %ebp,%ecx
+ xorl %eax,%ebx
+ movl %ebx,4(%edi)
+ movl %ecx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ecx,%ecx,1),%edx
+ subl %ebp,%esi
+ andl $4278124286,%edx
+ andl $454761243,%esi
+ xorl %edx,%esi
+ movl %esi,%edx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %ecx,%edx
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ xorl %ecx,%eax
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ roll $8,%ecx
+ xorl %esi,%ebx
+ movl 12(%edi),%ebp
+ xorl %edx,%ecx
+ xorl %ebx,%edx
+ xorl %eax,%ecx
+ roll $24,%edx
+ xorl %ebx,%eax
+ xorl %ebx,%ecx
+ roll $16,%eax
+ xorl %edx,%ecx
+ roll $8,%ebx
+ xorl %eax,%ecx
+ movl %ebp,%edx
+ xorl %ebx,%ecx
+ movl %ecx,8(%edi)
+ movl %edx,%esi
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%edx,%edx,1),%eax
+ subl %ebp,%esi
+ andl $4278124286,%eax
+ andl $454761243,%esi
+ xorl %eax,%esi
+ movl %esi,%eax
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%eax,%eax,1),%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ebx
+ andl $454761243,%esi
+ xorl %edx,%eax
+ xorl %ebx,%esi
+ movl %esi,%ebx
+ andl $2155905152,%esi
+ movl %esi,%ebp
+ shrl $7,%ebp
+ leal (%ebx,%ebx,1),%ecx
+ xorl %edx,%ebx
+ subl %ebp,%esi
+ andl $4278124286,%ecx
+ andl $454761243,%esi
+ roll $8,%edx
+ xorl %esi,%ecx
+ movl 16(%edi),%ebp
+ xorl %eax,%edx
+ xorl %ecx,%eax
+ xorl %ebx,%edx
+ roll $24,%eax
+ xorl %ecx,%ebx
+ xorl %ecx,%edx
+ roll $16,%ebx
+ xorl %eax,%edx
+ roll $8,%ecx
+ xorl %ebx,%edx
+ movl %ebp,%eax
+ xorl %ecx,%edx
+ movl %edx,12(%edi)
+ cmpl 28(%esp),%edi
+ jb .L056permute
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size private_AES_set_decrypt_key,.-.L_private_AES_set_decrypt_key_begin
+.byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89
+.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114
+.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.comm OPENSSL_ia32cap_P,8,4
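The generated module above implements the standard OpenSSL AES entry points: AES_encrypt, AES_decrypt, AES_cbc_encrypt, and the key-schedule workers exported as private_AES_set_encrypt_key / private_AES_set_decrypt_key. A minimal caller sketch against the public C API follows; the demo() wrapper is a made-up name and not part of this diff, but the calls and return conventions match what the assembly implements (AES_set_encrypt_key returns 0 on success, -1 for a bad pointer, -2 for an unsupported bit length, as seen at .L040badpointer and .L045exit above).

#include <string.h>
#include <openssl/aes.h>

/* Illustrative caller for the routines implemented above.  demo() is
   a hypothetical name; the OpenSSL calls and constants are real. */
int demo(const unsigned char key[16], const unsigned char iv[16],
         const unsigned char *in, unsigned char *out, size_t len)
{
    AES_KEY ks;
    unsigned char ivec[16];

    if (AES_set_encrypt_key(key, 128, &ks) != 0)
        return -1;                 /* -1: NULL pointer, -2: bad bits */

    memcpy(ivec, iv, 16);          /* AES_cbc_encrypt updates ivec */
    AES_cbc_encrypt(in, out, len, &ks, ivec, AES_ENCRYPT);
    return 0;
}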
diff --git a/main/openssl/crypto/aes/asm/aes-586.pl b/main/openssl/crypto/aes/asm/aes-586.pl
index aab40e6f..51b500dd 100755..100644
--- a/main/openssl/crypto/aes/asm/aes-586.pl
+++ b/main/openssl/crypto/aes/asm/aes-586.pl
@@ -39,13 +39,13 @@
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
# register window little-endian flavor could achieve slightly higher
# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
+# better performance on most recent µ-archs...
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
# up to 40% performance improvement of CBC benchmark results. 40% was
@@ -223,7 +223,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
$speed_limit=512; # chunks smaller than $speed_limit are
# processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
- # recent µ-archs], but ~5 times smaller!
+ # recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
# contention and in hope to "collect" 5% back
# in real-life applications...
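The $speed_limit threshold above is exactly what the generated AES_cbc_encrypt tests before picking a code path (the cmpl $512,%ecx / testl $15,%ecx / btl $28 sequence in front of .L015slow_way in the .S output earlier): only long, whole-block inputs on cores without the hyper-threading bit set take the large-table fast path. A rough C rendering of that dispatch, with illustrative names only:

#include <stddef.h>

/* Sketch of the path selection compiled into AES_cbc_encrypt above
   (see cmpl $512 / testl $15 / btl $28 before .L015slow_way).
   use_fast_path() and SPEED_LIMIT are illustrative, not OpenSSL API. */
#define SPEED_LIMIT 512                   /* $speed_limit in aes-586.pl */

static int use_fast_path(size_t len, unsigned int ia32cap)
{
    if (len < SPEED_LIMIT)    return 0;   /* short chunk: compact code */
    if (len & 15)             return 0;   /* not a whole number of blocks */
    if (ia32cap & (1u << 28)) return 0;   /* HTT: caches are shared, stay
                                             compact to limit contention */
    return 1;
}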
@@ -562,7 +562,7 @@ sub enctransform()
# Performance is not actually extraordinary in comparison to pure
# x86 code. In particular encrypt performance is virtually the same.
# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
# better on PIII:-) And additionally on the pros side this code
# eliminates redundant references to stack and thus relieves/
# minimizes the pressure on the memory bus.
@@ -2854,12 +2854,12 @@ sub enckey()
&set_label("exit");
&function_end("_x86_AES_set_encrypt_key");
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
-&function_begin_B("AES_set_encrypt_key");
+&function_begin_B("private_AES_set_encrypt_key");
&call ("_x86_AES_set_encrypt_key");
&ret ();
-&function_end_B("AES_set_encrypt_key");
+&function_end_B("private_AES_set_encrypt_key");
sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2916,9 +2916,9 @@ sub deckey()
&mov (&DWP(4*$i,$key),$tp1);
}
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
-&function_begin_B("AES_set_decrypt_key");
+&function_begin_B("private_AES_set_decrypt_key");
&call ("_x86_AES_set_encrypt_key");
&cmp ("eax",0);
&je (&label("proceed"));
@@ -2974,7 +2974,7 @@ sub deckey()
&jb (&label("permute"));
&xor ("eax","eax"); # return success
-&function_end("AES_set_decrypt_key");
+&function_end("private_AES_set_decrypt_key");
&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
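Both renames in the hunks above are mechanical: the assembly now exports private_AES_set_encrypt_key / private_AES_set_decrypt_key, while the public AES_set_*_key names become thin C wrappers in OpenSSL's C sources (behind FIPS-related guards in the 1.0.1 tree). A sketch of that relationship, assuming the usual aes_misc.c arrangement; the exact preprocessor guard is omitted and should be treated as an assumption:

#include <openssl/aes.h>

/* Sketch: how the public name reaches the renamed asm entry point.
   In OpenSSL 1.0.1 this wrapper sits in crypto/aes/aes_misc.c behind
   FIPS guards; the placement here is illustrative. */
int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
                        AES_KEY *key)
{
    return private_AES_set_encrypt_key(userKey, bits, key);
}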
diff --git a/main/openssl/crypto/aes/asm/aes-armv4.pl b/main/openssl/crypto/aes/asm/aes-armv4.pl
index c51ee1fb..86b86c4a 100644
--- a/main/openssl/crypto/aes/asm/aes-armv4.pl
+++ b/main/openssl/crypto/aes/asm/aes-armv4.pl
@@ -27,6 +27,11 @@
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~21.5 cycles per byte.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -46,6 +51,7 @@ $key="r11";
$rounds="r12";
$code=<<___;
+#include "arm_arch.h"
.text
.code 32
@@ -166,7 +172,7 @@ AES_encrypt:
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
-
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -195,10 +201,33 @@ AES_encrypt:
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
-
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
bl _armv4_AES_encrypt
ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@@ -227,11 +256,15 @@ AES_encrypt:
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@@ -271,11 +304,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2
- eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
+ eor $s1,$s1,$t1,ror#24
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
mov $s2,$s2,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@@ -284,16 +317,16 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2
- eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
+ eor $s2,$s2,$t2,ror#16
+ ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
mov $s3,$s3,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
eor $s0,$s0,$i1,ror#24
- ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
- eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16
+ eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2
- eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
+ eor $s1,$t1,$s1,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
mov $s2,$s2,lsr#24
- ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@@ -346,15 +379,15 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2
- eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
+ eor $s2,$t2,$s2,lsl#24
+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
mov $s3,$s3,lsr#24
- ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
eor $s0,$i1,$s0,lsl#8
- ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0]
+ ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16
@@ -371,10 +404,11 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global AES_set_encrypt_key
-.type AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,%function
.align 5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
@@ -392,12 +426,13 @@ AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+ sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov $rounds,r0 @ inp
mov lr,r1 @ bits
mov $key,r2 @ key
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -430,6 +465,22 @@ AES_set_encrypt_key:
orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8]
str $s3,[$key,#-4]
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$key],#16
+ str $s1,[$key,#-12]
+ str $s2,[$key,#-8]
+ str $s3,[$key,#-4]
+#endif
teq lr,#128
bne .Lnot128
@@ -466,6 +517,7 @@ AES_set_encrypt_key:
b .Ldone
.Lnot128:
+#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#19]
ldrb $t1,[$rounds,#18]
ldrb $t2,[$rounds,#17]
@@ -482,6 +534,16 @@ AES_set_encrypt_key:
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#16]
+ ldr $i3,[$rounds,#20]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
teq lr,#192
bne .Lnot192
@@ -526,6 +588,7 @@ AES_set_encrypt_key:
b .L192_loop
.Lnot192:
+#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#27]
ldrb $t1,[$rounds,#26]
ldrb $t2,[$rounds,#25]
@@ -542,6 +605,16 @@ AES_set_encrypt_key:
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
+#else
+ ldr $i2,[$rounds,#24]
+ ldr $i3,[$rounds,#28]
+#ifdef __ARMEL__
+ rev $i2,$i2
+ rev $i3,$i3
+#endif
+ str $i2,[$key],#8
+ str $i3,[$key,#-4]
+#endif
mov $rounds,#14
str $rounds,[$key,#240-32]
@@ -606,14 +679,14 @@ AES_set_encrypt_key:
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
-.global AES_set_decrypt_key
-.type AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,%function
.align 5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
- bl AES_set_encrypt_key
+ bl _armv4_AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
@@ -692,11 +765,15 @@ $code.=<<___;
bne .Lmix
mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -811,7 +888,7 @@ AES_decrypt:
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
-
+#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@@ -840,10 +917,33 @@ AES_decrypt:
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
-
+#else
+ ldr $s0,[$rounds,#0]
+ ldr $s1,[$rounds,#4]
+ ldr $s2,[$rounds,#8]
+ ldr $s3,[$rounds,#12]
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+#endif
bl _armv4_AES_decrypt
ldr $rounds,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev $s0,$s0
+ rev $s1,$s1
+ rev $s2,$s2
+ rev $s3,$s3
+#endif
+ str $s0,[$rounds,#0]
+ str $s1,[$rounds,#4]
+ str $s2,[$rounds,#8]
+ str $s3,[$rounds,#12]
+#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@@ -872,11 +972,15 @@ AES_decrypt:
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@@ -916,11 +1020,11 @@ _armv4_AES_decrypt:
and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16
- eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
+ eor $s1,$s1,$t1,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
mov $s2,$s2,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@@ -929,22 +1033,22 @@ _armv4_AES_decrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2
- eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
+ eor $s2,$s2,$t2,ror#8
+ ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
mov $s3,$s3,lsr#24
- ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
eor $s0,$s0,$i1,ror#8
- ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
+ ldr $i1,[$key],#16
eor $s1,$s1,$i2,ror#16
+ ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s2,$s2,$i3,ror#24
- ldr $i1,[$key],#16
- eor $s3,$s3,$t3,ror#8
ldr $t1,[$key,#-12]
- ldr $t2,[$key,#-8]
eor $s0,$s0,$i1
+ ldr $t2,[$key,#-8]
+ eor $s3,$s3,$t3,ror#8
ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1
@@ -985,11 +1089,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1
- eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
+ eor $t3,$t3,$i3,lsl#8
+ ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
- ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@@ -997,11 +1101,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
- eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
+ eor $t3,$t3,$i3,lsl#16
+ ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
- ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16
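
The repeated #if __ARM_ARCH__<7 / #else blocks in this file all encode one idea: pre-v7 cores assemble each input word from four ldrb loads, which works for either endianness, while ARMv7 loads whole words and fixes byte order with a single rev when running little-endian. A condensed sketch of the two paths (register names follow the generated listing; this just prints the text the module builds into $code):

#!/usr/bin/env perl
# Sketch of the two load paths introduced by this diff.
print <<'___';
#if __ARM_ARCH__>=7
	ldr	r0,[r12,#0]		@ one word load...
#ifdef __ARMEL__
	rev	r0,r0			@ ...plus a byte swap on LE cores
#endif
#else
	ldrb	r0,[r12,#3]		@ four byte loads merged with orr:
	ldrb	r4,[r12,#2]		@ endian-neutral, runs on ARMv4
	ldrb	r5,[r12,#1]
	ldrb	r6,[r12,#0]
	orr	r0,r0,r4,lsl#8
	orr	r0,r0,r5,lsl#16
	orr	r0,r0,r6,lsl#24
#endif
___
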
diff --git a/main/openssl/crypto/aes/asm/aes-armv4.s b/main/openssl/crypto/aes/asm/aes-armv4.s
index 27c681c7..2697d4ce 100644
--- a/main/openssl/crypto/aes/asm/aes-armv4.s
+++ b/main/openssl/crypto/aes/asm/aes-armv4.s
@@ -1,3 +1,4 @@
+#include "arm_arch.h"
.text
.code 32
@@ -118,7 +119,7 @@ AES_encrypt:
mov r12,r0 @ inp
mov r11,r2
sub r10,r3,#AES_encrypt-AES_Te @ Te
-
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -147,10 +148,33 @@ AES_encrypt:
orr r3,r3,r4,lsl#8
orr r3,r3,r5,lsl#16
orr r3,r3,r6,lsl#24
-
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#endif
bl _armv4_AES_encrypt
ldr r12,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r12,#0]
+ str r1,[r12,#4]
+ str r2,[r12,#8]
+ str r3,[r12,#12]
+#else
mov r4,r0,lsr#24 @ write output in endian-neutral
mov r5,r0,lsr#16 @ manner...
mov r6,r0,lsr#8
@@ -179,11 +203,15 @@ AES_encrypt:
strb r5,[r12,#13]
strb r6,[r12,#14]
strb r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@@ -223,11 +251,11 @@ _armv4_AES_encrypt:
and r8,lr,r2,lsr#16 @ i1
eor r6,r6,r9,ror#8
and r9,lr,r2
- eor r1,r1,r4,ror#24
ldr r7,[r10,r7,lsl#2] @ Te2[s2>>8]
+ eor r1,r1,r4,ror#24
+ ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16]
mov r2,r2,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Te1[s2>>16]
ldr r9,[r10,r9,lsl#2] @ Te3[s2>>0]
eor r0,r0,r7,ror#16
ldr r2,[r10,r2,lsl#2] @ Te0[s2>>24]
@@ -236,16 +264,16 @@ _armv4_AES_encrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r6,r9,ror#16
and r9,lr,r3,lsr#16 @ i2
- eor r2,r2,r5,ror#16
ldr r7,[r10,r7,lsl#2] @ Te3[s3>>0]
+ eor r2,r2,r5,ror#16
+ ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8]
mov r3,r3,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Te2[s3>>8]
ldr r9,[r10,r9,lsl#2] @ Te1[s3>>16]
eor r0,r0,r7,ror#24
- ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24]
- eor r1,r1,r8,ror#16
ldr r7,[r11],#16
+ eor r1,r1,r8,ror#16
+ ldr r3,[r10,r3,lsl#2] @ Te0[s3>>24]
eor r2,r2,r9,ror#8
ldr r4,[r11,#-12]
eor r3,r3,r6,ror#8
@@ -285,11 +313,11 @@ _armv4_AES_encrypt:
and r8,lr,r2,lsr#16 @ i1
eor r6,r9,r6,lsl#8
and r9,lr,r2
- eor r1,r4,r1,lsl#24
ldrb r7,[r10,r7,lsl#2] @ Te4[s2>>8]
+ eor r1,r4,r1,lsl#24
+ ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16]
mov r2,r2,lsr#24
- ldrb r8,[r10,r8,lsl#2] @ Te4[s2>>16]
ldrb r9,[r10,r9,lsl#2] @ Te4[s2>>0]
eor r0,r7,r0,lsl#8
ldrb r2,[r10,r2,lsl#2] @ Te4[s2>>24]
@@ -298,15 +326,15 @@ _armv4_AES_encrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r9,r6,lsl#8
and r9,lr,r3,lsr#16 @ i2
- eor r2,r5,r2,lsl#24
ldrb r7,[r10,r7,lsl#2] @ Te4[s3>>0]
+ eor r2,r5,r2,lsl#24
+ ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8]
mov r3,r3,lsr#24
- ldrb r8,[r10,r8,lsl#2] @ Te4[s3>>8]
ldrb r9,[r10,r9,lsl#2] @ Te4[s3>>16]
eor r0,r7,r0,lsl#8
- ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24]
ldr r7,[r11,#0]
+ ldrb r3,[r10,r3,lsl#2] @ Te4[s3>>24]
eor r1,r1,r8,lsl#8
ldr r4,[r11,#4]
eor r2,r2,r9,lsl#16
@@ -323,10 +351,11 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
-.global AES_set_encrypt_key
-.type AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,%function
.align 5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
sub r3,pc,#8 @ AES_set_encrypt_key
teq r0,#0
moveq r0,#-1
@@ -344,12 +373,13 @@ AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub r10,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+ sub r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov r12,r0 @ inp
mov lr,r1 @ bits
mov r11,r2 @ key
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -382,6 +412,22 @@ AES_set_encrypt_key:
orr r3,r3,r6,lsl#24
str r2,[r11,#-8]
str r3,[r11,#-4]
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r11],#16
+ str r1,[r11,#-12]
+ str r2,[r11,#-8]
+ str r3,[r11,#-4]
+#endif
teq lr,#128
bne .Lnot128
@@ -418,6 +464,7 @@ AES_set_encrypt_key:
b .Ldone
.Lnot128:
+#if __ARM_ARCH__<7
ldrb r8,[r12,#19]
ldrb r4,[r12,#18]
ldrb r5,[r12,#17]
@@ -434,6 +481,16 @@ AES_set_encrypt_key:
str r8,[r11],#8
orr r9,r9,r6,lsl#24
str r9,[r11,#-4]
+#else
+ ldr r8,[r12,#16]
+ ldr r9,[r12,#20]
+#ifdef __ARMEL__
+ rev r8,r8
+ rev r9,r9
+#endif
+ str r8,[r11],#8
+ str r9,[r11,#-4]
+#endif
teq lr,#192
bne .Lnot192
@@ -478,6 +535,7 @@ AES_set_encrypt_key:
b .L192_loop
.Lnot192:
+#if __ARM_ARCH__<7
ldrb r8,[r12,#27]
ldrb r4,[r12,#26]
ldrb r5,[r12,#25]
@@ -494,6 +552,16 @@ AES_set_encrypt_key:
str r8,[r11],#8
orr r9,r9,r6,lsl#24
str r9,[r11,#-4]
+#else
+ ldr r8,[r12,#24]
+ ldr r9,[r12,#28]
+#ifdef __ARMEL__
+ rev r8,r8
+ rev r9,r9
+#endif
+ str r8,[r11],#8
+ str r9,[r11,#-4]
+#endif
mov r12,#14
str r12,[r11,#240-32]
@@ -558,14 +626,14 @@ AES_set_encrypt_key:
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
-.global AES_set_decrypt_key
-.type AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,%function
.align 5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
- bl AES_set_encrypt_key
+ bl _armv4_AES_set_encrypt_key
teq r0,#0
ldrne lr,[sp],#4 @ pop lr
bne .Labrt
@@ -639,11 +707,15 @@ AES_set_decrypt_key:
bne .Lmix
mov r0,#0
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.type AES_Td,%object
.align 5
@@ -758,7 +830,7 @@ AES_decrypt:
mov r12,r0 @ inp
mov r11,r2
sub r10,r3,#AES_decrypt-AES_Td @ Td
-
+#if __ARM_ARCH__<7
ldrb r0,[r12,#3] @ load input data in endian-neutral
ldrb r4,[r12,#2] @ manner...
ldrb r5,[r12,#1]
@@ -787,10 +859,33 @@ AES_decrypt:
orr r3,r3,r4,lsl#8
orr r3,r3,r5,lsl#16
orr r3,r3,r6,lsl#24
-
+#else
+ ldr r0,[r12,#0]
+ ldr r1,[r12,#4]
+ ldr r2,[r12,#8]
+ ldr r3,[r12,#12]
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#endif
bl _armv4_AES_decrypt
ldr r12,[sp],#4 @ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ str r0,[r12,#0]
+ str r1,[r12,#4]
+ str r2,[r12,#8]
+ str r3,[r12,#12]
+#else
mov r4,r0,lsr#24 @ write output in endian-neutral
mov r5,r0,lsr#16 @ manner...
mov r6,r0,lsr#8
@@ -819,11 +914,15 @@ AES_decrypt:
strb r5,[r12,#13]
strb r6,[r12,#14]
strb r3,[r12,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@@ -863,11 +962,11 @@ _armv4_AES_decrypt:
and r8,lr,r2 @ i1
eor r6,r9,r6,ror#8
and r9,lr,r2,lsr#16
- eor r1,r1,r4,ror#8
ldr r7,[r10,r7,lsl#2] @ Td2[s2>>8]
+ eor r1,r1,r4,ror#8
+ ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0]
mov r2,r2,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Td3[s2>>0]
ldr r9,[r10,r9,lsl#2] @ Td1[s2>>16]
eor r0,r0,r7,ror#16
ldr r2,[r10,r2,lsl#2] @ Td0[s2>>24]
@@ -876,22 +975,22 @@ _armv4_AES_decrypt:
and r8,lr,r3,lsr#8 @ i1
eor r6,r9,r6,ror#8
and r9,lr,r3 @ i2
- eor r2,r2,r5,ror#8
ldr r7,[r10,r7,lsl#2] @ Td1[s3>>16]
+ eor r2,r2,r5,ror#8
+ ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8]
mov r3,r3,lsr#24
- ldr r8,[r10,r8,lsl#2] @ Td2[s3>>8]
ldr r9,[r10,r9,lsl#2] @ Td3[s3>>0]
eor r0,r0,r7,ror#8
- ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24]
+ ldr r7,[r11],#16
eor r1,r1,r8,ror#16
+ ldr r3,[r10,r3,lsl#2] @ Td0[s3>>24]
eor r2,r2,r9,ror#24
- ldr r7,[r11],#16
- eor r3,r3,r6,ror#8
ldr r4,[r11,#-12]
- ldr r5,[r11,#-8]
eor r0,r0,r7
+ ldr r5,[r11,#-8]
+ eor r3,r3,r6,ror#8
ldr r6,[r11,#-4]
and r7,lr,r0,lsr#16
eor r1,r1,r4
@@ -932,11 +1031,11 @@ _armv4_AES_decrypt:
and r7,lr,r2,lsr#8 @ i0
eor r5,r5,r8,lsl#8
and r8,lr,r2 @ i1
- eor r6,r6,r9,lsl#8
ldrb r7,[r10,r7] @ Td4[s2>>8]
+ eor r6,r6,r9,lsl#8
+ ldrb r8,[r10,r8] @ Td4[s2>>0]
and r9,lr,r2,lsr#16
- ldrb r8,[r10,r8] @ Td4[s2>>0]
ldrb r2,[r10,r2,lsr#24] @ Td4[s2>>24]
eor r0,r0,r7,lsl#8
ldrb r9,[r10,r9] @ Td4[s2>>16]
@@ -944,11 +1043,11 @@ _armv4_AES_decrypt:
and r7,lr,r3,lsr#16 @ i0
eor r2,r5,r2,lsl#16
and r8,lr,r3,lsr#8 @ i1
- eor r6,r6,r9,lsl#16
ldrb r7,[r10,r7] @ Td4[s3>>16]
+ eor r6,r6,r9,lsl#16
+ ldrb r8,[r10,r8] @ Td4[s3>>8]
and r9,lr,r3 @ i2
- ldrb r8,[r10,r8] @ Td4[s3>>8]
ldrb r9,[r10,r9] @ Td4[s3>>0]
ldrb r3,[r10,r3,lsr#24] @ Td4[s3>>24]
eor r0,r0,r7,lsl#16
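
The other change repeated throughout these listings is the epilogue: from ARMv5 on, loading pc straight from the stack with ldmia sp!,{...,pc} performs an interworking return, so the pop-into-lr sequence kept for ARMv4 compatibility collapses to one instruction. A sketch of the guarded return as the diff applies it:

#!/usr/bin/env perl
# Sketch: the two return sequences selected by __ARM_ARCH__ above.
print <<'___';
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}		@ pop and return in one go
#else
	ldmia	sp!,{r4-r12,lr}		@ v4 ldm to pc wouldn't interwork,
	tst	lr,#1			@ so test the Thumb bit by hand:
	moveq	pc,lr			@ plain return to ARM-state callers
	.word	0xe12fff1e		@ bx lr, encoded for pre-v4T tools
#endif
___
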
diff --git a/main/openssl/crypto/aes/asm/aes-mips.S b/main/openssl/crypto/aes/asm/aes-mips.S
new file mode 100644
index 00000000..f5750bf8
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-mips.S
@@ -0,0 +1,1337 @@
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if !defined(__vxworks) || defined(__pic__)
+.option pic2
+#endif
+.set noat
+.align 5
+.ent _mips_AES_encrypt
+_mips_AES_encrypt:
+ .frame $29,0,$31
+ .set reorder
+ lw $12,0($6)
+ lw $13,4($6)
+ lw $14,8($6)
+ lw $15,12($6)
+ lw $30,240($6)
+ add $3,$6,16
+
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+
+ sub $30,1
+ srl $1,$9,6
+.Loop_enc:
+ srl $2,$10,6
+ srl $24,$11,6
+ srl $25,$8,6
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $12,2($1) # Te1[s1>>16]
+ lwl $13,2($2) # Te1[s2>>16]
+ lwl $14,2($24) # Te1[s3>>16]
+ lwl $15,2($25) # Te1[s0>>16]
+ lwr $12,3($1) # Te1[s1>>16]
+ lwr $13,3($2) # Te1[s2>>16]
+ lwr $14,3($24) # Te1[s3>>16]
+ lwr $15,3($25) # Te1[s0>>16]
+
+ srl $1,$10,14
+ srl $2,$11,14
+ srl $24,$8,14
+ srl $25,$9,14
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $16,1($1) # Te2[s2>>8]
+ lwl $17,1($2) # Te2[s3>>8]
+ lwl $18,1($24) # Te2[s0>>8]
+ lwl $19,1($25) # Te2[s1>>8]
+ lwr $16,2($1) # Te2[s2>>8]
+ lwr $17,2($2) # Te2[s3>>8]
+ lwr $18,2($24) # Te2[s0>>8]
+ lwr $19,2($25) # Te2[s1>>8]
+
+ srl $1,$11,22
+ srl $2,$8,22
+ srl $24,$9,22
+ srl $25,$10,22
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $20,0($1) # Te3[s3]
+ lwl $21,0($2) # Te3[s0]
+ lwl $22,0($24) # Te3[s1]
+ lwl $23,0($25) # Te3[s2]
+ lwr $20,1($1) # Te3[s3]
+ lwr $21,1($2) # Te3[s0]
+ lwr $22,1($24) # Te3[s1]
+ lwr $23,1($25) # Te3[s2]
+
+ sll $1,$8,2
+ sll $2,$9,2
+ sll $24,$10,2
+ sll $25,$11,2
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+ lw $16,0($1) # Te0[s0>>24]
+ lw $17,0($2) # Te0[s1>>24]
+ lw $18,0($24) # Te0[s2>>24]
+ lw $19,0($25) # Te0[s3>>24]
+
+ lw $8,0($3)
+ lw $9,4($3)
+ lw $10,8($3)
+ lw $11,12($3)
+
+ xor $12,$20
+ xor $13,$21
+ xor $14,$22
+ xor $15,$23
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ sub $30,1
+ add $3,16
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+ .set noreorder
+ bnez $30,.Loop_enc
+ srl $1,$9,6
+
+ .set reorder
+ srl $2,$10,6
+ srl $24,$11,6
+ srl $25,$8,6
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $12,2($1) # Te4[s1>>16]
+ lbu $13,2($2) # Te4[s2>>16]
+ lbu $14,2($24) # Te4[s3>>16]
+ lbu $15,2($25) # Te4[s0>>16]
+
+ srl $1,$10,14
+ srl $2,$11,14
+ srl $24,$8,14
+ srl $25,$9,14
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $16,2($1) # Te4[s2>>8]
+ lbu $17,2($2) # Te4[s3>>8]
+ lbu $18,2($24) # Te4[s0>>8]
+ lbu $19,2($25) # Te4[s1>>8]
+
+ sll $1,$8,2
+ sll $2,$9,2
+ sll $24,$10,2
+ sll $25,$11,2
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $20,2($1) # Te4[s0>>24]
+ lbu $21,2($2) # Te4[s1>>24]
+ lbu $22,2($24) # Te4[s2>>24]
+ lbu $23,2($25) # Te4[s3>>24]
+
+ srl $1,$11,22
+ srl $2,$8,22
+ srl $24,$9,22
+ srl $25,$10,22
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+
+ sll $12,$12,8
+ sll $13,$13,8
+ sll $14,$14,8
+ sll $15,$15,8
+
+ sll $16,$16,16
+ sll $17,$17,16
+ sll $18,$18,16
+ sll $19,$19,16
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $16,2($1) # Te4[s3]
+ lbu $17,2($2) # Te4[s0]
+ lbu $18,2($24) # Te4[s1]
+ lbu $19,2($25) # Te4[s2]
+
+ #sll $20,$20,0
+ #sll $21,$21,0
+ #sll $22,$22,0
+ #sll $23,$23,0
+
+ lw $8,0($3)
+ lw $9,4($3)
+ lw $10,8($3)
+ lw $11,12($3)
+
+ xor $12,$20
+ xor $13,$21
+ xor $14,$22
+ xor $15,$23
+
+ sll $16,$16,24
+ sll $17,$17,24
+ sll $18,$18,24
+ sll $19,$19,24
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+
+ jr $31
+.end _mips_AES_encrypt
+
+.align 5
+.globl AES_encrypt
+.ent AES_encrypt
+AES_encrypt:
+ .frame $29,64,$31
+ .mask 3237937152,-4
+ .set noreorder
+ .cpload $25
+ sub $29,64
+ sw $31,64-1*4($29)
+ sw $30,64-2*4($29)
+ sw $23,64-3*4($29)
+ sw $22,64-4*4($29)
+ sw $21,64-5*4($29)
+ sw $20,64-6*4($29)
+ sw $19,64-7*4($29)
+ sw $18,64-8*4($29)
+ sw $17,64-9*4($29)
+ sw $16,64-10*4($29)
+ .set reorder
+ la $7,AES_Te # PIC-ified 'load address'
+
+ lwl $8,0+3($4)
+ lwl $9,4+3($4)
+ lwl $10,8+3($4)
+ lwl $11,12+3($4)
+ lwr $8,0+0($4)
+ lwr $9,4+0($4)
+ lwr $10,8+0($4)
+ lwr $11,12+0($4)
+
+ bal _mips_AES_encrypt
+
+ swr $8,0+0($5)
+ swr $9,4+0($5)
+ swr $10,8+0($5)
+ swr $11,12+0($5)
+ swl $8,0+3($5)
+ swl $9,4+3($5)
+ swl $10,8+3($5)
+ swl $11,12+3($5)
+
+ .set noreorder
+ lw $31,64-1*4($29)
+ lw $30,64-2*4($29)
+ lw $23,64-3*4($29)
+ lw $22,64-4*4($29)
+ lw $21,64-5*4($29)
+ lw $20,64-6*4($29)
+ lw $19,64-7*4($29)
+ lw $18,64-8*4($29)
+ lw $17,64-9*4($29)
+ lw $16,64-10*4($29)
+ jr $31
+ add $29,64
+.end AES_encrypt
+.align 5
+.ent _mips_AES_decrypt
+_mips_AES_decrypt:
+ .frame $29,0,$31
+ .set reorder
+ lw $12,0($6)
+ lw $13,4($6)
+ lw $14,8($6)
+ lw $15,12($6)
+ lw $30,240($6)
+ add $3,$6,16
+
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+
+ sub $30,1
+ srl $1,$11,6
+.Loop_dec:
+ srl $2,$8,6
+ srl $24,$9,6
+ srl $25,$10,6
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $12,2($1) # Td1[s3>>16]
+ lwl $13,2($2) # Td1[s0>>16]
+ lwl $14,2($24) # Td1[s1>>16]
+ lwl $15,2($25) # Td1[s2>>16]
+ lwr $12,3($1) # Td1[s3>>16]
+ lwr $13,3($2) # Td1[s0>>16]
+ lwr $14,3($24) # Td1[s1>>16]
+ lwr $15,3($25) # Td1[s2>>16]
+
+ srl $1,$10,14
+ srl $2,$11,14
+ srl $24,$8,14
+ srl $25,$9,14
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $16,1($1) # Td2[s2>>8]
+ lwl $17,1($2) # Td2[s3>>8]
+ lwl $18,1($24) # Td2[s0>>8]
+ lwl $19,1($25) # Td2[s1>>8]
+ lwr $16,2($1) # Td2[s2>>8]
+ lwr $17,2($2) # Td2[s3>>8]
+ lwr $18,2($24) # Td2[s0>>8]
+ lwr $19,2($25) # Td2[s1>>8]
+
+ srl $1,$9,22
+ srl $2,$10,22
+ srl $24,$11,22
+ srl $25,$8,22
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lwl $20,0($1) # Td3[s1]
+ lwl $21,0($2) # Td3[s2]
+ lwl $22,0($24) # Td3[s3]
+ lwl $23,0($25) # Td3[s0]
+ lwr $20,1($1) # Td3[s1]
+ lwr $21,1($2) # Td3[s2]
+ lwr $22,1($24) # Td3[s3]
+ lwr $23,1($25) # Td3[s0]
+
+ sll $1,$8,2
+ sll $2,$9,2
+ sll $24,$10,2
+ sll $25,$11,2
+ and $1,0x3fc
+ and $2,0x3fc
+ and $24,0x3fc
+ and $25,0x3fc
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+
+ lw $16,0($1) # Td0[s0>>24]
+ lw $17,0($2) # Td0[s1>>24]
+ lw $18,0($24) # Td0[s2>>24]
+ lw $19,0($25) # Td0[s3>>24]
+
+ lw $8,0($3)
+ lw $9,4($3)
+ lw $10,8($3)
+ lw $11,12($3)
+
+ xor $12,$20
+ xor $13,$21
+ xor $14,$22
+ xor $15,$23
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ sub $30,1
+ add $3,16
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+ .set noreorder
+ bnez $30,.Loop_dec
+ srl $1,$11,6
+
+ .set reorder
+ lw $16,1024($7) # prefetch Td4
+ lw $17,1024+32($7)
+ lw $18,1024+64($7)
+ lw $19,1024+96($7)
+ lw $20,1024+128($7)
+ lw $21,1024+160($7)
+ lw $22,1024+192($7)
+ lw $23,1024+224($7)
+
+ srl $1,$11,8
+ srl $2,$8,8
+ srl $24,$9,8
+ srl $25,$10,8
+ and $1,0xff
+ and $2,0xff
+ and $24,0xff
+ and $25,0xff
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $12,1024($1) # Td4[s3>>16]
+ lbu $13,1024($2) # Td4[s0>>16]
+ lbu $14,1024($24) # Td4[s1>>16]
+ lbu $15,1024($25) # Td4[s2>>16]
+
+ srl $1,$10,16
+ srl $2,$11,16
+ srl $24,$8,16
+ srl $25,$9,16
+ and $1,0xff
+ and $2,0xff
+ and $24,0xff
+ and $25,0xff
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $16,1024($1) # Td4[s2>>8]
+ lbu $17,1024($2) # Td4[s3>>8]
+ lbu $18,1024($24) # Td4[s0>>8]
+ lbu $19,1024($25) # Td4[s1>>8]
+
+ and $1,$8,0xff
+ and $2,$9,0xff
+ and $24,$10,0xff
+ and $25,$11,0xff
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $20,1024($1) # Td4[s0>>24]
+ lbu $21,1024($2) # Td4[s1>>24]
+ lbu $22,1024($24) # Td4[s2>>24]
+ lbu $23,1024($25) # Td4[s3>>24]
+
+ srl $1,$9,24
+ srl $2,$10,24
+ srl $24,$11,24
+ srl $25,$8,24
+
+ sll $12,$12,8
+ sll $13,$13,8
+ sll $14,$14,8
+ sll $15,$15,8
+
+ sll $16,$16,16
+ sll $17,$17,16
+ sll $18,$18,16
+ sll $19,$19,16
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $16,1024($1) # Td4[s1]
+ lbu $17,1024($2) # Td4[s2]
+ lbu $18,1024($24) # Td4[s3]
+ lbu $19,1024($25) # Td4[s0]
+
+ #sll $20,$20,0
+ #sll $21,$21,0
+ #sll $22,$22,0
+ #sll $23,$23,0
+
+ lw $8,0($3)
+ lw $9,4($3)
+ lw $10,8($3)
+ lw $11,12($3)
+
+ sll $16,$16,24
+ sll $17,$17,24
+ sll $18,$18,24
+ sll $19,$19,24
+
+
+ xor $12,$20
+ xor $13,$21
+ xor $14,$22
+ xor $15,$23
+
+ xor $12,$16
+ xor $13,$17
+ xor $14,$18
+ xor $15,$19
+
+ xor $8,$12
+ xor $9,$13
+ xor $10,$14
+ xor $11,$15
+
+ jr $31
+.end _mips_AES_decrypt
+
+.align 5
+.globl AES_decrypt
+.ent AES_decrypt
+AES_decrypt:
+ .frame $29,64,$31
+ .mask 3237937152,-4
+ .set noreorder
+ .cpload $25
+ sub $29,64
+ sw $31,64-1*4($29)
+ sw $30,64-2*4($29)
+ sw $23,64-3*4($29)
+ sw $22,64-4*4($29)
+ sw $21,64-5*4($29)
+ sw $20,64-6*4($29)
+ sw $19,64-7*4($29)
+ sw $18,64-8*4($29)
+ sw $17,64-9*4($29)
+ sw $16,64-10*4($29)
+ .set reorder
+ la $7,AES_Td # PIC-ified 'load address'
+
+ lwl $8,0+3($4)
+ lwl $9,4+3($4)
+ lwl $10,8+3($4)
+ lwl $11,12+3($4)
+ lwr $8,0+0($4)
+ lwr $9,4+0($4)
+ lwr $10,8+0($4)
+ lwr $11,12+0($4)
+
+ bal _mips_AES_decrypt
+
+ swr $8,0+0($5)
+ swr $9,4+0($5)
+ swr $10,8+0($5)
+ swr $11,12+0($5)
+ swl $8,0+3($5)
+ swl $9,4+3($5)
+ swl $10,8+3($5)
+ swl $11,12+3($5)
+
+ .set noreorder
+ lw $31,64-1*4($29)
+ lw $30,64-2*4($29)
+ lw $23,64-3*4($29)
+ lw $22,64-4*4($29)
+ lw $21,64-5*4($29)
+ lw $20,64-6*4($29)
+ lw $19,64-7*4($29)
+ lw $18,64-8*4($29)
+ lw $17,64-9*4($29)
+ lw $16,64-10*4($29)
+ jr $31
+ add $29,64
+.end AES_decrypt
+.align 5
+.ent _mips_AES_set_encrypt_key
+_mips_AES_set_encrypt_key:
+ .frame $29,0,$31
+ .set noreorder
+ beqz $4,.Lekey_done
+ li $2,-1
+ beqz $6,.Lekey_done
+ add $3,$7,1024+256
+
+ .set reorder
+ lwl $8,0+3($4) # load 128 bits
+ lwl $9,4+3($4)
+ lwl $10,8+3($4)
+ lwl $11,12+3($4)
+ li $1,128
+ lwr $8,0+0($4)
+ lwr $9,4+0($4)
+ lwr $10,8+0($4)
+ lwr $11,12+0($4)
+ .set noreorder
+ beq $5,$1,.L128bits
+ li $30,10
+
+ .set reorder
+ lwl $12,16+3($4) # load 192 bits
+ lwl $13,20+3($4)
+ li $1,192
+ lwr $12,16+0($4)
+ lwr $13,20+0($4)
+ .set noreorder
+ beq $5,$1,.L192bits
+ li $30,8
+
+ .set reorder
+ lwl $14,24+3($4) # load 256 bits
+ lwl $15,28+3($4)
+ li $1,256
+ lwr $14,24+0($4)
+ lwr $15,28+0($4)
+ .set noreorder
+ beq $5,$1,.L256bits
+ li $30,7
+
+ b .Lekey_done
+ li $2,-2
+
+.align 4
+.L128bits:
+ .set reorder
+ srl $1,$11,16
+ srl $2,$11,8
+ and $1,0xff
+ and $2,0xff
+ and $24,$11,0xff
+ srl $25,$11,24
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $1,1024($1)
+ lbu $2,1024($2)
+ lbu $24,1024($24)
+ lbu $25,1024($25)
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ sw $11,12($6)
+ sub $30,1
+ add $6,16
+
+ sll $1,$1,8
+ #sll $2,$2,0
+ sll $24,$24,24
+ sll $25,$25,16
+
+ xor $8,$1
+ lw $1,0($3)
+ xor $8,$2
+ xor $8,$24
+ xor $8,$25
+ xor $8,$1
+
+ xor $9,$8
+ xor $10,$9
+ xor $11,$10
+
+ .set noreorder
+ bnez $30,.L128bits
+ add $3,4
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ li $30,10
+ sw $11,12($6)
+ li $2,0
+ sw $30,80($6)
+ b .Lekey_done
+ sub $6,10*16
+
+.align 4
+.L192bits:
+ .set reorder
+ srl $1,$13,16
+ srl $2,$13,8
+ and $1,0xff
+ and $2,0xff
+ and $24,$13,0xff
+ srl $25,$13,24
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $1,1024($1)
+ lbu $2,1024($2)
+ lbu $24,1024($24)
+ lbu $25,1024($25)
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ sw $11,12($6)
+ sw $12,16($6)
+ sw $13,20($6)
+ sub $30,1
+ add $6,24
+
+ sll $1,$1,8
+ #sll $2,$2,0
+ sll $24,$24,24
+ sll $25,$25,16
+
+ xor $8,$1
+ lw $1,0($3)
+ xor $8,$2
+ xor $8,$24
+ xor $8,$25
+ xor $8,$1
+
+ xor $9,$8
+ xor $10,$9
+ xor $11,$10
+ xor $12,$11
+ xor $13,$12
+
+ .set noreorder
+ bnez $30,.L192bits
+ add $3,4
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ li $30,12
+ sw $11,12($6)
+ li $2,0
+ sw $30,48($6)
+ b .Lekey_done
+ sub $6,12*16
+
+.align 4
+.L256bits:
+ .set reorder
+ srl $1,$15,16
+ srl $2,$15,8
+ and $1,0xff
+ and $2,0xff
+ and $24,$15,0xff
+ srl $25,$15,24
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $1,1024($1)
+ lbu $2,1024($2)
+ lbu $24,1024($24)
+ lbu $25,1024($25)
+
+ sw $8,0($6)
+ sw $9,4($6)
+ sw $10,8($6)
+ sw $11,12($6)
+ sw $12,16($6)
+ sw $13,20($6)
+ sw $14,24($6)
+ sw $15,28($6)
+ sub $30,1
+
+ sll $1,$1,8
+ #sll $2,$2,0
+ sll $24,$24,24
+ sll $25,$25,16
+
+ xor $8,$1
+ lw $1,0($3)
+ xor $8,$2
+ xor $8,$24
+ xor $8,$25
+ xor $8,$1
+
+ xor $9,$8
+ xor $10,$9
+ xor $11,$10
+ beqz $30,.L256bits_done
+
+ srl $1,$11,24
+ srl $2,$11,16
+ srl $24,$11,8
+ and $25,$11,0xff
+ and $2,0xff
+ and $24,0xff
+ add $1,$7
+ add $2,$7
+ add $24,$7
+ add $25,$7
+ lbu $1,1024($1)
+ lbu $2,1024($2)
+ lbu $24,1024($24)
+ lbu $25,1024($25)
+ sll $1,24
+ sll $2,16
+ sll $24,8
+
+ xor $12,$1
+ xor $12,$2
+ xor $12,$24
+ xor $12,$25
+
+ xor $13,$12
+ xor $14,$13
+ xor $15,$14
+
+ add $6,32
+ .set noreorder
+ b .L256bits
+ add $3,4
+
+.L256bits_done:
+ sw $8,32($6)
+ sw $9,36($6)
+ sw $10,40($6)
+ li $30,14
+ sw $11,44($6)
+ li $2,0
+ sw $30,48($6)
+ sub $6,12*16
+
+.Lekey_done:
+ jr $31
+ nop
+.end _mips_AES_set_encrypt_key
+
+.globl private_AES_set_encrypt_key
+.ent private_AES_set_encrypt_key
+private_AES_set_encrypt_key:
+ .frame $29,32,$31
+ .mask 3221225472,-4
+ .set noreorder
+ .cpload $25
+ sub $29,32
+ sw $31,32-1*4($29)
+ sw $30,32-2*4($29)
+ .set reorder
+ la $7,AES_Te # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ .set noreorder
+ move $4,$2
+ lw $31,32-1*4($29)
+ lw $30,32-2*4($29)
+ jr $31
+ add $29,32
+.end private_AES_set_encrypt_key
+.align 5
+.globl private_AES_set_decrypt_key
+.ent private_AES_set_decrypt_key
+private_AES_set_decrypt_key:
+ .frame $29,32,$31
+ .mask 3221225472,-4
+ .set noreorder
+ .cpload $25
+ sub $29,32
+ sw $31,32-1*4($29)
+ sw $30,32-2*4($29)
+ .set reorder
+ la $7,AES_Te # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ bltz $2,.Ldkey_done
+
+ sll $1,$30,4
+ add $4,$6,0
+ add $5,$6,$1
+.align 4
+.Lswap:
+ lw $8,0($4)
+ lw $9,4($4)
+ lw $10,8($4)
+ lw $11,12($4)
+ lw $12,0($5)
+ lw $13,4($5)
+ lw $14,8($5)
+ lw $15,12($5)
+ sw $8,0($5)
+ sw $9,4($5)
+ sw $10,8($5)
+ sw $11,12($5)
+ add $4,16
+ sub $5,16
+ sw $12,-16($4)
+ sw $13,-12($4)
+ sw $14,-8($4)
+ sw $15,-4($4)
+ bne $4,$5,.Lswap
+
+ lw $8,16($6) # modulo-scheduled
+ lui $2,0x8080
+ sub $30,1
+ or $2,0x8080
+ sll $30,2
+ add $6,16
+ lui $25,0x1b1b
+ nor $24,$0,$2
+ or $25,0x1b1b
+.align 4
+.Lmix:
+ and $1,$8,$2
+ and $9,$8,$24
+ srl $10,$1,7
+ addu $9,$9 # tp2<<1
+ subu $1,$10
+ and $1,$25
+ xor $9,$1
+
+ and $1,$9,$2
+ and $10,$9,$24
+ srl $11,$1,7
+ addu $10,$10 # tp4<<1
+ subu $1,$11
+ and $1,$25
+ xor $10,$1
+
+ and $1,$10,$2
+ and $11,$10,$24
+ srl $12,$1,7
+ addu $11,$11 # tp8<<1
+ subu $1,$12
+ and $1,$25
+ xor $11,$1
+
+ xor $12,$11,$8
+ xor $15,$11,$10
+ xor $13,$12,$9
+ xor $14,$12,$10
+
+ sll $8,$14,16
+ xor $15,$9
+ srl $9,$14,16
+ xor $15,$8
+ sll $8,$12,8
+ xor $15,$9
+ srl $9,$12,24
+ xor $15,$8
+ sll $8,$13,24
+ xor $15,$9
+ srl $9,$13,8
+ xor $15,$8
+ lw $8,4($6) # modulo-scheduled
+ xor $15,$9
+ sub $30,1
+ sw $15,0($6)
+ add $6,4
+ bnez $30,.Lmix
+
+ li $2,0
+.Ldkey_done:
+ .set noreorder
+ move $4,$2
+ lw $31,32-1*4($29)
+ lw $30,32-2*4($29)
+ jr $31
+ add $29,32
+.end private_AES_set_decrypt_key
+.rdata
+.align 6
+AES_Te:
+.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
+.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
+.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
+.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
+.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
+.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
+.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
+.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
+.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
+.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
+.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
+.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
+.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
+.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
+.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
+.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
+.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
+.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
+.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
+.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
+.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
+.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
+.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
+.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
+.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
+.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
+.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
+.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
+.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
+.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
+.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
+.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
+.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
+.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
+.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
+.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
+.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
+.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
+.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
+.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
+.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
+.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
+.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
+.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
+.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
+.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
+.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
+.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
+.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
+.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
+.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
+.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
+.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
+.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
+.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
+.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
+.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
+.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
+.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
+.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
+.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
+.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
+.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
+.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
+.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
+.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
+.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
+.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
+.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
+.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
+.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
+.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
+.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
+.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
+.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
+.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
+.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
+.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
+.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
+.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
+.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
+.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
+.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
+.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
+.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
+.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
+.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
+.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
+.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
+.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
+.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
+.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
+.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
+.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
+.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
+.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
+.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
+.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
+.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
+.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
+.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
+.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
+.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
+.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
+.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
+.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
+.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
+.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
+.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
+.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
+.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
+.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
+.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
+.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
+.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
+.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
+.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
+.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
+.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
+.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
+.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
+.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
+.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
+.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
+.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
+.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
+.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
+.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
+
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
+.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
+
+.align 6
+AES_Td:
+.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
+.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
+.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
+.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
+.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
+.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
+.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
+.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
+.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
+.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
+.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
+.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
+.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
+.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
+.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
+.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
+.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
+.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
+.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
+.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
+.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
+.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
+.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
+.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
+.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
+.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
+.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
+.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
+.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
+.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
+.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
+.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
+.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
+.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
+.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
+.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
+.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
+.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
+.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
+.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
+.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
+.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
+.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
+.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
+.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
+.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
+.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
+.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
+.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
+.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
+.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
+.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
+.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
+.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
+.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
+.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
+.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
+.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
+.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
+.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
+.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
+.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
+.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
+.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
+.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
+.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
+.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
+.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
+.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
+.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
+.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
+.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
+.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
+.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
+.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
+.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
+.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
+.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
+.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
+.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
+.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
+.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
+.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
+.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
+.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
+.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
+.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
+.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
+.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
+.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
+.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
+.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
+.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
+.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
+.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
+.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
+.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
+.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
+.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
+.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
+.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
+.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
+.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
+.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
+.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
+.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
+.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
+.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
+.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
+.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
+.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
+.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
+.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
+.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
+.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
+.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
+.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
+.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
+.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
+.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
+.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
+.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
+.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
+.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
+.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
+.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
+.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
+.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
+
+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
diff --git a/main/openssl/crypto/aes/asm/aes-mips.pl b/main/openssl/crypto/aes/asm/aes-mips.pl
new file mode 100644
index 00000000..e5239542
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-mips.pl
@@ -0,0 +1,1611 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for MIPS
+
+# October 2010
+#
+# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
+# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
+# faster than gcc-generated code, which is not very impressive. But
+# recall that compressed S-box requires extra processing, namely
+# additional rotations. Rotations are implemented with lwl/lwr pairs,
+# which is normally used for loading unaligned data. Another cool
+# thing about this module is its endian neutrality, which means that
+# it processes data without ever changing byte order...
+
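
The lwl/lwr pairs mentioned above deserve one worked example. Each pair performs an unaligned 32-bit load, filling the "left" and "right" parts of the destination register; pointed at a table entry with a 1..3 byte offset, the same pair also yields a byte-rotated copy of the entry, which is how the compressed 1KB S-box gets by without explicit rotate instructions. A sketch using the input-load idiom from the listing above:

#!/usr/bin/env perl
# Sketch: lwl/lwr joining one possibly-unaligned word, as in AES_encrypt.
print <<'___';
	lwl	$8,0+3($4)	# "left" part of the word at 0($4)...
	lwr	$8,0+0($4)	# ..."right" part; $8 now holds it whole
___
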
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $PTR_SLL="dsll"; # incidentally works even on n32
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $PTR_SLL="sll";
+ $SZREG=4;
+}
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+if (!defined($big_endian))
+{ $big_endian=(unpack('L',pack('N',1))==1); }
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
+
+$code.=<<___;
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if !defined(__vxworks) || defined(__pic__)
+.option pic2
+#endif
+.set noat
+___
+
+{{{
+my $FRAMESIZE=16*$SZREG;
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
+my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
+my ($key0,$cnt)=($gp,$fp);
+
+# instruction ordering is "stolen" from the output of the MIPSpro assembler
+# invoked with -mips3 -O3 arguments...
+$code.=<<___;
+.align 5
+.ent _mips_AES_encrypt
+_mips_AES_encrypt:
+ .frame $sp,0,$ra
+ .set reorder
+ lw $t0,0($key)
+ lw $t1,4($key)
+ lw $t2,8($key)
+ lw $t3,12($key)
+ lw $cnt,240($key)
+ $PTR_ADD $key0,$key,16
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ sub $cnt,1
+ _xtr $i0,$s1,16-2
+.Loop_enc:
+ _xtr $i1,$s2,16-2
+ _xtr $i2,$s3,16-2
+ _xtr $i3,$s0,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t0,3($i0) # Te1[s1>>16]
+ lwl $t1,3($i1) # Te1[s2>>16]
+ lwl $t2,3($i2) # Te1[s3>>16]
+ lwl $t3,3($i3) # Te1[s0>>16]
+ lwr $t0,2($i0) # Te1[s1>>16]
+ lwr $t1,2($i1) # Te1[s2>>16]
+ lwr $t2,2($i2) # Te1[s3>>16]
+ lwr $t3,2($i3) # Te1[s0>>16]
+
+ _xtr $i0,$s2,8-2
+ _xtr $i1,$s3,8-2
+ _xtr $i2,$s0,8-2
+ _xtr $i3,$s1,8-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t4,2($i0) # Te2[s2>>8]
+ lwl $t5,2($i1) # Te2[s3>>8]
+ lwl $t6,2($i2) # Te2[s0>>8]
+ lwl $t7,2($i3) # Te2[s1>>8]
+ lwr $t4,1($i0) # Te2[s2>>8]
+ lwr $t5,1($i1) # Te2[s3>>8]
+ lwr $t6,1($i2) # Te2[s0>>8]
+ lwr $t7,1($i3) # Te2[s1>>8]
+
+ _xtr $i0,$s3,0-2
+ _xtr $i1,$s0,0-2
+ _xtr $i2,$s1,0-2
+ _xtr $i3,$s2,0-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t8,1($i0) # Te3[s3]
+ lwl $t9,1($i1) # Te3[s0]
+ lwl $t10,1($i2) # Te3[s1]
+ lwl $t11,1($i3) # Te3[s2]
+ lwr $t8,0($i0) # Te3[s3]
+ lwr $t9,0($i1) # Te3[s0]
+ lwr $t10,0($i2) # Te3[s1]
+ lwr $t11,0($i3) # Te3[s2]
+
+ _xtr $i0,$s0,24-2
+ _xtr $i1,$s1,24-2
+ _xtr $i2,$s2,24-2
+ _xtr $i3,$s3,24-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+ lw $t4,0($i0) # Te0[s0>>24]
+ lw $t5,0($i1) # Te0[s1>>24]
+ lw $t6,0($i2) # Te0[s2>>24]
+ lw $t7,0($i3) # Te0[s3>>24]
+
+ lw $s0,0($key0)
+ lw $s1,4($key0)
+ lw $s2,8($key0)
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_enc
+ _xtr $i0,$s1,16-2
+
+ .set reorder
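+	# last round: bytes 1 and 2 of every Te0 entry hold the plain
+	# S-box value, so an lbu at offset 2 of the scaled index serves as
+	# the Te4 lookup without a separate 256-byte table walk.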
+ _xtr $i1,$s2,16-2
+ _xtr $i2,$s3,16-2
+ _xtr $i3,$s0,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t0,2($i0) # Te4[s1>>16]
+ lbu $t1,2($i1) # Te4[s2>>16]
+ lbu $t2,2($i2) # Te4[s3>>16]
+ lbu $t3,2($i3) # Te4[s0>>16]
+
+ _xtr $i0,$s2,8-2
+ _xtr $i1,$s3,8-2
+ _xtr $i2,$s0,8-2
+ _xtr $i3,$s1,8-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t4,2($i0) # Te4[s2>>8]
+ lbu $t5,2($i1) # Te4[s3>>8]
+ lbu $t6,2($i2) # Te4[s0>>8]
+ lbu $t7,2($i3) # Te4[s1>>8]
+
+ _xtr $i0,$s0,24-2
+ _xtr $i1,$s1,24-2
+ _xtr $i2,$s2,24-2
+ _xtr $i3,$s3,24-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,2($i0) # Te4[s0>>24]
+ lbu $t9,2($i1) # Te4[s1>>24]
+ lbu $t10,2($i2) # Te4[s2>>24]
+ lbu $t11,2($i3) # Te4[s3>>24]
+
+ _xtr $i0,$s3,0-2
+ _xtr $i1,$s0,0-2
+ _xtr $i2,$s1,0-2
+ _xtr $i3,$s2,0-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins $t4,8
+ _ins $t5,8
+ _ins $t6,8
+ _ins $t7,8
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t4,2($i0) # Te4[s3]
+ lbu $t5,2($i1) # Te4[s0]
+ lbu $t6,2($i2) # Te4[s1]
+ lbu $t7,2($i3) # Te4[s2]
+
+ _ins $t8,24
+ _ins $t9,24
+ _ins $t10,24
+ _ins $t11,24
+
+ lw $s0,0($key0)
+ lw $s1,4($key0)
+ lw $s2,8($key0)
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ _ins $t4,0
+ _ins $t5,0
+ _ins $t6,0
+ _ins $t7,0
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ jr $ra
+.end _mips_AES_encrypt
+
+.align 5
+.globl AES_encrypt
+.ent AES_encrypt
+AES_encrypt:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,AES_encrypt
+___
+$code.=<<___;
+ .set reorder
+ la $Tbl,AES_Te # PIC-ified 'load address'
+
+ lwl $s0,0+$MSB($inp)
+ lwl $s1,4+$MSB($inp)
+ lwl $s2,8+$MSB($inp)
+ lwl $s3,12+$MSB($inp)
+ lwr $s0,0+$LSB($inp)
+ lwr $s1,4+$LSB($inp)
+ lwr $s2,8+$LSB($inp)
+ lwr $s3,12+$LSB($inp)
+
+ bal _mips_AES_encrypt
+
+ swr $s0,0+$LSB($out)
+ swr $s1,4+$LSB($out)
+ swr $s2,8+$LSB($out)
+ swr $s3,12+$LSB($out)
+ swl $s0,0+$MSB($out)
+ swl $s1,4+$MSB($out)
+ swl $s2,8+$MSB($out)
+ swl $s3,12+$MSB($out)
+
+ .set noreorder
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end AES_encrypt
+___
+
+$code.=<<___;
+.align 5
+.ent _mips_AES_decrypt
+_mips_AES_decrypt:
+ .frame $sp,0,$ra
+ .set reorder
+ lw $t0,0($key)
+ lw $t1,4($key)
+ lw $t2,8($key)
+ lw $t3,12($key)
+ lw $cnt,240($key)
+ $PTR_ADD $key0,$key,16
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ sub $cnt,1
+ _xtr $i0,$s3,16-2
+.Loop_dec:
+ _xtr $i1,$s0,16-2
+ _xtr $i2,$s1,16-2
+ _xtr $i3,$s2,16-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t0,3($i0) # Td1[s3>>16]
+ lwl $t1,3($i1) # Td1[s0>>16]
+ lwl $t2,3($i2) # Td1[s1>>16]
+ lwl $t3,3($i3) # Td1[s2>>16]
+ lwr $t0,2($i0) # Td1[s3>>16]
+ lwr $t1,2($i1) # Td1[s0>>16]
+ lwr $t2,2($i2) # Td1[s1>>16]
+ lwr $t3,2($i3) # Td1[s2>>16]
+
+ _xtr $i0,$s2,8-2
+ _xtr $i1,$s3,8-2
+ _xtr $i2,$s0,8-2
+ _xtr $i3,$s1,8-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t4,2($i0) # Td2[s2>>8]
+ lwl $t5,2($i1) # Td2[s3>>8]
+ lwl $t6,2($i2) # Td2[s0>>8]
+ lwl $t7,2($i3) # Td2[s1>>8]
+ lwr $t4,1($i0) # Td2[s2>>8]
+ lwr $t5,1($i1) # Td2[s3>>8]
+ lwr $t6,1($i2) # Td2[s0>>8]
+ lwr $t7,1($i3) # Td2[s1>>8]
+
+ _xtr $i0,$s1,0-2
+ _xtr $i1,$s2,0-2
+ _xtr $i2,$s3,0-2
+ _xtr $i3,$s0,0-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lwl $t8,1($i0) # Td3[s1]
+ lwl $t9,1($i1) # Td3[s2]
+ lwl $t10,1($i2) # Td3[s3]
+ lwl $t11,1($i3) # Td3[s0]
+ lwr $t8,0($i0) # Td3[s1]
+ lwr $t9,0($i1) # Td3[s2]
+ lwr $t10,0($i2) # Td3[s3]
+ lwr $t11,0($i3) # Td3[s0]
+
+ _xtr $i0,$s0,24-2
+ _xtr $i1,$s1,24-2
+ _xtr $i2,$s2,24-2
+ _xtr $i3,$s3,24-2
+ and $i0,0x3fc
+ and $i1,0x3fc
+ and $i2,0x3fc
+ and $i3,0x3fc
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ lw $t4,0($i0) # Td0[s0>>24]
+ lw $t5,0($i1) # Td0[s1>>24]
+ lw $t6,0($i2) # Td0[s2>>24]
+ lw $t7,0($i3) # Td0[s3>>24]
+
+ lw $s0,0($key0)
+ lw $s1,4($key0)
+ lw $s2,8($key0)
+ lw $s3,12($key0)
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ sub $cnt,1
+ $PTR_ADD $key0,16
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+ .set noreorder
+ bnez $cnt,.Loop_dec
+ _xtr $i0,$s3,16-2
+
+ .set reorder
+ lw $t4,1024($Tbl) # prefetch Td4
+ lw $t5,1024+32($Tbl)
+ lw $t6,1024+64($Tbl)
+ lw $t7,1024+96($Tbl)
+ lw $t8,1024+128($Tbl)
+ lw $t9,1024+160($Tbl)
+ lw $t10,1024+192($Tbl)
+ lw $t11,1024+224($Tbl)
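+	# unlike encryption, the last decrypt round needs the genuine
+	# inverse S-box: Td0 entries carry only GF(2^8) multiples of it,
+	# so Td4 is kept as 256 plain bytes at offset 1024 past Td0.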
+
+ _xtr $i0,$s3,16
+ _xtr $i1,$s0,16
+ _xtr $i2,$s1,16
+ _xtr $i3,$s2,16
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,0xff
+ and $i3,0xff
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t0,1024($i0) # Td4[s3>>16]
+ lbu $t1,1024($i1) # Td4[s0>>16]
+ lbu $t2,1024($i2) # Td4[s1>>16]
+ lbu $t3,1024($i3) # Td4[s2>>16]
+
+ _xtr $i0,$s2,8
+ _xtr $i1,$s3,8
+ _xtr $i2,$s0,8
+ _xtr $i3,$s1,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,0xff
+ and $i3,0xff
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t4,1024($i0) # Td4[s2>>8]
+ lbu $t5,1024($i1) # Td4[s3>>8]
+ lbu $t6,1024($i2) # Td4[s0>>8]
+ lbu $t7,1024($i3) # Td4[s1>>8]
+
+ _xtr $i0,$s0,24
+ _xtr $i1,$s1,24
+ _xtr $i2,$s2,24
+ _xtr $i3,$s3,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t8,1024($i0) # Td4[s0>>24]
+ lbu $t9,1024($i1) # Td4[s1>>24]
+ lbu $t10,1024($i2) # Td4[s2>>24]
+ lbu $t11,1024($i3) # Td4[s3>>24]
+
+ _xtr $i0,$s1,0
+ _xtr $i1,$s2,0
+ _xtr $i2,$s3,0
+ _xtr $i3,$s0,0
+
+ _ins $t0,16
+ _ins $t1,16
+ _ins $t2,16
+ _ins $t3,16
+
+ _ins $t4,8
+ _ins $t5,8
+ _ins $t6,8
+ _ins $t7,8
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $t4,1024($i0) # Td4[s1]
+ lbu $t5,1024($i1) # Td4[s2]
+ lbu $t6,1024($i2) # Td4[s3]
+ lbu $t7,1024($i3) # Td4[s0]
+
+ _ins $t8,24
+ _ins $t9,24
+ _ins $t10,24
+ _ins $t11,24
+
+ lw $s0,0($key0)
+ lw $s1,4($key0)
+ lw $s2,8($key0)
+ lw $s3,12($key0)
+
+ _ins $t4,0
+ _ins $t5,0
+ _ins $t6,0
+ _ins $t7,0
+
+ xor $t0,$t8
+ xor $t1,$t9
+ xor $t2,$t10
+ xor $t3,$t11
+
+ xor $t0,$t4
+ xor $t1,$t5
+ xor $t2,$t6
+ xor $t3,$t7
+
+ xor $s0,$t0
+ xor $s1,$t1
+ xor $s2,$t2
+ xor $s3,$t3
+
+ jr $ra
+.end _mips_AES_decrypt
+
+.align 5
+.globl AES_decrypt
+.ent AES_decrypt
+AES_decrypt:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,AES_decrypt
+___
+$code.=<<___;
+ .set reorder
+ la $Tbl,AES_Td # PIC-ified 'load address'
+
+ lwl $s0,0+$MSB($inp)
+ lwl $s1,4+$MSB($inp)
+ lwl $s2,8+$MSB($inp)
+ lwl $s3,12+$MSB($inp)
+ lwr $s0,0+$LSB($inp)
+ lwr $s1,4+$LSB($inp)
+ lwr $s2,8+$LSB($inp)
+ lwr $s3,12+$LSB($inp)
+
+ bal _mips_AES_decrypt
+
+ swr $s0,0+$LSB($out)
+ swr $s1,4+$LSB($out)
+ swr $s2,8+$LSB($out)
+ swr $s3,12+$LSB($out)
+ swl $s0,0+$MSB($out)
+ swl $s1,4+$MSB($out)
+ swl $s2,8+$MSB($out)
+ swl $s3,12+$MSB($out)
+
+ .set noreorder
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end AES_decrypt
+___
+}}}
+
+{{{
+my $FRAMESIZE=8*$SZREG;
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
+
+my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
+my ($rcon,$cnt)=($gp,$fp);
+
+$code.=<<___;
+.align 5
+.ent _mips_AES_set_encrypt_key
+_mips_AES_set_encrypt_key:
+ .frame $sp,0,$ra
+ .set noreorder
+ beqz $inp,.Lekey_done
+ li $t0,-1
+ beqz $key,.Lekey_done
+ $PTR_ADD $rcon,$Tbl,1024+256
+
+ .set reorder
+ lwl $rk0,0+$MSB($inp) # load 128 bits
+ lwl $rk1,4+$MSB($inp)
+ lwl $rk2,8+$MSB($inp)
+ lwl $rk3,12+$MSB($inp)
+ li $at,128
+ lwr $rk0,0+$LSB($inp)
+ lwr $rk1,4+$LSB($inp)
+ lwr $rk2,8+$LSB($inp)
+ lwr $rk3,12+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L128bits
+ li $cnt,10
+
+ .set reorder
+ lwl $rk4,16+$MSB($inp) # load 192 bits
+ lwl $rk5,20+$MSB($inp)
+ li $at,192
+ lwr $rk4,16+$LSB($inp)
+ lwr $rk5,20+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L192bits
+ li $cnt,8
+
+ .set reorder
+ lwl $rk6,24+$MSB($inp) # load 256 bits
+ lwl $rk7,28+$MSB($inp)
+ li $at,256
+ lwr $rk6,24+$LSB($inp)
+ lwr $rk7,28+$LSB($inp)
+ .set noreorder
+ beq $bits,$at,.L256bits
+ li $cnt,7
+
+ b .Lekey_done
+ li $t0,-2
+
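+	# the three paths above load 128/192/256 key bits and set the
+	# expansion loop count to 10/8/7 iterations respectively; t0
+	# carries the return value, -1 for NULL arguments, -2 for an
+	# unsupported key length and 0 on success.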
+.align 4
+.L128bits:
+ .set reorder
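+	# each iteration folds SubWord(RotWord(rk3)) and the next round
+	# constant into rk0: the four bytes of rk3 go through the S-box at
+	# Tbl+1024 and are repositioned by the made-up _bias instruction
+	# [see the post-processor] before being xored in.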
+ srl $i0,$rk3,16
+ srl $i1,$rk3,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk3,0xff
+ srl $i3,$rk3,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,1024($i0)
+ lbu $i1,1024($i1)
+ lbu $i2,1024($i2)
+ lbu $i3,1024($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sub $cnt,1
+ $PTR_ADD $key,16
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+
+ .set noreorder
+ bnez $cnt,.L128bits
+ $PTR_ADD $rcon,4
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ li $cnt,10
+ sw $rk3,12($key)
+ li $t0,0
+ sw $cnt,80($key)
+ b .Lekey_done
+ $PTR_SUB $key,10*16
+
+.align 4
+.L192bits:
+ .set reorder
+ srl $i0,$rk5,16
+ srl $i1,$rk5,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk5,0xff
+ srl $i3,$rk5,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,1024($i0)
+ lbu $i1,1024($i1)
+ lbu $i2,1024($i2)
+ lbu $i3,1024($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sw $rk4,16($key)
+ sw $rk5,20($key)
+ sub $cnt,1
+ $PTR_ADD $key,24
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+ xor $rk4,$rk3
+ xor $rk5,$rk4
+
+ .set noreorder
+ bnez $cnt,.L192bits
+ $PTR_ADD $rcon,4
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ li $cnt,12
+ sw $rk3,12($key)
+ li $t0,0
+ sw $cnt,48($key)
+ b .Lekey_done
+ $PTR_SUB $key,12*16
+
+.align 4
+.L256bits:
+ .set reorder
+ srl $i0,$rk7,16
+ srl $i1,$rk7,8
+ and $i0,0xff
+ and $i1,0xff
+ and $i2,$rk7,0xff
+ srl $i3,$rk7,24
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,1024($i0)
+ lbu $i1,1024($i1)
+ lbu $i2,1024($i2)
+ lbu $i3,1024($i3)
+
+ sw $rk0,0($key)
+ sw $rk1,4($key)
+ sw $rk2,8($key)
+ sw $rk3,12($key)
+ sw $rk4,16($key)
+ sw $rk5,20($key)
+ sw $rk6,24($key)
+ sw $rk7,28($key)
+ sub $cnt,1
+
+ _bias $i0,24
+ _bias $i1,16
+ _bias $i2,8
+ _bias $i3,0
+
+ xor $rk0,$i0
+ lw $i0,0($rcon)
+ xor $rk0,$i1
+ xor $rk0,$i2
+ xor $rk0,$i3
+ xor $rk0,$i0
+
+ xor $rk1,$rk0
+ xor $rk2,$rk1
+ xor $rk3,$rk2
+ beqz $cnt,.L256bits_done
+
+ srl $i0,$rk3,24
+ srl $i1,$rk3,16
+ srl $i2,$rk3,8
+ and $i3,$rk3,0xff
+ and $i1,0xff
+ and $i2,0xff
+ $PTR_ADD $i0,$Tbl
+ $PTR_ADD $i1,$Tbl
+ $PTR_ADD $i2,$Tbl
+ $PTR_ADD $i3,$Tbl
+ lbu $i0,1024($i0)
+ lbu $i1,1024($i1)
+ lbu $i2,1024($i2)
+ lbu $i3,1024($i3)
+ sll $i0,24
+ sll $i1,16
+ sll $i2,8
+
+ xor $rk4,$i0
+ xor $rk4,$i1
+ xor $rk4,$i2
+ xor $rk4,$i3
+
+ xor $rk5,$rk4
+ xor $rk6,$rk5
+ xor $rk7,$rk6
+
+ $PTR_ADD $key,32
+ .set noreorder
+ b .L256bits
+ $PTR_ADD $rcon,4
+
+.L256bits_done:
+ sw $rk0,32($key)
+ sw $rk1,36($key)
+ sw $rk2,40($key)
+ li $cnt,14
+ sw $rk3,44($key)
+ li $t0,0
+ sw $cnt,48($key)
+ $PTR_SUB $key,12*16
+
+.Lekey_done:
+ jr $ra
+ nop
+.end _mips_AES_set_encrypt_key
+
+.globl private_AES_set_encrypt_key
+.ent private_AES_set_encrypt_key
+private_AES_set_encrypt_key:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,private_AES_set_encrypt_key
+___
+$code.=<<___;
+ .set reorder
+ la $Tbl,AES_Te # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ .set noreorder
+ move $a0,$t0
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
+	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
+	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
+	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end private_AES_set_encrypt_key
+___
+
+my ($head,$tail)=($inp,$bits);
+my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
+$code.=<<___;
+.align 5
+.globl private_AES_set_decrypt_key
+.ent private_AES_set_decrypt_key
+private_AES_set_decrypt_key:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Tbl
+ .cpsetup $pf,$zero,private_AES_set_decrypt_key
+___
+$code.=<<___;
+ .set reorder
+ la $Tbl,AES_Te # PIC-ified 'load address'
+
+ bal _mips_AES_set_encrypt_key
+
+ bltz $t0,.Ldkey_done
+
+ sll $at,$cnt,4
+ $PTR_ADD $head,$key,0
+ $PTR_ADD $tail,$key,$at
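+	# the decrypt key schedule starts out as the encrypt one with the
+	# order of the 16-byte round keys inverted in place, swapping from
+	# both ends toward the middle: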
+.align 4
+.Lswap:
+ lw $rk0,0($head)
+ lw $rk1,4($head)
+ lw $rk2,8($head)
+ lw $rk3,12($head)
+ lw $rk4,0($tail)
+ lw $rk5,4($tail)
+ lw $rk6,8($tail)
+ lw $rk7,12($tail)
+ sw $rk0,0($tail)
+ sw $rk1,4($tail)
+ sw $rk2,8($tail)
+ sw $rk3,12($tail)
+ $PTR_ADD $head,16
+ $PTR_SUB $tail,16
+ sw $rk4,-16($head)
+ sw $rk5,-12($head)
+ sw $rk6,-8($head)
+ sw $rk7,-4($head)
+ bne $head,$tail,.Lswap
+
+ lw $tp1,16($key) # modulo-scheduled
+ lui $x80808080,0x8080
+ sub $cnt,1
+ or $x80808080,0x8080
+ sll $cnt,2
+ $PTR_ADD $key,16
+ lui $x1b1b1b1b,0x1b1b
+ nor $x7f7f7f7f,$zero,$x80808080
+ or $x1b1b1b1b,0x1b1b
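+	# .Lmix applies InvMixColumns to the inner round keys with the
+	# classic SIMD-within-a-register doubling: tp2 is (tp1&0x7f7f7f7f)
+	# shifted left by one, xor-corrected with 0x1b in every byte whose
+	# top bit [caught by the 0x80808080 mask] was set; tp4 and tp8
+	# repeat the doubling.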
+.align 4
+.Lmix:
+ and $m,$tp1,$x80808080
+ and $tp2,$tp1,$x7f7f7f7f
+ srl $tp4,$m,7
+ addu $tp2,$tp2 # tp2<<1
+ subu $m,$tp4
+ and $m,$x1b1b1b1b
+ xor $tp2,$m
+
+ and $m,$tp2,$x80808080
+ and $tp4,$tp2,$x7f7f7f7f
+ srl $tp8,$m,7
+ addu $tp4,$tp4 # tp4<<1
+ subu $m,$tp8
+ and $m,$x1b1b1b1b
+ xor $tp4,$m
+
+ and $m,$tp4,$x80808080
+ and $tp8,$tp4,$x7f7f7f7f
+ srl $tp9,$m,7
+ addu $tp8,$tp8 # tp8<<1
+ subu $m,$tp9
+ and $m,$x1b1b1b1b
+ xor $tp8,$m
+
+ xor $tp9,$tp8,$tp1
+ xor $tpe,$tp8,$tp4
+ xor $tpb,$tp9,$tp2
+ xor $tpd,$tp9,$tp4
+
+ _ror $tp1,$tpd,16
+ xor $tpe,$tp2
+ _ror $tp2,$tpd,-16
+ xor $tpe,$tp1
+ _ror $tp1,$tp9,8
+ xor $tpe,$tp2
+ _ror $tp2,$tp9,-24
+ xor $tpe,$tp1
+ _ror $tp1,$tpb,24
+ xor $tpe,$tp2
+ _ror $tp2,$tpb,-8
+ xor $tpe,$tp1
+ lw $tp1,4($key) # modulo-scheduled
+ xor $tpe,$tp2
+ sub $cnt,1
+ sw $tpe,0($key)
+ $PTR_ADD $key,4
+ bnez $cnt,.Lmix
+
+ li $t0,0
+.Ldkey_done:
+ .set noreorder
+ move $a0,$t0
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
+	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
+	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
+	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end private_AES_set_decrypt_key
+___
+}}}
+
+######################################################################
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+.rdata
+.align 6
+AES_Te:
+.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
+.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
+.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
+.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
+.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
+.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
+.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
+.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
+.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
+.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
+.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
+.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
+.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
+.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
+.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
+.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
+.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
+.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
+.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
+.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
+.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
+.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
+.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
+.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
+.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
+.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
+.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
+.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
+.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
+.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
+.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
+.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
+.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
+.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
+.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
+.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
+.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
+.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
+.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
+.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
+.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
+.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
+.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
+.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
+.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
+.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
+.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
+.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
+.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
+.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
+.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
+.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
+.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
+.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
+.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
+.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
+.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
+.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
+.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
+.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
+.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
+.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
+.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
+.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
+.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
+.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
+.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
+.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
+.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
+.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
+.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
+.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
+.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
+.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
+.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
+.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
+.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
+.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
+.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
+.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
+.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
+.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
+.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
+.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
+.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
+.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
+.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
+.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
+.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
+.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
+.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
+.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
+.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
+.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
+.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
+.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
+.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
+.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
+.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
+.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
+.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
+.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
+.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
+.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
+.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
+.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
+.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
+.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
+.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
+.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
+.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
+.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
+.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
+.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
+.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
+.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
+.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
+.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
+.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
+.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
+.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
+.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
+.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
+.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
+.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
+.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
+.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
+.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
+
+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
+.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
+.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
+.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
+.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
+
+.align 6
+AES_Td:
+.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
+.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
+.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
+.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
+.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
+.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
+.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
+.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
+.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
+.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
+.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
+.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
+.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
+.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
+.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
+.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
+.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
+.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
+.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
+.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
+.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
+.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
+.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
+.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
+.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
+.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
+.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
+.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
+.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
+.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
+.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
+.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
+.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
+.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
+.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
+.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
+.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
+.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
+.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
+.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
+.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
+.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
+.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
+.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
+.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
+.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
+.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
+.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
+.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
+.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
+.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
+.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
+.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
+.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
+.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
+.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
+.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
+.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
+.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
+.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
+.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
+.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
+.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
+.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
+.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
+.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
+.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
+.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
+.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
+.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
+.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
+.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
+.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
+.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
+.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
+.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
+.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
+.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
+.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
+.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
+.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
+.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
+.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
+.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
+.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
+.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
+.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
+.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
+.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
+.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
+.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
+.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
+.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
+.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
+.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
+.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
+.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
+.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
+.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
+.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
+.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
+.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
+.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
+.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
+.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
+.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
+.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
+.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
+.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
+.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
+.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
+.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
+.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
+.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
+.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
+.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
+.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
+.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
+.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
+.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
+.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
+.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
+.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
+.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
+.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
+.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
+.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
+.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
+
+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+	# made-up instructions _xtr, _ins, _ror and _bias cope with
+	# byte-order dependencies...
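+	# e.g. "_ins $12,16" is first rewritten to its 3-operand form
+	# "_ins $12,$12,16" and then becomes "sll $12,$12,16" in big-endian
+	# output or "sll $12,$12,8" [24-16] in little-endian output, where
+	# the state words were loaded byte-reversed.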
+ if (/^\s+_/) {
+ s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
+
+ s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
+ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("24-$3"))/e or
+ s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("24-$3"))/e or
+ s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
+ sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("$3*-1"))/e or
+ s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+ sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
+ : eval("($3-16)&31"))/e;
+
+ s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
+ sprintf("sll\t$1,$2,$3")/e or
+ s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
+ sprintf("and\t$1,$2,0xff")/e or
+ s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
+ }
+
+ # convert lwl/lwr and swr/swl to little-endian order
+ if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
+ s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
+ sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or
+ s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
+ sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/main/openssl/crypto/aes/asm/aes-parisc.pl b/main/openssl/crypto/aes/asm/aes-parisc.pl
new file mode 100644
index 00000000..714dcfbb
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-parisc.pl
@@ -0,0 +1,1022 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for PA-RISC.
+#
+# June 2009.
+#
+# The module is a mechanical transliteration of aes-sparcv9.pl, but with
+# a twist: S-boxes are compressed even further, down to 1K+256B. On
+# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
+# comes to about 33 cycles per byte processed with a 128-bit key. Newer
+# CPUs perform at 16 cycles per byte. It's not faster than code generated
+# by the vendor compiler, but recall that it has compressed S-boxes,
+# which require extra processing.
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+ # [+ argument transfer]
+$inp="%r26"; # arg0
+$out="%r25"; # arg1
+$key="%r24"; # arg2
+
+($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
+($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
+
+($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
+ $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
+("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
+"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
+
+$tbl="%r28";
+$rounds="%r29";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 64
+AES_encrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$enc_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Te-L\$enc_pic($tbl),$tbl
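+	; position-independent table address: blr with %r0 branches to the
+	; instruction after its delay slot and leaves that address, i.e.
+	; L\$enc_pic with the privilege level in its low bits, in $tbl;
+	; andcm strips those bits and ldo then adds the constant
+	; displacement to L\$AES_Te.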
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$enc_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
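+	; misaligned input: $inp was rounded down to a word boundary above,
+	; a fifth word is pulled in, and the vshd funnel shifts realign
+	; adjacent word pairs by the bit count preloaded into %sar [cr11].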
+
+L\$enc_inp_aligned
+ bl _parisc_AES_encrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$enc_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$enc_done
+ stb $s3,15($out)
+
+L\$enc_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$enc_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_encrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 12($key),$t3
+ _srm $s0,24,$acc0
+ xor $t1,$s1,$s1
+ ldw 16($key),$t0
+ _srm $s1,16,$acc1
+ xor $t2,$s2,$s2
+ ldw 20($key),$t1
+ xor $t3,$s3,$s3
+ ldw 24($key),$t2
+ ldw 28($key),$t3
+L\$enc_loop
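+	; the ,s completer on ldwx scales the index by 4, so _srm [another
+	; made-up instruction, expanded later in this script] extracts
+	; plain byte values rather than the pre-scaled offsets the MIPS
+	; module uses.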
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$enc_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t3,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t2,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t0,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t3,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t1,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t0,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t2,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch te4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch te4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch te4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch te4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch te4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch te4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch te4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch te4
+ _srm $s1,16,$acc1
+ xor $acc14,$s3,$s3
+ b L\$enc_loop
+ xor $acc15,$s3,$s3
+
+ .ALIGN 16
+L\$enc_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ _srm $t1,16,$acc1
+ xor $acc15,$t3,$t3
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t2,16,$acc5
+ _srm $t3,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t3,16,$acc9
+ _srm $t0,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t0,16,$acc13
+ _srm $t1,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t2,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
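+	; dep merges the four S-box bytes into one word: 7/15/23 name the
+	; rightmost bit of the destination byte field in PA-RISC's
+	; MSB-first bit numbering, while the accumulator being deposited
+	; into already holds the low byte.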
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Te
+ .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+ .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+ .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+ .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+ .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+ .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+ .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+ .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+ .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+ .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+ .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+ .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+ .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+ .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+ .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+ .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+ .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+ .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+ .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+ .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+ .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+ .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+ .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+ .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+ .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+ .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+ .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+ .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+ .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+ .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+ .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+ .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+ .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+ .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+ .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+ .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+ .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+ .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+ .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+ .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+ .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+ .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+ .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+ .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+ .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+ .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+ .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+ .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+ .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+ .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+ .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+ .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+ .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+ .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+ .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+ .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+ .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+ .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+ .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+ .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+ .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+ .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+ .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+ .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+ .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+
+$code.=<<___;
+ .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 16
+AES_decrypt
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ blr %r0,$tbl
+ ldi 3,$t0
+L\$dec_pic
+ andcm $tbl,$t0,$tbl
+ ldo L\$AES_Td-L\$dec_pic($tbl),$tbl
+
+ and $inp,$t0,$t0
+ sub $inp,$t0,$inp
+ ldw 0($inp),$s0
+ ldw 4($inp),$s1
+ ldw 8($inp),$s2
+ comib,= 0,$t0,L\$dec_inp_aligned
+ ldw 12($inp),$s3
+
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11
+ ldw 16($inp),$t1
+ vshd $s0,$s1,$s0
+ vshd $s1,$s2,$s1
+ vshd $s2,$s3,$s2
+ vshd $s3,$t1,$s3
+
+L\$dec_inp_aligned
+ bl _parisc_AES_decrypt,%r31
+ nop
+
+ extru,<> $out,31,2,%r0
+ b L\$dec_out_aligned
+ nop
+
+ _srm $s0,24,$acc0
+ _srm $s0,16,$acc1
+ stb $acc0,0($out)
+ _srm $s0,8,$acc2
+ stb $acc1,1($out)
+ _srm $s1,24,$acc4
+ stb $acc2,2($out)
+ _srm $s1,16,$acc5
+ stb $s0,3($out)
+ _srm $s1,8,$acc6
+ stb $acc4,4($out)
+ _srm $s2,24,$acc0
+ stb $acc5,5($out)
+ _srm $s2,16,$acc1
+ stb $acc6,6($out)
+ _srm $s2,8,$acc2
+ stb $s1,7($out)
+ _srm $s3,24,$acc4
+ stb $acc0,8($out)
+ _srm $s3,16,$acc5
+ stb $acc1,9($out)
+ _srm $s3,8,$acc6
+ stb $acc2,10($out)
+ stb $s2,11($out)
+ stb $acc4,12($out)
+ stb $acc5,13($out)
+ stb $acc6,14($out)
+ b L\$dec_done
+ stb $s3,15($out)
+
+L\$dec_out_aligned
+ stw $s0,0($out)
+ stw $s1,4($out)
+ stw $s2,8($out)
+ stw $s3,12($out)
+
+L\$dec_done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 16
+_parisc_AES_decrypt
+ .PROC
+ .CALLINFO MILLICODE
+ .ENTRY
+ ldw 240($key),$rounds
+ ldw 0($key),$t0
+ ldw 4($key),$t1
+ ldw 8($key),$t2
+ ldw 12($key),$t3
+ _srm $rounds,1,$rounds
+ xor $t0,$s0,$s0
+ ldw 16($key),$t0
+ xor $t1,$s1,$s1
+ ldw 20($key),$t1
+ _srm $s0,24,$acc0
+ xor $t2,$s2,$s2
+ ldw 24($key),$t2
+ xor $t3,$s3,$s3
+ ldw 28($key),$t3
+ _srm $s3,16,$acc1
+L\$dec_loop
+ _srm $s2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $s1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $s1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $s0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $s3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $s2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $s2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $s1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $s0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $s3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $s3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $s2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $s1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $s0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ ldwx,s $acc14($tbl),$acc14
+ ldwx,s $acc15($tbl),$acc15
+ addib,= -1,$rounds,L\$dec_last
+ ldo 32($key),$key
+
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldwx,s $acc0($tbl),$acc0
+ _srm $t1,0,$acc3
+ ldwx,s $acc1($tbl),$acc1
+ _srm $t1,24,$acc4
+ ldwx,s $acc2($tbl),$acc2
+ _srm $t0,16,$acc5
+ ldwx,s $acc3($tbl),$acc3
+ _srm $t3,8,$acc6
+ ldwx,s $acc4($tbl),$acc4
+ _srm $t2,0,$acc7
+ ldwx,s $acc5($tbl),$acc5
+ _srm $t2,24,$acc8
+ ldwx,s $acc6($tbl),$acc6
+ _srm $t1,16,$acc9
+ ldwx,s $acc7($tbl),$acc7
+ _srm $t0,8,$acc10
+ ldwx,s $acc8($tbl),$acc8
+ _srm $t3,0,$acc11
+ ldwx,s $acc9($tbl),$acc9
+ _srm $t3,24,$acc12
+ ldwx,s $acc10($tbl),$acc10
+ _srm $t2,16,$acc13
+ ldwx,s $acc11($tbl),$acc11
+ _srm $t1,8,$acc14
+ ldwx,s $acc12($tbl),$acc12
+ _srm $t0,0,$acc15
+ ldwx,s $acc13($tbl),$acc13
+ _ror $acc1,8,$acc1
+ ldwx,s $acc14($tbl),$acc14
+
+ _ror $acc2,16,$acc2
+ xor $acc0,$s0,$s0
+ ldwx,s $acc15($tbl),$acc15
+ _ror $acc3,24,$acc3
+ xor $acc1,$s0,$s0
+ ldw 16($key),$t0
+ _ror $acc5,8,$acc5
+ xor $acc2,$s0,$s0
+ ldw 20($key),$t1
+ _ror $acc6,16,$acc6
+ xor $acc3,$s0,$s0
+ ldw 24($key),$t2
+ _ror $acc7,24,$acc7
+ xor $acc4,$s1,$s1
+ ldw 28($key),$t3
+ _ror $acc9,8,$acc9
+ xor $acc5,$s1,$s1
+ ldw 1024+0($tbl),%r0 ; prefetch td4
+ _ror $acc10,16,$acc10
+ xor $acc6,$s1,$s1
+ ldw 1024+32($tbl),%r0 ; prefetch td4
+ _ror $acc11,24,$acc11
+ xor $acc7,$s1,$s1
+ ldw 1024+64($tbl),%r0 ; prefetch td4
+ _ror $acc13,8,$acc13
+ xor $acc8,$s2,$s2
+ ldw 1024+96($tbl),%r0 ; prefetch td4
+ _ror $acc14,16,$acc14
+ xor $acc9,$s2,$s2
+ ldw 1024+128($tbl),%r0 ; prefetch td4
+ _ror $acc15,24,$acc15
+ xor $acc10,$s2,$s2
+ ldw 1024+160($tbl),%r0 ; prefetch td4
+ _srm $s0,24,$acc0
+ xor $acc11,$s2,$s2
+ ldw 1024+192($tbl),%r0 ; prefetch td4
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$s3,$s3
+ ldw 1024+224($tbl),%r0 ; prefetch td4
+ xor $acc14,$s3,$s3
+ xor $acc15,$s3,$s3
+ b L\$dec_loop
+ _srm $s3,16,$acc1
+
+ .ALIGN 16
+L\$dec_last
+ ldo 1024($tbl),$rounds
+ _ror $acc1,8,$acc1
+ xor $acc0,$t0,$t0
+ ldw 0($key),$s0
+ _ror $acc2,16,$acc2
+ xor $acc1,$t0,$t0
+ ldw 4($key),$s1
+ _ror $acc3,24,$acc3
+ xor $acc2,$t0,$t0
+ ldw 8($key),$s2
+ _ror $acc5,8,$acc5
+ xor $acc3,$t0,$t0
+ ldw 12($key),$s3
+ _ror $acc6,16,$acc6
+ xor $acc4,$t1,$t1
+ _ror $acc7,24,$acc7
+ xor $acc5,$t1,$t1
+ _ror $acc9,8,$acc9
+ xor $acc6,$t1,$t1
+ _ror $acc10,16,$acc10
+ xor $acc7,$t1,$t1
+ _ror $acc11,24,$acc11
+ xor $acc8,$t2,$t2
+ _ror $acc13,8,$acc13
+ xor $acc9,$t2,$t2
+ _ror $acc14,16,$acc14
+ xor $acc10,$t2,$t2
+ _ror $acc15,24,$acc15
+ xor $acc11,$t2,$t2
+ xor $acc12,$acc14,$acc14
+ xor $acc13,$t3,$t3
+ _srm $t0,24,$acc0
+ xor $acc14,$t3,$t3
+ xor $acc15,$t3,$t3
+ _srm $t3,16,$acc1
+
+ _srm $t2,8,$acc2
+ ldbx $acc0($rounds),$acc0
+ _srm $t1,24,$acc4
+ ldbx $acc1($rounds),$acc1
+ _srm $t0,16,$acc5
+ _srm $t1,0,$acc3
+ ldbx $acc2($rounds),$acc2
+ ldbx $acc3($rounds),$acc3
+ _srm $t3,8,$acc6
+ ldbx $acc4($rounds),$acc4
+ _srm $t2,24,$acc8
+ ldbx $acc5($rounds),$acc5
+ _srm $t1,16,$acc9
+ _srm $t2,0,$acc7
+ ldbx $acc6($rounds),$acc6
+ ldbx $acc7($rounds),$acc7
+ _srm $t0,8,$acc10
+ ldbx $acc8($rounds),$acc8
+ _srm $t3,24,$acc12
+ ldbx $acc9($rounds),$acc9
+ _srm $t2,16,$acc13
+ _srm $t3,0,$acc11
+ ldbx $acc10($rounds),$acc10
+ _srm $t1,8,$acc14
+ ldbx $acc11($rounds),$acc11
+ ldbx $acc12($rounds),$acc12
+ ldbx $acc13($rounds),$acc13
+ _srm $t0,0,$acc15
+ ldbx $acc14($rounds),$acc14
+
+ dep $acc0,7,8,$acc3
+ ldbx $acc15($rounds),$acc15
+ dep $acc4,7,8,$acc7
+ dep $acc1,15,8,$acc3
+ dep $acc5,15,8,$acc7
+ dep $acc2,23,8,$acc3
+ dep $acc6,23,8,$acc7
+ xor $acc3,$s0,$s0
+ xor $acc7,$s1,$s1
+ dep $acc8,7,8,$acc11
+ dep $acc12,7,8,$acc15
+ dep $acc9,15,8,$acc11
+ dep $acc13,15,8,$acc15
+ dep $acc10,23,8,$acc11
+ dep $acc14,23,8,$acc15
+ xor $acc11,$s2,$s2
+
+ bv (%r31)
+ .EXIT
+ xor $acc15,$s3,$s3
+ .PROCEND
+
+ .ALIGN 64
+L\$AES_Td
+ .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+ .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+ .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+ .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+ .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+ .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+ .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+ .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+ .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+ .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+ .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+ .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+ .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+ .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+ .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+ .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+ .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+ .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+ .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+ .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+ .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+ .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+ .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+ .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+ .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+ .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+ .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+ .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+ .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+ .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+ .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+ .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+ .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+ .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+ .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+ .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+ .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+ .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+ .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+ .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+ .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+ .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+ .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+ .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+ .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+ .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+ .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+ .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+ .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+ .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+ .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+ .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+ .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+ .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+ .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+ .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+ .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+ .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+ .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+ .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+ .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+ .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+ .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+ .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+ .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+ .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+	# translate made-up instructions: _ror, _srm
+ s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
+
+ s/_srm(\s+%r[0-9]+),([0-9]+),/
+ $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
+ : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
+
+ s/,\*/,/ if ($SIZE_T==4);
+ s/\bbv\b(.*\(%r2\))/bve$1/ if ($SIZE_T==8);
+ print $_,"\n";
+}
+close STDOUT;
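The post-processing loop above is the whole story behind the made-up _ror and _srm mnemonics. A standalone sketch (hypothetical demo harness, not part of the patch) replays the same substitutions on two sample lines; with $SIZE_T==4, "_srm %r26,24,%r1" becomes "extru %r26,7,8,%r1" and "_ror %r20,8,%r20" becomes "shd %r20,%r20,8,%r20":

    #!/usr/bin/perl
    # Replays the _ror/_srm translation from the loop above.
    my $SIZE_T = 4;                              # assume a 32-bit build

    for my $line ("\t_ror\t%r20,8,%r20", "\t_srm\t%r26,24,%r1") {
        my $out = $line;
        $out =~ s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/  # rotate = double-shift
            or
        $out =~ s/_srm(\s+%r[0-9]+),([0-9]+),/       # shift right, mask 8 bits
            $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
                       : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
        print "$line  =>  $out\n";
    }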
diff --git a/main/openssl/crypto/aes/asm/aes-ppc.pl b/main/openssl/crypto/aes/asm/aes-ppc.pl
index f82c5e18..7c52cbe5 100644
--- a/main/openssl/crypto/aes/asm/aes-ppc.pl
+++ b/main/openssl/crypto/aes/asm/aes-ppc.pl
@@ -7,7 +7,7 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
-# Needs more work: key setup, page boundaries, CBC routine...
+# Needs more work: key setup, CBC routine...
#
# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
# 128-bit key, which is ~40% better than 64-bit code generated by gcc
@@ -18,7 +18,7 @@
# February 2010
#
-# Rescheduling instructions to favour Power6 pipeline gives 10%
+# Rescheduling instructions to favour Power6 pipeline gave 10%
# performance improvement on the platform in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process a byte in 18 cycles, only in 23, because it fails to issue
@@ -33,11 +33,13 @@ $flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
+ $LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
@@ -116,15 +118,19 @@ LAES_Te:
addi $Tbl0,$Tbl0,`128-8`
mtlr r0
blr
- .space `32-24`
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
LAES_Td:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
- addi $Tbl0,$Tbl0,`128-8-32+2048+256`
+ addi $Tbl0,$Tbl0,`128-64-8+2048+256`
mtlr r0
blr
- .space `128-32-24`
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `128-64-9*4`
___
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@ $code.=<<___;
.globl .AES_encrypt
.align 7
.AES_encrypt:
- mflr r0
$STU $sp,-$FRAME($sp)
+ mflr r0
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,7 +357,14 @@ $code.=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Lenc_unaligned
+Lenc_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@@ -363,8 +375,80 @@ $code.=<<___;
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+ b Lenc_done
+
+Lenc_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Lenc_xpage
+ andi. $t1,$t1,4096-16
+ bne Lenc_unaligned_ok
+
+Lenc_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Te
+ bl Lppc_AES_encrypt_compact
+
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
+Lenc_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -388,18 +472,21 @@ $code.=<<___;
mtlr r0
addi $sp,$sp,$FRAME
blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
.align 5
Lppc_AES_encrypt:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
addi $acc00,$acc00,-1
+ lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@@ -413,44 +500,44 @@ Lenc_loop:
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc04,$s1,`32-16+3`,21,28
+ lwz $t1,4($key)
rlwinm $acc05,$s2,`32-16+3`,21,28
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc06,$s3,`32-16+3`,21,28
+ lwz $t3,12($key)
rlwinm $acc07,$s0,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
- lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
- lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
- lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
- lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@@ -466,60 +553,60 @@ Lenc_loop:
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
- lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s1,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s2,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc06,$s3,`32-16`,24,31
+ lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc07,$s0,`32-16`,24,31
lwz $acc12,`2048+128`($Tbl0)
- lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lwz $acc14,`2048+192`($Tbl0)
- lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc10,$s0,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc00,$Tbl2,$acc00
- lbzx $acc01,$Tbl2,$acc01
rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc01,$Tbl2,$acc01
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $s1,$acc01,24,0,7
lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $s3,$acc03,24,0,7
lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc09,$Tbl2,$acc09
rlwimi $s1,$acc05,16,8,15
lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc11,$Tbl2,$acc11
rlwimi $s3,$acc07,16,8,15
lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc08,8,16,23
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc09,8,16,23
lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc10,8,16,23
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc11,8,16,23
or $s0,$s0,$acc12
or $s1,$s1,$acc13
@@ -530,29 +617,31 @@ Lenc_loop:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_encrypt_compact:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
lis $mask80,0x8080
+ lwz $t1,4($key)
lis $mask1b,0x1b1b
- addi $key,$key,16
+ lwz $t2,8($key)
ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
mtctr $acc00
.align 4
Lenc_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s1,`32-16`,24,31
@@ -560,48 +649,48 @@ Lenc_compact_loop:
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
- lwz $t1,4($key)
or $s0,$s0,$acc12
+ lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
- lwz $t3,12($key)
or $s2,$s2,$acc14
+ lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@@ -612,12 +701,12 @@ Lenc_compact_loop:
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
- srwi $acc05,$acc01,7
- srwi $acc06,$acc02,7
- srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@@ -633,32 +722,32 @@ Lenc_compact_loop:
and $acc03,$acc03,$mask1b
xor $acc00,$acc00,$acc08 # r2
xor $acc01,$acc01,$acc09
+ rotlwi $acc12,$s0,16 # ROTATE(r0,16)
xor $acc02,$acc02,$acc10
+ rotlwi $acc13,$s1,16
xor $acc03,$acc03,$acc11
+ rotlwi $acc14,$s2,16
- rotlwi $acc12,$s0,16 # ROTATE(r0,16)
- rotlwi $acc13,$s1,16
- rotlwi $acc14,$s2,16
- rotlwi $acc15,$s3,16
xor $s0,$s0,$acc00 # r0^r2
+ rotlwi $acc15,$s3,16
xor $s1,$s1,$acc01
- xor $s2,$s2,$acc02
- xor $s3,$s3,$acc03
rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
+ xor $s2,$s2,$acc02
rotrwi $s1,$s1,24
+ xor $s3,$s3,$acc03
rotrwi $s2,$s2,24
- rotrwi $s3,$s3,24
xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
+ rotrwi $s3,$s3,24
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
- rotlwi $acc09,$acc13,8
- rotlwi $acc10,$acc14,8
- rotlwi $acc11,$acc15,8
xor $s0,$s0,$acc12 #
+ rotlwi $acc09,$acc13,8
xor $s1,$s1,$acc13
+ rotlwi $acc10,$acc14,8
xor $s2,$s2,$acc14
+ rotlwi $acc11,$acc15,8
xor $s3,$s3,$acc15
xor $s0,$s0,$acc08 #
xor $s1,$s1,$acc09
@@ -673,14 +762,15 @@ Lenc_compact_done:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .AES_decrypt
.align 7
.AES_decrypt:
- mflr r0
$STU $sp,-$FRAME($sp)
+ mflr r0
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,7 +791,14 @@ Lenc_compact_done:
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+ andi. $t0,$inp,3
+ andi. $t1,$out,3
+ or. $t0,$t0,$t1
+ bne Ldec_unaligned
+
+Ldec_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@@ -712,8 +809,80 @@ Lenc_compact_done:
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
+ b Ldec_done
+
+Ldec_unaligned:
+ subfic $t0,$inp,4096
+ subfic $t1,$out,4096
+ andi. $t0,$t0,4096-16
+ beq Ldec_xpage
+ andi. $t1,$t1,4096-16
+ bne Ldec_unaligned_ok
+
+Ldec_xpage:
+ lbz $acc00,0($inp)
+ lbz $acc01,1($inp)
+ lbz $acc02,2($inp)
+ lbz $s0,3($inp)
+ lbz $acc04,4($inp)
+ lbz $acc05,5($inp)
+ lbz $acc06,6($inp)
+ lbz $s1,7($inp)
+ lbz $acc08,8($inp)
+ lbz $acc09,9($inp)
+ lbz $acc10,10($inp)
+ insrwi $s0,$acc00,8,0
+ lbz $s2,11($inp)
+ insrwi $s1,$acc04,8,0
+ lbz $acc12,12($inp)
+ insrwi $s0,$acc01,8,8
+ lbz $acc13,13($inp)
+ insrwi $s1,$acc05,8,8
+ lbz $acc14,14($inp)
+ insrwi $s0,$acc02,8,16
+ lbz $s3,15($inp)
+ insrwi $s1,$acc06,8,16
+ insrwi $s2,$acc08,8,0
+ insrwi $s3,$acc12,8,0
+ insrwi $s2,$acc09,8,8
+ insrwi $s3,$acc13,8,8
+ insrwi $s2,$acc10,8,16
+ insrwi $s3,$acc14,8,16
+
+ bl LAES_Td
+ bl Lppc_AES_decrypt_compact
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
+ extrwi $acc00,$s0,8,0
+ extrwi $acc01,$s0,8,8
+ stb $acc00,0($out)
+ extrwi $acc02,$s0,8,16
+ stb $acc01,1($out)
+ stb $acc02,2($out)
+ extrwi $acc04,$s1,8,0
+ stb $s0,3($out)
+ extrwi $acc05,$s1,8,8
+ stb $acc04,4($out)
+ extrwi $acc06,$s1,8,16
+ stb $acc05,5($out)
+ stb $acc06,6($out)
+ extrwi $acc08,$s2,8,0
+ stb $s1,7($out)
+ extrwi $acc09,$s2,8,8
+ stb $acc08,8($out)
+ extrwi $acc10,$s2,8,16
+ stb $acc09,9($out)
+ stb $acc10,10($out)
+ extrwi $acc12,$s3,8,0
+ stb $s2,11($out)
+ extrwi $acc13,$s3,8,8
+ stb $acc12,12($out)
+ extrwi $acc14,$s3,8,16
+ stb $acc13,13($out)
+ stb $acc14,14($out)
+ stb $s3,15($out)
+
+Ldec_done:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,18 +906,21 @@ Lenc_compact_done:
mtlr r0
addi $sp,$sp,$FRAME
blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
.align 5
Lppc_AES_decrypt:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
+ lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
+ lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
+ lwz $t2,8($key)
addi $acc00,$acc00,-1
+ lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@@ -762,44 +934,44 @@ Ldec_loop:
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc04,$s3,`32-16+3`,21,28
+ lwz $t1,4($key)
rlwinm $acc05,$s0,`32-16+3`,21,28
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc06,$s1,`32-16+3`,21,28
+ lwz $t3,12($key)
rlwinm $acc07,$s2,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
- lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
+ lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
- lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0+3`,21,28
+ lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0+3`,21,28
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
- lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
+ lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
+ lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
- lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
+ lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
+ lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@@ -815,56 +987,56 @@ Ldec_loop:
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
- lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
+ lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
+ lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
- lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s3,`32-16`,24,31
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s0,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
lbzx $acc00,$Tbl2,$acc00
+ lwz $acc11,`2048+96`($Tbl0)
lbzx $acc01,$Tbl2,$acc01
lwz $acc12,`2048+128`($Tbl0)
- lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc06,$s1,`32-16`,24,31
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc07,$s2,`32-16`,24,31
lwz $acc14,`2048+192`($Tbl0)
- lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl2,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl2,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
@@ -879,20 +1051,22 @@ Ldec_loop:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_decrypt_compact:
lwz $acc00,240($key)
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
+ lwz $t0,0($key)
lis $mask80,0x8080
+ lwz $t1,4($key)
lis $mask1b,0x1b1b
- addi $key,$key,16
+ lwz $t2,8($key)
ori $mask80,$mask80,0x8080
+ lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
+ addi $key,$key,16
___
$code.=<<___ if ($SIZE_T==8);
insrdi $mask80,$mask80,32,0
@@ -904,10 +1078,10 @@ $code.=<<___;
Ldec_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
+ xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
+ xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s3,`32-16`,24,31
@@ -915,48 +1089,48 @@ Ldec_compact_loop:
rlwinm $acc06,$s1,`32-16`,24,31
rlwinm $acc07,$s2,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
+ lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0`,24,31
+ lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0`,24,31
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
+ lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
+ lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
- lwz $t1,4($key)
or $s0,$s0,$acc12
+ lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
- lwz $t3,12($key)
or $s2,$s2,$acc14
+ lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4);
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
- srwi $acc05,$acc01,7
- srwi $acc06,$acc02,7
- srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
+ srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
+ srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4);
and $acc06,$acc02,$mask80
and $acc07,$acc03,$mask80
srwi $acc08,$acc04,7 # r1>>7
- srwi $acc09,$acc05,7
- srwi $acc10,$acc06,7
- srwi $acc11,$acc07,7
andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
+ srwi $acc09,$acc05,7
andc $acc13,$acc01,$mask80
+ srwi $acc10,$acc06,7
andc $acc14,$acc02,$mask80
+ srwi $acc11,$acc07,7
andc $acc15,$acc03,$mask80
sub $acc04,$acc04,$acc08 # r1-(r1>>7)
sub $acc05,$acc05,$acc09
@@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4);
and $acc08,$acc04,$mask80 # r1=r4&0x80808080
and $acc09,$acc05,$mask80
- and $acc10,$acc06,$mask80
- and $acc11,$acc07,$mask80
srwi $acc12,$acc08,7 # r1>>7
+ and $acc10,$acc06,$mask80
srwi $acc13,$acc09,7
+ and $acc11,$acc07,$mask80
srwi $acc14,$acc10,7
- srwi $acc15,$acc11,7
sub $acc08,$acc08,$acc12 # r1-(r1>>7)
+ srwi $acc15,$acc11,7
sub $acc09,$acc09,$acc13
sub $acc10,$acc10,$acc14
sub $acc11,$acc11,$acc15
@@ -1124,10 +1298,10 @@ ___
$code.=<<___;
rotrwi $s0,$s0,8 # = ROTATE(r0,8)
rotrwi $s1,$s1,8
- rotrwi $s2,$s2,8
- rotrwi $s3,$s3,8
xor $s0,$s0,$acc00 # ^= r2^r0
+ rotrwi $s2,$s2,8
xor $s1,$s1,$acc01
+ rotrwi $s3,$s3,8
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
xor $acc00,$acc00,$acc08
@@ -1135,32 +1309,32 @@ $code.=<<___;
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
xor $s0,$s0,$acc04 # ^= r4^r0
- xor $s1,$s1,$acc05
- xor $s2,$s2,$acc06
- xor $s3,$s3,$acc07
rotrwi $acc00,$acc00,24
+ xor $s1,$s1,$acc05
rotrwi $acc01,$acc01,24
+ xor $s2,$s2,$acc06
rotrwi $acc02,$acc02,24
+ xor $s3,$s3,$acc07
rotrwi $acc03,$acc03,24
xor $acc04,$acc04,$acc08
xor $acc05,$acc05,$acc09
xor $acc06,$acc06,$acc10
xor $acc07,$acc07,$acc11
xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
- xor $s1,$s1,$acc09
- xor $s2,$s2,$acc10
- xor $s3,$s3,$acc11
rotrwi $acc04,$acc04,16
+ xor $s1,$s1,$acc09
rotrwi $acc05,$acc05,16
+ xor $s2,$s2,$acc10
rotrwi $acc06,$acc06,16
+ xor $s3,$s3,$acc11
rotrwi $acc07,$acc07,16
xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
- xor $s1,$s1,$acc01
- xor $s2,$s2,$acc02
- xor $s3,$s3,$acc03
rotrwi $acc08,$acc08,8
+ xor $s1,$s1,$acc01
rotrwi $acc09,$acc09,8
+ xor $s2,$s2,$acc02
rotrwi $acc10,$acc10,8
+ xor $s3,$s3,$acc03
rotrwi $acc11,$acc11,8
xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
xor $s1,$s1,$acc05
@@ -1179,7 +1353,9 @@ Ldec_compact_done:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
-.long 0
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
.align 7
___
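A gloss on the new Lenc_unaligned/Ldec_unaligned logic in this hunk: unaligned word accesses are tolerable on PPC as long as they stay within one page, so when $inp or $out is not word-aligned the subfic/andi. pair only diverts to the byte-wise Lenc_xpage path (lbz/insrwi gather, extrwi/stb scatter) if fewer than 16 bytes remain before the next 4KB boundary. A minimal sketch of that predicate, with addresses as plain integers (illustrative only):

    # Mirrors: subfic t0,addr,4096 ; andi. t0,t0,4096-16 ; beq xpage
    sub may_cross_4k_page {
        my ($addr) = @_;                 # start of a 16-byte block
        # true iff fewer than 16 bytes remain before the page boundary;
        # a page-aligned address also satisfies it, which is a harmless
        # over-approximation since the byte-wise path works everywhere
        return ((4096 - $addr) & (4096 - 16) & 0xffff) == 0;
    }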
diff --git a/main/openssl/crypto/aes/asm/aes-s390x.pl b/main/openssl/crypto/aes/asm/aes-s390x.pl
index 7e018892..e75dcd03 100644
--- a/main/openssl/crypto/aes/asm/aes-s390x.pl
+++ b/main/openssl/crypto/aes/asm/aes-s390x.pl
@@ -44,12 +44,57 @@
# Unlike previous version hardware support detection takes place only
# at the moment of key schedule setup, which is denoted in key->rounds.
# This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives incredible performance improvement,
# it was measured to be ~6.6x. It's less than previously mentioned 8x,
# because software implementation was optimized.
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over the "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that the exact throughput depends on the current stack
+# frame alignment within a 4KB page. In the worst case you get ~75% of
+# the maximum, but *on average* it is as much as ~98%, meaning that the
+# worst case is unlikely; it's like hitting a ravine on a plateau.
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+# December 2010.
+#
+# Add support for z196 "cipher message with counter" instruction.
+# Note however that it's disengaged, because it was measured to
+# perform ~12% worse than vanilla km-based code...
+
+# February 2011.
+#
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement at 8KB block size over
+# vanilla km-based code, and ~37% at the more common 512-byte block size.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
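This flavour block is what keeps the rest of the file ABI-neutral: $g is spliced into mnemonics (st${g}, lm${g}, sl${g}r and so on) so one template assembles 32-bit operations for the -m31 build and 64-bit ones otherwise, while $SIZE_T scales stack-slot offsets. A toy demonstration (standalone, not part of the build):

    # One template line, two ABIs: $g and $SIZE_T do all the work.
    for my $flavour ("linux32", "linux64") {
        my ($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");
        print "$flavour:  st${g} %r3,", 3*$SIZE_T, "(%r15)\n";
    }
    # prints:  linux32:  st %r3,12(%r15)
    #          linux64:  stg %r3,24(%r15)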
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$softonly=0; # allow hardware support
$t0="%r0"; $mask="%r0";
@@ -69,6 +114,8 @@ $rounds="%r13";
$ra="%r14";
$sp="%r15";
+$stdframe=16*$SIZE_T+4*8;
+
sub _data_word()
{ my $i;
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly);
.Lesoft:
___
$code.=<<___;
- stmg %r3,$ra,24($sp)
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
@@ -220,20 +267,20 @@ $code.=<<___;
larl $tbl,AES_Te
bras $ra,_s390x_AES_encrypt
- lg $out,24($sp)
+ l${g} $out,3*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
st $s3,12($out)
- lmg %r6,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
br $ra
.size AES_encrypt,.-AES_encrypt
.type _s390x_AES_encrypt,\@function
.align 16
_s390x_AES_encrypt:
- stg $ra,152($sp)
+ st${g} $ra,15*$SIZE_T($sp)
x $s0,0($key)
x $s1,4($key)
x $s2,8($key)
@@ -397,7 +444,7 @@ _s390x_AES_encrypt:
or $s2,$i3
or $s3,$t3
- lg $ra,152($sp)
+ l${g} $ra,15*$SIZE_T($sp)
xr $s0,$t0
xr $s1,$t2
x $s2,24($key)
@@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly);
.Ldsoft:
___
$code.=<<___;
- stmg %r3,$ra,24($sp)
+ stm${g} %r3,$ra,3*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
@@ -546,20 +593,20 @@ $code.=<<___;
larl $tbl,AES_Td
bras $ra,_s390x_AES_decrypt
- lg $out,24($sp)
+ l${g} $out,3*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
st $s3,12($out)
- lmg %r6,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
br $ra
.size AES_decrypt,.-AES_decrypt
.type _s390x_AES_decrypt,\@function
.align 16
_s390x_AES_decrypt:
- stg $ra,152($sp)
+ st${g} $ra,15*$SIZE_T($sp)
x $s0,0($key)
x $s1,4($key)
x $s2,8($key)
@@ -703,7 +750,7 @@ _s390x_AES_decrypt:
nr $i1,$mask
nr $i2,$mask
- lg $ra,152($sp)
+ l${g} $ra,15*$SIZE_T($sp)
or $s1,$t1
l $t0,16($key)
l $t1,20($key)
@@ -732,14 +779,15 @@ ___
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,\@function
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,\@function
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_s390x_AES_set_encrypt_key:
lghi $t0,0
- clgr $inp,$t0
+ cl${g}r $inp,$t0
je .Lminus1
- clgr $key,$t0
+ cl${g}r $key,$t0
je .Lminus1
lghi $t0,128
@@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly);
je 1f
lg %r1,24($inp)
stg %r1,24($key)
-1: st $bits,236($key) # save bits
+1: st $bits,236($key) # save bits [for debugging purposes]
+ lgr $t0,%r5
st %r5,240($key) # save km code
lghi %r2,0
br %r14
@@ -797,7 +846,7 @@ ___
$code.=<<___;
.align 16
.Lekey_internal:
- stmg %r6,%r13,48($sp) # all non-volatile regs
+ stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
larl $tbl,AES_Te+2048
@@ -857,8 +906,9 @@ $code.=<<___;
la $key,16($key) # key+=4
la $t3,4($t3) # i++
brct $rounds,.L128_loop
+ lghi $t0,10
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -905,8 +955,9 @@ $code.=<<___;
st $s2,32($key)
st $s3,36($key)
brct $rounds,.L192_continue
+ lghi $t0,12
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -967,8 +1018,9 @@ $code.=<<___;
st $s2,40($key)
st $s3,44($key)
brct $rounds,.L256_continue
+ lghi $t0,14
lghi %r2,0
- lmg %r6,%r13,48($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
@@ -1011,42 +1063,34 @@ $code.=<<___;
.Lminus1:
lghi %r2,-1
br $ra
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
# void AES_set_decrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,\@function
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,\@function
.align 16
-AES_set_decrypt_key:
- stg $key,32($sp) # I rely on AES_set_encrypt_key to
- stg $ra,112($sp) # save non-volatile registers!
- bras $ra,AES_set_encrypt_key
- lg $key,32($sp)
- lg $ra,112($sp)
+private_AES_set_decrypt_key:
+ #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
+ st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
+ bras $ra,_s390x_AES_set_encrypt_key
+ #l${g} $key,4*$SIZE_T($sp)
+ l${g} $ra,14*$SIZE_T($sp)
ltgr %r2,%r2
bnzr $ra
___
$code.=<<___ if (!$softonly);
- l $t0,240($key)
+ #l $t0,240($key)
lhi $t1,16
cr $t0,$t1
jl .Lgo
oill $t0,0x80 # set "decrypt" bit
st $t0,240($key)
br $ra
-
-.align 16
-.Ldkey_internal:
- stg $key,32($sp)
- stg $ra,40($sp)
- bras $ra,.Lekey_internal
- lg $key,32($sp)
- lg $ra,40($sp)
___
$code.=<<___;
-
-.Lgo: llgf $rounds,240($key)
+.align 16
+.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
la $i1,0($key)
sllg $i2,$rounds,4
la $i2,0($i2,$key)
@@ -1123,13 +1167,14 @@ $code.=<<___;
la $key,4($key)
brct $rounds,.Lmix
- lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+ lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
lghi %r2,0
br $ra
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
-#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+########################################################################
+# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivec, const int enc)
{
@@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly);
l %r0,240($key) # load kmc code
lghi $key,15 # res=len%16, len-=res;
ngr $key,$len
- slgr $len,$key
+ sl${g}r $len,$key
la %r1,16($sp) # parameter block - ivec || key
jz .Lkmc_truncated
.long 0xb92f0042 # kmc %r4,%r2
@@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly);
tmll %r0,0x80
jnz .Lkmc_truncated_dec
lghi %r1,0
- stg %r1,128($sp)
- stg %r1,136($sp)
+ stg %r1,16*$SIZE_T($sp)
+ stg %r1,16*$SIZE_T+8($sp)
bras %r1,1f
- mvc 128(1,$sp),0($inp)
+ mvc 16*$SIZE_T(1,$sp),0($inp)
1: ex $key,0(%r1)
la %r1,16($sp) # restore parameter block
- la $inp,128($sp)
+ la $inp,16*$SIZE_T($sp)
lghi $len,16
.long 0xb92f0042 # kmc %r4,%r2
j .Lkmc_done
.align 16
.Lkmc_truncated_dec:
- stg $out,64($sp)
- la $out,128($sp)
+ st${g} $out,4*$SIZE_T($sp)
+ la $out,16*$SIZE_T($sp)
lghi $len,16
.long 0xb92f0042 # kmc %r4,%r2
- lg $out,64($sp)
+ l${g} $out,4*$SIZE_T($sp)
bras %r1,2f
- mvc 0(1,$out),128($sp)
+ mvc 0(1,$out),16*$SIZE_T($sp)
2: ex $key,0(%r1)
j .Lkmc_done
.align 16
.Lcbc_software:
___
$code.=<<___;
- stmg $key,$ra,40($sp)
+ stm${g} $key,$ra,5*$SIZE_T($sp)
lhi %r0,0
- cl %r0,164($sp)
+ cl %r0,`$stdframe+$SIZE_T-4`($sp)
je .Lcbc_decrypt
larl $tbl,AES_Te
@@ -1219,10 +1264,10 @@ $code.=<<___;
llgf $s3,12($ivp)
lghi $t0,16
- slgr $len,$t0
+ sl${g}r $len,$t0
brc 4,.Lcbc_enc_tail # if borrow
.Lcbc_enc_loop:
- stmg $inp,$out,16($sp)
+ stm${g} $inp,$out,2*$SIZE_T($sp)
x $s0,0($inp)
x $s1,4($inp)
x $s2,8($inp)
@@ -1231,7 +1276,7 @@ $code.=<<___;
bras $ra,_s390x_AES_encrypt
- lmg $inp,$key,16($sp)
+ lm${g} $inp,$key,2*$SIZE_T($sp)
st $s0,0($out)
st $s1,4($out)
st $s2,8($out)
@@ -1240,33 +1285,33 @@ $code.=<<___;
la $inp,16($inp)
la $out,16($out)
lghi $t0,16
- ltgr $len,$len
+ lt${g}r $len,$len
jz .Lcbc_enc_done
- slgr $len,$t0
+ sl${g}r $len,$t0
brc 4,.Lcbc_enc_tail # if borrow
j .Lcbc_enc_loop
.align 16
.Lcbc_enc_done:
- lg $ivp,48($sp)
+ l${g} $ivp,6*$SIZE_T($sp)
st $s0,0($ivp)
st $s1,4($ivp)
st $s2,8($ivp)
st $s3,12($ivp)
- lmg %r7,$ra,56($sp)
+ lm${g} %r7,$ra,7*$SIZE_T($sp)
br $ra
.align 16
.Lcbc_enc_tail:
aghi $len,15
lghi $t0,0
- stg $t0,128($sp)
- stg $t0,136($sp)
+ stg $t0,16*$SIZE_T($sp)
+ stg $t0,16*$SIZE_T+8($sp)
bras $t1,3f
- mvc 128(1,$sp),0($inp)
+ mvc 16*$SIZE_T(1,$sp),0($inp)
3: ex $len,0($t1)
lghi $len,0
- la $inp,128($sp)
+ la $inp,16*$SIZE_T($sp)
j .Lcbc_enc_loop
.align 16
@@ -1275,10 +1320,10 @@ $code.=<<___;
lg $t0,0($ivp)
lg $t1,8($ivp)
- stmg $t0,$t1,128($sp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
.Lcbc_dec_loop:
- stmg $inp,$out,16($sp)
+ stm${g} $inp,$out,2*$SIZE_T($sp)
llgf $s0,0($inp)
llgf $s1,4($inp)
llgf $s2,8($inp)
@@ -1287,7 +1332,7 @@ $code.=<<___;
bras $ra,_s390x_AES_decrypt
- lmg $inp,$key,16($sp)
+ lm${g} $inp,$key,2*$SIZE_T($sp)
sllg $s0,$s0,32
sllg $s2,$s2,32
lr $s0,$s1
@@ -1295,15 +1340,15 @@ $code.=<<___;
lg $t0,0($inp)
lg $t1,8($inp)
- xg $s0,128($sp)
- xg $s2,136($sp)
+ xg $s0,16*$SIZE_T($sp)
+ xg $s2,16*$SIZE_T+8($sp)
lghi $s1,16
- slgr $len,$s1
+ sl${g}r $len,$s1
brc 4,.Lcbc_dec_tail # if borrow
brc 2,.Lcbc_dec_done # if zero
stg $s0,0($out)
stg $s2,8($out)
- stmg $t0,$t1,128($sp)
+ stmg $t0,$t1,16*$SIZE_T($sp)
la $inp,16($inp)
la $out,16($out)
@@ -1313,7 +1358,7 @@ $code.=<<___;
stg $s0,0($out)
stg $s2,8($out)
.Lcbc_dec_exit:
- lmg $ivp,$ra,48($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
stmg $t0,$t1,0($ivp)
br $ra
@@ -1321,19 +1366,872 @@ $code.=<<___;
.align 16
.Lcbc_dec_tail:
aghi $len,15
- stg $s0,128($sp)
- stg $s2,136($sp)
+ stg $s0,16*$SIZE_T($sp)
+ stg $s2,16*$SIZE_T+8($sp)
bras $s1,4f
- mvc 0(1,$out),128($sp)
+ mvc 0(1,$out),16*$SIZE_T($sp)
4: ex $len,0($s1)
j .Lcbc_dec_exit
.size AES_cbc_encrypt,.-AES_cbc_encrypt
-.comm OPENSSL_s390xcap_P,8,8
+___
+}
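One detail of the truncated-tail paths above: MVC encodes its length as an immediate, so "bras %r1,1f" captures the address of a one-byte MVC template and "ex $key,0(%r1)" executes it with the length field patched from the residue at run time. For encryption the tail is first zero-padded to a full block on the stack and then run through one more 16-byte KMC pass. A semantic sketch; kmc_block() is a hypothetical stand-in for that final hardware call:

    # What the EXECUTE'd MVC achieves in .Lcbc_enc_tail: the len%16
    # trailing bytes land in a zeroed 16-byte stack buffer, which is
    # then processed as one more CBC block.
    sub cbc_enc_tail {
        my ($tail, $kmc_block) = @_;     # $tail: 1..15 trailing bytes
        my $buf = $tail . ("\0" x (16 - length $tail));  # zero-pad
        return $kmc_block->($buf);       # one more 16-byte CBC step
    }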
+########################################################################
+# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+# size_t blocks, const AES_KEY *key,
+# const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r4"; # blocks and out are swapped
+my $len="%r3";
+my $key="%r5"; my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl AES_ctr32_encrypt
+.type AES_ctr32_encrypt,\@function
+.align 16
+AES_ctr32_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+ llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
+___
+$code.=<<___ if (!$softonly);
+ l %r0,240($key)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lctr32_software
+
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+ la %r1,0($key) # %r1 is permanent copy of $key
+ lg $iv0,0($ivp) # load ivec
+ lg $ivp,8($ivp)
+
+	# prepare and allocate stack frame at the top of a 4K page
+	# with 1K reserved for possible signal handling
+	lghi	$s0,-1024-256-16	# guarantee at least a 256-byte buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi needs the extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+	# buffer size is at least 256 and at most 3072+256-16 bytes
+
+ la $sp,1024($s0) # alloca
+ srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
+ st${g} $s2,0($sp) # back-chain
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lctr32_hw_switch # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lctr32_hw_switch:
+___
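The frame gymnastics above position the scratch buffer of counter blocks at a fixed offset within a 4KB page (the "stack frame alignment" caveat in the header comment comes from exactly this), keeping 1KB+16 bytes in reserve below the new stack pointer for signal handlers. A sketch of the pointer arithmetic, addresses as plain integers (illustrative only):

    # Mirrors the lghi/algr/ngr/slgr sequence above.
    sub ctr32_scratch {
        my ($sp) = @_;                   # incoming stack pointer
        my $base  = ($sp - 1024 - 256 - 16) & ~4095; # round down to page
        my $avail = ($sp - $base) - (1024 + 16);     # usable buffer size
        my $newsp = $base + 1024;        # alloca; data starts at 16(newsp)
        return ($newsp, $avail);         # $avail is at least 256 bytes
    }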
+$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
+ larl $s0,OPENSSL_s390xcap_P
+ lg $s0,8($s0)
+ tmhh $s0,0x0004 # check for message_security-assist-4
+ jz .Lctr32_km_loop
+
+ llgfr $s0,%r0
+ lgr $s1,%r1
+ lghi %r0,0
+ la %r1,16($sp)
+ .long 0xb92d2042 # kmctr %r4,%r2,%r2
+
+ llihh %r0,0x8000 # check if kmctr supports the function code
+ srlg %r0,%r0,0($s0)
+ ng %r0,16($sp)
+ lgr %r0,$s0
+ lgr %r1,$s1
+ jz .Lctr32_km_loop
+
+####### kmctr code
+ algr $out,$inp # restore $out
+ lgr $s1,$len # $s1 undertakes $len
+ j .Lctr32_kmctr_loop
+.align 16
+.Lctr32_kmctr_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_kmctr_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_kmctr_prepare
+
+ #la $inp,0($inp) # inp
+ sllg $len,$fp,4 # len
+ #la $out,0($out) # out
+ la $s2,16($sp) # iv
+ .long 0xb92da042 # kmctr $out,$s2,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+
+ slgr $s1,$fp
+ brc 1,.Lctr32_kmctr_loop # not zero, no borrow
+ algr $fp,$s1
+ lghi $s1,0
+ brc 4+1,.Lctr32_kmctr_loop # not zero
+
+ l${g} $sp,0($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+___
+$code.=<<___;
+.Lctr32_km_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_km_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_km_prepare
+
+ la $s0,16($sp) # inp
+ sllg $s1,$fp,4 # len
+ la $s2,16($sp) # out
+ .long 0xb92e00a8 # km %r10,%r8
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ lgr $s3,$fp
+ slgr $s2,$inp
+.Lctr32_km_xor:
+ lg $s0,0($inp)
+ lg $s1,8($inp)
+ xg $s0,0($s2,$inp)
+ xg $s1,8($s2,$inp)
+ stg $s0,0($out,$inp)
+ stg $s1,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lctr32_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lctr32_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lctr32_km_loop # not zero
+
+ l${g} $s0,0($sp)
+ l${g} $s1,$SIZE_T($sp)
+ la $s2,16($sp)
+.Lctr32_km_zap:
+ stg $s0,0($s2)
+ stg $s0,8($s2)
+ la $s2,16($s2)
+ brct $s1,.Lctr32_km_zap
+
+ la $sp,0($s0)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lctr32_software:
+___
+$code.=<<___;
+ stm${g} $key,$ra,5*$SIZE_T($sp)
+ sl${g}r $inp,$out
+ larl $tbl,AES_Te
+ llgf $t1,12($ivp)
+
+.Lctr32_loop:
+ stm${g} $inp,$out,2*$SIZE_T($sp)
+ llgf $s0,0($ivp)
+ llgf $s1,4($ivp)
+ llgf $s2,8($ivp)
+ lgr $s3,$t1
+ st $t1,16*$SIZE_T($sp)
+ lgr %r4,$key
+
+ bras $ra,_s390x_AES_encrypt
+
+ lm${g} $inp,$ivp,2*$SIZE_T($sp)
+ llgf $t1,16*$SIZE_T($sp)
+ x $s0,0($inp,$out)
+ x $s1,4($inp,$out)
+ x $s2,8($inp,$out)
+ x $s3,12($inp,$out)
+ stm $s0,$s3,0($out)
+
+ la $out,16($out)
+ ahi $t1,1 # 32-bit increment
+ brct $len,.Lctr32_loop
+
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
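As the "32-bit increment, preserves upper half" comments stress, AES_ctr32_encrypt only ever bumps the last 32-bit word of the counter block: the upper 96 bits of the IV pass through untouched and the low word wraps modulo 2^32. A reference sketch of that schedule (illustrative only):

    # Counter update shared by the km, kmctr and software paths above:
    # only IV word 3 changes, wrapping mod 2^32 without carrying upward.
    sub next_ctr32 {
        my @iv = @_;                          # IV as four 32-bit words
        $iv[3] = ($iv[3] + 1) & 0xffffffff;   # the ahi-style increment
        return @iv;
    }

    my @blk = (1, 2, 3, 0xffffffff);
    @blk = next_ctr32(@blk);                  # -> (1, 2, 3, 0)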
+
+########################################################################
+# void AES_xts_encrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+{
+my $inp="%r2";
+my $out="%r4"; # len and out are swapped
+my $len="%r3";
+my $key1="%r5"; # $i1
+my $key2="%r6"; # $i2
+my $fp="%r7"; # $i3
+my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
+
+$code.=<<___;
+.type _s390x_xts_km,\@function
+.align 16
+_s390x_xts_km:
+___
+$code.=<<___ if(1);
+ llgfr $s0,%r0 # put aside the function code
+ lghi $s1,0x7f
+ nr $s1,%r0
+ lghi %r0,0 # query capability vector
+ la %r1,$tweak-16($sp)
+ .long 0xb92e0042 # km %r4,%r2
+ llihh %r1,0x8000
+ srlg %r1,%r1,32($s1) # check for 32+function code
+ ng %r1,$tweak-16($sp)
+ lgr %r0,$s0 # restore the function code
+ la %r1,0($key1) # restore $key1
+ jz .Lxts_km_vanilla
+
+ lmg $i2,$i3,$tweak($sp) # put aside the tweak value
+ algr $out,$inp
+
+ oill %r0,32 # switch to xts function code
+ aghi $s1,-18 #
+ sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
+ la %r1,$tweak-16($sp)
+ slgr %r1,$s1 # parameter block position
+ lmg $s0,$s3,0($key1) # load 256 bits of key material,
+ stmg $s0,$s3,0(%r1) # and copy it to parameter block.
+ # yes, it contains junk and overlaps
+ # with the tweak in 128-bit case.
+ # it's done to avoid conditional
+ # branch.
+ stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
+
+ .long 0xb92e0042 # km %r4,%r2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ lrvg $s0,$tweak+0($sp) # load the last tweak
+ lrvg $s1,$tweak+8($sp)
+ stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
+
+ nill %r0,0xffdf # switch back to original function code
+ la %r1,0($key1) # restore pointer to $key1
+ slgr $out,$inp
+
+	llgc	$len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%=16
+ br $ra
+
+.align 16
+.Lxts_km_vanilla:
+___
+$code.=<<___;
+	# prepare and allocate stack frame at the top of a 4K page
+	# with 1K reserved for possible signal handling
+	lghi	$s0,-1024-256-16	# guarantee at least a 256-byte buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi needs the extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+	# buffer size is at least 256 and at most 3072+256-16 bytes
+
+ la $sp,1024($s0) # alloca
+ nill $fp,0xfff0 # round to 16*n
+ st${g} $s2,0($sp) # back-chain
+ nill $len,0xfff0 # redundant
+ st${g} $fp,$SIZE_T($sp)
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_go # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ st${g} $fp,$SIZE_T($sp)
+
+.Lxts_km_go:
+ lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
+ lrvg $s1,$tweak+8($s2)
+
+ la $s2,16($sp) # vector of ascending tweak values
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+ j .Lxts_km_start
+
+.Lxts_km_loop:
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_prepare:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+.Lxts_km_start:
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ stg $i1,0($s2,$inp)
+ stg $i2,8($s2,$inp)
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_prepare
+
+ slgr $inp,$fp # rewind $inp
+ la $s2,0($out,$inp)
+ lgr $s3,$fp
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ slgr $s2,$inp
+ srlg $s3,$fp,4
+.Lxts_km_xor:
+ lg $i1,0($out,$inp)
+ lg $i2,8($out,$inp)
+ xg $i1,0($s2,$inp)
+ xg $i2,8($s2,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lxts_km_xor
+
+ slgr $len,$fp
+ brc 1,.Lxts_km_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lxts_km_loop # not zero
+
+ l${g} $i1,0($sp) # back-chain
+ llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
+ la $i2,16($sp)
+ srlg $fp,$fp,4
+.Lxts_km_zap:
+ stg $i1,0($i2)
+ stg $i1,8($i2)
+ la $i2,16($i2)
+ brct $fp,.Lxts_km_zap
+
+ la $sp,0($i1)
+	llgc	$len,`2*$SIZE_T-1`($i1)
+ nill $len,0x0f # $len%=16
+ bzr $ra
+
+ # generate one more tweak...
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+
+ ltr $len,$len # clear zero flag
+ br $ra
+.size _s390x_xts_km,.-_s390x_xts_km
+
+.globl AES_xts_encrypt
+.type AES_xts_encrypt,\@function
+.align 16
+AES_xts_encrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
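Both _s390x_xts_km (the lghi 0x87 / srag / ngr / algr / alcgr / xgr run) and the software loop below advance the XTS tweak the same way: the byte-reversed tweak is treated as a 128-bit little-endian integer, doubled, and any bit that escapes the top is folded back in as the GF(2^128) reduction constant 0x87. A sketch with the tweak as two 64-bit limbs; assumes a Perl built with 64-bit integers (illustrative only):

    # Tweak *= x in GF(2^128); lo/hi are little-endian 64-bit limbs.
    sub xts_double_tweak {
        my ($lo, $hi) = @_;
        my $rem   = ($hi >> 63) ? 0x87 : 0;  # srag/ngr: broadcast top bit
        my $carry = ($lo >> 63) & 1;         # bit crossing between limbs
        $lo = (($lo << 1) & 0xffffffffffffffff) ^ $rem;    # algr + xgr
        $hi = (($hi << 1) & 0xffffffffffffffff) | $carry;  # alcgr
        return ($lo, $hi);
    }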
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+	srag	$len,$len,4		# formally wrong, because it sign-
+					# extends, but who can afford asking
+ # to process more than 2^63-1 bytes?
+ # I use it, because it sets condition
+ # code...
+ bcr 8,$ra # abort if zero (i.e. less than 16)
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_enc_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ sllg $len,$len,4 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed anymore
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+ bras $ra,_s390x_xts_km
+ jz .Lxts_enc_km_done
+
+ aghi $inp,-16 # take one step back
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_enc_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_km_steal
+
+ la $s2,0($i3)
+ lghi $s3,16
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($s2)
+ xg $i2,8($s2)
+ stg $i1,0($s2)
+ stg $i2,8($s2)
+ .long 0xb92e00aa # km $s2,$s2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+ xg $i1,0($i3)
+ xg $i2,8($i3)
+ stg $i1,0($i3)
+ stg $i2,8($i3)
+
+.Lxts_enc_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_enc_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ j .Lxts_enc_enter
+
+.align 16
+.Lxts_enc_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+ la $inp,16($inp) # $inp+=16
+.Lxts_enc_enter:
+ x $s0,0($inp) # ^=*($inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ brct${g} $len,.Lxts_enc_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_enc_done
+
+ la $i3,0($inp,$out) # put aside real $out
+.Lxts_enc_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_enc_steal
+ la $out,0($i3) # restore real $out
+
+ # generate last tweak...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+
+	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_encrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,`$tweak+0`($sp) # ^=tweak
+ x $s1,`$tweak+4`($sp)
+ x $s2,`$tweak+8`($sp)
+ x $s3,`$tweak+12`($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+
+.Lxts_enc_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+	stg	$sp,$tweak+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_encrypt,.-AES_xts_encrypt
+___
+# void AES_xts_decrypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
+$code.=<<___;
+.globl AES_xts_decrypt
+.type AES_xts_decrypt,\@function
+.align 16
+AES_xts_decrypt:
+ xgr %r3,%r4 # flip %r3 and %r4, $out and $len
+ xgr %r4,%r3
+ xgr %r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+ llgfr $len,$len
+___
+$code.=<<___;
+ st${g} $len,1*$SIZE_T($sp) # save copy of $len
+ aghi $len,-16
+	bcr	4,$ra			# abort if less than zero; formally
+					# wrong, because $len is unsigned,
+					# but who can afford to process
+					# more than 2^63-1 bytes?
+ tmll $len,0x0f
+ jnz .Lxts_dec_proceed
+ aghi $len,16
+.Lxts_dec_proceed:
+___
+$code.=<<___ if (!$softonly);
+ llgf %r0,240($key2)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lxts_dec_software
+
+ st${g} $ra,5*$SIZE_T($sp)
+ stm${g} %r6,$s3,6*$SIZE_T($sp)
+
+ nill $len,0xfff0 # $len&=~15
+ slgr $out,$inp
+
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
+ la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
+ lghi $s3,16
+ stmg $s0,$s1,0($s2)
+ la %r1,0($key2) # $key2 is not needed past this point
+ .long 0xb92e00aa # km $s2,$s2, generate the tweak
+ brc 1,.-4 # can this happen?
+
+ l %r0,240($key1)
+ la %r1,0($key1) # $key1 is not needed anymore
+
+ ltgr $len,$len
+ jz .Lxts_dec_km_short
+ bras $ra,_s390x_xts_km
+ jz .Lxts_dec_km_done
+
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+ j .Lxts_dec_km_2ndtweak
+
+.Lxts_dec_km_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%=16
+ lrvg $s0,$tweak+0($sp) # load the tweak
+ lrvg $s1,$tweak+8($sp)
+ lrvgr $s2,$s0 # make copy in reverse byte order
+ lrvgr $s3,$s1
+
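+	# XTS ciphertext stealing on decrypt runs the tweaks out of order:
+	# the last full block is processed with the *next* tweak, and the
+	# stolen partial block afterwards with the current one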
+.Lxts_dec_km_2ndtweak:
+ lghi $i1,0x87
+ srag $i2,$s1,63 # broadcast upper bit
+ ngr $i1,$i2 # rem
+ algr $s0,$s0
+ alcgr $s1,$s1
+ xgr $s0,$i1
+ lrvgr $i1,$s0 # flip byte order
+ lrvgr $i2,$s1
+
+ xg $i1,0($inp)
+ xg $i2,8($inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+ la $i2,0($out,$inp)
+ lghi $i3,16
+ .long 0xb92e0066 # km $i2,$i2
+ brc 1,.-4 # can this happen?
+ lrvgr $i1,$s0
+ lrvgr $i2,$s1
+ xg $i1,0($out,$inp)
+ xg $i2,8($out,$inp)
+ stg $i1,0($out,$inp)
+ stg $i2,8($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_km_steal:
+ llgc $i1,16($inp)
+ llgc $i2,0($out,$inp)
+ stc $i1,0($out,$inp)
+ stc $i2,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_km_steal
+
+ lgr $s0,$s2
+ lgr $s1,$s3
+ xg $s0,0($i3)
+ xg $s1,8($i3)
+ stg $s0,0($i3)
+ stg $s1,8($i3)
+ la $s0,0($i3)
+ lghi $s1,16
+ .long 0xb92e0088 # km $s0,$s0
+ brc 1,.-4 # can this happen?
+ xg $s2,0($i3)
+ xg $s3,8($i3)
+ stg $s2,0($i3)
+ stg $s3,8($i3)
+.Lxts_dec_km_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
+ lm${g} %r6,$s3,6*$SIZE_T($sp)
+ br $ra
+.align 16
+.Lxts_dec_software:
+___
+$code.=<<___;
+ stm${g} %r6,$ra,6*$SIZE_T($sp)
+
+ srlg $len,$len,4
+ slgr $out,$inp
+
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
+ stm${g} %r2,%r5,2*$SIZE_T($sp)
+ la $key,0($key2)
+ larl $tbl,AES_Te
+ bras $ra,_s390x_AES_encrypt # generate the tweak
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ larl $tbl,AES_Td
+ lt${g}r $len,$len
+ stm $s0,$s3,$tweak($sp) # save the tweak
+ jz .Lxts_dec_short
+ j .Lxts_dec_enter
+
+.align 16
+.Lxts_dec_loop:
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak+0($sp) # save the tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak+8($sp)
+ llgfr $s3,$s3
+.Lxts_dec_enter:
+ x $s0,0($inp) # tweak^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+ la $inp,16($inp)
+ brct${g} $len,.Lxts_dec_loop
+
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ jz .Lxts_dec_done
+
+ # generate pair of tweaks...
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $i2,$s1 # flip byte order
+ lrvgr $i3,$s3
+ stmg $i2,$i3,$tweak($sp) # save the 1st tweak
+ j .Lxts_dec_2ndtweak
+
+.align 16
+.Lxts_dec_short:
+ llgc $len,`2*$SIZE_T-1`($sp)
+ nill $len,0x0f # $len%16
+ lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
+ lrvg $s3,$tweak+8($sp)
+.Lxts_dec_2ndtweak:
+ lghi %r1,0x87
+ srag %r0,$s3,63 # broadcast upper bit
+ ngr %r1,%r0 # rem
+ algr $s1,$s1
+ alcgr $s3,$s3
+ xgr $s1,%r1
+ lrvgr $s1,$s1 # flip byte order
+ lrvgr $s3,$s3
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ stg $s1,$tweak-16+0($sp) # save the 2nd tweak
+ llgfr $s1,$s1
+ srlg $s2,$s3,32
+ stg $s3,$tweak-16+8($sp)
+ llgfr $s3,$s3
+
+ x $s0,0($inp) # tweak_the_2nd^=*(inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ stm${g} %r2,%r3,2*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ lm${g} %r2,%r5,2*$SIZE_T($sp)
+ x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
+ x $s1,$tweak-16+4($sp)
+ x $s2,$tweak-16+8($sp)
+ x $s3,$tweak-16+12($sp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+
+ la $i3,0($out,$inp) # put aside real $out
+.Lxts_dec_steal:
+ llgc %r0,16($inp)
+ llgc %r1,0($out,$inp)
+ stc %r0,0($out,$inp)
+ stc %r1,16($out,$inp)
+ la $inp,1($inp)
+ brct $len,.Lxts_dec_steal
+ la $out,0($i3) # restore real $out
+
+ lm $s0,$s3,$tweak($sp) # load the 1st tweak
+ x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
+ x $s1,4($out)
+ x $s2,8($out)
+ x $s3,12($out)
+ st${g} $out,4*$SIZE_T($sp)
+ la $key,0($key1)
+ bras $ra,_s390x_AES_decrypt
+ l${g} $out,4*$SIZE_T($sp)
+ x $s0,$tweak+0($sp) # ^=tweak
+ x $s1,$tweak+4($sp)
+ x $s2,$tweak+8($sp)
+ x $s3,$tweak+12($sp)
+ st $s0,0($out)
+ st $s1,4($out)
+ st $s2,8($out)
+ st $s3,12($out)
+ stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
+ stg $sp,$tweak-16+8($sp)
+.Lxts_dec_done:
+ stg $sp,$tweak+0($sp) # wipe tweak
+	stg	$sp,$tweak+8($sp)
+ lm${g} %r6,$ra,6*$SIZE_T($sp)
+ br $ra
+.size AES_xts_decrypt,.-AES_xts_decrypt
___
}
$code.=<<___;
.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
+close STDOUT; # force flush
diff --git a/main/openssl/crypto/aes/asm/aes-sparcv9.pl b/main/openssl/crypto/aes/asm/aes-sparcv9.pl
index c57b3a2d..403c4d12 100755
--- a/main/openssl/crypto/aes/asm/aes-sparcv9.pl
+++ b/main/openssl/crypto/aes/asm/aes-sparcv9.pl
@@ -1176,6 +1176,7 @@ ___
# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
# an undesired effect, so just omit them and sacrifice a fraction of a
# percent in performance...
-$code =~ s/fmovs.*$//gem;
+$code =~ s/fmovs.*$//gm;
print $code;
+close STDOUT; # ensure flush
diff --git a/main/openssl/crypto/aes/asm/aes-x86_64.S b/main/openssl/crypto/aes/asm/aes-x86_64.S
new file mode 100644
index 00000000..e385566f
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aes-x86_64.S
@@ -0,0 +1,2541 @@
+.text
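+# _x86_64_AES_encrypt: table-driven encryption of one 16-byte block held
+# as four 32-bit words in %eax,%ebx,%ecx,%edx; %r15 walks the key
+# schedule, %r14 points at the Te tables; %rsi,%rdi,%rbp,%r8,%r10-%r13
+# are clobbered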
+.type _x86_64_AES_encrypt,@function
+.align 16
+_x86_64_AES_encrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp .Lenc_loop
+.align 16
+.Lenc_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ shrl $16,%ecx
+ movzbl %ah,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movl 12(%r15),%edx
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rdi,8),%r12d
+ xorl 1(%r14,%rbp,8),%r8d
+
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz .Lenc_loop
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl 2(%r14,%rsi,8),%r10d
+ movzbl 2(%r14,%rdi,8),%r11d
+ movzbl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl 2(%r14,%rsi,8),%r8d
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $65280,%edi
+ andl $65280,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%ecx
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ shrl $16,%edx
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+
+ andl $65280,%esi
+ andl $65280,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $16711680,%edi
+ andl $16711680,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 2(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $4278190080,%edi
+ andl $4278190080,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movl 16+12(%r15),%edx
+ movl 2(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 16+0(%r15),%eax
+
+ andl $4278190080,%esi
+ andl $4278190080,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
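+	# 0xf3,0xc3 is "rep ret", a benign prefix that avoids the AMD
+	# branch-predictor penalty for a single-byte ret at a branch target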
+.byte 0xf3,0xc3
+.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
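+# the _compact variant trades speed for cache footprint: it uses only
+# the 256-byte Te4 S-box and computes MixColumns arithmetically, which
+# also reduces exposure to cache-timing attacks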
+.type _x86_64_AES_encrypt_compact,@function
+.align 16
+_x86_64_AES_encrypt_compact:
+ leaq 128(%r14),%r8
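+	# touch the 256-byte S-box at 32-byte stride to pull it into
+	# cache (prefetch Te4); the loaded values are discarded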
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+
+ movzbl %dl,%r8d
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl (%r14,%r8,1),%r8d
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl (%r14,%rdi,1),%r13d
+
+ movzbl %dh,%ebp
+ movzbl %ah,%esi
+ shrl $16,%ecx
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ shrl $16,%edx
+
+ movzbl %cl,%edi
+ shll $8,%r9d
+ shll $8,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %r9d,%r10d
+ xorl %r13d,%r11d
+
+ movzbl %dl,%r9d
+ shrl $16,%eax
+ shrl $16,%ebx
+ movzbl %al,%r13d
+ shll $8,%ebp
+ shll $8,%esi
+ movzbl (%r14,%r9,1),%r9d
+ movzbl (%r14,%r13,1),%r13d
+ xorl %ebp,%r12d
+ xorl %esi,%r8d
+
+ movzbl %bl,%ebp
+ movzbl %dh,%esi
+ shll $16,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ xorl %edi,%r10d
+
+ movzbl %ah,%edi
+ shrl $8,%ecx
+ shrl $8,%ebx
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rcx,1),%edx
+ movzbl (%r14,%rbx,1),%ecx
+ shll $16,%r9d
+ shll $16,%r13d
+ shll $16,%ebp
+ xorl %r9d,%r11d
+ xorl %r13d,%r12d
+ xorl %ebp,%r8d
+
+ shll $24,%esi
+ shll $24,%edi
+ shll $24,%edx
+ xorl %esi,%r10d
+ shll $24,%ecx
+ xorl %edi,%r11d
+ movl %r10d,%eax
+ movl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je .Lenc_compact_done
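+	# MixColumns on the fly: the masks 0x80808080/0xfefefefe/0x1b1b1b1b
+	# implement a branchless xtime (GF(2^8) multiply-by-2) on the four
+	# bytes of each state word in parallel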
+ movl %eax,%esi
+ movl %ebx,%edi
+ andl $2155905152,%esi
+ andl $2155905152,%edi
+ movl %esi,%r10d
+ movl %edi,%r11d
+ shrl $7,%r10d
+ leal (%rax,%rax,1),%r8d
+ shrl $7,%r11d
+ leal (%rbx,%rbx,1),%r9d
+ subl %r10d,%esi
+ subl %r11d,%edi
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %eax,%r10d
+ movl %ebx,%r11d
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %ecx,%esi
+ movl %edx,%edi
+ roll $24,%eax
+ roll $24,%ebx
+ andl $2155905152,%esi
+ andl $2155905152,%edi
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %esi,%r12d
+ movl %edi,%ebp
+ rorl $16,%r10d
+ rorl $16,%r11d
+ shrl $7,%r12d
+ leal (%rcx,%rcx,1),%r8d
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ shrl $7,%ebp
+ leal (%rdx,%rdx,1),%r9d
+ rorl $8,%r10d
+ rorl $8,%r11d
+ subl %r12d,%esi
+ subl %ebp,%edi
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %ecx,%r12d
+ movl %edx,%ebp
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%ecx
+ xorl %r9d,%edx
+ roll $24,%ecx
+ roll $24,%edx
+ xorl %r8d,%ecx
+ xorl %r9d,%edx
+ movl 0(%r14),%esi
+ rorl $16,%r12d
+ rorl $16,%ebp
+ movl 64(%r14),%edi
+ xorl %r12d,%ecx
+ xorl %ebp,%edx
+ movl 128(%r14),%r8d
+ rorl $8,%r12d
+ rorl $8,%ebp
+ movl 192(%r14),%r9d
+ xorl %r12d,%ecx
+ xorl %ebp,%edx
+ jmp .Lenc_loop_compact
+.align 16
+.Lenc_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
+.globl AES_encrypt
+.type AES_encrypt,@function
+.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
+AES_encrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+.Lenc_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
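+	# .LAES_Te+2048 holds four aligned copies of the 256-byte Te4 table;
+	# pick one of them based on the stack address so that table lookups
+	# and the stack frame do not contend for the same cache sets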
+ leaq .LAES_Te+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lenc_epilogue:
+ .byte 0xf3,0xc3
+.size AES_encrypt,.-AES_encrypt
+.type _x86_64_AES_decrypt,@function
+.align 16
+_x86_64_AES_decrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp .Ldec_loop
+.align 16
+.Ldec_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl %ch,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ movl 12(%r15),%edx
+ movzbl %ah,%ebp
+ xorl 1(%r14,%rsi,8),%r12d
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rbp,8),%r8d
+
+ xorl %r10d,%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r12d,%ecx
+ xorl %r11d,%ebx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz .Ldec_loop
+ leaq 2048(%r14),%r14
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl (%r14,%rsi,1),%r10d
+ movzbl (%r14,%rdi,1),%r11d
+ movzbl (%r14,%rbp,1),%r12d
+
+ movzbl %dl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movzbl (%r14,%rsi,1),%r8d
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $8,%edi
+ shll $8,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%edx
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ shrl $16,%eax
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+
+ shll $8,%esi
+ shll $8,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $16,%edi
+ shll $16,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $24,%edi
+ shll $24,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movl 16+12(%r15),%edx
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movl 16+0(%r15),%eax
+
+ shll $24,%esi
+ shll $24,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ leaq -2048(%r14),%r14
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
+.type _x86_64_AES_decrypt_compact,@function
+.align 16
+_x86_64_AES_decrypt_compact:
+ leaq 128(%r14),%r8
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp .Ldec_loop_compact
+
+.align 16
+.Ldec_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+
+ movzbl %dl,%r8d
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl (%r14,%r8,1),%r8d
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl (%r14,%rdi,1),%r13d
+
+ movzbl %bh,%ebp
+ movzbl %ch,%esi
+ shrl $16,%ecx
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ shrl $16,%edx
+
+ movzbl %cl,%edi
+ shll $8,%r9d
+ shll $8,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %r9d,%r10d
+ xorl %r13d,%r11d
+
+ movzbl %dl,%r9d
+ shrl $16,%eax
+ shrl $16,%ebx
+ movzbl %al,%r13d
+ shll $8,%ebp
+ shll $8,%esi
+ movzbl (%r14,%r9,1),%r9d
+ movzbl (%r14,%r13,1),%r13d
+ xorl %ebp,%r12d
+ xorl %esi,%r8d
+
+ movzbl %bl,%ebp
+ movzbl %bh,%esi
+ shll $16,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ xorl %edi,%r10d
+
+ movzbl %ch,%edi
+ shll $16,%r9d
+ shll $16,%r13d
+ movzbl (%r14,%rdi,1),%ebx
+ xorl %r9d,%r11d
+ xorl %r13d,%r12d
+
+ movzbl %dh,%edi
+ shrl $8,%eax
+ shll $16,%ebp
+ movzbl (%r14,%rdi,1),%ecx
+ movzbl (%r14,%rax,1),%edx
+ xorl %ebp,%r8d
+
+ shll $24,%esi
+ shll $24,%ebx
+ shll $24,%ecx
+ xorl %esi,%r10d
+ shll $24,%edx
+ xorl %r11d,%ebx
+ movl %r10d,%eax
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je .Ldec_compact_done
+
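+	# InvMixColumns on the fly: three chained 64-bit xtime steps, using
+	# the 0x80..80/0xfe..fe/0x1b..1b masks stored past the Td4 table,
+	# build the GF(2^8) multiplications by 9, 11, 13 and 14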
+ movq 256+0(%r14),%rsi
+ shlq $32,%rbx
+ shlq $32,%rdx
+ movq 256+8(%r14),%rdi
+ orq %rbx,%rax
+ orq %rdx,%rcx
+ movq 256+16(%r14),%rbp
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r8,%rbx
+ xorq %r11,%rdx
+ movq %rbx,%r8
+ movq %rdx,%r11
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r9,%rbx
+ xorq %r12,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ xorq %r13,%r12
+ shrq $32,%rbx
+ shrq $32,%rdx
+ xorq %r8,%r10
+ xorq %r11,%r13
+ roll $8,%eax
+ roll $8,%ecx
+ xorq %r9,%r10
+ xorq %r12,%r13
+
+ roll $8,%ebx
+ roll $8,%edx
+ xorl %r10d,%eax
+ xorl %r13d,%ecx
+ shrq $32,%r10
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ movq %r11,%r13
+ shrq $32,%r10
+ shrq $32,%r13
+ roll $24,%r8d
+ roll $24,%r11d
+ roll $24,%r10d
+ roll $24,%r13d
+ xorl %r8d,%eax
+ xorl %r11d,%ecx
+ movq %r9,%r8
+ movq %r12,%r11
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq 0(%r14),%rsi
+ shrq $32,%r8
+ shrq $32,%r11
+ movq 64(%r14),%rdi
+ roll $16,%r9d
+ roll $16,%r12d
+ movq 128(%r14),%rbp
+ roll $16,%r8d
+ roll $16,%r11d
+ movq 192(%r14),%r10
+ xorl %r9d,%eax
+ xorl %r12d,%ecx
+ movq 256(%r14),%r13
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ jmp .Ldec_loop_compact
+.align 16
+.Ldec_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
+.globl AES_decrypt
+.type AES_decrypt,@function
+.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
+AES_decrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+.Ldec_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
+ leaq .LAES_Td+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
+ shrq $3,%rbp
+ addq %rbp,%r14
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Ldec_epilogue:
+ .byte 0xf3,0xc3
+.size AES_decrypt,.-AES_decrypt
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,@function
+.align 16
+private_AES_set_encrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $8,%rsp
+.Lenc_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+.Lenc_key_epilogue:
+ .byte 0xf3,0xc3
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
+
+.type _x86_64_AES_set_encrypt_key,@function
+.align 16
+_x86_64_AES_set_encrypt_key:
+ movl %esi,%ecx
+ movq %rdi,%rsi
+ movq %rdx,%rdi
+
+ testq $-1,%rsi
+ jz .Lbadpointer
+ testq $-1,%rdi
+ jz .Lbadpointer
+
+ leaq .LAES_Te(%rip),%rbp
+ leaq 2048+128(%rbp),%rbp
+
+
+ movl 0-128(%rbp),%eax
+ movl 32-128(%rbp),%ebx
+ movl 64-128(%rbp),%r8d
+ movl 96-128(%rbp),%edx
+ movl 128-128(%rbp),%eax
+ movl 160-128(%rbp),%ebx
+ movl 192-128(%rbp),%r8d
+ movl 224-128(%rbp),%edx
+
+ cmpl $128,%ecx
+ je .L10rounds
+ cmpl $192,%ecx
+ je .L12rounds
+ cmpl $256,%ecx
+ je .L14rounds
+ movq $-2,%rax
+ jmp .Lexit
+
+.L10rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rdx,8(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L10shortcut
+.align 4
+.L10loop:
+ movl 0(%rdi),%eax
+ movl 12(%rdi),%edx
+.L10shortcut:
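+	# one key-expansion step: SubWord(RotWord(prev)) via four S-box
+	# lookups at -128(%rbp), then xor with rcon[i] fetched from
+	# 1024-128(%rbp,%rcx,4)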
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,16(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,20(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,24(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,28(%rdi)
+ addl $1,%ecx
+ leaq 16(%rdi),%rdi
+ cmpl $10,%ecx
+ jl .L10loop
+
+ movl $10,80(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.L12rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rdx,16(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L12shortcut
+.align 4
+.L12loop:
+ movl 0(%rdi),%eax
+ movl 20(%rdi),%edx
+.L12shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,24(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,28(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,32(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,36(%rdi)
+
+ cmpl $7,%ecx
+ je .L12break
+ addl $1,%ecx
+
+ xorl 16(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ leaq 24(%rdi),%rdi
+ jmp .L12loop
+.L12break:
+ movl $12,72(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.L14rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rcx
+ movq 24(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp .L14shortcut
+.align 4
+.L14loop:
+ movl 0(%rdi),%eax
+ movl 28(%rdi),%edx
+.L14shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,32(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,36(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ cmpl $6,%ecx
+ je .L14break
+ addl $1,%ecx
+
+ movl %eax,%edx
+ movl 16(%rdi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movl %eax,48(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,52(%rdi)
+ xorl 24(%rdi),%eax
+ movl %eax,56(%rdi)
+ xorl 28(%rdi),%eax
+ movl %eax,60(%rdi)
+
+ leaq 32(%rdi),%rdi
+ jmp .L14loop
+.L14break:
+ movl $14,48(%rdi)
+ xorq %rax,%rax
+ jmp .Lexit
+
+.Lbadpointer:
+ movq $-1,%rax
+.Lexit:
+.byte 0xf3,0xc3
+.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,@function
+.align 16
+private_AES_set_decrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdx
+.Ldec_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+ movq (%rsp),%r8
+ cmpl $0,%eax
+ jne .Labort
+
+ movl 240(%r8),%r14d
+ xorq %rdi,%rdi
+ leaq (%rdi,%r14,4),%rcx
+ movq %r8,%rsi
+ leaq (%r8,%rcx,4),%rdi
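+	# build the decryption schedule from the encryption one: first
+	# reverse the round-key array in place (swapping from both ends);
+	# .Lpermute below then applies InvMixColumns to every inner round key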
+.align 4
+.Linvert:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 0(%rdi),%rcx
+ movq 8(%rdi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,0(%rsi)
+ movq %rdx,8(%rsi)
+ leaq 16(%rsi),%rsi
+ leaq -16(%rdi),%rdi
+ cmpq %rsi,%rdi
+ jne .Linvert
+
+ leaq .LAES_Te+2048+1024(%rip),%rax
+
+ movq 40(%rax),%rsi
+ movq 48(%rax),%rdi
+ movq 56(%rax),%rbp
+
+ movq %r8,%r15
+ subl $1,%r14d
+.align 4
+.Lpermute:
+ leaq 16(%r15),%r15
+ movq 0(%r15),%rax
+ movq 8(%r15),%rcx
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r8,%rbx
+ xorq %r11,%rdx
+ movq %rbx,%r8
+ movq %rdx,%r11
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %r9,%rbx
+ xorq %r12,%rdx
+ movq %rbx,%r9
+ movq %rdx,%r12
+
+ andq %rsi,%rbx
+ andq %rsi,%rdx
+ movq %rbx,%r10
+ movq %rdx,%r13
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ xorq %r13,%r12
+ shrq $32,%rbx
+ shrq $32,%rdx
+ xorq %r8,%r10
+ xorq %r11,%r13
+ roll $8,%eax
+ roll $8,%ecx
+ xorq %r9,%r10
+ xorq %r12,%r13
+
+ roll $8,%ebx
+ roll $8,%edx
+ xorl %r10d,%eax
+ xorl %r13d,%ecx
+ shrq $32,%r10
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ movq %r11,%r13
+ shrq $32,%r10
+ shrq $32,%r13
+ roll $24,%r8d
+ roll $24,%r11d
+ roll $24,%r10d
+ roll $24,%r13d
+ xorl %r8d,%eax
+ xorl %r11d,%ecx
+ movq %r9,%r8
+ movq %r12,%r11
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+
+ shrq $32,%r8
+ shrq $32,%r11
+
+ roll $16,%r9d
+ roll $16,%r12d
+
+ roll $16,%r8d
+ roll $16,%r11d
+
+ xorl %r9d,%eax
+ xorl %r12d,%ecx
+
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ subl $1,%r14d
+ jnz .Lpermute
+
+ xorq %rax,%rax
+.Labort:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+.Ldec_key_epilogue:
+ .byte 0xf3,0xc3
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
+.globl AES_cbc_encrypt
+.type AES_cbc_encrypt,@function
+.align 16
+
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
+AES_cbc_encrypt:
+ cmpq $0,%rdx
+ je .Lcbc_epilogue
+ pushfq
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+.Lcbc_prologue:
+
+ cld
+ movl %r9d,%r9d
+
+ leaq .LAES_Te(%rip),%r14
+ cmpq $0,%r9
+ jne .Lcbc_picked_te
+ leaq .LAES_Td(%rip),%r14
+.Lcbc_picked_te:
+
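+	# take the fast Te-table path only for >=512-byte, 16-byte-multiple
+	# inputs on CPUs without hyper-threading (ia32cap bit 28), where a
+	# cache-sharing sibling thread could observe table accesses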
+ movl OPENSSL_ia32cap_P(%rip),%r10d
+ cmpq $512,%rdx
+ jb .Lcbc_slow_prologue
+ testq $15,%rdx
+ jnz .Lcbc_slow_prologue
+ btl $28,%r10d
+ jc .Lcbc_slow_prologue
+
+
+ leaq -88-248(%rsp),%r15
+ andq $-64,%r15
+
+
+ movq %r14,%r10
+ leaq 2304(%r14),%r11
+ movq %r15,%r12
+ andq $4095,%r10
+ andq $4095,%r11
+ andq $4095,%r12
+
+ cmpq %r11,%r12
+ jb .Lcbc_te_break_out
+ subq %r11,%r12
+ subq %r12,%r15
+ jmp .Lcbc_te_ok
+.Lcbc_te_break_out:
+ subq %r10,%r12
+ andq $4095,%r12
+ addq $320,%r12
+ subq %r12,%r15
+.align 4
+.Lcbc_te_ok:
+
+ xchgq %rsp,%r15
+
+ movq %r15,16(%rsp)
+.Lcbc_fast_body:
+ movq %rdi,24(%rsp)
+ movq %rsi,32(%rsp)
+ movq %rdx,40(%rsp)
+ movq %rcx,48(%rsp)
+ movq %r8,56(%rsp)
+ movl $0,80+240(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+
+ movl 240(%r15),%eax
+
+ movq %r15,%r10
+ subq %r14,%r10
+ andq $4095,%r10
+ cmpq $2304,%r10
+ jb .Lcbc_do_ecopy
+ cmpq $4096-248,%r10
+ jb .Lcbc_skip_ecopy
+.align 4
+.Lcbc_do_ecopy:
+ movq %r15,%rsi
+ leaq 80(%rsp),%rdi
+ leaq 80(%rsp),%r15
+ movl $30,%ecx
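+	# 0x90A548F3 = "rep movsq; nop": copy the 240-byte key schedule
+	# (30 quadwords) into the on-stack buffer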
+.long 0x90A548F3
+ movl %eax,(%rdi)
+.Lcbc_skip_ecopy:
+ movq %r15,0(%rsp)
+
+ movl $18,%ecx
+.align 4
+.Lcbc_prefetch_te:
+ movq 0(%r14),%r10
+ movq 32(%r14),%r11
+ movq 64(%r14),%r12
+ movq 96(%r14),%r13
+ leaq 128(%r14),%r14
+ subl $1,%ecx
+ jnz .Lcbc_prefetch_te
+ leaq -2304(%r14),%r14
+
+ cmpq $0,%rbx
+ je .LFAST_DECRYPT
+
+
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+.align 4
+.Lcbc_fast_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_encrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ movq %r10,40(%rsp)
+ jnz .Lcbc_fast_enc_loop
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp .Lcbc_fast_cleanup
+
+
+.align 16
+.LFAST_DECRYPT:
+ cmpq %r8,%r9
+ je .Lcbc_fast_dec_in_place
+
+ movq %rbp,64(%rsp)
+.align 4
+.Lcbc_fast_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 64(%rsp),%rbp
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0(%rbp),%eax
+ xorl 4(%rbp),%ebx
+ xorl 8(%rbp),%ecx
+ xorl 12(%rbp),%edx
+ movq %r8,%rbp
+
+ subq $16,%r10
+ movq %r10,40(%rsp)
+ movq %rbp,64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jnz .Lcbc_fast_dec_loop
+ movq 56(%rsp),%r12
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0(%r12)
+ movq %r11,8(%r12)
+ jmp .Lcbc_fast_cleanup
+
+.align 16
+.Lcbc_fast_dec_in_place:
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0+64(%rsp)
+ movq %r11,8+64(%rsp)
+.align 4
+.Lcbc_fast_dec_in_place_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jz .Lcbc_fast_dec_in_place_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ movq %r10,40(%rsp)
+ jmp .Lcbc_fast_dec_in_place_loop
+.Lcbc_fast_dec_in_place_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+.align 4
+.Lcbc_fast_cleanup:
+ cmpl $0,80+240(%rsp)
+ leaq 80(%rsp),%rdi
+ je .Lcbc_exit
+ movl $30,%ecx
+ xorq %rax,%rax
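+	# 0x90AB48F3 = "rep stosq; nop": zero the 240-byte on-stack key copy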
+.long 0x90AB48F3
+
+ jmp .Lcbc_exit
+
+
+.align 16
+.Lcbc_slow_prologue:
+
+ leaq -88(%rsp),%rbp
+ andq $-64,%rbp
+
+ leaq -88-63(%rcx),%r10
+ subq %rbp,%r10
+ negq %r10
+ andq $960,%r10
+ subq %r10,%rbp
+
+ xchgq %rsp,%rbp
+
+ movq %rbp,16(%rsp)
+.Lcbc_slow_body:
+
+
+
+
+ movq %r8,56(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+ movq %rdx,%r10
+
+ movl 240(%r15),%eax
+ movq %r15,0(%rsp)
+ shll $4,%eax
+ leaq (%r15,%rax,1),%rax
+ movq %rax,8(%rsp)
+
+
+ leaq 2048(%r14),%r14
+ leaq 768-8(%rsp),%rax
+ subq %r14,%rax
+ andq $768,%rax
+ leaq (%r14,%rax,1),%r14
+
+ cmpq $0,%rbx
+ je .LSLOW_DECRYPT
+
+
+ testq $-16,%r10
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+ jz .Lcbc_slow_enc_tail
+
+.align 4
+.Lcbc_slow_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ jnz .Lcbc_slow_enc_loop
+ testq $15,%r10
+ jnz .Lcbc_slow_enc_tail
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_enc_tail:
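+	# final partial block: "rep movsb" (0x9066A4F3) copies the tail into
+	# the output buffer, "rep stosb" (0x9066AAF3) zero-pads it to 16
+	# bytes, and the padded block is then encrypted in place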
+ movq %rax,%r11
+ movq %rcx,%r12
+ movq %r10,%rcx
+ movq %r8,%rsi
+ movq %r9,%rdi
+.long 0x9066A4F3
+ movq $16,%rcx
+ subq %r10,%rcx
+ xorq %rax,%rax
+.long 0x9066AAF3
+ movq %r9,%r8
+ movq $16,%r10
+ movq %r11,%rax
+ movq %r12,%rcx
+ jmp .Lcbc_slow_enc_loop
+
+.align 16
+.LSLOW_DECRYPT:
+ shrq $3,%rax
+ addq %rax,%r14
+
+ movq 0(%rbp),%r11
+ movq 8(%rbp),%r12
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+.align 4
+.Lcbc_slow_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jc .Lcbc_slow_dec_partial
+ jz .Lcbc_slow_dec_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jmp .Lcbc_slow_dec_loop
+.Lcbc_slow_dec_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ jmp .Lcbc_exit
+
+.align 4
+.Lcbc_slow_dec_partial:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0+64(%rsp)
+ movl %ebx,4+64(%rsp)
+ movl %ecx,8+64(%rsp)
+ movl %edx,12+64(%rsp)
+
+ movq %r9,%rdi
+ leaq 64(%rsp),%rsi
+ leaq 16(%r10),%rcx
+.long 0x9066A4F3
+ jmp .Lcbc_exit
+
+.align 16
+.Lcbc_exit:
+ movq 16(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lcbc_popfq:
+ popfq
+.Lcbc_epilogue:
+ .byte 0xf3,0xc3
+.size AES_cbc_encrypt,.-AES_cbc_encrypt
+.align 64
+.LAES_Te:
+.long 0xa56363c6,0xa56363c6
+.long 0x847c7cf8,0x847c7cf8
+.long 0x997777ee,0x997777ee
+.long 0x8d7b7bf6,0x8d7b7bf6
+.long 0x0df2f2ff,0x0df2f2ff
+.long 0xbd6b6bd6,0xbd6b6bd6
+.long 0xb16f6fde,0xb16f6fde
+.long 0x54c5c591,0x54c5c591
+.long 0x50303060,0x50303060
+.long 0x03010102,0x03010102
+.long 0xa96767ce,0xa96767ce
+.long 0x7d2b2b56,0x7d2b2b56
+.long 0x19fefee7,0x19fefee7
+.long 0x62d7d7b5,0x62d7d7b5
+.long 0xe6abab4d,0xe6abab4d
+.long 0x9a7676ec,0x9a7676ec
+.long 0x45caca8f,0x45caca8f
+.long 0x9d82821f,0x9d82821f
+.long 0x40c9c989,0x40c9c989
+.long 0x877d7dfa,0x877d7dfa
+.long 0x15fafaef,0x15fafaef
+.long 0xeb5959b2,0xeb5959b2
+.long 0xc947478e,0xc947478e
+.long 0x0bf0f0fb,0x0bf0f0fb
+.long 0xecadad41,0xecadad41
+.long 0x67d4d4b3,0x67d4d4b3
+.long 0xfda2a25f,0xfda2a25f
+.long 0xeaafaf45,0xeaafaf45
+.long 0xbf9c9c23,0xbf9c9c23
+.long 0xf7a4a453,0xf7a4a453
+.long 0x967272e4,0x967272e4
+.long 0x5bc0c09b,0x5bc0c09b
+.long 0xc2b7b775,0xc2b7b775
+.long 0x1cfdfde1,0x1cfdfde1
+.long 0xae93933d,0xae93933d
+.long 0x6a26264c,0x6a26264c
+.long 0x5a36366c,0x5a36366c
+.long 0x413f3f7e,0x413f3f7e
+.long 0x02f7f7f5,0x02f7f7f5
+.long 0x4fcccc83,0x4fcccc83
+.long 0x5c343468,0x5c343468
+.long 0xf4a5a551,0xf4a5a551
+.long 0x34e5e5d1,0x34e5e5d1
+.long 0x08f1f1f9,0x08f1f1f9
+.long 0x937171e2,0x937171e2
+.long 0x73d8d8ab,0x73d8d8ab
+.long 0x53313162,0x53313162
+.long 0x3f15152a,0x3f15152a
+.long 0x0c040408,0x0c040408
+.long 0x52c7c795,0x52c7c795
+.long 0x65232346,0x65232346
+.long 0x5ec3c39d,0x5ec3c39d
+.long 0x28181830,0x28181830
+.long 0xa1969637,0xa1969637
+.long 0x0f05050a,0x0f05050a
+.long 0xb59a9a2f,0xb59a9a2f
+.long 0x0907070e,0x0907070e
+.long 0x36121224,0x36121224
+.long 0x9b80801b,0x9b80801b
+.long 0x3de2e2df,0x3de2e2df
+.long 0x26ebebcd,0x26ebebcd
+.long 0x6927274e,0x6927274e
+.long 0xcdb2b27f,0xcdb2b27f
+.long 0x9f7575ea,0x9f7575ea
+.long 0x1b090912,0x1b090912
+.long 0x9e83831d,0x9e83831d
+.long 0x742c2c58,0x742c2c58
+.long 0x2e1a1a34,0x2e1a1a34
+.long 0x2d1b1b36,0x2d1b1b36
+.long 0xb26e6edc,0xb26e6edc
+.long 0xee5a5ab4,0xee5a5ab4
+.long 0xfba0a05b,0xfba0a05b
+.long 0xf65252a4,0xf65252a4
+.long 0x4d3b3b76,0x4d3b3b76
+.long 0x61d6d6b7,0x61d6d6b7
+.long 0xceb3b37d,0xceb3b37d
+.long 0x7b292952,0x7b292952
+.long 0x3ee3e3dd,0x3ee3e3dd
+.long 0x712f2f5e,0x712f2f5e
+.long 0x97848413,0x97848413
+.long 0xf55353a6,0xf55353a6
+.long 0x68d1d1b9,0x68d1d1b9
+.long 0x00000000,0x00000000
+.long 0x2cededc1,0x2cededc1
+.long 0x60202040,0x60202040
+.long 0x1ffcfce3,0x1ffcfce3
+.long 0xc8b1b179,0xc8b1b179
+.long 0xed5b5bb6,0xed5b5bb6
+.long 0xbe6a6ad4,0xbe6a6ad4
+.long 0x46cbcb8d,0x46cbcb8d
+.long 0xd9bebe67,0xd9bebe67
+.long 0x4b393972,0x4b393972
+.long 0xde4a4a94,0xde4a4a94
+.long 0xd44c4c98,0xd44c4c98
+.long 0xe85858b0,0xe85858b0
+.long 0x4acfcf85,0x4acfcf85
+.long 0x6bd0d0bb,0x6bd0d0bb
+.long 0x2aefefc5,0x2aefefc5
+.long 0xe5aaaa4f,0xe5aaaa4f
+.long 0x16fbfbed,0x16fbfbed
+.long 0xc5434386,0xc5434386
+.long 0xd74d4d9a,0xd74d4d9a
+.long 0x55333366,0x55333366
+.long 0x94858511,0x94858511
+.long 0xcf45458a,0xcf45458a
+.long 0x10f9f9e9,0x10f9f9e9
+.long 0x06020204,0x06020204
+.long 0x817f7ffe,0x817f7ffe
+.long 0xf05050a0,0xf05050a0
+.long 0x443c3c78,0x443c3c78
+.long 0xba9f9f25,0xba9f9f25
+.long 0xe3a8a84b,0xe3a8a84b
+.long 0xf35151a2,0xf35151a2
+.long 0xfea3a35d,0xfea3a35d
+.long 0xc0404080,0xc0404080
+.long 0x8a8f8f05,0x8a8f8f05
+.long 0xad92923f,0xad92923f
+.long 0xbc9d9d21,0xbc9d9d21
+.long 0x48383870,0x48383870
+.long 0x04f5f5f1,0x04f5f5f1
+.long 0xdfbcbc63,0xdfbcbc63
+.long 0xc1b6b677,0xc1b6b677
+.long 0x75dadaaf,0x75dadaaf
+.long 0x63212142,0x63212142
+.long 0x30101020,0x30101020
+.long 0x1affffe5,0x1affffe5
+.long 0x0ef3f3fd,0x0ef3f3fd
+.long 0x6dd2d2bf,0x6dd2d2bf
+.long 0x4ccdcd81,0x4ccdcd81
+.long 0x140c0c18,0x140c0c18
+.long 0x35131326,0x35131326
+.long 0x2fececc3,0x2fececc3
+.long 0xe15f5fbe,0xe15f5fbe
+.long 0xa2979735,0xa2979735
+.long 0xcc444488,0xcc444488
+.long 0x3917172e,0x3917172e
+.long 0x57c4c493,0x57c4c493
+.long 0xf2a7a755,0xf2a7a755
+.long 0x827e7efc,0x827e7efc
+.long 0x473d3d7a,0x473d3d7a
+.long 0xac6464c8,0xac6464c8
+.long 0xe75d5dba,0xe75d5dba
+.long 0x2b191932,0x2b191932
+.long 0x957373e6,0x957373e6
+.long 0xa06060c0,0xa06060c0
+.long 0x98818119,0x98818119
+.long 0xd14f4f9e,0xd14f4f9e
+.long 0x7fdcdca3,0x7fdcdca3
+.long 0x66222244,0x66222244
+.long 0x7e2a2a54,0x7e2a2a54
+.long 0xab90903b,0xab90903b
+.long 0x8388880b,0x8388880b
+.long 0xca46468c,0xca46468c
+.long 0x29eeeec7,0x29eeeec7
+.long 0xd3b8b86b,0xd3b8b86b
+.long 0x3c141428,0x3c141428
+.long 0x79dedea7,0x79dedea7
+.long 0xe25e5ebc,0xe25e5ebc
+.long 0x1d0b0b16,0x1d0b0b16
+.long 0x76dbdbad,0x76dbdbad
+.long 0x3be0e0db,0x3be0e0db
+.long 0x56323264,0x56323264
+.long 0x4e3a3a74,0x4e3a3a74
+.long 0x1e0a0a14,0x1e0a0a14
+.long 0xdb494992,0xdb494992
+.long 0x0a06060c,0x0a06060c
+.long 0x6c242448,0x6c242448
+.long 0xe45c5cb8,0xe45c5cb8
+.long 0x5dc2c29f,0x5dc2c29f
+.long 0x6ed3d3bd,0x6ed3d3bd
+.long 0xefacac43,0xefacac43
+.long 0xa66262c4,0xa66262c4
+.long 0xa8919139,0xa8919139
+.long 0xa4959531,0xa4959531
+.long 0x37e4e4d3,0x37e4e4d3
+.long 0x8b7979f2,0x8b7979f2
+.long 0x32e7e7d5,0x32e7e7d5
+.long 0x43c8c88b,0x43c8c88b
+.long 0x5937376e,0x5937376e
+.long 0xb76d6dda,0xb76d6dda
+.long 0x8c8d8d01,0x8c8d8d01
+.long 0x64d5d5b1,0x64d5d5b1
+.long 0xd24e4e9c,0xd24e4e9c
+.long 0xe0a9a949,0xe0a9a949
+.long 0xb46c6cd8,0xb46c6cd8
+.long 0xfa5656ac,0xfa5656ac
+.long 0x07f4f4f3,0x07f4f4f3
+.long 0x25eaeacf,0x25eaeacf
+.long 0xaf6565ca,0xaf6565ca
+.long 0x8e7a7af4,0x8e7a7af4
+.long 0xe9aeae47,0xe9aeae47
+.long 0x18080810,0x18080810
+.long 0xd5baba6f,0xd5baba6f
+.long 0x887878f0,0x887878f0
+.long 0x6f25254a,0x6f25254a
+.long 0x722e2e5c,0x722e2e5c
+.long 0x241c1c38,0x241c1c38
+.long 0xf1a6a657,0xf1a6a657
+.long 0xc7b4b473,0xc7b4b473
+.long 0x51c6c697,0x51c6c697
+.long 0x23e8e8cb,0x23e8e8cb
+.long 0x7cdddda1,0x7cdddda1
+.long 0x9c7474e8,0x9c7474e8
+.long 0x211f1f3e,0x211f1f3e
+.long 0xdd4b4b96,0xdd4b4b96
+.long 0xdcbdbd61,0xdcbdbd61
+.long 0x868b8b0d,0x868b8b0d
+.long 0x858a8a0f,0x858a8a0f
+.long 0x907070e0,0x907070e0
+.long 0x423e3e7c,0x423e3e7c
+.long 0xc4b5b571,0xc4b5b571
+.long 0xaa6666cc,0xaa6666cc
+.long 0xd8484890,0xd8484890
+.long 0x05030306,0x05030306
+.long 0x01f6f6f7,0x01f6f6f7
+.long 0x120e0e1c,0x120e0e1c
+.long 0xa36161c2,0xa36161c2
+.long 0x5f35356a,0x5f35356a
+.long 0xf95757ae,0xf95757ae
+.long 0xd0b9b969,0xd0b9b969
+.long 0x91868617,0x91868617
+.long 0x58c1c199,0x58c1c199
+.long 0x271d1d3a,0x271d1d3a
+.long 0xb99e9e27,0xb99e9e27
+.long 0x38e1e1d9,0x38e1e1d9
+.long 0x13f8f8eb,0x13f8f8eb
+.long 0xb398982b,0xb398982b
+.long 0x33111122,0x33111122
+.long 0xbb6969d2,0xbb6969d2
+.long 0x70d9d9a9,0x70d9d9a9
+.long 0x898e8e07,0x898e8e07
+.long 0xa7949433,0xa7949433
+.long 0xb69b9b2d,0xb69b9b2d
+.long 0x221e1e3c,0x221e1e3c
+.long 0x92878715,0x92878715
+.long 0x20e9e9c9,0x20e9e9c9
+.long 0x49cece87,0x49cece87
+.long 0xff5555aa,0xff5555aa
+.long 0x78282850,0x78282850
+.long 0x7adfdfa5,0x7adfdfa5
+.long 0x8f8c8c03,0x8f8c8c03
+.long 0xf8a1a159,0xf8a1a159
+.long 0x80898909,0x80898909
+.long 0x170d0d1a,0x170d0d1a
+.long 0xdabfbf65,0xdabfbf65
+.long 0x31e6e6d7,0x31e6e6d7
+.long 0xc6424284,0xc6424284
+.long 0xb86868d0,0xb86868d0
+.long 0xc3414182,0xc3414182
+.long 0xb0999929,0xb0999929
+.long 0x772d2d5a,0x772d2d5a
+.long 0x110f0f1e,0x110f0f1e
+.long 0xcbb0b07b,0xcbb0b07b
+.long 0xfc5454a8,0xfc5454a8
+.long 0xd6bbbb6d,0xd6bbbb6d
+.long 0x3a16162c,0x3a16162c
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.long 0x00000001, 0x00000002, 0x00000004, 0x00000008
+.long 0x00000010, 0x00000020, 0x00000040, 0x00000080
+.long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
+.long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
+.align 64
+.LAES_Td:
+.long 0x50a7f451,0x50a7f451
+.long 0x5365417e,0x5365417e
+.long 0xc3a4171a,0xc3a4171a
+.long 0x965e273a,0x965e273a
+.long 0xcb6bab3b,0xcb6bab3b
+.long 0xf1459d1f,0xf1459d1f
+.long 0xab58faac,0xab58faac
+.long 0x9303e34b,0x9303e34b
+.long 0x55fa3020,0x55fa3020
+.long 0xf66d76ad,0xf66d76ad
+.long 0x9176cc88,0x9176cc88
+.long 0x254c02f5,0x254c02f5
+.long 0xfcd7e54f,0xfcd7e54f
+.long 0xd7cb2ac5,0xd7cb2ac5
+.long 0x80443526,0x80443526
+.long 0x8fa362b5,0x8fa362b5
+.long 0x495ab1de,0x495ab1de
+.long 0x671bba25,0x671bba25
+.long 0x980eea45,0x980eea45
+.long 0xe1c0fe5d,0xe1c0fe5d
+.long 0x02752fc3,0x02752fc3
+.long 0x12f04c81,0x12f04c81
+.long 0xa397468d,0xa397468d
+.long 0xc6f9d36b,0xc6f9d36b
+.long 0xe75f8f03,0xe75f8f03
+.long 0x959c9215,0x959c9215
+.long 0xeb7a6dbf,0xeb7a6dbf
+.long 0xda595295,0xda595295
+.long 0x2d83bed4,0x2d83bed4
+.long 0xd3217458,0xd3217458
+.long 0x2969e049,0x2969e049
+.long 0x44c8c98e,0x44c8c98e
+.long 0x6a89c275,0x6a89c275
+.long 0x78798ef4,0x78798ef4
+.long 0x6b3e5899,0x6b3e5899
+.long 0xdd71b927,0xdd71b927
+.long 0xb64fe1be,0xb64fe1be
+.long 0x17ad88f0,0x17ad88f0
+.long 0x66ac20c9,0x66ac20c9
+.long 0xb43ace7d,0xb43ace7d
+.long 0x184adf63,0x184adf63
+.long 0x82311ae5,0x82311ae5
+.long 0x60335197,0x60335197
+.long 0x457f5362,0x457f5362
+.long 0xe07764b1,0xe07764b1
+.long 0x84ae6bbb,0x84ae6bbb
+.long 0x1ca081fe,0x1ca081fe
+.long 0x942b08f9,0x942b08f9
+.long 0x58684870,0x58684870
+.long 0x19fd458f,0x19fd458f
+.long 0x876cde94,0x876cde94
+.long 0xb7f87b52,0xb7f87b52
+.long 0x23d373ab,0x23d373ab
+.long 0xe2024b72,0xe2024b72
+.long 0x578f1fe3,0x578f1fe3
+.long 0x2aab5566,0x2aab5566
+.long 0x0728ebb2,0x0728ebb2
+.long 0x03c2b52f,0x03c2b52f
+.long 0x9a7bc586,0x9a7bc586
+.long 0xa50837d3,0xa50837d3
+.long 0xf2872830,0xf2872830
+.long 0xb2a5bf23,0xb2a5bf23
+.long 0xba6a0302,0xba6a0302
+.long 0x5c8216ed,0x5c8216ed
+.long 0x2b1ccf8a,0x2b1ccf8a
+.long 0x92b479a7,0x92b479a7
+.long 0xf0f207f3,0xf0f207f3
+.long 0xa1e2694e,0xa1e2694e
+.long 0xcdf4da65,0xcdf4da65
+.long 0xd5be0506,0xd5be0506
+.long 0x1f6234d1,0x1f6234d1
+.long 0x8afea6c4,0x8afea6c4
+.long 0x9d532e34,0x9d532e34
+.long 0xa055f3a2,0xa055f3a2
+.long 0x32e18a05,0x32e18a05
+.long 0x75ebf6a4,0x75ebf6a4
+.long 0x39ec830b,0x39ec830b
+.long 0xaaef6040,0xaaef6040
+.long 0x069f715e,0x069f715e
+.long 0x51106ebd,0x51106ebd
+.long 0xf98a213e,0xf98a213e
+.long 0x3d06dd96,0x3d06dd96
+.long 0xae053edd,0xae053edd
+.long 0x46bde64d,0x46bde64d
+.long 0xb58d5491,0xb58d5491
+.long 0x055dc471,0x055dc471
+.long 0x6fd40604,0x6fd40604
+.long 0xff155060,0xff155060
+.long 0x24fb9819,0x24fb9819
+.long 0x97e9bdd6,0x97e9bdd6
+.long 0xcc434089,0xcc434089
+.long 0x779ed967,0x779ed967
+.long 0xbd42e8b0,0xbd42e8b0
+.long 0x888b8907,0x888b8907
+.long 0x385b19e7,0x385b19e7
+.long 0xdbeec879,0xdbeec879
+.long 0x470a7ca1,0x470a7ca1
+.long 0xe90f427c,0xe90f427c
+.long 0xc91e84f8,0xc91e84f8
+.long 0x00000000,0x00000000
+.long 0x83868009,0x83868009
+.long 0x48ed2b32,0x48ed2b32
+.long 0xac70111e,0xac70111e
+.long 0x4e725a6c,0x4e725a6c
+.long 0xfbff0efd,0xfbff0efd
+.long 0x5638850f,0x5638850f
+.long 0x1ed5ae3d,0x1ed5ae3d
+.long 0x27392d36,0x27392d36
+.long 0x64d90f0a,0x64d90f0a
+.long 0x21a65c68,0x21a65c68
+.long 0xd1545b9b,0xd1545b9b
+.long 0x3a2e3624,0x3a2e3624
+.long 0xb1670a0c,0xb1670a0c
+.long 0x0fe75793,0x0fe75793
+.long 0xd296eeb4,0xd296eeb4
+.long 0x9e919b1b,0x9e919b1b
+.long 0x4fc5c080,0x4fc5c080
+.long 0xa220dc61,0xa220dc61
+.long 0x694b775a,0x694b775a
+.long 0x161a121c,0x161a121c
+.long 0x0aba93e2,0x0aba93e2
+.long 0xe52aa0c0,0xe52aa0c0
+.long 0x43e0223c,0x43e0223c
+.long 0x1d171b12,0x1d171b12
+.long 0x0b0d090e,0x0b0d090e
+.long 0xadc78bf2,0xadc78bf2
+.long 0xb9a8b62d,0xb9a8b62d
+.long 0xc8a91e14,0xc8a91e14
+.long 0x8519f157,0x8519f157
+.long 0x4c0775af,0x4c0775af
+.long 0xbbdd99ee,0xbbdd99ee
+.long 0xfd607fa3,0xfd607fa3
+.long 0x9f2601f7,0x9f2601f7
+.long 0xbcf5725c,0xbcf5725c
+.long 0xc53b6644,0xc53b6644
+.long 0x347efb5b,0x347efb5b
+.long 0x7629438b,0x7629438b
+.long 0xdcc623cb,0xdcc623cb
+.long 0x68fcedb6,0x68fcedb6
+.long 0x63f1e4b8,0x63f1e4b8
+.long 0xcadc31d7,0xcadc31d7
+.long 0x10856342,0x10856342
+.long 0x40229713,0x40229713
+.long 0x2011c684,0x2011c684
+.long 0x7d244a85,0x7d244a85
+.long 0xf83dbbd2,0xf83dbbd2
+.long 0x1132f9ae,0x1132f9ae
+.long 0x6da129c7,0x6da129c7
+.long 0x4b2f9e1d,0x4b2f9e1d
+.long 0xf330b2dc,0xf330b2dc
+.long 0xec52860d,0xec52860d
+.long 0xd0e3c177,0xd0e3c177
+.long 0x6c16b32b,0x6c16b32b
+.long 0x99b970a9,0x99b970a9
+.long 0xfa489411,0xfa489411
+.long 0x2264e947,0x2264e947
+.long 0xc48cfca8,0xc48cfca8
+.long 0x1a3ff0a0,0x1a3ff0a0
+.long 0xd82c7d56,0xd82c7d56
+.long 0xef903322,0xef903322
+.long 0xc74e4987,0xc74e4987
+.long 0xc1d138d9,0xc1d138d9
+.long 0xfea2ca8c,0xfea2ca8c
+.long 0x360bd498,0x360bd498
+.long 0xcf81f5a6,0xcf81f5a6
+.long 0x28de7aa5,0x28de7aa5
+.long 0x268eb7da,0x268eb7da
+.long 0xa4bfad3f,0xa4bfad3f
+.long 0xe49d3a2c,0xe49d3a2c
+.long 0x0d927850,0x0d927850
+.long 0x9bcc5f6a,0x9bcc5f6a
+.long 0x62467e54,0x62467e54
+.long 0xc2138df6,0xc2138df6
+.long 0xe8b8d890,0xe8b8d890
+.long 0x5ef7392e,0x5ef7392e
+.long 0xf5afc382,0xf5afc382
+.long 0xbe805d9f,0xbe805d9f
+.long 0x7c93d069,0x7c93d069
+.long 0xa92dd56f,0xa92dd56f
+.long 0xb31225cf,0xb31225cf
+.long 0x3b99acc8,0x3b99acc8
+.long 0xa77d1810,0xa77d1810
+.long 0x6e639ce8,0x6e639ce8
+.long 0x7bbb3bdb,0x7bbb3bdb
+.long 0x097826cd,0x097826cd
+.long 0xf418596e,0xf418596e
+.long 0x01b79aec,0x01b79aec
+.long 0xa89a4f83,0xa89a4f83
+.long 0x656e95e6,0x656e95e6
+.long 0x7ee6ffaa,0x7ee6ffaa
+.long 0x08cfbc21,0x08cfbc21
+.long 0xe6e815ef,0xe6e815ef
+.long 0xd99be7ba,0xd99be7ba
+.long 0xce366f4a,0xce366f4a
+.long 0xd4099fea,0xd4099fea
+.long 0xd67cb029,0xd67cb029
+.long 0xafb2a431,0xafb2a431
+.long 0x31233f2a,0x31233f2a
+.long 0x3094a5c6,0x3094a5c6
+.long 0xc066a235,0xc066a235
+.long 0x37bc4e74,0x37bc4e74
+.long 0xa6ca82fc,0xa6ca82fc
+.long 0xb0d090e0,0xb0d090e0
+.long 0x15d8a733,0x15d8a733
+.long 0x4a9804f1,0x4a9804f1
+.long 0xf7daec41,0xf7daec41
+.long 0x0e50cd7f,0x0e50cd7f
+.long 0x2ff69117,0x2ff69117
+.long 0x8dd64d76,0x8dd64d76
+.long 0x4db0ef43,0x4db0ef43
+.long 0x544daacc,0x544daacc
+.long 0xdf0496e4,0xdf0496e4
+.long 0xe3b5d19e,0xe3b5d19e
+.long 0x1b886a4c,0x1b886a4c
+.long 0xb81f2cc1,0xb81f2cc1
+.long 0x7f516546,0x7f516546
+.long 0x04ea5e9d,0x04ea5e9d
+.long 0x5d358c01,0x5d358c01
+.long 0x737487fa,0x737487fa
+.long 0x2e410bfb,0x2e410bfb
+.long 0x5a1d67b3,0x5a1d67b3
+.long 0x52d2db92,0x52d2db92
+.long 0x335610e9,0x335610e9
+.long 0x1347d66d,0x1347d66d
+.long 0x8c61d79a,0x8c61d79a
+.long 0x7a0ca137,0x7a0ca137
+.long 0x8e14f859,0x8e14f859
+.long 0x893c13eb,0x893c13eb
+.long 0xee27a9ce,0xee27a9ce
+.long 0x35c961b7,0x35c961b7
+.long 0xede51ce1,0xede51ce1
+.long 0x3cb1477a,0x3cb1477a
+.long 0x59dfd29c,0x59dfd29c
+.long 0x3f73f255,0x3f73f255
+.long 0x79ce1418,0x79ce1418
+.long 0xbf37c773,0xbf37c773
+.long 0xeacdf753,0xeacdf753
+.long 0x5baafd5f,0x5baafd5f
+.long 0x146f3ddf,0x146f3ddf
+.long 0x86db4478,0x86db4478
+.long 0x81f3afca,0x81f3afca
+.long 0x3ec468b9,0x3ec468b9
+.long 0x2c342438,0x2c342438
+.long 0x5f40a3c2,0x5f40a3c2
+.long 0x72c31d16,0x72c31d16
+.long 0x0c25e2bc,0x0c25e2bc
+.long 0x8b493c28,0x8b493c28
+.long 0x41950dff,0x41950dff
+.long 0x7101a839,0x7101a839
+.long 0xdeb30c08,0xdeb30c08
+.long 0x9ce4b4d8,0x9ce4b4d8
+.long 0x90c15664,0x90c15664
+.long 0x6184cb7b,0x6184cb7b
+.long 0x70b632d5,0x70b632d5
+.long 0x745c6c48,0x745c6c48
+.long 0x4257b8d0,0x4257b8d0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
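
The .byte tables above are four copies of the AES inverse S-box (opening 0x52,0x09,0x6a,0xd5..., the standard InvSubBytes table), each copy followed by the same three .long mask words. Those masks -- 0x80808080, 0xfefefefe and 0x1b1b1b1b -- let the "compact" code paths double four GF(2^8) field elements packed in one 32-bit register arithmetically instead of via lookup tables. A minimal C sketch of that packed doubling, using the same constants (an illustration, not the module's code):

#include <stdint.h>
#include <stdio.h>

/* Double four AES field elements packed in a 32-bit word. */
static uint32_t xtime4(uint32_t x)
{
    uint32_t hi = x & 0x80808080u;                /* top bit of each byte          */
    uint32_t lo = (x << 1) & 0xfefefefeu;         /* shift, drop cross-byte carries */
    return lo ^ ((hi - (hi >> 7)) & 0x1b1b1b1bu); /* fold reduction poly 0x11b      */
}

int main(void)
{
    printf("%08x\n", xtime4(0x80025753u));        /* expect 1b04aea6 */
    return 0;
}
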
diff --git a/main/openssl/crypto/aes/asm/aes-x86_64.pl b/main/openssl/crypto/aes/asm/aes-x86_64.pl
index a545e892..34cbb5d8 100755
--- a/main/openssl/crypto/aes/asm/aes-x86_64.pl
+++ b/main/openssl/crypto/aes/asm/aes-x86_64.pl
@@ -36,7 +36,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$verticalspin=1; # unlike the 32-bit version, $verticalspin performs
# ~15% better on both AMD and Intel cores
@@ -588,6 +589,9 @@ $code.=<<___;
.globl AES_encrypt
.type AES_encrypt,\@function,3
.align 16
+.globl asm_AES_encrypt
+.hidden asm_AES_encrypt
+asm_AES_encrypt:
AES_encrypt:
push %rbx
push %rbp
@@ -1184,6 +1188,9 @@ $code.=<<___;
.globl AES_decrypt
.type AES_decrypt,\@function,3
.align 16
+.globl asm_AES_decrypt
+.hidden asm_AES_decrypt
+asm_AES_decrypt:
AES_decrypt:
push %rbx
push %rbp
@@ -1277,13 +1284,13 @@ $code.=<<___;
___
}
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
-.globl AES_set_encrypt_key
-.type AES_set_encrypt_key,\@function,3
+.globl private_AES_set_encrypt_key
+.type private_AES_set_encrypt_key,\@function,3
.align 16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
push %rbx
push %rbp
push %r12 # redundant, but allows to share
@@ -1304,7 +1311,7 @@ AES_set_encrypt_key:
add \$56,%rsp
.Lenc_key_epilogue:
ret
-.size AES_set_encrypt_key,.-AES_set_encrypt_key
+.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align 16
@@ -1547,13 +1554,13 @@ $code.=<<___;
___
}
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
-.globl AES_set_decrypt_key
-.type AES_set_decrypt_key,\@function,3
+.globl private_AES_set_decrypt_key
+.type private_AES_set_decrypt_key,\@function,3
.align 16
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
push %rbx
push %rbp
push %r12
@@ -1622,7 +1629,7 @@ $code.=<<___;
add \$56,%rsp
.Ldec_key_epilogue:
ret
-.size AES_set_decrypt_key,.-AES_set_decrypt_key
+.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
@@ -1648,6 +1655,9 @@ $code.=<<___;
.type AES_cbc_encrypt,\@function,6
.align 16
.extern OPENSSL_ia32cap_P
+.globl asm_AES_cbc_encrypt
+.hidden asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
AES_cbc_encrypt:
cmp \$0,%rdx # check length
je .Lcbc_epilogue
@@ -2766,13 +2776,13 @@ cbc_se_handler:
.rva .LSEH_end_AES_decrypt
.rva .LSEH_info_AES_decrypt
- .rva .LSEH_begin_AES_set_encrypt_key
- .rva .LSEH_end_AES_set_encrypt_key
- .rva .LSEH_info_AES_set_encrypt_key
+ .rva .LSEH_begin_private_AES_set_encrypt_key
+ .rva .LSEH_end_private_AES_set_encrypt_key
+ .rva .LSEH_info_private_AES_set_encrypt_key
- .rva .LSEH_begin_AES_set_decrypt_key
- .rva .LSEH_end_AES_set_decrypt_key
- .rva .LSEH_info_AES_set_decrypt_key
+ .rva .LSEH_begin_private_AES_set_decrypt_key
+ .rva .LSEH_end_private_AES_set_decrypt_key
+ .rva .LSEH_info_private_AES_set_decrypt_key
.rva .LSEH_begin_AES_cbc_encrypt
.rva .LSEH_end_AES_cbc_encrypt
@@ -2788,11 +2798,11 @@ cbc_se_handler:
.byte 9,0,0,0
.rva block_se_handler
.rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
-.LSEH_info_AES_set_encrypt_key:
+.LSEH_info_private_AES_set_encrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
-.LSEH_info_AES_set_decrypt_key:
+.LSEH_info_private_AES_set_decrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
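
Taken together, the hunks above give the assembly two faces: hidden asm_AES_* aliases for direct use by other assembly or a statically linked consumer, and private_AES_set_*_key names that leave the public AES_set_*_key symbols free for a C-level wrapper. A hypothetical sketch of such glue (the macro mapping is illustrative, not the exact OpenSSL header):

/* hypothetical glue: route the public API onto the renamed entry points */
typedef struct aes_key_st AES_KEY;

int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
                                AES_KEY *key);
int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
                                AES_KEY *key);

#define AES_set_encrypt_key private_AES_set_encrypt_key
#define AES_set_decrypt_key private_AES_set_decrypt_key
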
diff --git a/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.S b/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.S
new file mode 100644
index 00000000..32fd600b
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.S
@@ -0,0 +1,1396 @@
+.text
+
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,@function
+.align 16
+aesni_cbc_sha1_enc:
+
+ movl OPENSSL_ia32cap_P+0(%rip),%r10d
+ movl OPENSSL_ia32cap_P+4(%rip),%r11d
+ jmp aesni_cbc_sha1_enc_ssse3
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc_ssse3,@function
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ movq 8(%rsp),%r10
+
+
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -104(%rsp),%rsp
+
+
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqu (%r8),%xmm11
+ movq %r8,88(%rsp)
+ shlq $6,%r14
+ subq %r12,%r13
+ movl 240(%r15),%r8d
+ addq %r10,%r14
+
+ leaq K_XX_XX(%rip),%r11
+ movl 0(%r9),%eax
+ movl 4(%r9),%ebx
+ movl 8(%r9),%ecx
+ movl 12(%r9),%edx
+ movl %ebx,%esi
+ movl 16(%r9),%ebp
+
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r10),%xmm0
+ movdqu 16(%r10),%xmm1
+ movdqu 32(%r10),%xmm2
+ movdqu 48(%r10),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r10
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm0
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ movups (%r15),%xmm13
+ movups 16(%r15),%xmm14
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ movdqa %xmm1,%xmm4
+ addl 0(%rsp),%ebp
+ movups 0(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ xorl %edx,%ecx
+ movdqa %xmm3,%xmm8
+.byte 102,15,58,15,224,8
+ movl %eax,%edi
+ roll $5,%eax
+ paddd %xmm3,%xmm9
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrldq $4,%xmm8
+ xorl %edx,%esi
+ addl %eax,%ebp
+ pxor %xmm0,%xmm4
+ rorl $2,%ebx
+ addl %esi,%ebp
+ pxor %xmm2,%xmm8
+ addl 4(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pxor %xmm8,%xmm4
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm9,48(%rsp)
+ xorl %ecx,%edi
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ addl %ebp,%edx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm4,%xmm8
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 8(%rsp),%ecx
+ xorl %ebx,%eax
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ roll $5,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrld $31,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ movdqa %xmm10,%xmm9
+ rorl $7,%ebp
+ addl %esi,%ecx
+ psrld $30,%xmm10
+ por %xmm8,%xmm4
+ addl 12(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa 0(%r11),%xmm10
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm4
+ rorl $7,%edx
+ addl %edi,%ebx
+ movdqa %xmm2,%xmm5
+ addl 16(%rsp),%eax
+ xorl %ebp,%edx
+ movdqa %xmm4,%xmm9
+.byte 102,15,58,15,233,8
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm4,%xmm10
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrldq $4,%xmm9
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ pxor %xmm1,%xmm5
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm3,%xmm9
+ addl 20(%rsp),%ebp
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pxor %xmm9,%xmm5
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa %xmm10,0(%rsp)
+ xorl %edx,%edi
+ addl %eax,%ebp
+ movdqa %xmm5,%xmm8
+ movdqa %xmm5,%xmm9
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 24(%rsp),%edx
+ xorl %ecx,%ebx
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ roll $5,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ psrld $31,%xmm9
+ xorl %ecx,%esi
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ addl %ebp,%edx
+ movdqa %xmm8,%xmm10
+ rorl $7,%eax
+ addl %esi,%edx
+ psrld $30,%xmm8
+ por %xmm9,%xmm5
+ addl 28(%rsp),%ecx
+ xorl %ebx,%eax
+ movl %edx,%esi
+ roll $5,%edx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ andl %eax,%edi
+ xorl %ebx,%eax
+ movdqa 16(%r11),%xmm8
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ pxor %xmm10,%xmm5
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movdqa %xmm3,%xmm6
+ addl 32(%rsp),%ebx
+ xorl %eax,%ebp
+ movdqa %xmm5,%xmm10
+.byte 102,15,58,15,242,8
+ movl %ecx,%edi
+ roll $5,%ecx
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ paddd %xmm5,%xmm8
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ psrldq $4,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ pxor %xmm2,%xmm6
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm4,%xmm10
+ addl 36(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ pxor %xmm10,%xmm6
+ andl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm8,16(%rsp)
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ movdqa %xmm6,%xmm9
+ movdqa %xmm6,%xmm10
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 40(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%ecx
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ roll $5,%eax
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrld $31,%xmm10
+ xorl %edx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ psrld $30,%xmm9
+ por %xmm10,%xmm6
+ addl 44(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa 16(%r11),%xmm9
+ xorl %ecx,%edi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %ebp,%edx
+ pxor %xmm8,%xmm6
+ rorl $7,%eax
+ addl %edi,%edx
+ movdqa %xmm4,%xmm7
+ addl 48(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm8
+.byte 102,15,58,15,251,8
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm6,%xmm9
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrldq $4,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ pxor %xmm3,%xmm7
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm5,%xmm8
+ addl 52(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ pxor %xmm8,%xmm7
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm9,32(%rsp)
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ movdqa %xmm7,%xmm10
+ movdqa %xmm7,%xmm8
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 56(%rsp),%eax
+ xorl %ebp,%edx
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ roll $5,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrld $31,%xmm8
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ movdqa %xmm10,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ psrld $30,%xmm10
+ por %xmm8,%xmm7
+ addl 60(%rsp),%ebp
+ cmpl $11,%r8d
+ jb .Laesenclast1
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast1
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast1:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa 16(%r11),%xmm10
+ xorl %edx,%edi
+ addl %eax,%ebp
+ pxor %xmm9,%xmm7
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movdqa %xmm7,%xmm9
+ addl 0(%rsp),%edx
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,206,8
+ xorl %ecx,%ebx
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm7,%xmm10
+ xorl %ecx,%esi
+ movups 16(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,0(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 4(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,48(%rsp)
+ movl %edx,%esi
+ roll $5,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ pslld $2,%xmm0
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ psrld $30,%xmm9
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 8(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%edi
+ roll $5,%ecx
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ por %xmm9,%xmm0
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ movdqa %xmm0,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 16(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,215,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm8,%xmm9
+ paddd %xmm0,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ movdqa %xmm8,0(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm10,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %eax,%edi
+ movdqa %xmm1,%xmm8
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 32(%rsp),%eax
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,192,8
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ xorl %edx,%esi
+ addl %ebx,%eax
+ movdqa 32(%r11),%xmm10
+ paddd %xmm1,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ movdqa %xmm9,16(%rsp)
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ pslld $2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ecx,%esi
+ psrld $30,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ por %xmm8,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %ebx,%edi
+ movdqa %xmm2,%xmm9
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 48(%rsp),%ebx
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,201,8
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm2,%xmm10
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ movdqa %xmm10,32(%rsp)
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ pslld $2,%xmm3
+ addl 56(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ psrld $30,%xmm9
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ por %xmm9,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ecx,%edi
+ movdqa %xmm3,%xmm10
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 0(%rsp),%ecx
+ pxor %xmm0,%xmm4
+.byte 102,68,15,58,15,210,8
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ movdqa %xmm8,%xmm9
+ paddd %xmm3,%xmm8
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm8,48(%rsp)
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ pslld $2,%xmm4
+ addl 8(%rsp),%eax
+ xorl %ebp,%esi
+ psrld $30,%xmm10
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ por %xmm10,%xmm4
+ addl 12(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movdqa %xmm4,%xmm8
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 16(%rsp),%edx
+ pxor %xmm1,%xmm5
+.byte 102,68,15,58,15,195,8
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm4,%xmm9
+ rorl $7,%eax
+ addl %esi,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ movdqa %xmm9,0(%rsp)
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast2
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast2
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast2:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ pslld $2,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %eax,%esi
+ psrld $30,%xmm8
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ por %xmm8,%xmm5
+ addl 28(%rsp),%eax
+ xorl %ebp,%edi
+ movdqa %xmm5,%xmm9
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ecx,%edi
+ movups 32(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,16(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ pxor %xmm2,%xmm6
+.byte 102,68,15,58,15,204,8
+ xorl %edx,%ecx
+ addl 32(%rsp),%ebp
+ andl %edx,%edi
+ pxor %xmm7,%xmm6
+ andl %ecx,%esi
+ rorl $7,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm5,%xmm10
+ addl %edi,%ebp
+ movl %eax,%edi
+ pxor %xmm9,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movdqa %xmm6,%xmm9
+ movdqa %xmm10,16(%rsp)
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 36(%rsp),%edx
+ andl %ecx,%esi
+ pslld $2,%xmm6
+ andl %ebx,%edi
+ rorl $7,%eax
+ psrld $30,%xmm9
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ por %xmm9,%xmm6
+ movl %eax,%edi
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm10
+ addl 40(%rsp),%ecx
+ andl %ebx,%edi
+ andl %eax,%esi
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 44(%rsp),%ebx
+ andl %eax,%esi
+ andl %ebp,%edi
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ rorl $7,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%edi
+ pxor %xmm3,%xmm7
+.byte 102,68,15,58,15,213,8
+ xorl %ebp,%edx
+ addl 48(%rsp),%eax
+ andl %ebp,%edi
+ pxor %xmm0,%xmm7
+ andl %edx,%esi
+ rorl $7,%ecx
+ movdqa 48(%r11),%xmm9
+ paddd %xmm6,%xmm8
+ addl %edi,%eax
+ movl %ebx,%edi
+ pxor %xmm10,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movdqa %xmm7,%xmm10
+ movdqa %xmm8,32(%rsp)
+ movl %ecx,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 52(%rsp),%ebp
+ andl %edx,%esi
+ pslld $2,%xmm7
+ andl %ecx,%edi
+ rorl $7,%ebx
+ psrld $30,%xmm10
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ por %xmm10,%xmm7
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm7,%xmm8
+ addl 56(%rsp),%edx
+ andl %ecx,%edi
+ andl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 60(%rsp),%ecx
+ andl %ebx,%esi
+ andl %eax,%edi
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%edi
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,198,8
+ xorl %eax,%ebp
+ addl 0(%rsp),%ebx
+ andl %eax,%edi
+ pxor %xmm1,%xmm0
+ andl %ebp,%esi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ rorl $7,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm7,%xmm9
+ addl %edi,%ebx
+ movl %ecx,%edi
+ pxor %xmm8,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movdqa %xmm0,%xmm8
+ movdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 4(%rsp),%eax
+ andl %ebp,%esi
+ pslld $2,%xmm0
+ andl %edx,%edi
+ rorl $7,%ecx
+ psrld $30,%xmm8
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ por %xmm8,%xmm0
+ movl %ecx,%edi
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%ecx
+ movdqa %xmm0,%xmm9
+ addl 8(%rsp),%ebp
+ andl %edx,%edi
+ andl %ecx,%esi
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 12(%rsp),%edx
+ andl %ecx,%esi
+ andl %ebx,%edi
+ rorl $7,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%edi
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,207,8
+ xorl %ebx,%eax
+ addl 16(%rsp),%ecx
+ andl %ebx,%edi
+ pxor %xmm2,%xmm1
+ andl %eax,%esi
+ rorl $7,%ebp
+ movdqa %xmm10,%xmm8
+ paddd %xmm0,%xmm10
+ addl %edi,%ecx
+ movl %edx,%edi
+ pxor %xmm9,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movdqa %xmm1,%xmm9
+ movdqa %xmm10,0(%rsp)
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 20(%rsp),%ebx
+ andl %eax,%esi
+ pslld $2,%xmm1
+ andl %ebp,%edi
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ rorl $7,%edx
+ psrld $30,%xmm9
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ por %xmm9,%xmm1
+ movl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm1,%xmm10
+ addl 24(%rsp),%eax
+ andl %ebp,%edi
+ andl %edx,%esi
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movl %ecx,%esi
+ cmpl $11,%r8d
+ jb .Laesenclast3
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast3
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast3:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ xorl %edx,%ecx
+ addl 28(%rsp),%ebp
+ andl %edx,%esi
+ andl %ecx,%edi
+ rorl $7,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%edi
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,208,8
+ xorl %ecx,%ebx
+ addl 32(%rsp),%edx
+ andl %ecx,%edi
+ pxor %xmm3,%xmm2
+ andl %ebx,%esi
+ rorl $7,%eax
+ movdqa %xmm8,%xmm9
+ paddd %xmm1,%xmm8
+ addl %edi,%edx
+ movl %ebp,%edi
+ pxor %xmm10,%xmm2
+ roll $5,%ebp
+ movups 48(%r12),%xmm12
+ xorps %xmm13,%xmm12
+ movups %xmm11,32(%r13,%r12,1)
+ xorps %xmm12,%xmm11
+.byte 102,69,15,56,220,222
+ movups 32(%r15),%xmm15
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movdqa %xmm2,%xmm10
+ movdqa %xmm8,16(%rsp)
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 36(%rsp),%ecx
+ andl %ebx,%esi
+ pslld $2,%xmm2
+ andl %eax,%edi
+ rorl $7,%ebp
+ psrld $30,%xmm10
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ por %xmm10,%xmm2
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm2,%xmm8
+ addl 40(%rsp),%ebx
+ andl %eax,%edi
+ andl %ebp,%esi
+.byte 102,69,15,56,220,223
+ movups 48(%r15),%xmm14
+ rorl $7,%edx
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 44(%rsp),%eax
+ andl %ebp,%esi
+ andl %edx,%edi
+ rorl $7,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ addl 48(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 64(%r15),%xmm15
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,193,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm10
+ paddd %xmm2,%xmm9
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ movdqa %xmm9,32(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm8
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 80(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm8,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 0(%rsp),%eax
+ paddd %xmm3,%xmm10
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ movdqa %xmm10,48(%rsp)
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 4(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 96(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 8(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 12(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+.byte 102,69,15,56,220,223
+ movups 112(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ cmpq %r14,%r10
+ je .Ldone_ssse3
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r10),%xmm0
+ movdqu 16(%r10),%xmm1
+ movdqu 32(%r10),%xmm2
+ movdqu 48(%r10),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r10
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+.byte 102,15,56,0,206
+ movl %ecx,%edi
+ roll $5,%ecx
+ paddd %xmm9,%xmm0
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ movdqa %xmm0,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ psubd %xmm9,%xmm0
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+.byte 102,15,56,0,214
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm9,%xmm1
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movdqa %xmm1,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ psubd %xmm9,%xmm1
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+.byte 102,15,56,0,222
+ movl %ebp,%edi
+ roll $5,%ebp
+ paddd %xmm9,%xmm2
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ movdqa %xmm2,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ psubd %xmm9,%xmm2
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast4
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast4
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast4:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movups %xmm11,48(%r13,%r12,1)
+ leaq 64(%r12),%r12
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ addl 12(%r9),%edx
+ movl %eax,0(%r9)
+ addl 16(%r9),%ebp
+ movl %esi,4(%r9)
+ movl %esi,%ebx
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 128(%r15),%xmm15
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+.byte 102,69,15,56,220,223
+ movups 144(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+.byte 102,69,15,56,220,222
+ movups 160(%r15),%xmm15
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ cmpl $11,%r8d
+ jb .Laesenclast5
+ movups 176(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 192(%r15),%xmm15
+.byte 102,69,15,56,220,222
+ je .Laesenclast5
+ movups 208(%r15),%xmm14
+.byte 102,69,15,56,220,223
+ movups 224(%r15),%xmm15
+.byte 102,69,15,56,220,222
+.Laesenclast5:
+.byte 102,69,15,56,221,223
+ movups 16(%r15),%xmm14
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movups %xmm11,48(%r13,%r12,1)
+ movq 88(%rsp),%r8
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ movl %eax,0(%r9)
+ addl 12(%r9),%edx
+ movl %esi,4(%r9)
+ addl 16(%r9),%ebp
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ movups %xmm11,(%r8)
+ leaq 104(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+
+.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
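
K_XX_XX above packs the four standard SHA-1 round constants, one per 20-round group and broadcast across all four lanes, plus the 0x00010203... pshufb mask used to byte-swap the big-endian input words. The constant selection, restated as scalar C (a hedged restatement, not code generated by this module):

#include <stdint.h>

/* SHA-1 round constant for round i (0..79) */
static uint32_t sha1_k(int i)
{
    static const uint32_t K[4] = {
        0x5a827999u,  /* rounds  0..19 */
        0x6ed9eba1u,  /* rounds 20..39 */
        0x8f1bbcdcu,  /* rounds 40..59 */
        0xca62c1d6u   /* rounds 60..79 */
    };
    return K[i / 20];
}
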
diff --git a/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl b/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 00000000..3c8f6c19
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1250 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encryption exhibits *very* low instruction-level
+# parallelism, interleaving it with another algorithm allows processor
+# resources to be utilized better, achieving better overall performance.
+# The SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# the AESNI code is woven into them. Below are performance numbers in
+# cycles per processed byte (less is better) for standalone AESNI-CBC
+# encrypt, for the sum of the latter and standalone SHA1, and for the
+# "stitched" subroutine:
+#
+# AES-128-CBC +SHA1 stitch gain
+# Westmere 3.77[+5.6] 9.37 6.65 +41%
+# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
+#
+# AES-192-CBC
+# Westmere 4.51 10.11 6.97 +45%
+# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
+#
+# AES-256-CBC
+# Westmere 5.25 10.85 7.25 +50%
+# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
+#
+# (*) There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
+# background information. The numbers in parentheses above are SSSE3
+# results collected on an AVX-capable CPU, i.e. they apply on OSes that
+# don't support AVX.
+#
+# Needless to mention, it makes no sense to implement a "stitched"
+# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
+# utilize the available parallelism fully, so stitching would not give
+# any gain. Well, there might be some, e.g. from better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+# AES-128-CBC AES-192-CBC AES-256-CBC
+# Westmere 1.31 1.55 1.80
+# Sandy Bridge 0.93 1.06 1.22
+
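
To make the stitching idea concrete before the generator code, here is a minimal standalone C sketch (assumes an AES-NI-capable toolchain, e.g. gcc -maes; the pairing of one AES round per SHA-1 step is schematic, not the module's actual schedule):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>
#include <wmmintrin.h>  /* _mm_aesenc_si128 */

int main(void)
{
    __m128i iv = _mm_set1_epi32(0x01234567);
    __m128i rk = _mm_set1_epi32((int)0x89abcdefu);
    uint32_t a = 0x67452301u, e = 0xc3d2e1f0u;

    /* scalar SHA-1-style work runs on the integer ALUs while the AES
       round issues to the dedicated AES-NI unit -- the core of the gain */
    for (int i = 0; i < 10; i++) {
        e += ((a << 5) | (a >> 27)) + 0x5a827999u;   /* rol(a,5) + K  */
        iv = _mm_aesenc_si128(iv, rk);               /* one AES round */
    }
    printf("%08x %08x\n", e, (uint32_t)_mm_cvtsi128_si32(iv));
    return 0;
}
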
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# void aesni_cbc_sha1_enc(const void *inp,
+# void *out,
+# size_t length,
+# const AES_KEY *key,
+# unsigned char *iv,
+# SHA_CTX *ctx,
+# const void *in0);
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 16
+aesni_cbc_sha1_enc:
+ # caller should check for SSSE3 and AES-NI bits
+ mov OPENSSL_ia32cap_P+0(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r11d # mask AVX bit
+ and \$`1<<30`,%r10d # mask "Intel CPU" bit
+ or %r11d,%r10d
+ cmp \$`1<<28|1<<30`,%r10d
+ je aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+ jmp aesni_cbc_sha1_enc_ssse3
+ ret
+.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
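
In C terms, the dispatch just emitted reads roughly as below (the word/bit positions come from the comments in this block; the function itself is an illustration):

/* cap[0], cap[1]: the two OPENSSL_ia32cap_P words loaded into r10d/r11d */
static int use_avx_path(const unsigned int cap[2])
{
    unsigned int avx   = cap[1] & (1u << 28);   /* AVX bit          */
    unsigned int intel = cap[0] & (1u << 30);   /* "Intel CPU" bit  */
    return avx && intel;                        /* otherwise: SSSE3 */
}
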
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_ssse3 # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ movdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @X[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @X[2],@X[-3&7]
+ pshufb @X[2],@X[-2&7]
+ pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ movups ($key),$rndkey0 # $key[0]
+ movups 16($key),$rndkey[0] # forward reference
+ jmp .Loop_ssse3
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ movups `16*$n`($in0),$in # load input
+ xorps $rndkey0,$in
+___
+ $code.=<<___ if ($n);
+ movups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ xorps $in,$iv
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Laesenclast$sn
+ movups `32+16*($k+0)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+1)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+ je .Laesenclast$sn
+ movups `32+16*($k+2)`($key),$rndkey[1]
+ aesenc $rndkey[0],$iv
+ movups `32+16*($k+3)`($key),$rndkey[0]
+ aesenc $rndkey[1],$iv
+.Laesenclast$sn:
+ aesenclast $rndkey[0],$iv
+ movups 16($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ aesenc $rndkey[0],$iv
+ movups `32+16*$k`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
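
The k==9 arm emits the tail that finishes the AES rounds: $rounds holds the count read from 240($key) -- 10, 12 or 14 for AES-128/192/256 -- and the jb/je pair selects zero, two or four extra aesenc before the closing aesenclast. A schematic C model of that control flow (illustrative, not a drop-in equivalent):

/* extra aesenc issued by the tail before aesenclast */
static int extra_rounds(int rounds)   /* 10, 12 or 14 */
{
    if (rounds < 11) return 0;   /* AES-128: straight to aesenclast */
    if (rounds < 13) return 2;   /* AES-192: two more rounds        */
    return 4;                    /* AES-256: four more rounds       */
}
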
+
+sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
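
Xupdate_ssse3_16_31 computes the SHA-1 message schedule four elements per vector; the taps named in the comments ("X[-3]", "X[-8]", "X[-14]", "X[-16]") are exactly the scalar recurrence below, and the <<<2 fixup at the end resolves the one lane that would otherwise depend on a value produced in the same vector. As a hedged scalar restatement:

#include <stdint.h>

static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

/* standard SHA-1 schedule expansion, W[0..15] preloaded from the block */
static void sha1_expand(uint32_t W[80])
{
    for (int i = 16; i < 80; i++)
        W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);
}
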
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@Tx[0],@X[0]);
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
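
From round 32 on, the code uses the algebraically equivalent doubled recurrence, whose taps (X[-6], X[-16], X[-28], X[-32], rotate by 2) all lie outside the 4-lane vector being built, removing the intra-vector dependency handled above. Scalar form (rol32 as in the previous sketch):

/* equivalent SHA-1 expansion for i >= 32 */
static void sha1_expand_32_79(uint32_t W[80])
{
    for (int i = 32; i < 80; i++)
        W[i] = rol32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2);
}
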
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_ssse3");
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+
+sub body_20_39 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
+
+sub body_40_59 () {
+ use integer;
+ my ($k,$n);
+ my @r=(
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+ $n = scalar(@r);
+ $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
+ @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
+ $jj++;
+ return @r;
+}
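
The three body_* generators implement the standard SHA-1 round functions, and their aesenc ratios (12 per 20 rounds for body_00_19/body_40_59, 8 for body_20_39) spread the 40 AES rounds of four CBC blocks (the AES-128 case) evenly across the 80 SHA-1 rounds. The boolean functions, restated in C:

/* rounds 0..19: the asm computes (b & (c ^ d)) ^ d via the xor/and/xor dance */
static uint32_t Ch (uint32_t b, uint32_t c, uint32_t d) { return (b & (c ^ d)) ^ d; }

/* rounds 20..39 and 60..79 */
static uint32_t Par(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }

/* rounds 40..59: majority, via the same masking trick */
static uint32_t Maj(uint32_t b, uint32_t c, uint32_t d) { return (b & (c ^ d)) ^ (c & d); }
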
+$code.=<<___;
+.align 16
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ movups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ movups $iv,($ivp) # write IV
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
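+# with identical source and destination, shld $n,%r,%r is a rotate left
+# by n and shrd $n,%r,%r a rotate right, so the shared body_* subs can
+# keep using the $_rol/$_ror macros unchanged on the AVX path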
+
+$code.=<<___;
+.type aesni_cbc_sha1_enc_avx,\@function,6
+.align 16
+aesni_cbc_sha1_enc_avx:
+ mov `($win64?56:8)`(%rsp),$inp # load 7th argument
+ #shr \$6,$len # debugging artefact
+ #jz .Lepilogue_avx # debugging artefact
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea `-104-($win64?10*16:0)`(%rsp),%rsp
+ #mov $in0,$inp # debugging artefact
+ #lea 64(%rsp),$ctx # debugging artefact
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,96+0(%rsp)
+ movaps %xmm7,96+16(%rsp)
+ movaps %xmm8,96+32(%rsp)
+ movaps %xmm9,96+48(%rsp)
+ movaps %xmm10,96+64(%rsp)
+ movaps %xmm11,96+80(%rsp)
+ movaps %xmm12,96+96(%rsp)
+ movaps %xmm13,96+112(%rsp)
+ movaps %xmm14,96+128(%rsp)
+ movaps %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ vzeroall
+ mov $in0,%r12 # reassign arguments
+ mov $out,%r13
+ mov $len,%r14
+ mov $key,%r15
+ vmovdqu ($ivp),$iv # load IV
+ mov $ivp,88(%rsp) # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+ shl \$6,$len
+ sub $in0,$out
+ mov 240($key),$rounds
+ add \$112,$key # size optimization
+ add $inp,$len # end of input
+
+ lea K_XX_XX(%rip),$K_XX_XX
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
+ vpaddd @Tx[1],@X[-3&7],@X[1]
+ vpaddd @Tx[1],@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ vmovups -112($key),$rndkey0 # $key[0]
+ vmovups 16-112($key),$rndkey[0] # forward reference
+ jmp .Loop_avx
+___
+
+my $aesenc=sub {
+ use integer;
+ my ($n,$k)=($r/10,$r%10);
+ if ($k==0) {
+ $code.=<<___;
+ vmovups `16*$n`($in0),$in # load input
+ vxorps $rndkey0,$in,$in
+___
+ $code.=<<___ if ($n);
+ vmovups $iv,`16*($n-1)`($out,$in0) # write output
+___
+ $code.=<<___;
+ vxorps $in,$iv,$iv
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ } elsif ($k==9) {
+ $sn++;
+ $code.=<<___;
+ cmp \$11,$rounds
+ jb .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+0)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+1)-112`($key),$rndkey[0]
+ je .Lvaesenclast$sn
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*($k+2)-112`($key),$rndkey[1]
+ vaesenc $rndkey[1],$iv,$iv
+ vmovups `32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+ vaesenclast $rndkey[0],$iv,$iv
+ vmovups 16-112($key),$rndkey[1] # forward reference
+___
+ } else {
+ $code.=<<___;
+ vaesenc $rndkey[0],$iv,$iv
+ vmovups `32+16*$k-112`($key),$rndkey[1]
+___
+ }
+ $r++; unshift(@rndkey,pop(@rndkey));
+};
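+# One &$aesenc() is spliced in per scheduled SHA-1 round; $r counts them.
+# Every 10th call ($k==0) starts a new block: the next 16 input bytes are
+# loaded and folded into $iv (CBC chaining fused with round-0
+# AddRoundKey) and the previous ciphertext block is stored.  $k==9 ends
+# the block, branching on $rounds for the extra AES-192/256 rounds, and
+# @rndkey is rotated so the next round key loads while the current
+# vaesenc executes.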
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
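+# Rounds 16..31: four schedule words are produced per call from the
+# textbook recurrence W[i] = (W[i-3]^W[i-8]^W[i-14]^W[i-16]) <<< 1.
+# W[i-3] for the last lane lives in the very vector being computed, so
+# that lane starts as zero and is patched by the "X[0]"^=("X[0]">>96)<<<2
+# step above, while eval(shift(@insns)) interleaves the scalar round
+# snippets (and their aesencs) with the vector work.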
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
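+# Rounds 32..79: unrolling the recurrence once gives the equivalent form
+# W[i] = (W[i-6]^W[i-16]^W[i-28]^W[i-32]) <<< 2, whose operands all lie
+# outside the vector being computed -- no cross-lane patch-up is needed,
+# only the vpalignr that composes "X[-6]" from two registers.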
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$len);
+ &je (".Ldone_avx");
+
+ unshift(@Tx,pop(@Tx));
+
+ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
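+# Xuplast_avx_80 transfers the final X[]+K words and, unless the input is
+# exhausted, loads the next 64-byte block and byte-swaps its first xmm;
+# the three Xloop_avx calls below swap the rest and stage X[]+K for
+# rounds 0..11 of that block while the current block's last scalar rounds
+# retire.  Xtail_avx merely drains leftover round snippets on the way out.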
+
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+ $saved_r=$r; @saved_rndkey=@rndkey;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ lea 64($in0),$in0
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+___
+ $jj=$j=$saved_j; @V=@saved_V;
+ $r=$saved_r; @rndkey=@saved_rndkey;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vmovups $iv,48($out,$in0) # write output
+ mov 88(%rsp),$ivp # restore $ivp
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ vmovups $iv,($ivp) # write IV
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps 96+0(%rsp),%xmm6
+ movaps 96+16(%rsp),%xmm7
+ movaps 96+32(%rsp),%xmm8
+ movaps 96+48(%rsp),%xmm9
+ movaps 96+64(%rsp),%xmm10
+ movaps 96+80(%rsp),%xmm11
+ movaps 96+96(%rsp),%xmm12
+ movaps 96+112(%rsp),%xmm13
+ movaps 96+128(%rsp),%xmm14
+ movaps 96+144(%rsp),%xmm15
+___
+$code.=<<___;
+ lea `104+($win64?10*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+
+.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 96(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea `104+10*16`(%rax),%rax # adjust stack pointer
+
+ mov 0(%rax),%r15
+ mov 8(%rax),%r14
+ mov 16(%rax),%r13
+ mov 24(%rax),%r12
+ mov 32(%rax),%rbp
+ mov 40(%rax),%rbx
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size ssse3_handler,.-ssse3_handler
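+# The handler serves both the SSSE3 and AVX routines: when RIP lies
+# between the prologue and epilogue labels it copies the ten stacked
+# xmm6-xmm15 back into the CONTEXT record, reloads the six non-volatile
+# GPRs from the frame, and then lets RtlVirtualUnwind finish, which is
+# what keeps these custom stack layouts unwindable on Win64.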
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
+ .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_end_aesni_cbc_sha1_enc_avx
+ .rva .LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
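+# a REX prefix is needed only when %xmm8-%xmm15 are involved: bit 2
+# (REX.R) extends the ModR/M reg field (the destination here), bit 0
+# (REX.B) the r/m field (the source), and 0x40 is the fixed marker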
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
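+# aesni() lets the module build with assemblers that predate AES-NI:
+# every two-register aesenc/aesenclast the script emitted is rewritten
+# into its literal 66 0F 38 DC/DD encoding by the s///gem pass below.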
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/aes/asm/aesni-x86.S b/main/openssl/crypto/aes/asm/aesni-x86.S
new file mode 100644
index 00000000..0766bb54
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-x86.S
@@ -0,0 +1,2143 @@
+.file "crypto/aes/asm/aesni-x86.s"
+.text
+.globl aesni_encrypt
+.type aesni_encrypt,@function
+.align 16
+aesni_encrypt:
+.L_aesni_encrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L000enc1_loop_1:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L000enc1_loop_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%eax)
+ ret
+.size aesni_encrypt,.-.L_aesni_encrypt_begin
+.globl aesni_decrypt
+.type aesni_decrypt,@function
+.align 16
+aesni_decrypt:
+.L_aesni_decrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L001dec1_loop_2:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L001dec1_loop_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%eax)
+ ret
+.size aesni_decrypt,.-.L_aesni_decrypt_begin
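+# The .byte runs are hand-assembled AES-NI for pre-AES-NI binutils:
+# 102,15,56,220 (66 0F 38 DC) is aesenc, 221 aesenclast, 222 aesdec and
+# 223 aesdeclast, each followed by its ModR/M byte.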
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups (%edx),%xmm0
+.L002enc3_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+ movups (%edx),%xmm0
+ jnz .L002enc3_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups (%edx),%xmm0
+.L003dec3_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+ movups (%edx),%xmm0
+ jnz .L003dec3_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shrl $1,%ecx
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups (%edx),%xmm0
+.L004enc4_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups (%edx),%xmm0
+ jnz .L004enc4_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shrl $1,%ecx
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups (%edx),%xmm0
+.L005dec4_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups (%edx),%xmm0
+ jnz .L005dec4_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ decl %ecx
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ jmp .L_aesni_encrypt6_enter
+.align 16
+.L006enc6_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.align 16
+.L_aesni_encrypt6_enter:
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leal 32(%edx),%edx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%edx),%xmm0
+ jnz .L006enc6_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%edx),%xmm0
+ shrl $1,%ecx
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ decl %ecx
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,241
+ movups (%edx),%xmm0
+.byte 102,15,56,222,249
+ jmp .L_aesni_decrypt6_enter
+.align 16
+.L007dec6_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %ecx
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.align 16
+.L_aesni_decrypt6_enter:
+ movups 16(%edx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leal 32(%edx),%edx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%edx),%xmm0
+ jnz .L007dec6_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+.size _aesni_decrypt6,.-_aesni_decrypt6
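+# The 3/4/6-block helpers keep independent blocks in flight to hide
+# aesenc/aesdec latency; the round count in %ecx is halved on entry
+# (shrl $1) because every loop iteration retires two rounds, alternating
+# between the keys staged in %xmm0 and %xmm1.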
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,@function
+.align 16
+aesni_ecb_encrypt:
+.L_aesni_ecb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ andl $-16,%eax
+ jz .L008ecb_ret
+ movl 240(%edx),%ecx
+ testl %ebx,%ebx
+ jz .L009ecb_decrypt
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L010ecb_enc_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L011ecb_enc_loop6_enter
+.align 16
+.L012ecb_enc_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L011ecb_enc_loop6_enter:
+ call _aesni_encrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L012ecb_enc_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L008ecb_ret
+.L010ecb_enc_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L013ecb_enc_one
+ movups 16(%esi),%xmm3
+ je .L014ecb_enc_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L015ecb_enc_three
+ movups 48(%esi),%xmm5
+ je .L016ecb_enc_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L013ecb_enc_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L017enc1_loop_3:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L017enc1_loop_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L014ecb_enc_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L015ecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L016ecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L009ecb_decrypt:
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L018ecb_dec_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L019ecb_dec_loop6_enter
+.align 16
+.L020ecb_dec_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L019ecb_dec_loop6_enter:
+ call _aesni_decrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L020ecb_dec_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L008ecb_ret
+.L018ecb_dec_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L021ecb_dec_one
+ movups 16(%esi),%xmm3
+ je .L022ecb_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L023ecb_dec_three
+ movups 48(%esi),%xmm5
+ je .L024ecb_dec_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L021ecb_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L025dec1_loop_4:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L025dec1_loop_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L022ecb_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L023ecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L008ecb_ret
+.align 16
+.L024ecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L008ecb_ret:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ecb_encrypt,.-.L_aesni_ecb_encrypt_begin
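+# The ECB paths are software-pipelined: inside the loop6 bodies the
+# stores of the previous six blocks are interleaved with the loads of
+# the next six, and whatever remains afterwards is routed to a 1-, 2-,
+# 3-, 4- or 6-block tail.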
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,@function
+.align 16
+aesni_ccm64_encrypt_blocks:
+.L_aesni_ccm64_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ shrl $1,%ecx
+ leal (%edx),%ebp
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+.L026ccm64_enc_outer:
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups (%esi),%xmm6
+ xorps %xmm0,%xmm2
+ movups 16(%ebp),%xmm1
+ xorps %xmm6,%xmm0
+ leal 32(%ebp),%edx
+ xorps %xmm0,%xmm3
+ movups (%edx),%xmm0
+.L027ccm64_enc2_loop:
+.byte 102,15,56,220,209
+ decl %ecx
+.byte 102,15,56,220,217
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+ leal 32(%edx),%edx
+.byte 102,15,56,220,216
+ movups (%edx),%xmm0
+ jnz .L027ccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ decl %eax
+ leal 16(%esi),%esi
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ jnz .L026ccm64_enc_outer
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ccm64_encrypt_blocks,.-.L_aesni_ccm64_encrypt_blocks_begin
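+# In the CCM64 routines %xmm7 carries the counter block and %xmm3 the
+# running CBC-MAC, encrypted in lock-step two at a time.  The four
+# immediates parked at (%esp) form a full byte-reversal pshufb mask (the
+# 102,15,56,0 .byte lines) and the 1 kept at 16(%esp) increments the
+# 64-bit counter via paddq between blocks.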
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,@function
+.align 16
+aesni_ccm64_decrypt_blocks:
+.L_aesni_ccm64_decrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %edx,%ebp
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L028enc1_loop_5:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L028enc1_loop_5
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+ leal 16(%esi),%esi
+ jmp .L029ccm64_dec_outer
+.align 16
+.L029ccm64_dec_outer:
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movl %ebx,%ecx
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ subl $1,%eax
+ jz .L030ccm64_dec_break
+ movups (%ebp),%xmm0
+ shrl $1,%ecx
+ movups 16(%ebp),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%ebp),%edx
+ xorps %xmm0,%xmm2
+ xorps %xmm6,%xmm3
+ movups (%edx),%xmm0
+.L031ccm64_dec2_loop:
+.byte 102,15,56,220,209
+ decl %ecx
+.byte 102,15,56,220,217
+ movups 16(%edx),%xmm1
+.byte 102,15,56,220,208
+ leal 32(%edx),%edx
+.byte 102,15,56,220,216
+ movups (%edx),%xmm0
+ jnz .L031ccm64_dec2_loop
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leal 16(%esi),%esi
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ jmp .L029ccm64_dec_outer
+.align 16
+.L030ccm64_dec_break:
+ movl %ebp,%edx
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%edx),%edx
+ xorps %xmm6,%xmm3
+.L032enc1_loop_6:
+.byte 102,15,56,220,217
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L032enc1_loop_6
+.byte 102,15,56,221,217
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ccm64_decrypt_blocks,.-.L_aesni_ccm64_decrypt_blocks_begin
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,@function
+.align 16
+aesni_ctr32_encrypt_blocks:
+.L_aesni_ctr32_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $88,%esp
+ andl $-16,%esp
+ movl %ebp,80(%esp)
+ cmpl $1,%eax
+ je .L033ctr32_one_shortcut
+ movdqu (%ebx),%xmm7
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $6,%ecx
+ xorl %ebp,%ebp
+ movl %ecx,16(%esp)
+ movl %ecx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %ebp,28(%esp)
+.byte 102,15,58,22,251,3
+.byte 102,15,58,34,253,3
+ movl 240(%edx),%ecx
+ bswap %ebx
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa (%esp),%xmm2
+.byte 102,15,58,34,203,0
+ leal 3(%ebx),%ebp
+.byte 102,15,58,34,197,0
+ incl %ebx
+.byte 102,15,58,34,203,1
+ incl %ebp
+.byte 102,15,58,34,197,1
+ incl %ebx
+.byte 102,15,58,34,203,2
+ incl %ebp
+.byte 102,15,58,34,197,2
+ movdqa %xmm1,48(%esp)
+.byte 102,15,56,0,202
+ movdqa %xmm0,64(%esp)
+.byte 102,15,56,0,194
+ pshufd $192,%xmm1,%xmm2
+ pshufd $128,%xmm1,%xmm3
+ cmpl $6,%eax
+ jb .L034ctr32_tail
+ movdqa %xmm7,32(%esp)
+ shrl $1,%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $6,%eax
+ jmp .L035ctr32_loop6
+.align 16
+.L035ctr32_loop6:
+ pshufd $64,%xmm1,%xmm4
+ movdqa 32(%esp),%xmm1
+ pshufd $192,%xmm0,%xmm5
+ por %xmm1,%xmm2
+ pshufd $128,%xmm0,%xmm6
+ por %xmm1,%xmm3
+ pshufd $64,%xmm0,%xmm7
+ por %xmm1,%xmm4
+ por %xmm1,%xmm5
+ por %xmm1,%xmm6
+ por %xmm1,%xmm7
+ movups (%ebp),%xmm0
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ decl %ecx
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups %xmm2,(%edi)
+ movdqa 16(%esp),%xmm0
+ xorps %xmm1,%xmm4
+ movdqa 48(%esp),%xmm1
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ paddd %xmm0,%xmm1
+ paddd 64(%esp),%xmm0
+ movdqa (%esp),%xmm2
+ movups 48(%esi),%xmm3
+ movups 64(%esi),%xmm4
+ xorps %xmm3,%xmm5
+ movups 80(%esi),%xmm3
+ leal 96(%esi),%esi
+ movdqa %xmm1,48(%esp)
+.byte 102,15,56,0,202
+ xorps %xmm4,%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+.byte 102,15,56,0,194
+ movups %xmm6,64(%edi)
+ pshufd $192,%xmm1,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movl %ebx,%ecx
+ pshufd $128,%xmm1,%xmm3
+ subl $6,%eax
+ jnc .L035ctr32_loop6
+ addl $6,%eax
+ jz .L036ctr32_ret
+ movl %ebp,%edx
+ leal 1(,%ecx,2),%ecx
+ movdqa 32(%esp),%xmm7
+.L034ctr32_tail:
+ por %xmm7,%xmm2
+ cmpl $2,%eax
+ jb .L037ctr32_one
+ pshufd $64,%xmm1,%xmm4
+ por %xmm7,%xmm3
+ je .L038ctr32_two
+ pshufd $192,%xmm0,%xmm5
+ por %xmm7,%xmm4
+ cmpl $4,%eax
+ jb .L039ctr32_three
+ pshufd $128,%xmm0,%xmm6
+ por %xmm7,%xmm5
+ je .L040ctr32_four
+ por %xmm7,%xmm6
+ call _aesni_encrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm4
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm5
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L033ctr32_one_shortcut:
+ movups (%ebx),%xmm2
+ movl 240(%edx),%ecx
+.L037ctr32_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L041enc1_loop_7:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L041enc1_loop_7
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ xorps %xmm2,%xmm6
+ movups %xmm6,(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L038ctr32_two:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L039ctr32_three:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ movups 32(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L036ctr32_ret
+.align 16
+.L040ctr32_four:
+ call _aesni_encrypt4
+ movups (%esi),%xmm6
+ movups 16(%esi),%xmm7
+ movups 32(%esi),%xmm1
+ xorps %xmm6,%xmm2
+ movups 48(%esi),%xmm0
+ xorps %xmm7,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L036ctr32_ret:
+ movl 80(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_ctr32_encrypt_blocks,.-.L_aesni_ctr32_encrypt_blocks_begin
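+# The CTR32 path builds six counter blocks per iteration: the IV's last
+# dword is extracted and re-inserted with pextrd/pinsrd (102,15,58,22 /
+# 102,15,58,34), incremented in integer registers and byte-swapped, and
+# the encrypted counters are finally xored with the plaintext; only the
+# 32-bit counter word is ever modified.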
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,@function
+.align 16
+aesni_xts_encrypt:
+.L_aesni_xts_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L042enc1_loop_8:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L042enc1_loop_8
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ movl 240(%edx),%ecx
+ andl $-16,%esp
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $96,%eax
+ jc .L043xts_enc_short
+ shrl $1,%ecx
+ movl %ecx,%ebx
+ jmp .L044xts_enc_loop6
+.align 16
+.L044xts_enc_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ pxor 16(%esp),%xmm3
+.byte 102,15,56,220,209
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,220,217
+ pxor 48(%esp),%xmm5
+ decl %ecx
+.byte 102,15,56,220,225
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,241
+ movups (%edx),%xmm0
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ movl %ebx,%ecx
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L044xts_enc_loop6
+ leal 1(,%ecx,2),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L043xts_enc_short:
+ addl $96,%eax
+ jz .L045xts_enc_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L046xts_enc_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L047xts_enc_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L048xts_enc_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L049xts_enc_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_encrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L050xts_enc_done
+.align 16
+.L046xts_enc_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L051enc1_loop_9:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L051enc1_loop_9
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L047xts_enc_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L048xts_enc_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L049xts_enc_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_encrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L050xts_enc_done
+.align 16
+.L045xts_enc_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L052xts_enc_ret
+ movdqa %xmm1,%xmm5
+ movl %eax,112(%esp)
+ jmp .L053xts_enc_steal
+.align 16
+.L050xts_enc_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L052xts_enc_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm5
+ paddq %xmm1,%xmm1
+ pand 96(%esp),%xmm5
+ pxor %xmm1,%xmm5
+.L053xts_enc_steal:
+ movzbl (%esi),%ecx
+ movzbl -16(%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,-16(%edi)
+ movb %dl,(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L053xts_enc_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups -16(%edi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L054enc1_loop_10:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L054enc1_loop_10
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,-16(%edi)
+.L052xts_enc_ret:
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_xts_encrypt,.-.L_aesni_xts_encrypt_begin
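+# The XTS tweak advances by doubling in GF(2^128): paddq shifts both
+# 64-bit halves left by one, pcmpgtd captures the lost sign bits, and
+# pshufd $19 plus pand with the {135,0,1,0} constant at 96(%esp) carry
+# the low half's MSB into the high half and fold the 0x87 reduction
+# polynomial back in when bit 127 overflows.  The xts_enc_steal loop
+# implements ciphertext stealing for a trailing partial block.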
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,@function
+.align 16
+aesni_xts_decrypt:
+.L_aesni_xts_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L055enc1_loop_11:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L055enc1_loop_11
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ andl $-16,%esp
+ xorl %ebx,%ebx
+ testl $15,%eax
+ setnz %bl
+ shll $4,%ebx
+ subl %ebx,%eax
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ subl $96,%eax
+ jc .L056xts_dec_short
+ shrl $1,%ecx
+ movl %ecx,%ebx
+ jmp .L057xts_dec_loop6
+.align 16
+.L057xts_dec_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ leal 32(%ebp),%edx
+ pxor 16(%esp),%xmm3
+.byte 102,15,56,222,209
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,222,217
+ pxor 48(%esp),%xmm5
+ decl %ecx
+.byte 102,15,56,222,225
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,241
+ movups (%edx),%xmm0
+.byte 102,15,56,222,249
+ call .L_aesni_decrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ movl %ebx,%ecx
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L057xts_dec_loop6
+ leal 1(,%ecx,2),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L056xts_dec_short:
+ addl $96,%eax
+ jz .L058xts_dec_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L059xts_dec_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L060xts_dec_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L061xts_dec_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L062xts_dec_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_decrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L063xts_dec_done
+.align 16
+.L059xts_dec_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L064dec1_loop_12:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L064dec1_loop_12
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L060xts_dec_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L061xts_dec_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L062xts_dec_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_decrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L063xts_dec_done
+.align 16
+.L058xts_dec_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L065xts_dec_ret
+ movl %eax,112(%esp)
+ jmp .L066xts_dec_only_one_more
+.align 16
+.L063xts_dec_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L065xts_dec_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+.L066xts_dec_only_one_more:
+ pshufd $19,%xmm0,%xmm5
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm5
+ pxor %xmm1,%xmm5
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%esi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L067dec1_loop_13:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L067dec1_loop_13
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+.L068xts_dec_steal:
+ movzbl 16(%esi),%ecx
+ movzbl (%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,(%edi)
+ movb %dl,16(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L068xts_dec_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%edi),%xmm2
+ xorps %xmm6,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L069dec1_loop_14:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L069dec1_loop_14
+.byte 102,15,56,223,209
+ xorps %xmm6,%xmm2
+ movups %xmm2,(%edi)
+.L065xts_dec_ret:
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_xts_decrypt,.-.L_aesni_xts_decrypt_begin
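+# Decryption steals in the opposite direction: when the length is not a
+# multiple of 16 (the setnz/shll $4 test near the top) one whole block is
+# held back, the last full block is decrypted with the later tweak, bytes
+# are swapped in xts_dec_steal, and a final single-block decrypt with the
+# earlier tweak finishes the tail.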
+.globl aesni_cbc_encrypt
+.type aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+.L_aesni_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl %esp,%ebx
+ movl 24(%esp),%edi
+ subl $24,%ebx
+ movl 28(%esp),%eax
+ andl $-16,%ebx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebp
+ testl %eax,%eax
+ jz .L070cbc_abort
+ cmpl $0,40(%esp)
+ xchgl %esp,%ebx
+ movups (%ebp),%xmm7
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ebx,16(%esp)
+ movl %ecx,%ebx
+ je .L071cbc_decrypt
+ movaps %xmm7,%xmm2
+ cmpl $16,%eax
+ jb .L072cbc_enc_tail
+ subl $16,%eax
+ jmp .L073cbc_enc_loop
+.align 16
+.L073cbc_enc_loop:
+ movups (%esi),%xmm7
+ leal 16(%esi),%esi
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm7
+ leal 32(%edx),%edx
+ xorps %xmm7,%xmm2
+.L074enc1_loop_15:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L074enc1_loop_15
+.byte 102,15,56,221,209
+ movl %ebx,%ecx
+ movl %ebp,%edx
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ subl $16,%eax
+ jnc .L073cbc_enc_loop
+ addl $16,%eax
+ jnz .L072cbc_enc_tail
+ movaps %xmm2,%xmm7
+ jmp .L075cbc_ret
+.L072cbc_enc_tail:
+ movl %eax,%ecx
+.long 2767451785
+ movl $16,%ecx
+ subl %eax,%ecx
+ xorl %eax,%eax
+.long 2868115081
+ leal -16(%edi),%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+ movl %ebp,%edx
+ jmp .L073cbc_enc_loop
+.align 16
+.L071cbc_decrypt:
+ cmpl $80,%eax
+ jbe .L076cbc_dec_tail
+ movaps %xmm7,(%esp)
+ subl $80,%eax
+ jmp .L077cbc_dec_loop6_enter
+.align 16
+.L078cbc_dec_loop6:
+ movaps %xmm0,(%esp)
+ movups %xmm7,(%edi)
+ leal 16(%edi),%edi
+.L077cbc_dec_loop6_enter:
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%esi),%xmm0
+ xorps %xmm1,%xmm7
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 96(%esi),%esi
+ movups %xmm4,32(%edi)
+ movl %ebx,%ecx
+ movups %xmm5,48(%edi)
+ movl %ebp,%edx
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ subl $96,%eax
+ ja .L078cbc_dec_loop6
+ movaps %xmm7,%xmm2
+ movaps %xmm0,%xmm7
+ addl $80,%eax
+ jle .L079cbc_dec_tail_collected
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+.L076cbc_dec_tail:
+ movups (%esi),%xmm2
+ movaps %xmm2,%xmm6
+ cmpl $16,%eax
+ jbe .L080cbc_dec_one
+ movups 16(%esi),%xmm3
+ movaps %xmm3,%xmm5
+ cmpl $32,%eax
+ jbe .L081cbc_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $48,%eax
+ jbe .L082cbc_dec_three
+ movups 48(%esi),%xmm5
+ cmpl $64,%eax
+ jbe .L083cbc_dec_four
+ movups 64(%esi),%xmm6
+ movaps %xmm7,(%esp)
+ movups (%esi),%xmm2
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm7
+ xorps %xmm0,%xmm6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movaps %xmm6,%xmm2
+ subl $80,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L080cbc_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L084dec1_loop_16:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L084dec1_loop_16
+.byte 102,15,56,223,209
+ xorps %xmm7,%xmm2
+ movaps %xmm6,%xmm7
+ subl $16,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L081cbc_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movaps %xmm3,%xmm2
+ leal 16(%edi),%edi
+ movaps %xmm5,%xmm7
+ subl $32,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L082cbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm5,%xmm4
+ movups %xmm2,(%edi)
+ movaps %xmm4,%xmm2
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movups 32(%esi),%xmm7
+ subl $48,%eax
+ jmp .L079cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_four:
+ call _aesni_decrypt4
+ movups 16(%esi),%xmm1
+ movups 32(%esi),%xmm0
+ xorps %xmm7,%xmm2
+ movups 48(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movaps %xmm5,%xmm2
+ subl $64,%eax
+.L079cbc_dec_tail_collected:
+ andl $15,%eax
+ jnz .L085cbc_dec_tail_partial
+ movups %xmm2,(%edi)
+ jmp .L075cbc_ret
+.align 16
+.L085cbc_dec_tail_partial:
+ movaps %xmm2,(%esp)
+ movl $16,%ecx
+ movl %esp,%esi
+ subl %eax,%ecx
+.long 2767451785
+.L075cbc_ret:
+ movl 16(%esp),%esp
+ movl 36(%esp),%ebp
+ movups %xmm7,(%ebp)
+.L070cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aesni_cbc_encrypt,.-.L_aesni_cbc_encrypt_begin
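+# CBC encryption is inherently serial (each ciphertext feeds the next
+# block), so the encrypt path runs one block at a time while decryption
+# uses the six-block pipeline.  The raw .long 2767451785 / 2868115081
+# words appear to decode to "mov %esi,%esi; rep movsb" and "mov %esi,
+# %esi; rep stosb", copying the short tail into place and zero-padding
+# it to a full block before one last pass through the encrypt loop.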
+.type _aesni_set_encrypt_key,@function
+.align 16
+_aesni_set_encrypt_key:
+ testl %eax,%eax
+ jz .L086bad_pointer
+ testl %edx,%edx
+ jz .L086bad_pointer
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ leal 16(%edx),%edx
+ cmpl $256,%ecx
+ je .L08714rounds
+ cmpl $192,%ecx
+ je .L08812rounds
+ cmpl $128,%ecx
+ jne .L089bad_keybits
+.align 16
+.L09010rounds:
+ movl $9,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,200,1
+ call .L091key_128_cold
+.byte 102,15,58,223,200,2
+ call .L092key_128
+.byte 102,15,58,223,200,4
+ call .L092key_128
+.byte 102,15,58,223,200,8
+ call .L092key_128
+.byte 102,15,58,223,200,16
+ call .L092key_128
+.byte 102,15,58,223,200,32
+ call .L092key_128
+.byte 102,15,58,223,200,64
+ call .L092key_128
+.byte 102,15,58,223,200,128
+ call .L092key_128
+.byte 102,15,58,223,200,27
+ call .L092key_128
+.byte 102,15,58,223,200,54
+ call .L092key_128
+ movups %xmm0,(%edx)
+ movl %ecx,80(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L092key_128:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.L091key_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L08812rounds:
+ movq 16(%eax),%xmm2
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L093key_192a_cold
+.byte 102,15,58,223,202,2
+ call .L094key_192b
+.byte 102,15,58,223,202,4
+ call .L095key_192a
+.byte 102,15,58,223,202,8
+ call .L094key_192b
+.byte 102,15,58,223,202,16
+ call .L095key_192a
+.byte 102,15,58,223,202,32
+ call .L094key_192b
+.byte 102,15,58,223,202,64
+ call .L095key_192a
+.byte 102,15,58,223,202,128
+ call .L094key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L095key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 16
+.L093key_192a_cold:
+ movaps %xmm2,%xmm5
+.L096key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 16
+.L094key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp .L096key_192b_warm
+.align 16
+.L08714rounds:
+ movups 16(%eax),%xmm2
+ movl $13,%ecx
+ leal 16(%edx),%edx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L097key_256a_cold
+.byte 102,15,58,223,200,1
+ call .L098key_256b
+.byte 102,15,58,223,202,2
+ call .L099key_256a
+.byte 102,15,58,223,200,2
+ call .L098key_256b
+.byte 102,15,58,223,202,4
+ call .L099key_256a
+.byte 102,15,58,223,200,4
+ call .L098key_256b
+.byte 102,15,58,223,202,8
+ call .L099key_256a
+.byte 102,15,58,223,200,8
+ call .L098key_256b
+.byte 102,15,58,223,202,16
+ call .L099key_256a
+.byte 102,15,58,223,200,16
+ call .L098key_256b
+.byte 102,15,58,223,202,32
+ call .L099key_256a
+.byte 102,15,58,223,200,32
+ call .L098key_256b
+.byte 102,15,58,223,202,64
+ call .L099key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ ret
+.align 16
+.L099key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+.L097key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L098key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.align 4
+.L086bad_pointer:
+ movl $-1,%eax
+ ret
+.align 4
+.L089bad_keybits:
+ movl $-2,%eax
+ ret
+.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.globl aesni_set_encrypt_key
+.type aesni_set_encrypt_key,@function
+.align 16
+aesni_set_encrypt_key:
+.L_aesni_set_encrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ ret
+.size aesni_set_encrypt_key,.-.L_aesni_set_encrypt_key_begin
+.globl aesni_set_decrypt_key
+.type aesni_set_decrypt_key,@function
+.align 16
+aesni_set_decrypt_key:
+.L_aesni_set_decrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ movl 12(%esp),%edx
+ shll $4,%ecx
+ testl %eax,%eax
+ jnz .L100dec_key_ret
+ leal 16(%edx,%ecx,1),%eax
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+ movups %xmm0,(%eax)
+ movups %xmm1,(%edx)
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+.L101dec_key_inverse:
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+ movups %xmm0,16(%eax)
+ movups %xmm1,-16(%edx)
+ cmpl %edx,%eax
+ ja .L101dec_key_inverse
+ movups (%edx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%edx)
+ xorl %eax,%eax
+.L100dec_key_ret:
+ ret
+.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte 115,108,46,111,114,103,62,0
diff --git a/main/openssl/crypto/aes/asm/aesni-x86.pl b/main/openssl/crypto/aes/asm/aesni-x86.pl
new file mode 100644
index 00000000..3dc345b5
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-x86.pl
@@ -0,0 +1,2189 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
+# below for details].
+#
+# Performance.
+#
+# To start with, see the corresponding paragraph in aesni-x86_64.pl...
+# Instead of filling a table similar to the one found there, I've
+# chosen to summarize *comparison* results for raw ECB, CTR and CBC
+# benchmarks. The simplified table below represents 32-bit performance
+# relative to 64-bit performance at every given point. Ratios vary
+# across encryption modes, hence the interval values.
+#
+#	16-byte	    64-byte	256-byte    1-KB	8-KB
+#	53-67%	    67-84%	91-94%	    95-98%	97-99.5%
+#
+# Lower ratios for smaller block sizes are perfectly understandable,
+# because function call overhead is higher in 32-bit mode. For the
+# largest, 8-KB, blocks performance is virtually the same: 32-bit code
+# is less than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike the x86_64 version, this
+# module interleaves at most 6 aes[enc|dec] instructions, because
+# there are not enough registers for 8x interleave [which should be
+# optimal for Sandy Bridge]. In fact, the performance results for the
+# 6x interleave factor presented in aesni-x86_64.pl (except for CTR)
+# were measured with this module.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of 8KB with a 128-bit key; Sandy Bridge, 1.09.
+
+$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-586.pl:-)
+$inline=1; # inline _aesni_[en|de]crypt
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+if ($PREFIX eq "aesni") { $movekey=*movups; }
+else { $movekey=*movups; }
+
+$len="eax";
+$rounds="ecx";
+$key="edx";
+$inp="esi";
+$out="edi";
+$rounds_="ebx"; # backup copy for $rounds
+$key_="ebp"; # backup copy for $key
+
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5"; $in1="xmm5";
+$inout4="xmm6"; $in0="xmm6";
+$inout5="xmm7"; $ivec="xmm7";
+
+# AES-NI extension
+sub aeskeygenassist
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
+}
+sub aescommon
+{ my($opcodelet,$dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
+}
+sub aesimc { aescommon(0xdb,@_); }
+sub aesenc { aescommon(0xdc,@_); }
+sub aesenclast { aescommon(0xdd,@_); }
+sub aesdec { aescommon(0xde,@_); }
+sub aesdeclast { aescommon(0xdf,@_); }
+
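+# The helpers above hand-assemble AES-NI instructions as raw data
+# bytes, so that assemblers predating AES-NI still accept the output.
+# A standalone sketch of the register-to-register encoding follows
+# (illustrative only, never called by the module; ModRM.reg holds the
+# destination register number, ModRM.rm the source):
+sub encode_aesni_sketch
+{ my ($opcodelet,$dst,$src)=@_;		# e.g. 0xdc is aesenc
+  return (0x66,0x0f,0x38,$opcodelet,0xc0|($dst<<3)|$src);
+}
+# encode_aesni_sketch(0xdc,2,1) yields 102,15,56,220,209, i.e. the
+# very ".byte" sequence for aesenc %xmm1,%xmm2 seen in the generated
+# .S file above.
+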
+# Inline version of internal aesni_[en|de]crypt1
+{ my $sn;
+sub aesni_inline_generate1
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+ $sn++;
+
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($ivec,$rndkey0) if (defined($ivec));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout,$ivec) if (defined($ivec));
+ &xorps ($inout,$rndkey0) if (!defined($ivec));
+ &set_label("${p}1_loop_$sn");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &dec ($rounds);
+ &$movekey ($rndkey1,&QWP(0,$key));
+ &lea ($key,&DWP(16,$key));
+ &jnz (&label("${p}1_loop_$sn"));
+ eval"&aes${p}last ($inout,$rndkey1)";
+}}
+
+sub aesni_generate1 # fully unrolled loop
+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+
+ &function_begin_B("_aesni_${p}rypt1");
+ &movups ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(0x10,$key));
+ &xorps ($inout,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0x20,$key));
+ &lea ($key,&DWP(0x30,$key));
+ &cmp ($rounds,11);
+ &jb (&label("${p}128"));
+ &lea ($key,&DWP(0x20,$key));
+ &je (&label("${p}192"));
+ &lea ($key,&DWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x30,$key));
+ &set_label("${p}192");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x10,$key));
+ &set_label("${p}128");
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x10,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x20,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x30,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x40,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x50,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x60,$key));
+ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x70,$key));
+ eval"&aes${p} ($inout,$rndkey1)";
+ eval"&aes${p}last ($inout,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt1");
+}
+
+# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("enc") if (!$inline);
+&function_begin_B("${PREFIX}_encrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_encrypt");
+
+# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("dec") if (!$inline);
+&function_begin_B("${PREFIX}_decrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+&function_end_B("${PREFIX}_decrypt");
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why were 3x subroutines originally used in loops? Even
+# though aes[enc|dec] latency was originally 6, it could be scheduled
+# only every *2nd* cycle. Thus 3x interleave provided optimal
+# utilization, i.e. the subroutine's throughput is virtually the same
+# as that of the non-interleaved subroutine [for up to 3 input
+# blocks]. This is why it makes no sense to implement a 2x subroutine.
+# aes[enc|dec] latency in the next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# the new processor is therefore 8x, but it's infeasible to
+# accommodate that in the XMM registers addressable in 32-bit mode,
+# so 6x is used instead...
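+#
+# A back-of-the-envelope check of the argument above, as a standalone
+# helper (illustrative only, never called by the module): the optimal
+# interleave factor is latency divided by issue interval, rounded up.
+sub optimal_interleave_sketch
+{ my ($latency,$issue_interval)=@_;
+  return int(($latency+$issue_interval-1)/$issue_interval);
+}
+# optimal_interleave_sketch(6,2) == 3 on the original parts,
+# optimal_interleave_sketch(8,1) == 8 on the newer ones, which 32-bit
+# mode cannot accommodate, hence the 6x compromise.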
+
+sub aesni_generate3
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt3");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+ &set_label("${p}3_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}3_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt3");
+}
+
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4-block performance, by ~30%. One can
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%, so it's not worth it...
+sub aesni_generate4
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt4");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &shr ($rounds,1);
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+ &set_label("${p}4_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}4_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt4");
+}
+
+sub aesni_generate6
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt6");
+ &static_label("_aesni_${p}rypt6_enter");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0); # pxor does better here
+ eval"&aes${p} ($inout0,$rndkey1)";
+ &pxor ($inout2,$rndkey0);
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &pxor ($inout3,$rndkey0);
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &pxor ($inout4,$rndkey0);
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &pxor ($inout5,$rndkey0);
+ eval"&aes${p} ($inout4,$rndkey1)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &jmp (&label("_aesni_${p}rypt6_enter"));
+
+ &set_label("${p}6_loop",16);
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_enter",16);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ eval"&aes${p} ($inout4,$rndkey0)";
+ eval"&aes${p} ($inout5,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}6_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ eval"&aes${p}last ($inout4,$rndkey0)";
+ eval"&aes${p}last ($inout5,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt6");
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+
+if ($PREFIX eq "aesni") {
+######################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
+&function_begin("aesni_ecb_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &and ($len,-16);
+ &jz (&label("ecb_ret"));
+ &mov ($rounds,&DWP(240,$key));
+ &test ($rounds_,$rounds_);
+ &jz (&label("ecb_decrypt"));
+
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_enc_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
+
+ &call ("_aesni_encrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_enc_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_enc_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_enc_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_enc_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_enc_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_enc_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+ &call ("_aesni_encrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &jmp (&label("ecb_ret"));
+######################################################################
+&set_label("ecb_decrypt",16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_dec_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+ &call ("_aesni_decrypt6");
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_dec_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+&set_label("ecb_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &cmp ($len,0x20);
+ &jb (&label("ecb_dec_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_dec_two"));
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_dec_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+ &je (&label("ecb_dec_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ecb_ret");
+&function_end("aesni_ecb_encrypt");
+
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on a 64-bit counter and
+# does not update *ivec! Nor does it finalize the CMAC value
+# (see engine/eng_aesni.c for details).
+#
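+# The encrypt path below implements, per block, the following dataflow
+# (minimal runnable sketch, illustrative only and never called by the
+# module; $E is a block-cipher code ref standing in for one AES
+# encryption, the other arguments are 16-byte strings, and the 64-bit
+# counter increment is left to the caller):
+sub ccm64_block_sketch
+{ my ($E,$ctr,$cmac,$pt)=@_;
+  $cmac = $E->($cmac ^ $pt);	# CBC-MAC absorbs the plaintext
+  my $ct = $pt ^ $E->($ctr);	# CTR keystream encrypts the data
+  return ($cmac,$ct);
+}
+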
+{ my $cmac=$inout1;
+&function_begin("aesni_ccm64_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &shr ($rounds,1);
+ &lea ($key_,&DWP(0,$key));
+ &movdqa ($inout3,&QWP(0,"esp"));
+ &movdqa ($inout0,$ivec);
+ &mov ($rounds_,$rounds);
+ &pshufb ($ivec,$inout3);
+
+&set_label("ccm64_enc_outer");
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &mov ($rounds,$rounds_);
+ &movups ($in0,&QWP(0,$inp));
+
+ &xorps ($inout0,$rndkey0);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($rndkey0,$in0);
+ &lea ($key,&DWP(32,$key_));
+ &xorps ($cmac,$rndkey0); # cmac^=inp
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_enc2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_enc2_loop"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+
+ &dec ($len);
+ &lea ($inp,&DWP(16,$inp));
+ &xorps ($in0,$inout0); # inp^=E(ivec)
+ &movdqa ($inout0,$ivec);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+ &jnz (&label("ccm64_enc_outer"));
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_encrypt_blocks");
+
+&function_begin("aesni_ccm64_decrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
+ &mov ($rounds,&DWP(240,$key));
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds_,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds_);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
+ &movdqa ($inout0,$ivec);
+
+ &mov ($key_,$key);
+ &mov ($rounds_,$rounds);
+
+ &pshufb ($ivec,$inout3);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &lea ($inp,&QWP(16,$inp));
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+ &xorps ($in0,$inout0); # inp ^= E(ivec)
+ &movdqa ($inout0,$ivec);
+ &mov ($rounds,$rounds_);
+ &movups (&QWP(0,$out),$in0); # save output
+ &lea ($out,&DWP(16,$out));
+ &pshufb ($inout0,$inout3);
+
+ &sub ($len,1);
+ &jz (&label("ccm64_dec_break"));
+
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &xorps ($in0,$rndkey0);
+ &lea ($key,&DWP(32,$key_));
+ &xorps ($inout0,$rndkey0);
+ &xorps ($cmac,$in0); # cmac^=out
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_dec2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_dec2_loop"));
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &paddq ($ivec,&QWP(16,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &lea ($inp,&QWP(16,$inp));
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+ &mov ($key,$key_);
+ if ($inline)
+ { &aesni_inline_generate1("enc",$cmac,$in0); }
+ else
+ { &call ("_aesni_encrypt1",$cmac); }
+
+ &mov ("esp",&DWP(48,"esp"));
+ &mov ($out,&wparam(5));
+ &movups (&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_decrypt_blocks");
+}
+
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on a 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# stack layout:
+# 0 pshufb mask
+# 16 vector addend: 0,6,6,6
+# 32 counter-less ivec
+# 48 1st triplet of counter vector
+# 64 2nd triplet of counter vector
+# 80 saved %esp
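+#
+# Standalone sketch of the counter handling (illustrative only, never
+# called by the module; assumes a 16-byte $ivec string whose last 4
+# bytes are a big-endian block counter, which the code below pulls
+# with pextrd and bswap):
+sub ctr32_blocks_sketch
+{ my ($ivec,$n)=@_;
+  my $ctr  = unpack("N",substr($ivec,12,4));	# pull 32-bit counter
+  my $base = substr($ivec,0,12);		# counter-less ivec
+  return map { $base.pack("N",($ctr+$_)&0xffffffff) } (0..$n-1);
+}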
+
+&function_begin("aesni_ctr32_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($key_,"esp");
+ &sub ("esp",88);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(80,"esp"),$key_);
+
+ &cmp ($len,1);
+ &je (&label("ctr32_one_shortcut"));
+
+ &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds,6);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$rounds);
+ &mov (&DWP(24,"esp"),$rounds);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
+ &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
+
+ &mov ($rounds,&DWP(240,$key)); # key->rounds
+
+ # compose 2 vectors of 3x32-bit counters
+ &bswap ($rounds_);
+ &pxor ($rndkey1,$rndkey1);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
+ &pinsrd ($rndkey1,$rounds_,0);
+ &lea ($key_,&DWP(3,$rounds_));
+ &pinsrd ($rndkey0,$key_,0);
+ &inc ($rounds_);
+ &pinsrd ($rndkey1,$rounds_,1);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,1);
+ &inc ($rounds_);
+ &pinsrd ($rndkey1,$rounds_,2);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,2);
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+
+ &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &cmp ($len,6);
+ &jb (&label("ctr32_tail"));
+ &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
+ &shr ($rounds,1);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,6);
+ &jmp (&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout0,$rndkey1); # merge counter-less ivec
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout1,$rndkey1);
+ &pshufd ($inout5,$rndkey0,1<<6);
+ &por ($inout2,$rndkey1);
+ &por ($inout3,$rndkey1);
+ &por ($inout4,$rndkey1);
+ &por ($inout5,$rndkey1);
+
+ # inlining _aesni_encrypt6's prologue gives ~4% improvement...
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &dec ($rounds);
+ &pxor ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,$rndkey0);
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,$rndkey0);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,$rndkey0);
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
+ &xorps ($inout2,$rndkey1);
+ &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+
+ &paddd ($rndkey1,$rndkey0); # 1st triplet increment
+ &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
+
+ &movups ($inout1,&QWP(0x30,$inp));
+ &movups ($inout2,&QWP(0x40,$inp));
+ &xorps ($inout3,$inout1);
+ &movups ($inout1,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &xorps ($inout4,$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &xorps ($inout5,$inout1);
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+ &movups (&QWP(0x40,$out),$inout4);
+ &pshufd ($inout0,$rndkey1,3<<6);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+
+ &mov ($rounds,$rounds_);
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &sub ($len,6);
+ &jnc (&label("ctr32_loop6"));
+
+ &add ($len,6);
+ &jz (&label("ctr32_ret"));
+ &mov ($key,$key_);
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+	&movdqa	($inout5,&QWP(32,"esp"));	# pull counter-less ivec
+
+&set_label("ctr32_tail");
+ &por ($inout0,$inout5);
+ &cmp ($len,2);
+ &jb (&label("ctr32_one"));
+
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &por ($inout1,$inout5);
+ &je (&label("ctr32_two"));
+
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout2,$inout5);
+ &cmp ($len,4);
+ &jb (&label("ctr32_three"));
+
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout3,$inout5);
+ &je (&label("ctr32_four"));
+
+ &por ($inout4,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout2,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout4,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_one_shortcut",16);
+ &movups ($inout0,&QWP(0,$rounds_)); # load ivec
+ &mov ($rounds,&DWP(240,$key));
+
+&set_label("ctr32_one");
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp));
+ &xorps ($in0,$inout0);
+ &movups (&QWP(0,$out),$in0);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+ &call ("_aesni_encrypt3");
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &movups ($inout5,&QWP(0x20,$inp));
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_four",16);
+ &call ("_aesni_encrypt4");
+ &movups ($inout4,&QWP(0,$inp));
+ &movups ($inout5,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout0,$inout4);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout1,$inout5);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+
+&set_label("ctr32_ret");
+ &mov ("esp",&DWP(80,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2
+# const unsigned char iv[16]);
+#
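+# Each block's tweak is the previous one multiplied by x in GF(2^128)
+# modulo x^128+x^7+x^2+x+1, the 0x87 "magic constant" composed on the
+# stack below and applied via the pshufd/pand/pxor sequences. A
+# standalone sketch on two 64-bit halves (illustrative only, never
+# called by the module; assumes a 64-bit perl):
+sub xts_mul_x_sketch
+{ my ($lo,$hi)=@_;		# 128-bit tweak, little-endian halves
+  my $carry = ($hi>>63)&1;	# bit 127 shifted out...
+  $hi = (($hi<<1)|(($lo>>63)&1)) & 0xffffffffffffffff;
+  $lo = (($lo<<1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
+  return ($lo,$hi);		# ...folds back in as 0x87
+}
+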
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &and ("esp",-16); # align stack
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,16*6);
+ &jc (&label("xts_enc_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_enc_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+ &add ($len,16*6);
+ &jz (&label("xts_enc_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_enc_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_enc_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_enc_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_enc_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);	# &psllq($tweak,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_encrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout2);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_encrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+ &movdqa ($inout3,$tweak);
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($inout3,$twtmp,0x13);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+&set_label("xts_enc_steal");
+ &movz ($rounds,&BP(0,$inp));
+ &movz ($key,&BP(-16,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(-16,$out),&LB($rounds));
+ &mov (&BP(0,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_enc_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(-16,$out)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(-16,$out),$inout0); # write output
+
+&set_label("xts_enc_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &and ("esp",-16); # align stack
+
+ &xor ($rounds_,$rounds_); # if(len%16) len-=16;
+ &test ($len,15);
+ &setnz (&LB($rounds_));
+ &shl ($rounds_,4);
+ &sub ($len,$rounds_);
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &sub ($len,16*6);
+ &jc (&label("xts_dec_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesdec ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesdec ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesdec ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesdec ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesdec ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesdec ($inout5,$rndkey1);
+ &call (&label("_aesni_decrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_dec_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+ &add ($len,16*6);
+ &jz (&label("xts_dec_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_dec_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_dec_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_dec_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_dec_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);	# &psllq($tweak,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_decrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_decrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(16*6,"esp"));
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+ &pshufd ($inout3,$twtmp,0x13);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,$twmask); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_steal");
+ &movz ($rounds,&BP(16,$inp));
+ &movz ($key,&BP(0,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(0,$out),&LB($rounds));
+ &mov (&BP(16,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_dec_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$out)); # load input
+ &xorps ($inout0,$inout4); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout4); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_decrypt");
+}
+}
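
The pcmpgtd/pshufd/pand/pxor sequences above (the "isolate carry and
residue" steps) double the XTS tweak in GF(2^128) modulo the polynomial
x^128 + x^7 + x^2 + x + 1. A scalar C sketch of the same update, as a
reading aid only:

    #include <stdint.h>

    /* Double a 128-bit XTS tweak, stored little-endian per IEEE P1619:
     * shift left one bit; if a bit falls off the top, fold it back in
     * with the residue 0x87 -- the value the SSE carry mask selects. */
    static void xts_double_tweak(uint8_t T[16])
    {
        unsigned carry = 0;
        for (int i = 0; i < 16; i++) {
            unsigned next = T[i] >> 7;             /* outgoing bit of this byte */
            T[i] = (uint8_t)((T[i] << 1) | carry);
            carry = next;
        }
        if (carry)
            T[0] ^= 0x87;                          /* reduction term */
    }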
+
+######################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0));
+ &mov ($rounds_,"esp");
+ &mov ($out,&wparam(1));
+ &sub ($rounds_,24);
+ &mov ($len,&wparam(2));
+ &and ($rounds_,-16);
+ &mov ($key,&wparam(3));
+ &mov ($key_,&wparam(4));
+ &test ($len,$len);
+ &jz (&label("cbc_abort"));
+
+ &cmp (&wparam(5),0);
+ &xchg ($rounds_,"esp"); # alloca
+ &movups ($ivec,&QWP(0,$key_)); # load IV
+ &mov ($rounds,&DWP(240,$key));
+ &mov ($key_,$key); # backup $key
+ &mov (&DWP(16,"esp"),$rounds_); # save original %esp
+ &mov ($rounds_,$rounds); # backup $rounds
+ &je (&label("cbc_decrypt"));
+
+ &movaps ($inout0,$ivec);
+ &cmp ($len,16);
+ &jb (&label("cbc_enc_tail"));
+ &sub ($len,16);
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movups ($ivec,&QWP(0,$inp)); # input actually
+ &lea ($inp,&DWP(16,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc",$inout0,$ivec); }
+ else
+ { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0,$out),$inout0); # store output
+ &lea ($out,&DWP(16,$out));
+ &sub ($len,16);
+ &jnc (&label("cbc_enc_loop"));
+ &add ($len,16);
+ &jnz (&label("cbc_enc_tail"));
+ &movaps ($ivec,$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_enc_tail");
+ &mov ("ecx",$len); # zaps $rounds
+ &data_word(0xA4F3F689); # rep movsb
+ &mov ("ecx",16); # zero tail
+ &sub ("ecx",$len);
+ &xor ("eax","eax"); # zaps $len
+ &data_word(0xAAF3F689); # rep stosb
+ &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($inp,$out); # $inp and $out are the same
+ &mov ($key,$key_); # restore $key
+ &jmp (&label("cbc_enc_loop"));
+######################################################################
+&set_label("cbc_decrypt",16);
+ &cmp ($len,0x50);
+ &jbe (&label("cbc_dec_tail"));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_loop6_enter"));
+
+&set_label("cbc_dec_loop6",16);
+ &movaps (&QWP(0,"esp"),$rndkey0); # save IV
+ &movups (&QWP(0,$out),$inout5);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_loop6_enter");
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+
+ &call ("_aesni_decrypt6");
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^=IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout4,$rndkey0);
+ &movups ($rndkey0,&QWP(0x50,$inp)); # IV
+ &xorps ($inout5,$rndkey1);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($inp,&DWP(0x60,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+	&mov	($rounds,$rounds_);	# restore $rounds
+ &movups (&QWP(0x30,$out),$inout3);
+ &mov ($key,$key_); # restore $key
+ &movups (&QWP(0x40,$out),$inout4);
+ &lea ($out,&DWP(0x50,$out));
+ &sub ($len,0x60);
+ &ja (&label("cbc_dec_loop6"));
+
+ &movaps ($inout0,$inout5);
+ &movaps ($ivec,$rndkey0);
+ &add ($len,0x50);
+ &jle (&label("cbc_dec_tail_collected"));
+ &movups (&QWP(0,$out),$inout0);
+ &lea ($out,&DWP(0x10,$out));
+&set_label("cbc_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+ &movaps ($in0,$inout0);
+ &cmp ($len,0x10);
+ &jbe (&label("cbc_dec_one"));
+
+ &movups ($inout1,&QWP(0x10,$inp));
+ &movaps ($in1,$inout1);
+ &cmp ($len,0x20);
+ &jbe (&label("cbc_dec_two"));
+
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x30);
+ &jbe (&label("cbc_dec_three"));
+
+ &movups ($inout3,&QWP(0x30,$inp));
+ &cmp ($len,0x40);
+ &jbe (&label("cbc_dec_four"));
+
+ &movups ($inout4,&QWP(0x40,$inp));
+ &movaps (&QWP(0,"esp"),$ivec); # save IV
+ &movups ($inout0,&QWP(0,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,&QWP(0,"esp")); # ^= IV
+ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout2,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout3,$rndkey1);
+ &movups ($ivec,&QWP(0x40,$inp)); # IV
+ &xorps ($inout4,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &lea ($out,&DWP(0x40,$out));
+ &movaps ($inout0,$inout4);
+ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$ivec);
+ &movaps ($ivec,$in0);
+ &sub ($len,0x10);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_two",16);
+ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout1);
+ &lea ($out,&DWP(0x10,$out));
+ &movaps ($ivec,$in1);
+ &sub ($len,0x20);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three",16);
+ &call ("_aesni_decrypt3");
+ &xorps ($inout0,$ivec);
+ &xorps ($inout1,$in0);
+ &xorps ($inout2,$in1);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout2);
+ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($out,&DWP(0x20,$out));
+ &movups ($ivec,&QWP(0x20,$inp));
+ &sub ($len,0x30);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups ($rndkey1,&QWP(0x10,$inp));
+ &movups ($rndkey0,&QWP(0x20,$inp));
+ &xorps ($inout0,$ivec);
+ &movups ($ivec,&QWP(0x30,$inp));
+ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout2,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0x20,$out),$inout2);
+ &lea ($out,&DWP(0x30,$out));
+ &movaps ($inout0,$inout3);
+ &sub ($len,0x40);
+
+&set_label("cbc_dec_tail_collected");
+ &and ($len,15);
+ &jnz (&label("cbc_dec_tail_partial"));
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("cbc_ret"));
+
+&set_label("cbc_dec_tail_partial",16);
+ &movaps (&QWP(0,"esp"),$inout0);
+ &mov ("ecx",16);
+ &mov ($inp,"esp");
+ &sub ("ecx",$len);
+ &data_word(0xA4F3F689); # rep movsb
+
+&set_label("cbc_ret");
+ &mov ("esp",&DWP(16,"esp")); # pull original %esp
+ &mov ($key_,&wparam(4));
+ &movups (&QWP(0,$key_),$ivec); # output IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+######################################################################
+# Mechanical port from aesni-x86_64.pl.
+#
+# _aesni_set_encrypt_key is a private interface,
+# input:
+# "eax" const unsigned char *userKey
+# $rounds int bits
+# $key AES_KEY *key
+# output:
+# "eax" return code
+# $rounds rounds
+
+&function_begin_B("_aesni_set_encrypt_key");
+ &test ("eax","eax");
+ &jz (&label("bad_pointer"));
+ &test ($key,$key);
+ &jz (&label("bad_pointer"));
+
+ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &lea ($key,&DWP(16,$key));
+ &cmp ($rounds,256);
+ &je (&label("14rounds"));
+ &cmp ($rounds,192);
+ &je (&label("12rounds"));
+ &cmp ($rounds,128);
+ &jne (&label("bad_keybits"));
+
+&set_label("10rounds",16);
+ &mov ($rounds,9);
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 1
+ &call (&label("key_128_cold"));
+	&aeskeygenassist("xmm1","xmm0",0x02);	# round 2
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 3
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 4
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 5
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 6
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x40); # round 7
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x80); # round 8
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
+ &call (&label("key_128"));
+ &aeskeygenassist("xmm1","xmm0",0x36); # round 10
+ &call (&label("key_128"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(80,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_128",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_128_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("12rounds",16);
+ &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
+ &mov ($rounds,11);
+	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
+ &call (&label("key_192a_cold"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
+ &call (&label("key_192b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
+ &call (&label("key_192a"));
+ &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
+ &call (&label("key_192b"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(48,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_192a",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+ &movaps ("xmm5","xmm2");
+&set_label("key_192b_warm");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &movdqa ("xmm3","xmm2");
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &pslldq ("xmm3",4);
+ &xorps ("xmm0","xmm4");
+ &pshufd ("xmm1","xmm1",0b01010101); # critical path
+ &pxor ("xmm2","xmm3");
+ &pxor ("xmm0","xmm1");
+ &pshufd ("xmm3","xmm0",0b11111111);
+ &pxor ("xmm2","xmm3");
+ &ret();
+
+&set_label("key_192b",16);
+ &movaps ("xmm3","xmm0");
+ &shufps ("xmm5","xmm0",0b01000100);
+ &$movekey (&QWP(0,$key),"xmm5");
+ &shufps ("xmm3","xmm2",0b01001110);
+ &$movekey (&QWP(16,$key),"xmm3");
+ &lea ($key,&DWP(32,$key));
+ &jmp (&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+ &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
+ &mov ($rounds,13);
+ &lea ($key,&DWP(16,$key));
+ &$movekey (&QWP(-32,$key),"xmm0"); # round 0
+ &$movekey (&QWP(-16,$key),"xmm2"); # round 1
+ &aeskeygenassist("xmm1","xmm2",0x01); # round 2
+ &call (&label("key_256a_cold"));
+ &aeskeygenassist("xmm1","xmm0",0x01); # round 3
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x02); # round 4
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x02); # round 5
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x04); # round 6
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x04); # round 7
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x08); # round 8
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x08); # round 9
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x10); # round 10
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x10); # round 11
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x20); # round 12
+ &call (&label("key_256a"));
+ &aeskeygenassist("xmm1","xmm0",0x20); # round 13
+ &call (&label("key_256b"));
+ &aeskeygenassist("xmm1","xmm2",0x40); # round 14
+ &call (&label("key_256a"));
+ &$movekey (&QWP(0,$key),"xmm0");
+ &mov (&DWP(16,$key),$rounds);
+ &xor ("eax","eax");
+ &ret();
+
+&set_label("key_256a",16);
+ &$movekey (&QWP(0,$key),"xmm2");
+ &lea ($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &xorps ("xmm0","xmm4");
+ &shufps ("xmm1","xmm1",0b11111111); # critical path
+ &xorps ("xmm0","xmm1");
+ &ret();
+
+&set_label("key_256b",16);
+ &$movekey (&QWP(0,$key),"xmm0");
+ &lea ($key,&DWP(16,$key));
+
+ &shufps ("xmm4","xmm2",0b00010000);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm4","xmm2",0b10001100);
+ &xorps ("xmm2","xmm4");
+ &shufps ("xmm1","xmm1",0b10101010); # critical path
+ &xorps ("xmm2","xmm1");
+ &ret();
+
+&set_label("bad_pointer",4);
+ &mov ("eax",-1);
+ &ret ();
+&set_label("bad_keybits",4);
+ &mov ("eax",-2);
+ &ret ();
+&function_end_B("_aesni_set_encrypt_key");
+
+# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &ret ();
+&function_end_B("${PREFIX}_set_encrypt_key");
+
+# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
+# AES_KEY *key)
+&function_begin_B("${PREFIX}_set_decrypt_key");
+ &mov ("eax",&wparam(0));
+ &mov ($rounds,&wparam(1));
+ &mov ($key,&wparam(2));
+ &call ("_aesni_set_encrypt_key");
+ &mov ($key,&wparam(2));
+	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
+ &test ("eax","eax");
+ &jnz (&label("dec_key_ret"));
+ &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
+
+ &$movekey ("xmm0",&QWP(0,$key)); # just swap
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &$movekey (&QWP(0,"eax"),"xmm0");
+ &$movekey (&QWP(0,$key),"xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+
+&set_label("dec_key_inverse");
+ &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
+ &$movekey ("xmm1",&QWP(0,"eax"));
+ &aesimc ("xmm0","xmm0");
+ &aesimc ("xmm1","xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+ &$movekey (&QWP(16,"eax"),"xmm0");
+ &$movekey (&QWP(-16,$key),"xmm1");
+ &cmp ("eax",$key);
+ &ja (&label("dec_key_inverse"));
+
+ &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
+ &aesimc ("xmm0","xmm0");
+ &$movekey (&QWP(0,$key),"xmm0");
+
+ &xor ("eax","eax"); # return success
+&set_label("dec_key_ret");
+ &ret ();
+&function_end_B("${PREFIX}_set_decrypt_key");
+&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/main/openssl/crypto/aes/asm/aesni-x86_64.S b/main/openssl/crypto/aes/asm/aesni-x86_64.S
new file mode 100644
index 00000000..917c8323
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-x86_64.S
@@ -0,0 +1,2535 @@
+.text
+.globl aesni_encrypt
+.type aesni_encrypt,@function
+.align 16
+aesni_encrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_enc1_1:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_enc1_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+.size aesni_encrypt,.-aesni_encrypt
+
+.globl aesni_decrypt
+.type aesni_decrypt,@function
+.align 16
+aesni_decrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_dec1_2:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_dec1_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+.size aesni_decrypt, .-aesni_decrypt
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups (%rcx),%xmm0
+
+.Lenc_loop3:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop3
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ .byte 0xf3,0xc3
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups (%rcx),%xmm0
+
+.Ldec_loop3:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop3
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ .byte 0xf3,0xc3
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups (%rcx),%xmm0
+
+.Lenc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ .byte 0xf3,0xc3
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups (%rcx),%xmm0
+
+.Ldec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ .byte 0xf3,0xc3
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,220,241
+ movups (%rcx),%xmm0
+.byte 102,15,56,220,249
+ jmp .Lenc_loop6_enter
+.align 16
+.Lenc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lenc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ .byte 0xf3,0xc3
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ movups (%rcx),%xmm0
+.byte 102,15,56,222,249
+ jmp .Ldec_loop6_enter
+.align 16
+.Ldec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.Ldec_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ .byte 0xf3,0xc3
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.type _aesni_encrypt8,@function
+.align 16
+_aesni_encrypt8:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,220,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,220,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 16(%rcx),%xmm1
+ jmp .Lenc_loop8_enter
+.align 16
+.Lenc_loop8:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 16(%rcx),%xmm1
+.Lenc_loop8_enter:
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups (%rcx),%xmm0
+ jnz .Lenc_loop8
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+.byte 102,68,15,56,221,192
+.byte 102,68,15,56,221,200
+ .byte 0xf3,0xc3
+.size _aesni_encrypt8,.-_aesni_encrypt8
+.type _aesni_decrypt8,@function
+.align 16
+_aesni_decrypt8:
+ movups (%rcx),%xmm0
+ shrl $1,%eax
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+ jmp .Ldec_loop8_enter
+.align 16
+.Ldec_loop8:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 16(%rcx),%xmm1
+.Ldec_loop8_enter:
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups (%rcx),%xmm0
+ jnz .Ldec_loop8
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+.byte 102,68,15,56,223,192
+.byte 102,68,15,56,223,200
+ .byte 0xf3,0xc3
+.size _aesni_decrypt8,.-_aesni_decrypt8
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,@function
+.align 16
+aesni_ecb_encrypt:
+ andq $-16,%rdx
+ jz .Lecb_ret
+
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %eax,%r10d
+ testl %r8d,%r8d
+ jz .Lecb_decrypt
+
+ cmpq $128,%rdx
+ jb .Lecb_enc_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ subq $128,%rdx
+ jnc .Lecb_enc_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb .Lecb_enc_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_enc_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb .Lecb_enc_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_enc_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb .Lecb_enc_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_enc_six
+ movdqu 96(%rdi),%xmm8
+ call _aesni_encrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_3:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ jmp .Lecb_ret
+
+.align 16
+.Lecb_decrypt:
+ cmpq $128,%rdx
+ jb .Lecb_dec_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups (%r11),%xmm0
+ subq $128,%rdx
+ jnc .Lecb_dec_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb .Lecb_dec_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_dec_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb .Lecb_dec_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_dec_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb .Lecb_dec_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_dec_six
+ movups 96(%rdi),%xmm8
+ movups (%rcx),%xmm0
+ call _aesni_decrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_4:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+
+.Lecb_ret:
+ .byte 0xf3,0xc3
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,@function
+.align 16
+aesni_ccm64_encrypt_blocks:
+ movl 240(%rcx),%eax
+ movdqu (%r8),%xmm9
+ movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lbswap_mask(%rip),%xmm7
+
+ shrl $1,%eax
+ leaq 0(%rcx),%r11
+ movdqu (%r9),%xmm3
+ movdqa %xmm9,%xmm2
+ movl %eax,%r10d
+.byte 102,68,15,56,0,207
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ movups (%r11),%xmm0
+ movl %r10d,%eax
+ movups (%rdi),%xmm8
+
+ xorps %xmm0,%xmm2
+ movups 16(%r11),%xmm1
+ xorps %xmm8,%xmm0
+ leaq 32(%r11),%rcx
+ xorps %xmm0,%xmm3
+ movups (%rcx),%xmm0
+
+.Lccm64_enc2_loop:
+.byte 102,15,56,220,209
+ decl %eax
+.byte 102,15,56,220,217
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,216
+ movups 0(%rcx),%xmm0
+ jnz .Lccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq %xmm6,%xmm9
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+
+ decq %rdx
+ leaq 16(%rdi),%rdi
+ xorps %xmm2,%xmm8
+ movdqa %xmm9,%xmm2
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+ jnz .Lccm64_enc_outer
+
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,@function
+.align 16
+aesni_ccm64_decrypt_blocks:
+ movl 240(%rcx),%eax
+ movups (%r8),%xmm9
+ movdqu (%r9),%xmm3
+ movdqa .Lincrement64(%rip),%xmm6
+ movdqa .Lbswap_mask(%rip),%xmm7
+
+ movaps %xmm9,%xmm2
+ movl %eax,%r10d
+ movq %rcx,%r11
+.byte 102,68,15,56,0,207
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_5:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_5
+.byte 102,15,56,221,209
+ movups (%rdi),%xmm8
+ paddq %xmm6,%xmm9
+ leaq 16(%rdi),%rdi
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps %xmm2,%xmm8
+ movdqa %xmm9,%xmm2
+ movl %r10d,%eax
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+
+ subq $1,%rdx
+ jz .Lccm64_dec_break
+
+ movups (%r11),%xmm0
+ shrl $1,%eax
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%rcx
+ xorps %xmm0,%xmm2
+ xorps %xmm8,%xmm3
+ movups (%rcx),%xmm0
+
+.Lccm64_dec2_loop:
+.byte 102,15,56,220,209
+ decl %eax
+.byte 102,15,56,220,217
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,216
+ movups 0(%rcx),%xmm0
+ jnz .Lccm64_dec2_loop
+ movups (%rdi),%xmm8
+ paddq %xmm6,%xmm9
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 16(%rdi),%rdi
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+
+ movups (%r11),%xmm0
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%r11
+ xorps %xmm8,%xmm3
+.Loop_enc1_6:
+.byte 102,15,56,220,217
+ decl %eax
+ movups (%r11),%xmm1
+ leaq 16(%r11),%r11
+ jnz .Loop_enc1_6
+.byte 102,15,56,221,217
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,@function
+.align 16
+aesni_ctr32_encrypt_blocks:
+ cmpq $1,%rdx
+ je .Lctr32_one_shortcut
+
+ movdqu (%r8),%xmm14
+ movdqa .Lbswap_mask(%rip),%xmm15
+ xorl %eax,%eax
+.byte 102,69,15,58,22,242,3
+.byte 102,68,15,58,34,240,3
+
+ movl 240(%rcx),%eax
+ bswapl %r10d
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+.byte 102,69,15,58,34,226,0
+ leaq 3(%r10),%r11
+.byte 102,69,15,58,34,235,0
+ incl %r10d
+.byte 102,69,15,58,34,226,1
+ incq %r11
+.byte 102,69,15,58,34,235,1
+ incl %r10d
+.byte 102,69,15,58,34,226,2
+ incq %r11
+.byte 102,69,15,58,34,235,2
+ movdqa %xmm12,-40(%rsp)
+.byte 102,69,15,56,0,231
+ movdqa %xmm13,-24(%rsp)
+.byte 102,69,15,56,0,239
+
+ pshufd $192,%xmm12,%xmm2
+ pshufd $128,%xmm12,%xmm3
+ pshufd $64,%xmm12,%xmm4
+ cmpq $6,%rdx
+ jb .Lctr32_tail
+ shrl $1,%eax
+ movq %rcx,%r11
+ movl %eax,%r10d
+ subq $6,%rdx
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ pshufd $192,%xmm13,%xmm5
+ por %xmm14,%xmm2
+ movups (%r11),%xmm0
+ pshufd $128,%xmm13,%xmm6
+ por %xmm14,%xmm3
+ movups 16(%r11),%xmm1
+ pshufd $64,%xmm13,%xmm7
+ por %xmm14,%xmm4
+ por %xmm14,%xmm5
+ xorps %xmm0,%xmm2
+ por %xmm14,%xmm6
+ por %xmm14,%xmm7
+
+
+
+
+ pxor %xmm0,%xmm3
+.byte 102,15,56,220,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,217
+ movdqa .Lincrement32(%rip),%xmm13
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,225
+ movdqa -40(%rsp),%xmm12
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ jmp .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lctr32_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lctr32_enc_loop6
+
+.byte 102,15,56,220,209
+ paddd %xmm13,%xmm12
+.byte 102,15,56,220,217
+ paddd -24(%rsp),%xmm13
+.byte 102,15,56,220,225
+ movdqa %xmm12,-40(%rsp)
+.byte 102,15,56,220,233
+ movdqa %xmm13,-24(%rsp)
+.byte 102,15,56,220,241
+.byte 102,69,15,56,0,231
+.byte 102,15,56,220,249
+.byte 102,69,15,56,0,239
+
+.byte 102,15,56,221,208
+ movups (%rdi),%xmm8
+.byte 102,15,56,221,216
+ movups 16(%rdi),%xmm9
+.byte 102,15,56,221,224
+ movups 32(%rdi),%xmm10
+.byte 102,15,56,221,232
+ movups 48(%rdi),%xmm11
+.byte 102,15,56,221,240
+ movups 64(%rdi),%xmm1
+.byte 102,15,56,221,248
+ movups 80(%rdi),%xmm0
+ leaq 96(%rdi),%rdi
+
+ xorps %xmm2,%xmm8
+ pshufd $192,%xmm12,%xmm2
+ xorps %xmm3,%xmm9
+ pshufd $128,%xmm12,%xmm3
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ pshufd $64,%xmm12,%xmm4
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ xorps %xmm6,%xmm1
+ movups %xmm11,48(%rsi)
+ xorps %xmm7,%xmm0
+ movups %xmm1,64(%rsi)
+ movups %xmm0,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movl %r10d,%eax
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+ movq %r11,%rcx
+ leal 1(%rax,%rax,1),%eax
+
+.Lctr32_tail:
+ por %xmm14,%xmm2
+ movups (%rdi),%xmm8
+ cmpq $2,%rdx
+ jb .Lctr32_one
+
+ por %xmm14,%xmm3
+ movups 16(%rdi),%xmm9
+ je .Lctr32_two
+
+ pshufd $192,%xmm13,%xmm5
+ por %xmm14,%xmm4
+ movups 32(%rdi),%xmm10
+ cmpq $4,%rdx
+ jb .Lctr32_three
+
+ pshufd $128,%xmm13,%xmm6
+ por %xmm14,%xmm5
+ movups 48(%rdi),%xmm11
+ je .Lctr32_four
+
+ por %xmm14,%xmm6
+ xorps %xmm7,%xmm7
+
+ call _aesni_encrypt6
+
+ movups 64(%rdi),%xmm1
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ xorps %xmm6,%xmm1
+ movups %xmm11,48(%rsi)
+ movups %xmm1,64(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm8
+ movl 240(%rcx),%eax
+.Lctr32_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_7:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+.byte 102,15,56,221,209
+ xorps %xmm2,%xmm8
+ movups %xmm8,(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ xorps %xmm4,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ movups %xmm9,16(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ movups %xmm10,32(%rsi)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_four:
+ call _aesni_encrypt4
+ xorps %xmm2,%xmm8
+ xorps %xmm3,%xmm9
+ movups %xmm8,(%rsi)
+ xorps %xmm4,%xmm10
+ movups %xmm9,16(%rsi)
+ xorps %xmm5,%xmm11
+ movups %xmm10,32(%rsi)
+ movups %xmm11,48(%rsi)
+
+.Lctr32_done:
+ .byte 0xf3,0xc3
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,@function
+.align 16
+aesni_xts_encrypt:
+ leaq -104(%rsp),%rsp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm15
+.Loop_enc1_8:
+.byte 102,68,15,56,220,249
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_8
+.byte 102,68,15,56,221,249
+ movq %rcx,%r11
+ movl %r10d,%eax
+ movq %rdx,%r9
+ andq $-16,%rdx
+
+ movdqa .Lxts_magic(%rip),%xmm8
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ subq $96,%rdx
+ jc .Lxts_enc_short
+
+ shrl $1,%eax
+ subl $1,%eax
+ movl %eax,%r10d
+ jmp .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm12,%xmm4
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+ pxor %xmm13,%xmm5
+ movups (%r11),%xmm0
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+
+
+
+ movups 16(%r11),%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,220,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm6
+ movdqa %xmm13,48(%rsp)
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,220,241
+ movdqa %xmm15,80(%rsp)
+.byte 102,15,56,220,249
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ jmp .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ decl %eax
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.Lxts_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups (%rcx),%xmm0
+ jnz .Lxts_enc_loop6
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%rcx),%xmm1
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 32(%rcx),%xmm0
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,220,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,220,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,221,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,221,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+ xorps 16(%rsp),%xmm3
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+
+ xorps 32(%rsp),%xmm4
+ movups %xmm2,0(%rsi)
+ xorps 48(%rsp),%xmm5
+ movups %xmm3,16(%rsi)
+ xorps 64(%rsp),%xmm6
+ movups %xmm4,32(%rsi)
+ xorps 80(%rsp),%xmm7
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $96,%rdx
+ jnc .Lxts_enc_grandloop
+
+ leal 3(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+.Lxts_enc_short:
+ addq $96,%rdx
+ jz .Lxts_enc_done
+
+ cmpq $32,%rdx
+ jb .Lxts_enc_one
+ je .Lxts_enc_two
+
+ cmpq $64,%rdx
+ jb .Lxts_enc_three
+ je .Lxts_enc_four
+
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_encrypt6
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_9:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_9
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ leaq 16(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_encrypt4
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ andq $15,%r9
+ jz .Lxts_enc_ret
+ movq %r9,%rdx
+
+.Lxts_enc_steal:
+ movzbl (%rdi),%eax
+ movzbl -16(%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,-16(%rsi)
+ movb %cl,0(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz .Lxts_enc_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups -16(%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_10:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_10
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,-16(%rsi)
+
+.Lxts_enc_ret:
+ leaq 104(%rsp),%rsp
+.Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,@function
+.align 16
+aesni_xts_decrypt:
+ leaq -104(%rsp),%rsp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm15
+.Loop_enc1_11:
+.byte 102,68,15,56,220,249
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_11
+.byte 102,68,15,56,221,249
+ xorl %eax,%eax
+ testq $15,%rdx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%rdx
+
+ movq %rcx,%r11
+ movl %r10d,%eax
+ movq %rdx,%r9
+ andq $-16,%rdx
+
+ movdqa .Lxts_magic(%rip),%xmm8
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+ subq $96,%rdx
+ jc .Lxts_dec_short
+
+ shrl $1,%eax
+ subl $1,%eax
+ movl %eax,%r10d
+ jmp .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm12,%xmm4
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+ pxor %xmm13,%xmm5
+ movups (%r11),%xmm0
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+
+
+
+ movups 16(%r11),%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,222,209
+ leaq 32(%r11),%rcx
+ pxor %xmm0,%xmm4
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ movdqa %xmm13,48(%rsp)
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ movups (%rcx),%xmm0
+ decl %eax
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,222,241
+ movdqa %xmm15,80(%rsp)
+.byte 102,15,56,222,249
+ pxor %xmm14,%xmm14
+ pcmpgtd %xmm15,%xmm14
+ jmp .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ decl %eax
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.Lxts_dec_loop6_enter:
+ movups 16(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ leaq 32(%rcx),%rcx
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups (%rcx),%xmm0
+ jnz .Lxts_dec_loop6
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%rcx),%xmm1
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 32(%rcx),%xmm0
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm11
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,209
+ pand %xmm8,%xmm9
+.byte 102,15,56,222,217
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,222,225
+ pxor %xmm9,%xmm15
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm12
+ paddq %xmm15,%xmm15
+.byte 102,15,56,223,208
+ pand %xmm8,%xmm9
+.byte 102,15,56,223,216
+ pcmpgtd %xmm15,%xmm14
+.byte 102,15,56,223,224
+ pxor %xmm9,%xmm15
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+
+ pshufd $19,%xmm14,%xmm9
+ pxor %xmm14,%xmm14
+ movdqa %xmm15,%xmm13
+ paddq %xmm15,%xmm15
+ xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+ xorps 16(%rsp),%xmm3
+ pcmpgtd %xmm15,%xmm14
+ pxor %xmm9,%xmm15
+
+ xorps 32(%rsp),%xmm4
+ movups %xmm2,0(%rsi)
+ xorps 48(%rsp),%xmm5
+ movups %xmm3,16(%rsi)
+ xorps 64(%rsp),%xmm6
+ movups %xmm4,32(%rsi)
+ xorps 80(%rsp),%xmm7
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $96,%rdx
+ jnc .Lxts_dec_grandloop
+
+ leal 3(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+.Lxts_dec_short:
+ addq $96,%rdx
+ jz .Lxts_dec_done
+
+ cmpq $32,%rdx
+ jb .Lxts_dec_one
+ je .Lxts_dec_two
+
+ cmpq $64,%rdx
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_decrypt6
+
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm14
+ movdqu %xmm5,48(%rsi)
+ pcmpgtd %xmm15,%xmm14
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ pshufd $19,%xmm14,%xmm11
+ andq $15,%r9
+ jz .Lxts_dec_ret
+
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm11
+ pxor %xmm15,%xmm11
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_12:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_12
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ movdqa %xmm12,%xmm11
+ leaq 16(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm13,%xmm11
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ pshufd $19,%xmm14,%xmm9
+ movdqa %xmm15,%xmm14
+ paddq %xmm15,%xmm15
+ movups (%rdi),%xmm2
+ pand %xmm8,%xmm9
+ movups 16(%rdi),%xmm3
+ pxor %xmm9,%xmm15
+
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_decrypt4
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ andq $15,%r9
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ movq %r9,%rdx
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rdi),%xmm2
+ xorps %xmm11,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_13:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_13
+.byte 102,15,56,223,209
+ xorps %xmm11,%xmm2
+ movups %xmm2,(%rsi)
+
+.Lxts_dec_steal:
+ movzbl 16(%rdi),%eax
+ movzbl (%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,(%rsi)
+ movb %cl,16(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz .Lxts_dec_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_14:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_14
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+
+.Lxts_dec_ret:
+ leaq 104(%rsp),%rsp
+.Lxts_dec_epilogue:
+	.byte	0xf3,0xc3	# repz ret
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
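+/* void aesni_cbc_encrypt(const void *in, void *out, size_t length,
+ *                        const AES_KEY *key, unsigned char *ivp, int enc);
+ * Hedged reading of the code below: %rdx=length, %rcx=key, %r8=ivp,
+ * %r9d=enc; unlike the ccm64/ctr32 helpers, the IV at *ivp IS written
+ * back on return. */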
+.globl aesni_cbc_encrypt
+.type aesni_cbc_encrypt,@function
+.align 16
+aesni_cbc_encrypt:
+ testq %rdx,%rdx
+ jz .Lcbc_ret
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ testl %r9d,%r9d
+ jz .Lcbc_decrypt
+
+ movups (%r8),%xmm2
+ movl %r10d,%eax
+ cmpq $16,%rdx
+ jb .Lcbc_enc_tail
+ subq $16,%rdx
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups (%rdi),%xmm3
+ leaq 16(%rdi),%rdi
+
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm3
+ leaq 32(%rcx),%rcx
+ xorps %xmm3,%xmm2
+.Loop_enc1_15:
+.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_15
+.byte	102,15,56,221,209	# aesenclast %xmm1,%xmm2
+ movl %r10d,%eax
+ movq %r11,%rcx
+ movups %xmm2,0(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $16,%rdx
+ jnc .Lcbc_enc_loop
+ addq $16,%rdx
+ jnz .Lcbc_enc_tail
+ movups %xmm2,(%r8)
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ movq %rdx,%rcx
+ xchgq %rdi,%rsi
+.long	0x9066A4F3	# rep movsb
+ movl $16,%ecx
+ subq %rdx,%rcx
+ xorl %eax,%eax
+.long	0x9066AAF3	# rep stosb
+ leaq -16(%rdi),%rdi
+ movl %r10d,%eax
+ movq %rdi,%rsi
+ movq %r11,%rcx
+ xorq %rdx,%rdx
+ jmp .Lcbc_enc_loop
+
+.align 16
+.Lcbc_decrypt:
+ movups (%r8),%xmm9
+ movl %r10d,%eax
+ cmpq $112,%rdx
+ jbe .Lcbc_dec_tail
+ shrl $1,%r10d
+ subq $112,%rdx
+ movl %r10d,%eax
+ movaps %xmm9,-24(%rsp)
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movaps %xmm0,-24(%rsp)
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+ movups (%rcx),%xmm0
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 16(%rcx),%xmm1
+
+ leaq 32(%rcx),%rcx
+ movdqu 32(%rdi),%xmm4
+ xorps %xmm0,%xmm2
+ movdqu 48(%rdi),%xmm5
+ xorps %xmm0,%xmm3
+ movdqu 64(%rdi),%xmm6
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm4
+ movdqu 80(%rdi),%xmm7
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ movdqu 96(%rdi),%xmm8
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm6
+ movdqu 112(%rdi),%xmm9
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm7
+ decl %eax
+.byte 102,15,56,222,241
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm9
+ movups (%rcx),%xmm0
+.byte	102,68,15,56,222,193	# aesdec %xmm1,%xmm8
+.byte	102,68,15,56,222,201	# aesdec %xmm1,%xmm9
+ movups 16(%rcx),%xmm1
+
+ call .Ldec_loop8_enter
+
+ movups (%rdi),%xmm1
+ movups 16(%rdi),%xmm0
+ xorps -24(%rsp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%rdi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%rdi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%rdi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%rdi),%xmm0
+ xorps %xmm1,%xmm7
+ movups 96(%rdi),%xmm1
+ xorps %xmm0,%xmm8
+ movups 112(%rdi),%xmm0
+ xorps %xmm1,%xmm9
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movl %r10d,%eax
+ movups %xmm6,64(%rsi)
+ movq %r11,%rcx
+ movups %xmm7,80(%rsi)
+ leaq 128(%rdi),%rdi
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
+ subq $128,%rdx
+ ja .Lcbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+ movaps %xmm0,%xmm9
+ addq $112,%rdx
+ jle .Lcbc_dec_tail_collected
+ movups %xmm2,(%rsi)
+ leal 1(%r10,%r10,1),%eax
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_tail:
+ movups (%rdi),%xmm2
+ movaps %xmm2,%xmm8
+ cmpq $16,%rdx
+ jbe .Lcbc_dec_one
+
+ movups 16(%rdi),%xmm3
+ movaps %xmm3,%xmm7
+ cmpq $32,%rdx
+ jbe .Lcbc_dec_two
+
+ movups 32(%rdi),%xmm4
+ movaps %xmm4,%xmm6
+ cmpq $48,%rdx
+ jbe .Lcbc_dec_three
+
+ movups 48(%rdi),%xmm5
+ cmpq $64,%rdx
+ jbe .Lcbc_dec_four
+
+ movups 64(%rdi),%xmm6
+ cmpq $80,%rdx
+ jbe .Lcbc_dec_five
+
+ movups 80(%rdi),%xmm7
+ cmpq $96,%rdx
+ jbe .Lcbc_dec_six
+
+ movups 96(%rdi),%xmm8
+ movaps %xmm9,-24(%rsp)
+ call _aesni_decrypt8
+ movups (%rdi),%xmm1
+ movups 16(%rdi),%xmm0
+ xorps -24(%rsp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%rdi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%rdi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%rdi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%rdi),%xmm0
+ xorps %xmm1,%xmm7
+ movups 96(%rdi),%xmm9
+ xorps %xmm0,%xmm8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movaps %xmm8,%xmm2
+ subq $112,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_16:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+.byte 102,15,56,223,209
+ xorps %xmm9,%xmm2
+ movaps %xmm8,%xmm9
+ subq $16,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ movaps %xmm7,%xmm9
+ movaps %xmm3,%xmm2
+ leaq 16(%rsi),%rsi
+ subq $32,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%rsi)
+ movaps %xmm6,%xmm9
+ movaps %xmm4,%xmm2
+ leaq 32(%rsi),%rsi
+ subq $48,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ call _aesni_decrypt4
+ xorps %xmm9,%xmm2
+ movups 48(%rdi),%xmm9
+ xorps %xmm8,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%rsi)
+ xorps %xmm6,%xmm5
+ movups %xmm4,32(%rsi)
+ movaps %xmm5,%xmm2
+ leaq 48(%rsi),%rsi
+ subq $64,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm0
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ xorps %xmm1,%xmm4
+ movups 48(%rdi),%xmm1
+ xorps %xmm0,%xmm5
+ movups 64(%rdi),%xmm9
+ xorps %xmm1,%xmm6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movaps %xmm6,%xmm2
+ subq $80,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+ call _aesni_decrypt6
+ movups 16(%rdi),%xmm1
+ movups 32(%rdi),%xmm0
+ xorps %xmm9,%xmm2
+ xorps %xmm8,%xmm3
+ xorps %xmm1,%xmm4
+ movups 48(%rdi),%xmm1
+ xorps %xmm0,%xmm5
+ movups 64(%rdi),%xmm0
+ xorps %xmm1,%xmm6
+ movups 80(%rdi),%xmm9
+ xorps %xmm0,%xmm7
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movaps %xmm7,%xmm2
+ subq $96,%rdx
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+ andq $15,%rdx
+ movups %xmm9,(%r8)
+ jnz .Lcbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps %xmm2,-24(%rsp)
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+ leaq -24(%rsp),%rsi
+.long	0x9066A4F3	# rep movsb
+
+.Lcbc_dec_ret:
+.Lcbc_ret:
+ .byte 0xf3,0xc3
+.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
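+/* int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ *                           AES_KEY *key);  hedged reading: expands the
+ * encrypt schedule via __aesni_set_encrypt_key, reverses its order and
+ * applies aesimc (.byte 102,15,56,219,xx) to the inner round keys. */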
+.globl aesni_set_decrypt_key
+.type aesni_set_decrypt_key,@function
+.align 16
+aesni_set_decrypt_key:
+.byte	0x48,0x83,0xEC,0x08	# subq $8,%rsp
+ call __aesni_set_encrypt_key
+ shll $4,%esi
+ testl %eax,%eax
+ jnz .Ldec_key_ret
+ leaq 16(%rdx,%rsi,1),%rdi
+
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+ movups %xmm0,(%rdi)
+ movups %xmm1,(%rdx)
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+
+.Ldec_key_inverse:
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+.byte	102,15,56,219,192	# aesimc %xmm0,%xmm0
+.byte	102,15,56,219,201	# aesimc %xmm1,%xmm1
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+ movups %xmm0,16(%rdi)
+ movups %xmm1,-16(%rdx)
+ cmpq %rdx,%rdi
+ ja .Ldec_key_inverse
+
+ movups (%rdx),%xmm0
+.byte	102,15,56,219,192	# aesimc %xmm0,%xmm0
+ movups %xmm0,(%rdi)
+.Ldec_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+.LSEH_end_set_decrypt_key:
+.size aesni_set_decrypt_key,.-aesni_set_decrypt_key
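+/* int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ *                           AES_KEY *key);  hedged reading: returns 0
+ * on success, -1 on NULL arguments and -2 on unsupported key bits
+ * (128/192/256 are accepted); key->rounds is stored at offset 240. */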
+.globl aesni_set_encrypt_key
+.type aesni_set_encrypt_key,@function
+.align 16
+aesni_set_encrypt_key:
+__aesni_set_encrypt_key:
+.byte	0x48,0x83,0xEC,0x08	# subq $8,%rsp
+ movq $-1,%rax
+ testq %rdi,%rdi
+ jz .Lenc_key_ret
+ testq %rdx,%rdx
+ jz .Lenc_key_ret
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je .L14rounds
+ cmpl $192,%esi
+ je .L12rounds
+ cmpl $128,%esi
+ jne .Lbad_keybits
+
+.L10rounds:
+ movl $9,%esi
+ movups %xmm0,(%rdx)
+.byte	102,15,58,223,200,1	# aeskeygenassist $1,%xmm0,%xmm1
+ call .Lkey_expansion_128_cold
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,64
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,128
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,27
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,54
+ call .Lkey_expansion_128
+ movups %xmm0,(%rax)
+ movl %esi,80(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ movups %xmm0,(%rdx)
+.byte	102,15,58,223,202,1	# aeskeygenassist $1,%xmm2,%xmm1
+ call .Lkey_expansion_192a_cold
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,128
+ call .Lkey_expansion_192b
+ movups %xmm0,(%rax)
+ movl %esi,48(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ movups %xmm0,(%rdx)
+ movups %xmm2,16(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_256a_cold
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_256a
+ movups %xmm0,(%rax)
+ movl %esi,16(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ movq $-2,%rax
+.Lenc_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192a:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%rax)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%rax)
+ leaq 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ movups %xmm2,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_256b:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ .byte 0xf3,0xc3
+.size aesni_set_encrypt_key,.-aesni_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long 6,6,6,0
+.Lincrement64:
+.long 1,0,0,0
+.Lxts_magic:
+.long 0x87,0,1,0
+
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	# "AES for Intel AES-NI, CRYPTOGAMS by <appro@openssl.org>"
+.align 64
diff --git a/main/openssl/crypto/aes/asm/aesni-x86_64.pl b/main/openssl/crypto/aes/asm/aesni-x86_64.pl
new file mode 100644
index 00000000..0dbb194b
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/aesni-x86_64.pl
@@ -0,0 +1,3069 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the Intel AES-NI extension. In
+# the OpenSSL context it's used with the Intel engine, but it can also
+# be used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl
+# [see below for details].
+#
+# Performance.
+#
+# Given the aes(enc|dec) instructions' latency, asymptotic performance
+# for non-parallelizable modes such as CBC encrypt is 3.75 cycles per
+# byte processed with a 128-bit key; given their throughput, asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# an asymptotic limit, it's not something you commonly achieve in
+# reality, but how close does one get? Below are results collected for
+# different modes and block sizes. Pairs of numbers are for en-/
+# decryption.
+#
+# 16-byte 64-byte 256-byte 1-KB 8-KB
+# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
+# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
+# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
+# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that the otherwise-used 'openssl speed -evp aes-128-??? -engine aesni
+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. The
+# results were collected with a specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All of the above results are consistently
+# better. This module also provides better performance for block sizes
+# smaller than 128 bytes, at points *not* represented in the above
+# table.
+#
+# Looking at the results for the 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because the
+# implementation uses the "generic" CRYPTO_[c|o]fb128_encrypt
+# interfaces relying on single-block aesni_encrypt, which is not the
+# most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to the AES unit the way it's done in CBC mode. There is
+# nothing one can do and the result appears optimal. CCM result is
+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
+# saving the output. CCM CTR "stays invisible," because it's neatly
+# interleaved with CBC-MAC. This provides a ~30% improvement over a
+# "straightforward" CCM implementation with CTR and CBC-MAC performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95% and "64-byte" is ~90% of the "8-KB"
+# one. The CTR curve doesn't follow this pattern and is the slowest-
+# changing one, with the "256-byte" result being 87% of the "8-KB"
+# result. This is because the overhead in CTR mode is the most
+# computationally intensive. Small-block CCM decrypt is slower than
+# encrypt, because the first CTR and last CBC-MAC iterations can't be
+# interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with the number of
+# rounds for larger block sizes, i.e. the 192-bit result being 10/12
+# times lower and the 256-bit one 10/14. Well, in the CBC encrypt case
+# the differences are a tad smaller, because the above-mentioned
+# penalty biases all results by the same constant value. In a similar
+# way function call overhead affects small-block performance, as well
+# as OFB and CFB results. The differences are not large; the most
+# common coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0
+# and 10/14.0), but one can observe even 10/11.2 and 10/12.4 (CTR,
+# OFB, CFB)...
+
+# January 2011
+#
+# While the Westmere processor features 6-cycle latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Hence this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with a
+# 128-bit key on Westmere, and 5.07 on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to the asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times the number of rounds, 10 for a 128-bit
+# key, divided by 16, the number of bytes in a block; in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
+#
+# For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In that case the asymptotic limit for such modes
+# can be obtained by dividing the above-mentioned numbers by the AES
+# instructions' interleave factor. Westmere can execute at most 3
+# instructions at a time, meaning that the optimal interleave factor
+# is 3, and that's where the "magic" number of 1.25 comes from.
+# "Optimal interleave factor" means that an increase of the interleave
+# factor does not improve performance. The formula has proven to
+# reflect reality pretty well on Westmere... Sandy Bridge on the other
+# hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor 3x 6x 8x
+# theoretical asymptotic limit 1.67 0.83 0.625
+# measured performance for 8KB block 1.05 0.86 0.84
+#
+# "as if" interleave factor 4.7x 5.8x 6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt 1.16 0.93 0.93
+# CTR 1.14 0.91 n/a
+#
+# Well, given the 3x column it's probably inappropriate to call the
+# limit asymptotic if it can be surpassed, isn't it? What happens
+# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
+# execution magic is responsible for this. The processor overlaps not
+# only the additional instructions with the AES ones, but even AES
+# instructions processing adjacent triplets of independent blocks. In
+# the 6x case the additional instructions still claim a
+# disproportionately small number of additional cycles, but in the 8x
+# case the number of instructions must be a tad too high for the
+# out-of-order logic to cope with, and the AES unit remains
+# underutilized... As you can see, an 8x interleave is hardly
+# justifiable, so there is no need to feel bad that 32-bit
+# aesni-x86.pl utilizes a 6x interleave because of its limited
+# register bank capacity.
+#
+# Higher interleave factors do have a negative impact on Westmere
+# performance. While for ECB mode it's a negligible ~1.5%, other
+# parallelizable modes perform ~5% worse, which is outweighed by a
+# ~25% improvement on Sandy Bridge. To balance the regression on
+# Westmere, CTR mode was implemented with a 6x aesenc interleave
+# factor. The arithmetic behind the quoted limits is sketched below.
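+#
+# A minimal C sketch (hypothetical, merely reproducing the arithmetic
+# of this commentary: latency * rounds / (16 bytes * interleave)):
+#
+#	#include <stdio.h>
+#	int main(void)
+#	{
+#	    double westmere = 6, sandy = 8, rounds = 10; /* 128-bit key */
+#	    printf("%.2f\n", westmere*rounds/16);     /* 3.75 cpb, serial CBC */
+#	    printf("%.2f\n", westmere*rounds/(16*3)); /* 1.25 cpb, 3x ECB/CTR */
+#	    printf("%.3f\n", sandy*rounds/(16*8));    /* 0.625 cpb, 8x limit  */
+#	    return 0;
+#	}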
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
+# one byte out of 8KB with a 128-bit key, Sandy Bridge 0.97. Just as
+# in CTR mode, the AES instruction interleave factor was chosen to be
+# 6x.
+
+$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
+			# generates a drop-in replacement for
+			# crypto/aes/asm/aes-x86_64.pl:-)
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";	# both arms are movups at present
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+$code=".text\n";
+
+$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
+# this is the natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+$inp="%rdi";
+$out="%rsi";
+$len="%rdx";
+$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+$ivp="%r8"; # cbc, ctr, ...
+
+$rnds_="%r10d"; # backup copy for $rounds
+$key_="%r11"; # backup copy for $key
+
+# %xmm register layout
+$rndkey0="%xmm0"; $rndkey1="%xmm1";
+$inout0="%xmm2"; $inout1="%xmm3";
+$inout2="%xmm4"; $inout3="%xmm5";
+$inout4="%xmm6"; $inout5="%xmm7";
+$inout6="%xmm8"; $inout7="%xmm9";
+
+$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
+$in0="%xmm8"; $iv="%xmm9";
+
+# Inline version of internal aesni_[en|de]crypt1.
+#
+# Why a folded loop? Because aes[enc|dec] is slow enough to accommodate
+# the cycles which take care of the loop variables...
+{ my $sn;
+sub aesni_generate1 {
+my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+++$sn;
+$code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+ xorps $rndkey0,$ivec
+ lea 32($key),$key
+ xorps $ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
+ lea 32($key),$key
+ xorps $rndkey0,$inout
+___
+$code.=<<___;
+.Loop_${p}1_$sn:
+ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+ aes${p}last $rndkey1,$inout
+___
+}}
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+#
+{ my ($inp,$out,$key) = @_4args;
+
+$code.=<<___;
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_encrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # output
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@abi-omnipotent
+.align 16
+${PREFIX}_decrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out) # output
+ ret
+.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+___
+}
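+
+# A hedged usage sketch for the two entry points above; the prototype
+# is taken from the commentary, and the assumption that
+# aesni_set_encrypt_key mirrors AES_set_encrypt_key's signature is
+# exactly that, an assumption (callers normally reach these through
+# EVP/the AES-NI engine rather than directly):
+#
+#	#include <openssl/aes.h>
+#
+#	int  aesni_set_encrypt_key(const unsigned char *userKey,
+#	                           int bits, AES_KEY *key);
+#	void aesni_encrypt(const void *in, void *out, const AES_KEY *key);
+#
+#	void one_block(const unsigned char k[16],
+#	               const unsigned char in[16], unsigned char out[16])
+#	{
+#	    AES_KEY ks;
+#	    if (aesni_set_encrypt_key(k, 128, &ks) == 0)
+#	        aesni_encrypt(in, out, &ks);	/* one 16-byte block */
+#	}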
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why were 3x subroutines originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus a 3x interleave was the one providing optimal
+# utilization, i.e. when the subroutine's throughput is virtually the
+# same as that of the non-interleaved subroutine [for numbers of input
+# blocks up to 3]. This is why it makes no sense to implement a 2x
+# subroutine. aes[enc|dec] latency in the next processor generation is
+# 8, but the instructions can be scheduled every cycle. The optimal
+# interleave for the new processor is therefore 8x...
+sub aesni_generate3 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-2] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt3,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt3:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ $movkey ($key),$rndkey0
+
+.L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ ret
+.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# A 4x interleave is implemented to improve small-block performance,
+# most notably [and naturally] 4-block performance, by ~30%. One can
+# argue that 5x should have been implemented as well, but the
+# improvement would be <20%, so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt4,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt4:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ xorps $rndkey0,$inout3
+ $movkey ($key),$rndkey0
+
+.L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ ret
+.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+___
+}
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt6,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt6:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aes${dir} $rndkey1,$inout4
+ $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout5
+ jmp .L${dir}_loop6_enter
+.align 16
+.L${dir}_loop6:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+.L${dir}_loop6_enter: # happens to be 16-byte aligned
+ $movkey 16($key),$rndkey1
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop6
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ ret
+.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt8,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt8:
+ $movkey ($key),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aes${dir} $rndkey1,$inout4
+ pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout5
+ pxor $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+ jmp .L${dir}_loop8_enter
+.align 16
+.L${dir}_loop8:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+.L${dir}_loop8_enter: # happens to be 16-byte aligned
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ aes${dir} $rndkey0,$inout6
+ aes${dir} $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop8
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ aes${dir}last $rndkey0,$inout6
+ aes${dir}last $rndkey0,$inout7
+ ret
+.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
+
+if ($PREFIX eq "aesni") {
+########################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+# size_t length, const AES_KEY *key,
+# int enc);
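+#
+# A hedged call sketch (the length is truncated to a multiple of 16 by
+# the "and $-16" below, and enc selects the direction; ks is assumed
+# to be an AES_KEY expanded for that direction):
+#
+#	unsigned char in[4096], out[4096];	/* hypothetical buffers */
+#	aesni_ecb_encrypt(in, out, sizeof(in), &ks, 1);	/* 1 = encrypt */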
+$code.=<<___;
+.globl aesni_ecb_encrypt
+.type aesni_ecb_encrypt,\@function,5
+.align 16
+aesni_ecb_encrypt:
+ and \$-16,$len
+ jz .Lecb_ret
+
+ mov 240($key),$rounds # key->rounds
+ $movkey ($key),$rndkey0
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ test %r8d,%r8d # 5th argument
+ jz .Lecb_decrypt
+#--------------------------- ECB ENCRYPT ------------------------------#
+ cmp \$0x80,$len
+ jb .Lecb_enc_tail
+
+ movdqu ($inp),$inout0
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ sub \$0x80,$len
+ jnc .Lecb_enc_loop8
+
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ add \$0x80,$len
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_enc_one
+ movups 0x10($inp),$inout1
+ je .Lecb_enc_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_enc_three
+ movups 0x30($inp),$inout3
+ je .Lecb_enc_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_enc_five
+ movups 0x50($inp),$inout5
+ je .Lecb_enc_six
+ movdqu 0x60($inp),$inout6
+ call _aesni_encrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps $inout5,$inout5
+ call _aesni_encrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ jmp .Lecb_ret
+#--------------------------- ECB DECRYPT ------------------------------#
+.align 16
+.Lecb_decrypt:
+ cmp \$0x80,$len
+ jb .Lecb_dec_tail
+
+ movdqu ($inp),$inout0
+ movdqu 0x10($inp),$inout1
+ movdqu 0x20($inp),$inout2
+ movdqu 0x30($inp),$inout3
+ movdqu 0x40($inp),$inout4
+ movdqu 0x50($inp),$inout5
+ movdqu 0x60($inp),$inout6
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movdqu ($inp),$inout0
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout1,0x10($out)
+ movdqu 0x10($inp),$inout1
+ movups $inout2,0x20($out)
+ movdqu 0x20($inp),$inout2
+ movups $inout3,0x30($out)
+ movdqu 0x30($inp),$inout3
+ movups $inout4,0x40($out)
+ movdqu 0x40($inp),$inout4
+ movups $inout5,0x50($out)
+ movdqu 0x50($inp),$inout5
+ movups $inout6,0x60($out)
+ movdqu 0x60($inp),$inout6
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ movdqu 0x70($inp),$inout7
+ lea 0x80($inp),$inp
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ $movkey ($key_),$rndkey0
+ sub \$0x80,$len
+ jnc .Lecb_dec_loop8
+
+ movups $inout0,($out)
+ mov $key_,$key # restore $key
+ movups $inout1,0x10($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+ add \$0x80,$len
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups ($inp),$inout0
+ cmp \$0x20,$len
+ jb .Lecb_dec_one
+ movups 0x10($inp),$inout1
+ je .Lecb_dec_two
+ movups 0x20($inp),$inout2
+ cmp \$0x40,$len
+ jb .Lecb_dec_three
+ movups 0x30($inp),$inout3
+ je .Lecb_dec_four
+ movups 0x40($inp),$inout4
+ cmp \$0x60,$len
+ jb .Lecb_dec_five
+ movups 0x50($inp),$inout5
+ je .Lecb_dec_six
+ movups 0x60($inp),$inout6
+ $movkey ($key),$rndkey0
+ call _aesni_decrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ movups $inout0,($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+
+.Lecb_ret:
+ ret
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on a 64-bit counter and
+# does not update *ivec! Nor does it finalize the CMAC value
+# (see engine/eng_aesni.c for details)
+#
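+# A hedged call sketch; per the caveats above, the caller maintains
+# the counter block and finalizes the CMAC itself (ks, in, out and
+# nblocks are hypothetical):
+#
+#	char ivec[16], cmac[16];	/* counter block and running MAC */
+#	aesni_ccm64_encrypt_blocks(in, out, nblocks, &ks, ivec, cmac);
+#	/* on return ivec is NOT advanced and cmac is NOT finalized */
+#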
+{
+my $cmac="%r9"; # 6th argument
+
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lccm64_enc_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shr \$1,$rounds
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ mov $rounds,$rnds_
+ pshufb $bswap_mask,$iv
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov $rnds_,$rounds
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ lea 32($key_),$key
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey ($key),$rndkey0
+
+.Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ lea 32($key),$key
+ aesenc $rndkey0,$inout1
+ $movkey 0($key),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ dec $len
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out
+ pshufb $bswap_mask,$inout0
+ jnz .Lccm64_enc_outer
+
+ movups $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lccm64_dec_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ mov $rnds_,$rounds
+ movups $in0,($out) # save output
+ lea 16($out),$out
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len
+ jz .Lccm64_dec_break
+
+ $movkey ($key_),$rndkey0
+ shr \$1,$rounds
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ lea 32($key_),$key
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey ($key),$rndkey0
+
+.Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ lea 32($key),$key
+ aesenc $rndkey0,$inout1
+ $movkey 0($key),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 16($inp),$inp
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+ #xorps $in0,$inout1 # cmac^=out
+___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+ movups $inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on a 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
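+# A hedged call sketch; whole blocks only, and the caller advances the
+# big-endian 32-bit counter kept in the last four bytes of its own
+# ivec copy between calls (ks, in, out and nblocks are hypothetical):
+#
+#	char ivec[16];	/* IV with the counter in bytes 12..15 */
+#	aesni_ctr32_encrypt_blocks(in, out, nblocks, &ks, ivec);
+#	/* caller: add nblocks to the counter in ivec afterwards */
+#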
+{
+my $reserved = $win64?0:-0x28;
+my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
+my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
+my $bswap_mask="%xmm15";
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0xc8(%rsp),%rsp
+ movaps %xmm6,0x20(%rsp)
+ movaps %xmm7,0x30(%rsp)
+ movaps %xmm8,0x40(%rsp)
+ movaps %xmm9,0x50(%rsp)
+ movaps %xmm10,0x60(%rsp)
+ movaps %xmm11,0x70(%rsp)
+ movaps %xmm12,0x80(%rsp)
+ movaps %xmm13,0x90(%rsp)
+ movaps %xmm14,0xa0(%rsp)
+ movaps %xmm15,0xb0(%rsp)
+.Lctr32_body:
+___
+$code.=<<___;
+ cmp \$1,$len
+ je .Lctr32_one_shortcut
+
+ movdqu ($ivp),$ivec
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+ xor $rounds,$rounds
+ pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
+ pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
+
+ mov 240($key),$rounds # key->rounds
+ bswap $rnds_
+ pxor $iv0,$iv0 # vector of 3 32-bit counters
+ pxor $iv1,$iv1 # vector of 3 32-bit counters
+ pinsrd \$0,$rnds_,$iv0
+ lea 3($rnds_),$key_
+ pinsrd \$0,$key_,$iv1
+ inc $rnds_
+ pinsrd \$1,$rnds_,$iv0
+ inc $key_
+ pinsrd \$1,$key_,$iv1
+ inc $rnds_
+ pinsrd \$2,$rnds_,$iv0
+ inc $key_
+ pinsrd \$2,$key_,$iv1
+ movdqa $iv0,$reserved(%rsp)
+ pshufb $bswap_mask,$iv0
+ movdqa $iv1,`$reserved+0x10`(%rsp)
+ pshufb $bswap_mask,$iv1
+
+ pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
+ pshufd \$`2<<6`,$iv0,$inout1
+ pshufd \$`1<<6`,$iv0,$inout2
+ cmp \$6,$len
+ jb .Lctr32_tail
+ shr \$1,$rounds
+ mov $key,$key_ # backup $key
+ mov $rounds,$rnds_ # backup $rounds
+ sub \$6,$len
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ pshufd \$`3<<6`,$iv1,$inout3
+ por $ivec,$inout0 # merge counter-less ivec
+ $movkey ($key_),$rndkey0
+ pshufd \$`2<<6`,$iv1,$inout4
+ por $ivec,$inout1
+ $movkey 16($key_),$rndkey1
+ pshufd \$`1<<6`,$iv1,$inout5
+ por $ivec,$inout2
+ por $ivec,$inout3
+ xorps $rndkey0,$inout0
+ por $ivec,$inout4
+ por $ivec,$inout5
+
+	# inline _aesni_encrypt6 and interleave the last rounds
+	# with our own code...
+
+ pxor $rndkey0,$inout1
+ aesenc $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ aesenc $rndkey1,$inout1
+ movdqa .Lincrement32(%rip),$iv1
+ pxor $rndkey0,$inout3
+ aesenc $rndkey1,$inout2
+ movdqa $reserved(%rsp),$iv0
+ pxor $rndkey0,$inout4
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ jmp .Lctr32_enc_loop6_enter
+.align 16
+.Lctr32_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ dec $rounds
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+.Lctr32_enc_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ lea 32($key),$key
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lctr32_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ paddd $iv1,$iv0 # increment counter vector
+ aesenc $rndkey1,$inout1
+ paddd `$reserved+0x10`(%rsp),$iv1
+ aesenc $rndkey1,$inout2
+ movdqa $iv0,$reserved(%rsp) # save counter vector
+ aesenc $rndkey1,$inout3
+ movdqa $iv1,`$reserved+0x10`(%rsp)
+ aesenc $rndkey1,$inout4
+ pshufb $bswap_mask,$iv0 # byte swap
+ aesenc $rndkey1,$inout5
+ pshufb $bswap_mask,$iv1
+
+ aesenclast $rndkey0,$inout0
+ movups ($inp),$in0 # load input
+ aesenclast $rndkey0,$inout1
+ movups 0x10($inp),$in1
+ aesenclast $rndkey0,$inout2
+ movups 0x20($inp),$in2
+ aesenclast $rndkey0,$inout3
+ movups 0x30($inp),$in3
+ aesenclast $rndkey0,$inout4
+ movups 0x40($inp),$rndkey1
+ aesenclast $rndkey0,$inout5
+ movups 0x50($inp),$rndkey0
+ lea 0x60($inp),$inp
+
+ xorps $inout0,$in0 # xor
+ pshufd \$`3<<6`,$iv0,$inout0
+ xorps $inout1,$in1
+ pshufd \$`2<<6`,$iv0,$inout1
+ movups $in0,($out) # store output
+ xorps $inout2,$in2
+ pshufd \$`1<<6`,$iv0,$inout2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ xorps $inout4,$rndkey1
+ movups $in3,0x30($out)
+ xorps $inout5,$rndkey0
+ movups $rndkey1,0x40($out)
+ movups $rndkey0,0x50($out)
+ lea 0x60($out),$out
+ mov $rnds_,$rounds
+ sub \$6,$len
+ jnc .Lctr32_loop6
+
+ add \$6,$len
+ jz .Lctr32_done
+ mov $key_,$key # restore $key
+ lea 1($rounds,$rounds),$rounds # restore original value
+
+.Lctr32_tail:
+ por $ivec,$inout0
+ movups ($inp),$in0
+ cmp \$2,$len
+ jb .Lctr32_one
+
+ por $ivec,$inout1
+ movups 0x10($inp),$in1
+ je .Lctr32_two
+
+ pshufd \$`3<<6`,$iv1,$inout3
+ por $ivec,$inout2
+ movups 0x20($inp),$in2
+ cmp \$4,$len
+ jb .Lctr32_three
+
+ pshufd \$`2<<6`,$iv1,$inout4
+ por $ivec,$inout3
+ movups 0x30($inp),$in3
+ je .Lctr32_four
+
+ por $ivec,$inout4
+ xorps $inout5,$inout5
+
+ call _aesni_encrypt6
+
+ movups 0x40($inp),$rndkey1
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ xorps $inout4,$rndkey1
+ movups $in3,0x30($out)
+ movups $rndkey1,0x40($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one_shortcut:
+ movups ($ivp),$inout0
+ movups ($inp),$in0
+ mov 240($key),$rounds # key->rounds
+.Lctr32_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps $inout0,$in0
+ movups $in0,($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ movups $in1,0x10($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ movups $in2,0x20($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_four:
+ call _aesni_encrypt4
+ xorps $inout0,$in0
+ xorps $inout1,$in1
+ movups $in0,($out)
+ xorps $inout2,$in2
+ movups $in1,0x10($out)
+ xorps $inout3,$in3
+ movups $in2,0x20($out)
+ movups $in3,0x30($out)
+
+.Lctr32_done:
+___
+$code.=<<___ if ($win64);
+ movaps 0x20(%rsp),%xmm6
+ movaps 0x30(%rsp),%xmm7
+ movaps 0x40(%rsp),%xmm8
+ movaps 0x50(%rsp),%xmm9
+ movaps 0x60(%rsp),%xmm10
+ movaps 0x70(%rsp),%xmm11
+ movaps 0x80(%rsp),%xmm12
+ movaps 0x90(%rsp),%xmm13
+ movaps 0xa0(%rsp),%xmm14
+ movaps 0xb0(%rsp),%xmm15
+ lea 0xc8(%rsp),%rsp
+.Lctr32_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#			const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
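+# A hedged call sketch; key1 processes the data, key2 encrypts the
+# tweak (so key2 is an *encrypt* schedule even for aesni_xts_decrypt),
+# and iv carries the clear-text tweak, e.g. the sector number:
+#
+#	AES_KEY k1, k2;	/* expanded beforehand */
+#	aesni_xts_encrypt(in, out, len, &k1, &k2, iv);
+#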
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x68 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+ lea -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x60(%rsp)
+ movaps %xmm7,0x70(%rsp)
+ movaps %xmm8,0x80(%rsp)
+ movaps %xmm9,0x90(%rsp)
+ movaps %xmm10,0xa0(%rsp)
+ movaps %xmm11,0xb0(%rsp)
+ movaps %xmm12,0xc0(%rsp)
+ movaps %xmm13,0xd0(%rsp)
+ movaps %xmm14,0xe0(%rsp)
+ movaps %xmm15,0xf0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ movups ($ivp),@tweak[5] # load clear-text tweak
+ mov 240(%r8),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ movdqa .Lxts_magic(%rip),$twmask
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[$i]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,$twres # isolate carry and residue
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ pxor $twres,@tweak[5]
+___
+ }
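+# The unrolled sequence above multiplies the tweak by x in GF(2^128):
+# shift left one bit and, when the top bit carries out, fold it back
+# in with the 0x87 polynomial (.Lxts_magic). A hedged scalar C
+# equivalent of one doubling step:
+#
+#	void xts_double(unsigned char t[16])	/* little-endian tweak */
+#	{
+#	    unsigned carry = 0, msb;
+#	    for (int i = 0; i < 16; i++) {
+#	        msb = t[i] >> 7;
+#	        t[i] = (unsigned char)(t[i] << 1 | carry);
+#	        carry = msb;
+#	    }
+#	    if (carry)
+#	        t[0] ^= 0x87;	/* reduction */
+#	}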
+$code.=<<___;
+ sub \$16*6,$len
+ jc .Lxts_enc_short
+
+ shr \$1,$rounds
+ sub \$1,$rounds
+ mov $rounds,$rnds_
+ jmp .Lxts_enc_grandloop
+
+.align 16
+.Lxts_enc_grandloop:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu `16*0`($inp),$inout0 # load input
+ pand $twmask,$twres # isolate carry and residue
+ movdqu `16*1`($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[0],$inout0 # input^=tweak
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[2],$inout2
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+ pxor @tweak[3],$inout3
+ $movkey ($key_),$rndkey0
+ pxor @tweak[4],$inout4
+ pxor @tweak[5],$inout5
+
+	# inline _aesni_encrypt6 and interleave the first and last rounds
+	# with our own code...
+ $movkey 16($key_),$rndkey1
+ pxor $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
+ aesenc $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqa @tweak[3],`16*3`(%rsp)
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey1,$inout4
+ movdqa @tweak[5],`16*5`(%rsp)
+ aesenc $rndkey1,$inout5
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp
+ jmp .Lxts_enc_loop6_enter
+
+.align 16
+.Lxts_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ dec $rounds
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+.Lxts_enc_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ lea 32($key),$key
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lxts_enc_loop6
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesenc $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key),$rndkey1
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesenc $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 32($key),$rndkey0
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[1]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenc $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenc $rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesenc $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[2]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesenclast $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesenclast $rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesenclast $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesenclast $rndkey0,$inout3
+ aesenclast $rndkey0,$inout4
+ aesenclast $rndkey0,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[3]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ xorps `16*0`(%rsp),$inout0 # output^=tweak
+ pand $twmask,$twres # isolate carry and residue
+ xorps `16*1`(%rsp),$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ pxor $twres,@tweak[5]
+
+ xorps `16*2`(%rsp),$inout2
+ movups $inout0,`16*0`($out) # write output
+ xorps `16*3`(%rsp),$inout3
+ movups $inout1,`16*1`($out)
+ xorps `16*4`(%rsp),$inout4
+ movups $inout2,`16*2`($out)
+ xorps `16*5`(%rsp),$inout5
+ movups $inout3,`16*3`($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$16*6,$len
+ jnc .Lxts_enc_grandloop
+
+ lea 3($rounds,$rounds),$rounds # restore original value
+ mov $key_,$key # restore $key
+ mov $rounds,$rnds_ # backup $rounds
+
+.Lxts_enc_short:
+ add \$16*6,$len
+ jz .Lxts_enc_done
+
+ cmp \$0x20,$len
+ jb .Lxts_enc_one
+ je .Lxts_enc_two
+
+ cmp \$0x40,$len
+ jb .Lxts_enc_three
+ je .Lxts_enc_four
+
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movdqu 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_encrypt6
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out)
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out)
+ lea 16*1($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_encrypt4
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ xorps @tweak[3],$inout3
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ movups $inout3,16*3($out)
+ lea 16*4($out),$out
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ and \$15,$len_
+ jz .Lxts_enc_ret
+ mov $len_,$len
+
+.Lxts_enc_steal:
+ movzb ($inp),%eax # borrow $rounds ...
+ movzb -16($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,-16($out)
+ mov %cl,0($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_enc_steal
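+	# The loop above is XTS ciphertext stealing. A hedged C sketch of
+	# the byte shuffle (illustration only, not part of this file; tail
+	# is the final len%16 bytes):
+	#
+	#	for (i = 0; i < tail; i++) {
+	#		unsigned char c = out[i-16];	/* stolen ciphertext   */
+	#		out[i-16] = in[i];		/* plaintext tail in   */
+	#		out[i] = c;			/* ciphertext tail out */
+	#	}
+	#	/* ... then re-encrypt the mixed block at out-16 (below) */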
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups -16($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,-16($out)
+
+.Lxts_enc_ret:
+___
+$code.=<<___ if ($win64);
+ movaps 0x60(%rsp),%xmm6
+ movaps 0x70(%rsp),%xmm7
+ movaps 0x80(%rsp),%xmm8
+ movaps 0x90(%rsp),%xmm9
+ movaps 0xa0(%rsp),%xmm10
+ movaps 0xb0(%rsp),%xmm11
+ movaps 0xc0(%rsp),%xmm12
+ movaps 0xd0(%rsp),%xmm13
+ movaps 0xe0(%rsp),%xmm14
+ movaps 0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+ lea $frame_size(%rsp),%rsp
+.Lxts_enc_epilogue:
+ ret
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,\@function,6
+.align 16
+aesni_xts_decrypt:
+ lea -$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,0x60(%rsp)
+ movaps %xmm7,0x70(%rsp)
+ movaps %xmm8,0x80(%rsp)
+ movaps %xmm9,0x90(%rsp)
+ movaps %xmm10,0xa0(%rsp)
+ movaps %xmm11,0xb0(%rsp)
+ movaps %xmm12,0xc0(%rsp)
+ movaps %xmm13,0xd0(%rsp)
+ movaps %xmm14,0xe0(%rsp)
+ movaps %xmm15,0xf0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ movups ($ivp),@tweak[5] # load clear-text tweak
+ mov 240($key2),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+ xor %eax,%eax # if ($len%16) len-=16;
+ test \$15,$len
+ setnz %al
+ shl \$4,%rax
+ sub %rax,$len
+
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ movdqa .Lxts_magic(%rip),$twmask
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[$i]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,$twres # isolate carry and residue
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ pxor $twres,@tweak[5]
+___
+ }
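+# The four unrolled iterations above are the standard XTS tweak update:
+# multiply by x in GF(2^128) with the x^128 = x^7+x^2+x+1 reduction,
+# which is where the 0x87 in .Lxts_magic comes from. A hedged C
+# equivalent (illustration only, little-endian tweak):
+#
+#	void xts_double(unsigned char t[16])
+#	{
+#		unsigned carry = t[15] >> 7;
+#		for (int i = 15; i > 0; i--)
+#			t[i] = (unsigned char)((t[i] << 1) | (t[i-1] >> 7));
+#		t[0] = (unsigned char)((t[0] << 1) ^ (carry ? 0x87 : 0));
+#	}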
+$code.=<<___;
+ sub \$16*6,$len
+ jc .Lxts_dec_short
+
+ shr \$1,$rounds
+ sub \$1,$rounds
+ mov $rounds,$rnds_
+ jmp .Lxts_dec_grandloop
+
+.align 16
+.Lxts_dec_grandloop:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu `16*0`($inp),$inout0 # load input
+ pand $twmask,$twres # isolate carry and residue
+ movdqu `16*1`($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[0],$inout0 # input^=tweak
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[2],$inout2
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+ pxor @tweak[3],$inout3
+ $movkey ($key_),$rndkey0
+ pxor @tweak[4],$inout4
+ pxor @tweak[5],$inout5
+
+ # inline _aesni_decrypt6 and interleave first and last rounds
+ # with own code...
+ $movkey 16($key_),$rndkey1
+ pxor $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
+ aesdec $rndkey1,$inout0
+ lea 32($key_),$key
+ pxor $rndkey0,$inout2
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqa @tweak[3],`16*3`(%rsp)
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ dec $rounds
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey1,$inout4
+ movdqa @tweak[5],`16*5`(%rsp)
+ aesdec $rndkey1,$inout5
+ pxor $twtmp,$twtmp
+ pcmpgtd @tweak[5],$twtmp
+ jmp .Lxts_dec_loop6_enter
+
+.align 16
+.Lxts_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ dec $rounds
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+.Lxts_dec_loop6_enter:
+ $movkey 16($key),$rndkey1
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ lea 32($key),$key
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey ($key),$rndkey0
+ jnz .Lxts_dec_loop6
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey1,$inout1
+ pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ aesdec $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key),$rndkey1
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesdec $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 32($key),$rndkey0
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[1]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdec $rndkey1,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdec $rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesdec $rndkey1,$inout2
+ pxor $twres,@tweak[5]
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[2]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ aesdeclast $rndkey0,$inout0
+ pand $twmask,$twres # isolate carry and residue
+ aesdeclast $rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ aesdeclast $rndkey0,$inout2
+ pxor $twres,@tweak[5]
+ aesdeclast $rndkey0,$inout3
+ aesdeclast $rndkey0,$inout4
+ aesdeclast $rndkey0,$inout5
+
+ pshufd \$0x13,$twtmp,$twres
+ pxor $twtmp,$twtmp
+ movdqa @tweak[5],@tweak[3]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ xorps `16*0`(%rsp),$inout0 # output^=tweak
+ pand $twmask,$twres # isolate carry and residue
+ xorps `16*1`(%rsp),$inout1
+	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
+ pxor $twres,@tweak[5]
+
+ xorps `16*2`(%rsp),$inout2
+ movups $inout0,`16*0`($out) # write output
+ xorps `16*3`(%rsp),$inout3
+ movups $inout1,`16*1`($out)
+ xorps `16*4`(%rsp),$inout4
+ movups $inout2,`16*2`($out)
+ xorps `16*5`(%rsp),$inout5
+ movups $inout3,`16*3`($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$16*6,$len
+ jnc .Lxts_dec_grandloop
+
+ lea 3($rounds,$rounds),$rounds # restore original value
+ mov $key_,$key # restore $key
+ mov $rounds,$rnds_ # backup $rounds
+
+.Lxts_dec_short:
+ add \$16*6,$len
+ jz .Lxts_dec_done
+
+ cmp \$0x20,$len
+ jb .Lxts_dec_one
+ je .Lxts_dec_two
+
+ cmp \$0x40,$len
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqu ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movdqu 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_decrypt6
+
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out)
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ pxor $twtmp,$twtmp
+ movdqu $inout3,16*3($out)
+ pcmpgtd @tweak[5],$twtmp
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out
+ pshufd \$0x13,$twtmp,@tweak[1] # $twres
+ and \$15,$len_
+ jz .Lxts_dec_ret
+
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,@tweak[1] # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out)
+ movdqa @tweak[2],@tweak[1]
+ lea 16*1($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[3],@tweak[1]
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ pshufd \$0x13,$twtmp,$twres
+ movdqa @tweak[5],@tweak[4]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movups ($inp),$inout0
+ pand $twmask,$twres # isolate carry and residue
+ movups 16*1($inp),$inout1
+ pxor $twres,@tweak[5]
+
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_decrypt4
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out)
+ xorps @tweak[3],$inout3
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ movups $inout3,16*3($out)
+ lea 16*4($out),$out
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ and \$15,$len_
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ mov $len_,$len
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($inp),$inout0
+ xorps @tweak[1],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[1],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_steal:
+ movzb 16($inp),%eax # borrow $rounds ...
+ movzb ($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,($out)
+ mov %cl,16($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_dec_steal
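+	# Decrypt-side stealing mirrors the encrypt path with the tweak
+	# order swapped: the block above was decrypted with the next tweak
+	# (tweak[1]) to recover the plaintext tail, and the reassembled
+	# block is decrypted below with the preceding tweak (tweak[0]).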
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_ret:
+___
+$code.=<<___ if ($win64);
+ movaps 0x60(%rsp),%xmm6
+ movaps 0x70(%rsp),%xmm7
+ movaps 0x80(%rsp),%xmm8
+ movaps 0x90(%rsp),%xmm9
+ movaps 0xa0(%rsp),%xmm10
+ movaps 0xb0(%rsp),%xmm11
+ movaps 0xc0(%rsp),%xmm12
+ movaps 0xd0(%rsp),%xmm13
+ movaps 0xe0(%rsp),%xmm14
+ movaps 0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+ lea $frame_size(%rsp),%rsp
+.Lxts_dec_epilogue:
+ ret
+.size aesni_xts_decrypt,.-aesni_xts_decrypt
+___
+} }}
+
+########################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
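+#
+# A minimal usage sketch (hedged: assumes $PREFIX expands to "aesni";
+# all other names are the caller's own):
+#
+#	AES_KEY ks;
+#	unsigned char iv[16];				/* updated in place */
+#	aesni_set_encrypt_key(user_key, 128, &ks);
+#	aesni_cbc_encrypt(in, out, len, &ks, iv, 1);	/* enc!=0 => encrypt */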
+{
+my $reserved = $win64?0x40:-0x18; # used in decrypt
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ test $len,$len # check length
+ jz .Lcbc_ret
+
+ mov 240($key),$rnds_ # key->rounds
+ mov $key,$key_ # backup $key
+ test %r9d,%r9d # 6th argument
+ jz .Lcbc_decrypt
+#--------------------------- CBC ENCRYPT ------------------------------#
+ movups ($ivp),$inout0 # load iv as initial state
+ mov $rnds_,$rounds
+ cmp \$16,$len
+ jb .Lcbc_enc_tail
+ sub \$16,$len
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups ($inp),$inout1 # load input
+ lea 16($inp),$inp
+ #xorps $inout1,$inout0
+___
+ &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+$code.=<<___;
+ mov $rnds_,$rounds # restore $rounds
+ mov $key_,$key # restore $key
+ movups $inout0,0($out) # store output
+ lea 16($out),$out
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ add \$16,$len
+ jnz .Lcbc_enc_tail
+ movups $inout0,($ivp)
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ mov $len,%rcx # zaps $key
+ xchg $inp,$out # $inp is %rsi and $out is %rdi now
+ .long 0x9066A4F3 # rep movsb
+ mov \$16,%ecx # zero tail
+ sub $len,%rcx
+ xor %eax,%eax
+ .long 0x9066AAF3 # rep stosb
+ lea -16(%rdi),%rdi # rewind $out by 1 block
+ mov $rnds_,$rounds # restore $rounds
+ mov %rdi,%rsi # $inp and $out are the same
+ mov $key_,$key # restore $key
+ xor $len,$len # len=16
+ jmp .Lcbc_enc_loop # one more spin
+#--------------------------- CBC DECRYPT ------------------------------#
+.align 16
+.Lcbc_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lcbc_decrypt_body:
+___
+$code.=<<___;
+ movups ($ivp),$iv
+ mov $rnds_,$rounds
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_tail
+ shr \$1,$rnds_
+ sub \$0x70,$len
+ mov $rnds_,$rounds
+ movaps $iv,$reserved(%rsp)
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movaps $rndkey0,$reserved(%rsp) # save IV
+ movups $inout7,($out)
+ lea 0x10($out),$out
+.Lcbc_dec_loop8_enter:
+ $movkey ($key),$rndkey0
+ movups ($inp),$inout0 # load input
+ movups 0x10($inp),$inout1
+ $movkey 16($key),$rndkey1
+
+ lea 32($key),$key
+ movdqu 0x20($inp),$inout2
+ xorps $rndkey0,$inout0
+ movdqu 0x30($inp),$inout3
+ xorps $rndkey0,$inout1
+ movdqu 0x40($inp),$inout4
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$inout2
+ movdqu 0x50($inp),$inout5
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ movdqu 0x60($inp),$inout6
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$inout4
+ movdqu 0x70($inp),$inout7
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$inout5
+ dec $rounds
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0,$inout6
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$inout7
+ $movkey ($key),$rndkey0
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ $movkey 16($key),$rndkey1
+
+ call .Ldec_loop8_enter
+
+ movups ($inp),$rndkey1 # re-load input
+ movups 0x10($inp),$rndkey0
+ xorps $reserved(%rsp),$inout0 # ^= IV
+ xorps $rndkey1,$inout1
+ movups 0x20($inp),$rndkey1
+ xorps $rndkey0,$inout2
+ movups 0x30($inp),$rndkey0
+ xorps $rndkey1,$inout3
+ movups 0x40($inp),$rndkey1
+ xorps $rndkey0,$inout4
+ movups 0x50($inp),$rndkey0
+ xorps $rndkey1,$inout5
+ movups 0x60($inp),$rndkey1
+ xorps $rndkey0,$inout6
+ movups 0x70($inp),$rndkey0 # IV
+ xorps $rndkey1,$inout7
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ mov $rnds_,$rounds # restore $rounds
+ movups $inout4,0x40($out)
+ mov $key_,$key # restore $key
+ movups $inout5,0x50($out)
+ lea 0x80($inp),$inp
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
+ sub \$0x80,$len
+ ja .Lcbc_dec_loop8
+
+ movaps $inout7,$inout0
+ movaps $rndkey0,$iv
+ add \$0x70,$len
+ jle .Lcbc_dec_tail_collected
+ movups $inout0,($out)
+ lea 1($rnds_,$rnds_),$rounds
+ lea 0x10($out),$out
+.Lcbc_dec_tail:
+ movups ($inp),$inout0
+ movaps $inout0,$in0
+ cmp \$0x10,$len
+ jbe .Lcbc_dec_one
+
+ movups 0x10($inp),$inout1
+ movaps $inout1,$in1
+ cmp \$0x20,$len
+ jbe .Lcbc_dec_two
+
+ movups 0x20($inp),$inout2
+ movaps $inout2,$in2
+ cmp \$0x30,$len
+ jbe .Lcbc_dec_three
+
+ movups 0x30($inp),$inout3
+ cmp \$0x40,$len
+ jbe .Lcbc_dec_four
+
+ movups 0x40($inp),$inout4
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_five
+
+ movups 0x50($inp),$inout5
+ cmp \$0x60,$len
+ jbe .Lcbc_dec_six
+
+ movups 0x60($inp),$inout6
+ movaps $iv,$reserved(%rsp) # save IV
+ call _aesni_decrypt8
+ movups ($inp),$rndkey1
+ movups 0x10($inp),$rndkey0
+ xorps $reserved(%rsp),$inout0 # ^= IV
+ xorps $rndkey1,$inout1
+ movups 0x20($inp),$rndkey1
+ xorps $rndkey0,$inout2
+ movups 0x30($inp),$rndkey0
+ xorps $rndkey1,$inout3
+ movups 0x40($inp),$rndkey1
+ xorps $rndkey0,$inout4
+ movups 0x50($inp),$rndkey0
+ xorps $rndkey1,$inout5
+ movups 0x60($inp),$iv # IV
+ xorps $rndkey0,$inout6
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ movups $inout5,0x50($out)
+ lea 0x60($out),$out
+ movaps $inout6,$inout0
+ sub \$0x70,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_one:
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps $iv,$inout0
+ movaps $in0,$iv
+ sub \$0x10,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ movaps $in1,$iv
+ movaps $inout1,$inout0
+ lea 0x10($out),$out
+ sub \$0x20,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ call _aesni_decrypt3
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ movaps $in2,$iv
+ movaps $inout2,$inout0
+ lea 0x20($out),$out
+ sub \$0x30,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ call _aesni_decrypt4
+ xorps $iv,$inout0
+ movups 0x30($inp),$iv
+ xorps $in0,$inout1
+ movups $inout0,($out)
+ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ xorps $in2,$inout3
+ movups $inout2,0x20($out)
+ movaps $inout3,$inout0
+ lea 0x30($out),$out
+ sub \$0x40,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_five:
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ movups 0x10($inp),$rndkey1
+ movups 0x20($inp),$rndkey0
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ xorps $rndkey1,$inout2
+ movups 0x30($inp),$rndkey1
+ xorps $rndkey0,$inout3
+ movups 0x40($inp),$iv
+ xorps $rndkey1,$inout4
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ lea 0x40($out),$out
+ movaps $inout4,$inout0
+ sub \$0x50,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_six:
+ call _aesni_decrypt6
+ movups 0x10($inp),$rndkey1
+ movups 0x20($inp),$rndkey0
+ xorps $iv,$inout0
+ xorps $in0,$inout1
+ xorps $rndkey1,$inout2
+ movups 0x30($inp),$rndkey1
+ xorps $rndkey0,$inout3
+ movups 0x40($inp),$rndkey0
+ xorps $rndkey1,$inout4
+ movups 0x50($inp),$iv
+ xorps $rndkey0,$inout5
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
+ movups $inout4,0x40($out)
+ lea 0x50($out),$out
+ movaps $inout5,$inout0
+ sub \$0x60,$len
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_tail_collected:
+ and \$15,$len
+ movups $iv,($ivp)
+ jnz .Lcbc_dec_tail_partial
+ movups $inout0,($out)
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps $inout0,$reserved(%rsp)
+ mov \$16,%rcx
+ mov $out,%rdi
+ sub $len,%rcx
+ lea $reserved(%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
+
+.Lcbc_dec_ret:
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ lea 0x58(%rsp),%rsp
+___
+$code.=<<___;
+.Lcbc_ret:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+# int bits, AES_KEY *key)
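+#
+# Hedged usage sketch (return values as implemented below: 0 on
+# success, -1 for NULL arguments, -2 for unsupported key bits):
+#
+#	AES_KEY enc, dec;
+#	if (aesni_set_encrypt_key(user_key, 256, &enc) != 0)
+#		/* handle error */;
+#	aesni_set_decrypt_key(user_key, 256, &dec);	/* aesimc-inverted */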
+{ my ($inp,$bits,$key) = @_4args;
+ $bits =~ s/%r/%e/;
+
+$code.=<<___;
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_decrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ call __aesni_set_encrypt_key
+	shl	\$4,$bits			# rounds-1 after __aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+ lea 16($key,$bits),$inp # points at the end of key schedule
+
+ $movkey ($key),%xmm0 # just swap
+ $movkey ($inp),%xmm1
+ $movkey %xmm0,($inp)
+ $movkey %xmm1,($key)
+ lea 16($key),$key
+ lea -16($inp),$inp
+
+.Ldec_key_inverse:
+ $movkey ($key),%xmm0 # swap and inverse
+ $movkey ($inp),%xmm1
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
+ cmp $key,$inp
+ ja .Ldec_key_inverse
+
+ $movkey ($key),%xmm0 # inverse middle
+ aesimc %xmm0,%xmm0
+ $movkey %xmm0,($inp)
+.Ldec_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_decrypt_key:
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+___
+
+# This is based on submission by
+#
+# Huang Ying <ying.huang@intel.com>
+# Vinodh Gopal <vinodh.gopal@intel.com>
+# Kahraman Akdemir
+#
+# Aggressively optimized with respect to aeskeygenassist's critical
+# path, and kept entirely within %xmm0-5 so that no Win64 non-volatile
+# XMM registers need to be saved and restored.
+#
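+# For reference, each round that .Lkey_expansion_128 computes with the
+# shufps/xorps sequence below matches the FIPS-197 recurrence (hedged
+# sketch; rcon is the aeskeygenassist immediate):
+#
+#	w[4i+4] = w[4i]   ^ SubWord(RotWord(w[4i+3])) ^ rcon;
+#	w[4i+5] = w[4i+1] ^ w[4i+4];
+#	w[4i+6] = w[4i+2] ^ w[4i+5];
+#	w[4i+7] = w[4i+3] ^ w[4i+6];
+#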
+$code.=<<___;
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+.align 16
+${PREFIX}_set_encrypt_key:
+__aesni_set_encrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ mov \$-1,%rax
+ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
+
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ lea 16($key),%rax
+ cmp \$256,$bits
+ je .L14rounds
+ cmp \$192,$bits
+ je .L12rounds
+ cmp \$128,$bits
+ jne .Lbad_keybits
+
+.L10rounds:
+ mov \$9,$bits # 10 rounds for 128-bit key
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
+ call .Lkey_expansion_128_cold
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
+ call .Lkey_expansion_128
+ $movkey %xmm0,(%rax)
+ mov $bits,80(%rax) # 240(%rdx)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax, %rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+	movups	16($inp),%xmm2			# remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ mov \$-2,%rax
+.Lenc_key_ret:
+ add \$8,%rsp
+ ret
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4, %xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+.align 16
+.Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+ .long 6,6,6,0
+.Lincrement64:
+ .long 1,0,0,0
+.Lxts_magic:
+ .long 0x87,0,1,0
+
+.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
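+#
+# The ccm64/xts handlers below locate their %xmm save area through a
+# pair of RVAs stored as HandlerData[] in the .xdata records at the end
+# of this file; a hedged sketch of that layout:
+#
+#	struct handler_data { UINT32 prologue_rva, epilogue_rva; };
+#	/* Rip <  ImageBase+prologue_rva  => nothing saved yet       */
+#	/* Rip >= ImageBase+epilogue_rva  => already restored        */
+#	/* otherwise copy the saved %xmm regs back into CONTEXT.Xmm6 */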
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type ecb_se_handler,\@abi-omnipotent
+.align 16
+ecb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+
+ jmp .Lcommon_seh_tail
+.size ecb_se_handler,.-ecb_se_handler
+
+.type ccm64_se_handler,\@abi-omnipotent
+.align 16
+ccm64_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ccm64_se_handler,.-ccm64_se_handler
+
+.type ctr32_se_handler,\@abi-omnipotent
+.align 16
+ctr32_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lctr32_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lctr32_ret(%rip),%r10
+ cmp %r10,%rbx
+ jae .Lcommon_seh_tail
+
+ lea 0x20(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xc8(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size ctr32_se_handler,.-ctr32_se_handler
+
+.type xts_se_handler,\@abi-omnipotent
+.align 16
+xts_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+	lea	(%rsi,%r10),%r10		# prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0x60(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x68+160(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+.size xts_se_handler,.-xts_se_handler
+___
+$code.=<<___;
+.type cbc_se_handler,\@abi-omnipotent
+.align 16
+cbc_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lcbc_decrypt(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lcommon_seh_tail
+
+ lea .Lcbc_decrypt_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<cbc_decrypt_body
+ jb .Lrestore_cbc_rax
+
+ lea .Lcbc_ret(%rip),%r10
+ cmp %r10,%rbx # context->Rip>="epilogue" label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # top of stack
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+
+.Lrestore_cbc_rax:
+ mov 120($context),%rax
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size cbc_se_handler,.-cbc_se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ecb_encrypt
+ .rva .LSEH_end_aesni_ecb_encrypt
+ .rva .LSEH_info_ecb
+
+ .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_info_ccm64_enc
+
+ .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_info_ccm64_dec
+
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+
+ .rva .LSEH_begin_aesni_xts_encrypt
+ .rva .LSEH_end_aesni_xts_encrypt
+ .rva .LSEH_info_xts_enc
+
+ .rva .LSEH_begin_aesni_xts_decrypt
+ .rva .LSEH_end_aesni_xts_decrypt
+ .rva .LSEH_info_xts_dec
+___
+$code.=<<___;
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_cbc
+
+ .rva ${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_set_decrypt_key
+ .rva .LSEH_info_key
+
+ .rva ${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_set_encrypt_key
+ .rva .LSEH_info_key
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.LSEH_info_ecb:
+ .byte 9,0,0,0
+ .rva ecb_se_handler
+.LSEH_info_ccm64_enc:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
+.LSEH_info_ccm64_dec:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
+.LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr32_se_handler
+.LSEH_info_xts_enc:
+ .byte 9,0,0,0
+ .rva xts_se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.LSEH_info_xts_dec:
+ .byte 9,0,0,0
+ .rva xts_se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.LSEH_info_cbc:
+ .byte 9,0,0,0
+ .rva cbc_se_handler
+.LSEH_info_key:
+ .byte 0x01,0x04,0x01,0x00
+ .byte 0x04,0x02,0x00,0x00 # sub rsp,8
+___
+}
+
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+}
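+# A worked example (hand-checked, illustration only): for
+# "aesenc %xmm9,%xmm10" the destination index is 10 and the source
+# index is 9, so rex() pushes 0x40|0x04|0x01 = 0x45 (REX.R|REX.B);
+# registers 0-7 get no prefix byte at all.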
+
+sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+}
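+# Two hedged, hand-checked examples of the rewrite performed by the
+# s/// below (emitting raw opcode bytes so that even assemblers without
+# AES-NI support can build this file):
+#
+#	aesenc	%xmm9,%xmm10
+#	  => .byte	102,69,15,56,220,209	# 66 REX.RB 0f 38 dc /r
+#	aeskeygenassist	$0x1,%xmm0,%xmm1
+#	  => .byte	102,15,58,223,200,1	# 66 0f 3a df /r ib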
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/aes/asm/bsaes-x86_64.S b/main/openssl/crypto/aes/asm/bsaes-x86_64.S
new file mode 100644
index 00000000..dc92d4da
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/bsaes-x86_64.S
@@ -0,0 +1,2498 @@
+.text
+
+
+
+
+.type _bsaes_encrypt8,@function
+.align 64
+_bsaes_encrypt8:
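+# Bit-sliced AES (illustrative note): the eight input blocks in
+# %xmm15,%xmm0-%xmm6 are transposed into bit-planes by the psrlq/pand/
+# psllq swap network below, so the S-box can be evaluated as pure
+# boolean algebra on all eight blocks at once.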
+ leaq .LBS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa 80(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+_bsaes_encrypt8_bitslice:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+.byte 102,68,15,56,0,255
+ pxor 32(%rax),%xmm1
+.byte 102,15,56,0,199
+ pxor 48(%rax),%xmm2
+.byte 102,15,56,0,207
+ pxor 64(%rax),%xmm3
+.byte 102,15,56,0,215
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,223
+ pxor 96(%rax),%xmm5
+.byte 102,15,56,0,231
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,239
+ leaq 128(%rax),%rax
+.byte 102,15,56,0,247
+.Lenc_sbox:
+ pxor %xmm5,%xmm4
+ pxor %xmm0,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm1,%xmm5
+ pxor %xmm15,%xmm4
+
+ pxor %xmm2,%xmm5
+ pxor %xmm6,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm3,%xmm2
+ pxor %xmm4,%xmm3
+ pxor %xmm0,%xmm2
+
+ pxor %xmm6,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm6,%xmm10
+ movdqa %xmm0,%xmm9
+ movdqa %xmm4,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm11
+
+ pxor %xmm3,%xmm10
+ pxor %xmm1,%xmm9
+ pxor %xmm2,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm3,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm15,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm2,%xmm11
+ pxor %xmm15,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm6,%xmm12
+ movdqa %xmm4,%xmm11
+ pxor %xmm0,%xmm12
+ pxor %xmm5,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm1,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm3,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm0,%xmm13
+ pand %xmm2,%xmm11
+ movdqa %xmm6,%xmm14
+ pand %xmm15,%xmm12
+ pand %xmm4,%xmm13
+ por %xmm5,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm5,%xmm11
+ movdqa %xmm4,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm5,%xmm9
+ pxor %xmm4,%xmm5
+ pand %xmm14,%xmm4
+ pand %xmm13,%xmm5
+ pxor %xmm4,%xmm5
+ pxor %xmm9,%xmm4
+ pxor %xmm15,%xmm11
+ pxor %xmm2,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm2,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm2
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm2,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm11,%xmm5
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm2
+
+ movdqa %xmm6,%xmm11
+ movdqa %xmm0,%xmm7
+ pxor %xmm3,%xmm11
+ pxor %xmm1,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm3,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm1,%xmm3
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm1
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm3
+ pxor %xmm11,%xmm7
+ pxor %xmm1,%xmm3
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm1
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm6,%xmm10
+ pxor %xmm0,%xmm6
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm6
+ pxor %xmm0,%xmm6
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm6
+ pxor %xmm11,%xmm3
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm1
+ pxor %xmm15,%xmm6
+ pxor %xmm5,%xmm0
+ pxor %xmm6,%xmm3
+ pxor %xmm15,%xmm5
+ pxor %xmm0,%xmm15
+
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ pxor %xmm2,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm4,%xmm3
+
+ pxor %xmm2,%xmm5
+ decl %r10d
+ jl .Lenc_done
+ pshufd $147,%xmm15,%xmm7
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $147,%xmm3,%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm5,%xmm10
+ pxor %xmm9,%xmm3
+ pshufd $147,%xmm2,%xmm11
+ pxor %xmm10,%xmm5
+ pshufd $147,%xmm6,%xmm12
+ pxor %xmm11,%xmm2
+ pshufd $147,%xmm1,%xmm13
+ pxor %xmm12,%xmm6
+ pshufd $147,%xmm4,%xmm14
+ pxor %xmm13,%xmm1
+ pxor %xmm14,%xmm4
+
+ pxor %xmm15,%xmm8
+ pxor %xmm4,%xmm7
+ pxor %xmm4,%xmm8
+ pshufd $78,%xmm15,%xmm15
+ pxor %xmm0,%xmm9
+ pshufd $78,%xmm0,%xmm0
+ pxor %xmm2,%xmm12
+ pxor %xmm7,%xmm15
+ pxor %xmm6,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm7
+ pxor %xmm1,%xmm14
+ pshufd $78,%xmm6,%xmm8
+ pxor %xmm3,%xmm10
+ pshufd $78,%xmm5,%xmm2
+ pxor %xmm4,%xmm10
+ pshufd $78,%xmm4,%xmm6
+ pxor %xmm4,%xmm11
+ pshufd $78,%xmm1,%xmm5
+ pxor %xmm11,%xmm7
+ pshufd $78,%xmm3,%xmm1
+ pxor %xmm12,%xmm8
+ pxor %xmm10,%xmm2
+ pxor %xmm14,%xmm6
+ pxor %xmm13,%xmm5
+ movdqa %xmm7,%xmm3
+ pxor %xmm9,%xmm1
+ movdqa %xmm8,%xmm4
+ movdqa 48(%r11),%xmm7
+ jnz .Lenc_loop
+ movdqa 64(%r11),%xmm7
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm2,%xmm10
+ psrlq $1,%xmm2
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm2
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm2
+ pxor %xmm1,%xmm4
+ psllq $1,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $1,%xmm2
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm2
+ movdqa %xmm3,%xmm9
+ psrlq $1,%xmm3
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm5,%xmm3
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm15
+ pxor %xmm3,%xmm5
+ psllq $1,%xmm3
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm2,%xmm10
+ psrlq $2,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm1,%xmm2
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm2
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm2,%xmm1
+ psllq $2,%xmm2
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm2
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm5,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm5
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm5,%xmm9
+ psrlq $4,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $4,%xmm3
+ pxor %xmm4,%xmm5
+ pxor %xmm1,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm4
+ psllq $4,%xmm5
+ pxor %xmm3,%xmm1
+ psllq $4,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm2,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm2
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,@function
+.align 64
+_bsaes_decrypt8:
+ leaq .LBS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa -48(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+.byte 102,68,15,56,0,255
+ pxor 32(%rax),%xmm1
+.byte 102,15,56,0,199
+ pxor 48(%rax),%xmm2
+.byte 102,15,56,0,207
+ pxor 64(%rax),%xmm3
+.byte 102,15,56,0,215
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,223
+ pxor 96(%rax),%xmm5
+.byte 102,15,56,0,231
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,239
+ leaq 128(%rax),%rax
+.byte 102,15,56,0,247
+.Ldec_sbox:
+ pxor %xmm3,%xmm2
+
+ pxor %xmm6,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm3,%xmm5
+ pxor %xmm5,%xmm6
+ pxor %xmm6,%xmm0
+
+ pxor %xmm0,%xmm15
+ pxor %xmm4,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm15,%xmm4
+ pxor %xmm2,%xmm0
+ movdqa %xmm2,%xmm10
+ movdqa %xmm6,%xmm9
+ movdqa %xmm0,%xmm8
+ movdqa %xmm3,%xmm12
+ movdqa %xmm4,%xmm11
+
+ pxor %xmm15,%xmm10
+ pxor %xmm3,%xmm9
+ pxor %xmm5,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm15,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm1,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm5,%xmm11
+ pxor %xmm1,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm2,%xmm12
+ movdqa %xmm0,%xmm11
+ pxor %xmm6,%xmm12
+ pxor %xmm4,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm3,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm15,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm6,%xmm13
+ pand %xmm5,%xmm11
+ movdqa %xmm2,%xmm14
+ pand %xmm1,%xmm12
+ pand %xmm0,%xmm13
+ por %xmm4,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm4,%xmm11
+ movdqa %xmm0,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm4,%xmm9
+ pxor %xmm0,%xmm4
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm4
+ pxor %xmm0,%xmm4
+ pxor %xmm9,%xmm0
+ pxor %xmm1,%xmm11
+ pxor %xmm5,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm1,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm5,%xmm1
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm5
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm1
+ pxor %xmm11,%xmm7
+ pxor %xmm5,%xmm1
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm5
+ pxor %xmm11,%xmm4
+ pxor %xmm11,%xmm1
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm5
+
+ movdqa %xmm2,%xmm11
+ movdqa %xmm6,%xmm7
+ pxor %xmm15,%xmm11
+ pxor %xmm3,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm3,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm3
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm3,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm3
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm2,%xmm10
+ pxor %xmm6,%xmm2
+ pand %xmm14,%xmm6
+ pand %xmm13,%xmm2
+ pxor %xmm6,%xmm2
+ pxor %xmm10,%xmm6
+ pxor %xmm11,%xmm2
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm3
+ pxor %xmm6,%xmm0
+ pxor %xmm4,%xmm5
+
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm6,%xmm4
+ pxor %xmm1,%xmm3
+ pxor %xmm15,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm5,%xmm2
+ pxor %xmm0,%xmm5
+ pxor %xmm3,%xmm2
+
+ pxor %xmm15,%xmm3
+ pxor %xmm2,%xmm6
+ decl %r10d
+ jl .Ldec_done
+
+ pshufd $78,%xmm15,%xmm7
+ pshufd $78,%xmm2,%xmm13
+ pxor %xmm15,%xmm7
+ pshufd $78,%xmm4,%xmm14
+ pxor %xmm2,%xmm13
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm4,%xmm14
+ pshufd $78,%xmm5,%xmm9
+ pxor %xmm0,%xmm8
+ pshufd $78,%xmm3,%xmm10
+ pxor %xmm5,%xmm9
+ pxor %xmm13,%xmm15
+ pxor %xmm13,%xmm0
+ pshufd $78,%xmm1,%xmm11
+ pxor %xmm3,%xmm10
+ pxor %xmm7,%xmm5
+ pxor %xmm8,%xmm3
+ pshufd $78,%xmm6,%xmm12
+ pxor %xmm1,%xmm11
+ pxor %xmm14,%xmm0
+ pxor %xmm9,%xmm1
+ pxor %xmm6,%xmm12
+
+ pxor %xmm14,%xmm5
+ pxor %xmm13,%xmm3
+ pxor %xmm13,%xmm1
+ pxor %xmm10,%xmm6
+ pxor %xmm11,%xmm2
+ pxor %xmm14,%xmm1
+ pxor %xmm14,%xmm6
+ pxor %xmm12,%xmm4
+ pshufd $147,%xmm15,%xmm7
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $147,%xmm5,%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm3,%xmm10
+ pxor %xmm9,%xmm5
+ pshufd $147,%xmm1,%xmm11
+ pxor %xmm10,%xmm3
+ pshufd $147,%xmm6,%xmm12
+ pxor %xmm11,%xmm1
+ pshufd $147,%xmm2,%xmm13
+ pxor %xmm12,%xmm6
+ pshufd $147,%xmm4,%xmm14
+ pxor %xmm13,%xmm2
+ pxor %xmm14,%xmm4
+
+ pxor %xmm15,%xmm8
+ pxor %xmm4,%xmm7
+ pxor %xmm4,%xmm8
+ pshufd $78,%xmm15,%xmm15
+ pxor %xmm0,%xmm9
+ pshufd $78,%xmm0,%xmm0
+ pxor %xmm1,%xmm12
+ pxor %xmm7,%xmm15
+ pxor %xmm6,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm3,%xmm11
+ pshufd $78,%xmm1,%xmm7
+ pxor %xmm2,%xmm14
+ pshufd $78,%xmm6,%xmm8
+ pxor %xmm5,%xmm10
+ pshufd $78,%xmm3,%xmm1
+ pxor %xmm4,%xmm10
+ pshufd $78,%xmm4,%xmm6
+ pxor %xmm4,%xmm11
+ pshufd $78,%xmm2,%xmm3
+ pxor %xmm11,%xmm7
+ pshufd $78,%xmm5,%xmm2
+ pxor %xmm12,%xmm8
+ pxor %xmm1,%xmm10
+ pxor %xmm14,%xmm6
+ pxor %xmm3,%xmm13
+ movdqa %xmm7,%xmm3
+ pxor %xmm9,%xmm2
+ movdqa %xmm13,%xmm5
+ movdqa %xmm8,%xmm4
+ movdqa %xmm2,%xmm1
+ movdqa %xmm10,%xmm2
+ movdqa -16(%r11),%xmm7
+ jnz .Ldec_loop
+ movdqa -32(%r11),%xmm7
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm2,%xmm9
+ psrlq $1,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $1,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm6,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm4
+ psllq $1,%xmm2
+ pxor %xmm1,%xmm6
+ psllq $1,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm3,%xmm5
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm15
+ pxor %xmm5,%xmm3
+ psllq $1,%xmm5
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm1,%xmm10
+ psrlq $2,%xmm1
+ pxor %xmm4,%xmm6
+ pxor %xmm2,%xmm1
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm1
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm1,%xmm2
+ psllq $2,%xmm1
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm3
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm5
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm3,%xmm9
+ psrlq $4,%xmm3
+ movdqa %xmm5,%xmm10
+ psrlq $4,%xmm5
+ pxor %xmm4,%xmm3
+ pxor %xmm2,%xmm5
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $4,%xmm3
+ pxor %xmm5,%xmm2
+ psllq $4,%xmm5
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm5
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+.type _bsaes_key_convert,@function
+.align 16
+_bsaes_key_convert:
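+# Illustrative note: converts the standard AES key schedule at %rcx into
+# the bit-sliced form consumed by the routines above, writing the
+# expanded schedule out at %rax (%r10d counts rounds).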
+ leaq .Lmasks(%rip),%r11
+ movdqu (%rcx),%xmm7
+ leaq 16(%rcx),%rcx
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ movdqa 48(%r11),%xmm3
+ movdqa 64(%r11),%xmm4
+ pcmpeqd %xmm5,%xmm5
+
+ movdqu (%rcx),%xmm6
+ movdqa %xmm7,(%rax)
+ leaq 16(%rax),%rax
+ decl %r10d
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+.byte 102,15,56,0,244
+
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+
+ pand %xmm6,%xmm8
+ pand %xmm6,%xmm9
+ movdqa %xmm2,%xmm10
+ pcmpeqb %xmm0,%xmm8
+ psllq $4,%xmm0
+ movdqa %xmm3,%xmm11
+ pcmpeqb %xmm1,%xmm9
+ psllq $4,%xmm1
+
+ pand %xmm6,%xmm10
+ pand %xmm6,%xmm11
+ movdqa %xmm0,%xmm12
+ pcmpeqb %xmm2,%xmm10
+ psllq $4,%xmm2
+ movdqa %xmm1,%xmm13
+ pcmpeqb %xmm3,%xmm11
+ psllq $4,%xmm3
+
+ movdqa %xmm2,%xmm14
+ movdqa %xmm3,%xmm15
+ pxor %xmm5,%xmm8
+ pxor %xmm5,%xmm9
+
+ pand %xmm6,%xmm12
+ pand %xmm6,%xmm13
+ movdqa %xmm8,0(%rax)
+ pcmpeqb %xmm0,%xmm12
+ psrlq $4,%xmm0
+ movdqa %xmm9,16(%rax)
+ pcmpeqb %xmm1,%xmm13
+ psrlq $4,%xmm1
+ leaq 16(%rcx),%rcx
+
+ pand %xmm6,%xmm14
+ pand %xmm6,%xmm15
+ movdqa %xmm10,32(%rax)
+ pcmpeqb %xmm2,%xmm14
+ psrlq $4,%xmm2
+ movdqa %xmm11,48(%rax)
+ pcmpeqb %xmm3,%xmm15
+ psrlq $4,%xmm3
+ movdqu (%rcx),%xmm6
+
+ pxor %xmm5,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm12,64(%rax)
+ movdqa %xmm13,80(%rax)
+ movdqa %xmm14,96(%rax)
+ movdqa %xmm15,112(%rax)
+ leaq 128(%rax),%rax
+ decl %r10d
+ jnz .Lkey_loop
+
+ movdqa 80(%r11),%xmm7
+
+ .byte 0xf3,0xc3
+.size _bsaes_key_convert,.-_bsaes_key_convert
+
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,@function
+.align 16
+bsaes_cbc_encrypt:
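+# Note: only CBC *decryption* of at least eight blocks is bit-sliced
+# here; encryption and short inputs fall through to asm_AES_cbc_encrypt.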
+ cmpl $0,%r9d
+ jne asm_AES_cbc_encrypt
+ cmpq $128,%rdx
+ jb asm_AES_cbc_encrypt
+
+ movq %rsp,%rax
+.Lcbc_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movq %r8,%rbx
+ shrq $4,%r14
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx),%xmm14
+ subq $8,%r14
+.Lcbc_dec_loop:
+ movdqu 0(%r12),%xmm15
+ movdqu 16(%r12),%xmm0
+ movdqu 32(%r12),%xmm1
+ movdqu 48(%r12),%xmm2
+ movdqu 64(%r12),%xmm3
+ movdqu 80(%r12),%xmm4
+ movq %rsp,%rax
+ movdqu 96(%r12),%xmm5
+ movl %edx,%r10d
+ movdqu 112(%r12),%xmm6
+ movdqa %xmm14,32(%rbp)
+
+ call _bsaes_decrypt8
+
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm2
+ movdqu 112(%r12),%xmm14
+ pxor %xmm13,%xmm4
+ movdqu %xmm15,0(%r13)
+ leaq 128(%r12),%r12
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ subq $8,%r14
+ jnc .Lcbc_dec_loop
+
+ addq $8,%r14
+ jz .Lcbc_dec_done
+
+ movdqu 0(%r12),%xmm15
+ movq %rsp,%rax
+ movl %edx,%r10d
+ cmpq $2,%r14
+ jb .Lcbc_dec_one
+ movdqu 16(%r12),%xmm0
+ je .Lcbc_dec_two
+ movdqu 32(%r12),%xmm1
+ cmpq $4,%r14
+ jb .Lcbc_dec_three
+ movdqu 48(%r12),%xmm2
+ je .Lcbc_dec_four
+ movdqu 64(%r12),%xmm3
+ cmpq $6,%r14
+ jb .Lcbc_dec_five
+ movdqu 80(%r12),%xmm4
+ je .Lcbc_dec_six
+ movdqu 96(%r12),%xmm5
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm14
+ pxor %xmm12,%xmm2
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm14
+ pxor %xmm11,%xmm6
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm14
+ pxor %xmm10,%xmm1
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm14
+ pxor %xmm9,%xmm3
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm14
+ pxor %xmm8,%xmm5
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm14
+ pxor %xmm7,%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ leaq (%r12),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm14
+ movdqu %xmm14,(%r13)
+ movdqa %xmm15,%xmm14
+
+.Lcbc_dec_done:
+ movdqu %xmm14,(%rbx)
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lcbc_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lcbc_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lcbc_dec_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,@function
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ movq %rsp,%rax
+.Lctr_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movdqu (%r8),%xmm0
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqa %xmm0,32(%rbp)
+ cmpq $8,%rdx
+ jb .Lctr_enc_short
+
+ movl %eax,%ebx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %ebx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ movdqa (%rsp),%xmm8
+ leaq .LADD1(%rip),%r11
+ movdqa 32(%rbp),%xmm15
+ movdqa -32(%r11),%xmm7
+.byte 102,68,15,56,0,199
+.byte 102,68,15,56,0,255
+ movdqa %xmm8,(%rsp)
+ jmp .Lctr_enc_loop
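+# In the loop below the counter block is kept byte-swapped (the .byte
+# 102,68,15,56,0,... sequences are pshufb encodings), so plain paddd
+# with .LADD1..LADD7 derives the seven following counter values, and
+# the final paddd with .LADD8 steps the base counter by 8 for the next
+# iteration (a hedged reading of the generated code).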
+.align 16
+.Lctr_enc_loop:
+ movdqa %xmm15,32(%rbp)
+ movdqa %xmm15,%xmm0
+ movdqa %xmm15,%xmm1
+ paddd 0(%r11),%xmm0
+ movdqa %xmm15,%xmm2
+ paddd 16(%r11),%xmm1
+ movdqa %xmm15,%xmm3
+ paddd 32(%r11),%xmm2
+ movdqa %xmm15,%xmm4
+ paddd 48(%r11),%xmm3
+ movdqa %xmm15,%xmm5
+ paddd 64(%r11),%xmm4
+ movdqa %xmm15,%xmm6
+ paddd 80(%r11),%xmm5
+ paddd 96(%r11),%xmm6
+
+
+
+ movdqa (%rsp),%xmm8
+ leaq 16(%rsp),%rax
+ movdqa -16(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+.byte 102,68,15,56,0,255
+ pxor %xmm8,%xmm1
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm2
+.byte 102,15,56,0,207
+ pxor %xmm8,%xmm3
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,223
+ pxor %xmm8,%xmm5
+.byte 102,15,56,0,231
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,239
+ leaq .LBS0(%rip),%r11
+.byte 102,15,56,0,247
+ movl %ebx,%r10d
+
+ call _bsaes_encrypt8_bitslice
+
+ subq $8,%r14
+ jc .Lctr_enc_loop_done
+
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ movdqu 32(%r12),%xmm9
+ movdqu 48(%r12),%xmm10
+ movdqu 64(%r12),%xmm11
+ movdqu 80(%r12),%xmm12
+ movdqu 96(%r12),%xmm13
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ pxor %xmm15,%xmm7
+ movdqa 32(%rbp),%xmm15
+ pxor %xmm8,%xmm0
+ movdqu %xmm7,0(%r13)
+ pxor %xmm9,%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor %xmm10,%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor %xmm11,%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor %xmm12,%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor %xmm13,%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor %xmm14,%xmm4
+ movdqu %xmm1,96(%r13)
+ leaq .LADD1(%rip),%r11
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ paddd 112(%r11),%xmm15
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ addq $8,%r14
+ movdqu 0(%r12),%xmm7
+ pxor %xmm7,%xmm15
+ movdqu %xmm15,0(%r13)
+ cmpq $2,%r14
+ jb .Lctr_enc_done
+ movdqu 16(%r12),%xmm8
+ pxor %xmm8,%xmm0
+ movdqu %xmm0,16(%r13)
+ je .Lctr_enc_done
+ movdqu 32(%r12),%xmm9
+ pxor %xmm9,%xmm3
+ movdqu %xmm3,32(%r13)
+ cmpq $4,%r14
+ jb .Lctr_enc_done
+ movdqu 48(%r12),%xmm10
+ pxor %xmm10,%xmm5
+ movdqu %xmm5,48(%r13)
+ je .Lctr_enc_done
+ movdqu 64(%r12),%xmm11
+ pxor %xmm11,%xmm2
+ movdqu %xmm2,64(%r13)
+ cmpq $6,%r14
+ jb .Lctr_enc_done
+ movdqu 80(%r12),%xmm12
+ pxor %xmm12,%xmm6
+ movdqu %xmm6,80(%r13)
+ je .Lctr_enc_done
+ movdqu 96(%r12),%xmm13
+ pxor %xmm13,%xmm1
+ movdqu %xmm1,96(%r13)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+ leaq 32(%rbp),%rdi
+ leaq 48(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ movdqu (%r12),%xmm0
+ leaq 16(%r12),%r12
+ movl 44(%rbp),%eax
+ bswapl %eax
+ pxor 48(%rbp),%xmm0
+ incl %eax
+ movdqu %xmm0,(%r13)
+ bswapl %eax
+ leaq 16(%r13),%r13
+ movl %eax,44(%rsp)
+ decq %r14
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lctr_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lctr_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lctr_enc_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,@function
+.align 16
+bsaes_xts_encrypt:
+ movq %rsp,%rax
+.Lxts_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ andq $-16,%r14
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
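+# The pshufd/pcmpgtd/paddq/pand/pxor pattern below multiplies the
+# 128-bit XTS tweak by x in GF(2^128): paddq doubles both 64-bit
+# halves, pcmpgtd against zero picks up the two carried-out sign bits,
+# pshufd $19 routes them to the destination lanes, and pand with
+# .Lxts_magic (0x87,0,1,0) turns them into the bit-64 carry-in and the
+# low-lane reduction by x^7+x^2+x+1 (0x87); a hedged reading of the
+# generated code.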
+.align 16
+.Lxts_enc_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm1,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ addq $128,%r14
+ jz .Lxts_enc_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je .Lxts_enc_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je .Lxts_enc_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je .Lxts_enc_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je .Lxts_enc_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je .Lxts_enc_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je .Lxts_enc_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm1,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ movdqu %xmm2,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ movdqu %xmm5,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm3,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+.Lxts_enc_done:
+ andl $15,%ebx
+ jz .Lxts_enc_ret
+ movq %r13,%rdx
+
+.Lxts_enc_steal:
+ movzbl (%r12),%eax
+ movzbl -16(%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,-16(%rdx)
+ movb %cl,0(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16(%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_encrypt
+ pxor 32(%rbp),%xmm6
+ movdqu %xmm6,-16(%r13)
+
+.Lxts_enc_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lxts_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lxts_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,@function
+.align 16
+bsaes_xts_decrypt:
+ movq %rsp,%rax
+.Lxts_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ xorl %eax,%eax
+ andq $-16,%r14
+ testl $15,%ebx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%r14
+
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ addq $128,%r14
+ jz .Lxts_dec_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je .Lxts_dec_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je .Lxts_dec_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je .Lxts_dec_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je .Lxts_dec_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je .Lxts_dec_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je .Lxts_dec_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+.Lxts_dec_done:
+ andl $15,%ebx
+ jz .Lxts_dec_ret
+
+ pxor %xmm14,%xmm14
+ movdqa .Lxts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ movdqa %xmm6,%xmm5
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ movdqu (%r12),%xmm15
+ pxor %xmm13,%xmm6
+
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm6
+ movq %r13,%rdx
+ movdqu %xmm6,(%r13)
+
+.Lxts_dec_steal:
+ movzbl 16(%r12),%eax
+ movzbl (%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,(%rdx)
+ movb %cl,16(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu (%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm5,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call asm_AES_decrypt
+ pxor 32(%rbp),%xmm5
+ movdqu %xmm5,(%r13)
+
+.Lxts_dec_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+.Lxts_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja .Lxts_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+.Lxts_dec_epilogue:
+ .byte 0xf3,0xc3
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+.type _bsaes_const,@object
+.align 64
+_bsaes_const:
+.LM0ISR:
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:
+.quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+.quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP:
+.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+.quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:
+.quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+.quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+.quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+.quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+.quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+.quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+.quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+.quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+.long 0x87,0,1,0
+.Lmasks:
+.quad 0x0101010101010101, 0x0101010101010101
+.quad 0x0202020202020202, 0x0202020202020202
+.quad 0x0404040404040404, 0x0404040404040404
+.quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+.quad 0x6363636363636363, 0x6363636363636363
+.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
+.align 64
+.size _bsaes_const,.-_bsaes_const
diff --git a/main/openssl/crypto/aes/asm/bsaes-x86_64.pl b/main/openssl/crypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 00000000..41b90f08
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,3108 @@
+#!/usr/bin/env perl
+
+###################################################################
+### AES-128 [originally in CTR mode] ###
+### bitsliced implementation for Intel Core 2 processors ###
+### requires support of SSE extensions up to SSSE3 ###
+### Author: Emilia Käsper and Peter Schwabe ###
+### Date: 2009-03-19 ###
+### Public domain ###
+### ###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
+### further information. ###
+###################################################################
+#
+# September 2011.
+#
+# Started as a transliteration to "perlasm", the original code has
+# undergone the following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop resulting in >5x size reduction
+# from 12.5KB to 2.2KB;
+# - the above was possible thanks to a mixcolumns() modification that
+#   allows feeding its output back to aesenc[last]; this was
+#   achieved at the cost of two additional inter-register moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement a key setup subroutine; instead it
+# relies on conversion of "conventional" key schedule as returned
+# by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which allowed
+#   skipping one shiftrows(), reducing the bit-sliced key schedule
+#   and speeding up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# Resulting performance in CPU cycles spent to encrypt one byte out
+# of 4096-byte buffer with 128-bit key is:
+#
+# Emilia's this(*) difference
+#
+# Core 2 9.30 8.69 +7%
+# Nehalem(**) 7.63 6.98 +9%
+# Atom 17.1 17.4 -2%(***)
+#
+# (*) Comparison is not completely fair, because "this" is ECB,
+# i.e. no extra processing such as counter values calculation
+# and xor-ing input as in Emilia's CTR implementation is
+#	performed. However, the CTR calculations account for no more
+# than 1% of total time, so comparison is *rather* fair.
+#
+# (**) Results were collected on Westmere, which is considered to
+# be equivalent to Nehalem for this code.
+#
+# (***)	Slowdown on Atom is rather strange per se, because the original
+#	implementation has a number of 9+-byte instructions, which
+#	are bad for the Atom front-end and which I eliminated completely.
+#	In an attempt to address the deterioration, sbox() was tested
+#	in the FP SIMD "domain" (movaps instead of movdqa, xorps
+#	instead of pxor, etc.). While it resulted in a nominal 4%
+#	improvement on Atom, it hurt Westmere by more than a 2x factor.
+#
+# As for the key schedule conversion subroutine: the interface to
+# OpenSSL relies on per-invocation on-the-fly conversion. This
+# naturally has an impact on performance, especially for short
+# inputs. Conversion
+# time in CPU cycles and its ratio to CPU cycles spent in 8x block
+# function is:
+#
+# conversion conversion/8x block
+# Core 2 240 0.22
+# Nehalem 180 0.20
+# Atom 430 0.19
+#
+# The ratio values mean that 128-byte blocks will be processed
+# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
+# etc. Then keep in mind that input sizes not divisible by 128 are
+# *effectively* slower, especially the shortest ones, e.g. consecutive
+# 144-byte blocks are processed 44% slower than one would expect,
+# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
+# it's still faster than ["hyper-threading-safe" code path in]
+# aes-x86_64.pl on all lengths above 64 bytes...
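+#
+# (A worked illustration, assuming the costs simply add up: on Core 2 a
+# 128-byte block takes 128*8.69 = 1112 cycles plus 240 cycles of
+# conversion, so throughput drops by 0.22/1.22 = 18%; at 256 bytes the
+# conversion is amortized over two 8x blocks, 0.11/1.11 = 10%, matching
+# the 16-18% and 9-10% figures above.)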
+#
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte out of 4096-byte buffer with 128-bit key is:
+#
+# Core 2 9.83
+# Nehalem 7.74
+# Atom 19.0
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
+# suboptimal, but XTS is meant to be used with larger blocks...
+#
+# <appro@openssl.org>
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
+my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
+my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11");
+
+sub Sbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InBasisChange (@b);
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
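+
+# The S-box is thus evaluated as basis change -> inversion in GF(2^8) ->
+# inverse basis change, with the inversion carried out in a tower-field
+# representation by Inv_GF256 below; the affine constant 0x63 is not
+# added here because _bsaes_key_convert in effect folds it into the
+# round keys (a descriptive summary of the structure, hedged).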
+
+sub InBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[5]
+ pxor @b[1], @b[2]
+ pxor @b[0], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[0], @b[5]
+
+ pxor @b[3], @b[6]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[1], @b[3]
+
+ pxor @b[7], @b[2]
+ pxor @b[5], @b[1]
+___
+}
+
+sub OutBasisChange {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[1], @b[6]
+
+ pxor @b[5], @b[1]
+ pxor @b[3], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+
+ pxor @b[7], @b[4]
+___
+}
+
+sub InvSbox {
+# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+ &InvInBasisChange (@b);
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange { # OutBasisChange in reverse
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+ pxor @b[7], @b[4]
+
+ pxor @b[5], @b[7]
+ pxor @b[5], @b[2]
+ pxor @b[7], @b[3]
+ pxor @b[3], @b[5]
+ pxor @b[5], @b[1]
+
+ pxor @b[1], @b[6]
+ pxor @b[0], @b[2]
+ pxor @b[6], @b[4]
+ pxor @b[6], @b[0]
+ pxor @b[4], @b[1]
+___
+}
+
+sub InvOutBasisChange { # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+ pxor @b[5], @b[1]
+ pxor @b[7], @b[2]
+
+ pxor @b[1], @b[3]
+ pxor @b[5], @b[4]
+ pxor @b[5], @b[7]
+ pxor @b[4], @b[3]
+ pxor @b[0], @b[5]
+ pxor @b[7], @b[3]
+ pxor @b[2], @b[6]
+ pxor @b[1], @b[2]
+ pxor @b[3], @b[6]
+
+ pxor @b[0], @b[3]
+ pxor @b[6], @b[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x1, $x0
+ pxor $t0, $x1
+___
+}
+
+sub Mul_GF4_N { # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ pxor $y1, $t0
+ pand $x0, $t0
+ pxor $x1, $x0
+ pand $y0, $x1
+ pand $y1, $x0
+ pxor $x0, $x1
+ pxor $t0, $x0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+ $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+ movdqa $y0, $t0
+ movdqa $y2, $t1
+ pxor $y1, $t0
+ pxor $y3, $t1
+ pand $x0, $t0
+ pand $x2, $t1
+ pxor $x1, $x0
+ pxor $x3, $x2
+ pand $y0, $x1
+ pand $y2, $x3
+ pand $y1, $x0
+ pand $y3, $x2
+ pxor $x0, $x1
+ pxor $x3, $x2
+ pxor $t0, $x0
+ pxor $t1, $x3
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+ movdqa @x[0], @t[0]
+ movdqa @x[1], @t[1]
+___
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
+$code.=<<___;
+ pxor @x[2], @t[0]
+ pxor @x[3], @t[1]
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @t[0], @x[0]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[1]
+ pxor @t[1], @x[3]
+
+ movdqa @x[4], @t[0]
+ movdqa @x[5], @t[1]
+ pxor @x[6], @t[0]
+ pxor @x[7], @t[1]
+___
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+ pxor @y[2], @y[0]
+ pxor @y[3], @y[1]
+___
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
+$code.=<<___;
+ pxor @t[0], @x[4]
+ pxor @t[0], @x[6]
+ pxor @t[1], @x[5]
+ pxor @t[1], @x[7]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+ movdqa @x[4], @t[3]
+ movdqa @x[5], @t[2]
+ movdqa @x[1], @t[1]
+ movdqa @x[7], @s[1]
+ movdqa @x[0], @s[0]
+
+ pxor @x[6], @t[3]
+ pxor @x[7], @t[2]
+ pxor @x[3], @t[1]
+ movdqa @t[3], @s[2]
+ pxor @x[6], @s[1]
+ movdqa @t[2], @t[0]
+ pxor @x[2], @s[0]
+ movdqa @t[3], @s[3]
+
+ por @t[1], @t[2]
+ por @s[0], @t[3]
+ pxor @t[0], @s[3]
+ pand @s[0], @s[2]
+ pxor @t[1], @s[0]
+ pand @t[1], @t[0]
+ pand @s[0], @s[3]
+ movdqa @x[3], @s[0]
+ pxor @x[2], @s[0]
+ pand @s[0], @s[1]
+ pxor @s[1], @t[3]
+ pxor @s[1], @t[2]
+ movdqa @x[4], @s[1]
+ movdqa @x[1], @s[0]
+ pxor @x[5], @s[1]
+ pxor @x[0], @s[0]
+ movdqa @s[1], @t[1]
+ pand @s[0], @s[1]
+ por @s[0], @t[1]
+ pxor @s[1], @t[0]
+ pxor @s[3], @t[3]
+ pxor @s[2], @t[2]
+ pxor @s[3], @t[1]
+ movdqa @x[7], @s[0]
+ pxor @s[2], @t[0]
+ movdqa @x[6], @s[1]
+ pxor @s[2], @t[1]
+ movdqa @x[5], @s[2]
+ pand @x[3], @s[0]
+ movdqa @x[4], @s[3]
+ pand @x[2], @s[1]
+ pand @x[1], @s[2]
+ por @x[0], @s[3]
+ pxor @s[0], @t[3]
+ pxor @s[1], @t[2]
+ pxor @s[2], @t[1]
+ pxor @s[3], @t[0]
+
+ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+ # new smaller inversion
+
+ movdqa @t[3], @s[0]
+ pand @t[1], @t[3]
+ pxor @t[2], @s[0]
+
+ movdqa @t[0], @s[2]
+ movdqa @s[0], @s[3]
+ pxor @t[3], @s[2]
+ pand @s[2], @s[3]
+
+ movdqa @t[1], @s[1]
+ pxor @t[2], @s[3]
+ pxor @t[0], @s[1]
+
+ pxor @t[2], @t[3]
+
+ pand @t[3], @s[1]
+
+ movdqa @s[2], @t[2]
+ pxor @t[0], @s[1]
+
+ pxor @s[1], @t[2]
+ pxor @s[1], @t[1]
+
+ pand @t[0], @t[2]
+
+ pxor @t[2], @s[2]
+ pxor @t[2], @t[1]
+
+ pand @s[3], @s[2]
+
+ pxor @s[0], @s[2]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my $mask=pop;
+$code.=<<___;
+ pxor 0x00($key),@x[0]
+ pxor 0x10($key),@x[1]
+ pshufb $mask,@x[0]
+ pxor 0x20($key),@x[2]
+ pshufb $mask,@x[1]
+ pxor 0x30($key),@x[3]
+ pshufb $mask,@x[2]
+ pxor 0x40($key),@x[4]
+ pshufb $mask,@x[3]
+ pxor 0x50($key),@x[5]
+ pshufb $mask,@x[4]
+ pxor 0x60($key),@x[6]
+ pshufb $mask,@x[5]
+ pxor 0x70($key),@x[7]
+ pshufb $mask,@x[6]
+ lea 0x80($key),$key
+ pshufb $mask,@x[7]
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16]; # optional
+$code.=<<___;
+ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
+ pshufd \$0x93, @x[2], @t[2]
+ pxor @t[1], @x[1]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @t[2], @x[2]
+ pshufd \$0x93, @x[4], @t[4]
+ pxor @t[3], @x[3]
+ pshufd \$0x93, @x[5], @t[5]
+ pxor @t[4], @x[4]
+ pshufd \$0x93, @x[6], @t[6]
+ pxor @t[5], @x[5]
+ pshufd \$0x93, @x[7], @t[7]
+ pxor @t[6], @x[6]
+ pxor @t[7], @x[7]
+
+ pxor @x[0], @t[1]
+ pxor @x[7], @t[0]
+ pxor @x[7], @t[1]
+ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
+ pxor @x[1], @t[2]
+ pshufd \$0x4E, @x[1], @x[1]
+ pxor @x[4], @t[5]
+ pxor @t[0], @x[0]
+ pxor @x[5], @t[6]
+ pxor @t[1], @x[1]
+ pxor @x[3], @t[4]
+ pshufd \$0x4E, @x[4], @t[0]
+ pxor @x[6], @t[7]
+ pshufd \$0x4E, @x[5], @t[1]
+ pxor @x[2], @t[3]
+ pshufd \$0x4E, @x[3], @x[4]
+ pxor @x[7], @t[3]
+ pshufd \$0x4E, @x[7], @x[5]
+ pxor @x[7], @t[4]
+ pshufd \$0x4E, @x[6], @x[3]
+ pxor @t[4], @t[0]
+ pshufd \$0x4E, @x[2], @x[6]
+ pxor @t[5], @t[1]
+___
+$code.=<<___ if (!$inv);
+ pxor @t[3], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[6], @x[3]
+ movdqa @t[0], @x[2]
+ pxor @t[2], @x[6]
+ movdqa @t[1], @x[7]
+___
+$code.=<<___ if ($inv);
+ pxor @x[4], @t[3]
+ pxor @t[7], @x[5]
+ pxor @x[3], @t[6]
+ movdqa @t[0], @x[3]
+ pxor @t[2], @x[6]
+ movdqa @t[6], @x[2]
+ movdqa @t[1], @x[7]
+ movdqa @x[6], @x[4]
+ movdqa @t[3], @x[6]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+ # multiplication by 0x0e
+ pshufd \$0x93, @x[7], @t[7]
+ movdqa @x[2], @t[2]
+ pxor @x[5], @x[7] # 7 5
+ pxor @x[5], @x[2] # 2 5
+ pshufd \$0x93, @x[0], @t[0]
+ movdqa @x[5], @t[5]
+ pxor @x[0], @x[5] # 5 0 [1]
+ pxor @x[1], @x[0] # 0 1
+ pshufd \$0x93, @x[1], @t[1]
+ pxor @x[2], @x[1] # 1 25
+ pxor @x[6], @x[0] # 01 6 [2]
+ pxor @x[3], @x[1] # 125 3 [4]
+ pshufd \$0x93, @x[3], @t[3]
+ pxor @x[0], @x[2] # 25 016 [3]
+ pxor @x[7], @x[3] # 3 75
+ pxor @x[6], @x[7] # 75 6 [0]
+ pshufd \$0x93, @x[6], @t[6]
+ movdqa @x[4], @t[4]
+ pxor @x[4], @x[6] # 6 4
+ pxor @x[3], @x[4] # 4 375 [6]
+ pxor @x[7], @x[3] # 375 756=36
+ pxor @t[5], @x[6] # 64 5 [7]
+ pxor @t[2], @x[3] # 36 2
+ pxor @t[4], @x[3] # 362 4 [5]
+ pshufd \$0x93, @t[5], @t[5]
+___
+ my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+ # multiplication by 0x0b
+ pxor @y[0], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[1], @y[1]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[5], @y[0]
+ pxor @t[6], @y[1]
+ pxor @t[7], @y[0]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @t[7] # clobber t[7]
+ pxor @y[0], @y[1]
+
+ pxor @t[0], @y[3]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[1], @y[2]
+ pxor @t[1], @y[4]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[1], @t[1]
+ pxor @t[2], @y[3]
+ pxor @t[2], @y[5]
+ pxor @t[7], @y[2]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[3], @y[3]
+ pxor @t[3], @y[6]
+ pxor @t[3], @y[4]
+ pshufd \$0x93, @t[3], @t[3]
+ pxor @t[4], @y[7]
+ pxor @t[4], @y[5]
+ pxor @t[7], @y[7]
+ pxor @t[5], @y[3]
+ pxor @t[4], @y[4]
+ pxor @t[5], @t[7] # clobber t[7] even more
+
+ pxor @t[7], @y[5]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[7], @y[6]
+ pxor @t[7], @y[4]
+
+ pxor @t[5], @t[7]
+ pshufd \$0x93, @t[5], @t[5]
+ pxor @t[6], @t[7] # restore t[7]
+
+ # multiplication by 0x0d
+ pxor @y[7], @y[4]
+ pxor @t[4], @y[7]
+ pshufd \$0x93, @t[6], @t[6]
+ pxor @t[0], @y[2]
+ pxor @t[5], @y[7]
+ pxor @t[2], @y[2]
+ pshufd \$0x93, @t[7], @t[7]
+
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[1]
+ pxor @t[0], @y[0]
+ pxor @t[0], @y[3]
+ pxor @t[5], @y[1]
+ pxor @t[5], @y[0]
+ pxor @t[7], @y[1]
+ pshufd \$0x93, @t[0], @t[0]
+ pxor @t[6], @y[0]
+ pxor @y[1], @y[3]
+ pxor @t[1], @y[4]
+ pshufd \$0x93, @t[1], @t[1]
+
+ pxor @t[7], @y[7]
+ pxor @t[2], @y[4]
+ pxor @t[2], @y[5]
+ pshufd \$0x93, @t[2], @t[2]
+ pxor @t[6], @y[2]
+ pxor @t[3], @t[6] # clobber t[6]
+ pxor @y[7], @y[4]
+ pxor @t[6], @y[3]
+
+ pxor @t[6], @y[6]
+ pxor @t[5], @y[5]
+ pxor @t[4], @y[6]
+ pshufd \$0x93, @t[4], @t[4]
+ pxor @t[6], @y[5]
+ pxor @t[7], @y[6]
+ pxor @t[3], @t[6] # restore t[6]
+
+ pshufd \$0x93, @t[5], @t[5]
+ pshufd \$0x93, @t[6], @t[6]
+ pshufd \$0x93, @t[7], @t[7]
+ pshufd \$0x93, @t[3], @t[3]
+
+ # multiplication by 0x09
+ pxor @y[1], @y[4]
+ pxor @y[1], @t[1] # t[1]=y[1]
+ pxor @t[5], @t[0] # clobber t[0]
+ pxor @t[5], @t[1]
+ pxor @t[0], @y[3]
+ pxor @y[0], @t[0] # t[0]=y[0]
+ pxor @t[6], @t[1]
+ pxor @t[7], @t[6] # clobber t[6]
+ pxor @t[1], @y[4]
+ pxor @t[4], @y[7]
+ pxor @y[4], @t[4] # t[4]=y[4]
+ pxor @t[3], @y[6]
+ pxor @y[3], @t[3] # t[3]=y[3]
+ pxor @t[2], @y[5]
+ pxor @y[2], @t[2] # t[2]=y[2]
+ pxor @t[7], @t[3]
+ pxor @y[5], @t[5] # t[5]=y[5]
+ pxor @t[6], @t[2]
+ pxor @t[6], @t[5]
+ pxor @y[6], @t[6] # t[6]=y[6]
+ pxor @y[7], @t[7] # t[7]=y[7]
+
+ movdqa @t[0],@XMM[0]
+ movdqa @t[1],@XMM[1]
+ movdqa @t[2],@XMM[2]
+ movdqa @t[3],@XMM[3]
+ movdqa @t[4],@XMM[4]
+ movdqa @t[5],@XMM[5]
+ movdqa @t[6],@XMM[6]
+ movdqa @t[7],@XMM[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
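+# Spot-check over GF(2^8), for illustration: entry (1,1) is
+# 02*05 ^ 03*00 ^ 01*04 ^ 01*00 = 0x0a ^ 0x04 = 0x0e, and entry (1,2)
+# is 03*05 ^ 01*04 = 0x0f ^ 0x04 = 0x0b, as required.
+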
+$code.=<<___;
+ # multiplication by 0x05-0x00-0x04-0x00
+ pshufd \$0x4E, @x[0], @t[0]
+ pshufd \$0x4E, @x[6], @t[6]
+ pxor @x[0], @t[0]
+ pshufd \$0x4E, @x[7], @t[7]
+ pxor @x[6], @t[6]
+ pshufd \$0x4E, @x[1], @t[1]
+ pxor @x[7], @t[7]
+ pshufd \$0x4E, @x[2], @t[2]
+ pxor @x[1], @t[1]
+ pshufd \$0x4E, @x[3], @t[3]
+ pxor @x[2], @t[2]
+ pxor @t[6], @x[0]
+ pxor @t[6], @x[1]
+ pshufd \$0x4E, @x[4], @t[4]
+ pxor @x[3], @t[3]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[3]
+ pshufd \$0x4E, @x[5], @t[5]
+ pxor @x[4], @t[4]
+ pxor @t[7], @x[1]
+ pxor @t[2], @x[4]
+ pxor @x[5], @t[5]
+
+ pxor @t[7], @x[2]
+ pxor @t[6], @x[3]
+ pxor @t[6], @x[4]
+ pxor @t[3], @x[5]
+ pxor @t[4], @x[6]
+ pxor @t[7], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[5], @x[7]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
+sub aesenc { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x30($const),@t[0] # .LSR
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+ &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
+}
+
+sub aesenclast { # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+ movdqa 0x40($const),@t[0] # .LSRM0
+___
+ &ShiftRows (@b,@t[0]);
+ &Sbox (@b,@t);
+$code.=<<___
+ pxor 0x00($key),@b[0]
+ pxor 0x10($key),@b[1]
+ pxor 0x20($key),@b[4]
+ pxor 0x30($key),@b[6]
+ pxor 0x40($key),@b[3]
+ pxor 0x50($key),@b[7]
+ pxor 0x60($key),@b[2]
+ pxor 0x70($key),@b[5]
+___
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+ movdqa $b,$t
+ psrlq \$$n,$b
+ pxor $a,$b
+ pand $mask,$b
+ pxor $b,$a
+ psllq \$$n,$b
+ pxor $t,$b
+___
+}
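+# swapmove is the classic delta swap: with m = (($b>>$n) ^ $a) & $mask
+# it computes $a ^= m and $b ^= m<<$n, exchanging the masked bits of $a
+# with the bits of $b that sit $n positions higher. Three passes with
+# n = 1, 2, 4 and masks .LBS0/.LBS1/.LBS2 (see bitslice below) perform
+# the 8x8 bit-matrix transposition that, up to the ordering conventions
+# used here, puts bit k of every byte into register k.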
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+ movdqa $b0,$t0
+ psrlq \$$n,$b0
+ movdqa $b1,$t1
+ psrlq \$$n,$b1
+ pxor $a0,$b0
+ pxor $a1,$b1
+ pand $mask,$b0
+ pand $mask,$b1
+ pxor $b0,$a0
+ psllq \$$n,$b0
+ pxor $b1,$a1
+ psllq \$$n,$b1
+ pxor $t0,$b0
+ pxor $t1,$b1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+ movdqa 0x00($const),$t0 # .LBS0
+ movdqa 0x10($const),$t1 # .LBS1
+___
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+ movdqa 0x20($const),$t0 # .LBS2
+___
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+.text
+
+.extern asm_AES_encrypt
+.extern asm_AES_decrypt
+
+.type _bsaes_encrypt8,\@abi-omnipotent
+.align 64
+_bsaes_encrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa 0x50($const), @XMM[8] # .LM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Lenc_sbox
+.align 16
+.Lenc_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Lenc_sbox:\n";
+ &Sbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Lenc_done
+___
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+ movdqa 0x30($const), @XMM[8] # .LSR
+ jnz .Lenc_loop
+ movdqa 0x40($const), @XMM[8] # .LSRM0
+ jmp .Lenc_loop
+.align 16
+.Lenc_done:
+___
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_decrypt8,\@abi-omnipotent
+.align 64
+_bsaes_decrypt8:
+ lea .LBS0(%rip), $const # constants table
+
+ movdqa ($key), @XMM[9] # round 0 key
+ lea 0x10($key), $key
+ movdqa -0x30($const), @XMM[8] # .LM0ISR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ pshufb @XMM[8], @XMM[7]
+___
+ &bitslice (@XMM[0..7, 8..11]);
+$code.=<<___;
+ dec $rounds
+ jmp .Ldec_sbox
+.align 16
+.Ldec_loop:
+___
+ &ShiftRows (@XMM[0..7, 8]);
+$code.=".Ldec_sbox:\n";
+ &InvSbox (@XMM[0..7, 8..15]);
+$code.=<<___;
+ dec $rounds
+ jl .Ldec_done
+___
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+ movdqa -0x10($const), @XMM[8] # .LISR
+ jnz .Ldec_loop
+ movdqa -0x20($const), @XMM[8] # .LISRM0
+ jmp .Ldec_loop
+.align 16
+.Ldec_done:
+___
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+ movdqa ($key), @XMM[8] # last round key
+ pxor @XMM[8], @XMM[6]
+ pxor @XMM[8], @XMM[4]
+ pxor @XMM[8], @XMM[2]
+ pxor @XMM[8], @XMM[7]
+ pxor @XMM[8], @XMM[3]
+ pxor @XMM[8], @XMM[5]
+ pxor @XMM[8], @XMM[0]
+ pxor @XMM[8], @XMM[1]
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
+ movdqa @x[0], @x[2]
+ movdqa @x[1], @x[3]
+___
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+ movdqa @x[0], @x[4]
+ movdqa @x[2], @x[6]
+ movdqa @x[1], @x[5]
+ movdqa @x[3], @x[7]
+___
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type _bsaes_key_convert,\@abi-omnipotent
+.align 16
+_bsaes_key_convert:
+ lea .Lmasks(%rip), $const
+ movdqu ($inp), %xmm7 # load round 0 key
+ lea 0x10($inp), $inp
+ movdqa 0x00($const), %xmm0 # 0x01...
+ movdqa 0x10($const), %xmm1 # 0x02...
+ movdqa 0x20($const), %xmm2 # 0x04...
+ movdqa 0x30($const), %xmm3 # 0x08...
+ movdqa 0x40($const), %xmm4 # .LM0
+ pcmpeqd %xmm5, %xmm5 # .LNOT
+
+ movdqu ($inp), %xmm6 # load round 1 key
+ movdqa %xmm7, ($out) # save round 0 key
+ lea 0x10($out), $out
+ dec $rounds
+ jmp .Lkey_loop
+.align 16
+.Lkey_loop:
+ pshufb %xmm4, %xmm6 # .LM0
+
+ movdqa %xmm0, %xmm8
+ movdqa %xmm1, %xmm9
+
+ pand %xmm6, %xmm8
+ pand %xmm6, %xmm9
+ movdqa %xmm2, %xmm10
+ pcmpeqb %xmm0, %xmm8
+ psllq \$4, %xmm0 # 0x10...
+ movdqa %xmm3, %xmm11
+ pcmpeqb %xmm1, %xmm9
+ psllq \$4, %xmm1 # 0x20...
+
+ pand %xmm6, %xmm10
+ pand %xmm6, %xmm11
+ movdqa %xmm0, %xmm12
+ pcmpeqb %xmm2, %xmm10
+ psllq \$4, %xmm2 # 0x40...
+ movdqa %xmm1, %xmm13
+ pcmpeqb %xmm3, %xmm11
+ psllq \$4, %xmm3 # 0x80...
+
+ movdqa %xmm2, %xmm14
+ movdqa %xmm3, %xmm15
+ pxor %xmm5, %xmm8 # "pnot"
+ pxor %xmm5, %xmm9
+
+ pand %xmm6, %xmm12
+ pand %xmm6, %xmm13
+ movdqa %xmm8, 0x00($out) # write bit-sliced round key
+ pcmpeqb %xmm0, %xmm12
+ psrlq \$4, %xmm0 # 0x01...
+ movdqa %xmm9, 0x10($out)
+ pcmpeqb %xmm1, %xmm13
+ psrlq \$4, %xmm1 # 0x02...
+ lea 0x10($inp), $inp
+
+ pand %xmm6, %xmm14
+ pand %xmm6, %xmm15
+ movdqa %xmm10, 0x20($out)
+ pcmpeqb %xmm2, %xmm14
+ psrlq \$4, %xmm2 # 0x04...
+ movdqa %xmm11, 0x30($out)
+ pcmpeqb %xmm3, %xmm15
+ psrlq \$4, %xmm3 # 0x08...
+ movdqu ($inp), %xmm6 # load next round key
+
+ pxor %xmm5, %xmm13 # "pnot"
+ pxor %xmm5, %xmm14
+ movdqa %xmm12, 0x40($out)
+ movdqa %xmm13, 0x50($out)
+ movdqa %xmm14, 0x60($out)
+ movdqa %xmm15, 0x70($out)
+ lea 0x80($out),$out
+ dec $rounds
+ jnz .Lkey_loop
+
+ movdqa 0x50($const), %xmm7 # .L63
+ #movdqa %xmm6, ($out) # don't save last round key
+ ret
+.size _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0 && !$win64) { # following four functions are unsupported interface
+ # used for benchmarking...
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type bsaes_enc_key_convert,\@function,2
+.align 16
+bsaes_enc_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+ ret
+.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type bsaes_encrypt_128,\@function,4
+.align 16
+bsaes_encrypt_128:
+.Lenc128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Lenc128_loop
+ ret
+.size bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type bsaes_dec_key_convert,\@function,2
+.align 16
+bsaes_dec_key_convert:
+ mov 240($inp),%r10d # pass rounds
+ mov $inp,%rcx # pass key
+ mov $out,%rax # pass key schedule
+ call _bsaes_key_convert
+ pxor ($out),%xmm7 # fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,($out)
+ ret
+.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type bsaes_decrypt_128,\@function,4
+.align 16
+bsaes_decrypt_128:
+.Ldec128_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ movdqu 0x60($inp), @XMM[6]
+ movdqu 0x70($inp), @XMM[7]
+ mov $key, %rax # pass the $key
+ lea 0x80($inp), $inp
+ mov \$10,%r10d
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$0x80,$len
+ ja .Ldec128_loop
+ ret
+.size bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
+ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
+
+if ($ecb) {
+$code.=<<___;
+.globl bsaes_ecb_encrypt_blocks
+.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_encrypt_blocks:
+ mov %rsp, %rax
+.Lecb_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_enc_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
+ sub \$8,$len
+.Lecb_enc_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_encrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_enc_loop
+
+ add \$8,$len
+ jz .Lecb_enc_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_enc_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_enc_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_enc_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_enc_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_enc_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_enc_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_six:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_five:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_four:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_three:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_two:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_one:
+ call _bsaes_encrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_enc_done
+.align 16
+.Lecb_enc_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_enc_short
+
+.Lecb_enc_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_enc_epilogue:
+ ret
+.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
+
+.globl bsaes_ecb_decrypt_blocks
+.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_decrypt_blocks:
+ mov %rsp, %rax
+.Lecb_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+ mov %rsp,%rbp # backup %rsp
+ mov 240($arg4),%eax # rounds
+ mov $arg1,$inp # backup arguments
+ mov $arg2,$out
+ mov $arg3,$len
+ mov $arg4,$key
+ cmp \$8,$arg3
+ jb .Lecb_dec_short
+
+ mov %eax,%ebx # backup rounds
+ shl \$7,%rax # 128 bytes per inner round key
+ sub \$`128-32`,%rax # size of bit-sliced key schedule
+ sub %rax,%rsp
+ mov %rsp,%rax # pass key schedule
+ mov $key,%rcx # pass key
+ mov %ebx,%r10d # pass rounds
+ call _bsaes_key_convert
+	pxor	(%rsp),%xmm7		# fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ sub \$8,$len
+.Lecb_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %ebx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ lea 0x80($inp), $inp
+
+ call _bsaes_decrypt8
+
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lecb_dec_loop
+
+ add \$8,$len
+ jz .Lecb_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %ebx,%r10d # pass rounds
+ cmp \$2,$len
+ jb .Lecb_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lecb_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lecb_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lecb_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lecb_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lecb_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_six:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_five:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_four:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_three:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_two:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_one:
+ call _bsaes_decrypt8
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lecb_dec_done
+.align 16
+.Lecb_dec_short:
+ lea ($inp), $arg1
+ lea ($out), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt
+ lea 16($inp), $inp
+ lea 16($out), $out
+ dec $len
+ jnz .Lecb_dec_short
+
+.Lecb_dec_done:
+ lea (%rsp),%rax
+ pxor %xmm0, %xmm0
+.Lecb_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ jb .Lecb_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lecb_dec_epilogue:
+ ret
+.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+$code.=<<___;
+.extern asm_AES_cbc_encrypt
+.globl bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,\@abi-omnipotent
+.align 16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+ mov 48(%rsp),$arg6 # pull direction flag
+___
+$code.=<<___;
+ cmp \$0,$arg6
+ jne asm_AES_cbc_encrypt
+ cmp \$128,$arg3
+ jb asm_AES_cbc_encrypt
+
+ mov %rsp, %rax
+.Lcbc_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ mov $arg5, %rbx
+ shr \$4, $len # bytes to blocks
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+	pxor	(%rsp),%xmm7		# fix up round 0 key
+ movdqa %xmm6,(%rax) # save last round key
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx), @XMM[15] # load IV
+ sub \$8,$len
+.Lcbc_dec_loop:
+ movdqu 0x00($inp), @XMM[0] # load input
+ movdqu 0x10($inp), @XMM[1]
+ movdqu 0x20($inp), @XMM[2]
+ movdqu 0x30($inp), @XMM[3]
+ movdqu 0x40($inp), @XMM[4]
+ movdqu 0x50($inp), @XMM[5]
+ mov %rsp, %rax # pass key schedule
+ movdqu 0x60($inp), @XMM[6]
+ mov %edx,%r10d # pass rounds
+ movdqu 0x70($inp), @XMM[7]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+
+ call _bsaes_decrypt8
+
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[13], @XMM[3]
+ movdqu 0x70($inp), @XMM[15] # IV
+ pxor @XMM[14], @XMM[5]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x80($inp), $inp
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ sub \$8,$len
+ jnc .Lcbc_dec_loop
+
+ add \$8,$len
+ jz .Lcbc_dec_done
+
+ movdqu 0x00($inp), @XMM[0] # load input
+ mov %rsp, %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+ cmp \$2,$len
+ jb .Lcbc_dec_one
+ movdqu 0x10($inp), @XMM[1]
+ je .Lcbc_dec_two
+ movdqu 0x20($inp), @XMM[2]
+ cmp \$4,$len
+ jb .Lcbc_dec_three
+ movdqu 0x30($inp), @XMM[3]
+ je .Lcbc_dec_four
+ movdqu 0x40($inp), @XMM[4]
+ cmp \$6,$len
+ jb .Lcbc_dec_five
+ movdqu 0x50($inp), @XMM[5]
+ je .Lcbc_dec_six
+ movdqu 0x60($inp), @XMM[6]
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[12], @XMM[7]
+ movdqu 0x60($inp), @XMM[15] # IV
+ pxor @XMM[13], @XMM[3]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[11], @XMM[2]
+ movdqu 0x50($inp), @XMM[15] # IV
+ pxor @XMM[12], @XMM[7]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[10], @XMM[4]
+ movdqu 0x40($inp), @XMM[15] # IV
+ pxor @XMM[11], @XMM[2]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[9], @XMM[6]
+ movdqu 0x30($inp), @XMM[15] # IV
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[8], @XMM[1]
+ movdqu 0x20($inp), @XMM[15] # IV
+ pxor @XMM[9], @XMM[6]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[8] # re-load input
+ movdqu 0x10($inp), @XMM[15] # IV
+ pxor @XMM[8], @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+ lea ($inp), $arg1
+ lea 0x20(%rbp), $arg2 # buffer output
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[15] # ^= IV
+ movdqu @XMM[15], ($out) # write output
+ movdqa @XMM[0], @XMM[15] # IV
+
+.Lcbc_dec_done:
+ movdqu @XMM[15], (%rbx) # return IV
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lcbc_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lcbc_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lcbc_dec_epilogue:
+ ret
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ctr32_encrypt_blocks:
+ mov %rsp, %rax
+.Lctr_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ movdqu ($arg5), %xmm0 # load counter
+ mov 240($arg4), %eax # rounds
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+ movdqa %xmm0, 0x20(%rbp) # copy counter
+ cmp \$8, $arg3
+ jb .Lctr_enc_short
+
+ mov %eax, %ebx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %ebx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7 # fix up last round key
+ movdqa %xmm7,(%rax) # save last round key
+
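+	# Keep the counter with its top 32-bit word byte-swapped so plain paddd
+	# can increment it; shuffle the round-0 key the same way so the initial
+	# whitening xor in the loop below still lines up.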
+ movdqa (%rsp), @XMM[9] # load round0 key
+ lea .LADD1(%rip), %r11
+ movdqa 0x20(%rbp), @XMM[0] # counter copy
+ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
+ pshufb @XMM[8], @XMM[9] # byte swap upper part
+ pshufb @XMM[8], @XMM[0]
+ movdqa @XMM[9], (%rsp) # save adjusted round0 key
+ jmp .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+ movdqa @XMM[0], 0x20(%rbp) # save counter
+ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
+ movdqa @XMM[0], @XMM[2]
+ paddd 0x00(%r11), @XMM[1] # .LADD1
+ movdqa @XMM[0], @XMM[3]
+ paddd 0x10(%r11), @XMM[2] # .LADD2
+ movdqa @XMM[0], @XMM[4]
+ paddd 0x20(%r11), @XMM[3] # .LADD3
+ movdqa @XMM[0], @XMM[5]
+ paddd 0x30(%r11), @XMM[4] # .LADD4
+ movdqa @XMM[0], @XMM[6]
+ paddd 0x40(%r11), @XMM[5] # .LADD5
+ movdqa @XMM[0], @XMM[7]
+ paddd 0x50(%r11), @XMM[6] # .LADD6
+ paddd 0x60(%r11), @XMM[7] # .LADD7
+
+ # Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ # to flip byte order in 32-bit counter
+ movdqa (%rsp), @XMM[9] # round 0 key
+ lea 0x10(%rsp), %rax # pass key schedule
+ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
+ pxor @XMM[9], @XMM[0] # xor with round0 key
+ pxor @XMM[9], @XMM[1]
+ pshufb @XMM[8], @XMM[0]
+ pxor @XMM[9], @XMM[2]
+ pshufb @XMM[8], @XMM[1]
+ pxor @XMM[9], @XMM[3]
+ pshufb @XMM[8], @XMM[2]
+ pxor @XMM[9], @XMM[4]
+ pshufb @XMM[8], @XMM[3]
+ pxor @XMM[9], @XMM[5]
+ pshufb @XMM[8], @XMM[4]
+ pxor @XMM[9], @XMM[6]
+ pshufb @XMM[8], @XMM[5]
+ pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[6]
+ lea .LBS0(%rip), %r11 # constants table
+ pshufb @XMM[8], @XMM[7]
+ mov %ebx,%r10d # pass rounds
+
+ call _bsaes_encrypt8_bitslice
+
+ sub \$8,$len
+ jc .Lctr_enc_loop_done
+
+ movdqu 0x00($inp), @XMM[8] # load input
+ movdqu 0x10($inp), @XMM[9]
+ movdqu 0x20($inp), @XMM[10]
+ movdqu 0x30($inp), @XMM[11]
+ movdqu 0x40($inp), @XMM[12]
+ movdqu 0x50($inp), @XMM[13]
+ movdqu 0x60($inp), @XMM[14]
+ movdqu 0x70($inp), @XMM[15]
+ lea 0x80($inp),$inp
+ pxor @XMM[0], @XMM[8]
+ movdqa 0x20(%rbp), @XMM[0] # load counter
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[8], 0x00($out) # write output
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor @XMM[15], @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ lea .LADD1(%rip), %r11
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+ paddd 0x70(%r11), @XMM[0] # .LADD8
+ jnz .Lctr_enc_loop
+
+ jmp .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+ add \$8, $len
+ movdqu 0x00($inp), @XMM[8] # load input
+ pxor @XMM[8], @XMM[0]
+ movdqu @XMM[0], 0x00($out) # write output
+ cmp \$2,$len
+ jb .Lctr_enc_done
+ movdqu 0x10($inp), @XMM[9]
+ pxor @XMM[9], @XMM[1]
+ movdqu @XMM[1], 0x10($out)
+ je .Lctr_enc_done
+ movdqu 0x20($inp), @XMM[10]
+ pxor @XMM[10], @XMM[4]
+ movdqu @XMM[4], 0x20($out)
+ cmp \$4,$len
+ jb .Lctr_enc_done
+ movdqu 0x30($inp), @XMM[11]
+ pxor @XMM[11], @XMM[6]
+ movdqu @XMM[6], 0x30($out)
+ je .Lctr_enc_done
+ movdqu 0x40($inp), @XMM[12]
+ pxor @XMM[12], @XMM[3]
+ movdqu @XMM[3], 0x40($out)
+ cmp \$6,$len
+ jb .Lctr_enc_done
+ movdqu 0x50($inp), @XMM[13]
+ pxor @XMM[13], @XMM[7]
+ movdqu @XMM[7], 0x50($out)
+ je .Lctr_enc_done
+ movdqu 0x60($inp), @XMM[14]
+ pxor @XMM[14], @XMM[2]
+ movdqu @XMM[2], 0x60($out)
+ jmp .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
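+	# fewer than 8 blocks: one block at a time via the table-based routine;
+	# 0x20(%rbp) holds the counter block, 0x30(%rbp) the keystream, and the
+	# big-endian 32-bit counter sits at offset 0x2c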
+ lea 0x20(%rbp), $arg1
+ lea 0x30(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt
+ movdqu ($inp), @XMM[1]
+ lea 16($inp), $inp
+ mov 0x2c(%rbp), %eax # load 32-bit counter
+ bswap %eax
+ pxor 0x30(%rbp), @XMM[1]
+ inc %eax # increment
+ movdqu @XMM[1], ($out)
+ bswap %eax
+ lea 16($out), $out
+	mov	%eax, 0x2c(%rsp)	# save 32-bit counter (%rsp equals %rbp on this path)
+ dec $len
+ jnz .Lctr_enc_short
+
+.Lctr_enc_done:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lctr_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lctr_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lctr_enc_epilogue:
+ ret
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
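+# The initial tweak is the IV encrypted under key2; key1 then processes the
+# data, and after every block the tweak is multiplied by x in GF(2^128),
+# roughly: tweak = (tweak << 1) ^ (0x87 & -(tweak >> 127)).
+#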
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$arg6=~s/d$//;
+
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type bsaes_xts_encrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_encrypt:
+ mov %rsp, %rax
+.Lxts_enc_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor %xmm6, %xmm7 # fix up last round key
+ movdqa %xmm7, (%rax) # save last round key
+
+ and \$-16, $len
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_enc_short
+ jmp .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+___
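+# The unrolled loop below saves tweak[0..6] on the stack, xors each input
+# block with its tweak, and advances the running tweak: the pcmpgtd/pshufd
+# pair broadcasts the carry bits, paddq doubles both halves, and pand/pxor
+# fold in the .Lxts_magic reduction.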
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[2], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_enc_loop
+
+.Lxts_enc_short:
+ add \$0x80, $len
+ jz .Lxts_enc_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_enc_$i
+___
+ $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ pxor 0x60(%rsp), @XMM[2]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[2], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[3], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ pxor 0x40(%rsp), @XMM[3]
+ movdqu @XMM[6], 0x30($out)
+ movdqu @XMM[3], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[6]
+ movdqu @XMM[4], 0x20($out)
+ movdqu @XMM[6], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[4]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[4], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_encrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_encrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_enc_done:
+ and \$15, %ebx
+ jz .Lxts_enc_ret
+ mov $out, %rdx
+
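+	# ciphertext stealing: swap the len%16 tail bytes of the last full
+	# ciphertext block with the remaining plaintext bytes, then re-encrypt
+	# the spliced block below under the next tweak (@XMM[7])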
+.Lxts_enc_steal:
+ movzb ($inp), %eax
+ movzb -16(%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, -16(%rdx)
+ mov %cl, 0(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_enc_steal
+
+ movdqu -16($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_encrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ movdqu @XMM[7], -16($out)
+
+.Lxts_enc_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_enc_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_enc_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_enc_epilogue:
+ ret
+.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type bsaes_xts_decrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_decrypt:
+ mov %rsp, %rax
+.Lxts_dec_prologue:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ lea -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+ mov 0xa0(%rsp),$arg5 # pull key2
+ mov 0xa8(%rsp),$arg6 # pull ivp
+ lea -0xa0(%rsp), %rsp
+ movaps %xmm6, 0x40(%rsp)
+ movaps %xmm7, 0x50(%rsp)
+ movaps %xmm8, 0x60(%rsp)
+ movaps %xmm9, 0x70(%rsp)
+ movaps %xmm10, 0x80(%rsp)
+ movaps %xmm11, 0x90(%rsp)
+ movaps %xmm12, 0xa0(%rsp)
+ movaps %xmm13, 0xb0(%rsp)
+ movaps %xmm14, 0xc0(%rsp)
+ movaps %xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ mov %rsp, %rbp # backup %rsp
+ mov $arg1, $inp # backup arguments
+ mov $arg2, $out
+ mov $arg3, $len
+ mov $arg4, $key
+
+ lea ($arg6), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($arg5), $arg3
+ call asm_AES_encrypt # generate initial tweak
+
+ mov 240($key), %eax # rounds
+ mov $len, %rbx # backup $len
+
+ mov %eax, %edx # rounds
+ shl \$7, %rax # 128 bytes per inner round key
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
+ sub %rax, %rsp
+
+ mov %rsp, %rax # pass key schedule
+ mov $key, %rcx # pass key
+ mov %edx, %r10d # pass rounds
+ call _bsaes_key_convert
+ pxor (%rsp), %xmm7 # fix up round 0 key
+ movdqa %xmm6, (%rax) # save last round key
+ movdqa %xmm7, (%rsp)
+
+ xor %eax, %eax # if ($len%16) len-=16;
+ and \$-16, $len
+ test \$15, %ebx
+ setnz %al
+ shl \$4, %rax
+ sub %rax, $len
+
+ sub \$0x80, %rsp # place for tweak[8]
+ movdqa 0x20(%rbp), @XMM[7] # initial tweak
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+
+ sub \$0x80, $len
+ jc .Lxts_dec_short
+ jmp .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+ $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqu 0x70($inp), @XMM[8+7]
+ lea 0x80($inp), $inp
+ movdqa @XMM[7], 0x70(%rsp)
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ pxor @XMM[8+7], @XMM[7]
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ pxor 0x70(%rsp), @XMM[5]
+ movdqu @XMM[3], 0x60($out)
+ movdqu @XMM[5], 0x70($out)
+ lea 0x80($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+
+ sub \$0x80,$len
+ jnc .Lxts_dec_loop
+
+.Lxts_dec_short:
+ add \$0x80, $len
+ jz .Lxts_dec_done
+___
+ for ($i=0;$i<7;$i++) {
+ $code.=<<___;
+ pshufd \$0x13, $twtmp, $twres
+ pxor $twtmp, $twtmp
+ movdqa @XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
+ pxor $twres, @XMM[7]
+___
+ $code.=<<___ if ($i>=1);
+ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
+ cmp \$`0x10*$i`,$len
+ je .Lxts_dec_$i
+___
+ $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
+___
+ }
+$code.=<<___;
+ movdqu 0x60($inp), @XMM[8+6]
+ pxor @XMM[8+5], @XMM[5]
+ movdqa @XMM[7], 0x70(%rsp)
+ lea 0x70($inp), $inp
+ pxor @XMM[8+6], @XMM[6]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ pxor 0x60(%rsp), @XMM[3]
+ movdqu @XMM[7], 0x50($out)
+ movdqu @XMM[3], 0x60($out)
+ lea 0x70($out), $out
+
+ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x60($inp), $inp
+ pxor @XMM[8+5], @XMM[5]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ pxor 0x50(%rsp), @XMM[7]
+ movdqu @XMM[2], 0x40($out)
+ movdqu @XMM[7], 0x50($out)
+ lea 0x60($out), $out
+
+ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x50($inp), $inp
+ pxor @XMM[8+4], @XMM[4]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ pxor 0x40(%rsp), @XMM[2]
+ movdqu @XMM[4], 0x30($out)
+ movdqu @XMM[2], 0x40($out)
+ lea 0x50($out), $out
+
+ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x40($inp), $inp
+ pxor @XMM[8+3], @XMM[3]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ pxor 0x30(%rsp), @XMM[4]
+ movdqu @XMM[6], 0x20($out)
+ movdqu @XMM[4], 0x30($out)
+ lea 0x40($out), $out
+
+ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x30($inp), $inp
+ pxor @XMM[8+2], @XMM[2]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ pxor 0x20(%rsp), @XMM[6]
+ movdqu @XMM[1], 0x10($out)
+ movdqu @XMM[6], 0x20($out)
+ lea 0x30($out), $out
+
+ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+ pxor @XMM[8+0], @XMM[0]
+ lea 0x20($inp), $inp
+ pxor @XMM[8+1], @XMM[1]
+ lea 0x80(%rsp), %rax # pass key schedule
+ mov %edx, %r10d # pass rounds
+
+ call _bsaes_decrypt8
+
+ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ pxor 0x10(%rsp), @XMM[1]
+ movdqu @XMM[0], 0x00($out) # write output
+ movdqu @XMM[1], 0x10($out)
+ lea 0x20($out), $out
+
+ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
+ jmp .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+ pxor @XMM[0], @XMM[8]
+ lea 0x10($inp), $inp
+ movdqa @XMM[8], 0x20(%rbp)
+ lea 0x20(%rbp), $arg1
+ lea 0x20(%rbp), $arg2
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
+ #pxor @XMM[8], @XMM[0]
+ #lea 0x80(%rsp), %rax # pass key schedule
+ #mov %edx, %r10d # pass rounds
+ #call _bsaes_decrypt8
+ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
+ movdqu @XMM[0], 0x00($out) # write output
+ lea 0x10($out), $out
+
+ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
+
+.Lxts_dec_done:
+ and \$15, %ebx
+ jz .Lxts_dec_ret
+
+ pxor $twtmp, $twtmp
+ movdqa .Lxts_magic(%rip), $twmask
+ pcmpgtd @XMM[7], $twtmp
+ pshufd \$0x13, $twtmp, $twres
+ movdqa @XMM[7], @XMM[6]
+ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
+ pand $twmask, $twres # isolate carry and residue
+ movdqu ($inp), @XMM[0]
+ pxor $twres, @XMM[7]
+
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[7], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[7]
+ mov $out, %rdx
+ movdqu @XMM[7], ($out)
+
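+	# decrypt-side ciphertext stealing: the block above was decrypted with
+	# the next tweak (@XMM[7]); after swapping the tail bytes, the spliced
+	# block is decrypted with the preceding tweak saved in @XMM[6]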
+.Lxts_dec_steal:
+ movzb 16($inp), %eax
+ movzb (%rdx), %ecx
+ lea 1($inp), $inp
+ mov %al, (%rdx)
+ mov %cl, 16(%rdx)
+ lea 1(%rdx), %rdx
+ sub \$1,%ebx
+ jnz .Lxts_dec_steal
+
+ movdqu ($out), @XMM[0]
+ lea 0x20(%rbp), $arg1
+ pxor @XMM[6], @XMM[0]
+ lea 0x20(%rbp), $arg2
+ movdqa @XMM[0], 0x20(%rbp)
+ lea ($key), $arg3
+ call asm_AES_decrypt # doesn't touch %xmm
+ pxor 0x20(%rbp), @XMM[6]
+ movdqu @XMM[6], ($out)
+
+.Lxts_dec_ret:
+ lea (%rsp), %rax
+ pxor %xmm0, %xmm0
+.Lxts_dec_bzero: # wipe key schedule [if any]
+ movdqa %xmm0, 0x00(%rax)
+ movdqa %xmm0, 0x10(%rax)
+ lea 0x20(%rax), %rax
+ cmp %rax, %rbp
+ ja .Lxts_dec_bzero
+
+ lea (%rbp),%rsp # restore %rsp
+___
+$code.=<<___ if ($win64);
+ movaps 0x40(%rbp), %xmm6
+ movaps 0x50(%rbp), %xmm7
+ movaps 0x60(%rbp), %xmm8
+ movaps 0x70(%rbp), %xmm9
+ movaps 0x80(%rbp), %xmm10
+ movaps 0x90(%rbp), %xmm11
+ movaps 0xa0(%rbp), %xmm12
+ movaps 0xb0(%rbp), %xmm13
+ movaps 0xc0(%rbp), %xmm14
+ movaps 0xd0(%rbp), %xmm15
+ lea 0xa0(%rbp), %rsp
+___
+$code.=<<___;
+ mov 0x48(%rsp), %r15
+ mov 0x50(%rsp), %r14
+ mov 0x58(%rsp), %r13
+ mov 0x60(%rsp), %r12
+ mov 0x68(%rsp), %rbx
+ mov 0x70(%rsp), %rax
+ lea 0x78(%rsp), %rsp
+ mov %rax, %rbp
+.Lxts_dec_epilogue:
+ ret
+.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type _bsaes_const,\@object
+.align 64
+_bsaes_const:
+.LM0ISR: # InvShiftRows constants
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0: # bit-slice constants
+ .quad 0x5555555555555555, 0x5555555555555555
+.LBS1:
+ .quad 0x3333333333333333, 0x3333333333333333
+.LBS2:
+ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR: # shiftrows constants
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP: # byte-swap upper dword
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1: # counter increment constants
+ .quad 0x0000000000000000, 0x0000000100000000
+.LADD2:
+ .quad 0x0000000000000000, 0x0000000200000000
+.LADD3:
+ .quad 0x0000000000000000, 0x0000000300000000
+.LADD4:
+ .quad 0x0000000000000000, 0x0000000400000000
+.LADD5:
+ .quad 0x0000000000000000, 0x0000000500000000
+.LADD6:
+ .quad 0x0000000000000000, 0x0000000600000000
+.LADD7:
+ .quad 0x0000000000000000, 0x0000000700000000
+.LADD8:
+ .quad 0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+ .long 0x87,0,1,0
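+	# 0x87 encodes x^7+x^2+x+1, xored in when bit 127 carries out of the
+	# doubled tweak; the 1 in the third dword carries bit 63 into bit 64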
+.Lmasks:
+ .quad 0x0101010101010101, 0x0101010101010101
+ .quad 0x0202020202020202, 0x0202020202020202
+ .quad 0x0404040404040404, 0x0404040404040404
+ .quad 0x0808080808080808, 0x0808080808080808
+.LM0:
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+ .quad 0x6363636363636363, 0x6363636363636363
+.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align 64
+.size _bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
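+	# If context->Rip lies between the prologue and epilogue labels recorded
+	# in HandlerData[], recover the saved %xmm and integer registers from the
+	# frame so RtlVirtualUnwind sees a consistent context.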
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov 160($context),%rax # pull context->Rbp
+
+ lea 0x40(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0(%rax),%rax # adjust stack pointer
+
+ mov 0x70(%rax),%rbp
+ mov 0x68(%rax),%rbx
+ mov 0x60(%rax),%r12
+ mov 0x58(%rax),%r13
+ mov 0x50(%rax),%r14
+ mov 0x48(%rax),%r15
+ lea 0x78(%rax),%rax # adjust stack pointer
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+___
+$code.=<<___ if ($ecb);
+ .rva .Lecb_enc_prologue
+ .rva .Lecb_enc_epilogue
+ .rva .Lecb_enc_info
+
+ .rva .Lecb_dec_prologue
+ .rva .Lecb_dec_epilogue
+ .rva .Lecb_dec_info
+___
+$code.=<<___;
+ .rva .Lcbc_dec_prologue
+ .rva .Lcbc_dec_epilogue
+ .rva .Lcbc_dec_info
+
+ .rva .Lctr_enc_prologue
+ .rva .Lctr_enc_epilogue
+ .rva .Lctr_enc_info
+
+ .rva .Lxts_enc_prologue
+ .rva .Lxts_enc_epilogue
+ .rva .Lxts_enc_info
+
+ .rva .Lxts_dec_prologue
+ .rva .Lxts_dec_epilogue
+ .rva .Lxts_dec_info
+
+.section .xdata
+.align 8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+.Lecb_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+.Lctr_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+.Lxts_enc_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+.Lxts_dec_info:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/aes/asm/vpaes-x86.S b/main/openssl/crypto/aes/asm/vpaes-x86.S
new file mode 100644
index 00000000..c53a5074
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/vpaes-x86.S
@@ -0,0 +1,661 @@
+.file "vpaes-x86.s"
+.text
+.align 64
+.L_vpaes_consts:
+.long 218628480,235210255,168496130,67568393
+.long 252381056,17041926,33884169,51187212
+.long 252645135,252645135,252645135,252645135
+.long 1512730624,3266504856,1377990664,3401244816
+.long 830229760,1275146365,2969422977,3447763452
+.long 3411033600,2979783055,338359620,2782886510
+.long 4209124096,907596821,221174255,1006095553
+.long 191964160,3799684038,3164090317,1589111125
+.long 182528256,1777043520,2877432650,3265356744
+.long 1874708224,3503451415,3305285752,363511674
+.long 1606117888,3487855781,1093350906,2384367825
+.long 197121,67569157,134941193,202313229
+.long 67569157,134941193,202313229,197121
+.long 134941193,202313229,197121,67569157
+.long 202313229,197121,67569157,134941193
+.long 33619971,100992007,168364043,235736079
+.long 235736079,33619971,100992007,168364043
+.long 168364043,235736079,33619971,100992007
+.long 100992007,168364043,235736079,33619971
+.long 50462976,117835012,185207048,252579084
+.long 252314880,51251460,117574920,184942860
+.long 184682752,252054788,50987272,118359308
+.long 118099200,185467140,251790600,50727180
+.long 2946363062,528716217,1300004225,1881839624
+.long 1532713819,1532713819,1532713819,1532713819
+.long 3602276352,4288629033,3737020424,4153884961
+.long 1354558464,32357713,2958822624,3775749553
+.long 1201988352,132424512,1572796698,503232858
+.long 2213177600,1597421020,4103937655,675398315
+.long 2749646592,4273543773,1511898873,121693092
+.long 3040248576,1103263732,2871565598,1608280554
+.long 2236667136,2588920351,482954393,64377734
+.long 3069987328,291237287,2117370568,3650299247
+.long 533321216,3573750986,2572112006,1401264716
+.long 1339849704,2721158661,548607111,3445553514
+.long 2128193280,3054596040,2183486460,1257083700
+.long 655635200,1165381986,3923443150,2344132524
+.long 190078720,256924420,290342170,357187870
+.long 1610966272,2263057382,4103205268,309794674
+.long 2592527872,2233205587,1335446729,3402964816
+.long 3973531904,3225098121,3002836325,1918774430
+.long 3870401024,2102906079,2284471353,4117666579
+.long 617007872,1021508343,366931923,691083277
+.long 2528395776,3491914898,2968704004,1613121270
+.long 3445188352,3247741094,844474987,4093578302
+.long 651481088,1190302358,1689581232,574775300
+.long 4289380608,206939853,2555985458,2489840491
+.long 2130264064,327674451,3566485037,3349835193
+.long 2470714624,316102159,3636825756,3393945945
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte 118,101,114,115,105,116,121,41,0
+.align 64
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ addl (%esp),%ebp
+ movdqa -48(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm6
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movl $16,%ecx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa (%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%edx),%xmm5
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa 16(%ebp),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal 192(%ebp),%ebx
+ jmp .L000enc_entry
+.align 16
+.L001enc_loop:
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ movdqa 64(%ebp),%xmm5
+.byte 102,15,56,0,234
+ movdqa -64(%ebx,%ecx,1),%xmm1
+ movdqa 80(%ebp),%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm5,%xmm2
+ movdqa (%ebx,%ecx,1),%xmm4
+ movdqa %xmm0,%xmm3
+.byte 102,15,56,0,193
+ addl $16,%edx
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addl $16,%ecx
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm3,%xmm0
+ subl $1,%eax
+.L000enc_entry:
+ movdqa %xmm6,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+ movdqa -32(%ebp),%xmm5
+.byte 102,15,56,0,232
+ pxor %xmm1,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm5,%xmm3
+ movdqa %xmm7,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm5,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+ movdqu (%edx),%xmm5
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ jnz .L001enc_loop
+ movdqa 96(%ebp),%xmm4
+ movdqa 112(%ebp),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%ebx,%ecx,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movl 240(%edx),%eax
+ leal 608(%ebp),%ebx
+ movdqa %xmm6,%xmm1
+ movdqa -64(%ebx),%xmm2
+ pandn %xmm0,%xmm1
+ movl %eax,%ecx
+ psrld $4,%xmm1
+ movdqu (%edx),%xmm5
+ shll $4,%ecx
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa -48(%ebx),%xmm0
+ xorl $48,%ecx
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm5,%xmm2
+ movdqa 176(%ebp),%xmm5
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal -352(%ebx,%ecx,1),%ecx
+ jmp .L002dec_entry
+.align 16
+.L003dec_loop:
+ movdqa -32(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa -16(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ addl $16,%edx
+.byte 102,15,56,0,197
+ movdqa (%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ subl $1,%eax
+.byte 102,15,56,0,197
+ movdqa 32(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 48(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,197
+ movdqa 64(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 80(%ebx),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,58,15,237,12
+.L002dec_entry:
+ movdqa %xmm6,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm7,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm7,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqu (%edx),%xmm0
+ jnz .L003dec_loop
+ movdqa 96(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%ebx),%xmm0
+ movdqa (%ecx),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+ addl (%esp),%ebp
+ movdqu (%esi),%xmm0
+ movdqa 320(%ebp),%xmm2
+ movdqa %xmm0,%xmm3
+ leal (%ebp),%ebx
+ movdqa %xmm2,4(%esp)
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+ testl %edi,%edi
+ jnz .L004schedule_am_decrypting
+ movdqu %xmm0,(%edx)
+ jmp .L005schedule_go
+.L004schedule_am_decrypting:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%edx)
+ xorl $48,%ecx
+.L005schedule_go:
+ cmpl $192,%eax
+ ja .L006schedule_256
+ je .L007schedule_192
+.L008schedule_128:
+ movl $10,%eax
+.L009loop_schedule_128:
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .L009loop_schedule_128
+.align 16
+.L007schedule_192:
+ movdqu 8(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%eax
+.L011loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .L011loop_schedule_192
+.align 16
+.L006schedule_256:
+ movdqu 16(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%eax
+.L012loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,20(%esp)
+ movdqa %xmm6,%xmm7
+ call .L_vpaes_schedule_low_round
+ movdqa 20(%esp),%xmm7
+ jmp .L012loop_schedule_256
+.align 16
+.L010schedule_mangle_last:
+ leal 384(%ebp),%ebx
+ testl %edi,%edi
+ jnz .L013schedule_mangle_last_dec
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,193
+ leal 352(%ebp),%ebx
+ addl $32,%edx
+.L013schedule_mangle_last_dec:
+ addl $-16,%edx
+ pxor 336(%ebp),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm0
+ pxor %xmm0,%xmm6
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ pxor %xmm1,%xmm1
+ movhlps %xmm1,%xmm6
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+ movdqa 8(%esp),%xmm2
+ pxor %xmm1,%xmm1
+.byte 102,15,58,15,202,15
+.byte 102,15,58,15,210,15
+ pxor %xmm1,%xmm7
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+ movdqa %xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor 336(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm4
+ movdqa -48(%ebp),%xmm5
+ movdqa %xmm4,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm4,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm5,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa -16(%ebp),%xmm2
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ movdqa (%ebx),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa 128(%ebp),%xmm5
+ testl %edi,%edi
+ jnz .L014schedule_mangle_dec
+ addl $16,%edx
+ pxor 336(%ebp),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+ jmp .L015schedule_mangle_both
+.align 16
+.L014schedule_mangle_dec:
+ movdqa -16(%ebp),%xmm2
+ leal 416(%ebp),%esi
+ movdqa %xmm2,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm4
+ movdqa (%esi),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 32(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 64(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 96(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ addl $-16,%edx
+.L015schedule_mangle_both:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ addl $-16,%ecx
+ andl $48,%ecx
+ movdqu %xmm3,(%edx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ movl $48,%ecx
+ movl $0,%edi
+ leal .L_vpaes_consts+0x30-.L016pic_point,%ebp
+ call _vpaes_schedule_core
+.L016pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ shll $4,%ebx
+ leal 16(%edx,%ebx,1),%edx
+ movl $1,%edi
+ movl %eax,%ecx
+ shrl $1,%ecx
+ andl $32,%ecx
+ xorl $32,%ecx
+ leal .L_vpaes_consts+0x30-.L017pic_point,%ebp
+ call _vpaes_schedule_core
+.L017pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L018pic_point,%ebp
+ call _vpaes_preheat
+.L018pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L019pic_point,%ebp
+ call _vpaes_preheat
+.L019pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ subl $16,%eax
+ jc .L020cbc_abort
+ leal -56(%esp),%ebx
+ movl 36(%esp),%ebp
+ andl $-16,%ebx
+ movl 40(%esp),%ecx
+ xchgl %esp,%ebx
+ movdqu (%ebp),%xmm1
+ subl %esi,%edi
+ movl %ebx,48(%esp)
+ movl %edi,(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,8(%esp)
+ movl %eax,%edi
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
+ call _vpaes_preheat
+.L021pic_point:
+ cmpl $0,%ecx
+ je .L022cbc_dec_loop
+ jmp .L023cbc_enc_loop
+.align 16
+.L023cbc_enc_loop:
+ movdqu (%esi),%xmm0
+ pxor %xmm1,%xmm0
+ call _vpaes_encrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ movdqa %xmm0,%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L023cbc_enc_loop
+ jmp .L024cbc_done
+.align 16
+.L022cbc_dec_loop:
+ movdqu (%esi),%xmm0
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm0,32(%esp)
+ call _vpaes_decrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ pxor 16(%esp),%xmm0
+ movdqa 32(%esp),%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L022cbc_dec_loop
+.L024cbc_done:
+ movl 8(%esp),%ebx
+ movl 48(%esp),%esp
+ movdqu %xmm1,(%ebx)
+.L020cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
diff --git a/main/openssl/crypto/aes/asm/vpaes-x86.pl b/main/openssl/crypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 00000000..1533e2c3
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/vpaes-x86.pl
@@ -0,0 +1,903 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
+# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original, nor does it make assumptions
+# about its alignment...
+#
+# Performance summary. The aes-586.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with a 128-bit key; the vpaes-x86.pl column lists
+# [also large-block] CBC encrypt/decrypt results.
+#
+# aes-586.pl vpaes-x86.pl
+#
+# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
+# Nehalem 27.9/40.4/18.1 10.3/12.0
+# Atom 102./119./60.1 64.5/85.3(***)
+#
+# (*)	"Hyper-threading" in this context refers to cache shared
+#	among multiple cores rather than specifically to Intel HTT.
+#	As the vast majority of contemporary cores share cache, the
+#	slower code path is commonplace. In other words, the
+#	"with-hyper-threading-off" results are presented mostly for
+#	reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	The less impressive improvement on Core 2 and Atom is due
+#	to slow pshufb, yet it's a respectable +32%/65% improvement
+#	on Core 2
+# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+# code path).
+#
+# <appro@openssl.org>
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$PREFIX="vpaes";
+
+my ($round, $base, $magic, $key, $const, $inp, $out)=
+ ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
+
+&static_label("_vpaes_consts");
+&static_label("_vpaes_schedule_low_round");
+
+&set_label("_vpaes_consts",64);
+$k_inv=-0x30; # inv, inva
+ &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
+ &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
+
+$k_s0F=-0x10; # s0F
+ &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
+
+$k_ipt=0x00; # input transform (lo, hi)
+ &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
+ &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
+
+$k_sb1=0x20; # sb1u, sb1t
+ &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
+ &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
+$k_sb2=0x40; # sb2u, sb2t
+ &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
+ &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
+$k_sbo=0x60; # sbou, sbot
+ &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
+ &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
+
+$k_mc_forward=0x80; # mc_forward
+ &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
+ &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
+ &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
+ &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
+
+$k_mc_backward=0xc0; # mc_backward
+ &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
+ &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
+ &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
+ &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
+
+$k_sr=0x100; # sr
+ &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
+ &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
+ &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
+ &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
+
+$k_rcon=0x140; # rcon
+ &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
+
+$k_s63=0x150; # s63: all equal to 0x63 transformed
+ &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
+
+$k_opt=0x160; # output transform
+ &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
+ &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
+
+$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
+ &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
+ &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
+##
+## Decryption stuff
+## Key schedule constants
+##
+$k_dksd=0x1a0; # decryption key schedule: invskew x*D
+ &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
+ &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
+$k_dksb=0x1c0; # decryption key schedule: invskew x*B
+ &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
+ &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
+$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
+ &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
+ &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
+$k_dks9=0x200; # decryption key schedule: invskew x*9
+ &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
+ &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
+
+##
+## Decryption stuff
+## Round function constants
+##
+$k_dipt=0x220; # decryption input transform
+ &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
+ &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
+
+$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
+ &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
+ &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
+$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
+ &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
+ &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
+$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
+ &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
+ &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
+$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
+ &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
+ &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
+$k_dsbo=0x2c0; # decryption sbox final output
+ &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
+ &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
+&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
+&align (64);
+
+&function_begin_B("_vpaes_preheat");
+ &add ($const,&DWP(0,"esp"));
+ &movdqa ("xmm7",&QWP($k_inv,$const));
+ &movdqa ("xmm6",&QWP($k_s0F,$const));
+ &ret ();
+&function_end_B("_vpaes_preheat");
+
+##
+## _vpaes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm6-%xmm7 as in _vpaes_preheat
+## (%edx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
+##
+##
+&function_begin_B("_vpaes_encrypt_core");
+ &mov ($magic,16);
+ &mov ($round,&DWP(240,$key));
+	&movdqa	("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_ipt,$const));
+ &pandn ("xmm1","xmm0");
+ &movdqu ("xmm5",&QWP(0,$key));
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_ipt+16,$const));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm2","xmm5");
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($base,&DWP($k_mc_backward,$const));
+ &jmp (&label("enc_entry"));
+
+
+&set_label("enc_loop",16);
+ # middle of middle round
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
+ &pshufb ("xmm4","xmm2"); # 4 = sb1u
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
+ &pshufb ("xmm5","xmm2"); # 4 = sb2u
+ &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+ &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
+ &pshufb ("xmm2","xmm3"); # 2 = sb2t
+ &pxor ("xmm2","xmm5"); # 2 = 2A
+ &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
+ &movdqa ("xmm3","xmm0"); # 3 = A
+ &pshufb ("xmm0","xmm1"); # 0 = B
+ &add ($key,16); # next key
+ &pxor ("xmm0","xmm2"); # 0 = 2A+B
+ &pshufb ("xmm3","xmm4"); # 3 = D
+ &add ($magic,16); # next mc
+ &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
+ &pshufb ("xmm0","xmm1"); # 0 = 2B+C
+ &and ($magic,0x30); # ... mod 4
+ &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
+ &sub ($round,1); # nr--
+
+&set_label("enc_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm5","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &movdqu ("xmm5",&QWP(0,$key));
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &jnz (&label("enc_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
+ &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm1");
+ &ret ();
+&function_end_B("_vpaes_encrypt_core");
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+&function_begin_B("_vpaes_decrypt_core");
+ &mov ($round,&DWP(240,$key));
+ &lea ($base,&DWP($k_dsbd,$const));
+ &movdqa ("xmm1","xmm6");
+ &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
+ &pandn ("xmm1","xmm0");
+ &mov ($magic,$round);
+	&psrld	("xmm1",4);
+ &movdqu ("xmm5",&QWP(0,$key));
+ &shl ($magic,4);
+ &pand ("xmm0","xmm6");
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
+ &xor ($magic,0x30);
+ &pshufb ("xmm0","xmm1");
+ &and ($magic,0x30);
+ &pxor ("xmm2","xmm5");
+ &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
+ &pxor ("xmm0","xmm2");
+ &add ($key,16);
+ &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
+ &jmp (&label("dec_entry"));
+
+&set_label("dec_loop",16);
+##
+## Inverse mix columns
+##
+ &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
+ &pshufb ("xmm4","xmm2"); # 4 = sb9u
+ &pxor ("xmm4","xmm0");
+ &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
+ &pshufb ("xmm0","xmm3"); # 0 = sb9t
+ &pxor ("xmm0","xmm4"); # 0 = ch
+ &add ($key,16); # next round key
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
+ &pshufb ("xmm4","xmm2"); # 4 = sbdu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
+ &pshufb ("xmm0","xmm3"); # 0 = sbdt
+ &pxor ("xmm0","xmm4"); # 0 = ch
+ &sub ($round,1); # nr--
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
+ &pshufb ("xmm4","xmm2"); # 4 = sbbu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
+ &pshufb ("xmm0","xmm3"); # 0 = sbbt
+ &pxor ("xmm0","xmm4"); # 0 = ch
+
+ &pshufb ("xmm0","xmm5"); # MC ch
+ &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
+ &pshufb ("xmm4","xmm2"); # 4 = sbeu
+ &pxor ("xmm4","xmm0"); # 4 = ch
+ &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
+ &pshufb ("xmm0","xmm3"); # 0 = sbet
+ &pxor ("xmm0","xmm4"); # 0 = ch
+
+ &palignr("xmm5","xmm5",12);
+
+&set_label("dec_entry");
+ # top of round
+ &movdqa ("xmm1","xmm6"); # 1 : i
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm6"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqu ("xmm0",&QWP(0,$key));
+ &jnz (&label("dec_loop"));
+
+ # middle of last round
+ &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &pxor ("xmm4","xmm0"); # 4 = sb1u + k
+ &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
+ &movdqa ("xmm2",&QWP(0,$magic));
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = A
+ &pshufb ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_decrypt_core");
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+&function_begin_B("_vpaes_schedule_core");
+ &add ($const,&DWP(0,"esp"));
+ &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
+ &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
+
+ # input transform
+ &movdqa ("xmm3","xmm0");
+ &lea ($base,&DWP($k_ipt,$const));
+ &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
+ &call ("_vpaes_schedule_transform");
+ &movdqa ("xmm7","xmm0");
+
+ &test ($out,$out);
+ &jnz (&label("schedule_am_decrypting"));
+
+ # encrypting, output zeroth round key after transform
+ &movdqu (&QWP(0,$key),"xmm0");
+ &jmp (&label("schedule_go"));
+
+&set_label("schedule_am_decrypting");
+ # decrypting, output zeroth round key after shiftrows
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &movdqu (&QWP(0,$key),"xmm3");
+ &xor ($magic,0x30);
+
+&set_label("schedule_go");
+ &cmp ($round,192);
+ &ja (&label("schedule_256"));
+ &je (&label("schedule_192"));
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
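+## A quick count of what this path emits: the zeroth round key is
+## written before .schedule_go, _vpaes_schedule_mangle stores keys
+## 1..9, and .schedule_mangle_last stores key 10 -- 11 subkeys in
+## all, as 10-round AES-128 requires.
+##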
+&set_label("schedule_128");
+ &mov ($round,10);
+
+&set_label("loop_schedule_128");
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # write output
+ &jmp (&label("loop_schedule_128"));
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
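+## A worked count of the keys emitted here: 1 (zeroth, before the
+## loop) + 3 per full cycle for the first three cycles + 2 in the
+## final cycle + 1 from .schedule_mangle_last = 13 subkeys, exactly
+## what 12-round AES-192 needs.
+##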
+&set_label("schedule_192",16);
+ &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &movdqa ("xmm6","xmm0"); # save short part
+ &pxor ("xmm4","xmm4"); # clear 4
+ &movhlps("xmm6","xmm4"); # clobber low side with zeros
+ &mov ($round,4);
+
+&set_label("loop_schedule_192");
+ &call ("_vpaes_schedule_round");
+ &palignr("xmm0","xmm6",8);
+ &call ("_vpaes_schedule_mangle"); # save key n
+ &call ("_vpaes_schedule_192_smear");
+ &call ("_vpaes_schedule_mangle"); # save key n+1
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle"); # save key n+2
+ &call ("_vpaes_schedule_192_smear");
+ &jmp (&label("loop_schedule_192"));
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
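+## Counting as above: 1 zeroth key + 2 keys per cycle for six
+## cycles + 1 from the seventh cycle's mangle + 1 from
+## .schedule_mangle_last = 15 subkeys for 14-round AES-256.
+##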
+&set_label("schedule_256",16);
+ &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
+ &call ("_vpaes_schedule_transform"); # input transform
+ &mov ($round,7);
+
+&set_label("loop_schedule_256");
+ &call ("_vpaes_schedule_mangle"); # output low result
+ &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
+
+ # high round
+ &call ("_vpaes_schedule_round");
+ &dec ($round);
+ &jz (&label("schedule_mangle_last"));
+ &call ("_vpaes_schedule_mangle");
+
+ # low round. swap xmm7 and xmm6
+ &pshufd ("xmm0","xmm0",0xFF);
+ &movdqa (&QWP(20,"esp"),"xmm7");
+ &movdqa ("xmm7","xmm6");
+ &call ("_vpaes_schedule_low_round");
+ &movdqa ("xmm7",&QWP(20,"esp"));
+
+ &jmp (&label("loop_schedule_256"));
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+&set_label("schedule_mangle_last",16);
+ # schedule last round key from xmm0
+ &lea ($base,&DWP($k_deskew,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_last_dec"));
+
+ # encrypting
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm0","xmm1"); # output permute
+ &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
+ &add ($key,32);
+
+&set_label("schedule_mangle_last_dec");
+ &add ($key,-16);
+ &pxor ("xmm0",&QWP($k_s63,$const));
+ &call ("_vpaes_schedule_transform"); # output transform
+ &movdqu (&QWP(0,$key),"xmm0"); # save last key
+
+ # cleanup
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
+ &pxor ("xmm3","xmm3");
+ &pxor ("xmm4","xmm4");
+ &pxor ("xmm5","xmm5");
+ &pxor ("xmm6","xmm6");
+ &pxor ("xmm7","xmm7");
+ &ret ();
+&function_end_B("_vpaes_schedule_core");
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+##	a zero register (%xmm13 in the 64-bit version; this
+##	32-bit port zeroes %xmm1 locally instead)
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+&function_begin_B("_vpaes_schedule_192_smear");
+ &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
+ &pxor ("xmm6","xmm0"); # -> c+d c 0 0
+ &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
+ &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
+ &movdqa ("xmm0","xmm6");
+ &pxor ("xmm1","xmm1");
+ &movhlps("xmm6","xmm1"); # clobber low side with zeros
+ &ret ();
+&function_end_B("_vpaes_schedule_192_smear");
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from the low byte of the value kept at 8(%esp)
+## (this port's stand-in for %xmm8), then rotates it for the
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm5.
+##
+&function_begin_B("_vpaes_schedule_round");
+ # extract rcon from xmm8
+ &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
+ &pxor ("xmm1","xmm1");
+ &palignr("xmm1","xmm2",15);
+ &palignr("xmm2","xmm2",15);
+ &pxor ("xmm7","xmm1");
+
+ # rotate
+ &pshufd ("xmm0","xmm0",0xFF);
+ &palignr("xmm0","xmm0",1);
+
+ # fall through...
+ &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
+
+ # low round: same as high round, but no rotation and no rcon.
+&set_label("_vpaes_schedule_low_round");
+ # smear xmm7
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",4);
+ &pxor ("xmm7","xmm1");
+ &movdqa ("xmm1","xmm7");
+ &pslldq ("xmm7",8);
+ &pxor ("xmm7","xmm1");
+ &pxor ("xmm7",&QWP($k_s63,$const));
+
+ # subbyte
+ &movdqa ("xmm4",&QWP($k_s0F,$const));
+ &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
+ &movdqa ("xmm1","xmm4");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4); # 1 = i
+ &pand ("xmm0","xmm4"); # 0 = k
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
+ &pxor ("xmm0","xmm1"); # 0 = j
+ &movdqa ("xmm3","xmm5"); # 3 : 1/i
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
+ &movdqa ("xmm4","xmm5"); # 4 : 1/j
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
+ &movdqa ("xmm2","xmm5"); # 2 : 1/iak
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
+ &pxor ("xmm2","xmm0"); # 2 = io
+ &movdqa ("xmm3","xmm5"); # 3 : 1/jak
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &pxor ("xmm3","xmm1"); # 3 = jo
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
+ &pxor ("xmm0","xmm4"); # 0 = sbox output
+
+ # add in smeared stuff
+ &pxor ("xmm0","xmm7");
+ &movdqa ("xmm7","xmm0");
+ &ret ();
+&function_end_B("_vpaes_schedule_round");
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%ebx)
+##
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
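+## In effect, for each byte b of %xmm0, with lo/hi denoting the
+## 16-byte lookup tables at (%ebx) and 16(%ebx):
+##
+##	out = lo[b & 0x0F] ^ hi[b >> 4]
+##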
+&function_begin_B("_vpaes_schedule_transform");
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm0");
+ &psrld ("xmm1",4);
+ &pand ("xmm0","xmm2");
+ &movdqa ("xmm2",&QWP(0,$base));
+ &pshufb ("xmm2","xmm0");
+ &movdqa ("xmm0",&QWP(16,$base));
+ &pshufb ("xmm0","xmm1");
+ &pxor ("xmm0","xmm2");
+ &ret ();
+&function_end_B("_vpaes_schedule_transform");
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%edx), and increments or decrements it
+## Keeps track of round number mod 4 in %ecx
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
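+## Writing r() for the one-byte-per-column rotation done by the
+## k_mc_forward pshufb, the encrypt-side result below works out to
+##
+##	r(y) ^ r^2(y) ^ r^3(y),   where y = xmm0 ^ k_s63
+##
+## i.e. the circulant (0,1,1,1) applied to y.
+##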
+&function_begin_B("_vpaes_schedule_mangle");
+ &movdqa ("xmm4","xmm0"); # save xmm0 for later
+ &movdqa ("xmm5",&QWP($k_mc_forward,$const));
+ &test ($out,$out);
+ &jnz (&label("schedule_mangle_dec"));
+
+ # encrypting
+ &add ($key,16);
+ &pxor ("xmm4",&QWP($k_s63,$const));
+ &pshufb ("xmm4","xmm5");
+ &movdqa ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+ &pshufb ("xmm4","xmm5");
+ &pxor ("xmm3","xmm4");
+
+ &jmp (&label("schedule_mangle_both"));
+
+&set_label("schedule_mangle_dec",16);
+ # inverse mix columns
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
+ &lea ($inp,&DWP($k_dksd,$const));
+ &movdqa ("xmm1","xmm2");
+ &pandn ("xmm1","xmm4");
+ &psrld ("xmm1",4); # 1 = hi
+ &pand ("xmm4","xmm2"); # 4 = lo
+
+ &movdqa ("xmm2",&QWP(0,$inp));
+ &pshufb ("xmm2","xmm4");
+ &movdqa ("xmm3",&QWP(0x10,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x20,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x30,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x40,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x50,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+ &pshufb ("xmm3","xmm5");
+
+ &movdqa ("xmm2",&QWP(0x60,$inp));
+ &pshufb ("xmm2","xmm4");
+ &pxor ("xmm2","xmm3");
+ &movdqa ("xmm3",&QWP(0x70,$inp));
+ &pshufb ("xmm3","xmm1");
+ &pxor ("xmm3","xmm2");
+
+ &add ($key,-16);
+
+&set_label("schedule_mangle_both");
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
+ &pshufb ("xmm3","xmm1");
+ &add ($magic,-16);
+ &and ($magic,0x30);
+ &movdqu (&QWP(0,$key),"xmm3");
+ &ret ();
+&function_end_B("_vpaes_schedule_mangle");
+
+#
+# Interface to OpenSSL
+#
+&function_begin("${PREFIX}_set_encrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
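+	# (so 128/192/256-bit keys store 9/11/13 here -- the count of
+	# middle rounds only, since the zeroth and last round keys are
+	# applied outside the cores' main loops; the schedule emits
+	# rounds+2 subkeys in total)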
+ &mov ($magic,0x30);
+ &mov ($out,0);
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_encrypt_key");
+
+&function_begin("${PREFIX}_set_decrypt_key");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($round,&wparam(1)); # bits
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov ($base,$round);
+ &shr ($base,5);
+ &add ($base,5);
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
+ &shl ($base,4);
+ &lea ($key,&DWP(16,$key,$base));
+
+ &mov ($out,1);
+ &mov ($magic,$round);
+ &shr ($magic,1);
+ &and ($magic,32);
+	&xor	($magic,32);		# nbits==192?0:32;
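+	# worked values: 128>>1&32^32 = 32, 192>>1&32^32 = 0,
+	# 256>>1&32^32 = 32 -- the initial byte offset into k_sr
+	# used while scheduling for decryption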
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_schedule_core");
+&set_label("pic_point");
+
+ &mov ("esp",&DWP(48,"esp"));
+ &xor ("eax","eax");
+&function_end("${PREFIX}_set_decrypt_key");
+
+&function_begin("${PREFIX}_encrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_encrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_encrypt");
+
+&function_begin("${PREFIX}_decrypt");
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &mov ($inp,&wparam(0)); # inp
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($out,&wparam(1)); # out
+ &and ($base,-16);
+ &mov ($key,&wparam(2)); # key
+ &xchg ($base,"esp"); # alloca
+ &mov (&DWP(48,"esp"),$base);
+
+ &movdqu ("xmm0",&QWP(0,$inp));
+ &call ("_vpaes_decrypt_core");
+ &movdqu (&QWP(0,$out),"xmm0");
+
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_decrypt");
+
+&function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0)); # inp
+ &mov ($out,&wparam(1)); # out
+ &mov ($round,&wparam(2)); # len
+ &mov ($key,&wparam(3)); # key
+ &sub ($round,16);
+ &jc (&label("cbc_abort"));
+ &lea ($base,&DWP(-56,"esp"));
+ &mov ($const,&wparam(4)); # ivp
+ &and ($base,-16);
+ &mov ($magic,&wparam(5)); # enc
+ &xchg ($base,"esp"); # alloca
+ &movdqu ("xmm1",&QWP(0,$const)); # load IV
+ &sub ($out,$inp);
+ &mov (&DWP(48,"esp"),$base);
+
+ &mov (&DWP(0,"esp"),$out); # save out
+	&mov	(&DWP(4,"esp"),$key);		# save key
+ &mov (&DWP(8,"esp"),$const); # save ivp
+ &mov ($out,$round); # $out works as $len
+
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+ &call ("_vpaes_preheat");
+&set_label("pic_point");
+ &cmp ($magic,0);
+ &je (&label("cbc_dec_loop"));
+ &jmp (&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &pxor ("xmm0","xmm1"); # inp^=iv
+ &call ("_vpaes_encrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &movdqa ("xmm1","xmm0");
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_enc_loop"));
+ &jmp (&label("cbc_done"));
+
+&set_label("cbc_dec_loop",16);
+ &movdqu ("xmm0",&QWP(0,$inp)); # load input
+ &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
+ &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
+ &call ("_vpaes_decrypt_core");
+ &mov ($base,&DWP(0,"esp")); # restore out
+ &mov ($key,&DWP(4,"esp")); # restore key
+ &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
+ &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
+ &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
+ &lea ($inp,&DWP(16,$inp));
+ &sub ($out,16);
+ &jnc (&label("cbc_dec_loop"));
+
+&set_label("cbc_done");
+ &mov ($base,&DWP(8,"esp")); # restore ivp
+ &mov ("esp",&DWP(48,"esp"));
+ &movdqu (&QWP(0,$base),"xmm1"); # write IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+&asm_finish();
diff --git a/main/openssl/crypto/aes/asm/vpaes-x86_64.S b/main/openssl/crypto/aes/asm/vpaes-x86_64.S
new file mode 100644
index 00000000..2b68e615
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/vpaes-x86_64.S
@@ -0,0 +1,828 @@
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ movdqa %xmm15,%xmm5
+.byte 102,15,56,0,234
+ movdqa -64(%r11,%r10,1),%xmm1
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm5,%xmm2
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm0,%xmm3
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm3,%xmm0
+ subq $1,%rax
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm5
+.byte 102,15,56,0,232
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm5,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm5,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+ movdqu (%r9),%xmm5
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ .byte 0xf3,0xc3
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_dipt+16(%rip),%xmm0
+ xorq $48,%r11
+ leaq .Lk_dsbd(%rip),%r10
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm5,%xmm2
+ movdqa .Lk_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa -16(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ addq $16,%r9
+
+.byte 102,15,56,0,197
+ movdqa 0(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 16(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ subq $1,%rax
+
+.byte 102,15,56,0,197
+ movdqa 32(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 48(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+.byte 102,15,56,0,197
+ movdqa 64(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 80(%r10),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+.byte 102,15,58,15,237,12
+
+.Ldec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqu (%r9),%xmm0
+ jnz .Ldec_loop
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ .byte 0xf3,0xc3
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa .Lk_rcon(%rip),%xmm8
+ movdqu (%rdi),%xmm0
+
+
+ movdqa %xmm0,%xmm3
+ leaq .Lk_ipt(%rip),%r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq .Lk_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz .Lschedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%rdx)
+ xorq $48,%r8
+
+.Lschedule_go:
+ cmpl $192,%esi
+ ja .Lschedule_256
+ je .Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+ movl $10,%esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,%xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp .Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_mangle_last:
+
+ leaq .Lk_deskew(%rip),%r11
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193
+ leaq .Lk_opt(%rip),%r11
+ addq $32,%rdx
+
+.Lschedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor .Lk_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm0
+ pxor %xmm0,%xmm6
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ pxor %xmm1,%xmm1
+ movhlps %xmm1,%xmm6
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15
+.byte 102,69,15,58,15,192,15
+ pxor %xmm1,%xmm7
+
+
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor .Lk_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor .Lk_s63(%rip),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+
+ leaq .Lk_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ addq $-16,%r8
+ andq $48,%r8
+ movdqu %xmm3,(%rdx)
+ .byte 0xf3,0xc3
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+
+ movl $0,%ecx
+ movl $48,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.globl vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.size vpaes_decrypt,.-vpaes_decrypt
+.globl vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+ xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc .Lcbc_abort
+ movdqu (%r8),%xmm6
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,(%r8)
+.Lcbc_abort:
+ .byte 0xf3,0xc3
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+
+
+
+
+
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ leaq .Lk_s0F(%rip),%r10
+ movdqa -32(%r10),%xmm10
+ movdqa -16(%r10),%xmm11
+ movdqa 0(%r10),%xmm9
+ movdqa 48(%r10),%xmm13
+ movdqa 64(%r10),%xmm12
+ movdqa 80(%r10),%xmm15
+ movdqa 96(%r10),%xmm14
+ .byte 0xf3,0xc3
+.size _vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type _vpaes_consts,@object
+.align 64
+_vpaes_consts:
+.Lk_inv:
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
diff --git a/main/openssl/crypto/aes/asm/vpaes-x86_64.pl b/main/openssl/crypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 00000000..41f2e46f
--- /dev/null
+++ b/main/openssl/crypto/aes/asm/vpaes-x86_64.pl
@@ -0,0 +1,1207 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original, nor does it make assumptions
+# about its alignment...
+#
+# Performance summary. The aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with a 128-bit key; the vpaes-x86_64.pl column lists
+# [also large-block] CBC encrypt/decrypt results.
+#
+# aes-x86_64.pl vpaes-x86_64.pl
+#
+# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
+# Nehalem 30.5/42.2/14.6 9.8/11.8
+# Atom 63.9/79.0/32.1 64.0/84.8(***)
+#
+# (*)	"Hyper-threading" in this context refers to cache shared
+#	among multiple cores rather than specifically to Intel HTT.
+#	As the vast majority of contemporary cores share cache, the
+#	slower code path is commonplace. In other words, the
+#	"with-hyper-threading-off" results are presented mostly for
+#	reference purposes.
+#
+# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	The less impressive improvement on Core 2 and Atom is due
+#	to slow pshufb, yet it's a respectable +40%/78% improvement
+#	on Core 2
+# (as implied, over "hyper-threading-safe" code path).
+#
+# <appro@openssl.org>
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+.text
+
+##
+## _vpaes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
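+## (vpaes_cbc_encrypt below relies on this: it keeps the IV in
+## %xmm6 and, when decrypting, the saved input block in %xmm7
+## across calls into the cores.)
+##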
+.type _vpaes_encrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core:
+ mov %rdx, %r9
+ mov \$16, %r11
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor %xmm5, %xmm2
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ lea .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa %xmm15, %xmm5 # 4 : sb2u
+ pshufb %xmm2, %xmm5 # 4 = sb2u
+ movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pxor %xmm5, %xmm2 # 2 = 2A
+ movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ movdqa %xmm0, %xmm3 # 3 = A
+ pshufb %xmm1, %xmm0 # 0 = B
+ add \$16, %r9 # next key
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb %xmm4, %xmm3 # 3 = D
+ add \$16, %r11 # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ and \$0x30, %r11 # ... mod 4
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ sub \$1,%rax # nr--
+
+.Lenc_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm5 # 2 : a/k
+ pshufb %xmm0, %xmm5 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ movdqu (%r9), %xmm5
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Lenc_loop
+
+ # middle of last round
+ movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm1, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.type _vpaes_decrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_decrypt_core:
+ mov %rdx, %r9 # load key
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ pandn %xmm0, %xmm1
+ mov %rax, %r11
+ psrld \$4, %xmm1
+ movdqu (%r9), %xmm5 # round0 key
+ shl \$4, %r11
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
+ xor \$0x30, %r11
+ lea .Lk_dsbd(%rip),%r10
+ pshufb %xmm1, %xmm0
+ and \$0x30, %r11
+ pxor %xmm5, %xmm2
+ movdqa .Lk_mc_forward+48(%rip), %xmm5
+ pxor %xmm2, %xmm0
+ add \$16, %r9
+ add %r10, %r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+##
+## Inverse mix columns
+##
+ movdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pxor %xmm0, %xmm4
+ movdqa -0x10(%r10),%xmm0 # 0 : sb9t
+ pshufb %xmm3, %xmm0 # 0 = sb9t
+ pxor %xmm4, %xmm0 # 0 = ch
+ add \$16, %r9 # next round key
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x10(%r10),%xmm0 # 0 : sbdt
+ pshufb %xmm3, %xmm0 # 0 = sbdt
+ pxor %xmm4, %xmm0 # 0 = ch
+ sub \$1,%rax # nr--
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x30(%r10),%xmm0 # 0 : sbbt
+ pshufb %xmm3, %xmm0 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa 0x50(%r10),%xmm0 # 0 : sbet
+ pshufb %xmm3, %xmm0 # 0 = sbet
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ palignr \$12, %xmm5, %xmm5
+
+.Ldec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqu (%r9), %xmm0
+ jnz .Ldec_loop
+
+ # middle of last round
+ movdqa 0x60(%r10), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor %xmm0, %xmm4 # 4 = sb1u + k
+ movdqa 0x70(%r10), %xmm0 # 0 : sbot
+ movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb %xmm2, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_schedule_core,\@abi-omnipotent
+.align 16
+_vpaes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt
+
+ call _vpaes_preheat # load the tables
+ movdqa .Lk_rcon(%rip), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqa %xmm0, %xmm3
+ lea .Lk_ipt(%rip), %r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0, %xmm7
+
+ lea .Lk_sr(%rip),%r10
+ test %rcx, %rcx
+ jnz .Lschedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqu %xmm0, (%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm3
+ movdqu %xmm3, (%rdx)
+ xor \$0x30, %r8
+
+.Lschedule_go:
+ cmp \$192, %esi
+ ja .Lschedule_256
+ je .Lschedule_192
+	# 128: fall through
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov \$10, %esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # write output
+ jmp .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call _vpaes_schedule_transform # input transform
+ movdqa %xmm0, %xmm6 # save short part
+ pxor %xmm4, %xmm4 # clear 4
+ movhlps %xmm4, %xmm6 # clobber low side with zeros
+ mov \$4, %esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+ palignr \$8,%xmm6,%xmm0
+ call _vpaes_schedule_mangle # save key n
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle # save key n+1
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle # save key n+2
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
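A quick round-key accounting for the loop above, as a hypothetical
self-check: the zeroth key is written before .Lschedule_go, each trip
through .Loop_schedule_192 emits three keys, and the last key of the final
trip comes from .Lschedule_mangle_last.

    #include <assert.h>

    /* hypothetical self-check, not part of the generator */
    static void check_192_key_count(void)
    {
        int nr = 192/32 + 6;        /* 12 rounds              */
        int keys_needed = nr + 1;   /* 13 round keys          */
        int loops = 4;              /* "mov \$4, %esi" above  */
        assert(1 + 3*loops == keys_needed);
    }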
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call _vpaes_schedule_transform # input transform
+ mov \$7, %esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call _vpaes_schedule_round
+ dec %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd \$0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Loop_schedule_256
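The same accounting holds for this path: the zeroth key precedes the loop,
each trip saves two keys (one low, one high), and the fifteenth is written
by .Lschedule_mangle_last. A hypothetical self-check:

    #include <assert.h>

    static void check_256_key_count(void)
    {
        int nr = 256/32 + 6;        /* 14 rounds              */
        int keys_needed = nr + 1;   /* 15 round keys          */
        int loops = 7;              /* "mov \$7, %esi" above  */
        assert(1 + 2*loops == keys_needed);
    }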
+
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 16
+.Lschedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_last_dec
+
+ # encrypting
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1, %xmm0 # output permute
+ lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add \$32, %rdx
+
+.Lschedule_mangle_last_dec:
+ add \$-16, %rdx
+ pxor .Lk_s63(%rip), %xmm0
+ call _vpaes_schedule_transform # output transform
+ movdqu %xmm0, (%rdx) # save last key
+
+ # cleanup
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.type _vpaes_schedule_192_smear,\@abi-omnipotent
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
+ pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm0, %xmm6 # -> b+c+d b+c b a
+ movdqa %xmm6, %xmm0
+ pxor %xmm1, %xmm1
+ movhlps %xmm1, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
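The same smear, restated with SSE2 intrinsics; a sketch under the comment's
lane notation (high dword written leftmost), not part of the generator:

    #include <emmintrin.h>

    /* *x6 in/out = low side (%xmm6), x7 = high side (%xmm7);
       returns the %xmm0 result */
    static __m128i schedule_192_smear(__m128i *x6, __m128i x7)
    {
        __m128i t;
        t   = _mm_shuffle_epi32(*x6, 0x80); /* d c 0 0 -> c 0 0 0 */
        *x6 = _mm_xor_si128(*x6, t);        /* -> c+d c 0 0       */
        t   = _mm_shuffle_epi32(x7, 0xFE);  /* b a _ _ -> b b b a */
        *x6 = _mm_xor_si128(*x6, t);        /* -> b+c+d b+c b a   */
        t   = *x6;                          /* full result        */
        *x6 = _mm_unpackhi_epi64(_mm_setzero_si128(), *x6);
                                            /* zero the low side  */
        return t;
    }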
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type _vpaes_schedule_round,\@abi-omnipotent
+.align 16
+_vpaes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr \$15, %xmm8, %xmm1
+ palignr \$15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd \$0xFF, %xmm0, %xmm0
+ palignr \$1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq \$8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%rip), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa %xmm13, %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa %xmm12, %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
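The two pslldq/pxor pairs in the "smear xmm7" step implement a running xor
across the dwords of %xmm7; per lane it is equivalent to this scalar sketch
(illustrative):

    #include <stdint.h>

    static void smear_xmm7(uint32_t w[4])   /* w[0] = low dword */
    {
        w[1] ^= w[0];
        w[2] ^= w[1];
        w[3] ^= w[2];  /* each dword ends up xored with all below it */
    }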
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,\@abi-omnipotent
+.align 16
+_vpaes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld \$4, %xmm1
+ pand %xmm9, %xmm0
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
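Per byte, the transform is two 16-entry nibble lookups xored together; a
scalar sketch, with lo and hi standing for the tables at (%r11) and
16(%r11):

    #include <stdint.h>

    static uint8_t transform_byte(const uint8_t lo[16],
                                  const uint8_t hi[16], uint8_t x)
    {
        return lo[x & 0x0F] ^ hi[x >> 4];
    }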
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type _vpaes_schedule_mangle,\@abi-omnipotent
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ test %rcx, %rcx
+ jnz .Lschedule_mangle_dec
+
+ # encrypting
+ add \$16, %rdx
+ pxor .Lk_s63(%rip),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+ # inverse mix columns
+ lea .Lk_dksd(%rip),%r11
+ movdqa %xmm9, %xmm1
+ pandn %xmm4, %xmm1
+ psrld \$4, %xmm1 # 1 = hi
+ pand %xmm9, %xmm4 # 4 = lo
+
+ movdqa 0x00(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ movdqa 0x10(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x20(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x30(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x40(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x50(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa 0x60(%r11), %xmm2
+ pshufb %xmm4, %xmm2
+ pxor %xmm3, %xmm2
+ movdqa 0x70(%r11), %xmm3
+ pshufb %xmm1, %xmm3
+ pxor %xmm2, %xmm3
+
+ add \$-16, %rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10),%xmm1
+ pshufb %xmm1,%xmm3
+ add \$-16, %r8
+ and \$0x30, %r8
+ movdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
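On the encrypt side, the three pshufb/pxor pairs multiply each 4-byte
column by the circulant (0,1,1,1), i.e. they xor the three nontrivial byte
rotations of the column. A scalar sketch per column (rotation direction
follows the .Lk_mc_forward table; illustrative only):

    #include <stdint.h>

    static uint32_t rotr8(uint32_t c) { return (c >> 8) | (c << 24); }

    static uint32_t mangle_enc_column(uint32_t c) /* c already ^0x63-skewed */
    {
        uint32_t r1 = rotr8(c), r2 = rotr8(r1), r3 = rotr8(r2);
        return r1 ^ r2 ^ r3;  /* circulant (0,1,1,1): c itself absent */
    }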
+
+#
+# Interface to OpenSSL
+#
+.globl ${PREFIX}_set_encrypt_key
+.type ${PREFIX}_set_encrypt_key,\@function,3
+.align 16
+${PREFIX}_set_encrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov \$0,%ecx
+ mov \$0x30,%r8d
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
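Note the rounds value stored here, nbits/32+5, is one less than the
textbook round count: the cores peel the last round off into their own
last-round path, so the stored value counts only the looped rounds. A
hypothetical sanity check:

    #include <assert.h>

    static void check_vpaes_rounds(void)
    {
        assert(128/32 + 5 ==  9);  /* AES-128: 10 rounds, 9 looped */
        assert(192/32 + 5 == 11);  /* AES-192: 12 rounds           */
        assert(256/32 + 5 == 13);  /* AES-256: 14 rounds           */
    }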
+
+.globl ${PREFIX}_set_decrypt_key
+.type ${PREFIX}_set_decrypt_key,\@function,3
+.align 16
+${PREFIX}_set_decrypt_key:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+ mov %esi,%eax
+ shr \$5,%eax
+ add \$5,%eax
+ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ shl \$4,%eax
+ lea 16(%rdx,%rax),%rdx
+
+ mov \$1,%ecx
+ mov %esi,%r8d
+ shr \$1,%r8d
+ and \$32,%r8d
+ xor \$32,%r8d # nbits==192?0:32
+ call _vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+ xor %eax,%eax
+ ret
+.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
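The shr/and/xor sequence above computes the initial shiftrows index that
seeds %r8 for _vpaes_schedule_mangle; spelled out as a hypothetical helper:

    static int dec_initial_rotation(int nbits)
    {
        return ((nbits >> 1) & 32) ^ 32;  /* 192 -> 0, 128/256 -> 32 */
    }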
+
+.globl ${PREFIX}_encrypt
+.type ${PREFIX}_encrypt,\@function,3
+.align 16
+${PREFIX}_encrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl ${PREFIX}_decrypt
+.type ${PREFIX}_decrypt,\@function,3
+.align 16
+${PREFIX}_decrypt:
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+ ret
+.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+# size_t length, const AES_KEY *key,
+# unsigned char *ivp,const int enc);
+$code.=<<___;
+.globl ${PREFIX}_cbc_encrypt
+.type ${PREFIX}_cbc_encrypt,\@function,6
+.align 16
+${PREFIX}_cbc_encrypt:
+ xchg $key,$len
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+ sub \$16,$len
+ jc .Lcbc_abort
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+ movdqu ($ivp),%xmm6 # load IV
+ sub $inp,$out
+ call _vpaes_preheat
+ cmp \$0,${enc}d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu ($inp),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu ($inp),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,($out,$inp)
+ lea 16($inp),$inp
+ sub \$16,$len
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,($ivp) # save IV
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+.Lcbc_abort:
+ ret
+.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
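The two loops implement textbook CBC: C[i] = E(P[i] ^ C[i-1]) on encrypt
and P[i] = D(C[i]) ^ C[i-1] on decrypt, with %xmm6 carrying the chain value
and %xmm7 preserving the ciphertext across the in-place decrypt. A scalar
sketch of the decrypt step (decrypt_block stands in for
_vpaes_decrypt_core):

    #include <stdint.h>
    #include <string.h>

    static void cbc_dec_block(uint8_t blk[16], uint8_t iv[16],
                              void (*decrypt_block)(uint8_t block[16]))
    {
        uint8_t saved[16];
        memcpy(saved, blk, 16);      /* movdqa %xmm0,%xmm7        */
        decrypt_block(blk);          /* call _vpaes_decrypt_core  */
        for (int i = 0; i < 16; i++)
            blk[i] ^= iv[i];         /* pxor %xmm6,%xmm0          */
        memcpy(iv, saved, 16);       /* movdqa %xmm7,%xmm6        */
    }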
+$code.=<<___;
+##
+## _vpaes_preheat
+##
+## Fills %r10 with a pointer to the constant tables (RIP-relative, so
+## the code stays position-independent for -fPIC builds) and loads
+## %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_preheat,\@abi-omnipotent
+.align 16
+_vpaes_preheat:
+ lea .Lk_s0F(%rip), %r10
+ movdqa -0x20(%r10), %xmm10 # .Lk_inv
+ movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
+ movdqa 0x00(%r10), %xmm9 # .Lk_s0F
+ movdqa 0x30(%r10), %xmm13 # .Lk_sb1
+ movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
+ movdqa 0x50(%r10), %xmm15 # .Lk_sb2
+ movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+.type _vpaes_consts,\@object
+.align 64
+_vpaes_consts:
+.Lk_inv: # inv, inva
+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F: # s0F
+ .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt: # input transform (lo, hi)
+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1: # sb1u, sb1t
+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2: # sb2u, sb2t
+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo: # sbou, sbot
+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward: # mc_forward
+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr: # sr
+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08
+ .quad 0x0F060D040B020900, 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon: # rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63: # s63: all equal to 0x63 transformed
+ .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt: # output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew: # deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+.Lk_dksd: # decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: # decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: # decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+## Decryption stuff
+## Round function constants
+##
+.Lk_dipt: # decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9: # decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: # decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: # decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: # decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo: # decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 16(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xb8(%rax),%rax # adjust stack pointer
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_${PREFIX}_set_encrypt_key
+ .rva .LSEH_info_${PREFIX}_set_encrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_${PREFIX}_set_decrypt_key
+ .rva .LSEH_info_${PREFIX}_set_decrypt_key
+
+ .rva .LSEH_begin_${PREFIX}_encrypt
+ .rva .LSEH_end_${PREFIX}_encrypt
+ .rva .LSEH_info_${PREFIX}_encrypt
+
+ .rva .LSEH_begin_${PREFIX}_decrypt
+ .rva .LSEH_end_${PREFIX}_decrypt
+ .rva .LSEH_info_${PREFIX}_decrypt
+
+ .rva .LSEH_begin_${PREFIX}_cbc_encrypt
+ .rva .LSEH_end_${PREFIX}_cbc_encrypt
+ .rva .LSEH_info_${PREFIX}_cbc_encrypt
+
+.section .xdata
+.align 8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/arm_arch.h b/main/openssl/crypto/arm_arch.h
new file mode 100644
index 00000000..5a831076
--- /dev/null
+++ b/main/openssl/crypto/arm_arch.h
@@ -0,0 +1,51 @@
+#ifndef __ARM_ARCH_H__
+#define __ARM_ARCH_H__
+
+#if !defined(__ARM_ARCH__)
+# if defined(__CC_ARM)
+# define __ARM_ARCH__ __TARGET_ARCH_ARM
+# if defined(__BIG_ENDIAN)
+# define __ARMEB__
+# else
+# define __ARMEL__
+# endif
+# elif defined(__GNUC__)
+ /*
+ * Why doesn't gcc define __ARM_ARCH__? Instead it defines
+	 * a bunch of the macros below. See the all_architectures[] table in
+ * gcc/config/arm/arm.c. On a side note it defines
+ * __ARMEL__/__ARMEB__ for little-/big-endian.
+ */
+# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
+ defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
+ defined(__ARM_ARCH_7EM__)
+# define __ARM_ARCH__ 7
+# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+ defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
+ defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
+ defined(__ARM_ARCH_6T2__)
+# define __ARM_ARCH__ 6
+# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
+ defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
+ defined(__ARM_ARCH_5TEJ__)
+# define __ARM_ARCH__ 5
+# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+# define __ARM_ARCH__ 4
+# else
+# error "unsupported ARM architecture"
+# endif
+# endif
+#endif
+
+#ifdef OPENSSL_FIPSCANISTER
+#include <openssl/fipssyms.h>
+#endif
+
+#if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+
+#define ARMV7_NEON (1<<0)
+#define ARMV7_TICK (1<<1)
+#endif
+
+#endif
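Typical use of these declarations from C (illustrative; OPENSSL_armcap_P
itself is populated at startup elsewhere in crypto/):

    #include "arm_arch.h"

    static int have_neon(void)
    {
        return (OPENSSL_armcap_P & ARMV7_NEON) != 0;
    }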
diff --git a/main/openssl/crypto/armv4cpuid.S b/main/openssl/crypto/armv4cpuid.S
new file mode 100644
index 00000000..2d618dea
--- /dev/null
+++ b/main/openssl/crypto/armv4cpuid.S
@@ -0,0 +1,154 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ .word 0xf26ee1fe @ vorr q15,q15,q15
+ .word 0xe12fff1e @ bx lr
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+ mrc p15,0,r0,c9,c13,0
+ .word 0xe12fff1e @ bx lr
+.size _armv7_tick,.-_armv7_tick
+
+.global OPENSSL_atomic_add
+.type OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd: ldrex r2,[r0]
+ add r3,r2,r1
+ strex r2,r3,[r0]
+ cmp r2,#0
+ bne .Ladd
+ mov r0,r3
+ .word 0xe12fff1e @ bx lr
+#else
+ stmdb sp!,{r4-r6,lr}
+ ldr r2,.Lspinlock
+ adr r3,.Lspinlock
+ mov r4,r0
+ mov r5,r1
+ add r6,r3,r2 @ &spinlock
+ b .+8
+.Lspin: bl sched_yield
+ mov r0,#-1
+ swp r0,r0,[r6]
+ cmp r0,#0
+ bne .Lspin
+
+ ldr r2,[r4]
+ add r2,r2,r5
+ str r2,[r4]
+ str r0,[r6] @ release spinlock
+ ldmia sp!,{r4-r6,lr}
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
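On ARMv6 and later, the ldrex/strex retry loop above is exactly what a
compiler emits for an atomic add-and-fetch; in C it amounts to this
(illustrative):

    static int atomic_add_sketch(int *addr, int amount)
    {
        return __sync_add_and_fetch(addr, amount);  /* GCC builtin */
    }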
+
+.global OPENSSL_cleanse
+.type OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+ eor ip,ip,ip
+ cmp r1,#7
+ subhs r1,r1,#4
+ bhs .Lot
+ cmp r1,#0
+ beq .Lcleanse_done
+.Little:
+ strb ip,[r0],#1
+ subs r1,r1,#1
+ bhi .Little
+ b .Lcleanse_done
+
+.Lot: tst r0,#3
+ beq .Laligned
+ strb ip,[r0],#1
+ sub r1,r1,#1
+ b .Lot
+.Laligned:
+ str ip,[r0],#4
+ subs r1,r1,#4
+ bhs .Laligned
+ adds r1,r1,#4
+ bne .Little
+.Lcleanse_done:
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
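The branchy assembly above reduces to the usual head/body/tail zeroing
pattern; a scalar sketch (illustrative, ignoring the small-length
shortcut):

    #include <stddef.h>
    #include <stdint.h>

    static void cleanse_sketch(unsigned char *p, size_t len)
    {
        while (len > 0 && ((uintptr_t)p & 3)) { *p++ = 0; len--; }     /* .Lot      */
        while (len >= 4) { *(unsigned int *)p = 0; p += 4; len -= 4; } /* .Laligned */
        while (len > 0) { *p++ = 0; len--; }                           /* .Little   */
    }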
+
+.global OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+ ldr r0,.LOPENSSL_armcap
+ adr r1,.LOPENSSL_armcap
+ ldr r0,[r1,r0]
+ eor r2,r2,r2
+ eor r3,r3,r3
+ eor ip,ip,ip
+ tst r0,#1
+ beq .Lwipe_done
+ .word 0xf3000150 @ veor q0, q0, q0
+ .word 0xf3022152 @ veor q1, q1, q1
+ .word 0xf3044154 @ veor q2, q2, q2
+ .word 0xf3066156 @ veor q3, q3, q3
+ .word 0xf34001f0 @ veor q8, q8, q8
+ .word 0xf34221f2 @ veor q9, q9, q9
+ .word 0xf34441f4 @ veor q10, q10, q10
+ .word 0xf34661f6 @ veor q11, q11, q11
+ .word 0xf34881f8 @ veor q12, q12, q12
+ .word 0xf34aa1fa @ veor q13, q13, q13
+ .word 0xf34cc1fc @ veor q14, q14, q14
+ .word 0xf34ee1fe @ veor q15, q15, q15
+.Lwipe_done:
+ mov r0,sp
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global OPENSSL_instrument_bus
+.type OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+ eor r0,r0,r0
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global OPENSSL_instrument_bus2
+.type OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+ eor r0,r0,r0
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.LOPENSSL_armcap
+#if __ARM_ARCH__>=6
+.align 5
+#else
+.Lspinlock:
+.word atomic_add_spinlock-.Lspinlock
+.align 5
+
+.data
+.align 2
+atomic_add_spinlock:
+.word 0
+#endif
+
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
diff --git a/main/openssl/crypto/asn1/a_d2i_fp.c b/main/openssl/crypto/asn1/a_d2i_fp.c
index ece40bc4..52b2ebdb 100644
--- a/main/openssl/crypto/asn1/a_d2i_fp.c
+++ b/main/openssl/crypto/asn1/a_d2i_fp.c
@@ -57,6 +57,7 @@
*/
#include <stdio.h>
+#include <limits.h>
#include "cryptlib.h"
#include <openssl/buffer.h>
#include <openssl/asn1_mac.h>
@@ -143,17 +144,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
BUF_MEM *b;
unsigned char *p;
int i;
- int ret=-1;
ASN1_const_CTX c;
- int want=HEADER_SIZE;
+ size_t want=HEADER_SIZE;
int eos=0;
-#if defined(__GNUC__) && defined(__ia64)
- /* pathetic compiler bug in all known versions as of Nov. 2002 */
- long off=0;
-#else
- int off=0;
-#endif
- int len=0;
+ size_t off=0;
+ size_t len=0;
b=BUF_MEM_new();
if (b == NULL)
@@ -169,7 +164,7 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
{
want-=(len-off);
- if (!BUF_MEM_grow_clean(b,len+want))
+ if (len + want < len || !BUF_MEM_grow_clean(b,len+want))
{
ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
goto err;
@@ -181,7 +176,14 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
goto err;
}
if (i > 0)
+ {
+ if (len+i < len)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
len+=i;
+ }
}
/* else data already loaded */
@@ -206,6 +208,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
{
/* no data body so go round again */
eos++;
+ if (eos < 0)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_HEADER_TOO_LONG);
+ goto err;
+ }
want=HEADER_SIZE;
}
else if (eos && (c.slen == 0) && (c.tag == V_ASN1_EOC))
@@ -220,10 +227,16 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
else
{
/* suck in c.slen bytes of data */
- want=(int)c.slen;
+ want=c.slen;
if (want > (len-off))
{
want-=(len-off);
+ if (want > INT_MAX /* BIO_read takes an int length */ ||
+ len+want < len)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
if (!BUF_MEM_grow_clean(b,len+want))
{
ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
@@ -238,11 +251,18 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
ASN1_R_NOT_ENOUGH_DATA);
goto err;
}
+ /* This can't overflow because
+ * |len+want| didn't overflow. */
len+=i;
- want -= i;
+ want-=i;
}
}
- off+=(int)c.slen;
+ if (off + c.slen < off)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
+ off+=c.slen;
if (eos <= 0)
{
break;
@@ -252,9 +272,15 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
}
}
+ if (off > INT_MAX)
+ {
+ ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+ goto err;
+ }
+
*pb = b;
return off;
err:
if (b != NULL) BUF_MEM_free(b);
- return(ret);
+ return -1;
}
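The guard pattern added throughout this function relies on unsigned
wraparound being well defined: for size_t, a + b < a holds exactly when the
addition overflowed. As a standalone sketch:

    #include <stddef.h>

    static int add_overflows(size_t a, size_t b)
    {
        return a + b < a;  /* wraps modulo 2^N, so true iff overflow */
    }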
diff --git a/main/openssl/crypto/asn1/a_digest.c b/main/openssl/crypto/asn1/a_digest.c
index d00d9e22..cbdeea6a 100644
--- a/main/openssl/crypto/asn1/a_digest.c
+++ b/main/openssl/crypto/asn1/a_digest.c
@@ -87,7 +87,8 @@ int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data,
p=str;
i2d(data,&p);
- EVP_Digest(str, i, md, len, type, NULL);
+ if (!EVP_Digest(str, i, md, len, type, NULL))
+ return 0;
OPENSSL_free(str);
return(1);
}
@@ -104,7 +105,8 @@ int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn,
i=ASN1_item_i2d(asn,&str, it);
if (!str) return(0);
- EVP_Digest(str, i, md, len, type, NULL);
+ if (!EVP_Digest(str, i, md, len, type, NULL))
+ return 0;
OPENSSL_free(str);
return(1);
}
diff --git a/main/openssl/crypto/asn1/a_int.c b/main/openssl/crypto/asn1/a_int.c
index 3348b876..297c45a9 100644
--- a/main/openssl/crypto/asn1/a_int.c
+++ b/main/openssl/crypto/asn1/a_int.c
@@ -116,7 +116,7 @@ int i2c_ASN1_INTEGER(ASN1_INTEGER *a, unsigned char **pp)
int pad=0,ret,i,neg;
unsigned char *p,*n,pb=0;
- if ((a == NULL) || (a->data == NULL)) return(0);
+ if (a == NULL) return(0);
neg=a->type & V_ASN1_NEG;
if (a->length == 0)
ret=1;
@@ -386,8 +386,8 @@ long ASN1_INTEGER_get(const ASN1_INTEGER *a)
if (a->length > (int)sizeof(long))
{
- /* hmm... a bit ugly */
- return(0xffffffffL);
+ /* hmm... a bit ugly, return all ones */
+ return -1;
}
if (a->data == NULL)
return 0;
diff --git a/main/openssl/crypto/asn1/a_sign.c b/main/openssl/crypto/asn1/a_sign.c
index ff63bfc7..7b4a193d 100644
--- a/main/openssl/crypto/asn1/a_sign.c
+++ b/main/openssl/crypto/asn1/a_sign.c
@@ -184,9 +184,9 @@ int ASN1_sign(i2d_of_void *i2d, X509_ALGOR *algor1, X509_ALGOR *algor2,
p=buf_in;
i2d(data,&p);
- EVP_SignInit_ex(&ctx,type, NULL);
- EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
- if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
+ if (!EVP_SignInit_ex(&ctx,type, NULL)
+ || !EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl)
+ || !EVP_SignFinal(&ctx,(unsigned char *)buf_out,
(unsigned int *)&outl,pkey))
{
outl=0;
@@ -218,65 +218,100 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
const EVP_MD *type)
{
EVP_MD_CTX ctx;
+ EVP_MD_CTX_init(&ctx);
+ if (!EVP_DigestSignInit(&ctx, NULL, type, NULL, pkey))
+ {
+ EVP_MD_CTX_cleanup(&ctx);
+ return 0;
+ }
+ return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx);
+ }
+
+
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+ X509_ALGOR *algor1, X509_ALGOR *algor2,
+ ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx)
+ {
+ const EVP_MD *type;
+ EVP_PKEY *pkey;
unsigned char *buf_in=NULL,*buf_out=NULL;
- int inl=0,outl=0,outll=0;
+ size_t inl=0,outl=0,outll=0;
int signid, paramtype;
+ int rv;
+
+ type = EVP_MD_CTX_md(ctx);
+ pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx);
- if (type == NULL)
+ if (!type || !pkey)
{
- int def_nid;
- if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0)
- type = EVP_get_digestbynid(def_nid);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_CONTEXT_NOT_INITIALISED);
+ return 0;
}
- if (type == NULL)
+ if (pkey->ameth->item_sign)
{
- ASN1err(ASN1_F_ASN1_ITEM_SIGN, ASN1_R_NO_DEFAULT_DIGEST);
- return 0;
+ rv = pkey->ameth->item_sign(ctx, it, asn, algor1, algor2,
+ signature);
+ if (rv == 1)
+ outl = signature->length;
+ /* Return value meanings:
+ * <=0: error.
+ * 1: method does everything.
+ * 2: carry on as normal.
+ * 3: ASN1 method sets algorithm identifiers: just sign.
+ */
+ if (rv <= 0)
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB);
+ if (rv <= 1)
+ goto err;
}
+ else
+ rv = 2;
- if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
+ if (rv == 2)
{
- if (!pkey->ameth ||
- !OBJ_find_sigid_by_algs(&signid, EVP_MD_nid(type),
- pkey->ameth->pkey_id))
+ if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
{
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,
- ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
- return 0;
+ if (!pkey->ameth ||
+ !OBJ_find_sigid_by_algs(&signid,
+ EVP_MD_nid(type),
+ pkey->ameth->pkey_id))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,
+ ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+ return 0;
+ }
}
- }
- else
- signid = type->pkey_type;
+ else
+ signid = type->pkey_type;
- if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
- paramtype = V_ASN1_NULL;
- else
- paramtype = V_ASN1_UNDEF;
+ if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
+ paramtype = V_ASN1_NULL;
+ else
+ paramtype = V_ASN1_UNDEF;
- if (algor1)
- X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
- if (algor2)
- X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor1)
+ X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
+ if (algor2)
+ X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+
+ }
- EVP_MD_CTX_init(&ctx);
inl=ASN1_item_i2d(asn,&buf_in, it);
outll=outl=EVP_PKEY_size(pkey);
- buf_out=(unsigned char *)OPENSSL_malloc((unsigned int)outl);
+ buf_out=OPENSSL_malloc((unsigned int)outl);
if ((buf_in == NULL) || (buf_out == NULL))
{
outl=0;
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_MALLOC_FAILURE);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_MALLOC_FAILURE);
goto err;
}
- EVP_SignInit_ex(&ctx,type, NULL);
- EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
- if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
- (unsigned int *)&outl,pkey))
+ if (!EVP_DigestSignUpdate(ctx, buf_in, inl)
+ || !EVP_DigestSignFinal(ctx, buf_out, &outl))
{
outl=0;
- ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_EVP_LIB);
+ ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_EVP_LIB);
goto err;
}
if (signature->data != NULL) OPENSSL_free(signature->data);
@@ -289,7 +324,7 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
signature->flags&= ~(ASN1_STRING_FLAG_BITS_LEFT|0x07);
signature->flags|=ASN1_STRING_FLAG_BITS_LEFT;
err:
- EVP_MD_CTX_cleanup(&ctx);
+ EVP_MD_CTX_cleanup(ctx);
if (buf_in != NULL)
{ OPENSSL_cleanse((char *)buf_in,(unsigned int)inl); OPENSSL_free(buf_in); }
if (buf_out != NULL)
diff --git a/main/openssl/crypto/asn1/a_strex.c b/main/openssl/crypto/asn1/a_strex.c
index 264ebf23..ead37ac3 100644
--- a/main/openssl/crypto/asn1/a_strex.c
+++ b/main/openssl/crypto/asn1/a_strex.c
@@ -567,6 +567,7 @@ int ASN1_STRING_to_UTF8(unsigned char **out, ASN1_STRING *in)
if(mbflag == -1) return -1;
mbflag |= MBSTRING_FLAG;
stmp.data = NULL;
+ stmp.length = 0;
ret = ASN1_mbstring_copy(&str, in->data, in->length, mbflag, B_ASN1_UTF8STRING);
if(ret < 0) return ret;
*out = stmp.data;
diff --git a/main/openssl/crypto/asn1/a_verify.c b/main/openssl/crypto/asn1/a_verify.c
index cecdb13c..fc84cd3d 100644
--- a/main/openssl/crypto/asn1/a_verify.c
+++ b/main/openssl/crypto/asn1/a_verify.c
@@ -101,8 +101,13 @@ int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature,
p=buf_in;
i2d(data,&p);
- EVP_VerifyInit_ex(&ctx,type, NULL);
- EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+ if (!EVP_VerifyInit_ex(&ctx,type, NULL)
+ || !EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl))
+ {
+ ASN1err(ASN1_F_ASN1_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
OPENSSL_cleanse(buf_in,(unsigned int)inl);
OPENSSL_free(buf_in);
@@ -126,16 +131,21 @@ err:
#endif
-int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signature,
- void *asn, EVP_PKEY *pkey)
+int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a,
+ ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey)
{
EVP_MD_CTX ctx;
- const EVP_MD *type = NULL;
unsigned char *buf_in=NULL;
int ret= -1,inl;
int mdnid, pknid;
+ if (!pkey)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY, ERR_R_PASSED_NULL_PARAMETER);
+ return -1;
+ }
+
EVP_MD_CTX_init(&ctx);
/* Convert signature OID into digest and public key OIDs */
@@ -144,25 +154,47 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
goto err;
}
- type=EVP_get_digestbynid(mdnid);
- if (type == NULL)
+ if (mdnid == NID_undef)
{
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
- goto err;
+ if (!pkey->ameth || !pkey->ameth->item_verify)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
+ goto err;
+ }
+ ret = pkey->ameth->item_verify(&ctx, it, asn, a,
+ signature, pkey);
+ /* Return value of 2 means carry on, anything else means we
+	 * exit straight away: either a fatal error occurred or the
+	 * underlying verification routine handles all verification.
+ */
+ if (ret != 2)
+ goto err;
+ ret = -1;
}
-
- /* Check public key OID matches public key type */
- if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+ else
{
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
- goto err;
- }
+ const EVP_MD *type;
+ type=EVP_get_digestbynid(mdnid);
+ if (type == NULL)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+ goto err;
+ }
+
+ /* Check public key OID matches public key type */
+ if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
+ goto err;
+ }
+
+ if (!EVP_DigestVerifyInit(&ctx, NULL, type, NULL, pkey))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
- if (!EVP_VerifyInit_ex(&ctx,type, NULL))
- {
- ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
- ret=0;
- goto err;
}
inl = ASN1_item_i2d(asn, &buf_in, it);
@@ -173,13 +205,18 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
goto err;
}
- EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+ if (!EVP_DigestVerifyUpdate(&ctx,buf_in,inl))
+ {
+ ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+ ret=0;
+ goto err;
+ }
OPENSSL_cleanse(buf_in,(unsigned int)inl);
OPENSSL_free(buf_in);
- if (EVP_VerifyFinal(&ctx,(unsigned char *)signature->data,
- (unsigned int)signature->length,pkey) <= 0)
+ if (EVP_DigestVerifyFinal(&ctx,signature->data,
+ (size_t)signature->length) <= 0)
{
ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
ret=0;
diff --git a/main/openssl/crypto/asn1/ameth_lib.c b/main/openssl/crypto/asn1/ameth_lib.c
index 5a581b90..a19e058f 100644
--- a/main/openssl/crypto/asn1/ameth_lib.c
+++ b/main/openssl/crypto/asn1/ameth_lib.c
@@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
/* Keep this sorted in type order !! */
static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
@@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
#ifndef OPENSSL_NO_EC
&eckey_asn1_meth,
#endif
- &hmac_asn1_meth
+ &hmac_asn1_meth,
+ &cmac_asn1_meth
};
typedef int sk_cmp_fn_type(const char * const *a, const char * const *b);
@@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
if (!ameth)
return NULL;
+ memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD));
+
ameth->pkey_id = id;
ameth->pkey_base_id = id;
ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
@@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
ameth->old_priv_encode = 0;
ameth->old_priv_decode = 0;
+ ameth->item_verify = 0;
+ ameth->item_sign = 0;
+
ameth->pkey_size = 0;
ameth->pkey_bits = 0;
@@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst,
dst->pkey_free = src->pkey_free;
dst->pkey_ctrl = src->pkey_ctrl;
+ dst->item_sign = src->item_sign;
+ dst->item_verify = src->item_verify;
+
}
void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)
diff --git a/main/openssl/crypto/asn1/asn1.h b/main/openssl/crypto/asn1/asn1.h
index 59540e4e..220a0c8c 100644
--- a/main/openssl/crypto/asn1/asn1.h
+++ b/main/openssl/crypto/asn1/asn1.h
@@ -235,7 +235,7 @@ typedef struct asn1_object_st
*/
#define ASN1_STRING_FLAG_MSTRING 0x040
/* This is the base type that holds just about everything :-) */
-typedef struct asn1_string_st
+struct asn1_string_st
{
int length;
int type;
@@ -245,7 +245,7 @@ typedef struct asn1_string_st
* input data has a non-zero 'unused bits' value, it will be
* handled correctly */
long flags;
- } ASN1_STRING;
+ };
/* ASN1_ENCODING structure: this is used to save the received
* encoding of an ASN1 type. This is useful to get round
@@ -293,7 +293,6 @@ DECLARE_STACK_OF(ASN1_STRING_TABLE)
* see asn1t.h
*/
typedef struct ASN1_TEMPLATE_st ASN1_TEMPLATE;
-typedef struct ASN1_ITEM_st ASN1_ITEM;
typedef struct ASN1_TLC_st ASN1_TLC;
/* This is just an opaque pointer */
typedef struct ASN1_VALUE_st ASN1_VALUE;
@@ -1194,6 +1193,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_F_ASN1_ITEM_I2D_FP 193
#define ASN1_F_ASN1_ITEM_PACK 198
#define ASN1_F_ASN1_ITEM_SIGN 195
+#define ASN1_F_ASN1_ITEM_SIGN_CTX 220
#define ASN1_F_ASN1_ITEM_UNPACK 199
#define ASN1_F_ASN1_ITEM_VERIFY 197
#define ASN1_F_ASN1_MBSTRING_NCOPY 122
@@ -1266,6 +1266,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_F_PKCS5_PBE2_SET_IV 167
#define ASN1_F_PKCS5_PBE_SET 202
#define ASN1_F_PKCS5_PBE_SET0_ALGOR 215
+#define ASN1_F_PKCS5_PBKDF2_SET 219
#define ASN1_F_SMIME_READ_ASN1 212
#define ASN1_F_SMIME_TEXT 213
#define ASN1_F_X509_CINF_NEW 168
@@ -1291,6 +1292,7 @@ void ERR_load_ASN1_strings(void);
#define ASN1_R_BOOLEAN_IS_WRONG_LENGTH 106
#define ASN1_R_BUFFER_TOO_SMALL 107
#define ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER 108
+#define ASN1_R_CONTEXT_NOT_INITIALISED 217
#define ASN1_R_DATA_IS_WRONG 109
#define ASN1_R_DECODE_ERROR 110
#define ASN1_R_DECODING_ERROR 111
diff --git a/main/openssl/crypto/asn1/asn1_err.c b/main/openssl/crypto/asn1/asn1_err.c
index 6e04d08f..1a30bf11 100644
--- a/main/openssl/crypto/asn1/asn1_err.c
+++ b/main/openssl/crypto/asn1/asn1_err.c
@@ -1,6 +1,6 @@
/* crypto/asn1/asn1_err.c */
/* ====================================================================
- * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -107,6 +107,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
{ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_FP), "ASN1_item_i2d_fp"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_PACK), "ASN1_item_pack"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN), "ASN1_item_sign"},
+{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN_CTX), "ASN1_item_sign_ctx"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_UNPACK), "ASN1_item_unpack"},
{ERR_FUNC(ASN1_F_ASN1_ITEM_VERIFY), "ASN1_item_verify"},
{ERR_FUNC(ASN1_F_ASN1_MBSTRING_NCOPY), "ASN1_mbstring_ncopy"},
@@ -179,6 +180,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
{ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_IV), "PKCS5_pbe2_set_iv"},
{ERR_FUNC(ASN1_F_PKCS5_PBE_SET), "PKCS5_pbe_set"},
{ERR_FUNC(ASN1_F_PKCS5_PBE_SET0_ALGOR), "PKCS5_pbe_set0_algor"},
+{ERR_FUNC(ASN1_F_PKCS5_PBKDF2_SET), "PKCS5_pbkdf2_set"},
{ERR_FUNC(ASN1_F_SMIME_READ_ASN1), "SMIME_read_ASN1"},
{ERR_FUNC(ASN1_F_SMIME_TEXT), "SMIME_text"},
{ERR_FUNC(ASN1_F_X509_CINF_NEW), "X509_CINF_NEW"},
@@ -207,6 +209,7 @@ static ERR_STRING_DATA ASN1_str_reasons[]=
{ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH),"boolean is wrong length"},
{ERR_REASON(ASN1_R_BUFFER_TOO_SMALL) ,"buffer too small"},
{ERR_REASON(ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER),"cipher has no object identifier"},
+{ERR_REASON(ASN1_R_CONTEXT_NOT_INITIALISED),"context not initialised"},
{ERR_REASON(ASN1_R_DATA_IS_WRONG) ,"data is wrong"},
{ERR_REASON(ASN1_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(ASN1_R_DECODING_ERROR) ,"decoding error"},
diff --git a/main/openssl/crypto/asn1/asn1_locl.h b/main/openssl/crypto/asn1/asn1_locl.h
index 5aa65e28..9fcf0d95 100644
--- a/main/openssl/crypto/asn1/asn1_locl.h
+++ b/main/openssl/crypto/asn1/asn1_locl.h
@@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st
int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
ASN1_PCTX *pctx);
+ int (*sig_print)(BIO *out,
+ const X509_ALGOR *sigalg, const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx);
+
void (*pkey_free)(EVP_PKEY *pkey);
int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2);
@@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st
int (*old_priv_decode)(EVP_PKEY *pkey,
const unsigned char **pder, int derlen);
int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder);
+ /* Custom ASN1 signature verification */
+ int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *a, ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey);
+ int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *alg1, X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig);
} /* EVP_PKEY_ASN1_METHOD */;
diff --git a/main/openssl/crypto/asn1/asn_mime.c b/main/openssl/crypto/asn1/asn_mime.c
index c1d1b122..54a704a9 100644
--- a/main/openssl/crypto/asn1/asn_mime.c
+++ b/main/openssl/crypto/asn1/asn_mime.c
@@ -377,8 +377,12 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
BIO *tmpbio;
const ASN1_AUX *aux = it->funcs;
ASN1_STREAM_ARG sarg;
+ int rv = 1;
- if (!(flags & SMIME_DETACHED))
+	/* If the data is not detached, or we are resigning, then the
+	 * output BIO is already set up to finalise when it is written
+	 * through.
+ */
+ if (!(flags & SMIME_DETACHED) || (flags & PKCS7_REUSE_DIGEST))
{
SMIME_crlf_copy(data, out, flags);
return 1;
@@ -405,7 +409,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
/* Finalize structure */
if (aux->asn1_cb(ASN1_OP_DETACHED_POST, &val, it, &sarg) <= 0)
- return 0;
+ rv = 0;
/* Now remove any digests prepended to the BIO */
@@ -416,7 +420,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
sarg.ndef_bio = tmpbio;
}
- return 1;
+ return rv;
}
@@ -486,9 +490,9 @@ ASN1_VALUE *SMIME_read_ASN1(BIO *bio, BIO **bcont, const ASN1_ITEM *it)
if(strcmp(hdr->value, "application/x-pkcs7-signature") &&
strcmp(hdr->value, "application/pkcs7-signature")) {
- sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
ASN1err(ASN1_F_SMIME_READ_ASN1,ASN1_R_SIG_INVALID_MIME_TYPE);
ERR_add_error_data(2, "type: ", hdr->value);
+ sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
sk_BIO_pop_free(parts, BIO_vfree);
return NULL;
}
@@ -801,7 +805,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
if(name) {
if(!(tmpname = BUF_strdup(name))) return NULL;
for(p = tmpname ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -811,7 +815,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
if(value) {
if(!(tmpval = BUF_strdup(value))) return NULL;
for(p = tmpval ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -835,7 +839,7 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
tmpname = BUF_strdup(name);
if(!tmpname) return 0;
for(p = tmpname ; *p; p++) {
- c = *p;
+ c = (unsigned char)*p;
if(isupper(c)) {
c = tolower(c);
*p = c;
@@ -858,12 +862,17 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
static int mime_hdr_cmp(const MIME_HEADER * const *a,
const MIME_HEADER * const *b)
{
+ if (!(*a)->name || !(*b)->name)
+ return !!(*a)->name - !!(*b)->name;
+
return(strcmp((*a)->name, (*b)->name));
}
static int mime_param_cmp(const MIME_PARAM * const *a,
const MIME_PARAM * const *b)
{
+ if (!(*a)->param_name || !(*b)->param_name)
+ return !!(*a)->param_name - !!(*b)->param_name;
return(strcmp((*a)->param_name, (*b)->param_name));
}
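The !!x - !!y guard added to both comparators sorts entries with NULL names
ahead of named ones and keeps strcmp() away from NULL pointers; the idiom
in isolation (illustrative):

    static int null_first(const void *x, const void *y)
    {
        return !!x - !!y;  /* -1 if only x is NULL, 1 if only y is */
    }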
diff --git a/main/openssl/crypto/asn1/n_pkey.c b/main/openssl/crypto/asn1/n_pkey.c
index e7d04390..e2517399 100644
--- a/main/openssl/crypto/asn1/n_pkey.c
+++ b/main/openssl/crypto/asn1/n_pkey.c
@@ -129,6 +129,7 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
unsigned char buf[256],*zz;
unsigned char key[EVP_MAX_KEY_LENGTH];
EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
if (a == NULL) return(0);
@@ -206,24 +207,28 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
i = strlen((char *)buf);
/* If the key is used for SGC the algorithm is modified a little. */
if(sgckey) {
- EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+ if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+ goto err;
memcpy(buf + 16, "SGCKEYSALT", 10);
i = 26;
}
- EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+ if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+ goto err;
OPENSSL_cleanse(buf,256);
/* Encrypt private key in place */
zz = enckey->enckey->digest->data;
- EVP_CIPHER_CTX_init(&ctx);
- EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL);
- EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen);
- EVP_EncryptFinal_ex(&ctx,zz + i,&j);
- EVP_CIPHER_CTX_cleanup(&ctx);
+ if (!EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL))
+ goto err;
+ if (!EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen))
+ goto err;
+ if (!EVP_EncryptFinal_ex(&ctx,zz + i,&j))
+ goto err;
ret = i2d_NETSCAPE_ENCRYPTED_PKEY(enckey, pp);
err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
NETSCAPE_ENCRYPTED_PKEY_free(enckey);
NETSCAPE_PKEY_free(pkey);
return(ret);
@@ -288,6 +293,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
const unsigned char *zz;
unsigned char key[EVP_MAX_KEY_LENGTH];
EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
i=cb((char *)buf,256,"Enter Private Key password:",0);
if (i != 0)
@@ -298,19 +304,22 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
i = strlen((char *)buf);
if(sgckey){
- EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+ if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+ goto err;
memcpy(buf + 16, "SGCKEYSALT", 10);
i = 26;
}
- EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+ if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+ goto err;
OPENSSL_cleanse(buf,256);
- EVP_CIPHER_CTX_init(&ctx);
- EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL);
- EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length);
- EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j);
- EVP_CIPHER_CTX_cleanup(&ctx);
+ if (!EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL))
+ goto err;
+ if (!EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length))
+ goto err;
+ if (!EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j))
+ goto err;
os->length=i+j;
zz=os->data;
@@ -328,6 +337,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
goto err;
}
err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
NETSCAPE_PKEY_free(pkey);
return(ret);
}
diff --git a/main/openssl/crypto/asn1/p5_pbev2.c b/main/openssl/crypto/asn1/p5_pbev2.c
index cb49b665..4ea68303 100644
--- a/main/openssl/crypto/asn1/p5_pbev2.c
+++ b/main/openssl/crypto/asn1/p5_pbev2.c
@@ -91,12 +91,10 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
unsigned char *aiv, int prf_nid)
{
X509_ALGOR *scheme = NULL, *kalg = NULL, *ret = NULL;
- int alg_nid;
+ int alg_nid, keylen;
EVP_CIPHER_CTX ctx;
unsigned char iv[EVP_MAX_IV_LENGTH];
- PBKDF2PARAM *kdf = NULL;
PBE2PARAM *pbe2 = NULL;
- ASN1_OCTET_STRING *osalt = NULL;
ASN1_OBJECT *obj;
alg_nid = EVP_CIPHER_type(cipher);
@@ -127,7 +125,8 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
EVP_CIPHER_CTX_init(&ctx);
/* Dummy cipherinit to just setup the IV, and PRF */
- EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0);
+ if (!EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0))
+ goto err;
if(EVP_CIPHER_param_to_asn1(&ctx, scheme->parameter) < 0) {
ASN1err(ASN1_F_PKCS5_PBE2_SET_IV,
ASN1_R_ERROR_SETTING_CIPHER_PARAMS);
@@ -145,55 +144,21 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
}
EVP_CIPHER_CTX_cleanup(&ctx);
- if(!(kdf = PBKDF2PARAM_new())) goto merr;
- if(!(osalt = M_ASN1_OCTET_STRING_new())) goto merr;
-
- if (!saltlen) saltlen = PKCS5_SALT_LEN;
- if (!(osalt->data = OPENSSL_malloc (saltlen))) goto merr;
- osalt->length = saltlen;
- if (salt) memcpy (osalt->data, salt, saltlen);
- else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0) goto merr;
-
- if(iter <= 0) iter = PKCS5_DEFAULT_ITER;
- if(!ASN1_INTEGER_set(kdf->iter, iter)) goto merr;
-
- /* Now include salt in kdf structure */
- kdf->salt->value.octet_string = osalt;
- kdf->salt->type = V_ASN1_OCTET_STRING;
- osalt = NULL;
-
/* If its RC2 then we'd better setup the key length */
- if(alg_nid == NID_rc2_cbc) {
- if(!(kdf->keylength = M_ASN1_INTEGER_new())) goto merr;
- if(!ASN1_INTEGER_set (kdf->keylength,
- EVP_CIPHER_key_length(cipher))) goto merr;
- }
-
- /* prf can stay NULL if we are using hmacWithSHA1 */
- if (prf_nid != NID_hmacWithSHA1)
- {
- kdf->prf = X509_ALGOR_new();
- if (!kdf->prf)
- goto merr;
- X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
- V_ASN1_NULL, NULL);
- }
-
- /* Now setup the PBE2PARAM keyfunc structure */
+ if(alg_nid == NID_rc2_cbc)
+ keylen = EVP_CIPHER_key_length(cipher);
+ else
+ keylen = -1;
- pbe2->keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+ /* Setup keyfunc */
- /* Encode PBKDF2PARAM into parameter of pbe2 */
+ X509_ALGOR_free(pbe2->keyfunc);
- if(!(pbe2->keyfunc->parameter = ASN1_TYPE_new())) goto merr;
+ pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen);
- if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
- &pbe2->keyfunc->parameter->value.sequence)) goto merr;
- pbe2->keyfunc->parameter->type = V_ASN1_SEQUENCE;
-
- PBKDF2PARAM_free(kdf);
- kdf = NULL;
+ if (!pbe2->keyfunc)
+ goto merr;
/* Now set up top level AlgorithmIdentifier */
@@ -219,8 +184,6 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
err:
PBE2PARAM_free(pbe2);
/* Note 'scheme' is freed as part of pbe2 */
- M_ASN1_OCTET_STRING_free(osalt);
- PBKDF2PARAM_free(kdf);
X509_ALGOR_free(kalg);
X509_ALGOR_free(ret);
@@ -233,3 +196,85 @@ X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter,
{
return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1);
}
+
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+ int prf_nid, int keylen)
+ {
+ X509_ALGOR *keyfunc = NULL;
+ PBKDF2PARAM *kdf = NULL;
+ ASN1_OCTET_STRING *osalt = NULL;
+
+ if(!(kdf = PBKDF2PARAM_new()))
+ goto merr;
+ if(!(osalt = M_ASN1_OCTET_STRING_new()))
+ goto merr;
+
+ kdf->salt->value.octet_string = osalt;
+ kdf->salt->type = V_ASN1_OCTET_STRING;
+
+ if (!saltlen)
+ saltlen = PKCS5_SALT_LEN;
+ if (!(osalt->data = OPENSSL_malloc (saltlen)))
+ goto merr;
+
+ osalt->length = saltlen;
+
+ if (salt)
+ memcpy (osalt->data, salt, saltlen);
+ else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0)
+ goto merr;
+
+ if(iter <= 0)
+ iter = PKCS5_DEFAULT_ITER;
+
+ if(!ASN1_INTEGER_set(kdf->iter, iter))
+ goto merr;
+
+ /* If we have a key length, set it up */
+
+ if(keylen > 0)
+ {
+ if(!(kdf->keylength = M_ASN1_INTEGER_new()))
+ goto merr;
+ if(!ASN1_INTEGER_set (kdf->keylength, keylen))
+ goto merr;
+ }
+
+ /* prf can stay NULL if we are using hmacWithSHA1 */
+ if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1)
+ {
+ kdf->prf = X509_ALGOR_new();
+ if (!kdf->prf)
+ goto merr;
+ X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
+ V_ASN1_NULL, NULL);
+ }
+
+ /* Finally setup the keyfunc structure */
+
+ keyfunc = X509_ALGOR_new();
+ if (!keyfunc)
+ goto merr;
+
+ keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+
+ /* Encode PBKDF2PARAM into parameter of pbe2 */
+
+ if(!(keyfunc->parameter = ASN1_TYPE_new()))
+ goto merr;
+
+ if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
+ &keyfunc->parameter->value.sequence))
+ goto merr;
+ keyfunc->parameter->type = V_ASN1_SEQUENCE;
+
+ PBKDF2PARAM_free(kdf);
+ return keyfunc;
+
+ merr:
+ ASN1err(ASN1_F_PKCS5_PBKDF2_SET,ERR_R_MALLOC_FAILURE);
+ PBKDF2PARAM_free(kdf);
+ X509_ALGOR_free(keyfunc);
+ return NULL;
+ }
+
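(A minimal usage sketch for the new helper, with illustrative values: 2048 iterations, a random 8-byte salt (salt == NULL, saltlen == 0 falls back to PKCS5_SALT_LEN), the default hmacWithSHA1 PRF (prf_nid <= 0) and no explicit key length (keylen < 0):

    X509_ALGOR *kalg = PKCS5_pbkdf2_set(2048, NULL, 0, -1, -1);
    if (kalg == NULL)
        { /* malloc failure already put ASN1err on the error stack */ }
    /* ... embed kalg in a PBES2 structure, or: */
    X509_ALGOR_free(kalg);
)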
diff --git a/main/openssl/crypto/asn1/t_crl.c b/main/openssl/crypto/asn1/t_crl.c
index ee5a687c..c6116920 100644
--- a/main/openssl/crypto/asn1/t_crl.c
+++ b/main/openssl/crypto/asn1/t_crl.c
@@ -94,8 +94,7 @@ int X509_CRL_print(BIO *out, X509_CRL *x)
l = X509_CRL_get_version(x);
BIO_printf(out, "%8sVersion %lu (0x%lx)\n", "", l+1, l);
i = OBJ_obj2nid(x->sig_alg->algorithm);
- BIO_printf(out, "%8sSignature Algorithm: %s\n", "",
- (i == NID_undef) ? "NONE" : OBJ_nid2ln(i));
+ X509_signature_print(out, x->sig_alg, NULL);
p=X509_NAME_oneline(X509_CRL_get_issuer(x),NULL,0);
BIO_printf(out,"%8sIssuer: %s\n","",p);
OPENSSL_free(p);
diff --git a/main/openssl/crypto/asn1/t_x509.c b/main/openssl/crypto/asn1/t_x509.c
index e061f2ff..edbb39a0 100644
--- a/main/openssl/crypto/asn1/t_x509.c
+++ b/main/openssl/crypto/asn1/t_x509.c
@@ -72,6 +72,7 @@
#include <openssl/objects.h>
#include <openssl/x509.h>
#include <openssl/x509v3.h>
+#include "asn1_locl.h"
#ifndef OPENSSL_NO_FP_API
int X509_print_fp(FILE *fp, X509 *x)
@@ -137,10 +138,10 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
if (BIO_write(bp," Serial Number:",22) <= 0) goto err;
bs=X509_get_serialNumber(x);
- if (bs->length <= 4)
+ if (bs->length <= (int)sizeof(long))
{
l=ASN1_INTEGER_get(bs);
- if (l < 0)
+ if (bs->type == V_ASN1_NEG_INTEGER)
{
l= -l;
neg="-";
@@ -167,12 +168,16 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
if(!(cflag & X509_FLAG_NO_SIGNAME))
{
+ if(X509_signature_print(bp, x->sig_alg, NULL) <= 0)
+ goto err;
+#if 0
if (BIO_printf(bp,"%8sSignature Algorithm: ","") <= 0)
goto err;
if (i2a_ASN1_OBJECT(bp, ci->signature->algorithm) <= 0)
goto err;
if (BIO_puts(bp, "\n") <= 0)
goto err;
+#endif
}
if(!(cflag & X509_FLAG_NO_ISSUER))
@@ -255,7 +260,8 @@ int X509_ocspid_print (BIO *bp, X509 *x)
goto err;
i2d_X509_NAME(x->cert_info->subject, &dertmp);
- EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL))
+ goto err;
for (i=0; i < SHA_DIGEST_LENGTH; i++)
{
if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0) goto err;
@@ -268,8 +274,10 @@ int X509_ocspid_print (BIO *bp, X509 *x)
if (BIO_printf(bp,"\n Public key OCSP hash: ") <= 0)
goto err;
- EVP_Digest(x->cert_info->key->public_key->data,
- x->cert_info->key->public_key->length, SHA1md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(x->cert_info->key->public_key->data,
+ x->cert_info->key->public_key->length,
+ SHA1md, NULL, EVP_sha1(), NULL))
+ goto err;
for (i=0; i < SHA_DIGEST_LENGTH; i++)
{
if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0)
@@ -283,23 +291,50 @@ err:
return(0);
}
-int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+int X509_signature_dump(BIO *bp, const ASN1_STRING *sig, int indent)
{
- unsigned char *s;
+ const unsigned char *s;
int i, n;
- if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0;
- if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
n=sig->length;
s=sig->data;
for (i=0; i<n; i++)
{
if ((i%18) == 0)
- if (BIO_write(bp,"\n ",9) <= 0) return 0;
+ {
+ if (BIO_write(bp,"\n",1) <= 0) return 0;
+ if (BIO_indent(bp, indent, indent) <= 0) return 0;
+ }
if (BIO_printf(bp,"%02x%s",s[i],
((i+1) == n)?"":":") <= 0) return 0;
}
if (BIO_write(bp,"\n",1) != 1) return 0;
+
+ return 1;
+}
+
+int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+{
+ int sig_nid;
+ if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0;
+ if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
+
+ sig_nid = OBJ_obj2nid(sigalg->algorithm);
+ if (sig_nid != NID_undef)
+ {
+ int pkey_nid, dig_nid;
+ const EVP_PKEY_ASN1_METHOD *ameth;
+ if (OBJ_find_sigid_algs(sig_nid, &dig_nid, &pkey_nid))
+ {
+ ameth = EVP_PKEY_asn1_find(NULL, pkey_nid);
+ if (ameth && ameth->sig_print)
+ return ameth->sig_print(bp, sigalg, sig, 9, 0);
+ }
+ }
+ if (sig)
+ return X509_signature_dump(bp, sig, 9);
+ else if (BIO_puts(bp, "\n") <= 0)
+ return 0;
return 1;
}
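(With this refactor, CRL and certificate printing share one entry point; a sketch of calling it directly, where out is an assumed BIO* and x an assumed X509*:

    /* Prints "Signature Algorithm: <name>" and, if the key's ASN.1
     * method supplies sig_print, an algorithm-specific rendering;
     * otherwise a plain hex dump of the signature bits. */
    if (X509_signature_print(out, x->sig_alg, x->signature) <= 0)
        { /* handle write error */ }
)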
diff --git a/main/openssl/crypto/asn1/tasn_prn.c b/main/openssl/crypto/asn1/tasn_prn.c
index 45369801..542a091a 100644
--- a/main/openssl/crypto/asn1/tasn_prn.c
+++ b/main/openssl/crypto/asn1/tasn_prn.c
@@ -446,11 +446,11 @@ static int asn1_print_fsname(BIO *out, int indent,
return 1;
}
-static int asn1_print_boolean_ctx(BIO *out, const int bool,
+static int asn1_print_boolean_ctx(BIO *out, int boolval,
const ASN1_PCTX *pctx)
{
const char *str;
- switch (bool)
+ switch (boolval)
{
case -1:
str = "BOOL ABSENT";
@@ -574,10 +574,10 @@ static int asn1_primitive_print(BIO *out, ASN1_VALUE **fld,
{
case V_ASN1_BOOLEAN:
{
- int bool = *(int *)fld;
- if (bool == -1)
- bool = it->size;
- ret = asn1_print_boolean_ctx(out, bool, pctx);
+ int boolval = *(int *)fld;
+ if (boolval == -1)
+ boolval = it->size;
+ ret = asn1_print_boolean_ctx(out, boolval, pctx);
}
break;
diff --git a/main/openssl/crypto/asn1/x_algor.c b/main/openssl/crypto/asn1/x_algor.c
index 99e53429..274e456c 100644
--- a/main/openssl/crypto/asn1/x_algor.c
+++ b/main/openssl/crypto/asn1/x_algor.c
@@ -128,3 +128,17 @@ void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
}
}
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md)
+ {
+ int param_type;
+
+ if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+ param_type = V_ASN1_UNDEF;
+ else
+ param_type = V_ASN1_NULL;
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+ }
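(A sketch of the new helper, where alg is an assumed, freshly allocated X509_ALGOR*:

    /* SHA-256 keeps an explicit NULL parameter; digests flagged
     * EVP_MD_FLAG_DIGALGID_ABSENT have the parameter omitted. */
    X509_ALGOR_set_md(alg, EVP_sha256());
)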
diff --git a/main/openssl/crypto/asn1/x_name.c b/main/openssl/crypto/asn1/x_name.c
index 49be08b4..d7c23186 100644
--- a/main/openssl/crypto/asn1/x_name.c
+++ b/main/openssl/crypto/asn1/x_name.c
@@ -399,8 +399,7 @@ static int asn1_string_canon(ASN1_STRING *out, ASN1_STRING *in)
/* If type not in bitmask just copy string across */
if (!(ASN1_tag2bit(in->type) & ASN1_MASK_CANON))
{
- out->type = in->type;
- if (!ASN1_STRING_set(out, in->data, in->length))
+ if (!ASN1_STRING_copy(out, in))
return 0;
return 1;
}
diff --git a/main/openssl/crypto/asn1/x_pubkey.c b/main/openssl/crypto/asn1/x_pubkey.c
index d42b6a2c..b649e1fc 100644
--- a/main/openssl/crypto/asn1/x_pubkey.c
+++ b/main/openssl/crypto/asn1/x_pubkey.c
@@ -171,7 +171,19 @@ EVP_PKEY *X509_PUBKEY_get(X509_PUBKEY *key)
goto error;
}
- key->pkey = ret;
+ /* Check to see if another thread set key->pkey first */
+ CRYPTO_w_lock(CRYPTO_LOCK_EVP_PKEY);
+ if (key->pkey)
+ {
+ CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
+ EVP_PKEY_free(ret);
+ ret = key->pkey;
+ }
+ else
+ {
+ key->pkey = ret;
+ CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
+ }
CRYPTO_add(&ret->references, 1, CRYPTO_LOCK_EVP_PKEY);
return ret;
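(The hunk above is the classic check-lock-recheck publication pattern under OpenSSL's static lock table. The same shape in isolation, as a sketch that reuses CRYPTO_LOCK_EVP_PKEY purely for illustration; a real caller would pick the lock matching its data:

    #include <openssl/crypto.h>
    #include <openssl/buffer.h>

    static char *cached;

    char *get_shared(void)
        {
        char *tmp = BUF_strdup("shared");   /* build outside the lock */
        CRYPTO_w_lock(CRYPTO_LOCK_EVP_PKEY);
        if (cached != NULL)
            {                               /* another thread won the race */
            CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
            OPENSSL_free(tmp);              /* discard our copy */
            }
        else
            {
            cached = tmp;                   /* publish under the lock */
            CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
            }
        return cached;
        }
)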
diff --git a/main/openssl/crypto/bf/asm/bf-586.S b/main/openssl/crypto/bf/asm/bf-586.S
new file mode 100644
index 00000000..aa718d40
--- /dev/null
+++ b/main/openssl/crypto/bf/asm/bf-586.S
@@ -0,0 +1,896 @@
+.file "bf-586.s"
+.text
+.globl BF_encrypt
+.type BF_encrypt,@function
+.align 16
+BF_encrypt:
+.L_BF_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ movl 12(%esp),%ebx
+ movl 16(%esp),%ebp
+ pushl %esi
+ pushl %edi
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ xorl %eax,%eax
+ movl (%ebp),%ebx
+ xorl %ecx,%ecx
+ xorl %ebx,%edi
+
+
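+# The 16 fully unrolled Blowfish rounds below each XOR one half with a
+# P-array word (offsets 0..68 into the BF_KEY at %ebp) and compute the
+# F-function as four S-box loads at offsets 72, 1096, 2120 and 3144.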
+ movl 4(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 8(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 12(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 16(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 20(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 24(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 28(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 32(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 36(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 40(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 44(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 48(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 52(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 56(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 60(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 64(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+
+ movl 20(%esp),%eax
+ xorl %ebx,%edi
+ movl 68(%ebp),%edx
+ xorl %edx,%esi
+ movl %edi,4(%eax)
+ movl %esi,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size BF_encrypt,.-.L_BF_encrypt_begin
+.globl BF_decrypt
+.type BF_decrypt,@function
+.align 16
+BF_decrypt:
+.L_BF_decrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ movl 12(%esp),%ebx
+ movl 16(%esp),%ebp
+ pushl %esi
+ pushl %edi
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ xorl %eax,%eax
+ movl 68(%ebp),%ebx
+ xorl %ecx,%ecx
+ xorl %ebx,%edi
+
+
+ movl 64(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 60(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 56(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 52(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 48(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 44(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 40(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 36(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 32(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 28(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 24(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 20(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 16(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 12(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%edi
+
+
+ movl 8(%ebp),%edx
+ movl %edi,%ebx
+ xorl %edx,%esi
+ shrl $16,%ebx
+ movl %edi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+ xorl %eax,%eax
+ xorl %ebx,%esi
+
+
+ movl 4(%ebp),%edx
+ movl %esi,%ebx
+ xorl %edx,%edi
+ shrl $16,%ebx
+ movl %esi,%edx
+ movb %bh,%al
+ andl $255,%ebx
+ movb %dh,%cl
+ andl $255,%edx
+ movl 72(%ebp,%eax,4),%eax
+ movl 1096(%ebp,%ebx,4),%ebx
+ addl %eax,%ebx
+ movl 2120(%ebp,%ecx,4),%eax
+ xorl %eax,%ebx
+ movl 3144(%ebp,%edx,4),%edx
+ addl %edx,%ebx
+
+ movl 20(%esp),%eax
+ xorl %ebx,%edi
+ movl (%ebp),%edx
+ xorl %edx,%esi
+ movl %edi,4(%eax)
+ movl %esi,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size BF_decrypt,.-.L_BF_decrypt_begin
+.globl BF_cbc_encrypt
+.type BF_cbc_encrypt,@function
+.align 16
+BF_cbc_encrypt:
+.L_BF_cbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 36(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 56(%esp),%ecx
+
+ movl 48(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L000decrypt
+ andl $4294967288,%ebp
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ jz .L001encrypt_finish
+.L002encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_encrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L002encrypt_loop
+.L001encrypt_finish:
+ movl 52(%esp),%ebp
+ andl $7,%ebp
+ jz .L003finish
+ call .L004PIC_point
+.L004PIC_point:
+ popl %edx
+ leal .L005cbc_enc_jmp_table-.L004PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L006ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L007ej6:
+ movb 5(%esi),%dh
+.L008ej5:
+ movb 4(%esi),%dl
+.L009ej4:
+ movl (%esi),%ecx
+ jmp .L010ejend
+.L011ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L012ej2:
+ movb 1(%esi),%ch
+.L013ej1:
+ movb (%esi),%cl
+.L010ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_encrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L003finish
+.L000decrypt:
+ andl $4294967288,%ebp
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ jz .L014decrypt_finish
+.L015decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_decrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl 16(%esp),%ecx
+ movl 20(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L015decrypt_loop
+.L014decrypt_finish:
+ movl 52(%esp),%ebp
+ andl $7,%ebp
+ jz .L003finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ bswap %eax
+ bswap %ebx
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ call .L_BF_decrypt_begin
+ movl 8(%esp),%eax
+ movl 12(%esp),%ebx
+ bswap %eax
+ bswap %ebx
+ movl 16(%esp),%ecx
+ movl 20(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L016dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L017dj6:
+ movb %dh,5(%edi)
+.L018dj5:
+ movb %dl,4(%edi)
+.L019dj4:
+ movl %ecx,(%edi)
+ jmp .L020djend
+.L021dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L022dj2:
+ movb %ch,1(%esi)
+.L023dj1:
+ movb %cl,(%esi)
+.L020djend:
+ jmp .L003finish
+.L003finish:
+ movl 60(%esp),%ecx
+ addl $24,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L005cbc_enc_jmp_table:
+.long 0
+.long .L013ej1-.L004PIC_point
+.long .L012ej2-.L004PIC_point
+.long .L011ej3-.L004PIC_point
+.long .L009ej4-.L004PIC_point
+.long .L008ej5-.L004PIC_point
+.long .L007ej6-.L004PIC_point
+.long .L006ej7-.L004PIC_point
+.align 64
+.size BF_cbc_encrypt,.-.L_BF_cbc_encrypt_begin
diff --git a/main/openssl/crypto/bf/bf_skey.c b/main/openssl/crypto/bf/bf_skey.c
index 3673cdee..3b0bca41 100644
--- a/main/openssl/crypto/bf/bf_skey.c
+++ b/main/openssl/crypto/bf/bf_skey.c
@@ -58,11 +58,19 @@
#include <stdio.h>
#include <string.h>
+#include <openssl/crypto.h>
#include <openssl/blowfish.h>
#include "bf_locl.h"
#include "bf_pi.h"
void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(BLOWFISH);
+ private_BF_set_key(key, len, data);
+ }
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#endif
{
int i;
BF_LONG *p,ri,in[2];
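(This construct is easy to misread: when OPENSSL_FIPS is defined, the preprocessor expands the single definition into a thin public wrapper plus the real key schedule under the private_ name, roughly:

    void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
        {
        fips_cipher_abort(BLOWFISH);  /* reject non-approved cipher in FIPS mode */
        private_BF_set_key(key, len, data);
        }
    void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data)
        {
        /* ... original key-schedule body ... */
        }

Without OPENSSL_FIPS the #ifdef block vanishes and BF_set_key keeps its original body.)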
diff --git a/main/openssl/crypto/bf/blowfish.h b/main/openssl/crypto/bf/blowfish.h
index b97e76f9..4b6c8920 100644
--- a/main/openssl/crypto/bf/blowfish.h
+++ b/main/openssl/crypto/bf/blowfish.h
@@ -104,7 +104,9 @@ typedef struct bf_key_st
BF_LONG S[4*256];
} BF_KEY;
-
+#ifdef OPENSSL_FIPS
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data);
+#endif
void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
void BF_encrypt(BF_LONG *data,const BF_KEY *key);
diff --git a/main/openssl/crypto/bio/b_sock.c b/main/openssl/crypto/bio/b_sock.c
index d47310d6..470b1a00 100644
--- a/main/openssl/crypto/bio/b_sock.c
+++ b/main/openssl/crypto/bio/b_sock.c
@@ -629,7 +629,8 @@ int BIO_get_accept_socket(char *host, int bind_mode)
struct sockaddr_in6 sa_in6;
#endif
} server,client;
- int s=INVALID_SOCKET,cs,addrlen;
+ int s=INVALID_SOCKET,cs;
+ socklen_t addrlen;
unsigned char ip[4];
unsigned short port;
char *str=NULL,*e;
@@ -704,10 +705,10 @@ int BIO_get_accept_socket(char *host, int bind_mode)
if ((*p_getaddrinfo.f)(h,p,&hint,&res)) break;
- addrlen = res->ai_addrlen<=sizeof(server) ?
+ addrlen = res->ai_addrlen <= (socklen_t)sizeof(server) ?
res->ai_addrlen :
- sizeof(server);
- memcpy(&server, res->ai_addr, addrlen);
+ (socklen_t)sizeof(server);
+ memcpy(&server, res->ai_addr, (size_t)addrlen);
(*p_freeaddrinfo.f)(res);
goto again;
@@ -719,7 +720,7 @@ int BIO_get_accept_socket(char *host, int bind_mode)
memset((char *)&server,0,sizeof(server));
server.sa_in.sin_family=AF_INET;
server.sa_in.sin_port=htons(port);
- addrlen = sizeof(server.sa_in);
+ addrlen = (socklen_t)sizeof(server.sa_in);
if (h == NULL || strcmp(h,"*") == 0)
server.sa_in.sin_addr.s_addr=INADDR_ANY;
@@ -960,7 +961,6 @@ int BIO_set_tcp_ndelay(int s, int on)
#endif
return(ret == 0);
}
-#endif
int BIO_socket_nbio(int s, int mode)
{
@@ -973,3 +973,4 @@ int BIO_socket_nbio(int s, int mode)
#endif
return(ret == 0);
}
+#endif
diff --git a/main/openssl/crypto/bio/bf_buff.c b/main/openssl/crypto/bio/bf_buff.c
index c1fd75aa..4b5a132d 100644
--- a/main/openssl/crypto/bio/bf_buff.c
+++ b/main/openssl/crypto/bio/bf_buff.c
@@ -209,7 +209,7 @@ start:
/* add to buffer and return */
if (i >= inl)
{
- memcpy(&(ctx->obuf[ctx->obuf_len]),in,inl);
+ memcpy(&(ctx->obuf[ctx->obuf_off+ctx->obuf_len]),in,inl);
ctx->obuf_len+=inl;
return(num+inl);
}
@@ -219,7 +219,7 @@ start:
{
if (i > 0) /* lets fill it up if we can */
{
- memcpy(&(ctx->obuf[ctx->obuf_len]),in,i);
+ memcpy(&(ctx->obuf[ctx->obuf_off+ctx->obuf_len]),in,i);
in+=i;
inl-=i;
num+=i;
@@ -294,9 +294,9 @@ static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_C_GET_BUFF_NUM_LINES:
ret=0;
p1=ctx->ibuf;
- for (i=ctx->ibuf_off; i<ctx->ibuf_len; i++)
+ for (i=0; i<ctx->ibuf_len; i++)
{
- if (p1[i] == '\n') ret++;
+ if (p1[ctx->ibuf_off + i] == '\n') ret++;
}
break;
case BIO_CTRL_WPENDING:
@@ -399,17 +399,18 @@ static long buffer_ctrl(BIO *b, int cmd, long num, void *ptr)
for (;;)
{
BIO_clear_retry_flags(b);
- if (ctx->obuf_len > ctx->obuf_off)
+ if (ctx->obuf_len > 0)
{
r=BIO_write(b->next_bio,
&(ctx->obuf[ctx->obuf_off]),
- ctx->obuf_len-ctx->obuf_off);
+ ctx->obuf_len);
#if 0
-fprintf(stderr,"FLUSH [%3d] %3d -> %3d\n",ctx->obuf_off,ctx->obuf_len-ctx->obuf_off,r);
+fprintf(stderr,"FLUSH [%3d] %3d -> %3d\n",ctx->obuf_off,ctx->obuf_len,r);
#endif
BIO_copy_next_retry(b);
if (r <= 0) return((long)r);
ctx->obuf_off+=r;
+ ctx->obuf_len-=r;
}
else
{
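(With this change the output buffer follows the layout documented in the bio.h hunk below: obuf_off counts consumed bytes and obuf_len counts bytes still pending, so a partial flush of r bytes reduces to:

    ctx->obuf_off += r;  /* more of the buffer has been consumed */
    ctx->obuf_len -= r;  /* less of it remains pending */
    /* pending data always lives at obuf[obuf_off .. obuf_off+obuf_len) */
)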
diff --git a/main/openssl/crypto/bio/bio.h b/main/openssl/crypto/bio/bio.h
index 152802fb..05699ab2 100644
--- a/main/openssl/crypto/bio/bio.h
+++ b/main/openssl/crypto/bio/bio.h
@@ -68,6 +68,14 @@
#include <openssl/crypto.h>
+#ifndef OPENSSL_NO_SCTP
+# ifndef OPENSSL_SYS_VMS
+# include <stdint.h>
+# else
+# include <inttypes.h>
+# endif
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -95,6 +103,9 @@ extern "C" {
#define BIO_TYPE_BIO (19|0x0400) /* (half a) BIO pair */
#define BIO_TYPE_LINEBUFFER (20|0x0200) /* filter */
#define BIO_TYPE_DGRAM (21|0x0400|0x0100)
+#ifndef OPENSSL_NO_SCTP
+#define BIO_TYPE_DGRAM_SCTP (24|0x0400|0x0100)
+#endif
#define BIO_TYPE_ASN1 (22|0x0200) /* filter */
#define BIO_TYPE_COMP (23|0x0200) /* filter */
@@ -146,6 +157,7 @@ extern "C" {
/* #endif */
#define BIO_CTRL_DGRAM_QUERY_MTU 40 /* ask kernel for current MTU */
+#define BIO_CTRL_DGRAM_GET_FALLBACK_MTU 47
#define BIO_CTRL_DGRAM_GET_MTU 41 /* get cached value for MTU */
#define BIO_CTRL_DGRAM_SET_MTU 42 /* set cached value for
* MTU. want to use this
@@ -161,7 +173,22 @@ extern "C" {
#define BIO_CTRL_DGRAM_SET_PEER 44 /* Destination for the data */
#define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT 45 /* Next DTLS handshake timeout to
- * adjust socket timeouts */
+ * adjust socket timeouts */
+
+#ifndef OPENSSL_NO_SCTP
+/* SCTP stuff */
+#define BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE 50
+#define BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY 51
+#define BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY 52
+#define BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD 53
+#define BIO_CTRL_DGRAM_SCTP_GET_SNDINFO 60
+#define BIO_CTRL_DGRAM_SCTP_SET_SNDINFO 61
+#define BIO_CTRL_DGRAM_SCTP_GET_RCVINFO 62
+#define BIO_CTRL_DGRAM_SCTP_SET_RCVINFO 63
+#define BIO_CTRL_DGRAM_SCTP_GET_PRINFO 64
+#define BIO_CTRL_DGRAM_SCTP_SET_PRINFO 65
+#define BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN 70
+#endif
/* modifiers */
#define BIO_FP_READ 0x02
@@ -306,6 +333,15 @@ DECLARE_STACK_OF(BIO)
typedef struct bio_f_buffer_ctx_struct
{
+ /* Buffers are setup like this:
+ *
+ * <---------------------- size ----------------------->
+ * +---------------------------------------------------+
+ * | consumed | remaining | free space |
+ * +---------------------------------------------------+
+ * <-- off --><------- len ------->
+ */
+
/* BIO *bio; */ /* this is now in the BIO struct */
int ibuf_size; /* how big is the input buffer */
int obuf_size; /* how big is the output buffer */
@@ -322,6 +358,34 @@ typedef struct bio_f_buffer_ctx_struct
/* Prefix and suffix callback in ASN1 BIO */
typedef int asn1_ps_func(BIO *b, unsigned char **pbuf, int *plen, void *parg);
+#ifndef OPENSSL_NO_SCTP
+/* SCTP parameter structs */
+struct bio_dgram_sctp_sndinfo
+ {
+ uint16_t snd_sid;
+ uint16_t snd_flags;
+ uint32_t snd_ppid;
+ uint32_t snd_context;
+ };
+
+struct bio_dgram_sctp_rcvinfo
+ {
+ uint16_t rcv_sid;
+ uint16_t rcv_ssn;
+ uint16_t rcv_flags;
+ uint32_t rcv_ppid;
+ uint32_t rcv_tsn;
+ uint32_t rcv_cumtsn;
+ uint32_t rcv_context;
+ };
+
+struct bio_dgram_sctp_prinfo
+ {
+ uint16_t pr_policy;
+ uint32_t pr_value;
+ };
+#endif
+
/* connect BIO stuff */
#define BIO_CONN_S_BEFORE 1
#define BIO_CONN_S_GET_IP 2
@@ -619,6 +683,9 @@ BIO_METHOD *BIO_f_linebuffer(void);
BIO_METHOD *BIO_f_nbio_test(void);
#ifndef OPENSSL_NO_DGRAM
BIO_METHOD *BIO_s_datagram(void);
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void);
+#endif
#endif
/* BIO_METHOD *BIO_f_ber(void); */
@@ -661,6 +728,15 @@ int BIO_set_tcp_ndelay(int sock,int turn_on);
BIO *BIO_new_socket(int sock, int close_flag);
BIO *BIO_new_dgram(int fd, int close_flag);
+#ifndef OPENSSL_NO_SCTP
+BIO *BIO_new_dgram_sctp(int fd, int close_flag);
+int BIO_dgram_is_sctp(BIO *bio);
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications)(BIO *bio, void *context, void *buf),
+ void *context);
+int BIO_dgram_sctp_wait_for_dry(BIO *b);
+int BIO_dgram_sctp_msg_waiting(BIO *b);
+#endif
BIO *BIO_new_fd(int fd, int close_flag);
BIO *BIO_new_connect(char *host_port);
BIO *BIO_new_accept(char *host_port);
@@ -725,6 +801,7 @@ void ERR_load_BIO_strings(void);
#define BIO_F_BUFFER_CTRL 114
#define BIO_F_CONN_CTRL 127
#define BIO_F_CONN_STATE 115
+#define BIO_F_DGRAM_SCTP_READ 132
#define BIO_F_FILE_CTRL 116
#define BIO_F_FILE_READ 130
#define BIO_F_LINEBUFFER_CTRL 129
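(A minimal sketch of bringing up the new SCTP BIO, where fd is an assumed connected one-to-one SCTP socket and notify_cb/app_ctx are hypothetical:

    BIO *bio = BIO_new_dgram_sctp(fd, BIO_NOCLOSE);
    if (bio == NULL)
        { /* handle error */ }
    if (BIO_dgram_is_sctp(bio))
        /* deliver SCTP notifications to the application */
        BIO_dgram_sctp_notification_cb(bio, notify_cb, app_ctx);
)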
diff --git a/main/openssl/crypto/bio/bio_err.c b/main/openssl/crypto/bio/bio_err.c
index a224edd5..0dbfbd80 100644
--- a/main/openssl/crypto/bio/bio_err.c
+++ b/main/openssl/crypto/bio/bio_err.c
@@ -1,6 +1,6 @@
/* crypto/bio/bio_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -95,6 +95,7 @@ static ERR_STRING_DATA BIO_str_functs[]=
{ERR_FUNC(BIO_F_BUFFER_CTRL), "BUFFER_CTRL"},
{ERR_FUNC(BIO_F_CONN_CTRL), "CONN_CTRL"},
{ERR_FUNC(BIO_F_CONN_STATE), "CONN_STATE"},
+{ERR_FUNC(BIO_F_DGRAM_SCTP_READ), "DGRAM_SCTP_READ"},
{ERR_FUNC(BIO_F_FILE_CTRL), "FILE_CTRL"},
{ERR_FUNC(BIO_F_FILE_READ), "FILE_READ"},
{ERR_FUNC(BIO_F_LINEBUFFER_CTRL), "LINEBUFFER_CTRL"},
diff --git a/main/openssl/crypto/bio/bio_lib.c b/main/openssl/crypto/bio/bio_lib.c
index e12bc3a2..9c9646af 100644
--- a/main/openssl/crypto/bio/bio_lib.c
+++ b/main/openssl/crypto/bio/bio_lib.c
@@ -521,40 +521,40 @@ void BIO_free_all(BIO *bio)
BIO *BIO_dup_chain(BIO *in)
{
- BIO *ret=NULL,*eoc=NULL,*bio,*new;
+ BIO *ret=NULL,*eoc=NULL,*bio,*new_bio;
for (bio=in; bio != NULL; bio=bio->next_bio)
{
- if ((new=BIO_new(bio->method)) == NULL) goto err;
- new->callback=bio->callback;
- new->cb_arg=bio->cb_arg;
- new->init=bio->init;
- new->shutdown=bio->shutdown;
- new->flags=bio->flags;
+ if ((new_bio=BIO_new(bio->method)) == NULL) goto err;
+ new_bio->callback=bio->callback;
+ new_bio->cb_arg=bio->cb_arg;
+ new_bio->init=bio->init;
+ new_bio->shutdown=bio->shutdown;
+ new_bio->flags=bio->flags;
/* This will let SSL_s_sock() work with stdin/stdout */
- new->num=bio->num;
+ new_bio->num=bio->num;
- if (!BIO_dup_state(bio,(char *)new))
+ if (!BIO_dup_state(bio,(char *)new_bio))
{
- BIO_free(new);
+ BIO_free(new_bio);
goto err;
}
/* copy app data */
- if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new->ex_data,
+ if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new_bio->ex_data,
&bio->ex_data))
goto err;
if (ret == NULL)
{
- eoc=new;
+ eoc=new_bio;
ret=eoc;
}
else
{
- BIO_push(eoc,new);
- eoc=new;
+ BIO_push(eoc,new_bio);
+ eoc=new_bio;
}
}
return(ret);
diff --git a/main/openssl/crypto/bio/bss_bio.c b/main/openssl/crypto/bio/bss_bio.c
index 76bd48e7..52ef0ebc 100644
--- a/main/openssl/crypto/bio/bss_bio.c
+++ b/main/openssl/crypto/bio/bss_bio.c
@@ -277,10 +277,10 @@ static int bio_read(BIO *bio, char *buf, int size_)
*/
/* WARNING: The non-copying interface is largely untested as of yet
* and may contain bugs. */
-static ssize_t bio_nread0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nread0(BIO *bio, char **buf)
{
struct bio_bio_st *b, *peer_b;
- ssize_t num;
+ ossl_ssize_t num;
BIO_clear_retry_flags(bio);
@@ -315,15 +315,15 @@ static ssize_t bio_nread0(BIO *bio, char **buf)
return num;
}
-static ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
{
struct bio_bio_st *b, *peer_b;
- ssize_t num, available;
+ ossl_ssize_t num, available;
if (num_ > SSIZE_MAX)
num = SSIZE_MAX;
else
- num = (ssize_t)num_;
+ num = (ossl_ssize_t)num_;
available = bio_nread0(bio, buf);
if (num > available)
@@ -428,7 +428,7 @@ static int bio_write(BIO *bio, const char *buf, int num_)
* (example usage: bio_nwrite0(), write to buffer, bio_nwrite()
* or just bio_nwrite(), write to buffer)
*/
-static ssize_t bio_nwrite0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nwrite0(BIO *bio, char **buf)
{
struct bio_bio_st *b;
size_t num;
@@ -476,15 +476,15 @@ static ssize_t bio_nwrite0(BIO *bio, char **buf)
return num;
}
-static ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
{
struct bio_bio_st *b;
- ssize_t num, space;
+ ossl_ssize_t num, space;
if (num_ > SSIZE_MAX)
num = SSIZE_MAX;
else
- num = (ssize_t)num_;
+ num = (ossl_ssize_t)num_;
space = bio_nwrite0(bio, buf);
if (num > space)
diff --git a/main/openssl/crypto/bio/bss_dgram.c b/main/openssl/crypto/bio/bss_dgram.c
index 71ebe987..54c012c4 100644
--- a/main/openssl/crypto/bio/bss_dgram.c
+++ b/main/openssl/crypto/bio/bss_dgram.c
@@ -70,10 +70,27 @@
#include <sys/timeb.h>
#endif
-#ifdef OPENSSL_SYS_LINUX
+#ifndef OPENSSL_NO_SCTP
+#include <netinet/sctp.h>
+#include <fcntl.h>
+#define OPENSSL_SCTP_DATA_CHUNK_TYPE 0x00
+#define OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE 0xc0
+#endif
+
+#if defined(OPENSSL_SYS_LINUX) && !defined(IP_MTU)
#define IP_MTU 14 /* linux is lame */
#endif
+#if defined(__FreeBSD__) && defined(IN6_IS_ADDR_V4MAPPED)
+/* Standard definition causes type-punning problems. */
+#undef IN6_IS_ADDR_V4MAPPED
+#define s6_addr32 __u6_addr.__u6_addr32
+#define IN6_IS_ADDR_V4MAPPED(a) \
+ (((a)->s6_addr32[0] == 0) && \
+ ((a)->s6_addr32[1] == 0) && \
+ ((a)->s6_addr32[2] == htonl(0x0000ffff)))
+#endif
+
#ifdef WATT32
#define sock_write SockWrite /* Watt-32 uses same names */
#define sock_read SockRead
@@ -88,6 +105,18 @@ static int dgram_new(BIO *h);
static int dgram_free(BIO *data);
static int dgram_clear(BIO *bio);
+#ifndef OPENSSL_NO_SCTP
+static int dgram_sctp_write(BIO *h, const char *buf, int num);
+static int dgram_sctp_read(BIO *h, char *buf, int size);
+static int dgram_sctp_puts(BIO *h, const char *str);
+static long dgram_sctp_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int dgram_sctp_new(BIO *h);
+static int dgram_sctp_free(BIO *data);
+#ifdef SCTP_AUTHENTICATION_EVENT
+static void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp);
+#endif
+#endif
+
static int BIO_dgram_should_retry(int s);
static void get_current_time(struct timeval *t);
@@ -106,6 +135,22 @@ static BIO_METHOD methods_dgramp=
NULL,
};
+#ifndef OPENSSL_NO_SCTP
+static BIO_METHOD methods_dgramp_sctp=
+ {
+ BIO_TYPE_DGRAM_SCTP,
+ "datagram sctp socket",
+ dgram_sctp_write,
+ dgram_sctp_read,
+ dgram_sctp_puts,
+ NULL, /* dgram_gets, */
+ dgram_sctp_ctrl,
+ dgram_sctp_new,
+ dgram_sctp_free,
+ NULL,
+ };
+#endif
+
typedef struct bio_dgram_data_st
{
union {
@@ -122,6 +167,40 @@ typedef struct bio_dgram_data_st
struct timeval socket_timeout;
} bio_dgram_data;
+#ifndef OPENSSL_NO_SCTP
+typedef struct bio_dgram_sctp_save_message_st
+ {
+ BIO *bio;
+ char *data;
+ int length;
+ } bio_dgram_sctp_save_message;
+
+typedef struct bio_dgram_sctp_data_st
+ {
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sa_in;
+#if OPENSSL_USE_IPV6
+ struct sockaddr_in6 sa_in6;
+#endif
+ } peer;
+ unsigned int connected;
+ unsigned int _errno;
+ unsigned int mtu;
+ struct bio_dgram_sctp_sndinfo sndinfo;
+ struct bio_dgram_sctp_rcvinfo rcvinfo;
+ struct bio_dgram_sctp_prinfo prinfo;
+ void (*handle_notifications)(BIO *bio, void *context, void *buf);
+ void* notification_context;
+ int in_handshake;
+ int ccs_rcvd;
+ int ccs_sent;
+ int save_shutdown;
+ int peer_auth_tested;
+ bio_dgram_sctp_save_message saved_message;
+ } bio_dgram_sctp_data;
+#endif
+
BIO_METHOD *BIO_s_datagram(void)
{
return(&methods_dgramp);
@@ -186,7 +265,7 @@ static void dgram_adjust_rcv_timeout(BIO *b)
{
#if defined(SO_RCVTIMEO)
bio_dgram_data *data = (bio_dgram_data *)b->ptr;
- int sz = sizeof(int);
+ union { size_t s; int i; } sz = {0};
/* Is a timer active? */
if (data->next_timeout.tv_sec > 0 || data->next_timeout.tv_usec > 0)
@@ -196,8 +275,10 @@ static void dgram_adjust_rcv_timeout(BIO *b)
/* Read current socket timeout */
#ifdef OPENSSL_SYS_WINDOWS
int timeout;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); }
else
{
@@ -205,9 +286,12 @@ static void dgram_adjust_rcv_timeout(BIO *b)
data->socket_timeout.tv_usec = (timeout % 1000) * 1000;
}
#else
+ sz.i = sizeof(data->socket_timeout);
if ( getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
&(data->socket_timeout), (void *)&sz) < 0)
{ perror("getsockopt"); }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ OPENSSL_assert(sz.s<=sizeof(data->socket_timeout));
#endif
/* Get current time */
@@ -376,11 +460,10 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
int *ip;
struct sockaddr *to = NULL;
bio_dgram_data *data = NULL;
-#if defined(IP_MTU_DISCOVER) || defined(IP_MTU)
- long sockopt_val = 0;
- unsigned int sockopt_len = 0;
-#endif
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && (defined(IP_MTU_DISCOVER) || defined(IP_MTU))
+ int sockopt_val = 0;
+ socklen_t sockopt_len; /* assume that a system supporting IP_MTU is
+ * modern enough to define socklen_t */
socklen_t addr_len;
union {
struct sockaddr sa;
@@ -462,7 +545,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
/* (Linux)kernel sets DF bit on outgoing IP packets */
case BIO_CTRL_DGRAM_MTU_DISCOVER:
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DO)
addr_len = (socklen_t)sizeof(addr);
memset((void *)&addr, 0, sizeof(addr));
if (getsockname(b->num, &addr.sa, &addr_len) < 0)
@@ -470,7 +553,6 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
break;
}
- sockopt_len = sizeof(sockopt_val);
switch (addr.sa.sa_family)
{
case AF_INET:
@@ -479,7 +561,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
&sockopt_val, sizeof(sockopt_val))) < 0)
perror("setsockopt");
break;
-#if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER)
+#if OPENSSL_USE_IPV6 && defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DO)
case AF_INET6:
sockopt_val = IPV6_PMTUDISC_DO;
if ((ret = setsockopt(b->num, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
@@ -496,7 +578,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
#endif
case BIO_CTRL_DGRAM_QUERY_MTU:
-#ifdef OPENSSL_SYS_LINUX
+#if defined(OPENSSL_SYS_LINUX) && defined(IP_MTU)
addr_len = (socklen_t)sizeof(addr);
memset((void *)&addr, 0, sizeof(addr));
if (getsockname(b->num, &addr.sa, &addr_len) < 0)
@@ -547,6 +629,27 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
ret = 0;
#endif
break;
+ case BIO_CTRL_DGRAM_GET_FALLBACK_MTU:
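+ /* Fallback when the path MTU is unknown: the minimum datagram
+ * payload for the address family, i.e. minimum IP packet size
+ * minus IP and UDP headers (IPv4: 576-20-8; IPv6: 1280-40-8;
+ * v4-mapped IPv6 peers count as IPv4). */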
+ switch (data->peer.sa.sa_family)
+ {
+ case AF_INET:
+ ret = 576 - 20 - 8;
+ break;
+#if OPENSSL_USE_IPV6
+ case AF_INET6:
+#ifdef IN6_IS_ADDR_V4MAPPED
+ if (IN6_IS_ADDR_V4MAPPED(&data->peer.sa_in6.sin6_addr))
+ ret = 576 - 20 - 8;
+ else
+#endif
+ ret = 1280 - 40 - 8;
+ break;
+#endif
+ default:
+ ret = 576 - 20 - 8;
+ break;
+ }
+ break;
case BIO_CTRL_DGRAM_GET_MTU:
return data->mtu;
break;
@@ -637,12 +740,15 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
#endif
break;
case BIO_CTRL_DGRAM_GET_RECV_TIMEOUT:
-#ifdef OPENSSL_SYS_WINDOWS
{
- int timeout, sz = sizeof(timeout);
+ union { size_t s; int i; } sz = {0};
+#ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); ret = -1; }
else
{
@@ -650,12 +756,20 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
tv->tv_usec = (timeout % 1000) * 1000;
ret = sizeof(*tv);
}
- }
#else
+ sz.i = sizeof(struct timeval);
if ( getsockopt(b->num, SOL_SOCKET, SO_RCVTIMEO,
- ptr, (void *)&ret) < 0)
+ ptr, (void *)&sz) < 0)
{ perror("getsockopt"); ret = -1; }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ {
+ OPENSSL_assert(sz.s<=sizeof(struct timeval));
+ ret = (int)sz.s;
+ }
+ else
+ ret = sz.i;
#endif
+ }
break;
#endif
#if defined(SO_SNDTIMEO)
@@ -675,12 +789,15 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
#endif
break;
case BIO_CTRL_DGRAM_GET_SEND_TIMEOUT:
-#ifdef OPENSSL_SYS_WINDOWS
{
- int timeout, sz = sizeof(timeout);
+ union { size_t s; int i; } sz = {0};
+#ifdef OPENSSL_SYS_WINDOWS
+ int timeout;
struct timeval *tv = (struct timeval *)ptr;
+
+ sz.i = sizeof(timeout);
if (getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
- (void*)&timeout, &sz) < 0)
+ (void*)&timeout, &sz.i) < 0)
{ perror("getsockopt"); ret = -1; }
else
{
@@ -688,12 +805,20 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
tv->tv_usec = (timeout % 1000) * 1000;
ret = sizeof(*tv);
}
- }
#else
+ sz.i = sizeof(struct timeval);
if ( getsockopt(b->num, SOL_SOCKET, SO_SNDTIMEO,
- ptr, (void *)&ret) < 0)
+ ptr, (void *)&sz) < 0)
{ perror("getsockopt"); ret = -1; }
+ else if (sizeof(sz.s)!=sizeof(sz.i) && sz.i==0)
+ {
+ OPENSSL_assert(sz.s<=sizeof(struct timeval));
+ ret = (int)sz.s;
+ }
+ else
+ ret = sz.i;
#endif
+ }
break;
#endif
case BIO_CTRL_DGRAM_GET_SEND_TIMER_EXP:
@@ -738,6 +863,910 @@ static int dgram_puts(BIO *bp, const char *str)
return(ret);
}
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void)
+ {
+ return(&methods_dgramp_sctp);
+ }
+
+BIO *BIO_new_dgram_sctp(int fd, int close_flag)
+ {
+ BIO *bio;
+ int ret, optval = 20000;
+ int auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunk auth;
+ struct sctp_authchunks *authchunks;
+ socklen_t sockopt_len;
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+#endif
+#endif
+
+ bio=BIO_new(BIO_s_datagram_sctp());
+ if (bio == NULL) return(NULL);
+ BIO_set_fd(bio,fd,close_flag);
+
+ /* Activate SCTP-AUTH for DATA and FORWARD-TSN chunks */
+ auth.sauth_chunk = OPENSSL_SCTP_DATA_CHUNK_TYPE;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+ OPENSSL_assert(ret >= 0);
+ auth.sauth_chunk = OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+ OPENSSL_assert(ret >= 0);
+
+ /* Test if activation was successful. When using accept(),
+ * SCTP-AUTH has to be activated for the listening socket
+ * already, otherwise the connected socket won't use it. */
+ sockopt_len = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(sockopt_len);
+ memset(authchunks, 0, sockopt_len);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_LOCAL_AUTH_CHUNKS, authchunks, &sockopt_len);
+ OPENSSL_assert(ret >= 0);
+
+ for (p = (unsigned char*) authchunks->gauth_chunks;
+ p < (unsigned char*) authchunks + sockopt_len;
+ p += sizeof(uint8_t))
+ {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ OPENSSL_assert(auth_data);
+ OPENSSL_assert(auth_forward);
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_AUTHENTICATION_EVENT;
+ event.se_on = 1;
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+ OPENSSL_assert(ret >= 0);
+#else
+ sockopt_len = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, &sockopt_len);
+ OPENSSL_assert(ret >= 0);
+
+ event.sctp_authentication_event = 1;
+
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+ OPENSSL_assert(ret >= 0);
+#endif
+#endif
+
+ /* Disable partial delivery by setting the min size
+ * larger than the max record size of 2^14 + 2048 + 13
+ */
+ ret = setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, &optval, sizeof(optval));
+ OPENSSL_assert(ret >= 0);
+
+ return(bio);
+ }
+
+int BIO_dgram_is_sctp(BIO *bio)
+ {
+ return (BIO_method_type(bio) == BIO_TYPE_DGRAM_SCTP);
+ }
+
+static int dgram_sctp_new(BIO *bi)
+ {
+ bio_dgram_sctp_data *data = NULL;
+
+ bi->init=0;
+ bi->num=0;
+ data = OPENSSL_malloc(sizeof(bio_dgram_sctp_data));
+ if (data == NULL)
+ return 0;
+ memset(data, 0x00, sizeof(bio_dgram_sctp_data));
+#ifdef SCTP_PR_SCTP_NONE
+ data->prinfo.pr_policy = SCTP_PR_SCTP_NONE;
+#endif
+ bi->ptr = data;
+
+ bi->flags=0;
+ return(1);
+ }
+
+static int dgram_sctp_free(BIO *a)
+ {
+ bio_dgram_sctp_data *data;
+
+ if (a == NULL) return(0);
+ if ( ! dgram_clear(a))
+ return 0;
+
+ data = (bio_dgram_sctp_data *)a->ptr;
+ if(data != NULL) OPENSSL_free(data);
+
+ return(1);
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp)
+ {
+ int ret;
+ struct sctp_authkey_event* authkeyevent = &snp->sn_auth_event;
+
+ if (authkeyevent->auth_indication == SCTP_AUTH_FREE_KEY)
+ {
+ struct sctp_authkeyid authkeyid;
+
+ /* delete key */
+ authkeyid.scact_keynumber = authkeyevent->auth_keynumber;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ }
+ }
+#endif
+
+static int dgram_sctp_read(BIO *b, char *out, int outl)
+ {
+ int ret = 0, n = 0, i, optval;
+ socklen_t optlen;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+ union sctp_notification *snp;
+ struct msghdr msg;
+ struct iovec iov;
+ struct cmsghdr *cmsg;
+ char cmsgbuf[512];
+
+ if (out != NULL)
+ {
+ clear_socket_error();
+
+ do
+ {
+ memset(&data->rcvinfo, 0x00, sizeof(struct bio_dgram_sctp_rcvinfo));
+ iov.iov_base = out;
+ iov.iov_len = outl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = 512;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (msg.msg_controllen > 0)
+ {
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
+ {
+ if (cmsg->cmsg_level != IPPROTO_SCTP)
+ continue;
+#ifdef SCTP_RCVINFO
+ if (cmsg->cmsg_type == SCTP_RCVINFO)
+ {
+ struct sctp_rcvinfo *rcvinfo;
+
+ rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = rcvinfo->rcv_sid;
+ data->rcvinfo.rcv_ssn = rcvinfo->rcv_ssn;
+ data->rcvinfo.rcv_flags = rcvinfo->rcv_flags;
+ data->rcvinfo.rcv_ppid = rcvinfo->rcv_ppid;
+ data->rcvinfo.rcv_tsn = rcvinfo->rcv_tsn;
+ data->rcvinfo.rcv_cumtsn = rcvinfo->rcv_cumtsn;
+ data->rcvinfo.rcv_context = rcvinfo->rcv_context;
+ }
+#endif
+#ifdef SCTP_SNDRCV
+ if (cmsg->cmsg_type == SCTP_SNDRCV)
+ {
+ struct sctp_sndrcvinfo *sndrcvinfo;
+
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ data->rcvinfo.rcv_sid = sndrcvinfo->sinfo_stream;
+ data->rcvinfo.rcv_ssn = sndrcvinfo->sinfo_ssn;
+ data->rcvinfo.rcv_flags = sndrcvinfo->sinfo_flags;
+ data->rcvinfo.rcv_ppid = sndrcvinfo->sinfo_ppid;
+ data->rcvinfo.rcv_tsn = sndrcvinfo->sinfo_tsn;
+ data->rcvinfo.rcv_cumtsn = sndrcvinfo->sinfo_cumtsn;
+ data->rcvinfo.rcv_context = sndrcvinfo->sinfo_context;
+ }
+#endif
+ }
+ }
+
+ if (n <= 0)
+ {
+ if (n < 0)
+ ret = n;
+ break;
+ }
+
+ if (msg.msg_flags & MSG_NOTIFICATION)
+ {
+ snp = (union sctp_notification*) out;
+ if (snp->sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+ {
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+#endif
+ /* If a message has been delayed until the socket
+ * is dry, it can be sent now.
+ */
+ if (data->saved_message.length > 0)
+ {
+ dgram_sctp_write(data->saved_message.bio, data->saved_message.data,
+ data->saved_message.length);
+ OPENSSL_free(data->saved_message.data);
+ data->saved_message.length = 0;
+ }
+
+ /* disable sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+ OPENSSL_assert(i >= 0);
+#else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ i = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ OPENSSL_assert(i >= 0);
+
+ event.sctp_sender_dry_event = 0;
+
+ i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+ OPENSSL_assert(i >= 0);
+#endif
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp->sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, snp);
+#endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) out);
+
+ memset(out, 0, outl);
+ }
+ else
+ ret += n;
+ }
+ while ((msg.msg_flags & MSG_NOTIFICATION) && (msg.msg_flags & MSG_EOR) && (ret < outl));
+
+ if (ret > 0 && !(msg.msg_flags & MSG_EOR))
+ {
+ /* Partial message read; this should never happen! */
+
+ /* The buffer was too small: the peer sent a message
+ * larger than allowed. */
+ if (ret == outl)
+ return -1;
+
+ /* Test if socket buffer can handle max record
+ * size (2^14 + 2048 + 13)
+ */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, SOL_SOCKET, SO_RCVBUF, &optval, &optlen);
+ OPENSSL_assert(ret >= 0);
+ OPENSSL_assert(optval >= 18445);
+
+ /* Test if SCTP doesn't partially deliver below
+ * max record size (2^14 + 2048 + 13)
+ */
+ optlen = (socklen_t) sizeof(int);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
+ &optval, &optlen);
+ OPENSSL_assert(ret >= 0);
+ OPENSSL_assert(optval >= 18445);
+
+ /* Partially delivered notification??? Probably a bug.... */
+ OPENSSL_assert(!(msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Everything seems ok till now, so it's most likely
+ * a message dropped by PR-SCTP.
+ */
+ memset(out, 0, outl);
+ BIO_set_retry_read(b);
+ return -1;
+ }
+
+ BIO_clear_retry_flags(b);
+ if (ret < 0)
+ {
+ if (BIO_dgram_should_retry(ret))
+ {
+ BIO_set_retry_read(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+
+ /* Test if peer uses SCTP-AUTH before continuing */
+ if (!data->peer_auth_tested)
+ {
+ int ii, auth_data = 0, auth_forward = 0;
+ unsigned char *p;
+ struct sctp_authchunks *authchunks;
+
+ optlen = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+ authchunks = OPENSSL_malloc(optlen);
+ memset(authchunks, 0, optlen);
+ ii = getsockopt(b->num, IPPROTO_SCTP, SCTP_PEER_AUTH_CHUNKS, authchunks, &optlen);
+ OPENSSL_assert(ii >= 0);
+
+ for (p = (unsigned char*) authchunks->gauth_chunks;
+ p < (unsigned char*) authchunks + optlen;
+ p += sizeof(uint8_t))
+ {
+ if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+ if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+ }
+
+ OPENSSL_free(authchunks);
+
+ if (!auth_data || !auth_forward)
+ {
+ BIOerr(BIO_F_DGRAM_SCTP_READ,BIO_R_CONNECT_ERROR);
+ return -1;
+ }
+
+ data->peer_auth_tested = 1;
+ }
+ }
+ return(ret);
+ }
+
+static int dgram_sctp_write(BIO *b, const char *in, int inl)
+ {
+ int ret;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+ struct bio_dgram_sctp_sndinfo *sinfo = &(data->sndinfo);
+ struct bio_dgram_sctp_prinfo *pinfo = &(data->prinfo);
+ struct bio_dgram_sctp_sndinfo handshake_sinfo;
+ struct iovec iov[1];
+ struct msghdr msg;
+ struct cmsghdr *cmsg;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo)) + CMSG_SPACE(sizeof(struct sctp_prinfo))];
+ struct sctp_sndinfo *sndinfo;
+ struct sctp_prinfo *prinfo;
+#else
+ char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct sctp_sndrcvinfo *sndrcvinfo;
+#endif
+
+ clear_socket_error();
+
+ /* If we're sending anything other than application data
+ * (TLS record type 23), disable all user parameters and flags.
+ */
+ if (in[0] != 23) {
+ memset(&handshake_sinfo, 0x00, sizeof(struct bio_dgram_sctp_sndinfo));
+#ifdef SCTP_SACK_IMMEDIATELY
+ handshake_sinfo.snd_flags = SCTP_SACK_IMMEDIATELY;
+#endif
+ sinfo = &handshake_sinfo;
+ }
+
+ /* If we have to send a shutdown alert message and the
+ * socket is not dry yet, we have to save it and send it
+ * as soon as the socket gets dry.
+ */
+ if (data->save_shutdown && !BIO_dgram_sctp_wait_for_dry(b))
+ {
+ data->saved_message.bio = b;
+ data->saved_message.length = inl;
+ data->saved_message.data = OPENSSL_malloc(inl);
+ memcpy(data->saved_message.data, in, inl);
+ return inl;
+ }
+
+ iov[0].iov_base = (char *)in;
+ iov[0].iov_len = inl;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = (caddr_t)cmsgbuf;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+ sndinfo = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+ memset(sndinfo, 0, sizeof(struct sctp_sndinfo));
+ sndinfo->snd_sid = sinfo->snd_sid;
+ sndinfo->snd_flags = sinfo->snd_flags;
+ sndinfo->snd_ppid = sinfo->snd_ppid;
+ sndinfo->snd_context = sinfo->snd_context;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndinfo));
+
+ cmsg = (struct cmsghdr *)&cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_PRINFO;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_prinfo));
+ prinfo = (struct sctp_prinfo *)CMSG_DATA(cmsg);
+ memset(prinfo, 0, sizeof(struct sctp_prinfo));
+ prinfo->pr_policy = pinfo->pr_policy;
+ prinfo->pr_value = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_prinfo));
+#else
+ cmsg = (struct cmsghdr *)cmsgbuf;
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo));
+ sndrcvinfo->sinfo_stream = sinfo->snd_sid;
+ sndrcvinfo->sinfo_flags = sinfo->snd_flags;
+#ifdef __FreeBSD__
+ sndrcvinfo->sinfo_flags |= pinfo->pr_policy;
+#endif
+ sndrcvinfo->sinfo_ppid = sinfo->snd_ppid;
+ sndrcvinfo->sinfo_context = sinfo->snd_context;
+ sndrcvinfo->sinfo_timetolive = pinfo->pr_value;
+ msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo));
+#endif
+
+ ret = sendmsg(b->num, &msg, 0);
+
+ BIO_clear_retry_flags(b);
+ if (ret <= 0)
+ {
+ if (BIO_dgram_should_retry(ret))
+ {
+ BIO_set_retry_write(b);
+ data->_errno = get_last_socket_error();
+ }
+ }
+ return(ret);
+ }
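+
+/* The ancillary-data pattern above, condensed into a stand-alone sketch.
+ * It assumes the RFC 6458 SCTP_SNDINFO API and a connected one-to-one
+ * SCTP socket; "send_on_stream" is an illustrative name and the block
+ * is kept out of the build:
+ */
+#if 0
+static int send_on_stream(int fd, const void *buf, size_t len, uint16_t sid)
+	{
+	struct iovec iov = { (void *)buf, len };
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+	struct sctp_sndinfo *si;
+	char cbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cbuf;
+	msg.msg_controllen = sizeof(cbuf);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDINFO;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+	si = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+	memset(si, 0, sizeof(*si));
+	si->snd_sid = sid;		/* deliver on the requested stream */
+
+	return sendmsg(fd, &msg, 0);
+	}
+#endif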
+
+static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr)
+ {
+ long ret=1;
+ bio_dgram_sctp_data *data = NULL;
+ socklen_t sockopt_len = 0;
+ struct sctp_authkeyid authkeyid;
+ struct sctp_authkey *authkey;
+
+ data = (bio_dgram_sctp_data *)b->ptr;
+
+ switch (cmd)
+ {
+ case BIO_CTRL_DGRAM_QUERY_MTU:
+		/* Set to the maximum (2^14) and ignore user input,
+		 * so that the transport protocol handles the
+		 * fragmentation.
+		 * Always returns 2^14.
+		 */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_MTU:
+		/* Set to the maximum (2^14) and ignore input,
+		 * so that the transport protocol handles the
+		 * fragmentation.
+		 * Always returns 2^14.
+		 */
+ data->mtu = 16384;
+ ret = data->mtu;
+ break;
+ case BIO_CTRL_DGRAM_SET_CONNECTED:
+ case BIO_CTRL_DGRAM_CONNECT:
+		/* Always returns -1. */
+ ret = -1;
+ break;
+ case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT:
+		/* SCTP doesn't need the DTLS timer.
+		 * Always returns 1.
+		 */
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
+ if (num > 0)
+ data->in_handshake = 1;
+ else
+ data->in_handshake = 0;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_NODELAY, &data->in_handshake, sizeof(int));
+ break;
+ case BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY:
+ /* New shared key for SCTP AUTH.
+ * Returns 0 on success, -1 otherwise.
+ */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Add new key */
+ sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t);
+ authkey = OPENSSL_malloc(sockopt_len);
+ memset(authkey, 0x00, sockopt_len);
+ authkey->sca_keynumber = authkeyid.scact_keynumber + 1;
+#ifndef __FreeBSD__
+ /* This field is missing in FreeBSD 8.2 and earlier,
+ * and FreeBSD 8.3 and higher work without it.
+ */
+ authkey->sca_keylength = 64;
+#endif
+ memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t));
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey, sockopt_len);
+ if (ret < 0) break;
+
+ /* Reset active key */
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+
+ break;
+ case BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+ /* Set active key */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber + 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+
+ /* CCS has been sent, so remember that and fall through
+ * to check if we need to deactivate an old key
+ */
+ data->ccs_sent = 1;
+
+ case BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD:
+ /* Returns 0 on success, -1 otherwise. */
+
+ /* Has this command really been called or is this just a fall-through? */
+ if (cmd == BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD)
+ data->ccs_rcvd = 1;
+
+		/* CCS has been both received and sent, so deactivate an old key */
+ if (data->ccs_rcvd == 1 && data->ccs_sent == 1)
+ {
+ /* Get active key */
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+ if (ret < 0) break;
+
+			/* Deactivate the key, or delete the second-to-last
+			 * key if SCTP_AUTHENTICATION_EVENT is not available.
+			 */
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+#ifdef SCTP_AUTH_DEACTIVATE_KEY
+ sockopt_len = sizeof(struct sctp_authkeyid);
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY,
+ &authkeyid, sockopt_len);
+ if (ret < 0) break;
+#endif
+#ifndef SCTP_AUTHENTICATION_EVENT
+ if (authkeyid.scact_keynumber > 0)
+ {
+ authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+ &authkeyid, sizeof(struct sctp_authkeyid));
+ if (ret < 0) break;
+ }
+#endif
+
+ data->ccs_rcvd = 0;
+ data->ccs_sent = 0;
+ }
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_SNDINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(ptr, &(data->sndinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_SNDINFO:
+		/* Returns 1. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+ num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+ memcpy(&(data->sndinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_RCVINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(ptr, &data->rcvinfo, num);
+
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_RCVINFO:
+		/* Returns 1. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+ num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+ memcpy(&(data->rcvinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_GET_PRINFO:
+ /* Returns the size of the copied struct. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(ptr, &(data->prinfo), num);
+ ret = num;
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SET_PRINFO:
+		/* Returns 1. */
+ if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+ num = sizeof(struct bio_dgram_sctp_prinfo);
+
+ memcpy(&(data->prinfo), ptr, num);
+ break;
+ case BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN:
+		/* Always returns 1. */
+ if (num > 0)
+ data->save_shutdown = 1;
+ else
+ data->save_shutdown = 0;
+ break;
+
+ default:
+		/* Pass to the default ctrl function to
+		 * process commands that are not SCTP-specific.
+		 */
+ ret=dgram_ctrl(b, cmd, num, ptr);
+ break;
+ }
+ return(ret);
+ }
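+
+/* The AUTH key life cycle driven by BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY and
+ * BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY above, condensed into one sketch.
+ * "rotate_auth_key" is an illustrative name, "fd" a connected SCTP
+ * socket, "key" 64 bytes of fresh key material; kept out of the build:
+ */
+#if 0
+static int rotate_auth_key(int fd, const unsigned char key[64])
+	{
+	struct sctp_authkeyid kid;
+	socklen_t len = sizeof(struct sctp_authkeyid);
+	struct sctp_authkey *ak;
+	socklen_t aklen = sizeof(struct sctp_authkey) + 64;
+
+	/* install key number n+1 next to the active key n */
+	if (getsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &kid, &len) < 0)
+		return -1;
+	ak = OPENSSL_malloc(aklen);
+	if (ak == NULL)
+		return -1;
+	memset(ak, 0, aklen);
+	ak->sca_keynumber = kid.scact_keynumber + 1;
+	ak->sca_keylength = 64;	/* field absent on FreeBSD <= 8.2, see above */
+	memcpy(&ak->sca_key[0], key, 64);
+	if (setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_KEY, ak, aklen) < 0)
+		{
+		OPENSSL_free(ak);
+		return -1;
+		}
+	OPENSSL_free(ak);
+
+	/* later, once the peer also holds key n+1: switch sending to it */
+	kid.scact_keynumber = kid.scact_keynumber + 1;
+	return setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+			  &kid, sizeof(struct sctp_authkeyid));
+	}
+#endif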
+
+int BIO_dgram_sctp_notification_cb(BIO *b,
+ void (*handle_notifications)(BIO *bio, void *context, void *buf),
+ void *context)
+ {
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+ if (handle_notifications != NULL)
+ {
+ data->handle_notifications = handle_notifications;
+ data->notification_context = context;
+ }
+ else
+ return -1;
+
+ return 0;
+ }
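+
+/* Registration sketch for the callback above; "app_handler", "app_state"
+ * and "handle_shutdown" are illustrative names:
+ *
+ *	static void app_handler(BIO *bio, void *context, void *buf)
+ *		{
+ *		union sctp_notification *snp = buf;
+ *		if (snp->sn_header.sn_type == SCTP_SHUTDOWN_EVENT)
+ *			handle_shutdown(context);
+ *		}
+ *
+ *	BIO_dgram_sctp_notification_cb(bio, app_handler, app_state);
+ */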
+
+int BIO_dgram_sctp_wait_for_dry(BIO *b)
+{
+ int is_dry = 0;
+ int n, sockflags, ret;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+#ifdef SCTP_EVENT
+ struct sctp_event event;
+#else
+ struct sctp_event_subscribe event;
+ socklen_t eventsize;
+#endif
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+ /* set sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 1;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+ eventsize = sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 1;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+ if (ret < 0)
+ return -1;
+
+ /* peek for notification */
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return 0;
+ }
+
+ /* if we find a notification, process it and try again if necessary */
+ while (msg.msg_flags & MSG_NOTIFICATION)
+ {
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ n = recvmsg(b->num, &msg, 0);
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+
+ if (snp.sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+ {
+ is_dry = 1;
+
+ /* disable sender dry event */
+#ifdef SCTP_EVENT
+ memset(&event, 0, sizeof(struct sctp_event));
+ event.se_assoc_id = 0;
+ event.se_type = SCTP_SENDER_DRY_EVENT;
+ event.se_on = 0;
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+ eventsize = (socklen_t) sizeof(struct sctp_event_subscribe);
+ ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+ if (ret < 0)
+ return -1;
+
+ event.sctp_sender_dry_event = 0;
+
+ ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+ if (ret < 0)
+ return -1;
+ }
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) &snp);
+
+ /* found notification, peek again */
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+		/* if we have already seen the dry event, don't block */
+ if (is_dry)
+ {
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ }
+
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+
+ if (is_dry)
+ {
+ fcntl(b->num, F_SETFL, sockflags);
+ }
+
+ if (n <= 0)
+ {
+ if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+ return -1;
+ else
+ return is_dry;
+ }
+ }
+
+ /* read anything else */
+ return is_dry;
+}
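+
+/* The event-subscription step above in isolation, for the SCTP_EVENT
+ * flavour of the API; "enable_sender_dry" is an illustrative name and
+ * the block is kept out of the build:
+ */
+#if 0
+#ifdef SCTP_EVENT
+static int enable_sender_dry(int fd, int on)
+	{
+	struct sctp_event ev;
+
+	memset(&ev, 0, sizeof(struct sctp_event));
+	ev.se_assoc_id = 0;		/* this socket's association */
+	ev.se_type = SCTP_SENDER_DRY_EVENT;
+	ev.se_on = on;
+	return setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT,
+			  &ev, sizeof(struct sctp_event));
+	}
+#endif
+#endif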
+
+int BIO_dgram_sctp_msg_waiting(BIO *b)
+ {
+ int n, sockflags;
+ union sctp_notification snp;
+ struct msghdr msg;
+ struct iovec iov;
+ bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+ /* Check if there are any messages waiting to be read */
+ do
+ {
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ sockflags = fcntl(b->num, F_GETFL, 0);
+ fcntl(b->num, F_SETFL, O_NONBLOCK);
+ n = recvmsg(b->num, &msg, MSG_PEEK);
+ fcntl(b->num, F_SETFL, sockflags);
+
+ /* if notification, process and try again */
+ if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION))
+ {
+#ifdef SCTP_AUTHENTICATION_EVENT
+ if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+ dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+ memset(&snp, 0x00, sizeof(union sctp_notification));
+ iov.iov_base = (char *)&snp;
+ iov.iov_len = sizeof(union sctp_notification);
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ n = recvmsg(b->num, &msg, 0);
+
+ if (data->handle_notifications != NULL)
+ data->handle_notifications(b, data->notification_context, (void*) &snp);
+ }
+
+ } while (n > 0 && (msg.msg_flags & MSG_NOTIFICATION));
+
+ /* Return 1 if there is a message to be read, return 0 otherwise. */
+ if (n > 0)
+ return 1;
+ else
+ return 0;
+ }
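+
+/* Polling sketch: drain pending application data without blocking;
+ * "bio" is assumed to come from BIO_new_dgram_sctp():
+ *
+ *	while (BIO_dgram_sctp_msg_waiting(bio))
+ *		{
+ *		char buf[18445];
+ *		if (BIO_read(bio, buf, sizeof(buf)) <= 0)
+ *			break;
+ *		}
+ */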
+
+static int dgram_sctp_puts(BIO *bp, const char *str)
+ {
+ int n,ret;
+
+ n=strlen(str);
+ ret=dgram_sctp_write(bp,str,n);
+ return(ret);
+ }
+#endif
+
static int BIO_dgram_should_retry(int i)
{
int err;
diff --git a/main/openssl/crypto/bn/asm/armv4-gf2m.S b/main/openssl/crypto/bn/asm/armv4-gf2m.S
new file mode 100644
index 00000000..038f0864
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/armv4-gf2m.S
@@ -0,0 +1,213 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+	vshl.u64	d2,d16,#8		@ q1-q3 are shifted copies of a
+ vmull.p8 q0,d16,d17 @ a·bb
+ vshl.u64 d4,d16,#16
+ vmull.p8 q1,d2,d17 @ a<<8·bb
+ vshl.u64 d6,d16,#24
+ vmull.p8 q2,d4,d17 @ a<<16·bb
+ vshr.u64 d2,#8
+ vmull.p8 q3,d6,d17 @ a<<24·bb
+ vshl.u64 d3,#24
+ veor d0,d2
+ vshr.u64 d4,#16
+ veor d0,d3
+ vshl.u64 d5,#16
+ veor d0,d4
+ vshr.u64 d6,#24
+ veor d0,d5
+ vshl.u64 d7,#8
+ veor d0,d6
+ veor d0,d7
+	.word	0xe12fff1e		@ bx lr, encoded for ARMv4 assemblers
+.size mul_1x1_neon,.-mul_1x1_neon
+#endif
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov r4,#0
+ bic r5,r1,#3<<30 @ a1=a&0x3fffffff
+ str r4,[sp,#0] @ tab[0]=0
+ add r6,r5,r5 @ a2=a1<<1
+ str r5,[sp,#4] @ tab[1]=a1
+ eor r7,r5,r6 @ a1^a2
+ str r6,[sp,#8] @ tab[2]=a2
+ mov r8,r5,lsl#2 @ a4=a1<<2
+ str r7,[sp,#12] @ tab[3]=a1^a2
+ eor r9,r5,r8 @ a1^a4
+ str r8,[sp,#16] @ tab[4]=a4
+ eor r4,r6,r8 @ a2^a4
+ str r9,[sp,#20] @ tab[5]=a1^a4
+ eor r7,r7,r8 @ a1^a2^a4
+ str r4,[sp,#24] @ tab[6]=a2^a4
+ and r8,r12,r0,lsl#2
+ str r7,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and r9,r12,r0,lsr#1
+ ldr r5,[sp,r8] @ tab[b & 0x7]
+ and r8,r12,r0,lsr#4
+ ldr r7,[sp,r9] @ tab[b >> 3 & 0x7]
+ and r9,r12,r0,lsr#7
+ ldr r6,[sp,r8] @ tab[b >> 6 & 0x7]
+ eor r5,r5,r7,lsl#3 @ stall
+ mov r4,r7,lsr#29
+ ldr r7,[sp,r9] @ tab[b >> 9 & 0x7]
+
+ and r8,r12,r0,lsr#10
+ eor r5,r5,r6,lsl#6
+ eor r4,r4,r6,lsr#26
+ ldr r6,[sp,r8] @ tab[b >> 12 & 0x7]
+
+ and r9,r12,r0,lsr#13
+ eor r5,r5,r7,lsl#9
+ eor r4,r4,r7,lsr#23
+ ldr r7,[sp,r9] @ tab[b >> 15 & 0x7]
+
+ and r8,r12,r0,lsr#16
+ eor r5,r5,r6,lsl#12
+ eor r4,r4,r6,lsr#20
+ ldr r6,[sp,r8] @ tab[b >> 18 & 0x7]
+
+ and r9,r12,r0,lsr#19
+ eor r5,r5,r7,lsl#15
+ eor r4,r4,r7,lsr#17
+ ldr r7,[sp,r9] @ tab[b >> 21 & 0x7]
+
+ and r8,r12,r0,lsr#22
+ eor r5,r5,r6,lsl#18
+ eor r4,r4,r6,lsr#14
+ ldr r6,[sp,r8] @ tab[b >> 24 & 0x7]
+
+ and r9,r12,r0,lsr#25
+ eor r5,r5,r7,lsl#21
+ eor r4,r4,r7,lsr#11
+ ldr r7,[sp,r9] @ tab[b >> 27 & 0x7]
+
+ tst r1,#1<<30
+ and r8,r12,r0,lsr#28
+ eor r5,r5,r6,lsl#24
+ eor r4,r4,r6,lsr#8
+ ldr r6,[sp,r8] @ tab[b >> 30 ]
+
+ eorne r5,r5,r0,lsl#30
+ eorne r4,r4,r0,lsr#2
+ tst r1,#1<<31
+ eor r5,r5,r7,lsl#27
+ eor r4,r4,r7,lsr#5
+ eorne r5,r5,r0,lsl#31
+ eorne r4,r4,r0,lsr#1
+ eor r5,r5,r6,lsl#30
+ eor r4,r4,r6,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+.Lpic: ldr r12,[pc,r12]
+ tst r12,#1
+ beq .Lialu
+
+ veor d18,d18
+ vmov.32 d19,r3,r3 @ two copies of b1
+ vmov.32 d18[0],r1 @ a1
+
+ veor d20,d20
+ vld1.32 d21[],[sp,:32] @ two copies of b0
+ vmov.32 d20[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,d18
+ vmov d17,d19
+ bl mul_1x1_neon @ a1·b1
+ vmov d22,d0
+
+ vmov d16,d20
+ vmov d17,d21
+ bl mul_1x1_neon @ a0·b0
+ vmov d23,d0
+
+ veor d16,d20,d18
+ veor d17,d21,d19
+ veor d20,d23,d22
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,d20 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor d23,d1
+ veor d22,d0
+ vst1.32 {d23[0]},[r0,:32]!
+ vst1.32 {d23[1]},[r0,:32]!
+ vst1.32 {d22[0]},[r0,:32]!
+ vst1.32 {d22[1]},[r0,:32]
+ bx r12
+.align 4
+.Lialu:
+#endif
+ stmdb sp!,{r4-r10,lr}
+ mov r10,r0 @ reassign 1st argument
+ mov r0,r3 @ r0=b1
+ ldr r3,[sp,#32] @ load b0
+ mov r12,#7<<2
+ sub sp,sp,#32 @ allocate tab[8]
+
+ bl mul_1x1_ialu @ a1·b1
+ str r5,[r10,#8]
+ str r4,[r10,#12]
+
+ eor r0,r0,r3 @ flip b0 and b1
+ eor r1,r1,r2 @ flip a0 and a1
+ eor r3,r3,r0
+ eor r2,r2,r1
+ eor r0,r0,r3
+ eor r1,r1,r2
+ bl mul_1x1_ialu @ a0·b0
+ str r5,[r10]
+ str r4,[r10,#4]
+
+ eor r1,r1,r2
+ eor r0,r0,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+ ldmia r10,{r6-r9}
+ eor r5,r5,r4
+ eor r4,r4,r7
+ eor r5,r5,r6
+ eor r4,r4,r8
+ eor r5,r5,r9
+ eor r4,r4,r9
+ str r4,[r10,#8]
+ eor r5,r5,r4
+ add sp,sp,#32 @ destroy tab[8]
+ str r5,[r10,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-(.Lpic+8)
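+@ PC-relative load: at .Lpic the ARM PC reads 8 bytes ahead, so the +8
+@ bias makes "ldr r12,[pc,r12]" fetch the capability word from
+@ (.Lpic+8) + (OPENSSL_armcap_P-(.Lpic+8)) = OPENSSL_armcap_P.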
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 5
+
+.comm OPENSSL_armcap_P,4,4
diff --git a/main/openssl/crypto/bn/asm/armv4-gf2m.pl b/main/openssl/crypto/bn/asm/armv4-gf2m.pl
new file mode 100644
index 00000000..22ad1f85
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/armv4-gf2m.pl
@@ -0,0 +1,278 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements the bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. For the time being it's a fairly mechanical,
+# low-hanging-fruit port from C, except that it has two code paths:
+# pure integer code suitable for any ARMv4 and later CPU, and NEON
+# code suitable for ARMv7. The pure integer 1x1 multiplication
+# subroutine runs in ~45 cycles on a dual-issue core such as the
+# Cortex-A8, which is ~50% faster than compiler-generated code. For
+# ECDH and ECDSA verify (but not for ECDSA sign) this means a 25%-45%
+# improvement depending on key length, more for longer keys. Even
+# though the NEON 1x1 multiplication runs in even fewer cycles, ~30,
+# the improvement is measurable only on longer keys. One has to
+# optimize code elsewhere to get the full NEON benefit...
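+#
+# In C terms the integer path is a 3-bit windowed carry-less multiply
+# (a rough sketch of the idea, not part of the module): with
+# a1 = a & 0x3fffffff it fills
+#	tab[8] = { 0, a1, a1<<1, a1^(a1<<1), a1<<2, a1^(a1<<2),
+#		   (a1<<1)^(a1<<2), a1^(a1<<1)^(a1<<2) };
+# and then accumulates
+#	for (k = 0; k < 32; k += 3)
+#		acc ^= (uint64_t)tab[(b >> k) & 7] << k;
+# The two bits masked out of a are folded in afterwards with the
+# conditional eor instructions, which is why the table entries never
+# need more than 32 bits.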
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are shifted copies of $a
+ vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
+ vshl.u64 `&Dlo("q2")`,d16,#16
+ vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
+ vshl.u64 `&Dlo("q3")`,d16,#24
+ vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
+ vshr.u64 `&Dlo("q1")`,#8
+ vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
+ vshl.u64 `&Dhi("q1")`,#24
+ veor d0,`&Dlo("q1")`
+ vshr.u64 `&Dlo("q2")`,#16
+ veor d0,`&Dhi("q1")`
+ vshl.u64 `&Dhi("q2")`,#16
+ veor d0,`&Dlo("q2")`
+ vshr.u64 `&Dlo("q3")`,#24
+ veor d0,`&Dhi("q2")`
+ vshl.u64 `&Dhi("q3")`,#8
+ veor d0,`&Dlo("q3")`
+ veor d0,`&Dhi("q3")`
+ bx lr
+.size mul_1x1_neon,.-mul_1x1_neon
+#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
+
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
+.type mul_1x1_ialu,%function
+.align 5
+mul_1x1_ialu:
+ mov $a0,#0
+ bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
+ str $a0,[sp,#0] @ tab[0]=0
+ add $a2,$a1,$a1 @ a2=a1<<1
+ str $a1,[sp,#4] @ tab[1]=a1
+ eor $a12,$a1,$a2 @ a1^a2
+ str $a2,[sp,#8] @ tab[2]=a2
+ mov $a4,$a1,lsl#2 @ a4=a1<<2
+ str $a12,[sp,#12] @ tab[3]=a1^a2
+ eor $a14,$a1,$a4 @ a1^a4
+ str $a4,[sp,#16] @ tab[4]=a4
+ eor $a0,$a2,$a4 @ a2^a4
+ str $a14,[sp,#20] @ tab[5]=a1^a4
+ eor $a12,$a12,$a4 @ a1^a2^a4
+ str $a0,[sp,#24] @ tab[6]=a2^a4
+ and $i0,$mask,$b,lsl#2
+ str $a12,[sp,#28] @ tab[7]=a1^a2^a4
+
+ and $i1,$mask,$b,lsr#1
+ ldr $lo,[sp,$i0] @ tab[b & 0x7]
+ and $i0,$mask,$b,lsr#4
+ ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
+ and $i1,$mask,$b,lsr#7
+ ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
+ eor $lo,$lo,$t1,lsl#3 @ stall
+ mov $hi,$t1,lsr#29
+ ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
+
+ and $i0,$mask,$b,lsr#10
+ eor $lo,$lo,$t0,lsl#6
+ eor $hi,$hi,$t0,lsr#26
+ ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
+
+ and $i1,$mask,$b,lsr#13
+ eor $lo,$lo,$t1,lsl#9
+ eor $hi,$hi,$t1,lsr#23
+ ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
+
+ and $i0,$mask,$b,lsr#16
+ eor $lo,$lo,$t0,lsl#12
+ eor $hi,$hi,$t0,lsr#20
+ ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
+
+ and $i1,$mask,$b,lsr#19
+ eor $lo,$lo,$t1,lsl#15
+ eor $hi,$hi,$t1,lsr#17
+ ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
+
+ and $i0,$mask,$b,lsr#22
+ eor $lo,$lo,$t0,lsl#18
+ eor $hi,$hi,$t0,lsr#14
+ ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
+
+ and $i1,$mask,$b,lsr#25
+ eor $lo,$lo,$t1,lsl#21
+ eor $hi,$hi,$t1,lsr#11
+ ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
+
+ tst $a,#1<<30
+ and $i0,$mask,$b,lsr#28
+ eor $lo,$lo,$t0,lsl#24
+ eor $hi,$hi,$t0,lsr#8
+ ldr $t0,[sp,$i0] @ tab[b >> 30 ]
+
+ eorne $lo,$lo,$b,lsl#30
+ eorne $hi,$hi,$b,lsr#2
+ tst $a,#1<<31
+ eor $lo,$lo,$t1,lsl#27
+ eor $hi,$hi,$t1,lsr#5
+ eorne $lo,$lo,$b,lsl#31
+ eorne $hi,$hi,$b,lsr#1
+ eor $lo,$lo,$t0,lsl#30
+ eor $hi,$hi,$t0,lsr#2
+
+ mov pc,lr
+.size mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void bn_GF2m_mul_2x2(BN_ULONG *r,
+# BN_ULONG a1,BN_ULONG a0,
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+
+$code.=<<___;
+.global bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,%function
+.align 5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+.Lpic: ldr r12,[pc,r12]
+ tst r12,#1
+ beq .Lialu
+
+ veor $A1,$A1
+ vmov.32 $B1,r3,r3 @ two copies of b1
+ vmov.32 ${A1}[0],r1 @ a1
+
+ veor $A0,$A0
+ vld1.32 ${B0}[],[sp,:32] @ two copies of b0
+ vmov.32 ${A0}[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,$A1
+ vmov d17,$B1
+ bl mul_1x1_neon @ a1·b1
+ vmov $A1B1,d0
+
+ vmov d16,$A0
+ vmov d17,$B0
+ bl mul_1x1_neon @ a0·b0
+ vmov $A0B0,d0
+
+ veor d16,$A0,$A1
+ veor d17,$B0,$B1
+ veor $A0,$A0B0,$A1B1
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor $A0B0,d1
+ veor $A1B1,d0
+ vst1.32 {${A0B0}[0]},[r0,:32]!
+ vst1.32 {${A0B0}[1]},[r0,:32]!
+ vst1.32 {${A1B1}[0]},[r0,:32]!
+ vst1.32 {${A1B1}[1]},[r0,:32]
+ bx r12
+.align 4
+.Lialu:
+#endif
+___
+$ret="r10"; # reassigned 1st argument
+$code.=<<___;
+ stmdb sp!,{r4-r10,lr}
+ mov $ret,r0 @ reassign 1st argument
+ mov $b,r3 @ $b=b1
+ ldr r3,[sp,#32] @ load b0
+ mov $mask,#7<<2
+ sub sp,sp,#32 @ allocate tab[8]
+
+ bl mul_1x1_ialu @ a1·b1
+ str $lo,[$ret,#8]
+ str $hi,[$ret,#12]
+
+ eor $b,$b,r3 @ flip b0 and b1
+ eor $a,$a,r2 @ flip a0 and a1
+ eor r3,r3,$b
+ eor r2,r2,$a
+ eor $b,$b,r3
+ eor $a,$a,r2
+ bl mul_1x1_ialu @ a0·b0
+ str $lo,[$ret]
+ str $hi,[$ret,#4]
+
+ eor $a,$a,r2
+ eor $b,$b,r3
+ bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
+___
+@r=map("r$_",(6..9));
+$code.=<<___;
+ ldmia $ret,{@r[0]-@r[3]}
+ eor $lo,$lo,$hi
+ eor $hi,$hi,@r[1]
+ eor $lo,$lo,@r[0]
+ eor $hi,$hi,@r[2]
+ eor $lo,$lo,@r[3]
+ eor $hi,$hi,@r[3]
+ str $hi,[$ret,#8]
+ eor $lo,$lo,$hi
+ add sp,sp,#32 @ destroy tab[8]
+ str $lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 5
+
+.comm OPENSSL_armcap_P,4,4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/main/openssl/crypto/bn/asm/armv4-mont.pl b/main/openssl/crypto/bn/asm/armv4-mont.pl
index 14e0d2d1..f78a8b5f 100644
--- a/main/openssl/crypto/bn/asm/armv4-mont.pl
+++ b/main/openssl/crypto/bn/asm/armv4-mont.pl
@@ -23,6 +23,9 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
@@ -89,9 +92,9 @@ bn_mul_mont:
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
+ ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
- ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
@@ -101,21 +104,21 @@ bn_mul_mont:
bne .L1st
adds $nlo,$nlo,$ahi
+ ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
+ ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
- sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $bi,[$tp,#4]! @ *(++bp)
+ sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
- ldr $nj,[$np,#-4] @ np[0]
ldr $alo,[sp] @ tp[0]
+ ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
@@ -129,13 +132,13 @@ bn_mul_mont:
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
+ ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
- ldr $nj,[$np],#4 @ np[j],np++
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
- ldr $tj,[$tp,#8] @ tp[j+1]
adc $ahi,$ahi,#0
+ ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
@@ -144,13 +147,13 @@ bn_mul_mont:
adds $nlo,$nlo,$ahi
mov $nhi,#0
+ ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
+ ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
- adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
ldr $tj,[$_bpend] @ restore &bp[num]
+ adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
diff --git a/main/openssl/crypto/bn/asm/armv4-mont.s b/main/openssl/crypto/bn/asm/armv4-mont.s
index 0488455f..64c220b5 100644
--- a/main/openssl/crypto/bn/asm/armv4-mont.s
+++ b/main/openssl/crypto/bn/asm/armv4-mont.s
@@ -38,9 +38,9 @@ bn_mul_mont:
.L1st:
ldr r5,[r1],#4 @ ap[j],ap++
mov r10,r11
+ ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[0]
- ldr r6,[r3],#4 @ np[j],np++
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adds r12,r12,r10
@@ -50,21 +50,21 @@ bn_mul_mont:
bne .L1st
adds r12,r12,r11
+ ldr r4,[r0,#13*4] @ restore bp
mov r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
adc r14,r14,#0
- ldr r4,[r0,#13*4] @ restore bp
str r12,[r0] @ tp[num-1]=
- ldr r8,[r0,#14*4] @ restore n0
str r14,[r0,#4] @ tp[num]=
.Louter:
sub r7,r0,sp @ "original" r0-1 value
sub r1,r1,r7 @ "rewind" ap to &ap[1]
- sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r2,[r4,#4]! @ *(++bp)
+ sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r5,[r1,#-4] @ ap[0]
- ldr r6,[r3,#-4] @ np[0]
ldr r10,[sp] @ tp[0]
+ ldr r6,[r3,#-4] @ np[0]
ldr r7,[sp,#4] @ tp[1]
mov r11,#0
@@ -78,13 +78,13 @@ bn_mul_mont:
.Linner:
ldr r5,[r1],#4 @ ap[j],ap++
adds r10,r11,r7 @ +=tp[j]
+ ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[i]
- ldr r6,[r3],#4 @ np[j],np++
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
- ldr r7,[r4,#8] @ tp[j+1]
adc r11,r11,#0
+ ldr r7,[r4,#8] @ tp[j+1]
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
@@ -93,13 +93,13 @@ bn_mul_mont:
adds r12,r12,r11
mov r14,#0
+ ldr r4,[r0,#13*4] @ restore bp
adc r14,r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
adds r12,r12,r7
- adc r14,r14,#0
- ldr r4,[r0,#13*4] @ restore bp
ldr r7,[r0,#15*4] @ restore &bp[num]
+ adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
- ldr r8,[r0,#14*4] @ restore n0
str r14,[r0,#4] @ tp[num]=
cmp r4,r7
diff --git a/main/openssl/crypto/bn/asm/bn-586.S b/main/openssl/crypto/bn/asm/bn-586.S
new file mode 100644
index 00000000..fe873ce9
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/bn-586.S
@@ -0,0 +1,1384 @@
+.file "crypto/bn/asm/bn-586.s"
+.text
+.globl bn_mul_add_words
+.type bn_mul_add_words,@function
+.align 16
+bn_mul_add_words:
+.L_bn_mul_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 28(%esp),%ecx
+ movl 24(%esp),%ebx
+ andl $4294967288,%ecx
+ movl 32(%esp),%ebp
+ pushl %ecx
+ jz .L000maw_finish
+.align 16
+.L001maw_loop:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 28(%edi),%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ subl $8,%ecx
+ leal 32(%ebx),%ebx
+ leal 32(%edi),%edi
+ jnz .L001maw_loop
+.L000maw_finish:
+ movl 32(%esp),%ecx
+ andl $7,%ecx
+ jnz .L002maw_finish2
+ jmp .L003maw_end
+.L002maw_finish2:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ jz .L003maw_end
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L003maw_end:
+ movl %esi,%eax
+ popl %ecx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_add_words,.-.L_bn_mul_add_words_begin
+.globl bn_mul_words
+.type bn_mul_words,@function
+.align 16
+bn_mul_words:
+.L_bn_mul_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ebp
+ movl 32(%esp),%ecx
+ andl $4294967288,%ebp
+ jz .L004mw_finish
+.L005mw_loop:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ addl $32,%ebx
+ addl $32,%edi
+ subl $8,%ebp
+ jz .L004mw_finish
+ jmp .L005mw_loop
+.L004mw_finish:
+ movl 28(%esp),%ebp
+ andl $7,%ebp
+ jnz .L006mw_finish2
+ jmp .L007mw_end
+.L006mw_finish2:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L007mw_end
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L007mw_end:
+ movl %esi,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_words,.-.L_bn_mul_words_begin
+.globl bn_sqr_words
+.type bn_sqr_words,@function
+.align 16
+bn_sqr_words:
+.L_bn_sqr_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%ebx
+ andl $4294967288,%ebx
+ jz .L008sw_finish
+.L009sw_loop:
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ movl %edx,12(%esi)
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ movl %edx,20(%esi)
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ movl %edx,28(%esi)
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ movl %edx,36(%esi)
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ movl %edx,44(%esi)
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+
+ movl 28(%edi),%eax
+ mull %eax
+ movl %eax,56(%esi)
+ movl %edx,60(%esi)
+
+ addl $32,%edi
+ addl $64,%esi
+ subl $8,%ebx
+ jnz .L009sw_loop
+.L008sw_finish:
+ movl 28(%esp),%ebx
+ andl $7,%ebx
+ jz .L010sw_end
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ decl %ebx
+ movl %edx,4(%esi)
+ jz .L010sw_end
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ decl %ebx
+ movl %edx,12(%esi)
+ jz .L010sw_end
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ decl %ebx
+ movl %edx,20(%esi)
+ jz .L010sw_end
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ decl %ebx
+ movl %edx,28(%esi)
+ jz .L010sw_end
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ decl %ebx
+ movl %edx,36(%esi)
+ jz .L010sw_end
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ decl %ebx
+ movl %edx,44(%esi)
+ jz .L010sw_end
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+.L010sw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sqr_words,.-.L_bn_sqr_words_begin
+.globl bn_div_words
+.type bn_div_words,@function
+.align 16
+bn_div_words:
+.L_bn_div_words_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ movl 12(%esp),%ecx
+ divl %ecx
+ ret
+.size bn_div_words,.-.L_bn_div_words_begin
+.globl bn_add_words
+.type bn_add_words,@function
+.align 16
+bn_add_words:
+.L_bn_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L011aw_finish
+.L012aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L012aw_loop
+.L011aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L013aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L013aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L013aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L013aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L013aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L013aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L013aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L013aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_add_words,.-.L_bn_add_words_begin
+.globl bn_sub_words
+.type bn_sub_words,@function
+.align 16
+bn_sub_words:
+.L_bn_sub_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L014aw_finish
+.L015aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L015aw_loop
+.L014aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L016aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L016aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L016aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L016aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L016aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L016aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L016aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L016aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_words,.-.L_bn_sub_words_begin
+.globl bn_sub_part_words
+.type bn_sub_part_words,@function
+.align 16
+bn_sub_part_words:
+.L_bn_sub_part_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L017aw_finish
+.L018aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L018aw_loop
+.L017aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+ decl %ebp
+ jz .L019aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+ addl $4,%esi
+ addl $4,%edi
+ addl $4,%ebx
+.L019aw_end:
+ cmpl $0,36(%esp)
+ je .L020pw_end
+ movl 36(%esp),%ebp
+ cmpl $0,%ebp
+ je .L020pw_end
+ jge .L021pw_pos
+
+ movl $0,%edx
+ subl %ebp,%edx
+ movl %edx,%ebp
+ andl $4294967288,%ebp
+ jz .L022pw_neg_finish
+.L023pw_neg_loop:
+
+ movl $0,%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl $0,%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl $0,%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl $0,%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl $0,%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl $0,%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl $0,%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl $0,%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L023pw_neg_loop
+.L022pw_neg_finish:
+ movl 36(%esp),%edx
+ movl $0,%ebp
+ subl %edx,%ebp
+ andl $7,%ebp
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L020pw_end
+
+ movl $0,%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+ jmp .L020pw_end
+.L021pw_pos:
+ andl $4294967288,%ebp
+ jz .L024pw_pos_finish
+.L025pw_pos_loop:
+
+ movl (%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,(%ebx)
+ jnc .L026pw_nc0
+
+ movl 4(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,4(%ebx)
+ jnc .L027pw_nc1
+
+ movl 8(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,8(%ebx)
+ jnc .L028pw_nc2
+
+ movl 12(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,12(%ebx)
+ jnc .L029pw_nc3
+
+ movl 16(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,16(%ebx)
+ jnc .L030pw_nc4
+
+ movl 20(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,20(%ebx)
+ jnc .L031pw_nc5
+
+ movl 24(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,24(%ebx)
+ jnc .L032pw_nc6
+
+ movl 28(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,28(%ebx)
+ jnc .L033pw_nc7
+
+ addl $32,%esi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L025pw_pos_loop
+.L024pw_pos_finish:
+ movl 36(%esp),%ebp
+ andl $7,%ebp
+ jz .L020pw_end
+
+ movl (%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,(%ebx)
+ jnc .L034pw_tail_nc0
+ decl %ebp
+ jz .L020pw_end
+
+ movl 4(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,4(%ebx)
+ jnc .L035pw_tail_nc1
+ decl %ebp
+ jz .L020pw_end
+
+ movl 8(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,8(%ebx)
+ jnc .L036pw_tail_nc2
+ decl %ebp
+ jz .L020pw_end
+
+ movl 12(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,12(%ebx)
+ jnc .L037pw_tail_nc3
+ decl %ebp
+ jz .L020pw_end
+
+ movl 16(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,16(%ebx)
+ jnc .L038pw_tail_nc4
+ decl %ebp
+ jz .L020pw_end
+
+ movl 20(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,20(%ebx)
+ jnc .L039pw_tail_nc5
+ decl %ebp
+ jz .L020pw_end
+
+ movl 24(%esi),%ecx
+ subl %eax,%ecx
+ movl %ecx,24(%ebx)
+ jnc .L040pw_tail_nc6
+ movl $1,%eax
+ jmp .L020pw_end
+.L041pw_nc_loop:
+ movl (%esi),%ecx
+ movl %ecx,(%ebx)
+.L026pw_nc0:
+ movl 4(%esi),%ecx
+ movl %ecx,4(%ebx)
+.L027pw_nc1:
+ movl 8(%esi),%ecx
+ movl %ecx,8(%ebx)
+.L028pw_nc2:
+ movl 12(%esi),%ecx
+ movl %ecx,12(%ebx)
+.L029pw_nc3:
+ movl 16(%esi),%ecx
+ movl %ecx,16(%ebx)
+.L030pw_nc4:
+ movl 20(%esi),%ecx
+ movl %ecx,20(%ebx)
+.L031pw_nc5:
+ movl 24(%esi),%ecx
+ movl %ecx,24(%ebx)
+.L032pw_nc6:
+ movl 28(%esi),%ecx
+ movl %ecx,28(%ebx)
+.L033pw_nc7:
+
+ addl $32,%esi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L041pw_nc_loop
+ movl 36(%esp),%ebp
+ andl $7,%ebp
+ jz .L042pw_nc_end
+ movl (%esi),%ecx
+ movl %ecx,(%ebx)
+.L034pw_tail_nc0:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 4(%esi),%ecx
+ movl %ecx,4(%ebx)
+.L035pw_tail_nc1:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 8(%esi),%ecx
+ movl %ecx,8(%ebx)
+.L036pw_tail_nc2:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 12(%esi),%ecx
+ movl %ecx,12(%ebx)
+.L037pw_tail_nc3:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 16(%esi),%ecx
+ movl %ecx,16(%ebx)
+.L038pw_tail_nc4:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 20(%esi),%ecx
+ movl %ecx,20(%ebx)
+.L039pw_tail_nc5:
+ decl %ebp
+ jz .L042pw_nc_end
+ movl 24(%esi),%ecx
+ movl %ecx,24(%ebx)
+.L040pw_tail_nc6:
+.L042pw_nc_end:
+ movl $0,%eax
+.L020pw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_part_words,.-.L_bn_sub_part_words_begin
diff --git a/main/openssl/crypto/bn/asm/bn-mips.S b/main/openssl/crypto/bn/asm/bn-mips.S
new file mode 100644
index 00000000..2e7cccb7
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/bn-mips.S
@@ -0,0 +1,2175 @@
+.set mips2
+.rdata
+.asciiz "mips3.s, Version 1.2"
+.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+
+.text
+.set noat
+
+.align 5
+.globl bn_mul_add_words
+.ent bn_mul_add_words
+bn_mul_add_words:
+ .set noreorder
+ bgtz $6,bn_mul_add_words_internal
+ move $2,$0
+ jr $31
+ move $4,$2
+.end bn_mul_add_words
+
+.align 5
+.ent bn_mul_add_words_internal
+bn_mul_add_words_internal:
+ .set reorder
+ li $3,-4
+ and $8,$6,$3
+ beqz $8,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+ lw $12,0($5)
+ multu $12,$7
+ lw $13,0($4)
+ lw $14,4($5)
+ lw $15,4($4)
+ lw $8,2*4($5)
+ lw $9,2*4($4)
+ addu $13,$2
+ sltu $2,$13,$2 # All manuals say it "compares 32-bit
+ # values", but it seems to work fine
+ # even on 64-bit registers.
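+				# (In C terms: after s = s + c the
+				# carry out is simply (s < c), which
+				# is exactly what sltu computes here.)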
+ mflo $1
+ mfhi $12
+ addu $13,$1
+ addu $2,$12
+ multu $14,$7
+ sltu $1,$13,$1
+ sw $13,0($4)
+ addu $2,$1
+
+ lw $10,3*4($5)
+ lw $11,3*4($4)
+ addu $15,$2
+ sltu $2,$15,$2
+ mflo $1
+ mfhi $14
+ addu $15,$1
+ addu $2,$14
+ multu $8,$7
+ sltu $1,$15,$1
+ sw $15,4($4)
+ addu $2,$1
+
+ subu $6,4
+ addu $4,4*4
+ addu $5,4*4
+ addu $9,$2
+ sltu $2,$9,$2
+ mflo $1
+ mfhi $8
+ addu $9,$1
+ addu $2,$8
+ multu $10,$7
+ sltu $1,$9,$1
+ sw $9,-2*4($4)
+ addu $2,$1
+
+
+ and $8,$6,$3
+ addu $11,$2
+ sltu $2,$11,$2
+ mflo $1
+ mfhi $10
+ addu $11,$1
+ addu $2,$10
+ sltu $1,$11,$1
+ sw $11,-4($4)
+ .set noreorder
+ bgtz $8,.L_bn_mul_add_words_loop
+ addu $2,$1
+
+ beqz $6,.L_bn_mul_add_words_return
+ nop
+
+.L_bn_mul_add_words_tail:
+ .set reorder
+ lw $12,0($5)
+ multu $12,$7
+ lw $13,0($4)
+ subu $6,1
+ addu $13,$2
+ sltu $2,$13,$2
+ mflo $1
+ mfhi $12
+ addu $13,$1
+ addu $2,$12
+ sltu $1,$13,$1
+ sw $13,0($4)
+ addu $2,$1
+ beqz $6,.L_bn_mul_add_words_return
+
+ lw $12,4($5)
+ multu $12,$7
+ lw $13,4($4)
+ subu $6,1
+ addu $13,$2
+ sltu $2,$13,$2
+ mflo $1
+ mfhi $12
+ addu $13,$1
+ addu $2,$12
+ sltu $1,$13,$1
+ sw $13,4($4)
+ addu $2,$1
+ beqz $6,.L_bn_mul_add_words_return
+
+ lw $12,2*4($5)
+ multu $12,$7
+ lw $13,2*4($4)
+ addu $13,$2
+ sltu $2,$13,$2
+ mflo $1
+ mfhi $12
+ addu $13,$1
+ addu $2,$12
+ sltu $1,$13,$1
+ sw $13,2*4($4)
+ addu $2,$1
+
+.L_bn_mul_add_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+.end bn_mul_add_words_internal
+
+.align 5
+.globl bn_mul_words
+.ent bn_mul_words
+bn_mul_words:
+ .set noreorder
+ bgtz $6,bn_mul_words_internal
+ move $2,$0
+ jr $31
+ move $4,$2
+.end bn_mul_words
+
+.align 5
+.ent bn_mul_words_internal
+bn_mul_words_internal:
+ .set reorder
+ li $3,-4
+ and $8,$6,$3
+ beqz $8,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+ lw $12,0($5)
+ multu $12,$7
+ lw $14,4($5)
+ lw $8,2*4($5)
+ lw $10,3*4($5)
+ mflo $1
+ mfhi $12
+ addu $2,$1
+ sltu $13,$2,$1
+ multu $14,$7
+ sw $2,0($4)
+ addu $2,$13,$12
+
+ subu $6,4
+ addu $4,4*4
+ addu $5,4*4
+ mflo $1
+ mfhi $14
+ addu $2,$1
+ sltu $15,$2,$1
+ multu $8,$7
+ sw $2,-3*4($4)
+ addu $2,$15,$14
+
+ mflo $1
+ mfhi $8
+ addu $2,$1
+ sltu $9,$2,$1
+ multu $10,$7
+ sw $2,-2*4($4)
+ addu $2,$9,$8
+
+ and $8,$6,$3
+ mflo $1
+ mfhi $10
+ addu $2,$1
+ sltu $11,$2,$1
+ sw $2,-4($4)
+ .set noreorder
+ bgtz $8,.L_bn_mul_words_loop
+ addu $2,$11,$10
+
+ beqz $6,.L_bn_mul_words_return
+ nop
+
+.L_bn_mul_words_tail:
+ .set reorder
+ lw $12,0($5)
+ multu $12,$7
+ subu $6,1
+ mflo $1
+ mfhi $12
+ addu $2,$1
+ sltu $13,$2,$1
+ sw $2,0($4)
+ addu $2,$13,$12
+ beqz $6,.L_bn_mul_words_return
+
+ lw $12,4($5)
+ multu $12,$7
+ subu $6,1
+ mflo $1
+ mfhi $12
+ addu $2,$1
+ sltu $13,$2,$1
+ sw $2,4($4)
+ addu $2,$13,$12
+ beqz $6,.L_bn_mul_words_return
+
+ lw $12,2*4($5)
+ multu $12,$7
+ mflo $1
+ mfhi $12
+ addu $2,$1
+ sltu $13,$2,$1
+ sw $2,2*4($4)
+ addu $2,$13,$12
+
+.L_bn_mul_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+.end bn_mul_words_internal
+
+.align 5
+.globl bn_sqr_words
+.ent bn_sqr_words
+bn_sqr_words:
+ .set noreorder
+ bgtz $6,bn_sqr_words_internal
+ move $2,$0
+ jr $31
+ move $4,$2
+.end bn_sqr_words
+
+.align 5
+.ent bn_sqr_words_internal
+bn_sqr_words_internal:
+ .set reorder
+ li $3,-4
+ and $8,$6,$3
+ beqz $8,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+ lw $12,0($5)
+ multu $12,$12
+ lw $14,4($5)
+ lw $8,2*4($5)
+ lw $10,3*4($5)
+ mflo $13
+ mfhi $12
+ sw $13,0($4)
+ sw $12,4($4)
+
+ multu $14,$14
+ subu $6,4
+ addu $4,8*4
+ addu $5,4*4
+ mflo $15
+ mfhi $14
+ sw $15,-6*4($4)
+ sw $14,-5*4($4)
+
+ multu $8,$8
+ mflo $9
+ mfhi $8
+ sw $9,-4*4($4)
+ sw $8,-3*4($4)
+
+
+ multu $10,$10
+ and $8,$6,$3
+ mflo $11
+ mfhi $10
+ sw $11,-2*4($4)
+
+ .set noreorder
+ bgtz $8,.L_bn_sqr_words_loop
+ sw $10,-4($4)
+
+ beqz $6,.L_bn_sqr_words_return
+ nop
+
+.L_bn_sqr_words_tail:
+ .set reorder
+ lw $12,0($5)
+ multu $12,$12
+ subu $6,1
+ mflo $13
+ mfhi $12
+ sw $13,0($4)
+ sw $12,4($4)
+ beqz $6,.L_bn_sqr_words_return
+
+ lw $12,4($5)
+ multu $12,$12
+ subu $6,1
+ mflo $13
+ mfhi $12
+ sw $13,2*4($4)
+ sw $12,3*4($4)
+ beqz $6,.L_bn_sqr_words_return
+
+ lw $12,2*4($5)
+ multu $12,$12
+ mflo $13
+ mfhi $12
+ sw $13,4*4($4)
+ sw $12,5*4($4)
+
+.L_bn_sqr_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+
+.end bn_sqr_words_internal
+
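+# bn_add_words(rp,ap,bp,num): rp[] = ap[] + bp[], returning the final
+# carry (0 or 1) in $2. MIPS has no add-with-carry instruction, so each
+# limb's carry-out is reconstructed with sltu: once for ap[i]+bp[i] and
+# once more for the incoming carry.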
+.align 5
+.globl bn_add_words
+.ent bn_add_words
+bn_add_words:
+ .set noreorder
+ bgtz $7,bn_add_words_internal
+ move $2,$0
+ jr $31
+ move $4,$2
+.end bn_add_words
+
+.align 5
+.ent bn_add_words_internal
+bn_add_words_internal:
+ .set reorder
+ li $3,-4
+ and $1,$7,$3
+ beqz $1,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+ lw $12,0($5)
+ lw $8,0($6)
+ subu $7,4
+ lw $13,4($5)
+ and $1,$7,$3
+ lw $14,2*4($5)
+ addu $6,4*4
+ lw $15,3*4($5)
+ addu $4,4*4
+ lw $9,-3*4($6)
+ addu $5,4*4
+ lw $10,-2*4($6)
+ lw $11,-4($6)
+ addu $8,$12
+ sltu $24,$8,$12
+ addu $12,$8,$2
+ sltu $2,$12,$8
+ sw $12,-4*4($4)
+ addu $2,$24
+
+ addu $9,$13
+ sltu $25,$9,$13
+ addu $13,$9,$2
+ sltu $2,$13,$9
+ sw $13,-3*4($4)
+ addu $2,$25
+
+ addu $10,$14
+ sltu $24,$10,$14
+ addu $14,$10,$2
+ sltu $2,$14,$10
+ sw $14,-2*4($4)
+ addu $2,$24
+
+ addu $11,$15
+ sltu $25,$11,$15
+ addu $15,$11,$2
+ sltu $2,$15,$11
+ sw $15,-4($4)
+
+ .set noreorder
+ bgtz $1,.L_bn_add_words_loop
+ addu $2,$25
+
+ beqz $7,.L_bn_add_words_return
+ nop
+
+.L_bn_add_words_tail:
+ .set reorder
+ lw $12,0($5)
+ lw $8,0($6)
+ addu $8,$12
+ subu $7,1
+ sltu $24,$8,$12
+ addu $12,$8,$2
+ sltu $2,$12,$8
+ sw $12,0($4)
+ addu $2,$24
+ beqz $7,.L_bn_add_words_return
+
+ lw $13,4($5)
+ lw $9,4($6)
+ addu $9,$13
+ subu $7,1
+ sltu $25,$9,$13
+ addu $13,$9,$2
+ sltu $2,$13,$9
+ sw $13,4($4)
+ addu $2,$25
+ beqz $7,.L_bn_add_words_return
+
+ lw $14,2*4($5)
+ lw $10,2*4($6)
+ addu $10,$14
+ sltu $24,$10,$14
+ addu $14,$10,$2
+ sltu $2,$14,$10
+ sw $14,2*4($4)
+ addu $2,$24
+
+.L_bn_add_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+
+.end bn_add_words_internal
+
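+# bn_sub_words(rp,ap,bp,num): rp[] = ap[] - bp[], returning the final
+# borrow (0 or 1) in $2. The borrow out of each limb is recovered with
+# sltu (was ap[i] < bp[i]?) plus sgtu for the borrow-in correction.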
+.align 5
+.globl bn_sub_words
+.ent bn_sub_words
+bn_sub_words:
+ .set noreorder
+ bgtz $7,bn_sub_words_internal
+ move $2,$0
+ jr $31
+ move $4,$0
+.end bn_sub_words
+
+.align 5
+.ent bn_sub_words_internal
+bn_sub_words_internal:
+ .set reorder
+ li $3,-4
+ and $1,$7,$3
+ beqz $1,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+ lw $12,0($5)
+ lw $8,0($6)
+ subu $7,4
+ lw $13,4($5)
+ and $1,$7,$3
+ lw $14,2*4($5)
+ addu $6,4*4
+ lw $15,3*4($5)
+ addu $4,4*4
+ lw $9,-3*4($6)
+ addu $5,4*4
+ lw $10,-2*4($6)
+ lw $11,-4($6)
+ sltu $24,$12,$8
+ subu $8,$12,$8
+ subu $12,$8,$2
+ sgtu $2,$12,$8
+ sw $12,-4*4($4)
+ addu $2,$24
+
+ sltu $25,$13,$9
+ subu $9,$13,$9
+ subu $13,$9,$2
+ sgtu $2,$13,$9
+ sw $13,-3*4($4)
+ addu $2,$25
+
+
+ sltu $24,$14,$10
+ subu $10,$14,$10
+ subu $14,$10,$2
+ sgtu $2,$14,$10
+ sw $14,-2*4($4)
+ addu $2,$24
+
+ sltu $25,$15,$11
+ subu $11,$15,$11
+ subu $15,$11,$2
+ sgtu $2,$15,$11
+ sw $15,-4($4)
+
+ .set noreorder
+ bgtz $1,.L_bn_sub_words_loop
+ addu $2,$25
+
+ beqz $7,.L_bn_sub_words_return
+ nop
+
+.L_bn_sub_words_tail:
+ .set reorder
+ lw $12,0($5)
+ lw $8,0($6)
+ subu $7,1
+ sltu $24,$12,$8
+ subu $8,$12,$8
+ subu $12,$8,$2
+ sgtu $2,$12,$8
+ sw $12,0($4)
+ addu $2,$24
+ beqz $7,.L_bn_sub_words_return
+
+ lw $13,4($5)
+ subu $7,1
+ lw $9,4($6)
+ sltu $25,$13,$9
+ subu $9,$13,$9
+ subu $13,$9,$2
+ sgtu $2,$13,$9
+ sw $13,4($4)
+ addu $2,$25
+ beqz $7,.L_bn_sub_words_return
+
+ lw $14,2*4($5)
+ lw $10,2*4($6)
+ sltu $24,$14,$10
+ subu $10,$14,$10
+ subu $14,$10,$2
+ sgtu $2,$14,$10
+ sw $14,2*4($4)
+ addu $2,$24
+
+.L_bn_sub_words_return:
+ .set noreorder
+ jr $31
+ move $4,$2
+.end bn_sub_words_internal
+
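+# bn_div_3_words is the BN_DIV3W helper used by BN_div: $4 points at the
+# most significant numerator word (the next two live at -4($4) and
+# -8($4)), $6 holds the most significant divisor word and $5 the one
+# below it. It returns a one-word quotient estimate, or all-ones when
+# the leading words are equal and the true quotient would overflow.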
+.align 5
+.globl bn_div_3_words
+.ent bn_div_3_words
+bn_div_3_words:
+ .set noreorder
+ move $7,$4 # we know that bn_div_words does not
+ # touch $7, $10, $11 and preserves $6
+ # so that we can save two arguments
+ # and return address in registers
+ # instead of stack:-)
+
+ lw $4,($7)
+ move $10,$5
+ bne $4,$6,bn_div_3_words_internal
+ lw $5,-4($7)
+ li $2,-1
+ jr $31
+ move $4,$2
+.end bn_div_3_words
+
+.align 5
+.ent bn_div_3_words_internal
+bn_div_3_words_internal:
+ .set reorder
+ move $11,$31
+ bal bn_div_words_internal
+ move $31,$11
+ multu $10,$2
+ lw $14,-2*4($7)
+ move $8,$0
+ mfhi $13
+ mflo $12
+ sltu $24,$13,$5
+.L_bn_div_3_words_inner_loop:
+ bnez $24,.L_bn_div_3_words_inner_loop_done
+ sgeu $1,$14,$12
+ seq $25,$13,$5
+ and $1,$25
+ sltu $15,$12,$10
+ addu $5,$6
+ subu $13,$15
+ subu $12,$10
+ sltu $24,$13,$5
+ sltu $8,$5,$6
+ or $24,$8
+ .set noreorder
+ beqz $1,.L_bn_div_3_words_inner_loop
+ subu $2,1
+ addu $2,1
+ .set reorder
+.L_bn_div_3_words_inner_loop_done:
+ .set noreorder
+ jr $31
+ move $4,$2
+.end bn_div_3_words_internal
+
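+# bn_div_words(h,l,d): one-word quotient of the double-word value h:l
+# divided by d, assuming h < d. The internal routine first normalizes d
+# until its top bit is set ('break 6' if nonzero bits of h would be
+# shifted out), then builds the quotient 16 bits at a time with divu on
+# the top halves plus a small correction loop for each half.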
+.align 5
+.globl bn_div_words
+.ent bn_div_words
+bn_div_words:
+ .set noreorder
+ bnez $6,bn_div_words_internal
+ li $2,-1 # I would rather signal div-by-zero
+ # which can be done with 'break 7'
+ jr $31
+ move $4,$2
+.end bn_div_words
+
+.align 5
+.ent bn_div_words_internal
+bn_div_words_internal:
+ move $3,$0
+ bltz $6,.L_bn_div_words_body
+ move $25,$3
+ sll $6,1
+ bgtz $6,.-4
+ addu $25,1
+
+ .set reorder
+ negu $13,$25
+ li $14,-1
+ sll $14,$13
+ and $14,$4
+ srl $1,$5,$13
+ .set noreorder
+ beqz $14,.+12
+ nop
+ break 6 # signal overflow
+ .set reorder
+ sll $4,$25
+ sll $5,$25
+ or $4,$1
+.L_bn_div_words_body:
+ srl $3,$6,4*4 # bits
+ sgeu $1,$4,$6
+ .set noreorder
+ beqz $1,.+12
+ nop
+ subu $4,$6
+ .set reorder
+
+ li $8,-1
+ srl $9,$4,4*4 # bits
+ srl $8,4*4 # q=0xffffffff
+ beq $3,$9,.L_bn_div_words_skip_div1
+ divu $0,$4,$3
+ mflo $8
+.L_bn_div_words_skip_div1:
+ multu $6,$8
+ sll $15,$4,4*4 # bits
+ srl $1,$5,4*4 # bits
+ or $15,$1
+ mflo $12
+ mfhi $13
+.L_bn_div_words_inner_loop1:
+ sltu $14,$15,$12
+ seq $24,$9,$13
+ sltu $1,$9,$13
+ and $14,$24
+ sltu $2,$12,$6
+ or $1,$14
+ .set noreorder
+ beqz $1,.L_bn_div_words_inner_loop1_done
+ subu $13,$2
+ subu $12,$6
+ b .L_bn_div_words_inner_loop1
+ subu $8,1
+ .set reorder
+.L_bn_div_words_inner_loop1_done:
+
+ sll $5,4*4 # bits
+ subu $4,$15,$12
+ sll $2,$8,4*4 # bits
+
+ li $8,-1
+ srl $9,$4,4*4 # bits
+ srl $8,4*4 # q=0xffffffff
+ beq $3,$9,.L_bn_div_words_skip_div2
+ divu $0,$4,$3
+ mflo $8
+.L_bn_div_words_skip_div2:
+ multu $6,$8
+ sll $15,$4,4*4 # bits
+ srl $1,$5,4*4 # bits
+ or $15,$1
+ mflo $12
+ mfhi $13
+.L_bn_div_words_inner_loop2:
+ sltu $14,$15,$12
+ seq $24,$9,$13
+ sltu $1,$9,$13
+ and $14,$24
+ sltu $3,$12,$6
+ or $1,$14
+ .set noreorder
+ beqz $1,.L_bn_div_words_inner_loop2_done
+ subu $13,$3
+ subu $12,$6
+ b .L_bn_div_words_inner_loop2
+ subu $8,1
+ .set reorder
+.L_bn_div_words_inner_loop2_done:
+
+ subu $4,$15,$12
+ or $2,$8
+ srl $3,$4,$25 # $3 contains remainder if anybody wants it
+ srl $6,$25 # restore $6
+
+ .set noreorder
+ move $5,$3
+ jr $31
+ move $4,$2
+.end bn_div_words_internal
+
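+# bn_mul_comba8(r,a,b): full 8x8-word Comba multiplication, r[0..15]=a*b.
+# Partial products are accumulated column by column into a rotating
+# three-word accumulator (the c1/c2/c3 of the mul_add_c comments). Note
+# how each multu is issued in the middle of folding the previous
+# product's mflo/mfhi into the column sum, hiding multiplier latency
+# behind the carry arithmetic.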
+.align 5
+.globl bn_mul_comba8
+.ent bn_mul_comba8
+bn_mul_comba8:
+ .set noreorder
+ .frame $29,6*4,$31
+ .mask 0x003f0000,-4
+ subu $29,6*4
+ sw $21,5*4($29)
+ sw $20,4*4($29)
+ sw $19,3*4($29)
+ sw $18,2*4($29)
+ sw $17,1*4($29)
+ sw $16,0*4($29)
+
+ .set reorder
+ lw $12,0($5) # If compiled with -mips3 option on
+ # R5000 box assembler barks on this
+ # line with "should not have mult/div
+ # as last instruction in bb (R10K
+ # bug)" warning. If anybody out there
+ # has a clue about how to circumvent
+ # this do send me a note.
+ # <appro@fy.chalmers.se>
+
+ lw $8,0($6)
+ lw $13,4($5)
+ lw $14,2*4($5)
+ multu $12,$8 # mul_add_c(a[0],b[0],c1,c2,c3);
+ lw $15,3*4($5)
+ lw $9,4($6)
+ lw $10,2*4($6)
+ lw $11,3*4($6)
+ mflo $2
+ mfhi $3
+
+ lw $16,4*4($5)
+ lw $18,5*4($5)
+ multu $12,$9 # mul_add_c(a[0],b[1],c2,c3,c1);
+ lw $20,6*4($5)
+ lw $5,7*4($5)
+ lw $17,4*4($6)
+ lw $19,5*4($6)
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$8 # mul_add_c(a[1],b[0],c2,c3,c1);
+ addu $7,$25,$1
+ lw $21,6*4($6)
+ lw $6,7*4($6)
+ sw $2,0($4) # r[0]=c1;
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$8 # mul_add_c(a[2],b[0],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ sw $3,4($4) # r[1]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $13,$9 # mul_add_c(a[1],b[1],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$10 # mul_add_c(a[0],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$11 # mul_add_c(a[0],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,2*4($4) # r[2]=c3;
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $13,$10 # mul_add_c(a[1],b[2],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $14,$9 # mul_add_c(a[2],b[1],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$8 # mul_add_c(a[3],b[0],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $16,$8 # mul_add_c(a[4],b[0],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,3*4($4) # r[3]=c1;
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $15,$9 # mul_add_c(a[3],b[1],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$10 # mul_add_c(a[2],b[2],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$11 # mul_add_c(a[1],b[3],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $12,$17 # mul_add_c(a[0],b[4],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $12,$19 # mul_add_c(a[0],b[5],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,4*4($4) # r[4]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $13,$17 # mul_add_c(a[1],b[4],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $14,$11 # mul_add_c(a[2],b[3],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $15,$10 # mul_add_c(a[3],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $16,$9 # mul_add_c(a[4],b[1],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $18,$8 # mul_add_c(a[5],b[0],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $20,$8 # mul_add_c(a[6],b[0],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,5*4($4) # r[5]=c3;
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $18,$9 # mul_add_c(a[5],b[1],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $16,$10 # mul_add_c(a[4],b[2],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$11 # mul_add_c(a[3],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $14,$17 # mul_add_c(a[2],b[4],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $13,$19 # mul_add_c(a[1],b[5],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $12,$21 # mul_add_c(a[0],b[6],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $12,$6 # mul_add_c(a[0],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,6*4($4) # r[6]=c1;
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$21 # mul_add_c(a[1],b[6],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$19 # mul_add_c(a[2],b[5],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $15,$17 # mul_add_c(a[3],b[4],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $16,$11 # mul_add_c(a[4],b[3],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $18,$10 # mul_add_c(a[5],b[2],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $20,$9 # mul_add_c(a[6],b[1],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $5,$8 # mul_add_c(a[7],b[0],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $5,$9 # mul_add_c(a[7],b[1],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,7*4($4) # r[7]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $20,$10 # mul_add_c(a[6],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $18,$11 # mul_add_c(a[5],b[3],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $16,$17 # mul_add_c(a[4],b[4],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $15,$19 # mul_add_c(a[3],b[5],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $14,$21 # mul_add_c(a[2],b[6],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $13,$6 # mul_add_c(a[1],b[7],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $14,$6 # mul_add_c(a[2],b[7],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,8*4($4) # r[8]=c3;
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$21 # mul_add_c(a[3],b[6],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $16,$19 # mul_add_c(a[4],b[5],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $18,$17 # mul_add_c(a[5],b[4],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $20,$11 # mul_add_c(a[6],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $5,$10 # mul_add_c(a[7],b[2],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $5,$11 # mul_add_c(a[7],b[3],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,9*4($4) # r[9]=c1;
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $20,$17 # mul_add_c(a[6],b[4],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $18,$19 # mul_add_c(a[5],b[5],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $16,$21 # mul_add_c(a[4],b[6],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $15,$6 # mul_add_c(a[3],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $16,$6 # mul_add_c(a[4],b[7],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,10*4($4) # r[10]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $18,$21 # mul_add_c(a[5],b[6],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $20,$19 # mul_add_c(a[6],b[5],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $5,$17 # mul_add_c(a[7],b[4],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $5,$19 # mul_add_c(a[7],b[5],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,11*4($4) # r[11]=c3;
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $20,$21 # mul_add_c(a[6],b[6],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $18,$6 # mul_add_c(a[5],b[7],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $20,$6 # mul_add_c(a[6],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,12*4($4) # r[12]=c1;
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $5,$21 # mul_add_c(a[7],b[6],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $5,$6 # mul_add_c(a[7],b[7],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,13*4($4) # r[13]=c2;
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sw $7,14*4($4) # r[14]=c3;
+ sw $2,15*4($4) # r[15]=c1;
+
+ .set noreorder
+ lw $21,5*4($29)
+ lw $20,4*4($29)
+ lw $19,3*4($29)
+ lw $18,2*4($29)
+ lw $17,1*4($29)
+ lw $16,0*4($29)
+ jr $31
+ addu $29,6*4
+.end bn_mul_comba8
+
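+# bn_mul_comba4(r,a,b): the 4x4-word variant of the same Comba scheme,
+# producing r[0..7].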
+.align 5
+.globl bn_mul_comba4
+.ent bn_mul_comba4
+bn_mul_comba4:
+ .set reorder
+ lw $12,0($5)
+ lw $8,0($6)
+ lw $13,4($5)
+ lw $14,2*4($5)
+ multu $12,$8 # mul_add_c(a[0],b[0],c1,c2,c3);
+ lw $15,3*4($5)
+ lw $9,4($6)
+ lw $10,2*4($6)
+ lw $11,3*4($6)
+ mflo $2
+ mfhi $3
+ sw $2,0($4)
+
+ multu $12,$9 # mul_add_c(a[0],b[1],c2,c3,c1);
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$8 # mul_add_c(a[1],b[0],c2,c3,c1);
+ addu $7,$25,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$8 # mul_add_c(a[2],b[0],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ sw $3,4($4)
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $13,$9 # mul_add_c(a[1],b[1],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$10 # mul_add_c(a[0],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$11 # mul_add_c(a[0],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,2*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $13,$10 # mul_add_c(a[1],b[2],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $7,$3,$25
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $14,$9 # mul_add_c(a[2],b[1],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$8 # mul_add_c(a[3],b[0],c1,c2,c3);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $15,$9 # mul_add_c(a[3],b[1],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,3*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$10 # mul_add_c(a[2],b[2],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $2,$7,$25
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $13,$11 # mul_add_c(a[1],b[3],c2,c3,c1);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$11 # mul_add_c(a[2],b[3],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,4*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $15,$10 # mul_add_c(a[3],b[2],c3,c1,c2);
+ addu $25,$1
+ addu $2,$25
+ sltu $3,$2,$25
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $15,$11 # mul_add_c(a[3],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,5*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sw $2,6*4($4)
+ sw $3,7*4($4)
+
+ .set noreorder
+ jr $31
+ nop
+.end bn_mul_comba4
+
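+# bn_sqr_comba8(r,a): Comba squaring, r[0..15] = a^2. Cross products
+# a[i]*a[j] (i != j) occur twice in a square, so the mul_add_c2 steps
+# double lo and hi with sll by 1, recovering each shifted-out top bit
+# with slt against $0 before adding it into the column.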
+.align 5
+.globl bn_sqr_comba8
+.ent bn_sqr_comba8
+bn_sqr_comba8:
+ .set reorder
+ lw $12,0($5)
+ lw $13,4($5)
+ lw $14,2*4($5)
+ lw $15,3*4($5)
+
+ multu $12,$12 # mul_add_c(a[0],b[0],c1,c2,c3);
+ lw $8,4*4($5)
+ lw $9,5*4($5)
+ lw $10,6*4($5)
+ lw $11,7*4($5)
+ mflo $2
+ mfhi $3
+ sw $2,0($4)
+
+ multu $12,$13 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $14,$12 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $7,$25,$1
+ sw $3,4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $13,$13 # mul_add_c(a[1],b[1],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$15 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,2*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $13,$14 # mul_add_c2(a[1],b[2],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $8,$12 # mul_add_c2(a[4],b[0],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,3*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $15,$13 # mul_add_c2(a[3],b[1],c2,c3,c1);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $14,$14 # mul_add_c(a[2],b[2],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $12,$9 # mul_add_c2(a[0],b[5],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,4*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $13,$8 # mul_add_c2(a[1],b[4],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $3,$1
+ multu $14,$15 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ multu $10,$12 # mul_add_c2(a[6],b[0],c1,c2,c3);
+ addu $3,$1
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,5*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $9,$13 # mul_add_c2(a[5],b[1],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $8,$14 # mul_add_c2(a[4],b[2],c1,c2,c3);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $15,$15 # mul_add_c(a[3],b[3],c1,c2,c3);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $12,$11 # mul_add_c2(a[0],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,6*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $13,$10 # mul_add_c2(a[1],b[6],c2,c3,c1);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $14,$9 # mul_add_c2(a[2],b[5],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $15,$8 # mul_add_c2(a[3],b[4],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $11,$13 # mul_add_c2(a[7],b[1],c3,c1,c2);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,7*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $10,$14 # mul_add_c2(a[6],b[2],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $3,$1
+ multu $9,$15 # mul_add_c2(a[5],b[3],c3,c1,c2);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $3,$1
+ multu $8,$8 # mul_add_c(a[4],b[4],c3,c1,c2);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $14,$11 # mul_add_c2(a[2],b[7],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,8*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $15,$10 # mul_add_c2(a[3],b[6],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $8,$9 # mul_add_c2(a[4],b[5],c1,c2,c3);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $11,$15 # mul_add_c2(a[7],b[3],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,9*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $10,$8 # mul_add_c2(a[6],b[4],c2,c3,c1);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $2,$1
+ multu $9,$9 # mul_add_c(a[5],b[5],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $8,$11 # mul_add_c2(a[4],b[7],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,10*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $9,$10 # mul_add_c2(a[5],b[6],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $3,$1
+ multu $11,$9 # mul_add_c2(a[7],b[5],c1,c2,c3);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,11*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $10,$10 # mul_add_c(a[6],b[6],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ multu $10,$11 # mul_add_c2(a[6],b[7],c2,c3,c1);
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,12*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $11,$11 # mul_add_c(a[7],b[7],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,13*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sw $7,14*4($4)
+ sw $2,15*4($4)
+
+ .set noreorder
+ jr $31
+ nop
+.end bn_sqr_comba8
+
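+# bn_sqr_comba4(r,a): 4-word Comba squaring, r[0..7] = a^2, using the
+# same mul_add_c2 doubling trick as bn_sqr_comba8.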
+.align 5
+.globl bn_sqr_comba4
+.ent bn_sqr_comba4
+bn_sqr_comba4:
+ .set reorder
+ lw $12,0($5)
+ lw $13,4($5)
+ multu $12,$12 # mul_add_c(a[0],b[0],c1,c2,c3);
+ lw $14,2*4($5)
+ lw $15,3*4($5)
+ mflo $2
+ mfhi $3
+ sw $2,0($4)
+
+ multu $12,$13 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $14,$12 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $7,$25,$1
+ sw $3,4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $13,$13 # mul_add_c(a[1],b[1],c3,c1,c2);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ mflo $24
+ mfhi $25
+ addu $7,$24
+ sltu $1,$7,$24
+ multu $12,$15 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,2*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $7,$25,$0
+ sll $25,1
+ multu $13,$14 # mul_add_c2(a[1],b[2],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ mflo $24
+ mfhi $25
+ slt $1,$25,$0
+ addu $7,$1
+ multu $15,$13 # mul_add_c2(a[3],b[1],c2,c3,c1);
+ sll $25,1
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sltu $1,$3,$25
+ addu $7,$1
+ sw $2,3*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $2,$25,$0
+ sll $25,1
+ multu $14,$14 # mul_add_c(a[2],b[2],c2,c3,c1);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $3,$24
+ sltu $1,$3,$24
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ mflo $24
+ mfhi $25
+ addu $3,$24
+ sltu $1,$3,$24
+ multu $14,$15 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ addu $25,$1
+ addu $7,$25
+ sltu $1,$7,$25
+ addu $2,$1
+ sw $3,4*4($4)
+
+ mflo $24
+ mfhi $25
+ slt $3,$25,$0
+ sll $25,1
+ multu $15,$15 # mul_add_c(a[3],b[3],c1,c2,c3);
+ slt $6,$24,$0
+ addu $25,$6
+ sll $24,1
+ addu $7,$24
+ sltu $1,$7,$24
+ addu $25,$1
+ addu $2,$25
+ sltu $1,$2,$25
+ addu $3,$1
+ sw $7,5*4($4)
+
+ mflo $24
+ mfhi $25
+ addu $2,$24
+ sltu $1,$2,$24
+ addu $25,$1
+ addu $3,$25
+ sw $2,6*4($4)
+ sw $3,7*4($4)
+
+ .set noreorder
+ jr $31
+ nop
+.end bn_sqr_comba4
diff --git a/main/openssl/crypto/bn/asm/co-586.S b/main/openssl/crypto/bn/asm/co-586.S
new file mode 100644
index 00000000..3cb80735
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/co-586.S
@@ -0,0 +1,1254 @@
+.file "crypto/bn/asm/co-586.s"
+.text
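+/* Comba multiplication/squaring for x86: %esi and %edi walk a[] and b[]
+   (in the squaring entries %esi is a[] and %edi the result), while
+   %ebx/%ecx/%ebp serve as the rotating three-word column accumulator.
+   Each mull leaves its 32x32->64 product in %edx:%eax, folded in with
+   addl/adcl; in the mul entries 20(%esp) re-fetches the r pointer after
+   the four register pushes. */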
+.globl bn_mul_comba8
+.type bn_mul_comba8,@function
+.align 16
+bn_mul_comba8:
+.L_bn_mul_comba8_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 16(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 24(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,28(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,36(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,44(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,52(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%eax)
+
+
+ movl %ebx,60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba8,.-.L_bn_mul_comba8_begin
+.globl bn_mul_comba4
+.type bn_mul_comba4,@function
+.align 16
+bn_mul_comba4:
+.L_bn_mul_comba4_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+
+
+ movl %ecx,28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba4,.-.L_bn_mul_comba4_begin
+.globl bn_sqr_comba8
+.type bn_sqr_comba8,@function
+.align 16
+bn_sqr_comba8:
+.L_bn_sqr_comba8_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 12(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl (%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%eax
+ adcl $0,%ebx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,28(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 20(%esi),%eax
+ adcl $0,%ecx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 8(%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 28(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,36(%edi)
+ movl 12(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 20(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 28(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,44(%edi)
+ movl 20(%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,52(%edi)
+
+
+ xorl %ecx,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%edi)
+
+ movl %ebx,60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba8,.-.L_bn_sqr_comba8_begin
+.globl bn_sqr_comba4
+.type bn_sqr_comba4,@function
+.align 16
+bn_sqr_comba4:
+.L_bn_sqr_comba4_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+
+ movl %ecx,28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin
diff --git a/main/openssl/crypto/bn/asm/ia64-mont.pl b/main/openssl/crypto/bn/asm/ia64-mont.pl
new file mode 100644
index 00000000..e2586584
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+# stalls after ldf8, xma and getf.sig outside inner loop and
+# improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+# once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+# acute interest, because upcoming Tukwila's individual cores are
+# reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in a >100% improvement of the 512-bit RSA sign benchmark and a 50%
+# improvement of the 1024-bit one [in comparison to the original
+# version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+# sign verify sign/s verify/s
+# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
+# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
+# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
+# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
+# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
+# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
+# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
+#
+# ... and *without* (but still with ia64.S):
+#
+# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
+# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
+# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
+# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
+# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
+# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
+# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
+#
+# As can be seen, RSA sign performance improves by 130-30% (the gain
+# shrinks as keys get longer), while verify improves by 74-13%.
+# DSA performance improves by 115-30%.
+
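+# HP-UX ILP32 keeps pointers 32-bit, so pointer arithmetic must use
+# addp4; under the 64-bit ABIs (+DD64/-mlp64 on the command line) and
+# on every other OS a plain add is used instead.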
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+// const BN_ULONG *bp,const BN_ULONG *np,
+// const BN_ULONG *n0p,int num);
+.align 64
+.global bn_mul_mont#
+.proc bn_mul_mont#
+bn_mul_mont:
+ .prologue
+ .body
+{ .mmi; cmp4.le p6,p7=2,r37;;
+(p6) cmp4.lt.unc p8,p9=8,r37
+ mov ret0=r0 };;
+{ .bbb;
+(p9) br.cond.dptk.many bn_mul_mont_8
+(p8) br.cond.dpnt.many bn_mul_mont_general
+(p7) br.ret.spnt.many b0 };;
+.endp bn_mul_mont#
+
+prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
+
+rptr=r8; aptr=r9; bptr=r14; nptr=r15;
+tptr=r16; // &tp[0]
+tp_1=r17; // &tp[-1]
+num=r18; len=r19; lc=r20;
+topbit=r21; // carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.align 64
+.local bn_mul_mont_general#
+.proc bn_mul_mont_general#
+bn_mul_mont_general:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ $ADDP aptr=0,in1
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; .vframe prevsp
+ mov prevsp=sp
+ $ADDP bptr=0,in2
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotf alo[6],nlo[4],ahi[8],nhi[6]
+ .rotr a[3],n[3],t[2]
+
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 alo[4]=[aptr],16 // ap[0]
+ $ADDP r30=8,in1 };;
+{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
+ ldf8 alo[2]=[aptr],16 // ap[2]
+ $ADDP in4=0,in4 };;
+{ .mmi; ldf8 alo[1]=[r30] // ap[3]
+ ldf8 n0=[in4] // n0
+ $ADDP rptr=0,in0 }
+{ .mmi; $ADDP nptr=0,in3
+ mov r31=16
+ zxt4 num=in5 };;
+{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
+ shladd len=num,3,r0
+ shladd r31=num,3,r31 };;
+{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
+ add lc=-5,num
+ sub r31=sp,r31 };;
+{ .mfb; and sp=-16,r31 // alloca
+ xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
+ nop.b 0 }
+{ .mfb; nop.m 0
+ xmpy.lu alo[4]=alo[4],bi
+ brp.loop.imp .L1st_ctop,.L1st_cend-16
+ };;
+{ .mfi; nop.m 0
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
+ add tp_1=8,sp }
+{ .mfi; nop.m 0
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20001f<<16
+ // ------^----- (p40) at first (p23)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; nop.m 0
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
+ mov ar.lc=lc }
+{ .mfi; nop.m 0
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+.align 32
+.L1st_ctop:
+.pred.rel "mutex",p40,p42
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23) }
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p23) st8 [tp_1]=n[2],8
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mmb; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ br.ctop.sptk .L1st_ctop };;
+.L1st_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ add num=-1,num };; // num--
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ sub aptr=aptr,len };; // rewind
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+ sub nptr=nptr,len };;
+{ .mmi; .pred.rel "mutex",p39,p41
+(p39) add topbit=r0,r0
+(p41) add topbit=r0,r0,1
+ nop.i 0 }
+{ .mmi; st8 [tp_1]=n[0]
+ add tptr=16,sp
+ add tp_1=8,sp };;
+
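+// Outer loop: each pass folds the next word of bp into the running
+// (num+1)-word tp[] vector, tp = (tp + ap[]*bp[i] + m0*np[]) / 2^64,
+// where m0 = (tp[0]+ap[0]*bp[i])*n0 is chosen so the low word cancels.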
+.Louter:
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 ahi[3]=[tptr] // tp[0]
+ add r30=8,aptr };;
+{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
+ ldf8 alo[3]=[r30],16 // ap[1]
+ add r31=8,nptr };;
+{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
+ xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
+ brp.loop.imp .Linner_ctop,.Linner_cend-16
+ }
+{ .mfb; ldf8 alo[1]=[r30] // ap[3]
+ xma.lu alo[4]=alo[4],bi,ahi[3]
+ clrrrb.pr };;
+{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
+ nop.i 0 }
+{ .mfi; ldf8 nlo[1]=[r31] // np[1]
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20101f<<16
+ // ------^----- (p40) at first (p23)
+ // --------^--- (p30) at first (p22)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
+ mov ar.lc=lc }
+{ .mfi;
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). The factor of
+// 7 in the latter case accounts for a two-tick pipeline stall, which
+// means that its performance would be ~20% lower than optimal. No
+// attempt was made to address this, because the original Itanium is
+// hardly represented in the wild anymore...
+.align 32
+.Linner_ctop:
+.pred.rel "mutex",p40,p42
+.pred.rel "mutex",p30,p32
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23)
+{ .mfi; (p16) nop.m 0
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p16) nop.f 0
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p21) ld8 t[0]=[tptr],8
+ (p16) nop.f 0
+ (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p30) add a[1]=a[1],t[1] } // (p22)
+{ .mfi; (p16) nop.m 0
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p32) add a[1]=a[1],t[1],1 };; // (p22)
+{ .mmi; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
+{ .mmb; (p23) st8 [tp_1]=n[2],8
+ (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
+ br.ctop.sptk .Linner_ctop };;
+.Linner_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ nop.i 0 };;
+
+{ .mmi; .pred.rel "mutex",p31,p33
+(p31) add a[0]=a[0],topbit
+(p33) add a[0]=a[0],topbit,1
+ mov topbit=r0 };;
+{ .mfi; .pred.rel "mutex",p31,p33
+(p31) cmp.ltu p32,p30=a[0],topbit
+(p33) cmp.leu p32,p30=a[0],topbit
+ }
+{ .mfi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ };;
+{ .mmi; .pred.rel "mutex",p44,p46
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+(p32) add topbit=r0,r0,1 }
+
+{ .mmi; st8 [tp_1]=n[0],8
+ cmp4.ne p6,p0=1,num
+ sub aptr=aptr,len };; // rewind
+{ .mmi; sub nptr=nptr,len
+(p41) add topbit=r0,r0,1
+ add tptr=16,sp }
+{ .mmb; add tp_1=8,sp
+ add num=-1,num // num--
+(p6) br.cond.sptk.many .Louter };;
+
+{ .mbb; add lc=4,lc
+ brp.loop.imp .Lsub_ctop,.Lsub_cend-16
+ clrrrb.pr };;
+{ .mii; nop.m 0
+ mov pr.rot=0x10001<<16
+ // ------^---- (p33) at first (p17)
+ mov ar.lc=lc }
+{ .mii; nop.m 0
+ mov ar.ec=3
+ nop.i 0 };;
+
+.Lsub_ctop:
+.pred.rel "mutex",p33,p35
+{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
+ (p16) nop.f 0
+ (p33) sub n[1]=t[1],n[1] } // (p17)
+{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
+ (p16) nop.f 0
+ (p35) sub n[1]=t[1],n[1],1 };; // (p17)
+{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
+ (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
+ (p18) nop.b 0 }
+{ .mib; (p18) nop.m 0
+ (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
+ br.ctop.sptk .Lsub_ctop };;
+.Lsub_cend:
+
+{ .mmb; .pred.rel "mutex",p34,p36
+(p34) sub topbit=topbit,r0 // (p19)
+(p36) sub topbit=topbit,r0,1
+ brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
+ }
+{ .mmb; sub rptr=rptr,len // rewind
+ sub tptr=tptr,len
+ clrrrb.pr };;
+{ .mmi; and aptr=tptr,topbit
+ andcm bptr=rptr,topbit
+ mov pr.rot=1<<16 };;
+{ .mii; or nptr=aptr,bptr
+ mov ar.lc=lc
+ mov ar.ec=3 };;
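+// (note added for clarity: topbit is now either 0 or all ones, so the
+// and/andcm/or above select tp as the copy source when the subtraction
+// borrowed and rp otherwise; .Lcopy then writes the chosen vector to
+// rp while clearing tp)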
+
+.Lcopy_ctop:
+{ .mmb; (p16) ld8 n[0]=[nptr],8
+ (p18) st8 [tptr]=r0,8
+ (p16) nop.b 0 }
+{ .mmb; (p16) nop.m 0
+ (p18) st8 [rptr]=n[2],8
+ br.ctop.sptk .Lcopy_ctop };;
+.Lcopy_cend:
+
+{ .mmi; mov ret0=1 // signal "handled"
+ rum 1<<5 // clear um.mfh
+ mov ar.lc=prevlc }
+{ .mib; .restore sp
+ mov sp=prevsp
+ mov pr=prevpr,0x1ffff
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_general#
+
+a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
+n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
+t0=r15;
+
+ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.align 64
+.skip 48 // aligns loop body
+.local bn_mul_mont_8#
+.proc bn_mul_mont_8#
+bn_mul_mont_8:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ .vframe prevsp
+ mov prevsp=sp
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; add r17=-6*16,sp
+ add sp=-7*16,sp
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+{ .mmi; .save.gf 0,0x10
+ stf.spill [sp]=f16,-16
+ .save.gf 0,0x20
+ stf.spill [r17]=f17,32
+ add r16=-5*16,prevsp};;
+{ .mmi; .save.gf 0,0x40
+ stf.spill [r16]=f18,32
+ .save.gf 0,0x80
+ stf.spill [r17]=f19,32
+ $ADDP aptr=0,in1 };;
+{ .mmi; .save.gf 0,0x100
+ stf.spill [r16]=f20,32
+ .save.gf 0,0x200
+ stf.spill [r17]=f21,32
+ $ADDP r29=8,in1 };;
+{ .mmi; .save.gf 0,0x400
+ stf.spill [r16]=f22
+ .save.gf 0,0x800
+ stf.spill [r17]=f23
+ $ADDP rptr=0,in0 };;
+
+ .body
+ .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+ .rotr t[8]
+
+// load input vectors padding them to 8 elements
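+// (note added for clarity: when num<8, the predicated fcvt.fxu xx=f0
+// instructions below substitute zeros for the missing words, so the
+// fixed 8-word pipeline can run unconditionally)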
+{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
+ ldf8 ai1=[r29],16 // ap[1]
+ $ADDP bptr=0,in2 }
+{ .mmi; $ADDP r30=8,in2
+ $ADDP nptr=0,in3
+ $ADDP r31=8,in3 };;
+{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
+ ldf8 bj[6]=[r30],16 // bp[1]
+ cmp4.le p4,p5=3,in5 }
+{ .mmi; ldf8 ni0=[nptr],16 // np[0]
+ ldf8 ni1=[r31],16 // np[1]
+ cmp4.le p6,p7=4,in5 };;
+
+{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
+ (p5)fcvt.fxu ai2=f0
+ cmp4.le p8,p9=5,in5 }
+{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
+ (p7)fcvt.fxu ai3=f0
+ cmp4.le p10,p11=6,in5 }
+{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
+ (p5)fcvt.fxu bj[5]=f0
+ cmp4.le p12,p13=7,in5 }
+{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
+ (p7)fcvt.fxu bj[4]=f0
+ cmp4.le p14,p15=8,in5 }
+{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
+ (p5)fcvt.fxu ni2=f0
+ addp4 r28=-1,in5 }
+{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
+ (p7)fcvt.fxu ni3=f0
+ $ADDP in4=0,in4 };;
+
+{ .mfi; ldf8 n0=[in4]
+ fcvt.fxu tf[1]=f0
+ nop.i 0 }
+
+{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
+ (p9)fcvt.fxu ai4=f0
+ mov t[0]=r0 }
+{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
+ (p11)fcvt.fxu ai5=f0
+ mov t[1]=r0 }
+{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
+ (p9)fcvt.fxu bj[3]=f0
+ mov t[2]=r0 }
+{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
+ (p11)fcvt.fxu bj[2]=f0
+ mov t[3]=r0 }
+{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
+ (p9)fcvt.fxu ni4=f0
+ mov t[4]=r0 }
+{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
+ (p11)fcvt.fxu ni5=f0
+ mov t[5]=r0 };;
+
+{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
+ (p13)fcvt.fxu ai6=f0
+ mov t[6]=r0 }
+{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
+ (p15)fcvt.fxu ai7=f0
+ mov t[7]=r0 }
+{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
+ (p13)fcvt.fxu bj[1]=f0
+ mov ar.lc=r28 }
+{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
+ (p15)fcvt.fxu bj[0]=f0
+ mov ar.ec=1 }
+{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
+ (p13)fcvt.fxu ni6=f0
+ mov pr.rot=1<<16 }
+{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
+ (p15)fcvt.fxu ni7=f0
+ brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
+ };;
+
+// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
+// to measure with the help of the Interval Time Counter indicated that
+// the factor is a tad higher: 33 or 34, if not 35. Exact measurement
+// and addressing the issue are problematic, because I don't have
+// access to a platform-specific instruction-level profiler. On Itanium
+// it should run in 56*n ticks, because of the higher xma latency...
+.Louter_8_ctop:
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 0:
+ (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
+ (p40) add a3=a3,n3 } // (p17) a3+=n3
+{ .mfi; (p42) add a3=a3,n3,1
+ (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 4:
+ (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
+ (p41) add a4=a4,n4 } // (p17) a4+=n4
+{ .mfi; (p43) add a4=a4,n4,1
+ (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
+ (p16) nop.i 0 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p16) nop.m 0 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 8:
+ (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
+ (p40) add a5=a5,n5 } // (p17) a5+=n5
+{ .mfi; (p42) add a5=a5,n5,1
+ (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a1=alo[1] // 9:
+	(p48)	add	t[4]=t[4],a5		// (p17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mfi; (p16) nop.m 0 // 10:
+ (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
+ (p40) cmp.ltu p43,p41=a5,n5 }
+{ .mfi; (p42) cmp.leu p43,p41=a5,n5
+ (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
+ (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
+ (p41) add a6=a6,n6 } // (p17) a6+=n6
+{ .mfi; (p43) add a6=a6,n6,1
+ (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a2=alo[2] // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mfi; (p16) nop.m 0 // 14:
+ (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
+ (p41) cmp.ltu p42,p40=a6,n6 }
+{ .mfi; (p43) cmp.leu p42,p40=a6,n6
+ (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
+ (p16) nop.i 0 };;
+{ .mii; (p16) nop.m 0 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 16:
+ (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
+ (p40) add a7=a7,n7 } // (p17) a7+=n7
+{ .mfi; (p42) add a7=a7,n7,1
+ (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a3=alo[3] // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mfi; (p16) nop.m 0 // 18:
+ (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
+ (p40) cmp.ltu p43,p41=a7,n7 }
+{ .mfi; (p42) cmp.leu p43,p41=a7,n7
+ (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n1=nlo[1] // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 20:
+ (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
+ (p41) add a8=a8,n8 } // (p17) a8+=n8
+{ .mfi; (p43) add a8=a8,n8,1
+ (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a4=alo[4] // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 };;
+{ .mfi; (p16) nop.m 0 // 22:
+ (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
+ (p41) cmp.ltu p42,p40=a8,n8 }
+{ .mfi; (p43) cmp.leu p42,p40=a8,n8
+ (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n2=nlo[2] // 23:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 };;
+{ .mfi; (p16) nop.m 0 // 24:
+ (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
+ (p16) add a1=a1,n1 } // (p16) a1+=n1
+{ .mfi; (p16) nop.m 0
+ (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
+ (p17) mov t[0]=r0 };;
+{ .mii; (p16) getf.sig a5=alo[5] // 25:
+ (p16) add t0=t[7],a1 // (p16) t[7]+=a1
+ (p42) add t[0]=t[0],r0,1 };;
+{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
+ (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
+ (p50) add t[0]=t[0],r0,1 }
+{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
+ (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n3=nlo[3] // 27:
+ (p16) cmp.ltu.unc p50,p48=t0,a1
+ (p16) nop.i 0 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 28:
+ (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
+ (p40) add a2=a2,n2 } // (p16) a2+=n2
+{ .mfi; (p42) add a2=a2,n2,1
+ (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a6=alo[6] // 29:
+ (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
+ (p50) add t[6]=t[6],a2,1 };;
+{ .mfi; (p16) nop.m 0 // 30:
+ (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
+ (p40) cmp.ltu p41,p39=a2,n2 }
+{ .mfi; (p42) cmp.leu p41,p39=a2,n2
+ (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
+ (p16) nop.i 0 };;
+{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
+ (p16) nop.f 0
+ (p48) cmp.ltu p49,p47=t[6],a2 }
+{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
+ (p16) nop.f 0
+ br.ctop.sptk.many .Louter_8_ctop };;
+.Louter_8_cend:
+
+// The above loop has to execute one more time, without (p16), which is
+// replaced with a merged move of np[8] to the GPR bank.
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mmi; (p0) getf.sig n1=ni0 // 0:
+ (p40) add a3=a3,n3 // (p17) a3+=n3
+ (p42) add a3=a3,n3,1 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mmi; (p0) getf.sig n2=ni1 // 4:
+ (p41) add a4=a4,n4 // (p17) a4+=n4
+ (p43) add a4=a4,n4,1 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p0) nop.f 0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p0) getf.sig n3=ni2 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p0) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) getf.sig n4=ni3 // 8:
+ (p40) add a5=a5,n5 // (p17) a5+=n5
+ (p42) add a5=a5,n5,1 };;
+{ .mii; (p0) nop.m 0 // 9:
+	(p48)	add	t[4]=t[4],a5		// (p17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mii; (p0) nop.m 0 // 10:
+ (p40) cmp.ltu p43,p41=a5,n5
+ (p42) cmp.leu p43,p41=a5,n5 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p17) getf.sig n8=nhi[8] // 12:
+ (p41) add a6=a6,n6 // (p17) a6+=n6
+ (p43) add a6=a6,n6,1 };;
+{ .mii; (p0) getf.sig n5=ni4 // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mii; (p0) nop.m 0 // 14:
+ (p41) cmp.ltu p42,p40=a6,n6
+ (p43) cmp.leu p42,p40=a6,n6 };;
+{ .mii; (p0) getf.sig n6=ni5 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mii; (p0) nop.m 0 // 16:
+ (p40) add a7=a7,n7 // (p17) a7+=n7
+ (p42) add a7=a7,n7,1 };;
+{ .mii; (p0) nop.m 0 // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mii; (p0) nop.m 0 // 18:
+ (p40) cmp.ltu p43,p41=a7,n7
+ (p42) cmp.leu p43,p41=a7,n7 };;
+{ .mii; (p0) getf.sig n7=ni6 // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mii; (p0) nop.m 0 // 20:
+ (p41) add a8=a8,n8 // (p17) a8+=n8
+ (p43) add a8=a8,n8,1 };;
+{ .mmi; (p0) nop.m 0 // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 }
+{ .mmi; (p17) mov t[0]=r0
+ (p41) cmp.ltu p42,p40=a8,n8
+ (p43) cmp.leu p42,p40=a8,n8 };;
+{ .mmi; (p0) getf.sig n8=ni7 // 22:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 }
+{ .mmi; (p42) add t[0]=t[0],r0,1
+ (p0) add r16=-7*16,prevsp
+ (p0) add r17=-6*16,prevsp };;
+
+// subtract np[8] from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
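+// (note added for clarity: the extra carry bit is needed because the
+// reduced value is only guaranteed to be below 2*np, i.e. up to one
+// bit wider than the modulus, hence the trial subtraction below)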
+{ .mmi; (p50)add t[0]=t[0],r0,1
+ add r18=-5*16,prevsp
+ sub n1=t0,n1 };;
+{ .mmi; cmp.gtu p34,p32=n1,t0;;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n2=t[7],n2
+ (p34)sub n2=t[7],n2,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
+ (p34)cmp.geu p35,p33=n2,t[7];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n3=t[6],n3 }
+{ .mmi; (p35)sub n3=t[6],n3,1;;
+ (p33)cmp.gtu p34,p32=n3,t[6]
+ (p35)cmp.geu p34,p32=n3,t[6] };;
+ .pred.rel "mutex",p32,p34
+{ .mii; (p32)sub n4=t[5],n4
+ (p34)sub n4=t[5],n4,1;;
+ (p32)cmp.gtu p35,p33=n4,t[5] }
+{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n5=t[4],n5
+ (p35)sub n5=t[4],n5,1 };;
+{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
+ (p35)cmp.geu p34,p32=n5,t[4];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n6=t[3],n6 }
+{ .mmi; (p34)sub n6=t[3],n6,1;;
+ (p32)cmp.gtu p35,p33=n6,t[3]
+ (p34)cmp.geu p35,p33=n6,t[3] };;
+ .pred.rel "mutex",p33,p35
+{ .mii; (p33)sub n7=t[2],n7
+ (p35)sub n7=t[2],n7,1;;
+ (p33)cmp.gtu p34,p32=n7,t[2] }
+{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n8=t[1],n8
+ (p34)sub n8=t[1],n8,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
+ (p34)cmp.geu p35,p33=n8,t[1];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub a8=t[0],r0 }
+{ .mmi; (p35)sub a8=t[0],r0,1;;
+ (p33)cmp.gtu p34,p32=a8,t[0]
+ (p35)cmp.geu p34,p32=a8,t[0] };;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
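+// (note added for clarity: p32/p34 still hold the final borrow of the
+// subtraction above, so the predicated stores below write
+// tmp[num]-np[num] when it did not underflow and tmp[num] otherwise)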
+ .pred.rel "mutex",p32,p34
+{ .mmi; (p32)st8 [rptr]=n1,8
+ (p34)st8 [rptr]=t0,8
+ add r19=-4*16,prevsp};;
+{ .mmb; (p32)st8 [rptr]=n2,8
+ (p34)st8 [rptr]=t[7],8
+ (p5)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n3,8
+ (p34)st8 [rptr]=t[6],8
+ (p7)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n4,8
+ (p34)st8 [rptr]=t[5],8
+ (p9)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n5,8
+ (p34)st8 [rptr]=t[4],8
+ (p11)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n6,8
+ (p34)st8 [rptr]=t[3],8
+ (p13)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n7,8
+ (p34)st8 [rptr]=t[2],8
+ (p15)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n8,8
+ (p34)st8 [rptr]=t[1],8
+ nop.b 0 };;
+.Ldone: // epilogue
+{ .mmi; ldf.fill f16=[r16],64
+ ldf.fill f17=[r17],64
+ nop.i 0 }
+{ .mmi; ldf.fill f18=[r18],64
+ ldf.fill f19=[r19],64
+ mov pr=prevpr,0x1ffff };;
+{ .mmi; ldf.fill f20=[r16]
+ ldf.fill f21=[r17]
+ mov ar.lc=prevlc }
+{ .mmi; ldf.fill f22=[r18]
+ ldf.fill f23=[r19]
+ mov ret0=1 } // signal "handled"
+{ .mib; rum 1<<5
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_8#
+
+.type copyright#,\@object
+copyright:
+stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/ia64.S b/main/openssl/crypto/bn/asm/ia64.S
index 951abc53..c0cee821 100644
--- a/main/openssl/crypto/bn/asm/ia64.S
+++ b/main/openssl/crypto/bn/asm/ia64.S
@@ -568,7 +568,7 @@ bn_sqr_comba8:
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
-// clause in Itanium µ-architecture manual? Comments are welcomed and
+// clause in Itanium µ-architecture manual? Comments are welcomed and
// highly appreciated.
//
// On Itanium 2 it takes ~190 ticks. This is because of stalls on
diff --git a/main/openssl/crypto/bn/asm/mips-mont.S b/main/openssl/crypto/bn/asm/mips-mont.S
new file mode 100644
index 00000000..1b875a2a
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/mips-mont.S
@@ -0,0 +1,284 @@
+.text
+
+.set noat
+.set noreorder
+
+.align 5
+.globl bn_mul_mont
+.ent bn_mul_mont
+bn_mul_mont:
+ lw $8,16($29)
+ lw $9,20($29)
+ slt $1,$9,4
+ bnez $1,1f
+ li $2,0
+ slt $1,$9,17 # on in-order CPU
+ bnez $1,bn_mul_mont_internal
+ nop
+1: jr $31
+ li $4,0
+.end bn_mul_mont
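+# (reference note, added for clarity: the stub above returns 0, i.e.
+# "not handled", unless 4 <= num <= 16, so that very small and large
+# sizes fall back to the generic C implementation; the cutoff at 17
+# words is tuned for in-order cores, as the comment above says)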
+
+.align 5
+.ent bn_mul_mont_internal
+bn_mul_mont_internal:
+ .frame $30,14*4,$31
+ .mask 0x40000000|16711680,-4
+ sub $29,14*4
+ sw $30,(14-1)*4($29)
+ sw $23,(14-2)*4($29)
+ sw $22,(14-3)*4($29)
+ sw $21,(14-4)*4($29)
+ sw $20,(14-5)*4($29)
+ sw $19,(14-6)*4($29)
+ sw $18,(14-7)*4($29)
+ sw $17,(14-8)*4($29)
+ sw $16,(14-9)*4($29)
+ move $30,$29
+
+ .set reorder
+ lw $8,0($8)
+ lw $13,0($6) # bp[0]
+ lw $12,0($5) # ap[0]
+ lw $14,0($7) # np[0]
+
+ sub $29,2*4 # place for two extra words
+ sll $9,2
+ li $1,-4096
+ sub $29,$9
+ and $29,$1
+
+ multu $12,$13
+ lw $16,4($5)
+ lw $18,4($7)
+ mflo $10
+ mfhi $11
+ multu $10,$8
+ mflo $23
+
+ multu $16,$13
+ mflo $16
+ mfhi $17
+
+ multu $14,$23
+ mflo $24
+ mfhi $25
+ multu $18,$23
+ addu $24,$10
+ sltu $1,$24,$10
+ addu $25,$1
+ mflo $18
+ mfhi $19
+
+ move $15,$29
+ li $22,2*4
+.align 4
+.L1st:
+ .set noreorder
+ add $12,$5,$22
+ add $14,$7,$22
+ lw $12,($12)
+ lw $14,($14)
+
+ multu $12,$13
+ addu $10,$16,$11
+ addu $24,$18,$25
+ sltu $1,$10,$11
+ sltu $2,$24,$25
+ addu $11,$17,$1
+ addu $25,$19,$2
+ mflo $16
+ mfhi $17
+
+ addu $24,$10
+ sltu $1,$24,$10
+ multu $14,$23
+ addu $25,$1
+ addu $22,4
+ sw $24,($15)
+ sltu $2,$22,$9
+ mflo $18
+ mfhi $19
+
+ bnez $2,.L1st
+ add $15,4
+ .set reorder
+
+ addu $10,$16,$11
+ sltu $1,$10,$11
+ addu $11,$17,$1
+
+ addu $24,$18,$25
+ sltu $2,$24,$25
+ addu $25,$19,$2
+ addu $24,$10
+ sltu $1,$24,$10
+ addu $25,$1
+
+ sw $24,($15)
+
+ addu $25,$11
+ sltu $1,$25,$11
+ sw $25,4($15)
+ sw $1,2*4($15)
+
+ li $21,4
+.align 4
+.Louter:
+ add $13,$6,$21
+ lw $13,($13)
+ lw $12,($5)
+ lw $16,4($5)
+ lw $20,($29)
+
+ multu $12,$13
+ lw $14,($7)
+ lw $18,4($7)
+ mflo $10
+ mfhi $11
+ addu $10,$20
+ multu $10,$8
+ sltu $1,$10,$20
+ addu $11,$1
+ mflo $23
+
+ multu $16,$13
+ mflo $16
+ mfhi $17
+
+ multu $14,$23
+ mflo $24
+ mfhi $25
+
+ multu $18,$23
+ addu $24,$10
+ sltu $1,$24,$10
+ addu $25,$1
+ mflo $18
+ mfhi $19
+
+ move $15,$29
+ li $22,2*4
+ lw $20,4($15)
+.align 4
+.Linner:
+ .set noreorder
+ add $12,$5,$22
+ add $14,$7,$22
+ lw $12,($12)
+ lw $14,($14)
+
+ multu $12,$13
+ addu $10,$16,$11
+ addu $24,$18,$25
+ sltu $1,$10,$11
+ sltu $2,$24,$25
+ addu $11,$17,$1
+ addu $25,$19,$2
+ mflo $16
+ mfhi $17
+
+ addu $10,$20
+ addu $22,4
+ multu $14,$23
+ sltu $1,$10,$20
+ addu $24,$10
+ addu $11,$1
+ sltu $2,$24,$10
+ lw $20,2*4($15)
+ addu $25,$2
+ sltu $1,$22,$9
+ mflo $18
+ mfhi $19
+ sw $24,($15)
+ bnez $1,.Linner
+ add $15,4
+ .set reorder
+
+ addu $10,$16,$11
+ sltu $1,$10,$11
+ addu $11,$17,$1
+ addu $10,$20
+ sltu $2,$10,$20
+ addu $11,$2
+
+ lw $20,2*4($15)
+ addu $24,$18,$25
+ sltu $1,$24,$25
+ addu $25,$19,$1
+ addu $24,$10
+ sltu $2,$24,$10
+ addu $25,$2
+ sw $24,($15)
+
+ addu $24,$25,$11
+ sltu $25,$24,$11
+ addu $24,$20
+ sltu $1,$24,$20
+ addu $25,$1
+ sw $24,4($15)
+ sw $25,2*4($15)
+
+ addu $21,4
+ sltu $2,$21,$9
+ bnez $2,.Louter
+
+ .set noreorder
+ add $20,$29,$9 # &tp[num]
+ move $15,$29
+ move $5,$29
+ li $11,0 # clear borrow bit
+
+.align 4
+.Lsub: lw $10,($15)
+ lw $24,($7)
+ add $15,4
+ add $7,4
+ subu $24,$10,$24 # tp[i]-np[i]
+ sgtu $1,$24,$10
+ subu $10,$24,$11
+ sgtu $11,$10,$24
+ sw $10,($4)
+ or $11,$1
+ sltu $1,$15,$20
+ bnez $1,.Lsub
+ add $4,4
+
+ subu $11,$25,$11 # handle upmost overflow bit
+ move $15,$29
+ sub $4,$9 # restore rp
+ not $25,$11
+
+ and $5,$11,$29
+ and $6,$25,$4
+ or $5,$5,$6 # ap=borrow?tp:rp
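+				# (note added for clarity: $11 is 0 or
+				# all ones here -- the reduced value is
+				# below 2*np, so tp[num] minus the
+				# borrow is 0 or -1 -- which makes the
+				# and/or pair a branch-free select of
+				# the copy source)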
+
+.align 4
+.Lcopy: lw $12,($5)
+ add $5,4
+ sw $0,($15)
+ add $15,4
+ sltu $1,$15,$20
+ sw $12,($4)
+ bnez $1,.Lcopy
+ add $4,4
+
+ li $4,1
+ li $2,1
+
+ .set noreorder
+ move $29,$30
+ lw $30,(14-1)*4($29)
+ lw $23,(14-2)*4($29)
+ lw $22,(14-3)*4($29)
+ lw $21,(14-4)*4($29)
+ lw $20,(14-5)*4($29)
+ lw $19,(14-6)*4($29)
+ lw $18,(14-7)*4($29)
+ lw $17,(14-8)*4($29)
+ lw $16,(14-9)*4($29)
+ jr $31
+ add $29,14*4
+.end bn_mul_mont_internal
+.rdata
+.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro@openssl.org>"
diff --git a/main/openssl/crypto/bn/asm/mips-mont.pl b/main/openssl/crypto/bn/asm/mips-mont.pl
new file mode 100644
index 00000000..caae04ed
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/mips-mont.pl
@@ -0,0 +1,426 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module isn't of direct interest to OpenSSL, because it
+# doesn't provide better performance for longer keys, at least not on
+# in-order-execution cores. While 512-bit RSA sign operations can be
+# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
+# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from a
+# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
+# verify:-( All comparisons are against bn_mul_mont-free assembler.
+# The module might be of interest to embedded system developers, as
+# the code is smaller than 1KB, yet offers a >3x improvement on MIPS64
+# and a 30-75% improvement [less for longer keys] on MIPS32 over
+# compiler-generated code.
+
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $SZREG=4;
+}
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
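+# (illustrative invocation, added for clarity: the first argument is
+# the flavour, and the first filename-looking argument becomes the
+# output file, e.g. "perl mips-mont.pl o32 mips-mont.s")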
+
+if ($flavour =~ /64|n32/i) {
+ $LD="ld";
+ $ST="sd";
+ $MULTU="dmultu";
+ $ADDU="daddu";
+ $SUBU="dsubu";
+ $BNSZ=8;
+} else {
+ $LD="lw";
+ $ST="sw";
+ $MULTU="multu";
+ $ADDU="addu";
+ $SUBU="subu";
+ $BNSZ=4;
+}
+
+# int bn_mul_mont(
+$rp=$a0; # BN_ULONG *rp,
+$ap=$a1; # const BN_ULONG *ap,
+$bp=$a2; # const BN_ULONG *bp,
+$np=$a3; # const BN_ULONG *np,
+$n0=$a4; # const BN_ULONG *n0,
+$num=$a5; # int num);
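+
+# For reference, here is a minimal Perl model of the word-level CIOS
+# Montgomery multiplication the assembler below implements (an
+# illustrative sketch added for clarity, not part of the original
+# module). It returns ap*bp*R^-1 mod np with R=2^($w*$num), assuming
+# ap and bp are already reduced mod np, $n0 is the precomputed
+# -np->[0]^-1 mod 2^$w, and a 64-bit perl with $w<=31 so that word
+# products fit in a native integer.
+sub mont_mul_ref {
+	my ($ap,$bp,$np,$n0,$num,$w)=@_;	# little-endian word arrays
+	my $mask=(1<<$w)-1;
+	my @t=(0)x($num+2);			# tp[] plus two extra words
+	for (my $i=0; $i<$num; $i++) {
+		my $c=0;			# t += ap*bp[i]
+		for (my $j=0; $j<$num; $j++) {
+			my $v=$t[$j]+$ap->[$j]*$bp->[$i]+$c;
+			$t[$j]=$v&$mask; $c=$v>>$w;
+		}
+		my $v=$t[$num]+$c;
+		$t[$num]=$v&$mask; $t[$num+1]=$v>>$w;
+		my $m=($t[0]*$n0)&$mask;	# Montgomery factor
+		$c=($t[0]+$m*$np->[0])>>$w;	# low word cancels to 0
+		for (my $j=1; $j<$num; $j++) {	# t=(t+m*np)/2^$w
+			$v=$t[$j]+$m*$np->[$j]+$c;
+			$t[$j-1]=$v&$mask; $c=$v>>$w;
+		}
+		$v=$t[$num]+$c;
+		$t[$num-1]=$v&$mask;
+		$t[$num]=$t[$num+1]+($v>>$w); $t[$num+1]=0;
+	}
+	my ($b,@r)=(0);				# final t-=np if t>=np
+	for (my $j=0; $j<$num; $j++) {
+		my $v=$t[$j]-$np->[$j]-$b;
+		$b=$v<0?1:0; push(@r,$v&$mask);
+	}
+	return $t[$num]>=$b ? \@r : [@t[0..$num-1]];
+}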
+
+$lo0=$a6;
+$hi0=$a7;
+$lo1=$t1;
+$hi1=$t2;
+$aj=$s0;
+$bi=$s1;
+$nj=$s2;
+$tp=$s3;
+$alo=$s4;
+$ahi=$s5;
+$nlo=$s6;
+$nhi=$s7;
+$tj=$s8;
+$i=$s9;
+$j=$s10;
+$m1=$s11;
+
+$FRAMESIZE=14;
+
+$code=<<___;
+.text
+
+.set noat
+.set noreorder
+
+.align 5
+.globl bn_mul_mont
+.ent bn_mul_mont
+bn_mul_mont:
+___
+$code.=<<___ if ($flavour =~ /o32/i);
+ lw $n0,16($sp)
+ lw $num,20($sp)
+___
+$code.=<<___;
+ slt $at,$num,4
+ bnez $at,1f
+ li $t0,0
+ slt $at,$num,17 # on in-order CPU
+ bnez $at,bn_mul_mont_internal
+ nop
+1: jr $ra
+ li $a0,0
+.end bn_mul_mont
+
+.align 5
+.ent bn_mul_mont_internal
+bn_mul_mont_internal:
+ .frame $fp,$FRAMESIZE*$SZREG,$ra
+ .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
+ $PTR_SUB $sp,$FRAMESIZE*$SZREG
+ $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
+ $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+ move $fp,$sp
+
+ .set reorder
+ $LD $n0,0($n0)
+ $LD $bi,0($bp) # bp[0]
+ $LD $aj,0($ap) # ap[0]
+ $LD $nj,0($np) # np[0]
+
+ $PTR_SUB $sp,2*$BNSZ # place for two extra words
+ sll $num,`log($BNSZ)/log(2)`
+ li $at,-4096
+ $PTR_SUB $sp,$num
+ and $sp,$at
+
+ $MULTU $aj,$bi
+ $LD $alo,$BNSZ($ap)
+ $LD $nlo,$BNSZ($np)
+ mflo $lo0
+ mfhi $hi0
+ $MULTU $lo0,$n0
+ mflo $m1
+
+ $MULTU $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ $MULTU $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+ $MULTU $nlo,$m1
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,$sp
+ li $j,2*$BNSZ
+.align 4
+.L1st:
+ .set noreorder
+ $PTR_ADD $aj,$ap,$j
+ $PTR_ADD $nj,$np,$j
+ $LD $aj,($aj)
+ $LD $nj,($nj)
+
+ $MULTU $aj,$bi
+ $ADDU $lo0,$alo,$hi0
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo0,$hi0
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi0,$ahi,$at
+ $ADDU $hi1,$nhi,$t0
+ mflo $alo
+ mfhi $ahi
+
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $MULTU $nj,$m1
+ $ADDU $hi1,$at
+ addu $j,$BNSZ
+ $ST $lo1,($tp)
+ sltu $t0,$j,$num
+ mflo $nlo
+ mfhi $nhi
+
+ bnez $t0,.L1st
+ $PTR_ADD $tp,$BNSZ
+ .set reorder
+
+ $ADDU $lo0,$alo,$hi0
+ sltu $at,$lo0,$hi0
+ $ADDU $hi0,$ahi,$at
+
+ $ADDU $lo1,$nlo,$hi1
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi1,$nhi,$t0
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+
+ $ST $lo1,($tp)
+
+ $ADDU $hi1,$hi0
+ sltu $at,$hi1,$hi0
+ $ST $hi1,$BNSZ($tp)
+ $ST $at,2*$BNSZ($tp)
+
+ li $i,$BNSZ
+.align 4
+.Louter:
+ $PTR_ADD $bi,$bp,$i
+ $LD $bi,($bi)
+ $LD $aj,($ap)
+ $LD $alo,$BNSZ($ap)
+ $LD $tj,($sp)
+
+ $MULTU $aj,$bi
+ $LD $nj,($np)
+ $LD $nlo,$BNSZ($np)
+ mflo $lo0
+ mfhi $hi0
+ $ADDU $lo0,$tj
+ $MULTU $lo0,$n0
+ sltu $at,$lo0,$tj
+ $ADDU $hi0,$at
+ mflo $m1
+
+ $MULTU $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ $MULTU $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+
+ $MULTU $nlo,$m1
+ $ADDU $lo1,$lo0
+ sltu $at,$lo1,$lo0
+ $ADDU $hi1,$at
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,$sp
+ li $j,2*$BNSZ
+ $LD $tj,$BNSZ($tp)
+.align 4
+.Linner:
+ .set noreorder
+ $PTR_ADD $aj,$ap,$j
+ $PTR_ADD $nj,$np,$j
+ $LD $aj,($aj)
+ $LD $nj,($nj)
+
+ $MULTU $aj,$bi
+ $ADDU $lo0,$alo,$hi0
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo0,$hi0
+ sltu $t0,$lo1,$hi1
+ $ADDU $hi0,$ahi,$at
+ $ADDU $hi1,$nhi,$t0
+ mflo $alo
+ mfhi $ahi
+
+ $ADDU $lo0,$tj
+ addu $j,$BNSZ
+ $MULTU $nj,$m1
+ sltu $at,$lo0,$tj
+ $ADDU $lo1,$lo0
+ $ADDU $hi0,$at
+ sltu $t0,$lo1,$lo0
+ $LD $tj,2*$BNSZ($tp)
+ $ADDU $hi1,$t0
+ sltu $at,$j,$num
+ mflo $nlo
+ mfhi $nhi
+ $ST $lo1,($tp)
+ bnez $at,.Linner
+ $PTR_ADD $tp,$BNSZ
+ .set reorder
+
+ $ADDU $lo0,$alo,$hi0
+ sltu $at,$lo0,$hi0
+ $ADDU $hi0,$ahi,$at
+ $ADDU $lo0,$tj
+ sltu $t0,$lo0,$tj
+ $ADDU $hi0,$t0
+
+ $LD $tj,2*$BNSZ($tp)
+ $ADDU $lo1,$nlo,$hi1
+ sltu $at,$lo1,$hi1
+ $ADDU $hi1,$nhi,$at
+ $ADDU $lo1,$lo0
+ sltu $t0,$lo1,$lo0
+ $ADDU $hi1,$t0
+ $ST $lo1,($tp)
+
+ $ADDU $lo1,$hi1,$hi0
+ sltu $hi1,$lo1,$hi0
+ $ADDU $lo1,$tj
+ sltu $at,$lo1,$tj
+ $ADDU $hi1,$at
+ $ST $lo1,$BNSZ($tp)
+ $ST $hi1,2*$BNSZ($tp)
+
+ addu $i,$BNSZ
+ sltu $t0,$i,$num
+ bnez $t0,.Louter
+
+ .set noreorder
+ $PTR_ADD $tj,$sp,$num # &tp[num]
+ move $tp,$sp
+ move $ap,$sp
+ li $hi0,0 # clear borrow bit
+
+.align 4
+.Lsub: $LD $lo0,($tp)
+ $LD $lo1,($np)
+ $PTR_ADD $tp,$BNSZ
+ $PTR_ADD $np,$BNSZ
+ $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
+ sgtu $at,$lo1,$lo0
+ $SUBU $lo0,$lo1,$hi0
+ sgtu $hi0,$lo0,$lo1
+ $ST $lo0,($rp)
+ or $hi0,$at
+ sltu $at,$tp,$tj
+ bnez $at,.Lsub
+ $PTR_ADD $rp,$BNSZ
+
+ $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
+ move $tp,$sp
+ $PTR_SUB $rp,$num # restore rp
+ not $hi1,$hi0
+
+ and $ap,$hi0,$sp
+ and $bp,$hi1,$rp
+ or $ap,$ap,$bp # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: $LD $aj,($ap)
+ $PTR_ADD $ap,$BNSZ
+ $ST $zero,($tp)
+ $PTR_ADD $tp,$BNSZ
+ sltu $at,$tp,$tj
+ $ST $aj,($rp)
+ bnez $at,.Lcopy
+ $PTR_ADD $rp,$BNSZ
+
+ li $a0,1
+ li $t0,1
+
+ .set noreorder
+ move $sp,$fp
+ $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
+ $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end bn_mul_mont_internal
+.rdata
+.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/mips.pl b/main/openssl/crypto/bn/asm/mips.pl
new file mode 100644
index 00000000..d2f3ef7b
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/mips.pl
@@ -0,0 +1,2583 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project.
+#
+# Rights for redistribution and usage in source and binary forms are
+# granted according to the OpenSSL license. Warranty of any kind is
+# disclaimed.
+# ====================================================================
+
+
+# July 1999
+#
+# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
+#
+# The module is designed to work with either of the "new" MIPS ABIs(5),
+# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
+# IRIX 5.x, not only because it doesn't support the new ABIs, but also
+# because 5.x kernels put the R4x00 CPU into 32-bit mode, so all those
+# 64-bit instructions (daddu, dmultu, etc.) found below would only
+# cause an illegal instruction exception:-(
+#
+# In addition, the code depends on preprocessor flags set up by the
+# MIPSpro compiler driver (either as or cc) and therefore (probably?)
+# can't be compiled by the GNU assembler. The GNU C driver manages
+# fine, though... I mean, as long as -mmips-as is specified or is the
+# default option, because then it simply invokes /usr/bin/as, which in
+# turn takes perfect care of the preprocessor definitions. Another neat
+# feature offered by the MIPSpro assembler is an optimization pass.
+# This gave me the opportunity to have the code look more regular, as
+# all those architecture-dependent instruction rescheduling details
+# were left to the assembler. Cool, huh?
+#
+# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
+# goes way over 3 times faster!
+#
+# <appro@fy.chalmers.se>
+
+# October 2010
+#
+# Adapt the module even for 32-bit ABIs and other OSes. The former was
+# achieved by mechanically replacing 64-bit arithmetic instructions
+# such as dmultu, daddu, etc. with their 32-bit counterparts and
+# adjusting offsets denoting multiples of BN_ULONG. The above-mentioned
+# >3x performance improvement naturally does not apply to 32-bit code
+# [because there is no instruction a 32-bit compiler can't use]; one
+# has to be content with a 40-85% improvement depending on benchmark
+# and key length, more for longer keys.
+
+$flavour = shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+if ($flavour =~ /64|n32/i) {
+ $LD="ld";
+ $ST="sd";
+ $MULTU="dmultu";
+ $DIVU="ddivu";
+ $ADDU="daddu";
+ $SUBU="dsubu";
+ $SRL="dsrl";
+ $SLL="dsll";
+ $BNSZ=8;
+ $PTR_ADD="daddu";
+ $PTR_SUB="dsubu";
+ $SZREG=8;
+ $REG_S="sd";
+ $REG_L="ld";
+} else {
+ $LD="lw";
+ $ST="sw";
+ $MULTU="multu";
+ $DIVU="divu";
+ $ADDU="addu";
+ $SUBU="subu";
+ $SRL="srl";
+ $SLL="sll";
+ $BNSZ=4;
+ $PTR_ADD="addu";
+ $PTR_SUB="subu";
+ $SZREG=4;
+ $REG_S="sw";
+ $REG_L="lw";
+ $code=".set mips2\n";
+}
+
+# Below is N32/64 register layout used in the original module.
+#
+($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
+#
+# No special adaptation is required for O32. NUBI on the other hand
+# is treated by saving/restoring ($v1,$t0..$t3).
+
+$gp=$v1 if ($flavour =~ /nubi/i);
+
+$minus4=$v1;
+
+$code.=<<___;
+.rdata
+.asciiz "mips3.s, Version 1.2"
+.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
+
+.text
+.set noat
+
+.align 5
+.globl bn_mul_add_words
+.ent bn_mul_add_words
+bn_mul_add_words:
+ .set noreorder
+ bgtz $a2,bn_mul_add_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_mul_add_words
+
+.align 5
+.ent bn_mul_add_words_internal
+bn_mul_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t1,0($a0)
+ $LD $t2,$BNSZ($a1)
+ $LD $t3,$BNSZ($a0)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta1,2*$BNSZ($a0)
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
+ # values", but it seems to work fine
+ # even on 64-bit registers.
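+				# (Reference note, added for clarity:
+				# this is the classic carry idiom --
+				# after "ADDU t1,v0" the carry out is
+				# exactly (t1 < v0) as an unsigned
+				# compare, which is what sltu yields.)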
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ $MULTU $t2,$a3
+ sltu $at,$t1,$at
+ $ST $t1,0($a0)
+ $ADDU $v0,$at
+
+ $LD $ta2,3*$BNSZ($a1)
+ $LD $ta3,3*$BNSZ($a0)
+ $ADDU $t3,$v0
+ sltu $v0,$t3,$v0
+ mflo $at
+ mfhi $t2
+ $ADDU $t3,$at
+ $ADDU $v0,$t2
+ $MULTU $ta0,$a3
+ sltu $at,$t3,$at
+ $ST $t3,$BNSZ($a0)
+ $ADDU $v0,$at
+
+ subu $a2,4
+ $PTR_ADD $a0,4*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ $ADDU $ta1,$v0
+ sltu $v0,$ta1,$v0
+ mflo $at
+ mfhi $ta0
+ $ADDU $ta1,$at
+ $ADDU $v0,$ta0
+ $MULTU $ta2,$a3
+ sltu $at,$ta1,$at
+ $ST $ta1,-2*$BNSZ($a0)
+ $ADDU $v0,$at
+
+
+ and $ta0,$a2,$minus4
+ $ADDU $ta3,$v0
+ sltu $v0,$ta3,$v0
+ mflo $at
+ mfhi $ta2
+ $ADDU $ta3,$at
+ $ADDU $v0,$ta2
+ sltu $at,$ta3,$at
+ $ST $ta3,-$BNSZ($a0)
+ .set noreorder
+ bgtz $ta0,.L_bn_mul_add_words_loop
+ $ADDU $v0,$at
+
+ beqz $a2,.L_bn_mul_add_words_return
+ nop
+
+.L_bn_mul_add_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t1,0($a0)
+ subu $a2,1
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,0($a0)
+ $ADDU $v0,$at
+ beqz $a2,.L_bn_mul_add_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$a3
+ $LD $t1,$BNSZ($a0)
+ subu $a2,1
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$at
+ beqz $a2,.L_bn_mul_add_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$a3
+ $LD $t1,2*$BNSZ($a0)
+ $ADDU $t1,$v0
+ sltu $v0,$t1,$v0
+ mflo $at
+ mfhi $t0
+ $ADDU $t1,$at
+ $ADDU $v0,$t0
+ sltu $at,$t1,$at
+ $ST $t1,2*$BNSZ($a0)
+ $ADDU $v0,$at
+
+.L_bn_mul_add_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_mul_add_words_internal
+
+.align 5
+.globl bn_mul_words
+.ent bn_mul_words
+bn_mul_words:
+ .set noreorder
+ bgtz $a2,bn_mul_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_mul_words
+
+.align 5
+.ent bn_mul_words_internal
+bn_mul_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ $LD $t2,$BNSZ($a1)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta2,3*$BNSZ($a1)
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $MULTU $t2,$a3
+ $ST $v0,0($a0)
+ $ADDU $v0,$t1,$t0
+
+ subu $a2,4
+ $PTR_ADD $a0,4*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ mflo $at
+ mfhi $t2
+ $ADDU $v0,$at
+ sltu $t3,$v0,$at
+ $MULTU $ta0,$a3
+ $ST $v0,-3*$BNSZ($a0)
+ $ADDU $v0,$t3,$t2
+
+ mflo $at
+ mfhi $ta0
+ $ADDU $v0,$at
+ sltu $ta1,$v0,$at
+ $MULTU $ta2,$a3
+ $ST $v0,-2*$BNSZ($a0)
+ $ADDU $v0,$ta1,$ta0
+
+ and $ta0,$a2,$minus4
+ mflo $at
+ mfhi $ta2
+ $ADDU $v0,$at
+ sltu $ta3,$v0,$at
+ $ST $v0,-$BNSZ($a0)
+ .set noreorder
+ bgtz $ta0,.L_bn_mul_words_loop
+ $ADDU $v0,$ta3,$ta2
+
+ beqz $a2,.L_bn_mul_words_return
+ nop
+
+.L_bn_mul_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$a3
+ subu $a2,1
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,0($a0)
+ $ADDU $v0,$t1,$t0
+ beqz $a2,.L_bn_mul_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$a3
+ subu $a2,1
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,$BNSZ($a0)
+ $ADDU $v0,$t1,$t0
+ beqz $a2,.L_bn_mul_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$a3
+ mflo $at
+ mfhi $t0
+ $ADDU $v0,$at
+ sltu $t1,$v0,$at
+ $ST $v0,2*$BNSZ($a0)
+ $ADDU $v0,$t1,$t0
+
+.L_bn_mul_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_mul_words_internal
+
+.align 5
+.globl bn_sqr_words
+.ent bn_sqr_words
+bn_sqr_words:
+ .set noreorder
+ bgtz $a2,bn_sqr_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_sqr_words
+
+.align 5
+.ent bn_sqr_words_internal
+bn_sqr_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $ta0,$a2,$minus4
+ beqz $ta0,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+ $LD $t0,0($a1)
+ $MULTU $t0,$t0
+ $LD $t2,$BNSZ($a1)
+ $LD $ta0,2*$BNSZ($a1)
+ $LD $ta2,3*$BNSZ($a1)
+ mflo $t1
+ mfhi $t0
+ $ST $t1,0($a0)
+ $ST $t0,$BNSZ($a0)
+
+ $MULTU $t2,$t2
+ subu $a2,4
+ $PTR_ADD $a0,8*$BNSZ
+ $PTR_ADD $a1,4*$BNSZ
+ mflo $t3
+ mfhi $t2
+ $ST $t3,-6*$BNSZ($a0)
+ $ST $t2,-5*$BNSZ($a0)
+
+ $MULTU $ta0,$ta0
+ mflo $ta1
+ mfhi $ta0
+ $ST $ta1,-4*$BNSZ($a0)
+ $ST $ta0,-3*$BNSZ($a0)
+
+
+ $MULTU $ta2,$ta2
+ and $ta0,$a2,$minus4
+ mflo $ta3
+ mfhi $ta2
+ $ST $ta3,-2*$BNSZ($a0)
+
+ .set noreorder
+ bgtz $ta0,.L_bn_sqr_words_loop
+ $ST $ta2,-$BNSZ($a0)
+
+ beqz $a2,.L_bn_sqr_words_return
+ nop
+
+.L_bn_sqr_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $MULTU $t0,$t0
+ subu $a2,1
+ mflo $t1
+ mfhi $t0
+ $ST $t1,0($a0)
+ $ST $t0,$BNSZ($a0)
+ beqz $a2,.L_bn_sqr_words_return
+
+ $LD $t0,$BNSZ($a1)
+ $MULTU $t0,$t0
+ subu $a2,1
+ mflo $t1
+ mfhi $t0
+ $ST $t1,2*$BNSZ($a0)
+ $ST $t0,3*$BNSZ($a0)
+ beqz $a2,.L_bn_sqr_words_return
+
+ $LD $t0,2*$BNSZ($a1)
+ $MULTU $t0,$t0
+ mflo $t1
+ mfhi $t0
+ $ST $t1,4*$BNSZ($a0)
+ $ST $t0,5*$BNSZ($a0)
+
+.L_bn_sqr_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+
+.end bn_sqr_words_internal
+
+.align 5
+.globl bn_add_words
+.ent bn_add_words
+bn_add_words:
+ .set noreorder
+ bgtz $a3,bn_add_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$v0
+.end bn_add_words
+
+.align 5
+.ent bn_add_words_internal
+bn_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $at,$a3,$minus4
+ beqz $at,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,4
+ $LD $t1,$BNSZ($a1)
+ and $at,$a3,$minus4
+ $LD $t2,2*$BNSZ($a1)
+ $PTR_ADD $a2,4*$BNSZ
+ $LD $t3,3*$BNSZ($a1)
+ $PTR_ADD $a0,4*$BNSZ
+ $LD $ta1,-3*$BNSZ($a2)
+ $PTR_ADD $a1,4*$BNSZ
+ $LD $ta2,-2*$BNSZ($a2)
+ $LD $ta3,-$BNSZ($a2)
+ $ADDU $ta0,$t0
+ sltu $t8,$ta0,$t0
+ $ADDU $t0,$ta0,$v0
+ sltu $v0,$t0,$ta0
+ $ST $t0,-4*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ $ADDU $ta1,$t1
+ sltu $t9,$ta1,$t1
+ $ADDU $t1,$ta1,$v0
+ sltu $v0,$t1,$ta1
+ $ST $t1,-3*$BNSZ($a0)
+ $ADDU $v0,$t9
+
+ $ADDU $ta2,$t2
+ sltu $t8,$ta2,$t2
+ $ADDU $t2,$ta2,$v0
+ sltu $v0,$t2,$ta2
+ $ST $t2,-2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ $ADDU $ta3,$t3
+ sltu $t9,$ta3,$t3
+ $ADDU $t3,$ta3,$v0
+ sltu $v0,$t3,$ta3
+ $ST $t3,-$BNSZ($a0)
+
+ .set noreorder
+ bgtz $at,.L_bn_add_words_loop
+ $ADDU $v0,$t9
+
+ beqz $a3,.L_bn_add_words_return
+ nop
+
+.L_bn_add_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ $ADDU $ta0,$t0
+ subu $a3,1
+ sltu $t8,$ta0,$t0
+ $ADDU $t0,$ta0,$v0
+ sltu $v0,$t0,$ta0
+ $ST $t0,0($a0)
+ $ADDU $v0,$t8
+ beqz $a3,.L_bn_add_words_return
+
+ $LD $t1,$BNSZ($a1)
+ $LD $ta1,$BNSZ($a2)
+ $ADDU $ta1,$t1
+ subu $a3,1
+ sltu $t9,$ta1,$t1
+ $ADDU $t1,$ta1,$v0
+ sltu $v0,$t1,$ta1
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$t9
+ beqz $a3,.L_bn_add_words_return
+
+ $LD $t2,2*$BNSZ($a1)
+ $LD $ta2,2*$BNSZ($a2)
+ $ADDU $ta2,$t2
+ sltu $t8,$ta2,$t2
+ $ADDU $t2,$ta2,$v0
+ sltu $v0,$t2,$ta2
+ $ST $t2,2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+.L_bn_add_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+
+.end bn_add_words_internal
+
+.align 5
+.globl bn_sub_words
+.ent bn_sub_words
+bn_sub_words:
+ .set noreorder
+ bgtz $a3,bn_sub_words_internal
+ move $v0,$zero
+ jr $ra
+ move $a0,$zero
+.end bn_sub_words
+
+.align 5
+.ent bn_sub_words_internal
+bn_sub_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ li $minus4,-4
+ and $at,$a3,$minus4
+ beqz $at,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,4
+ $LD $t1,$BNSZ($a1)
+ and $at,$a3,$minus4
+ $LD $t2,2*$BNSZ($a1)
+ $PTR_ADD $a2,4*$BNSZ
+ $LD $t3,3*$BNSZ($a1)
+ $PTR_ADD $a0,4*$BNSZ
+ $LD $ta1,-3*$BNSZ($a2)
+ $PTR_ADD $a1,4*$BNSZ
+ $LD $ta2,-2*$BNSZ($a2)
+ $LD $ta3,-$BNSZ($a2)
+ sltu $t8,$t0,$ta0
+ $SUBU $ta0,$t0,$ta0
+ $SUBU $t0,$ta0,$v0
+ sgtu $v0,$t0,$ta0
+ $ST $t0,-4*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ sltu $t9,$t1,$ta1
+ $SUBU $ta1,$t1,$ta1
+ $SUBU $t1,$ta1,$v0
+ sgtu $v0,$t1,$ta1
+ $ST $t1,-3*$BNSZ($a0)
+ $ADDU $v0,$t9
+
+
+ sltu $t8,$t2,$ta2
+ $SUBU $ta2,$t2,$ta2
+ $SUBU $t2,$ta2,$v0
+ sgtu $v0,$t2,$ta2
+ $ST $t2,-2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+ sltu $t9,$t3,$ta3
+ $SUBU $ta3,$t3,$ta3
+ $SUBU $t3,$ta3,$v0
+ sgtu $v0,$t3,$ta3
+ $ST $t3,-$BNSZ($a0)
+
+ .set noreorder
+ bgtz $at,.L_bn_sub_words_loop
+ $ADDU $v0,$t9
+
+ beqz $a3,.L_bn_sub_words_return
+ nop
+
+.L_bn_sub_words_tail:
+ .set reorder
+ $LD $t0,0($a1)
+ $LD $ta0,0($a2)
+ subu $a3,1
+ sltu $t8,$t0,$ta0
+ $SUBU $ta0,$t0,$ta0
+ $SUBU $t0,$ta0,$v0
+ sgtu $v0,$t0,$ta0
+ $ST $t0,0($a0)
+ $ADDU $v0,$t8
+ beqz $a3,.L_bn_sub_words_return
+
+ $LD $t1,$BNSZ($a1)
+ subu $a3,1
+ $LD $ta1,$BNSZ($a2)
+ sltu $t9,$t1,$ta1
+ $SUBU $ta1,$t1,$ta1
+ $SUBU $t1,$ta1,$v0
+ sgtu $v0,$t1,$ta1
+ $ST $t1,$BNSZ($a0)
+ $ADDU $v0,$t9
+ beqz $a3,.L_bn_sub_words_return
+
+ $LD $t2,2*$BNSZ($a1)
+ $LD $ta2,2*$BNSZ($a2)
+ sltu $t8,$t2,$ta2
+ $SUBU $ta2,$t2,$ta2
+ $SUBU $t2,$ta2,$v0
+ sgtu $v0,$t2,$ta2
+ $ST $t2,2*$BNSZ($a0)
+ $ADDU $v0,$t8
+
+.L_bn_sub_words_return:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_sub_words_internal
+
+.align 5
+.globl bn_div_3_words
+.ent bn_div_3_words
+bn_div_3_words:
+ .set noreorder
+	move	$a3,$a0	# we know that bn_div_words does not
+			# touch $a3, $ta2, $ta3 and preserves $a2,
+			# so that we can save two arguments
+			# and the return address in registers
+			# instead of on the stack:-)
+
+ $LD $a0,($a3)
+ move $ta2,$a1
+ bne $a0,$a2,bn_div_3_words_internal
+ $LD $a1,-$BNSZ($a3)
+ li $v0,-1
+ jr $ra
+ move $a0,$v0
+.end bn_div_3_words
+
+.align 5
+.ent bn_div_3_words_internal
+bn_div_3_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ move $ta3,$ra
+ bal bn_div_words_internal
+ move $ra,$ta3
+ $MULTU $ta2,$v0
+ $LD $t2,-2*$BNSZ($a3)
+ move $ta0,$zero
+ mfhi $t1
+ mflo $t0
+ sltu $t8,$t1,$a1
+.L_bn_div_3_words_inner_loop:
+ bnez $t8,.L_bn_div_3_words_inner_loop_done
+ sgeu $at,$t2,$t0
+ seq $t9,$t1,$a1
+ and $at,$t9
+ sltu $t3,$t0,$ta2
+ $ADDU $a1,$a2
+ $SUBU $t1,$t3
+ $SUBU $t0,$ta2
+ sltu $t8,$t1,$a1
+ sltu $ta0,$a1,$a2
+ or $t8,$ta0
+ .set noreorder
+ beqz $at,.L_bn_div_3_words_inner_loop
+ $SUBU $v0,1
+ $ADDU $v0,1
+ .set reorder
+.L_bn_div_3_words_inner_loop_done:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_div_3_words_internal
+
+.align 5
+.globl bn_div_words
+.ent bn_div_words
+bn_div_words:
+ .set noreorder
+ bnez $a2,bn_div_words_internal
+ li $v0,-1 # I would rather signal div-by-zero
+ # which can be done with 'break 7'
+ jr $ra
+ move $a0,$v0
+.end bn_div_words
+
+.align 5
+.ent bn_div_words_internal
+bn_div_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ move $v1,$zero
+ bltz $a2,.L_bn_div_words_body
+ move $t9,$v1
+ $SLL $a2,1
+ bgtz $a2,.-4
+ addu $t9,1
+
+ .set reorder
+ negu $t1,$t9
+ li $t2,-1
+ $SLL $t2,$t1
+ and $t2,$a0
+ $SRL $at,$a1,$t1
+ .set noreorder
+ beqz $t2,.+12
+ nop
+ break 6 # signal overflow
+ .set reorder
+ $SLL $a0,$t9
+ $SLL $a1,$t9
+ or $a0,$at
+___
+$QT=$ta0;
+$HH=$ta1;
+$DH=$v1;
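+# (Reference note, added for clarity: the body below is classic
+# two-digit schoolbook division. The divisor is first normalized --
+# shifted left until its top bit is set, counting the shift in $t9 --
+# then each half-word quotient digit is estimated with DIVU by the
+# divisor's high half $DH and corrected downward in an inner loop;
+# the two digits are merged into $v0 and the remainder is left in $v1.)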
+$code.=<<___;
+.L_bn_div_words_body:
+ $SRL $DH,$a2,4*$BNSZ # bits
+ sgeu $at,$a0,$a2
+ .set noreorder
+ beqz $at,.+12
+ nop
+ $SUBU $a0,$a2
+ .set reorder
+
+ li $QT,-1
+ $SRL $HH,$a0,4*$BNSZ # bits
+ $SRL $QT,4*$BNSZ # q=0xffffffff
+ beq $DH,$HH,.L_bn_div_words_skip_div1
+ $DIVU $zero,$a0,$DH
+ mflo $QT
+.L_bn_div_words_skip_div1:
+ $MULTU $a2,$QT
+ $SLL $t3,$a0,4*$BNSZ # bits
+ $SRL $at,$a1,4*$BNSZ # bits
+ or $t3,$at
+ mflo $t0
+ mfhi $t1
+.L_bn_div_words_inner_loop1:
+ sltu $t2,$t3,$t0
+ seq $t8,$HH,$t1
+ sltu $at,$HH,$t1
+ and $t2,$t8
+ sltu $v0,$t0,$a2
+ or $at,$t2
+ .set noreorder
+ beqz $at,.L_bn_div_words_inner_loop1_done
+ $SUBU $t1,$v0
+ $SUBU $t0,$a2
+ b .L_bn_div_words_inner_loop1
+ $SUBU $QT,1
+ .set reorder
+.L_bn_div_words_inner_loop1_done:
+
+ $SLL $a1,4*$BNSZ # bits
+ $SUBU $a0,$t3,$t0
+ $SLL $v0,$QT,4*$BNSZ # bits
+
+ li $QT,-1
+ $SRL $HH,$a0,4*$BNSZ # bits
+ $SRL $QT,4*$BNSZ # q=0xffffffff
+ beq $DH,$HH,.L_bn_div_words_skip_div2
+ $DIVU $zero,$a0,$DH
+ mflo $QT
+.L_bn_div_words_skip_div2:
+ $MULTU $a2,$QT
+ $SLL $t3,$a0,4*$BNSZ # bits
+ $SRL $at,$a1,4*$BNSZ # bits
+ or $t3,$at
+ mflo $t0
+ mfhi $t1
+.L_bn_div_words_inner_loop2:
+ sltu $t2,$t3,$t0
+ seq $t8,$HH,$t1
+ sltu $at,$HH,$t1
+ and $t2,$t8
+ sltu $v1,$t0,$a2
+ or $at,$t2
+ .set noreorder
+ beqz $at,.L_bn_div_words_inner_loop2_done
+ $SUBU $t1,$v1
+ $SUBU $t0,$a2
+ b .L_bn_div_words_inner_loop2
+ $SUBU $QT,1
+ .set reorder
+.L_bn_div_words_inner_loop2_done:
+
+ $SUBU $a0,$t3,$t0
+ or $v0,$QT
+ $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
+ $SRL $a2,$t9 # restore $a2
+
+ .set noreorder
+ move $a1,$v1
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ move $a0,$v0
+.end bn_div_words_internal
+___
+undef $HH; undef $QT; undef $DH;
+
+($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
+($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
+
+($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
+($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
+
+($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
+
+$code.=<<___;
+
+.align 5
+.globl bn_mul_comba8
+.ent bn_mul_comba8
+bn_mul_comba8:
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,12*$SZREG,$ra
+ .mask 0x803ff008,-$SZREG
+ $PTR_SUB $sp,12*$SZREG
+ $REG_S $ra,11*$SZREG($sp)
+ $REG_S $s5,10*$SZREG($sp)
+ $REG_S $s4,9*$SZREG($sp)
+ $REG_S $s3,8*$SZREG($sp)
+ $REG_S $s2,7*$SZREG($sp)
+ $REG_S $s1,6*$SZREG($sp)
+ $REG_S $s0,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x003f0000,-$SZREG
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $s5,5*$SZREG($sp)
+ $REG_S $s4,4*$SZREG($sp)
+ $REG_S $s3,3*$SZREG($sp)
+ $REG_S $s2,2*$SZREG($sp)
+ $REG_S $s1,1*$SZREG($sp)
+ $REG_S $s0,0*$SZREG($sp)
+___
+$code.=<<___;
+
+ .set reorder
+	$LD	$a_0,0($a1)	# If compiled with the -mips3 option on
+				# an R5000 box, the assembler barks on
+				# this line with a "should not have
+				# mult/div as last instruction in bb
+				# (R10K bug)" warning. If anybody out
+				# there has a clue about how to
+				# circumvent this, do send me a note.
+				# <appro\@fy.chalmers.se>
+
+ $LD $b_0,0($a2)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_3,3*$BNSZ($a1)
+ $LD $b_1,$BNSZ($a2)
+ $LD $b_2,2*$BNSZ($a2)
+ $LD $b_3,3*$BNSZ($a2)
+ mflo $c_1
+ mfhi $c_2
+
+ $LD $a_4,4*$BNSZ($a1)
+ $LD $a_5,5*$BNSZ($a1)
+ $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
+ $LD $a_6,6*$BNSZ($a1)
+ $LD $a_7,7*$BNSZ($a1)
+ $LD $b_4,4*$BNSZ($a2)
+ $LD $b_5,5*$BNSZ($a2)
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
+ $ADDU $c_3,$t_2,$at
+ $LD $b_6,6*$BNSZ($a2)
+ $LD $b_7,7*$BNSZ($a2)
+ $ST $c_1,0($a0) # r[0]=c1;
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ $ST $c_2,$BNSZ($a0) # r[1]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
+ $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s5,10*$SZREG($sp)
+ $REG_L $s4,9*$SZREG($sp)
+ $REG_L $s3,8*$SZREG($sp)
+ $REG_L $s2,7*$SZREG($sp)
+ $REG_L $s1,6*$SZREG($sp)
+ $REG_L $s0,5*$SZREG($sp)
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ jr $ra
+ $PTR_ADD $sp,12*$SZREG
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+ $REG_L $s5,5*$SZREG($sp)
+ $REG_L $s4,4*$SZREG($sp)
+ $REG_L $s3,3*$SZREG($sp)
+ $REG_L $s2,2*$SZREG($sp)
+ $REG_L $s1,1*$SZREG($sp)
+ $REG_L $s0,0*$SZREG($sp)
+ jr $ra
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+.end bn_mul_comba8
+
+.align 5
+.globl bn_mul_comba4
+.ent bn_mul_comba4
+bn_mul_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $b_0,0($a2)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_3,3*$BNSZ($a1)
+ $LD $b_1,$BNSZ($a2)
+ $LD $b_2,2*$BNSZ($a2)
+ $LD $b_3,3*$BNSZ($a2)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
+ $ADDU $c_3,$t_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ $ST $c_2,$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $c_3,$c_2,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $c_1,$c_3,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $c_2,$c_1,$t_2
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ $ST $c_1,6*$BNSZ($a0)
+ $ST $c_2,7*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_mul_comba4
+___
+
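The squaring routines below rely on `mul_add_c2`, which accumulates each off-diagonal product twice. Doubling a double-word product can push a bit out of the top, which is why the code samples the sign bit (slt against $zero) before every one-position $SLL. A hedged C sketch of the variant, under the same illustrative assumptions as the mul_add_c model above:

    #include <stdint.h>

    static void mul_add_c2(uint64_t a, uint64_t b,
                           uint64_t *c0, uint64_t *c1, uint64_t *c2)
    {
        unsigned __int128 t = (unsigned __int128)a * b;
        uint64_t lo = (uint64_t)t, hi = (uint64_t)(t >> 64);

        *c2 += hi >> 63;                 /* bit shifted out of the top  */
        hi = (hi << 1) | (lo >> 63);     /* double the product          */
        lo <<= 1;
        *c0 += lo;
        if (*c0 < lo && ++hi == 0)       /* carry may wrap hi itself    */
            (*c2)++;
        *c1 += hi;
        if (*c1 < hi)
            (*c2)++;
    }
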
+($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
+
+$code.=<<___;
+
+.align 5
+.globl bn_sqr_comba8
+.ent bn_sqr_comba8
+bn_sqr_comba8:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $a_1,$BNSZ($a1)
+ $LD $a_2,2*$BNSZ($a1)
+ $LD $a_3,3*$BNSZ($a1)
+
+ $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_4,4*$BNSZ($a1)
+ $LD $a_5,5*$BNSZ($a1)
+ $LD $a_6,6*$BNSZ($a1)
+ $LD $a_7,7*$BNSZ($a1)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $c_3,$t_2,$at
+ $ST $c_2,$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_2,$at
+ $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
+ $ADDU $c_2,$at
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,6*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,7*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_2,$at
+ $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_2,$at
+ $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,8*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,9*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_1,$at
+ $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,10*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_2,$at
+ $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,11*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,12*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,13*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ $ST $c_3,14*$BNSZ($a0)
+ $ST $c_1,15*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_sqr_comba8
+
+.align 5
+.globl bn_sqr_comba4
+.ent bn_sqr_comba4
+bn_sqr_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ .frame $sp,6*$SZREG,$ra
+ .mask 0x8000f008,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,6*$SZREG
+ $REG_S $ra,5*$SZREG($sp)
+ $REG_S $t3,4*$SZREG($sp)
+ $REG_S $t2,3*$SZREG($sp)
+ $REG_S $t1,2*$SZREG($sp)
+ $REG_S $t0,1*$SZREG($sp)
+ $REG_S $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+ .set reorder
+ $LD $a_0,0($a1)
+ $LD $a_1,$BNSZ($a1)
+ $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
+ $LD $a_2,2*$BNSZ($a1)
+ $LD $a_3,3*$BNSZ($a1)
+ mflo $c_1
+ mfhi $c_2
+ $ST $c_1,0($a0)
+
+ $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $c_3,$t_2,$at
+ $ST $c_2,$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,2*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_3,$t_2,$zero
+ $SLL $t_2,1
+	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ mflo $t_1
+ mfhi $t_2
+ slt $at,$t_2,$zero
+ $ADDU $c_3,$at
+ $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
+ $SLL $t_2,1
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ sltu $at,$c_2,$t_2
+ $ADDU $c_3,$at
+ $ST $c_1,3*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_1,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_2,$t_1
+ sltu $at,$c_2,$t_1
+ $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
+ $ADDU $t_2,$at
+ $ADDU $c_3,$t_2
+ sltu $at,$c_3,$t_2
+ $ADDU $c_1,$at
+ $ST $c_2,4*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ slt $c_2,$t_2,$zero
+ $SLL $t_2,1
+ $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
+ slt $a2,$t_1,$zero
+ $ADDU $t_2,$a2
+ $SLL $t_1,1
+ $ADDU $c_3,$t_1
+ sltu $at,$c_3,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_1,$t_2
+ sltu $at,$c_1,$t_2
+ $ADDU $c_2,$at
+ $ST $c_3,5*$BNSZ($a0)
+
+ mflo $t_1
+ mfhi $t_2
+ $ADDU $c_1,$t_1
+ sltu $at,$c_1,$t_1
+ $ADDU $t_2,$at
+ $ADDU $c_2,$t_2
+ $ST $c_1,6*$BNSZ($a0)
+ $ST $c_2,7*$BNSZ($a0)
+
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $t3,4*$SZREG($sp)
+ $REG_L $t2,3*$SZREG($sp)
+ $REG_L $t1,2*$SZREG($sp)
+ $REG_L $t0,1*$SZREG($sp)
+ $REG_L $gp,0*$SZREG($sp)
+ $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+ jr $ra
+ nop
+.end bn_sqr_comba4
+___
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/modexp512-x86_64.S b/main/openssl/crypto/bn/asm/modexp512-x86_64.S
new file mode 100644
index 00000000..6cccafb8
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/modexp512-x86_64.S
@@ -0,0 +1,1773 @@
+.text
+
+.type MULADD_128x512,@function
+.align 16
+MULADD_128x512:
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ .byte 0xf3,0xc3
+.size MULADD_128x512,.-MULADD_128x512
+.type mont_reduce,@function
+.align 16
+mont_reduce:
+ leaq 192(%rsp),%rdi
+ movq 32(%rsp),%rsi
+ addq $576,%rsi
+ leaq 520(%rsp),%rcx
+
+ movq 96(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ movq (%rcx),%r8
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ movq 8(%rcx),%r9
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ movq 16(%rcx),%r10
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ movq 24(%rcx),%r11
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ movq 32(%rcx),%r12
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ movq 40(%rcx),%r13
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ movq 48(%rcx),%r14
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ movq 56(%rcx),%r15
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 104(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 112(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,16(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 120(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,24(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ xorq %rax,%rax
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+ adcq 80(%rcx),%r10
+ adcq 88(%rcx),%r11
+ adcq $0,%rax
+
+
+
+
+ movq %r8,64(%rdi)
+ movq %r9,72(%rdi)
+ movq %r10,%rbp
+ movq %r11,88(%rdi)
+
+ movq %rax,384(%rsp)
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+
+
+
+
+
+
+
+
+ addq $80,%rdi
+
+ addq $64,%rsi
+ leaq 296(%rsp),%rcx
+
+ call MULADD_128x512
+
+ movq 384(%rsp),%rax
+
+
+ addq -16(%rdi),%r8
+ adcq -8(%rdi),%r9
+ movq %r8,64(%rcx)
+ movq %r9,72(%rcx)
+
+ adcq %rax,%rax
+ movq %rax,384(%rsp)
+
+ leaq 192(%rsp),%rdi
+ addq $64,%rsi
+
+
+
+
+
+ movq (%rsi),%r8
+ movq 8(%rsi),%rbx
+
+ movq (%rcx),%rax
+ mulq %r8
+ movq %rax,%rbp
+ movq %rdx,%r9
+
+ movq 8(%rcx),%rax
+ mulq %r8
+ addq %rax,%r9
+
+ movq (%rcx),%rax
+ mulq %rbx
+ addq %rax,%r9
+
+ movq %r9,8(%rdi)
+
+
+ subq $192,%rsi
+
+ movq (%rcx),%r8
+ movq 8(%rcx),%r9
+
+ call MULADD_128x512
+
+
+
+
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdi
+ movq 24(%rsi),%rdx
+
+
+ movq 384(%rsp),%rbp
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+
+
+ adcq %rbp,%rbp
+
+
+
+ shlq $3,%rbp
+ movq 32(%rsp),%rcx
+ addq %rcx,%rbp
+
+
+ xorq %rsi,%rsi
+
+ addq 0(%rbp),%r10
+ adcq 64(%rbp),%r11
+ adcq 128(%rbp),%r12
+ adcq 192(%rbp),%r13
+ adcq 256(%rbp),%r14
+ adcq 320(%rbp),%r15
+ adcq 384(%rbp),%r8
+ adcq 448(%rbp),%r9
+
+
+
+ sbbq $0,%rsi
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+ movq $1,%rbp
+ subq %rax,%r10
+ sbbq %rbx,%r11
+ sbbq %rdi,%r12
+ sbbq %rdx,%r13
+
+
+
+
+ sbbq $0,%rbp
+
+
+
+ addq $512,%rcx
+ movq 32(%rcx),%rax
+ movq 40(%rcx),%rbx
+ movq 48(%rcx),%rdi
+ movq 56(%rcx),%rdx
+
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+
+
+ subq $1,%rbp
+
+ sbbq %rax,%r14
+ sbbq %rbx,%r15
+ sbbq %rdi,%r8
+ sbbq %rdx,%r9
+
+
+
+ movq 144(%rsp),%rsi
+ movq %r10,0(%rsi)
+ movq %r11,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq %r8,48(%rsi)
+ movq %r9,56(%rsi)
+
+ .byte 0xf3,0xc3
+.size mont_reduce,.-mont_reduce
+.type mont_mul_a3b,@function
+.align 16
+mont_mul_a3b:
+
+
+
+
+ movq 0(%rdi),%rbp
+
+ movq %r10,%rax
+ mulq %rbp
+ movq %rax,520(%rsp)
+ movq %rdx,%r10
+ movq %r11,%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r12,%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r13,%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r14,%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r15,%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r8,%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,528(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 16(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,536(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq 24(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,544(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq 32(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,552(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq 40(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %r14,560(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq 48(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,568(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq 56(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,576(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,584(%rsp)
+ movq %r10,592(%rsp)
+ movq %r11,600(%rsp)
+ movq %r12,608(%rsp)
+ movq %r13,616(%rsp)
+ movq %r14,624(%rsp)
+ movq %r15,632(%rsp)
+ movq %r8,640(%rsp)
+
+
+
+
+
+ jmp mont_reduce
+
+
+.size mont_mul_a3b,.-mont_mul_a3b
+.type sqr_reduce,@function
+.align 16
+sqr_reduce:
+ movq 16(%rsp),%rcx
+
+
+
+ movq %r10,%rbx
+
+ movq %r11,%rax
+ mulq %rbx
+ movq %rax,528(%rsp)
+ movq %rdx,%r10
+ movq %r12,%rax
+ mulq %rbx
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r13,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r14,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r15,%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%rsi
+
+ movq %r10,536(%rsp)
+
+
+
+
+
+ movq 8(%rcx),%rbx
+
+ movq 16(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,544(%rsp)
+
+ movq %rdx,%r10
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,552(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 16(%rcx),%rbx
+
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,560(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+ movq %r14,568(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r12
+
+
+
+
+
+ movq 24(%rcx),%rbx
+
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,576(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+ movq %rsi,584(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+
+ movq %rdx,%r15
+
+
+
+
+ movq 32(%rcx),%rbx
+
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,592(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,600(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 40(%rcx),%rbx
+
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,608(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+ movq %r11,616(%rsp)
+
+ movq %rdx,%r12
+
+
+
+
+ movq %r8,%rbx
+
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,624(%rsp)
+
+ movq %rdx,632(%rsp)
+
+
+ movq 528(%rsp),%r10
+ movq 536(%rsp),%r11
+ movq 544(%rsp),%r12
+ movq 552(%rsp),%r13
+ movq 560(%rsp),%r14
+ movq 568(%rsp),%r15
+
+ movq 24(%rcx),%rax
+ mulq %rax
+ movq %rax,%rdi
+ movq %rdx,%r8
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq $0,%r8
+
+ movq 0(%rcx),%rax
+ mulq %rax
+ movq %rax,520(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+ movq %r10,528(%rsp)
+ movq %r11,536(%rsp)
+
+ movq 16(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+
+ movq %r12,544(%rsp)
+ movq %r13,552(%rsp)
+
+ xorq %rbp,%rbp
+ addq %rbx,%r14
+ adcq %rdi,%r15
+ adcq $0,%rbp
+
+ movq %r14,560(%rsp)
+ movq %r15,568(%rsp)
+
+
+
+
+ movq 576(%rsp),%r10
+ movq 584(%rsp),%r11
+ movq 592(%rsp),%r12
+ movq 600(%rsp),%r13
+ movq 608(%rsp),%r14
+ movq 616(%rsp),%r15
+ movq 624(%rsp),%rdi
+ movq 632(%rsp),%rsi
+
+ movq %r9,%rax
+ mulq %rax
+ movq %rax,%r9
+ movq %rdx,%rbx
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq %rdi,%rdi
+ adcq %rsi,%rsi
+ adcq $0,%rbx
+
+ addq %rbp,%r10
+
+ movq 32(%rcx),%rax
+ mulq %rax
+
+ addq %r8,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r10,576(%rsp)
+ movq %r11,584(%rsp)
+
+ movq 40(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r12,592(%rsp)
+ movq %r13,600(%rsp)
+
+ movq 48(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r14
+ adcq %rax,%r15
+ adcq $0,%rdx
+
+ movq %r14,608(%rsp)
+ movq %r15,616(%rsp)
+
+ addq %rdx,%rdi
+ adcq %r9,%rsi
+ adcq $0,%rbx
+
+ movq %rdi,624(%rsp)
+ movq %rsi,632(%rsp)
+ movq %rbx,640(%rsp)
+
+ jmp mont_reduce
+
+
+.size sqr_reduce,.-sqr_reduce
+.globl mod_exp_512
+.type mod_exp_512,@function
+mod_exp_512:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r8
+ subq $2688,%rsp
+ andq $-64,%rsp
+
+
+ movq %r8,0(%rsp)
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rcx,24(%rsp)
+.Lbody:
+
+
+
+ pxor %xmm4,%xmm4
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqa %xmm4,512(%rsp)
+ movdqa %xmm4,528(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,544(%rsp)
+ movdqa %xmm1,560(%rsp)
+ movdqa %xmm2,576(%rsp)
+ movdqa %xmm3,592(%rsp)
+
+
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+
+ leaq 384(%rsp),%rbx
+ movq %rbx,136(%rsp)
+ call mont_reduce
+
+
+ leaq 448(%rsp),%rcx
+ xorq %rax,%rax
+ movq %rax,0(%rcx)
+ movq %rax,8(%rcx)
+ movq %rax,24(%rcx)
+ movq %rax,32(%rcx)
+ movq %rax,40(%rcx)
+ movq %rax,48(%rcx)
+ movq %rax,56(%rcx)
+ movq %rax,128(%rsp)
+ movq $1,16(%rcx)
+
+ leaq 640(%rsp),%rbp
+ movq %rcx,%rsi
+ movq %rbp,%rdi
+ movq $8,%rax
+loop_0:
+ movq (%rcx),%rbx
+ movw %bx,(%rdi)
+ shrq $16,%rbx
+ movw %bx,64(%rdi)
+ shrq $16,%rbx
+ movw %bx,128(%rdi)
+ shrq $16,%rbx
+ movw %bx,192(%rdi)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rdi),%rdi
+ decq %rax
+ jnz loop_0
+ movq $31,%rax
+ movq %rax,32(%rsp)
+ movq %rbp,40(%rsp)
+
+ movq %rsi,136(%rsp)
+ movq 0(%rsi),%r10
+ movq 8(%rsi),%r11
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+ movq 48(%rsi),%r8
+ movq 56(%rsi),%r9
+init_loop:
+ leaq 384(%rsp),%rdi
+ call mont_mul_a3b
+ leaq 448(%rsp),%rsi
+ movq 40(%rsp),%rbp
+ addq $2,%rbp
+ movq %rbp,40(%rsp)
+ movq %rsi,%rcx
+ movq $8,%rax
+loop_1:
+ movq (%rcx),%rbx
+ movw %bx,(%rbp)
+ shrq $16,%rbx
+ movw %bx,64(%rbp)
+ shrq $16,%rbx
+ movw %bx,128(%rbp)
+ shrq $16,%rbx
+ movw %bx,192(%rbp)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rbp),%rbp
+ decq %rax
+ jnz loop_1
+ movq 32(%rsp),%rax
+ subq $1,%rax
+ movq %rax,32(%rsp)
+ jne init_loop
+
+
+
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm1,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm3,112(%rsp)
+
+
+
+
+
+ movl 126(%rsp),%eax
+ movq %rax,%rdx
+ shrq $11,%rax
+ andl $2047,%edx
+ movl %edx,126(%rsp)
+ leaq 640(%rsp,%rax,2),%rsi
+ movq 8(%rsp),%rdx
+ movq $4,%rbp
+loop_2:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_2
+ movq $505,48(%rsp)
+
+ movq 8(%rsp),%rcx
+ movq %rcx,136(%rsp)
+ movq 0(%rcx),%r10
+ movq 8(%rcx),%r11
+ movq 16(%rcx),%r12
+ movq 24(%rcx),%r13
+ movq 32(%rcx),%r14
+ movq 40(%rcx),%r15
+ movq 48(%rcx),%r8
+ movq 56(%rcx),%r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+
+
+ movq 48(%rsp),%rcx
+ movq %rcx,%rax
+ shrq $4,%rax
+ movl 64(%rsp,%rax,2),%edx
+ andq $15,%rcx
+ shrq %cl,%rdx
+ andq $31,%rdx
+
+ leaq 640(%rsp,%rdx,2),%rsi
+ leaq 448(%rsp),%rdx
+ movq %rdx,%rdi
+ movq $4,%rbp
+loop_3:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_3
+ movq 8(%rsp),%rsi
+ call mont_mul_a3b
+
+
+
+ movq 48(%rsp),%rcx
+ subq $5,%rcx
+ movq %rcx,48(%rsp)
+ jge main_loop_a3b
+
+
+
+end_main_loop_a3b:
+
+
+ movq 8(%rsp),%rdx
+ pxor %xmm4,%xmm4
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+ movdqa %xmm4,576(%rsp)
+ movdqa %xmm4,592(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,512(%rsp)
+ movdqa %xmm1,528(%rsp)
+ movdqa %xmm2,544(%rsp)
+ movdqa %xmm3,560(%rsp)
+ call mont_reduce
+
+
+
+ movq 8(%rsp),%rax
+ movq 0(%rax),%r8
+ movq 8(%rax),%r9
+ movq 16(%rax),%r10
+ movq 24(%rax),%r11
+ movq 32(%rax),%r12
+ movq 40(%rax),%r13
+ movq 48(%rax),%r14
+ movq 56(%rax),%r15
+
+
+ movq 24(%rsp),%rbx
+ addq $512,%rbx
+
+ subq 0(%rbx),%r8
+ sbbq 8(%rbx),%r9
+ sbbq 16(%rbx),%r10
+ sbbq 24(%rbx),%r11
+ sbbq 32(%rbx),%r12
+ sbbq 40(%rbx),%r13
+ sbbq 48(%rbx),%r14
+ sbbq 56(%rbx),%r15
+
+
+ movq 0(%rax),%rsi
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rcx
+ movq 24(%rax),%rdx
+ cmovncq %r8,%rsi
+ cmovncq %r9,%rdi
+ cmovncq %r10,%rcx
+ cmovncq %r11,%rdx
+ movq %rsi,0(%rax)
+ movq %rdi,8(%rax)
+ movq %rcx,16(%rax)
+ movq %rdx,24(%rax)
+
+ movq 32(%rax),%rsi
+ movq 40(%rax),%rdi
+ movq 48(%rax),%rcx
+ movq 56(%rax),%rdx
+ cmovncq %r12,%rsi
+ cmovncq %r13,%rdi
+ cmovncq %r14,%rcx
+ cmovncq %r15,%rdx
+ movq %rsi,32(%rax)
+ movq %rdi,40(%rax)
+ movq %rcx,48(%rax)
+ movq %rdx,56(%rax)
+
+ movq 0(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbx
+ movq 40(%rsi),%rbp
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size mod_exp_512, . - mod_exp_512
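The file above composes three internal primitives -- MULADD_128x512 (scalar multiply-accumulate), mont_reduce (word-by-word Montgomery reduction against the modulus cached on the stack), and sqr_reduce/mont_mul_a3b (squaring and multiplication feeding that reduction) -- into a fixed 5-bit-window exponentiation with a swizzled precomputation table. A hedged sketch of the entry point's apparent interface, inferred from the register and stack usage rather than from any header in this patch (limb order and const-qualifiers are assumptions):

    #include <stdint.h>

    /* r = g^e mod m; each operand is eight little-endian 64-bit limbs.
     * Illustrative declaration only -- not copied from OpenSSL source. */
    void mod_exp_512(uint64_t *r, const uint64_t *g,
                     const uint64_t *e, const uint64_t *m);
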
diff --git a/main/openssl/crypto/bn/asm/modexp512-x86_64.pl b/main/openssl/crypto/bn/asm/modexp512-x86_64.pl
new file mode 100644
index 00000000..bfd6e975
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/modexp512-x86_64.pl
@@ -0,0 +1,1497 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2010-2011 Intel Corp.
+# Author: Vinodh.Gopal@intel.com
+# Jim Guilford
+# Erdinc.Ozturk@intel.com
+# Maxim.Perminov@intel.com
+#
+# More information about algorithm used can be found at:
+# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
+#
+# ====================================================================
+# Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. All advertising materials mentioning features or use of this
+# software must display the following acknowledgment:
+# "This product includes software developed by the OpenSSL Project
+# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+#
+# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+# endorse or promote products derived from this software without
+# prior written permission. For written permission, please contact
+# licensing@OpenSSL.org.
+#
+# 5. Products derived from this software may not be called "OpenSSL"
+# nor may "OpenSSL" appear in their names without prior written
+# permission of the OpenSSL Project.
+#
+# 6. Redistributions of any form whatsoever must retain the following
+# acknowledgment:
+# "This product includes software developed by the OpenSSL Project
+# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+#
+# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+# ====================================================================
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+use strict;
+my $code=".text\n\n";
+my $m=0;
+
+#
+# Define x512 macros
+#
+
+#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
+#
+# uses rax, rdx, and args
+sub MULSTEP_512_ADD
+{
+ my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
+ my @X=@$x; # make a copy
+$code.=<<___;
+ mov (+8*0)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [0]
+ mov ($ASRC), $X[0]
+ add %rax, $X[0]
+ adc \$0, %rdx
+ mov $X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $TMP
+
+ mov (+8*$i)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ mov (+8*$i)($ASRC), $X[$i]
+ add %rax, $X[$i]
+ adc \$0, %rdx
+ add $TMP, $X[$i]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $X[0]
+___
+}
+
+#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
+#
+# uses rax, rdx, and args
+sub MULSTEP_512
+{
+ my ($x, $DST, $SRC2, $OP, $TMP)=@_;
+ my @X=@$x; # make a copy
+$code.=<<___;
+ mov (+8*0)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [0]
+ add %rax, $X[0]
+ adc \$0, %rdx
+ mov $X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $TMP
+
+ mov (+8*$i)($SRC2), %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ add %rax, $X[$i]
+ adc \$0, %rdx
+ add $TMP, $X[$i]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $X[0]
+___
+}
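+
+# A reference model of one MULSTEP_512 pass (exposition only: defined
+# but never called, so the generated code is unchanged). It assumes
+# $x and $src2 are arrayrefs of 8 Math::BigInt limbs and $op is one
+# 64-bit multiplier; the retired low limb (what goes to $DST) is the
+# return value.
+sub _mulstep_512_model
+{
+ use Math::BigInt;
+ my ($x, $src2, $op) = @_;
+ my $base = Math::BigInt->new(2)->bpow(64);
+ my $t = $x->[0] + $src2->[0] * $op;
+ my $dst = $t % $base; # limb retired to memory ($DST)
+ my $carry = $t >> 64;
+ for my $i (1..7) {
+  $t = $x->[$i] + $src2->[$i] * $op + $carry;
+  $x->[$i] = $t % $base; # running limbs stay in registers
+  $carry = $t >> 64;
+ }
+ $x->[0] = $carry; # "mov %rdx, $X[0]": new top limb
+ return $dst;
+}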
+
+#
+# Swizzle Macros
+#
+
+# macro to copy data from flat space to swizzled table
+#MACRO swizzle pDst, pSrc, tmp1, tmp2
+# pDst and pSrc are modified
+sub swizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0)=@_;
+$code.=<<___;
+ mov \$8, $cnt
+loop_$m:
+ mov ($pSrc), $d0
+ mov $d0#w, ($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*1)($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*2)($pDst)
+ shr \$16, $d0
+ mov $d0#w, (+64*3)($pDst)
+ lea 8($pSrc), $pSrc
+ lea 64*4($pDst), $pDst
+ dec $cnt
+ jnz loop_$m
+___
+
+ $m++;
+}
+
+# macro to copy data from swizzled table to flat space
+#MACRO unswizzle pDst, pSrc, tmp*3
+sub unswizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
+$code.=<<___;
+ mov \$4, $cnt
+loop_$m:
+ movzxw (+64*3+256*0)($pSrc), $d0
+ movzxw (+64*3+256*1)($pSrc), $d1
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*2+256*0)($pSrc), $d0#w
+ mov (+64*2+256*1)($pSrc), $d1#w
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*1+256*0)($pSrc), $d0#w
+ mov (+64*1+256*1)($pSrc), $d1#w
+ shl \$16, $d0
+ shl \$16, $d1
+ mov (+64*0+256*0)($pSrc), $d0#w
+ mov (+64*0+256*1)($pSrc), $d1#w
+ mov $d0, (+8*0)($pDst)
+ mov $d1, (+8*1)($pDst)
+ lea 256*2($pSrc), $pSrc
+ lea 8*2($pDst), $pDst
+ sub \$1, $cnt
+ jnz loop_$m
+___
+
+ $m++;
+}
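+
+# A reference model of the swizzled layout (exposition only, never
+# called): entry $k of the 32-entry g-array keeps its 16-bit word $i
+# at byte offset 2*$k + 64*$i, so every entry spans the same set of
+# cache lines and a lookup's address trace does not depend on the
+# secret window value. Assumes a 64-bit perl, $tbl a ref to a
+# 2048-byte string, $limbs an arrayref of 8 native 64-bit limbs.
+sub _swizzle_model
+{
+ my ($tbl, $k, $limbs) = @_;
+ for my $i (0 .. 31) {
+  my $w = ($limbs->[$i >> 2] >> (16 * ($i & 3))) & 0xffff;
+  substr($$tbl, 2*$k + 64*$i, 2) = pack("v", $w); # little-endian word
+ }
+}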
+
+#
+# Data Structures
+#
+
+# Reduce Data
+#
+#
+# Offset Value
+# 0C0 Carries
+# 0B8 X2[10]
+# 0B0 X2[9]
+# 0A8 X2[8]
+# 0A0 X2[7]
+# 098 X2[6]
+# 090 X2[5]
+# 088 X2[4]
+# 080 X2[3]
+# 078 X2[2]
+# 070 X2[1]
+# 068 X2[0]
+# 060 X1[12] P[10]
+# 058 X1[11] P[9] Z[8]
+# 050 X1[10] P[8] Z[7]
+# 048 X1[9] P[7] Z[6]
+# 040 X1[8] P[6] Z[5]
+# 038 X1[7] P[5] Z[4]
+# 030 X1[6] P[4] Z[3]
+# 028 X1[5] P[3] Z[2]
+# 020 X1[4] P[2] Z[1]
+# 018 X1[3] P[1] Z[0]
+# 010 X1[2] P[0] Y[2]
+# 008 X1[1] Q[1] Y[1]
+# 000 X1[0] Q[0] Y[0]
+
+my $X1_offset = 0; # 13 qwords
+my $X2_offset = $X1_offset + 13*8; # 11 qwords
+my $Carries_offset = $X2_offset + 11*8; # 1 qword
+my $Q_offset = 0; # 2 qwords
+my $P_offset = $Q_offset + 2*8; # 11 qwords
+my $Y_offset = 0; # 3 qwords
+my $Z_offset = $Y_offset + 3*8; # 9 qwords
+
+my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords)
+
+#
+# Stack Frame
+#
+#
+# offset value
+# ... <old stack contents>
+# ...
+# 280 Garray
+
+# 278 tmp16[15]
+# ... ...
+# 200 tmp16[0]
+
+# 1F8 tmp[7]
+# ... ...
+# 1C0 tmp[0]
+
+# 1B8 GT[7]
+# ... ...
+# 180 GT[0]
+
+# 178 Reduce Data
+# ... ...
+# 0B8 Reduce Data
+# 0B0 reserved
+# 0A8 reserved
+# 0A0 reserved
+# 098 reserved
+# 090 reserved
+# 088 reduce result addr
+# 080 exp[8]
+
+# ...
+# 048 exp[1]
+# 040 exp[0]
+
+# 038 reserved
+# 030 loop_idx
+# 028 pg
+# 020 i
+# 018 pData ; arg 4
+# 010 pG ; arg 2
+# 008 pResult ; arg 1
+# 000 rsp ; stack pointer before subtract
+
+my $rsp_offset = 0;
+my $pResult_offset = 8*1 + $rsp_offset;
+my $pG_offset = 8*1 + $pResult_offset;
+my $pData_offset = 8*1 + $pG_offset;
+my $i_offset = 8*1 + $pData_offset;
+my $pg_offset = 8*1 + $i_offset;
+my $loop_idx_offset = 8*1 + $pg_offset;
+my $reserved1_offset = 8*1 + $loop_idx_offset;
+my $exp_offset = 8*1 + $reserved1_offset;
+my $red_result_addr_offset= 8*9 + $exp_offset;
+my $reserved2_offset = 8*1 + $red_result_addr_offset;
+my $Reduce_Data_offset = 8*5 + $reserved2_offset;
+my $GT_offset = $Red_Data_Size + $Reduce_Data_offset;
+my $tmp_offset = 8*8 + $GT_offset;
+my $tmp16_offset = 8*8 + $tmp_offset;
+my $garray_offset = 8*16 + $tmp16_offset;
+my $mem_size = 8*8*32 + $garray_offset;
+
+#
+# Offsets within Reduce Data
+#
+#
+# struct MODF_2FOLD_MONT_512_C1_DATA {
+# UINT64 t[8][8];
+# UINT64 m[8];
+# UINT64 m1[8]; /* 2^768 % m */
+# UINT64 m2[8]; /* 2^640 % m */
+# UINT64 k1[2]; /* (- 1/m) % 2^128 */
+# };
+
+my $T = 0;
+my $M = 512; # = 8 * 8 * 8
+my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */
+my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */
+my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */
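+
+# How the precomputed constants relate to the modulus m (exposition
+# only, never called; the real values arrive via the struct above):
+sub _mod_ctx_model
+{
+ use Math::BigInt;
+ my ($m) = @_; # 512-bit odd modulus, Math::BigInt
+ my $b128 = Math::BigInt->new(2)->bpow(128);
+ return {
+  m1 => Math::BigInt->new(2)->bpow(768)->bmod($m),
+  m2 => Math::BigInt->new(2)->bpow(640)->bmod($m),
+  k1 => $m->copy->bmodinv($b128)->bneg()->bmod($b128),
+ };
+}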
+
+#
+# FUNCTIONS
+#
+
+{{{
+#
+# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
+# and add 512-bits (8 qwords)
+# to get 640 bits (10 qwords)
+# Input: 128-bit mul source: [rdi+8*1], rbp
+# 512-bit mul source: [rsi+8*n]
+# 512-bit add source: r15, r14, ..., r9, r8
+# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
+# Clobbers all regs except: rcx, rsi, rdi
+$code.=<<___;
+.type MULADD_128x512,\@abi-omnipotent
+.align 16
+MULADD_128x512:
+___
+ &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+ mov (+8*1)(%rdi), %rbp
+___
+ &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+ ret
+.size MULADD_128x512,.-MULADD_128x512
+___
+}}}
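+
+# In bignum terms (exposition only, never called): MULADD_128x512
+# returns acc + a*b, at most 640 bits for 128-bit a, 512-bit b and
+# 512-bit acc -- i.e. two chained MULSTEP_512 passes.
+sub _muladd_128x512_model
+{
+ use Math::BigInt;
+ my ($a128, $b512, $acc512) = @_; # Math::BigInt inputs
+ return $acc512 + $a128 * $b512;
+}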
+
+{{{
+#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
+#
+# Inputs: pDst: Destination (768 bits, 12 qwords)
+# pA: Multiplicand (1024 bits, 16 qwords)
+# pB: Multiplicand (512 bits, 8 qwords)
+# Dst = Ah * B + Al
+# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
+# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
+# Uses registers: arguments, RAX, RDX
+sub MULADD_256x512
+{
+ my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
+$code.=<<___;
+ mov (+8*12)($pA), $OP
+___
+ &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*13)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*14)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+
+$code.=<<___;
+ mov (+8*15)($pA), $OP
+___
+ &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
+ push(@$X,shift(@$X));
+}
+
+#
+# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */
+# UINT64 *m, /* 512 bits, 8 qwords */
+# MODF_2FOLD_MONT_512_C1_DATA *data,
+# UINT64 *r) /* 512 bits, 8 qwords */
+# Input: x (number to be reduced): tmp16 (Implicit)
+# m (modulus): [pM] (Implicit)
+# data (reduce data): [pData] (Implicit)
+# Output: r (result): Address in [red_res_addr]
+# result also in: r9, r8, r15, r14, r13, r12, r11, r10
+
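+# A reference model of the 2-fold reduction below (exposition only,
+# never called): it returns x * 2^-128 mod m. The assembly replaces
+# the final bmod with precomputed corrections from the T[] table plus
+# one conditional subtraction of m.
+sub _mont_reduce_model
+{
+ use Math::BigInt;
+ my ($x, $m) = @_; # 1024-bit x, 512-bit odd m
+ my $two = Math::BigInt->new(2);
+ my ($b128, $b640, $b768) = map { $two->copy->bpow($_) } (128, 640, 768);
+ my $m1 = $two->copy->bpow(768)->bmod($m); # M1
+ my $m2 = $two->copy->bpow(640)->bmod($m); # M2
+ my $k1 = $m->copy->bmodinv($b128)->bneg()->bmod($b128); # K1
+ my $x1 = ($x / $b768) * $m1 + $x % $b768; # X1 = Xh*M1 + Xl, < 2^769
+ my $x2 = ($x1 / $b640) * $m2 + $x1 % $b640; # X2 = X1h*M2 + X1l, 11 qwords
+ my $q  = ($x2 * $k1) % $b128; # Q = (-X2/m) mod 2^128
+ return (($x2 + $q * $m) / $b128)->bmod($m); # division is exact
+}
+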
+my @X=map("%r$_",(8..15));
+
+$code.=<<___;
+.type mont_reduce,\@abi-omnipotent
+.align 16
+mont_reduce:
+___
+
+my $STACK_DEPTH = 8;
+ #
+ # X1 = Xh * M1 + Xl
+$code.=<<___;
+ lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords
+ mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords
+ add \$$M1, %rsi
+ lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords
+
+___
+
+ &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times
+ # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
+
+$code.=<<___;
+ xor %rax, %rax
+ # X1 += xl
+ add (+8*8)(%rcx), $X[4]
+ adc (+8*9)(%rcx), $X[5]
+ adc (+8*10)(%rcx), $X[6]
+ adc (+8*11)(%rcx), $X[7]
+ adc \$0, %rax
+ # X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
+
+ #
+ # check for carry ;; carry stored in rax
+ mov $X[4], (+8*8)(%rdi) # rdi points to X1
+ mov $X[5], (+8*9)(%rdi)
+ mov $X[6], %rbp
+ mov $X[7], (+8*11)(%rdi)
+
+ mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+ mov (+8*0)(%rdi), $X[4]
+ mov (+8*1)(%rdi), $X[5]
+ mov (+8*2)(%rdi), $X[6]
+ mov (+8*3)(%rdi), $X[7]
+
+ # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
+ # rdi -> X1
+ # rsi -> M1
+
+ #
+ # X2 = Xh * M2 + Xl
+ # do first part (X2 = Xh * M2)
+ add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
+ # Xh is actually { [rdi+8*1], rbp }
+ add \$`$M2-$M1`, %rsi # rsi -> M2
+ lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
+___
+ unshift(@X,pop(@X)); unshift(@X,pop(@X));
+$code.=<<___;
+
+ call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
+ # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+ mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
+
+ # X2 += Xl
+ add (+8*8-8*10)(%rdi), $X[6] # (-8*10) adjusts rdi from Xh back to Xl
+ adc (+8*9-8*10)(%rdi), $X[7]
+ mov $X[6], (+8*8)(%rcx)
+ mov $X[7], (+8*9)(%rcx)
+
+ adc %rax, %rax
+ mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+ lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
+ add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords
+
+ # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
+ # B1:B0 = rsi[1:0] = K1[1:0]
+ # A1:A0 = rcx[1:0] = X2[1:0]
+ # Result = rdi[1],rbp = Q[1],rbp
+ mov (%rsi), %r8 # B0
+ mov (+8*1)(%rsi), %rbx # B1
+
+ mov (%rcx), %rax # A0
+ mul %r8 # B0
+ mov %rax, %rbp
+ mov %rdx, %r9
+
+ mov (+8*1)(%rcx), %rax # A1
+ mul %r8 # B0
+ add %rax, %r9
+
+ mov (%rcx), %rax # A0
+ mul %rbx # B1
+ add %rax, %r9
+
+ mov %r9, (+8*1)(%rdi)
+ # end MUL_128x128t128
+
+ sub \$`$K1-$M`, %rsi
+
+ mov (%rcx), $X[6]
+ mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0]
+
+ call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8
+ # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+
+ # load first half of m to rdx, rdi, rbx, rax
+ # moved this here for efficiency
+ mov (+8*0)(%rsi), %rax
+ mov (+8*1)(%rsi), %rbx
+ mov (+8*2)(%rsi), %rdi
+ mov (+8*3)(%rsi), %rdx
+
+ # continue with reduction
+ mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
+
+ add (+8*8)(%rcx), $X[6]
+ adc (+8*9)(%rcx), $X[7]
+
+ # accumulate the final carry to rbp
+ adc %rbp, %rbp
+
+ # Add in overflow corrections: R = (X2>>128) += T[overflow]
+ # R = {r9, r8, r15, r14, ..., r10}
+ shl \$3, %rbp
+ mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> Data (and points to T)
+ add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out
+
+ # rsi will be used to generate a mask after the addition
+ xor %rsi, %rsi
+
+ add (+8*8*0)(%rbp), $X[0]
+ adc (+8*8*1)(%rbp), $X[1]
+ adc (+8*8*2)(%rbp), $X[2]
+ adc (+8*8*3)(%rbp), $X[3]
+ adc (+8*8*4)(%rbp), $X[4]
+ adc (+8*8*5)(%rbp), $X[5]
+ adc (+8*8*6)(%rbp), $X[6]
+ adc (+8*8*7)(%rbp), $X[7]
+
+ # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF
+ # if carry is clear: rsi = 0x0000000000000000
+ sbb \$0, %rsi
+
+ # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+ and %rsi, %rax
+ and %rsi, %rbx
+ and %rsi, %rdi
+ and %rsi, %rdx
+
+ mov \$1, %rbp
+ sub %rax, $X[0]
+ sbb %rbx, $X[1]
+ sbb %rdi, $X[2]
+ sbb %rdx, $X[3]
+
+ # if there is a borrow: rbp = 0
+ # if there is no borrow: rbp = 1
+ # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
+ sbb \$0, %rbp
+
+ # load second half of m to rdx, rdi, rbx, rax
+
+ add \$$M, %rcx
+ mov (+8*4)(%rcx), %rax
+ mov (+8*5)(%rcx), %rbx
+ mov (+8*6)(%rcx), %rdi
+ mov (+8*7)(%rcx), %rdx
+
+ # use the rsi mask as before
+ # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+ and %rsi, %rax
+ and %rsi, %rbx
+ and %rsi, %rdi
+ and %rsi, %rdx
+
+ # if rbp = 0, there was a borrow before, it is moved to the carry flag
+ # if rbp = 1, there was not a borrow before, carry flag is cleared
+ sub \$1, %rbp
+
+ sbb %rax, $X[4]
+ sbb %rbx, $X[5]
+ sbb %rdi, $X[6]
+ sbb %rdx, $X[7]
+
+ # write R back to memory
+
+ mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
+ mov $X[0], (+8*0)(%rsi)
+ mov $X[1], (+8*1)(%rsi)
+ mov $X[2], (+8*2)(%rsi)
+ mov $X[3], (+8*3)(%rsi)
+ mov $X[4], (+8*4)(%rsi)
+ mov $X[5], (+8*5)(%rsi)
+ mov $X[6], (+8*6)(%rsi)
+ mov $X[7], (+8*7)(%rsi)
+
+ ret
+.size mont_reduce,.-mont_reduce
+___
+}}}
+
+{{{
+#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
+#
+# Inputs: pDst: Destination (1024 bits, 16 qwords)
+# pA: Multiplicand (512 bits, 8 qwords)
+# pB: Multiplicand (512 bits, 8 qwords)
+# Uses registers rax, rdx, args
+# B operand in [pB] and also in x7...x0
+sub MUL_512x512
+{
+ my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
+ my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x; # make a copy
+
+$code.=<<___;
+ mov (+8*0)($pA), $OP
+
+ mov $X[0], %rax
+ mul $OP # rdx:rax = %OP * [0]
+ mov %rax, (+$pDst_o+8*0)($pDst)
+ mov %rdx, $X[0]
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov $X[$i], %rax
+ mul $OP # rdx:rax = %OP * [$i]
+ add %rax, $X[$i-1]
+ adc \$0, %rdx
+ mov %rdx, $X[$i]
+___
+}
+
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+ mov (+8*$i)($pA), $OP
+___
+
+ &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
+ push(@X,shift(@X));
+}
+
+$code.=<<___;
+ mov $X[0], (+$pDst_o+8*8)($pDst)
+ mov $X[1], (+$pDst_o+8*9)($pDst)
+ mov $X[2], (+$pDst_o+8*10)($pDst)
+ mov $X[3], (+$pDst_o+8*11)($pDst)
+ mov $X[4], (+$pDst_o+8*12)($pDst)
+ mov $X[5], (+$pDst_o+8*13)($pDst)
+ mov $X[6], (+$pDst_o+8*14)($pDst)
+ mov $X[7], (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
+# Input: src1: Address of source 1: rdi
+# src2: Address of source 2: rsi
+# Output: dst: Address of destination: [red_res_addr]
+# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+# Temp: Clobbers [tmp16], all registers
+$code.=<<___;
+.type mont_mul_a3b,\@abi-omnipotent
+.align 16
+mont_mul_a3b:
+ #
+ # multiply tmp = src1 * src2
+ # For multiply: dst = rcx, src1 = rdi, src2 = rsi
+ # stack depth is extra 8 from call
+___
+ &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
+$code.=<<___;
+ #
+ # Dst = tmp % m
+ # Call reduce(tmp, m, data, dst)
+
+ # tail recursion optimization: jmp to mont_reduce and return from there
+ jmp mont_reduce
+ # call mont_reduce
+ # ret
+.size mont_mul_a3b,.-mont_mul_a3b
+___
+}}}
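+
+# In bignum terms (exposition only): mont_mul_a3b(dst, src1, src2)
+# computes dst = src1 * src2 * 2^-128 mod m -- MUL_512x512 into tmp16
+# followed by the folding reduction modeled in _mont_reduce_model.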
+
+{{{
+#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
+#
+# Input in memory [pA] and also in x7...x0
+# Uses all argument registers plus rax and rdx
+#
+# This version computes all of the off-diagonal terms into memory,
+# and then it adds in the diagonal terms
+
+sub SQR_512
+{
+ my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
+ my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x; # make a copy
+$code.=<<___;
+ # ------------------
+ # first pass 01...07
+ # ------------------
+ mov $X[0], $A
+
+ mov $X[1],%rax
+ mul $A
+ mov %rax, (+$pDst_o+8*1)($pDst)
+___
+for(my $i=2;$i<8;$i++) {
+$code.=<<___;
+ mov %rdx, $X[$i-2]
+ mov $X[$i],%rax
+ mul $A
+ add %rax, $X[$i-2]
+ adc \$0, %rdx
+___
+}
+$code.=<<___;
+ mov %rdx, $x7
+
+ mov $X[0], (+$pDst_o+8*2)($pDst)
+
+ # ------------------
+ # second pass 12...17
+ # ------------------
+
+ mov (+8*1)($pA), $A
+
+ mov (+8*2)($pA),%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*3)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*3)($pA),%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*4)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[3]
+ adc \$0, %rdx
+ add $X[0], $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[4]
+ adc \$0, %rdx
+ add $X[0], $X[4]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+
+ mov %rdx, $X[1]
+
+ # ------------------
+ # third pass 23...27
+ # ------------------
+ mov (+8*2)($pA), $A
+
+ mov (+8*3)($pA),%rax
+ mul $A
+ add %rax, $X[3]
+ adc \$0, %rdx
+ mov $X[3], (+$pDst_o+8*5)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[4]
+ adc \$0, %rdx
+ add $X[0], $X[4]
+ adc \$0, %rdx
+ mov $X[4], (+$pDst_o+8*6)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $X[2]
+
+ # ------------------
+ # fourth pass 34...37
+ # ------------------
+
+ mov (+8*3)($pA), $A
+
+ mov (+8*4)($pA),%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ mov $X[5], (+$pDst_o+8*7)($pDst)
+
+ mov %rdx, $X[0]
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $x7
+ adc \$0, %rdx
+ add $X[0], $x7
+ adc \$0, %rdx
+ mov $x7, (+$pDst_o+8*8)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+
+ mov %rdx, $X[5]
+
+ # ------------------
+ # fifth pass 45...47
+ # ------------------
+ mov (+8*4)($pA), $A
+
+ mov (+8*5)($pA),%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*9)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ add $X[0], $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*10)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ add $X[0], $X[5]
+ adc \$0, %rdx
+
+ mov %rdx, $X[1]
+
+ # ------------------
+ # sixth pass 56...57
+ # ------------------
+ mov (+8*5)($pA), $A
+
+ mov $X[6],%rax
+ mul $A
+ add %rax, $X[5]
+ adc \$0, %rdx
+ mov $X[5], (+$pDst_o+8*11)($pDst)
+
+ mov %rdx, $X[0]
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[1]
+ adc \$0, %rdx
+ add $X[0], $X[1]
+ adc \$0, %rdx
+ mov $X[1], (+$pDst_o+8*12)($pDst)
+
+ mov %rdx, $X[2]
+
+ # ------------------
+ # seventh pass 67
+ # ------------------
+ mov $X[6], $A
+
+ mov $X[7],%rax
+ mul $A
+ add %rax, $X[2]
+ adc \$0, %rdx
+ mov $X[2], (+$pDst_o+8*13)($pDst)
+
+ mov %rdx, (+$pDst_o+8*14)($pDst)
+
+ # start finalize (add in squares, and double off-terms)
+ mov (+$pDst_o+8*1)($pDst), $X[0]
+ mov (+$pDst_o+8*2)($pDst), $X[1]
+ mov (+$pDst_o+8*3)($pDst), $X[2]
+ mov (+$pDst_o+8*4)($pDst), $X[3]
+ mov (+$pDst_o+8*5)($pDst), $X[4]
+ mov (+$pDst_o+8*6)($pDst), $X[5]
+
+ mov (+8*3)($pA), %rax
+ mul %rax
+ mov %rax, $x6
+ mov %rdx, $X[6]
+
+ add $X[0], $X[0]
+ adc $X[1], $X[1]
+ adc $X[2], $X[2]
+ adc $X[3], $X[3]
+ adc $X[4], $X[4]
+ adc $X[5], $X[5]
+ adc \$0, $X[6]
+
+ mov (+8*0)($pA), %rax
+ mul %rax
+ mov %rax, (+$pDst_o+8*0)($pDst)
+ mov %rdx, $A
+
+ mov (+8*1)($pA), %rax
+ mul %rax
+
+ add $A, $X[0]
+ adc %rax, $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $A
+ mov $X[0], (+$pDst_o+8*1)($pDst)
+ mov $X[1], (+$pDst_o+8*2)($pDst)
+
+ mov (+8*2)($pA), %rax
+ mul %rax
+
+ add $A, $X[2]
+ adc %rax, $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $A
+
+ mov $X[2], (+$pDst_o+8*3)($pDst)
+ mov $X[3], (+$pDst_o+8*4)($pDst)
+
+ xor $tmp, $tmp
+ add $A, $X[4]
+ adc $x6, $X[5]
+ adc \$0, $tmp
+
+ mov $X[4], (+$pDst_o+8*5)($pDst)
+ mov $X[5], (+$pDst_o+8*6)($pDst)
+
+ # $tmp has 0/1 in column 7
+ # $X[6] (high half of a[3]^2) has a full value in column 7
+
+ mov (+$pDst_o+8*7)($pDst), $X[0]
+ mov (+$pDst_o+8*8)($pDst), $X[1]
+ mov (+$pDst_o+8*9)($pDst), $X[2]
+ mov (+$pDst_o+8*10)($pDst), $X[3]
+ mov (+$pDst_o+8*11)($pDst), $X[4]
+ mov (+$pDst_o+8*12)($pDst), $X[5]
+ mov (+$pDst_o+8*13)($pDst), $x6
+ mov (+$pDst_o+8*14)($pDst), $x7
+
+ mov $X[7], %rax
+ mul %rax
+ mov %rax, $X[7]
+ mov %rdx, $A
+
+ add $X[0], $X[0]
+ adc $X[1], $X[1]
+ adc $X[2], $X[2]
+ adc $X[3], $X[3]
+ adc $X[4], $X[4]
+ adc $X[5], $X[5]
+ adc $x6, $x6
+ adc $x7, $x7
+ adc \$0, $A
+
+ add $tmp, $X[0]
+
+ mov (+8*4)($pA), %rax
+ mul %rax
+
+ add $X[6], $X[0]
+ adc %rax, $X[1]
+ adc \$0, %rdx
+
+ mov %rdx, $tmp
+
+ mov $X[0], (+$pDst_o+8*7)($pDst)
+ mov $X[1], (+$pDst_o+8*8)($pDst)
+
+ mov (+8*5)($pA), %rax
+ mul %rax
+
+ add $tmp, $X[2]
+ adc %rax, $X[3]
+ adc \$0, %rdx
+
+ mov %rdx, $tmp
+
+ mov $X[2], (+$pDst_o+8*9)($pDst)
+ mov $X[3], (+$pDst_o+8*10)($pDst)
+
+ mov (+8*6)($pA), %rax
+ mul %rax
+
+ add $tmp, $X[4]
+ adc %rax, $X[5]
+ adc \$0, %rdx
+
+ mov $X[4], (+$pDst_o+8*11)($pDst)
+ mov $X[5], (+$pDst_o+8*12)($pDst)
+
+ add %rdx, $x6
+ adc $X[7], $x7
+ adc \$0, $A
+
+ mov $x6, (+$pDst_o+8*13)($pDst)
+ mov $x7, (+$pDst_o+8*14)($pDst)
+ mov $A, (+$pDst_o+8*15)($pDst)
+___
+}
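+
+# A reference model of the strategy above (exposition only, never
+# called): sum each off-diagonal product a[i]*a[j], i < j, once,
+# double the sum, then add the diagonal squares -- the same 1024-bit
+# result as full schoolbook. Takes an arrayref of 8 Math::BigInt limbs.
+sub _sqr_512_model
+{
+ use Math::BigInt;
+ my ($a) = @_;
+ my $r = Math::BigInt->bzero();
+ for my $i (0 .. 7) {
+  for my $j ($i + 1 .. 7) {
+   $r += ($a->[$i] * $a->[$j]) << (64 * ($i + $j));
+  }
+ }
+ $r->blsft(1); # double the off-diagonal terms
+ for my $i (0 .. 7) {
+  $r += ($a->[$i] * $a->[$i]) << (128 * $i);
+ }
+ return $r;
+}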
+
+#
+# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
+#
+# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+#
+$code.=<<___;
+.type sqr_reduce,\@abi-omnipotent
+.align 16
+sqr_reduce:
+ mov (+$pResult_offset+8)(%rsp), %rcx
+___
+ &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
+$code.=<<___;
+ # tail recursion optimization: jmp to mont_reduce and return from there
+ jmp mont_reduce
+ # call mont_reduce
+ # ret
+.size sqr_reduce,.-sqr_reduce
+___
+}}}
+
+#
+# MAIN FUNCTION
+#
+
+#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
+# UINT64 *g, /* 512 bits, 8 qwords */
+# UINT64 *exp, /* 512 bits, 8 qwords */
+# struct mod_ctx_512 *data)
+
+# window size = 5
+# table size = 2^5 = 32
+#table_entries equ 32
+#table_size equ table_entries * 8
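+
+# A reference model of the schedule below (exposition only, never
+# called): seed the result with the top 5 exponent bits, run one
+# 2-squaring round (the "jmp sqr_2" entry) and then 101 5-squaring
+# rounds, each followed by a table multiply: 5 + 2 + 101*5 = 512 bits.
+# Works on plain residues; the assembly does the same in Montgomery
+# form. Inputs are Math::BigInt, e < 2^512, m odd.
+sub _mod_exp_512_model
+{
+ use Math::BigInt;
+ my ($g, $e, $m) = @_;
+ my @tbl = map { $g->copy->bmodpow($_, $m) } 0 .. 31; # g^0..g^31
+ my $r = $tbl[ $e->copy->brsft(507)->numify ]; # exp{511:507}
+ $e = $e % Math::BigInt->new(2)->bpow(507); # bits consumed
+ for (my $i = 505; $i >= 0; $i -= 5) {
+  my $s = ($i == 505) ? 2 : 5; # first round squares twice
+  $r = $r * $r % $m for 1 .. $s;
+  my $w = $e->copy->brsft($i)->bmod(32)->numify;
+  $r = $r * $tbl[$w] % $m; # multiply always, w may be 0
+ }
+ return $r; # g^e mod m
+}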
+$code.=<<___;
+.globl mod_exp_512
+.type mod_exp_512,\@function,4
+mod_exp_512:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # adjust stack down and then align it with cache boundary
+ mov %rsp, %r8
+ sub \$$mem_size, %rsp
+ and \$-64, %rsp
+
+ # store previous stack pointer and arguments
+ mov %r8, (+$rsp_offset)(%rsp)
+ mov %rdi, (+$pResult_offset)(%rsp)
+ mov %rsi, (+$pG_offset)(%rsp)
+ mov %rcx, (+$pData_offset)(%rsp)
+.Lbody:
+ # transform g into montgomery space
+ # GT = reduce(g * C2) = reduce(g * (2^256))
+ # reduce expects to have the input in [tmp16]
+ pxor %xmm4, %xmm4
+ movdqu (+16*0)(%rsi), %xmm0
+ movdqu (+16*1)(%rsi), %xmm1
+ movdqu (+16*2)(%rsi), %xmm2
+ movdqu (+16*3)(%rsi), %xmm3
+ movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
+ movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp)
+ movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp)
+ movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp)
+ movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp)
+
+ # load pExp before rdx gets blown away
+ movdqu (+16*0)(%rdx), %xmm0
+ movdqu (+16*1)(%rdx), %xmm1
+ movdqu (+16*2)(%rdx), %xmm2
+ movdqu (+16*3)(%rdx), %xmm3
+
+ lea (+$GT_offset)(%rsp), %rbx
+ mov %rbx, (+$red_result_addr_offset)(%rsp)
+ call mont_reduce
+
+ # Initialize tmp = C = 2^128, i.e. 1 in Montgomery form
+ lea (+$tmp_offset)(%rsp), %rcx
+ xor %rax, %rax
+ mov %rax, (+8*0)(%rcx)
+ mov %rax, (+8*1)(%rcx)
+ mov %rax, (+8*3)(%rcx)
+ mov %rax, (+8*4)(%rcx)
+ mov %rax, (+8*5)(%rcx)
+ mov %rax, (+8*6)(%rcx)
+ mov %rax, (+8*7)(%rcx)
+ mov %rax, (+$exp_offset+8*8)(%rsp)
+ movq \$1, (+8*2)(%rcx)
+
+ lea (+$garray_offset)(%rsp), %rbp
+ mov %rcx, %rsi # pTmp
+ mov %rbp, %rdi # Garray[][0]
+___
+
+ &swizzle("%rdi", "%rcx", "%rax", "%rbx");
+
+ # for (rax = 31; rax != 0; rax--) {
+ # tmp = reduce(tmp * G)
+ # swizzle(pg, tmp);
+ # pg += 2; }
+$code.=<<___;
+ mov \$31, %rax
+ mov %rax, (+$i_offset)(%rsp)
+ mov %rbp, (+$pg_offset)(%rsp)
+ # rsi -> pTmp
+ mov %rsi, (+$red_result_addr_offset)(%rsp)
+ mov (+8*0)(%rsi), %r10
+ mov (+8*1)(%rsi), %r11
+ mov (+8*2)(%rsi), %r12
+ mov (+8*3)(%rsi), %r13
+ mov (+8*4)(%rsi), %r14
+ mov (+8*5)(%rsi), %r15
+ mov (+8*6)(%rsi), %r8
+ mov (+8*7)(%rsi), %r9
+init_loop:
+ lea (+$GT_offset)(%rsp), %rdi
+ call mont_mul_a3b
+ lea (+$tmp_offset)(%rsp), %rsi
+ mov (+$pg_offset)(%rsp), %rbp
+ add \$2, %rbp
+ mov %rbp, (+$pg_offset)(%rsp)
+ mov %rsi, %rcx # rcx = rsi = addr of tmp
+___
+
+ &swizzle("%rbp", "%rcx", "%rax", "%rbx");
+$code.=<<___;
+ mov (+$i_offset)(%rsp), %rax
+ sub \$1, %rax
+ mov %rax, (+$i_offset)(%rsp)
+ jne init_loop
+
+ #
+ # Copy exponent onto stack
+ movdqa %xmm0, (+$exp_offset+16*0)(%rsp)
+ movdqa %xmm1, (+$exp_offset+16*1)(%rsp)
+ movdqa %xmm2, (+$exp_offset+16*2)(%rsp)
+ movdqa %xmm3, (+$exp_offset+16*3)(%rsp)
+
+
+ #
+ # Do exponentiation
+ # Initialize result to G[exp{511:507}]
+ mov (+$exp_offset+62)(%rsp), %eax
+ mov %rax, %rdx
+ shr \$11, %rax
+ and \$0x07FF, %edx
+ mov %edx, (+$exp_offset+62)(%rsp)
+ lea (+$garray_offset)(%rsp,%rax,2), %rsi
+ mov (+$pResult_offset)(%rsp), %rdx
+___
+
+ &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+
+ #
+ # Loop variables
+ # rcx = [loop_idx] = bit index: 505 down to 0, in steps of 5
+$code.=<<___;
+ movq \$505, (+$loop_idx_offset)(%rsp)
+
+ mov (+$pResult_offset)(%rsp), %rcx
+ mov %rcx, (+$red_result_addr_offset)(%rsp)
+ mov (+8*0)(%rcx), %r10
+ mov (+8*1)(%rcx), %r11
+ mov (+8*2)(%rcx), %r12
+ mov (+8*3)(%rcx), %r13
+ mov (+8*4)(%rcx), %r14
+ mov (+8*5)(%rcx), %r15
+ mov (+8*6)(%rcx), %r8
+ mov (+8*7)(%rcx), %r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+ #
+ # Do multiply, first look up proper value in Garray
+ mov (+$loop_idx_offset)(%rsp), %rcx # bit index
+ mov %rcx, %rax
+ shr \$4, %rax # rax is word pointer
+ mov (+$exp_offset)(%rsp,%rax,2), %edx
+ and \$15, %rcx
+ shrq %cl, %rdx
+ and \$0x1F, %rdx
+
+ lea (+$garray_offset)(%rsp,%rdx,2), %rsi
+ lea (+$tmp_offset)(%rsp), %rdx
+ mov %rdx, %rdi
+___
+
+ &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+ # rdi = tmp = pG
+
+ #
+ # Call mont_mul_a3b(pDst, pSrc1, pSrc2, pM, pData)
+ # result result pG M Data
+$code.=<<___;
+ mov (+$pResult_offset)(%rsp), %rsi
+ call mont_mul_a3b
+
+ #
+ # finish loop
+ mov (+$loop_idx_offset)(%rsp), %rcx
+ sub \$5, %rcx
+ mov %rcx, (+$loop_idx_offset)(%rsp)
+ jge main_loop_a3b
+
+ #
+
+end_main_loop_a3b:
+ # transform result out of Montgomery space
+ # result = reduce(result)
+ mov (+$pResult_offset)(%rsp), %rdx
+ pxor %xmm4, %xmm4
+ movdqu (+16*0)(%rdx), %xmm0
+ movdqu (+16*1)(%rdx), %xmm1
+ movdqu (+16*2)(%rdx), %xmm2
+ movdqu (+16*3)(%rdx), %xmm3
+ movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp)
+ movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp)
+ movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp)
+ movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp)
+ movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp)
+ movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp)
+ call mont_reduce
+
+ # If result > m, subtract m
+ # load result into r15:r8
+ mov (+$pResult_offset)(%rsp), %rax
+ mov (+8*0)(%rax), %r8
+ mov (+8*1)(%rax), %r9
+ mov (+8*2)(%rax), %r10
+ mov (+8*3)(%rax), %r11
+ mov (+8*4)(%rax), %r12
+ mov (+8*5)(%rax), %r13
+ mov (+8*6)(%rax), %r14
+ mov (+8*7)(%rax), %r15
+
+ # subtract m
+ mov (+$pData_offset)(%rsp), %rbx
+ add \$$M, %rbx
+
+ sub (+8*0)(%rbx), %r8
+ sbb (+8*1)(%rbx), %r9
+ sbb (+8*2)(%rbx), %r10
+ sbb (+8*3)(%rbx), %r11
+ sbb (+8*4)(%rbx), %r12
+ sbb (+8*5)(%rbx), %r13
+ sbb (+8*6)(%rbx), %r14
+ sbb (+8*7)(%rbx), %r15
+
+ # if Carry is clear, replace result with difference
+ mov (+8*0)(%rax), %rsi
+ mov (+8*1)(%rax), %rdi
+ mov (+8*2)(%rax), %rcx
+ mov (+8*3)(%rax), %rdx
+ cmovnc %r8, %rsi
+ cmovnc %r9, %rdi
+ cmovnc %r10, %rcx
+ cmovnc %r11, %rdx
+ mov %rsi, (+8*0)(%rax)
+ mov %rdi, (+8*1)(%rax)
+ mov %rcx, (+8*2)(%rax)
+ mov %rdx, (+8*3)(%rax)
+
+ mov (+8*4)(%rax), %rsi
+ mov (+8*5)(%rax), %rdi
+ mov (+8*6)(%rax), %rcx
+ mov (+8*7)(%rax), %rdx
+ cmovnc %r12, %rsi
+ cmovnc %r13, %rdi
+ cmovnc %r14, %rcx
+ cmovnc %r15, %rdx
+ mov %rsi, (+8*4)(%rax)
+ mov %rdi, (+8*5)(%rax)
+ mov %rcx, (+8*6)(%rax)
+ mov %rdx, (+8*7)(%rax)
+
+ mov (+$rsp_offset)(%rsp), %rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbx
+ mov 40(%rsi),%rbp
+ lea 48(%rsi),%rsp
+.Lepilogue:
+ ret
+.size mod_exp_512, . - mod_exp_512
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mod_exp_512_se_handler,\@abi-omnipotent
+.align 16
+mod_exp_512_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody(%rip),%r10
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ mov $rsp_offset(%rax),%rax # pull saved Rsp
+
+ mov 32(%rax),%rbx
+ mov 40(%rax),%rbp
+ mov 24(%rax),%r12
+ mov 16(%rax),%r13
+ mov 8(%rax),%r14
+ mov 0(%rax),%r15
+ lea 48(%rax),%rax
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mod_exp_512_se_handler,.-mod_exp_512_se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_mod_exp_512
+ .rva .LSEH_end_mod_exp_512
+ .rva .LSEH_info_mod_exp_512
+
+.section .xdata
+.align 8
+.LSEH_info_mod_exp_512:
+ .byte 9,0,0,0
+ .rva mod_exp_512_se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
+ return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/(\(\+[^)]+\))/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/parisc-mont.pl b/main/openssl/crypto/bn/asm/parisc-mont.pl
new file mode 100644
index 00000000..c02ef6f0
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,995 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~90-50% better, less for longer
+# keys, than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
+# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal with respect to instruction set capabilities. Fair comparison
+# with the vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives the compiler
+# toward 4 times 16x16=32-bit multiplications [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can a 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs the "64-bit" computational task in the same
+# number of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path provides virtually the same performance as pa-risc2[W].s:
+# it's ~10% better for the shortest key length and ~10% worse for the
+# longest one.
+#
+# In case it wasn't clear: the module has two distinct code paths,
+# PA-RISC 1.1 and PA-RISC 2.0 ones. The latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes a
+# couple of limitations: vector lengths have to be even and vector
+# addresses have to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
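+#
+# Illustration of the word-order trick (exposition only, never
+# called): on big-endian PA-RISC, swapping the two 32-bit halves of a
+# 64-bit BN_ULONG makes its storage read as two 32-bit limbs, less
+# significant first -- exactly what the shrpd-by-32 "flip word order"
+# in the code below achieves. Needs a 64-bit perl.
+sub _flip_word_order {
+ my ($v) = @_;
+ return (($v & 0xffffffff) << 32) | (($v >> 32) & 0xffffffff);
+}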
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $BN_SZ =$SIZE_T;
+} else {
+ $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $BN_SZ =$SIZE_T;
+ if (open CONF,"<${dir}../../opensslconf.h") {
+ while(<CONF>) {
+ if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+ $BN_SZ=8;
+ $LEVEL="2.0";
+ last;
+ }
+ }
+ close CONF;
+ }
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
+ # [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32; # local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22"; # passed through stack in 32-bit
+$num="%r21"; # passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4"; $fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
+$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+bn_mul_mont
+ .PROC
+ .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ ldo -$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldw `-$FRAME_MARKER-4`($fp),$n0
+ ldw `-$FRAME_MARKER-8`($fp),$num
+ nop
+ nop ; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+ comiclr,<= 6,$num,%r0 ; are vectors long enough?
+ b L\$abort
+ ldi 0,%r28 ; signal "unhandled"
+ add,ev %r0,$num,$num ; is $num even?
+ b L\$abort
+ nop
+ or $ap,$np,$ti1
+ extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
+ b L\$abort
+ nop
+ nop ; alignment
+ nop
+
+ fldws 0($n0),${fn0}
+ fldws,ma 4($bp),${fbi} ; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+ comib,> 3,$num,L\$abort ; are vectors long enough?
+ ldi 0,%r28 ; signal "unhandled"
+ addl $num,$num,$num ; I operate on 32-bit values
+
+ fldws 4($n0),${fn0} ; only low part of n0
+ fldws 4($bp),${fbi} ; bp[0] in flipped word order
+___
+$code.=<<___;
+ fldds 0($ap),${fai} ; ap[0,1]
+ fldds 0($np),${fni} ; np[0,1]
+
+ sh2addl $num,%r0,$arrsz
+ ldi 31,$hi0
+ ldo 36($arrsz),$hi1 ; space for tp[num+1]
+ andcm $hi1,$hi0,$hi1 ; align
+ addl $hi1,%sp,%sp
+ $PUSH $fp,-$SIZE_T(%sp)
+
+ ldo `$LOCALS+16`($fp),$xfer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ addl $arrsz,$ap,$ap ; point at the end
+ addl $arrsz,$np,$np
+ subi 0,$arrsz,$idx ; j=0
+ ldo 8($idx),$idx ; j++++
+
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ fstds ${fab1},0($xfer)
+ fstds ${fnm1},8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+ mtctl $hi0,%cr11 ; $hi0 still holds 31
+ extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
+ b L\$parisc11
+ nop
+___
+$code.=<<___; # PA-RISC 2.0 code-path
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $ab0,63,32,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ ldo 8($idx),$idx ; j++++
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ extrd,u $nm0,31,32,$hi1
+
+L\$1st
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ stw $nm1,-4($tp) ; tp[j-1]
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ addl $ab1,$nm1,$nm1
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+ fldws 0($bp),${fbi} ; bp[1] in flipped word order
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ addl $hi1,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2]
+ flddx $idx($np),${fni} ; np[2]
+ ldo 8($idx),$idx ; j++++
+ ldd -16($xfer),$ab0 ; 33-bit value
+ ldd -8($xfer),$nm0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ extrd,u $ab0,31,32,$ti0 ; carry bit
+ extrd,u $ab0,63,32,$ab0
+ fstds ${fab1},0($xfer)
+ addl $ti0,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ extrd,u $nm0,31,32,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+L\$inner
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldw 4($tp),$ti0 ; tp[j]
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ti0,$ti0
+ addl $ti0,$ab0,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $nm1,31,32,$hi1
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldw 4($tp),$ti0 ; tp[j]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ addl $ti0,$ab0,$ab0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,31,32,$hi0
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldi 12,$ti0 ; bp[i] in flipped word order
+ addl,ev %r0,$num,$num
+ ldi -4,$ti0
+ addl $ti0,$bp,$bp
+ fldws 0($bp),${fbi}
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0]
+ addl $hi0,$ab1,$ab1
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ addl $hi1,$hi0,$hi0
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+ addl $hi0,$ab1,$ab1
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ addl $hi1,$hi0,$hi0
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+ ldws,ma 4($tp),$ti0
+ extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
+ b L\$sub_pa11
+ addl $tp,$arrsz,$tp
+L\$sub
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldd,ma 8($tp),$ti0
+L\$sub
+ ldd $idx($np),$hi0
+ shrpd $ti0,$ti0,32,$ti0 ; flip word order
+ std $ti0,-8($tp) ; save flipped value
+ sub,db $ti0,$hi0,$hi1
+ ldd,ma 8($tp),$ti0
+ addib,<> 8,$idx,L\$sub
+ std,ma $hi1,8($rp)
+
+ extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
+ sub,db $ti0,%r0,$hi1
+ ldo -8($tp),$tp
+___
+$code.=<<___;
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy
+ ldd $idx($np),$hi0
+ std,ma %r0,8($tp)
+ addib,<> 8,$idx,.-8 ; L\$copy
+ std,ma $hi0,8($rp)
+___
+
+if ($BN_SZ==4) { # PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+ b L\$done
+ nop
+
+ .ALIGN 8
+L\$parisc11
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -12($xfer),$ablo
+ ldw -16($xfer),$hi0
+ ldw -4($xfer),$nmlo0
+ ldw -8($xfer),$nmhi0
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+ ldo 8($idx),$idx ; j++++
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ addc %r0,$nmhi0,$hi1
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+ nop
+
+L\$1st_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 12($xfer),$nmlo1
+ addc %r0,$abhi,$hi0
+ ldw 8($xfer),$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fab1},0($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ fstds ${fnm1},8($xfer)
+ add $hi1,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$hi1
+ ldw -16($xfer),$abhi
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ ldw -4($xfer),$nmlo0
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -8($xfer),$nmhi0
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ fstds ${fab0},-16($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ fstds ${fnm0},-8($xfer)
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ ldw 8($xfer),$nmhi1
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ add $hi0,$ablo,$ablo
+ fstds ${fab1},0($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -8($xfer),$nmhi0
+ addc %r0,$nmhi1,$hi1
+ ldw -4($xfer),$nmlo0
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[1]
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+ ldw -16($xfer),$abhi ; carry bit actually
+ ldo 8($idx),$idx ; j++++
+ ldw -12($xfer),$ablo
+ ldw -8($xfer),$nmhi0
+ ldw -4($xfer),$nmlo0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ fstds ${fab1},0($xfer)
+ addl $abhi,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ addc %r0,$nmhi0,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+
+L\$inner_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ ldw 12($xfer),$nmlo1
+ add $ti1,$ablo,$ablo
+ ldw 8($xfer),$nmhi1
+ addc %r0,$abhi,$hi0
+ fstds ${fab1},0($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fnm1},8($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ ldw 8($tp),$ti1 ; tp[j]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -4($xfer),$nmlo0
+ add $hi0,$ablo,$ablo
+ ldw -8($xfer),$nmhi0
+ addc %r0,$abhi,$abhi
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ add $ti0,$ablo,$ablo
+ fstds ${fab0},-16($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm0},-8($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldw 8($xfer),$nmhi1
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ fstds ${fab1},0($xfer)
+ add $ti1,$ablo,$ablo
+ fstds ${fnm1},8($xfer)
+ addc %r0,$abhi,$hi0
+ ldw -16($xfer),$abhi
+ add $ablo,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$nmhi1
+ ldw -8($xfer),$nmhi0
+ add $hi1,$nmlo1,$nmlo1
+ ldw -4($xfer),$nmlo0
+ addc %r0,$nmhi1,$hi1
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$abhi
+ add $ti0,$ablo,$ablo
+ ldw 8($tp),$ti1 ; tp[j]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone_pa11; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[i]
+ flddx $idx($ap),${fai} ; ap[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer_pa11
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32+4`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+ ldw -4($tp),$ti0
+ addl $tp,$arrsz,$tp
+L\$sub_pa11
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub_pa11
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy_pa11
+ ldwx $idx($np),$hi0
+ stws,ma %r0,4($tp)
+ addib,<> 4,$idx,L\$copy_pa11
+ stws,ma $hi0,4($rp)
+
+ nop ; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+ ldi 1,%r28 ; signal "handled"
+ ldo $FRAME($fp),%sp ; destroy tp[num+1]
+
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode the PA-RISC 2.0 instructions used in this module,
+# so that it can be compiled with .LEVEL 1.0. It should be noted that
+# I wouldn't have to do this if the GNU assembler understood the
+# .ALLOW 2.0 directive...
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
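+
+# For instance, tracing the format-5 branch above with `ldd 0(%r4),%r5`
+# (offset 0, base %r4, target %r5), the 32-bit pass would emit the
+# hand-encoded word in place of the 2.0 mnemonic, along the lines of
+# the following; the exact value is computed from the opcode fields
+# above, so treat it as illustrative:
+#
+#	.WORD	0x0c8010c5	; ldd	0(%r4),%r5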
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
+ { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+ $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+	# I only have the ",u" completer; it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $sub = sub {
+ my ($mod,$args) = @_;
+ my $orig = "sub$mod\t$args";
+
+ if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+ my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+ $opcode|=(1<<10); # e1
+ $opcode|=(1<<8); # e2
+ $opcode|=(1<<5); # d
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ # flip word order in 64-bit mode...
+ s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+ # assemble 2.0 instructions in 32-bit mode...
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+ s/\bbv\b/bve/gm if ($SIZE_T==8);
+
+ print $_,"\n";
+}
+close STDOUT;
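
The pa-risc2 module above, and the ppc/s390x/x86 modules this patch goes on to touch, all implement the same word-serial Montgomery loop; the tp[j] and np[j]*m annotations refer to it. As a reading aid, a reference sketch in C follows; 64-bit limbs, a C99 VLA and GCC's unsigned __int128 are assumptions of the sketch, not OpenSSL's bn_mul_mont contract:

    #include <stdint.h>
    #include <string.h>

    /* rp = ap*bp*R^-1 mod np, R = 2^(64*num), n0 = -np[0]^-1 mod 2^64.
     * Reference semantics only; not OpenSSL's implementation. */
    static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
                             const uint64_t *bp, const uint64_t *np,
                             uint64_t n0, int num)
    {
        uint64_t tp[num + 2];          /* tp[num] is the "upmost overflow bit" */
        memset(tp, 0, sizeof(tp));

        for (int i = 0; i < num; i++) {
            unsigned __int128 t = 0;   /* tp += bp[i]*ap */
            for (int j = 0; j < num; j++) {
                t += (unsigned __int128)ap[j] * bp[i] + tp[j];
                tp[j] = (uint64_t)t; t >>= 64;
            }
            t += tp[num];
            tp[num] = (uint64_t)t;
            tp[num + 1] += (uint64_t)(t >> 64);

            uint64_t m = tp[0] * n0;   /* chosen so tp[0] + m*np[0] == 0 mod 2^64 */
            t = ((unsigned __int128)np[0] * m + tp[0]) >> 64;
            for (int j = 1; j < num; j++) { /* tp += m*np, shifted down one word */
                t += (unsigned __int128)np[j] * m + tp[j];
                tp[j - 1] = (uint64_t)t; t >>= 64;
            }
            t += tp[num];
            tp[num - 1] = (uint64_t)t;
            tp[num] = tp[num + 1] + (uint64_t)(t >> 64);
            tp[num + 1] = 0;
        }

        /* conditional final subtraction, as in the Lsub/Lcopy tails above */
        uint64_t borrow = 0;
        for (int j = 0; j < num; j++) {
            unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
            rp[j] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;
        }
        uint64_t keep = 0 - (uint64_t)(tp[num] < borrow);  /* all-ones: tp < np */
        for (int j = 0; j < num; j++)
            rp[j] = (tp[j] & keep) | (rp[j] & ~keep);
    }
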
diff --git a/main/openssl/crypto/bn/asm/ppc-mont.pl b/main/openssl/crypto/bn/asm/ppc-mont.pl
index 7849eae9..f9b6992c 100644
--- a/main/openssl/crypto/bn/asm/ppc-mont.pl
+++ b/main/openssl/crypto/bn/asm/ppc-mont.pl
@@ -31,7 +31,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
- $FRAME= $SIZE_T*16;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
@@ -51,7 +50,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
- $FRAME= $SIZE_T*16;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
@@ -69,6 +67,9 @@ if ($flavour =~ /32/) {
$POP= $LD;
} else { die "nonsense $flavour"; }
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@ $aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
#
$nhi="r0";
@@ -108,42 +109,48 @@ $code=<<___;
.machine "any"
.text
-.globl .bn_mul_mont
+.globl .bn_mul_mont_int
.align 4
-.bn_mul_mont:
+.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+ cmpwi $num,32 ; longer key performance is not better
+ bgelr
+___
+$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
- addi $ovf,$num,`$FRAME+$RZONE`
+ addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
+ mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
- $PUSH r14,`4*$SIZE_T`($sp)
- $PUSH r15,`5*$SIZE_T`($sp)
- $PUSH r16,`6*$SIZE_T`($sp)
- $PUSH r17,`7*$SIZE_T`($sp)
- $PUSH r18,`8*$SIZE_T`($sp)
- $PUSH r19,`9*$SIZE_T`($sp)
- $PUSH r20,`10*$SIZE_T`($sp)
- $PUSH r21,`11*$SIZE_T`($sp)
- $PUSH r22,`12*$SIZE_T`($sp)
- $PUSH r23,`13*$SIZE_T`($sp)
- $PUSH r24,`14*$SIZE_T`($sp)
- $PUSH r25,`15*$SIZE_T`($sp)
+ $PUSH r20,`-12*$SIZE_T`($tj)
+ $PUSH r21,`-11*$SIZE_T`($tj)
+ $PUSH r22,`-10*$SIZE_T`($tj)
+ $PUSH r23,`-9*$SIZE_T`($tj)
+ $PUSH r24,`-8*$SIZE_T`($tj)
+ $PUSH r25,`-7*$SIZE_T`($tj)
+ $PUSH r26,`-6*$SIZE_T`($tj)
+ $PUSH r27,`-5*$SIZE_T`($tj)
+ $PUSH r28,`-4*$SIZE_T`($tj)
+ $PUSH r29,`-3*$SIZE_T`($tj)
+ $PUSH r30,`-2*$SIZE_T`($tj)
+ $PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
+ addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
@@ -205,8 +212,8 @@ L1st:
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
- $LD $tj,$FRAME($sp) ; tp[0]
+ addi $tp,$sp,$LOCALS
+ $LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
@@ -273,7 +280,7 @@ Linner:
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
- addi $tp,$sp,$FRAME
+ addi $tp,$sp,$LOCALS
mtctr $num
.align 4
@@ -299,23 +306,27 @@ Lcopy: ; copy or in-place refresh
addi $j,$j,$BNSZ
bdnz- Lcopy
- $POP r14,`4*$SIZE_T`($sp)
- $POP r15,`5*$SIZE_T`($sp)
- $POP r16,`6*$SIZE_T`($sp)
- $POP r17,`7*$SIZE_T`($sp)
- $POP r18,`8*$SIZE_T`($sp)
- $POP r19,`9*$SIZE_T`($sp)
- $POP r20,`10*$SIZE_T`($sp)
- $POP r21,`11*$SIZE_T`($sp)
- $POP r22,`12*$SIZE_T`($sp)
- $POP r23,`13*$SIZE_T`($sp)
- $POP r24,`14*$SIZE_T`($sp)
- $POP r25,`15*$SIZE_T`($sp)
- $POP $sp,0($sp)
+ $POP $tj,0($sp)
li r3,1
+ $POP r20,`-12*$SIZE_T`($tj)
+ $POP r21,`-11*$SIZE_T`($tj)
+ $POP r22,`-10*$SIZE_T`($tj)
+ $POP r23,`-9*$SIZE_T`($tj)
+ $POP r24,`-8*$SIZE_T`($tj)
+ $POP r25,`-7*$SIZE_T`($tj)
+ $POP r26,`-6*$SIZE_T`($tj)
+ $POP r27,`-5*$SIZE_T`($tj)
+ $POP r28,`-4*$SIZE_T`($tj)
+ $POP r29,`-3*$SIZE_T`($tj)
+ $POP r30,`-2*$SIZE_T`($tj)
+ $POP r31,`-1*$SIZE_T`($tj)
+ mr $sp,$tj
blr
.long 0
-.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+ .byte 0,12,4,0,0x80,12,6,0
+ .long 0
+
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/main/openssl/crypto/bn/asm/ppc.pl b/main/openssl/crypto/bn/asm/ppc.pl
index 37c65d35..1249ce22 100644
--- a/main/openssl/crypto/bn/asm/ppc.pl
+++ b/main/openssl/crypto/bn/asm/ppc.pl
@@ -389,7 +389,9 @@ $data=<<EOF;
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -814,8 +816,9 @@ $data=<<EOF;
blr
-
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -949,7 +952,7 @@ $data=<<EOF;
addze r11,r0
#mul_add_c(a[3],b[2],c3,c1,c2);
$LD r6,`3*$BNSZ`(r4)
- $LD r7,`2*$BNSZ`(r4)
+ $LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
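
The hunk above fixes an operand bug: per the mul_add_c(a[3],b[2],c3,c1,c2) annotation, b[2] must come from the b array in r5, yet the load fetched from the a array in r4. For reference, mul_add_c is the comba building block that accumulates a*b into a three-limb carry chain; a sketch of its semantics, with <stdint.h> types and GCC's unsigned __int128 as assumptions of the sketch:

    /* (c2:c1:c0) += a*b, unsigned 64-bit limbs, carries rippling upward */
    #define mul_add_c(a, b, c0, c1, c2) do {                        \
            unsigned __int128 t = (unsigned __int128)(a) * (b);     \
            uint64_t lo = (uint64_t)t, hi = (uint64_t)(t >> 64);    \
            c0 += lo;  hi += (c0 < lo);    /* carry out of c0 */    \
            c1 += hi;  c2 += (c1 < hi);    /* carry out of c1 */    \
        } while (0)
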
@@ -966,7 +969,9 @@ $data=<<EOF;
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1502,7 +1507,9 @@ $data=<<EOF;
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1550,8 +1557,9 @@ Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
- .long 0x00000000
-
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop:
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1707,7 +1717,9 @@ Lppcasm_div8:
Lppcasm_div9:
or r3,r8,r0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop:
bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
blr
- .long 0x00000000
-
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1850,7 +1863,9 @@ Lppcasm_mw_REM:
Lppcasm_mw_OVER:
addi r3,r12,0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
#
# NOTE: The following label name should be changed to
@@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover:
Lppcasm_maw_adios:
addi r3,r12,0
blr
- .long 0x00000000
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
.align 4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/main/openssl/crypto/bn/asm/ppc64-mont.pl b/main/openssl/crypto/bn/asm/ppc64-mont.pl
index 3449b358..a14e769a 100644
--- a/main/openssl/crypto/bn/asm/ppc64-mont.pl
+++ b/main/openssl/crypto/bn/asm/ppc64-mont.pl
@@ -45,23 +45,40 @@
# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but it's apparently the way Power 6 is...
+# December 2009
+
+# Adapted for a 32-bit build, this module delivers a 25-120% (yes, more
+# than *twice* for longer keys) performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# 64-bit integer operations, and the trouble is that most PPC
+# operating systems don't preserve the upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not upon signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course the complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of the expected
+# figure. This is why this routine is engaged only for longer key
+# operations on these OSes; see crypto/ppccap.c for further details.
+# MacOS X is an exception and doesn't require signal masking, and
+# that's where the improvement coefficients above were collected. For
+# the others, the alternative would be to break the dependence on the
+# upper halves of GPRs by sticking to 32-bit integer operations...
+
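
The wrapper alluded to above lives in crypto/ppccap.c. Its shape, reconstructed from the description here (the exact mask handling is an assumption of this sketch, not a quote of ppccap.c):

    #include <signal.h>

    int bn_mul_mont_fpu64(unsigned long *rp, const unsigned long *ap,
                          const unsigned long *bp, const unsigned long *np,
                          const unsigned long *n0, int num);

    /* The FPU path clobbers upper GPR halves, which survive a context
     * switch but not 32-bit signal delivery, so block signals around
     * the call on the affected OSes. */
    static int mont_fpu64_guarded(unsigned long *rp, const unsigned long *ap,
                                  const unsigned long *bp,
                                  const unsigned long *np,
                                  const unsigned long *n0, int num)
    {
        sigset_t all_masked, oset;
        int ret;

        sigfillset(&all_masked);                      /* block everything */
        sigprocmask(SIG_SETMASK, &all_masked, &oset); /* remember old mask */
        ret = bn_mul_mont_fpu64(rp, ap, bp, np, n0, num);
        sigprocmask(SIG_SETMASK, &oset, NULL);        /* restore */
        return ret;
    }
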
$flavour = shift;
if ($flavour =~ /32/) {
$SIZE_T=4;
$RZONE= 224;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont_ppc64";
+ $fname= "bn_mul_mont_fpu64";
$STUX= "stwux"; # store indexed and update
$PUSH= "stw";
$POP= "lwz";
- die "not implemented yet";
} elsif ($flavour =~ /64/) {
$SIZE_T=8;
$RZONE= 288;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont";
+ $fname= "bn_mul_mont_fpu64";
# same as above, but 64-bit mnemonics...
$STUX= "stdux"; # store indexed and update
@@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-$FRAME=($FRAME+63)&~63;
+$FRAME=64; # padded frame header
$TRANSFER=16*8;
$carry="r0";
@@ -93,16 +110,16 @@ $tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
-$nap_d="r14"; # interleaved ap and np in double format
-$a0="r15"; # ap[0]
-$t0="r16"; # temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22"; # interleaved ap and np in double format
+$a0="r23"; # ap[0]
+$t0="r24"; # temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
# PPC offers enough register bank capacity to unroll inner loops twice
#
@@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
$dota="f8"; $dotb="f9";
$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
-$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
-$T0a="f18"; $T0b="f19";
-$T1a="f20"; $T1b="f21";
-$T2a="f22"; $T2b="f23";
-$T3a="f24"; $T3b="f25";
+$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
+$T0a="f24"; $T0b="f25";
+$T1a="f26"; $T1b="f27";
+$T2a="f28"; $T2b="f29";
+$T3a="f30"; $T3b="f31";
# sp----------->+-------------------------------+
# | saved sp |
# +-------------------------------+
-# | |
-# +-------------------------------+
-# | 10 saved gpr, r14-r23 |
-# . .
-# . .
-# +12*size_t +-------------------------------+
-# | 12 saved fpr, f14-f25 |
# . .
-# . .
-# +12*8 +-------------------------------+
-# | padding to 64 byte boundary |
-# . .
-# +X +-------------------------------+
+# +64 +-------------------------------+
# | 16 gpr<->fpr transfer zone |
# . .
# . .
@@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25";
# . .
# . .
# +-------------------------------+
+# . .
+# -12*size_t +-------------------------------+
+# | 10 saved gpr, r22-r31 |
+# . .
+# . .
+# -12*8 +-------------------------------+
+# | 12 saved fpr, f20-f31 |
+# . .
+# . .
+# +-------------------------------+
$code=<<___;
.machine "any"
@@ -181,14 +197,14 @@ $code=<<___;
.globl .$fname
.align 5
.$fname:
- cmpwi $num,4
+ cmpwi $num,`3*8/$SIZE_T`
mr $rp,r3 ; $rp is reassigned
li r3,0 ; possible "not handled" return code
bltlr-
- andi. r0,$num,1 ; $num has to be even
+ andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
bnelr-
- slwi $num,$num,3 ; num*=8
+ slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
li $i,-4096
slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
add $tp,$tp,$num ; place for tp[num+1]
@@ -196,35 +212,50 @@ $code=<<___;
subf $tp,$tp,$sp ; $sp-$tp
and $tp,$tp,$i ; minimize TLB usage
subf $tp,$sp,$tp ; $tp-$sp
+ mr $i,$sp
$STUX $sp,$sp,$tp ; alloca
- $PUSH r14,`2*$SIZE_T`($sp)
- $PUSH r15,`3*$SIZE_T`($sp)
- $PUSH r16,`4*$SIZE_T`($sp)
- $PUSH r17,`5*$SIZE_T`($sp)
- $PUSH r18,`6*$SIZE_T`($sp)
- $PUSH r19,`7*$SIZE_T`($sp)
- $PUSH r20,`8*$SIZE_T`($sp)
- $PUSH r21,`9*$SIZE_T`($sp)
- $PUSH r22,`10*$SIZE_T`($sp)
- $PUSH r23,`11*$SIZE_T`($sp)
- stfd f14,`12*$SIZE_T+0`($sp)
- stfd f15,`12*$SIZE_T+8`($sp)
- stfd f16,`12*$SIZE_T+16`($sp)
- stfd f17,`12*$SIZE_T+24`($sp)
- stfd f18,`12*$SIZE_T+32`($sp)
- stfd f19,`12*$SIZE_T+40`($sp)
- stfd f20,`12*$SIZE_T+48`($sp)
- stfd f21,`12*$SIZE_T+56`($sp)
- stfd f22,`12*$SIZE_T+64`($sp)
- stfd f23,`12*$SIZE_T+72`($sp)
- stfd f24,`12*$SIZE_T+80`($sp)
- stfd f25,`12*$SIZE_T+88`($sp)
-
+ $PUSH r22,`-12*8-10*$SIZE_T`($i)
+ $PUSH r23,`-12*8-9*$SIZE_T`($i)
+ $PUSH r24,`-12*8-8*$SIZE_T`($i)
+ $PUSH r25,`-12*8-7*$SIZE_T`($i)
+ $PUSH r26,`-12*8-6*$SIZE_T`($i)
+ $PUSH r27,`-12*8-5*$SIZE_T`($i)
+ $PUSH r28,`-12*8-4*$SIZE_T`($i)
+ $PUSH r29,`-12*8-3*$SIZE_T`($i)
+ $PUSH r30,`-12*8-2*$SIZE_T`($i)
+ $PUSH r31,`-12*8-1*$SIZE_T`($i)
+ stfd f20,`-12*8`($i)
+ stfd f21,`-11*8`($i)
+ stfd f22,`-10*8`($i)
+ stfd f23,`-9*8`($i)
+ stfd f24,`-8*8`($i)
+ stfd f25,`-7*8`($i)
+ stfd f26,`-6*8`($i)
+ stfd f27,`-5*8`($i)
+ stfd f28,`-4*8`($i)
+ stfd f29,`-3*8`($i)
+ stfd f30,`-2*8`($i)
+ stfd f31,`-1*8`($i)
+___
+$code.=<<___ if ($SIZE_T==8);
ld $a0,0($ap) ; pull ap[0] value
ld $n0,0($n0) ; pull n0[0] value
ld $t3,0($bp) ; bp[0]
-
+___
+$code.=<<___ if ($SIZE_T==4);
+ mr $t1,$n0
+ lwz $a0,0($ap) ; pull ap[0,1] value
+ lwz $t0,4($ap)
+ lwz $n0,0($t1) ; pull n0[0,1] value
+ lwz $t1,4($t1)
+ lwz $t3,0($bp) ; bp[0,1]
+ lwz $t2,4($bp)
+ insrdi $a0,$t0,32,0
+ insrdi $n0,$t1,32,0
+ insrdi $t3,$t2,32,0
+___
+$code.=<<___;
addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
li $i,-64
add $nap_d,$tp,$num
@@ -258,6 +289,8 @@ $code=<<___;
std $t5,`$FRAME+40`($sp)
std $t6,`$FRAME+48`($sp)
std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -266,6 +299,18 @@ $code=<<___;
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
lfd $ba,`$FRAME+0`($sp)
lfd $bb,`$FRAME+8`($sp)
lfd $bc,`$FRAME+16`($sp)
@@ -374,6 +419,8 @@ $code=<<___;
.align 5
L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@@ -382,6 +429,18 @@ L1st:
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
std $t0,`$FRAME+64`($sp)
std $t1,`$FRAME+72`($sp)
std $t2,`$FRAME+80`($sp)
@@ -559,7 +618,17 @@ L1st:
li $i,8 ; i=1
.align 5
Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
ldx $t3,$bp,$i ; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+ add $t0,$bp,$i
+ lwz $t3,0($t0) ; bp[i,i+1]
+ lwz $t0,4($t0)
+ insrdi $t3,$t0,32,0
+___
+$code.=<<___;
ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
mulld $t7,$a0,$t3 ; ap[0]*bp[i]
@@ -761,6 +830,13 @@ Linner:
stfd $T0b,`$FRAME+8`($sp)
add $t7,$t7,$carry
addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
stfd $T1a,`$FRAME+16`($sp)
stfd $T1b,`$FRAME+24`($sp)
insrdi $t4,$t7,16,0 ; 64..127 bits
@@ -768,6 +844,13 @@ Linner:
stfd $T2a,`$FRAME+32`($sp)
stfd $T2b,`$FRAME+40`($sp)
adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
stfd $T3a,`$FRAME+48`($sp)
stfd $T3b,`$FRAME+56`($sp)
addze $carry,$carry
@@ -816,7 +899,21 @@ Linner:
ld $t7,`$FRAME+72`($sp)
addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
addze $carry,$carry
std $t3,-16($tp) ; tp[j-1]
@@ -835,7 +932,9 @@ Linner:
subf $nap_d,$t7,$nap_d ; rewind pointer
cmpw $i,$num
blt- Louter
+___
+$code.=<<___ if ($SIZE_T==8);
subf $np,$num,$np ; rewind np
addi $j,$j,1 ; restore counter
subfc $i,$i,$i ; j=0 and "clear" XER[CA]
@@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh
stdx $i,$t4,$i
addi $i,$i,16
bdnz- Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+ subf $np,$num,$np ; rewind np
+ addi $j,$j,1 ; restore counter
+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ addi $np,$np,-4
+ addi $rp,$rp,-4
+ addi $ap,$sp,`$FRAME+$TRANSFER+4`
+ mtctr $j
+
+.align 4
+Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
+ ldu $t2,16($tp)
+ lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
+ lwz $t5,8($np)
+ lwz $t6,12($np)
+ lwzu $t7,16($np)
+ extrdi $t1,$t0,32,0
+ extrdi $t3,$t2,32,0
+ subfe $t4,$t4,$t0 ; tp[j]-np[j]
+ stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
+ subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
+ stw $t1,8($ap)
+ subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
+ stw $t2,12($ap)
+ subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
+ stwu $t3,16($ap)
+ stw $t4,4($rp)
+ stw $t5,8($rp)
+ stw $t6,12($rp)
+ stwu $t7,16($rp)
+ bdnz- Lsub
+
+ li $i,0
+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
+ addi $tp,$sp,`$FRAME+$TRANSFER+4`
+ subf $rp,$num,$rp ; rewind rp
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ lwz $t0,4($ap)
+ lwz $t1,8($ap)
+ lwz $t2,12($ap)
+ lwzu $t3,16($ap)
+ std $i,8($nap_d) ; zap nap_d
+ std $i,16($nap_d)
+ std $i,24($nap_d)
+ std $i,32($nap_d)
+ std $i,40($nap_d)
+ std $i,48($nap_d)
+ std $i,56($nap_d)
+ stdu $i,64($nap_d)
+ stw $t0,4($rp)
+ stw $t1,8($rp)
+ stw $t2,12($rp)
+ stwu $t3,16($rp)
+ std $i,8($tp) ; zap tp at once
+ stdu $i,16($tp)
+ bdnz- Lcopy
+___
- $POP r14,`2*$SIZE_T`($sp)
- $POP r15,`3*$SIZE_T`($sp)
- $POP r16,`4*$SIZE_T`($sp)
- $POP r17,`5*$SIZE_T`($sp)
- $POP r18,`6*$SIZE_T`($sp)
- $POP r19,`7*$SIZE_T`($sp)
- $POP r20,`8*$SIZE_T`($sp)
- $POP r21,`9*$SIZE_T`($sp)
- $POP r22,`10*$SIZE_T`($sp)
- $POP r23,`11*$SIZE_T`($sp)
- lfd f14,`12*$SIZE_T+0`($sp)
- lfd f15,`12*$SIZE_T+8`($sp)
- lfd f16,`12*$SIZE_T+16`($sp)
- lfd f17,`12*$SIZE_T+24`($sp)
- lfd f18,`12*$SIZE_T+32`($sp)
- lfd f19,`12*$SIZE_T+40`($sp)
- lfd f20,`12*$SIZE_T+48`($sp)
- lfd f21,`12*$SIZE_T+56`($sp)
- lfd f22,`12*$SIZE_T+64`($sp)
- lfd f23,`12*$SIZE_T+72`($sp)
- lfd f24,`12*$SIZE_T+80`($sp)
- lfd f25,`12*$SIZE_T+88`($sp)
- $POP $sp,0($sp)
+$code.=<<___;
+ $POP $i,0($sp)
li r3,1 ; signal "handled"
+ $POP r22,`-12*8-10*$SIZE_T`($i)
+ $POP r23,`-12*8-9*$SIZE_T`($i)
+ $POP r24,`-12*8-8*$SIZE_T`($i)
+ $POP r25,`-12*8-7*$SIZE_T`($i)
+ $POP r26,`-12*8-6*$SIZE_T`($i)
+ $POP r27,`-12*8-5*$SIZE_T`($i)
+ $POP r28,`-12*8-4*$SIZE_T`($i)
+ $POP r29,`-12*8-3*$SIZE_T`($i)
+ $POP r30,`-12*8-2*$SIZE_T`($i)
+ $POP r31,`-12*8-1*$SIZE_T`($i)
+ lfd f20,`-12*8`($i)
+ lfd f21,`-11*8`($i)
+ lfd f22,`-10*8`($i)
+ lfd f23,`-9*8`($i)
+ lfd f24,`-8*8`($i)
+ lfd f25,`-7*8`($i)
+ lfd f26,`-6*8`($i)
+ lfd f27,`-5*8`($i)
+ lfd f28,`-4*8`($i)
+ lfd f29,`-3*8`($i)
+ lfd f30,`-2*8`($i)
+ lfd f31,`-1*8`($i)
+ mr $sp,$i
blr
.long 0
-.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+ .byte 0,12,4,0,0x8c,10,6,0
+ .long 0
+
+.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/main/openssl/crypto/bn/asm/s390x-gf2m.pl b/main/openssl/crypto/bn/asm/s390x-gf2m.pl
new file mode 100644
index 00000000..9d18d40e
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/s390x-gf2m.pl
@@ -0,0 +1,221 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a low-hanging mechanical port from C for
+# the time being... gcc 4.3 appeared to generate poor code, hence
+# the effort. And indeed, the module delivers a 55%-90%(*) improvement
+# on the heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
+# key lengths on z990, 30%-55%(*) on z10, and 70%-110%(*) on z196.
+# This is for the 64-bit build. In the 32-bit "highgprs" case the
+# improvement is even higher: on z990, for example, it was measured
+# at 80%-150%. ECDSA sign is a modest 9%-12% faster. Keep in mind that
+# these coefficients are not those of bn_GF2m_mul_2x2 itself, as not
+# all CPU time is burnt in it...
+#
+# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
+# so that improvement coefficients can vary from one specific
+# setup to another.
+
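
For reference while reading this module (and the x86 counterparts later in the patch): bn_GF2m_mul_2x2 builds a 128x128-bit carry-less product from three 64x64-bit halves combined Karatsuba-style; over GF(2), subtraction degenerates to XOR. A bit-serial C sketch of the same semantics, assuming 64-bit limbs; the assembly instead walks b in 4-bit windows over the 16-entry table it spills to the stack, but the results agree:

    #include <stdint.h>

    /* carry-less 64x64 -> 128-bit multiply, bit-serial reference */
    static void gf2m_mul_1x1(uint64_t *hi, uint64_t *lo, uint64_t a, uint64_t b)
    {
        uint64_t h = 0, l = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                l ^= a << i;
                h ^= i ? a >> (64 - i) : 0;   /* bits pushed past 2^64 */
            }
        *hi = h; *lo = l;
    }

    /* r[3..0] = (a1*x^64 + a0) * (b1*x^64 + b0) over GF(2) */
    static void gf2m_mul_2x2_ref(uint64_t r[4], uint64_t a1, uint64_t a0,
                                 uint64_t b1, uint64_t b0)
    {
        uint64_t h1, h0, l1, l0, m1, m0;
        gf2m_mul_1x1(&h1, &h0, a1, b1);           /* a1*b1 */
        gf2m_mul_1x1(&l1, &l0, a0, b0);           /* a0*b0 */
        gf2m_mul_1x1(&m1, &m0, a0 ^ a1, b0 ^ b1); /* (a0+a1)*(b0+b1) */
        m0 ^= l0 ^ h0;                            /* middle term */
        m1 ^= l1 ^ h1;
        r[0] = l0;
        r[1] = l1 ^ m0;
        r[2] = h0 ^ m1;
        r[3] = h1;
    }
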
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
+$rp="%r2";
+$a1="%r3";
+$a0="%r4";
+$b1="%r5";
+$b0="%r6";
+
+$ra="%r14";
+$sp="%r15";
+
+@T=("%r0","%r1");
+@i=("%r12","%r13");
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
+($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
+
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@function
+.align 16
+_mul_1x1:
+ lgr $a1,$a
+ sllg $a2,$a,1
+ sllg $a4,$a,2
+ sllg $a8,$a,3
+
+ srag $lo,$a1,63 # broadcast 63rd bit
+ nihh $a1,0x1fff
+ srag @i[0],$a2,63 # broadcast 62nd bit
+ nihh $a2,0x3fff
+ srag @i[1],$a4,63 # broadcast 61st bit
+ nihh $a4,0x7fff
+ ngr $lo,$b
+ ngr @i[0],$b
+ ngr @i[1],$b
+
+ lghi @T[0],0
+ lgr $a12,$a1
+ stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
+ xgr $a12,$a2
+ stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
+ lgr $a48,$a4
+ stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
+ xgr $a48,$a8
+ stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
+ xgr $a1,$a4
+
+ stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
+ xgr $a2,$a4
+ stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
+ xgr $a12,$a4
+ stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
+ xgr $a1,$a48
+ stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
+ xgr $a2,$a48
+
+ stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
+ xgr $a12,$a48
+ stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
+ xgr $a1,$a4
+ stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
+ xgr $a2,$a4
+ stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
+
+ xgr $a12,$a4
+ stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
+ srlg $hi,$lo,1
+ stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
+ sllg $lo,$lo,63
+ stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
+ srlg @T[0],@i[0],2
+ stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
+
+ lghi $mask,`0xf<<3`
+ sllg $a1,@i[0],62
+ sllg @i[0],$b,3
+ srlg @T[1],@i[1],3
+ ngr @i[0],$mask
+ sllg $a2,@i[1],61
+ srlg @i[1],$b,4-3
+ xgr $hi,@T[0]
+ ngr @i[1],$mask
+ xgr $lo,$a1
+ xgr $hi,@T[1]
+ xgr $lo,$a2
+
+ xg $lo,$stdframe(@i[0],$sp)
+ srlg @i[0],$b,8-3
+ ngr @i[0],$mask
+___
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+ lg @T[1],$stdframe(@i[1],$sp)
+ srlg @i[1],$b,`($n+2)*4`-3
+ sllg @T[0],@T[1],`$n*4`
+ ngr @i[1],$mask
+ srlg @T[1],@T[1],`64-$n*4`
+ xgr $lo,@T[0]
+ xgr $hi,@T[1]
+___
+ push(@i,shift(@i)); push(@T,shift(@T));
+}
+$code.=<<___;
+ lg @T[1],$stdframe(@i[1],$sp)
+ sllg @T[0],@T[1],`$n*4`
+ srlg @T[1],@T[1],`64-$n*4`
+ xgr $lo,@T[0]
+ xgr $hi,@T[1]
+
+ lg @T[0],$stdframe(@i[0],$sp)
+ sllg @T[1],@T[0],`($n+1)*4`
+ srlg @T[0],@T[0],`64-($n+1)*4`
+ xgr $lo,@T[1]
+ xgr $hi,@T[0]
+
+ br $ra
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@function
+.align 16
+bn_GF2m_mul_2x2:
+ stm${g} %r3,%r15,3*$SIZE_T($sp)
+
+ lghi %r1,-$stdframe-128
+ la %r0,0($sp)
+ la $sp,0(%r1,$sp) # alloca
+ st${g} %r0,0($sp) # back chain
+___
+if ($SIZE_T==8) {
+my @r=map("%r$_",(6..9));
+$code.=<<___;
+ bras $ra,_mul_1x1 # a1·b1
+ stmg $lo,$hi,16($rp)
+
+ lg $a,`$stdframe+128+4*$SIZE_T`($sp)
+ lg $b,`$stdframe+128+6*$SIZE_T`($sp)
+ bras $ra,_mul_1x1 # a0·b0
+ stmg $lo,$hi,0($rp)
+
+ lg $a,`$stdframe+128+3*$SIZE_T`($sp)
+ lg $b,`$stdframe+128+5*$SIZE_T`($sp)
+ xg $a,`$stdframe+128+4*$SIZE_T`($sp)
+ xg $b,`$stdframe+128+6*$SIZE_T`($sp)
+ bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
+ lmg @r[0],@r[3],0($rp)
+
+ xgr $lo,$hi
+ xgr $hi,@r[1]
+ xgr $lo,@r[0]
+ xgr $hi,@r[2]
+ xgr $lo,@r[3]
+ xgr $hi,@r[3]
+ xgr $lo,$hi
+ stg $hi,16($rp)
+ stg $lo,8($rp)
+___
+} else {
+$code.=<<___;
+ sllg %r3,%r3,32
+ sllg %r5,%r5,32
+ or %r3,%r4
+ or %r5,%r6
+ bras $ra,_mul_1x1
+ rllg $lo,$lo,32
+ rllg $hi,$hi,32
+ stmg $lo,$hi,0($rp)
+___
+}
+$code.=<<___;
+ lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
+ br $ra
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/s390x-mont.pl b/main/openssl/crypto/bn/asm/s390x-mont.pl
index f61246f5..9fd64e81 100644
--- a/main/openssl/crypto/bn/asm/s390x-mont.pl
+++ b/main/openssl/crypto/bn/asm/s390x-mont.pl
@@ -32,6 +32,33 @@
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports what's called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
+# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
+# legacy application context. The feature is not specific to any
+# particular processor, as long as it's a "z-CPU". The latter implies
+# that the code remains z/Architecture specific. Compatibility with
+# 32-bit BN_ULONG is achieved by swapping words after 64-bit loads;
+# follow the _dswap-s. On z990 it was measured to perform 2.6-2.2
+# times better than compiler-generated code, less for longer keys...
+
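
In C terms, the fix-up a _dswap models, under the big-endian assumption: a 64-bit lg over two adjacent 32-bit BN_ULONGs lands the less significant limb in the high half, and rotating by 32 restores the natural 64-bit value, which is exactly what the rllg it expands to does. A small sketch, with a hypothetical helper name:

    #include <stdint.h>
    #include <string.h>

    /* big-endian load of {w[j], w[j+1]}: w[j], the less significant limb,
     * ends up in the high half, so rotate by 32 (rllg reg,reg,32). */
    static uint64_t load_limb_pair_be(const uint32_t *w)
    {
        uint64_t v;
        memcpy(&v, w, sizeof(v));
        return (v << 32) | (v >> 32);   /* rotate by 32 */
    }
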
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
$mn0="%r0";
$num="%r1";
@@ -60,34 +87,44 @@ $code.=<<___;
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
- lgf $num,164($sp) # pull $num
- sla $num,3 # $num to enumerate bytes
+ lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
+ sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
- stg %r2,16($sp)
+ st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ tmll $num,4
+ bnzr %r14 # if ($num&1) return 0;
+___
+$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
+___
+$code.=<<___;
+ stm${g} %r3,%r15,3*$SIZE_T($sp)
- stmg %r3,%r15,24($sp)
-
- lghi $rp,-160-8 # leave room for carry bit
+ lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
- stg %r0,0($sp) # back chain
+ st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
+ _dswap $n0
lg $bi,0($bp)
+ _dswap $bi
lg $alo,0($ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
@@ -95,6 +132,7 @@ bn_mul_mont:
msgr $mn0,$n0
lg $nlo,0($np) #
+ _dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -106,12 +144,14 @@ bn_mul_mont:
.align 16
.L1st:
lg $alo,0($j,$ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
+ _dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -119,22 +159,24 @@ bn_mul_mont:
algr $nlo,$alo
alcgr $NHI,$nhi
- stg $nlo,160-8($j,$sp) # tp[j-1]=
+ stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
+ stg $NHI,$stdframe-8($j,$sp)
+ stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
+ _dswap $bi
lg $alo,0($ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
- alg $alo,160($sp) # +=tp[0]
+ alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
@@ -142,6 +184,7 @@ bn_mul_mont:
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
+ _dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
@@ -153,14 +196,16 @@ bn_mul_mont:
.align 16
.Linner:
lg $alo,0($j,$ap)
+ _dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
- alg $alo,160($j,$sp)# +=tp[j]
+ alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
+ _dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
@@ -168,31 +213,33 @@ bn_mul_mont:
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
- stg $nlo,160-8($j,$sp) # tp[j-1]=
+ stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
- alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
+ alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
+ stg $NHI,$stdframe-8($j,$sp)
+ stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
- clg $bp,160+8+32($j,$sp) # compare to &bp[num]
+ cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
- lg $rp,160+8+16($j,$sp) # reincarnate rp
- la $ap,160($sp)
+ l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
+ la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
- slbg $alo,0($j,$np)
+ lg $nlo,0($j,$np)
+ _dswap $nlo
+ slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
@@ -207,19 +254,24 @@ bn_mul_mont:
la $j,0(%r0)
lgr $count,$num
-.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
- stg $j,160($j,$sp) # zap tp
+.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
+ _dswap $alo
+ stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
- la %r1,160+8+48($j,$sp)
- lmg %r6,%r15,0(%r1)
+ la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+ lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
+ print $_,"\n";
+}
close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/x86-gf2m.S b/main/openssl/crypto/bn/asm/x86-gf2m.S
new file mode 100644
index 00000000..9403a5aa
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86-gf2m.S
@@ -0,0 +1,335 @@
+.file "crypto/bn/asm/x86-gf2m.s"
+.text
+.type _mul_1x1_mmx,@function
+.align 16
+_mul_1x1_mmx:
+ subl $36,%esp
+ movl %eax,%ecx
+ leal (%eax,%eax,1),%edx
+ andl $1073741823,%ecx
+ leal (%edx,%edx,1),%ebp
+ movl $0,(%esp)
+ andl $2147483647,%edx
+ movd %eax,%mm2
+ movd %ebx,%mm3
+ movl %ecx,4(%esp)
+ xorl %edx,%ecx
+ pxor %mm5,%mm5
+ pxor %mm4,%mm4
+ movl %edx,8(%esp)
+ xorl %ebp,%edx
+ movl %ecx,12(%esp)
+ pcmpgtd %mm2,%mm5
+ paddd %mm2,%mm2
+ xorl %edx,%ecx
+ movl %ebp,16(%esp)
+ xorl %edx,%ebp
+ pand %mm3,%mm5
+ pcmpgtd %mm2,%mm4
+ movl %ecx,20(%esp)
+ xorl %ecx,%ebp
+ psllq $31,%mm5
+ pand %mm3,%mm4
+ movl %edx,24(%esp)
+ movl $7,%esi
+ movl %ebp,28(%esp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movl %ebp,%edi
+ psllq $30,%mm4
+ andl %ebx,%edi
+ shrl $3,%ebx
+ movd (%esp,%esi,4),%mm0
+ movl %ebp,%esi
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $3,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $6,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $9,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $12,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $15,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $18,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ movl %ebp,%edi
+ psllq $21,%mm2
+ andl %ebx,%edi
+ shrl $3,%ebx
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ movl %ebp,%esi
+ psllq $24,%mm1
+ andl %ebx,%esi
+ shrl $3,%ebx
+ pxor %mm1,%mm0
+ movd (%esp,%edi,4),%mm2
+ pxor %mm4,%mm0
+ psllq $27,%mm2
+ pxor %mm2,%mm0
+ movd (%esp,%esi,4),%mm1
+ pxor %mm5,%mm0
+ psllq $30,%mm1
+ addl $36,%esp
+ pxor %mm1,%mm0
+ ret
+.size _mul_1x1_mmx,.-_mul_1x1_mmx
+.type _mul_1x1_ialu,@function
+.align 16
+_mul_1x1_ialu:
+ subl $36,%esp
+ movl %eax,%ecx
+ leal (%eax,%eax,1),%edx
+ leal (,%eax,4),%ebp
+ andl $1073741823,%ecx
+ leal (%eax,%eax,1),%edi
+ sarl $31,%eax
+ movl $0,(%esp)
+ andl $2147483647,%edx
+ movl %ecx,4(%esp)
+ xorl %edx,%ecx
+ movl %edx,8(%esp)
+ xorl %ebp,%edx
+ movl %ecx,12(%esp)
+ xorl %edx,%ecx
+ movl %ebp,16(%esp)
+ xorl %edx,%ebp
+ movl %ecx,20(%esp)
+ xorl %ecx,%ebp
+ sarl $31,%edi
+ andl %ebx,%eax
+ movl %edx,24(%esp)
+ andl %ebx,%edi
+ movl %ebp,28(%esp)
+ movl %eax,%edx
+ shll $31,%eax
+ movl %edi,%ecx
+ shrl $1,%edx
+ movl $7,%esi
+ shll $30,%edi
+ andl %ebx,%esi
+ shrl $2,%ecx
+ xorl %edi,%eax
+ shrl $3,%ebx
+ movl $7,%edi
+ andl %ebx,%edi
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ xorl (%esp,%esi,4),%eax
+ movl $7,%esi
+ andl %ebx,%esi
+ shrl $3,%ebx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $3,%ebp
+ andl %ebx,%edi
+ shrl $29,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $6,%ecx
+ andl %ebx,%esi
+ shrl $26,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $9,%ebp
+ andl %ebx,%edi
+ shrl $23,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $12,%ecx
+ andl %ebx,%esi
+ shrl $20,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $15,%ebp
+ andl %ebx,%edi
+ shrl $17,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $18,%ecx
+ andl %ebx,%esi
+ shrl $14,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl $7,%edi
+ movl %ebp,%ecx
+ shll $21,%ebp
+ andl %ebx,%edi
+ shrl $11,%ecx
+ xorl %ebp,%eax
+ shrl $3,%ebx
+ xorl %ecx,%edx
+ movl (%esp,%esi,4),%ecx
+ movl $7,%esi
+ movl %ecx,%ebp
+ shll $24,%ecx
+ andl %ebx,%esi
+ shrl $8,%ebp
+ xorl %ecx,%eax
+ shrl $3,%ebx
+ xorl %ebp,%edx
+ movl (%esp,%edi,4),%ebp
+ movl %ebp,%ecx
+ shll $27,%ebp
+ movl (%esp,%esi,4),%edi
+ shrl $5,%ecx
+ movl %edi,%esi
+ xorl %ebp,%eax
+ shll $30,%edi
+ xorl %ecx,%edx
+ shrl $2,%esi
+ xorl %edi,%eax
+ xorl %esi,%edx
+ addl $36,%esp
+ ret
+.size _mul_1x1_ialu,.-_mul_1x1_ialu
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+.L_bn_GF2m_mul_2x2_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %edx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
+ movl OPENSSL_ia32cap_P@GOT(%edx),%edx
+ movl (%edx),%eax
+ movl 4(%edx),%edx
+ testl $8388608,%eax
+ jz .L001ialu
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%eax
+ movl 32(%esp),%ebx
+ call _mul_1x1_mmx
+ movq %mm0,%mm7
+ movl 28(%esp),%eax
+ movl 36(%esp),%ebx
+ call _mul_1x1_mmx
+ movq %mm0,%mm6
+ movl 24(%esp),%eax
+ movl 32(%esp),%ebx
+ xorl 28(%esp),%eax
+ xorl 36(%esp),%ebx
+ call _mul_1x1_mmx
+ pxor %mm7,%mm0
+ movl 20(%esp),%eax
+ pxor %mm6,%mm0
+ movq %mm0,%mm2
+ psllq $32,%mm0
+ popl %edi
+ psrlq $32,%mm2
+ popl %esi
+ pxor %mm6,%mm0
+ popl %ebx
+ pxor %mm7,%mm2
+ movq %mm0,(%eax)
+ popl %ebp
+ movq %mm2,8(%eax)
+ emms
+ ret
+.align 16
+.L001ialu:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $20,%esp
+ movl 44(%esp),%eax
+ movl 52(%esp),%ebx
+ call _mul_1x1_ialu
+ movl %eax,8(%esp)
+ movl %edx,12(%esp)
+ movl 48(%esp),%eax
+ movl 56(%esp),%ebx
+ call _mul_1x1_ialu
+ movl %eax,(%esp)
+ movl %edx,4(%esp)
+ movl 44(%esp),%eax
+ movl 52(%esp),%ebx
+ xorl 48(%esp),%eax
+ xorl 56(%esp),%ebx
+ call _mul_1x1_ialu
+ movl 40(%esp),%ebp
+ movl (%esp),%ebx
+ movl 4(%esp),%ecx
+ movl 8(%esp),%edi
+ movl 12(%esp),%esi
+ xorl %edx,%eax
+ xorl %ecx,%edx
+ xorl %ebx,%eax
+ movl %ebx,(%ebp)
+ xorl %edi,%edx
+ movl %esi,12(%ebp)
+ xorl %esi,%eax
+ addl $20,%esp
+ xorl %esi,%edx
+ popl %edi
+ xorl %edx,%eax
+ popl %esi
+ movl %edx,8(%ebp)
+ popl %ebx
+ movl %eax,4(%ebp)
+ popl %ebp
+ ret
+.size bn_GF2m_mul_2x2,.-.L_bn_GF2m_mul_2x2_begin
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105
+.byte 99,97,116,105,111,110,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+.comm OPENSSL_ia32cap_P,8,4
diff --git a/main/openssl/crypto/bn/asm/x86-gf2m.pl b/main/openssl/crypto/bn/asm/x86-gf2m.pl
new file mode 100644
index 00000000..b5795302
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86-gf2m.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a low-hanging mechanical port from C for
+# the time being... Except that it has three code paths: pure integer
+# code suitable for any x86 CPU, MMX code suitable for PIII and later
+# and PCLMULQDQ suitable for Westmere and later. Improvement varies
+# from one benchmark and µ-arch to another. Below are interval values
+# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
+# code:
+#
+# PIII 16%-30%
+# P4 12%-12%
+# Opteron 18%-40%
+# Core2 19%-44%
+# Atom 38%-64%
+# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
+# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
+#
+# Note that the improvement coefficients above are not coefficients for
+# bn_GF2m_mul_2x2 itself. For example, a 120% ECDH improvement is the
+# result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the
+# benchmark is more and more dominated by other subroutines, most
+# notably by BN_GF2m_mod[_mul]_arr...
+
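
The PCLMULQDQ path in the table above reduces the whole 64x64 carry-less product to a single instruction. For comparison, the same operation via compiler intrinsics; an illustration only (x86-64 intrinsics, compile with -mpclmul), not code from this patch:

    #include <stdint.h>
    #include <wmmintrin.h>   /* PCLMULQDQ intrinsics */

    /* carry-less 64x64 -> 128-bit multiply in one instruction */
    static void clmul_1x1(uint64_t *hi, uint64_t *lo, uint64_t a, uint64_t b)
    {
        __m128i va = _mm_cvtsi64_si128((long long)a);
        __m128i vb = _mm_cvtsi64_si128((long long)b);
        __m128i r  = _mm_clmulepi64_si128(va, vb, 0x00); /* low qword of each */
        *lo = (uint64_t)_mm_cvtsi128_si64(r);
        *hi = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(r, 8));
    }
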
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+$a="eax";
+$b="ebx";
+($a1,$a2,$a4)=("ecx","edx","ebp");
+
+$R="mm0";
+@T=("mm1","mm2");
+($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
+@i=("esi","edi");
+
+ if (!$x86only) {
+&function_begin_B("_mul_1x1_mmx");
+ &sub ("esp",32+4);
+ &mov ($a1,$a);
+ &lea ($a2,&DWP(0,$a,$a));
+ &and ($a1,0x3fffffff);
+ &lea ($a4,&DWP(0,$a2,$a2));
+ &mov (&DWP(0*4,"esp"),0);
+ &and ($a2,0x7fffffff);
+ &movd ($A,$a);
+ &movd ($B,$b);
+ &mov (&DWP(1*4,"esp"),$a1); # a1
+ &xor ($a1,$a2); # a1^a2
+ &pxor ($B31,$B31);
+ &pxor ($B30,$B30);
+ &mov (&DWP(2*4,"esp"),$a2); # a2
+ &xor ($a2,$a4); # a2^a4
+ &mov (&DWP(3*4,"esp"),$a1); # a1^a2
+ &pcmpgtd($B31,$A); # broadcast 31st bit
+ &paddd ($A,$A); # $A<<=1
+ &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
+ &mov (&DWP(4*4,"esp"),$a4); # a4
+ &xor ($a4,$a2); # a2=a4^a2^a4
+ &pand ($B31,$B);
+ &pcmpgtd($B30,$A); # broadcast 30th bit
+ &mov (&DWP(5*4,"esp"),$a1); # a1^a4
+ &xor ($a4,$a1); # a1^a2^a4
+ &psllq ($B31,31);
+ &pand ($B30,$B);
+ &mov (&DWP(6*4,"esp"),$a2); # a2^a4
+ &mov (@i[0],0x7);
+ &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
+ &mov ($a4,@i[0]);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ &mov (@i[1],$a4);
+ &psllq ($B30,30);
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &movd ($R,&DWP(0,"esp",@i[0],4));
+ &mov (@i[0],$a4);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ for($n=1;$n<9;$n++) {
+ &movd (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@i[1],$a4);
+ &psllq (@T[1],3*$n);
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &pxor ($R,@T[1]);
+
+ push(@i,shift(@i)); push(@T,shift(@T));
+ }
+ &movd (@T[1],&DWP(0,"esp",@i[1],4));
+ &pxor ($R,$B30);
+ &psllq (@T[1],3*$n++);
+ &pxor ($R,@T[1]);
+
+ &movd (@T[0],&DWP(0,"esp",@i[0],4));
+ &pxor ($R,$B31);
+ &psllq (@T[0],3*$n);
+ &add ("esp",32+4);
+ &pxor ($R,@T[0]);
+ &ret ();
+&function_end_B("_mul_1x1_mmx");
+ }
+
+($lo,$hi)=("eax","edx");
+@T=("ecx","ebp");
+
+&function_begin_B("_mul_1x1_ialu");
+ &sub ("esp",32+4);
+ &mov ($a1,$a);
+ &lea ($a2,&DWP(0,$a,$a));
+ &lea ($a4,&DWP(0,"",$a,4));
+ &and ($a1,0x3fffffff);
+ &lea (@i[1],&DWP(0,$lo,$lo));
+ &sar ($lo,31); # broadcast 31st bit
+ &mov (&DWP(0*4,"esp"),0);
+ &and ($a2,0x7fffffff);
+ &mov (&DWP(1*4,"esp"),$a1); # a1
+ &xor ($a1,$a2); # a1^a2
+ &mov (&DWP(2*4,"esp"),$a2); # a2
+ &xor ($a2,$a4); # a2^a4
+ &mov (&DWP(3*4,"esp"),$a1); # a1^a2
+ &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
+ &mov (&DWP(4*4,"esp"),$a4); # a4
+ &xor ($a4,$a2); # a2=a4^a2^a4
+ &mov (&DWP(5*4,"esp"),$a1); # a1^a4
+ &xor ($a4,$a1); # a1^a2^a4
+	&sar	(@i[1],31);		# broadcast 30th bit
+ &and ($lo,$b);
+ &mov (&DWP(6*4,"esp"),$a2); # a2^a4
+ &and (@i[1],$b);
+ &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
+ &mov ($hi,$lo);
+ &shl ($lo,31);
+ &mov (@T[0],@i[1]);
+ &shr ($hi,1);
+
+ &mov (@i[0],0x7);
+ &shl (@i[1],30);
+ &and (@i[0],$b);
+ &shr (@T[0],2);
+ &xor ($lo,@i[1]);
+
+ &shr ($b,3);
+ &mov (@i[1],0x7); # 5-byte instruction!?
+ &and (@i[1],$b);
+ &shr ($b,3);
+ &xor ($hi,@T[0]);
+ &xor ($lo,&DWP(0,"esp",@i[0],4));
+ &mov (@i[0],0x7);
+ &and (@i[0],$b);
+ &shr ($b,3);
+ for($n=1;$n<9;$n++) {
+ &mov (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@i[1],0x7);
+ &mov (@T[0],@T[1]);
+ &shl (@T[1],3*$n);
+ &and (@i[1],$b);
+ &shr (@T[0],32-3*$n);
+ &xor ($lo,@T[1]);
+ &shr ($b,3);
+ &xor ($hi,@T[0]);
+
+ push(@i,shift(@i)); push(@T,shift(@T));
+ }
+ &mov (@T[1],&DWP(0,"esp",@i[1],4));
+ &mov (@T[0],@T[1]);
+ &shl (@T[1],3*$n);
+ &mov (@i[1],&DWP(0,"esp",@i[0],4));
+ &shr (@T[0],32-3*$n); $n++;
+ &mov (@i[0],@i[1]);
+ &xor ($lo,@T[1]);
+ &shl (@i[1],3*$n);
+ &xor ($hi,@T[0]);
+ &shr (@i[0],32-3*$n);
+ &xor ($lo,@i[1]);
+ &xor ($hi,@i[0]);
+
+ &add ("esp",32+4);
+ &ret ();
+&function_end_B("_mul_1x1_ialu");
+
+# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+&function_begin_B("bn_GF2m_mul_2x2");
+if (!$x86only) {
+ &picmeup("edx","OPENSSL_ia32cap_P");
+ &mov ("eax",&DWP(0,"edx"));
+ &mov ("edx",&DWP(4,"edx"));
+ &test ("eax",1<<23); # check MMX bit
+ &jz (&label("ialu"));
+if ($sse2) {
+ &test ("eax",1<<24); # check FXSR bit
+ &jz (&label("mmx"));
+ &test ("edx",1<<1); # check PCLMULQDQ bit
+ &jz (&label("mmx"));
+
+ &movups ("xmm0",&QWP(8,"esp"));
+ &shufps ("xmm0","xmm0",0b10110001);
+ &pclmulqdq ("xmm0","xmm0",1);
+ &mov ("eax",&DWP(4,"esp"));
+ &movups (&QWP(0,"eax"),"xmm0");
+ &ret ();
+
+&set_label("mmx",16);
+}
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &call ("_mul_1x1_mmx"); # a1·b1
+ &movq ("mm7",$R);
+
+ &mov ($a,&wparam(2));
+ &mov ($b,&wparam(4));
+ &call ("_mul_1x1_mmx"); # a0·b0
+ &movq ("mm6",$R);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &xor ($a,&wparam(2));
+ &xor ($b,&wparam(4));
+ &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
+ &pxor ($R,"mm7");
+ &mov ($a,&wparam(0));
+ &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
+
+ &movq ($A,$R);
+ &psllq ($R,32);
+ &pop ("edi");
+ &psrlq ($A,32);
+ &pop ("esi");
+ &pxor ($R,"mm6");
+ &pop ("ebx");
+ &pxor ($A,"mm7");
+ &movq (&QWP(0,$a),$R);
+ &pop ("ebp");
+ &movq (&QWP(8,$a),$A);
+ &emms ();
+ &ret ();
+&set_label("ialu",16);
+}
+ &push ("ebp");
+ &push ("ebx");
+ &push ("esi");
+ &push ("edi");
+ &stack_push(4+1);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &call ("_mul_1x1_ialu"); # a1·b1
+ &mov (&DWP(8,"esp"),$lo);
+ &mov (&DWP(12,"esp"),$hi);
+
+ &mov ($a,&wparam(2));
+ &mov ($b,&wparam(4));
+ &call ("_mul_1x1_ialu"); # a0·b0
+ &mov (&DWP(0,"esp"),$lo);
+ &mov (&DWP(4,"esp"),$hi);
+
+ &mov ($a,&wparam(1));
+ &mov ($b,&wparam(3));
+ &xor ($a,&wparam(2));
+ &xor ($b,&wparam(4));
+ &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
+
+ &mov ("ebp",&wparam(0));
+ @r=("ebx","ecx","edi","esi");
+ &mov (@r[0],&DWP(0,"esp"));
+ &mov (@r[1],&DWP(4,"esp"));
+ &mov (@r[2],&DWP(8,"esp"));
+ &mov (@r[3],&DWP(12,"esp"));
+
+ &xor ($lo,$hi);
+ &xor ($hi,@r[1]);
+ &xor ($lo,@r[0]);
+ &mov (&DWP(0,"ebp"),@r[0]);
+ &xor ($hi,@r[2]);
+ &mov (&DWP(12,"ebp"),@r[3]);
+ &xor ($lo,@r[3]);
+ &stack_pop(4+1);
+ &xor ($hi,@r[3]);
+ &pop ("edi");
+ &xor ($lo,$hi);
+ &pop ("esi");
+ &mov (&DWP(8,"ebp"),$hi);
+ &pop ("ebx");
+ &mov (&DWP(4,"ebp"),$lo);
+ &pop ("ebp");
+ &ret ();
+&function_end_B("bn_GF2m_mul_2x2");
+
+&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/main/openssl/crypto/bn/asm/x86-mont.S b/main/openssl/crypto/bn/asm/x86-mont.S
new file mode 100644
index 00000000..2bbb0e3a
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86-mont.S
@@ -0,0 +1,338 @@
+.file "crypto/bn/asm/x86-mont.s"
+.text
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+.L_bn_mul_mont_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ movl 40(%esp),%edi
+ cmpl $4,%edi
+ jl .L000just_leave
+ leal 20(%esp),%esi
+ leal 24(%esp),%edx
+ movl %esp,%ebp
+ addl $2,%edi
+ negl %edi
+ leal -32(%esp,%edi,4),%esp
+ negl %edi
+ movl %esp,%eax
+ subl %edx,%eax
+ andl $2047,%eax
+ subl %eax,%esp
+ xorl %esp,%edx
+ andl $2048,%edx
+ xorl $2048,%edx
+ subl %edx,%esp
+ andl $-64,%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl 16(%esi),%esi
+ movl (%esi),%esi
+ movl %eax,4(%esp)
+ movl %ebx,8(%esp)
+ movl %ecx,12(%esp)
+ movl %edx,16(%esp)
+ movl %esi,20(%esp)
+ leal -3(%edi),%ebx
+ movl %ebp,24(%esp)
+ movl 8(%esp),%esi
+ leal 1(%ebx),%ebp
+ movl 12(%esp),%edi
+ xorl %ecx,%ecx
+ movl %esi,%edx
+ andl $1,%ebp
+ subl %edi,%edx
+ leal 4(%edi,%ebx,4),%eax
+ orl %edx,%ebp
+ movl (%edi),%edi
+ jz .L001bn_sqr_mont
+ movl %eax,28(%esp)
+ movl (%esi),%eax
+ xorl %edx,%edx
+.align 16
+.L002mull:
+ movl %edx,%ebp
+ mull %edi
+ addl %eax,%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ movl (%esi,%ecx,4),%eax
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L002mull
+ movl %edx,%ebp
+ mull %edi
+ movl 20(%esp),%edi
+ addl %ebp,%eax
+ movl 16(%esp),%esi
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ movl %eax,32(%esp,%ebx,4)
+ xorl %ecx,%ecx
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ movl (%esi),%eax
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ incl %ecx
+ jmp .L0032ndmadd
+.align 16
+.L0041stmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L0041stmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ addl %eax,%ebp
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ xorl %ecx,%ecx
+ addl 36(%esp,%ebx,4),%edx
+ movl %ebp,32(%esp,%ebx,4)
+ adcl $0,%ecx
+ movl (%esi),%eax
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ movl $1,%ecx
+.align 16
+.L0032ndmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0032ndmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ xorl %eax,%eax
+ movl 12(%esp),%ecx
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ leal 4(%ecx),%ecx
+ movl %edx,32(%esp,%ebx,4)
+ cmpl 28(%esp),%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L005common_tail
+ movl (%ecx),%edi
+ movl 8(%esp),%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ movl (%esi),%eax
+ jmp .L0041stmadd
+.align 16
+.L001bn_sqr_mont:
+ movl %ebx,(%esp)
+ movl %ecx,12(%esp)
+ movl %edi,%eax
+ mull %edi
+ movl %eax,32(%esp)
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+ incl %ecx
+.align 16
+.L006sqr:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ leal (%ebx,%eax,2),%ebp
+ shrl $31,%eax
+ cmpl (%esp),%ecx
+ movl %eax,%ebx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L006sqr
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ leal (%ebx,%eax,2),%ebp
+ imull 32(%esp),%edi
+ shrl $31,%eax
+ movl %ebp,32(%esp,%ecx,4)
+ leal (%eax,%edx,2),%ebp
+ movl (%esi),%eax
+ shrl $31,%edx
+ movl %ebp,36(%esp,%ecx,4)
+ movl %edx,40(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl %ecx,%ebx
+ adcl $0,%edx
+ movl 4(%esi),%eax
+ movl $1,%ecx
+.align 16
+.L0073rdmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl 4(%esi,%ecx,4),%eax
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %edx,%ebp
+ mull %edi
+ addl 36(%esp,%ecx,4),%ebp
+ leal 2(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0073rdmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ movl 12(%esp),%ecx
+ xorl %eax,%eax
+ movl 8(%esp),%esi
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ movl %edx,32(%esp,%ebx,4)
+ cmpl %ebx,%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L005common_tail
+ movl 4(%esi,%ecx,4),%edi
+ leal 1(%ecx),%ecx
+ movl %edi,%eax
+ movl %ecx,12(%esp)
+ mull %edi
+ addl 32(%esp,%ecx,4),%eax
+ adcl $0,%edx
+ movl %eax,32(%esp,%ecx,4)
+ xorl %ebp,%ebp
+ cmpl %ebx,%ecx
+ leal 1(%ecx),%ecx
+ je .L008sqrlast
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+.align 16
+.L009sqradd:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal (%eax,%eax,1),%ebp
+ adcl $0,%edx
+ shrl $31,%eax
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%eax
+ addl %ebx,%ebp
+ adcl $0,%eax
+ cmpl (%esp),%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %eax,%ebx
+ jle .L009sqradd
+ movl %edx,%ebp
+ addl %edx,%edx
+ shrl $31,%ebp
+ addl %ebx,%edx
+ adcl $0,%ebp
+.L008sqrlast:
+ movl 20(%esp),%edi
+ movl 16(%esp),%esi
+ imull 32(%esp),%edi
+ addl 32(%esp,%ecx,4),%edx
+ movl (%esi),%eax
+ adcl $0,%ebp
+ movl %edx,32(%esp,%ecx,4)
+ movl %ebp,36(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ leal -1(%ecx),%ebx
+ adcl $0,%edx
+ movl $1,%ecx
+ movl 4(%esi),%eax
+ jmp .L0073rdmadd
+.align 16
+.L005common_tail:
+ movl 16(%esp),%ebp
+ movl 4(%esp),%edi
+ leal 32(%esp),%esi
+ movl (%esi),%eax
+ movl %ebx,%ecx
+ xorl %edx,%edx
+.align 16
+.L010sub:
+ sbbl (%ebp,%edx,4),%eax
+ movl %eax,(%edi,%edx,4)
+ decl %ecx
+ movl 4(%esi,%edx,4),%eax
+ leal 1(%edx),%edx
+ jge .L010sub
+ sbbl $0,%eax
+ andl %eax,%esi
+ notl %eax
+ movl %edi,%ebp
+ andl %eax,%ebp
+ orl %ebp,%esi
+.align 16
+.L011copy:
+ movl (%esi,%ebx,4),%eax
+ movl %eax,(%edi,%ebx,4)
+ movl %ecx,32(%esp,%ebx,4)
+ decl %ebx
+ jge .L011copy
+ movl 24(%esp),%esp
+ movl $1,%eax
+.L000just_leave:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_mont,.-.L_bn_mul_mont_begin
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte 111,114,103,62,0
diff --git a/main/openssl/crypto/bn/asm/x86-mont.pl b/main/openssl/crypto/bn/asm/x86-mont.pl
index 5cd3cd2e..e8f6b050 100755
--- a/main/openssl/crypto/bn/asm/x86-mont.pl
+++ b/main/openssl/crypto/bn/asm/x86-mont.pl
@@ -527,8 +527,10 @@ $sbit=$num;
&jle (&label("sqradd"));
&mov ($carry,"edx");
- &lea ("edx",&DWP(0,$sbit,"edx",2));
+ &add ("edx","edx");
&shr ($carry,31);
+ &add ("edx",$sbit);
+ &adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
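
What this hunk appears to fix: the removed lea computed word = sbit + 2*word modulo 2^32, and the earlier shr captured only the bit shifted out by the doubling, so a carry out of the final +sbit addition was lost. The add/shr/add/adc sequence keeps the full 33-bit result. The same step in C, as an illustration only:

    #include <stdint.h>

    /* double the high word, fold in the saved bit, keep the full carry */
    static void double_plus_sbit(uint32_t *word, uint32_t *carry, uint32_t sbit)
    {
        uint64_t t = 2 * (uint64_t)*word + sbit;  /* up to 33 bits wide */
        *word  = (uint32_t)t;
        *carry = (uint32_t)(t >> 32);  /* the lea-based code dropped this bit
                                          when adding sbit overflowed */
    }
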
diff --git a/main/openssl/crypto/bn/asm/x86_64-gcc.c b/main/openssl/crypto/bn/asm/x86_64-gcc.c
index acb0b401..329946e5 100644
--- a/main/openssl/crypto/bn/asm/x86_64-gcc.c
+++ b/main/openssl/crypto/bn/asm/x86_64-gcc.c
@@ -66,7 +66,7 @@
#undef sqr
/*
- * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
+ * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
diff --git a/main/openssl/crypto/bn/asm/x86_64-gf2m.S b/main/openssl/crypto/bn/asm/x86_64-gf2m.S
new file mode 100644
index 00000000..ccd2ed70
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-gf2m.S
@@ -0,0 +1,291 @@
+.text
+
+.type _mul_1x1,@function
+.align 16
+_mul_1x1:
+ subq $128+8,%rsp
+ movq $-1,%r9
+ leaq (%rax,%rax,1),%rsi
+ shrq $3,%r9
+ leaq (,%rax,4),%rdi
+ andq %rax,%r9
+ leaq (,%rax,8),%r12
+ sarq $63,%rax
+ leaq (%r9,%r9,1),%r10
+ sarq $63,%rsi
+ leaq (,%r9,4),%r11
+ andq %rbp,%rax
+ sarq $63,%rdi
+ movq %rax,%rdx
+ shlq $63,%rax
+ andq %rbp,%rsi
+ shrq $1,%rdx
+ movq %rsi,%rcx
+ shlq $62,%rsi
+ andq %rbp,%rdi
+ shrq $2,%rcx
+ xorq %rsi,%rax
+ movq %rdi,%rbx
+ shlq $61,%rdi
+ xorq %rcx,%rdx
+ shrq $3,%rbx
+ xorq %rdi,%rax
+ xorq %rbx,%rdx
+
+ movq %r9,%r13
+ movq $0,0(%rsp)
+ xorq %r10,%r13
+ movq %r9,8(%rsp)
+ movq %r11,%r14
+ movq %r10,16(%rsp)
+ xorq %r12,%r14
+ movq %r13,24(%rsp)
+
+ xorq %r11,%r9
+ movq %r11,32(%rsp)
+ xorq %r11,%r10
+ movq %r9,40(%rsp)
+ xorq %r11,%r13
+ movq %r10,48(%rsp)
+ xorq %r14,%r9
+ movq %r13,56(%rsp)
+ xorq %r14,%r10
+
+ movq %r12,64(%rsp)
+ xorq %r14,%r13
+ movq %r9,72(%rsp)
+ xorq %r11,%r9
+ movq %r10,80(%rsp)
+ xorq %r11,%r10
+ movq %r13,88(%rsp)
+
+ xorq %r11,%r13
+ movq %r14,96(%rsp)
+ movq %r8,%rsi
+ movq %r9,104(%rsp)
+ andq %rbp,%rsi
+ movq %r10,112(%rsp)
+ shrq $4,%rbp
+ movq %r13,120(%rsp)
+ movq %r8,%rdi
+ andq %rbp,%rdi
+ shrq $4,%rbp
+
+ movq (%rsp,%rsi,8),%xmm0
+ movq %r8,%rsi
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $4,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $60,%rbx
+ xorq %rcx,%rax
+ pslldq $1,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $12,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $52,%rbx
+ xorq %rcx,%rax
+ pslldq $2,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $20,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $44,%rbx
+ xorq %rcx,%rax
+ pslldq $3,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $28,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $36,%rbx
+ xorq %rcx,%rax
+ pslldq $4,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $36,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $28,%rbx
+ xorq %rcx,%rax
+ pslldq $5,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $44,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $20,%rbx
+ xorq %rcx,%rax
+ pslldq $6,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %r8,%rdi
+ movq %rcx,%rbx
+ shlq $52,%rcx
+ andq %rbp,%rdi
+ movq (%rsp,%rsi,8),%xmm1
+ shrq $12,%rbx
+ xorq %rcx,%rax
+ pslldq $7,%xmm1
+ movq %r8,%rsi
+ shrq $4,%rbp
+ xorq %rbx,%rdx
+ andq %rbp,%rsi
+ shrq $4,%rbp
+ pxor %xmm1,%xmm0
+ movq (%rsp,%rdi,8),%rcx
+ movq %rcx,%rbx
+ shlq $60,%rcx
+.byte 102,72,15,126,198
+ shrq $4,%rbx
+ xorq %rcx,%rax
+ psrldq $8,%xmm0
+ xorq %rbx,%rdx
+.byte 102,72,15,126,199
+ xorq %rsi,%rax
+ xorq %rdi,%rdx
+
+ addq $128+8,%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,@function
+.align 16
+bn_GF2m_mul_2x2:
+ movq OPENSSL_ia32cap_P(%rip),%rax
+ btq $33,%rax
+ jnc .Lvanilla_mul_2x2
+
+.byte 102,72,15,110,198
+.byte 102,72,15,110,201
+.byte 102,72,15,110,210
+.byte 102,73,15,110,216
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+.byte 102,15,58,68,193,0
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+.byte 102,15,58,68,211,0
+.byte 102,15,58,68,229,0
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4
+ movdqa %xmm4,%xmm5
+ pslldq $8,%xmm4
+ psrldq $8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ .byte 0xf3,0xc3
+
+.align 16
+.Lvanilla_mul_2x2:
+ leaq -136(%rsp),%rsp
+ movq %r14,80(%rsp)
+ movq %r13,88(%rsp)
+ movq %r12,96(%rsp)
+ movq %rbp,104(%rsp)
+ movq %rbx,112(%rsp)
+.Lbody_mul_2x2:
+ movq %rdi,32(%rsp)
+ movq %rsi,40(%rsp)
+ movq %rdx,48(%rsp)
+ movq %rcx,56(%rsp)
+ movq %r8,64(%rsp)
+
+ movq $15,%r8
+ movq %rsi,%rax
+ movq %rcx,%rbp
+ call _mul_1x1
+ movq %rax,16(%rsp)
+ movq %rdx,24(%rsp)
+
+ movq 48(%rsp),%rax
+ movq 64(%rsp),%rbp
+ call _mul_1x1
+ movq %rax,0(%rsp)
+ movq %rdx,8(%rsp)
+
+ movq 40(%rsp),%rax
+ movq 56(%rsp),%rbp
+ xorq 48(%rsp),%rax
+ xorq 64(%rsp),%rbp
+ call _mul_1x1
+ movq 0(%rsp),%rbx
+ movq 8(%rsp),%rcx
+ movq 16(%rsp),%rdi
+ movq 24(%rsp),%rsi
+ movq 32(%rsp),%rbp
+
+ xorq %rdx,%rax
+ xorq %rcx,%rdx
+ xorq %rbx,%rax
+ movq %rbx,0(%rbp)
+ xorq %rdi,%rdx
+ movq %rsi,24(%rbp)
+ xorq %rsi,%rax
+ xorq %rsi,%rdx
+ xorq %rdx,%rax
+ movq %rdx,16(%rbp)
+ movq %rax,8(%rbp)
+
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbp
+ movq 112(%rsp),%rbx
+ leaq 136(%rsp),%rsp
+ .byte 0xf3,0xc3
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
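The tail of bn_GF2m_mul_2x2 above is one level of Karatsuba over GF(2)[x]: three 64x64 carry-less products (a1·b1, a0·b0 and (a0+a1)·(b0+b1), where addition is XOR) are recombined into the 256-bit result purely with XORs, since subtraction in GF(2) is XOR as well, so three multiplies do the work of four. A portable C sketch of the routine (the bit-serial `gf2m_mul_1x1` below is only a stand-in for the table-driven `_mul_1x1` assembly):

    #include <stdint.h>

    /* Schoolbook 64x64 carry-less multiply; stand-in for _mul_1x1. */
    static void gf2m_mul_1x1(uint64_t *hi, uint64_t *lo,
                             uint64_t a, uint64_t b)
    {
        uint64_t h = 0, l = 0;
        int i;
        for (i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                l ^= a << i;
                h ^= i ? a >> (64 - i) : 0;
            }
        *hi = h; *lo = l;
    }

    /* Karatsuba 2x2 over GF(2)[x]; mirrors the XOR combination above. */
    static void gf2m_mul_2x2(uint64_t r[4], uint64_t a1, uint64_t a0,
                             uint64_t b1, uint64_t b0)
    {
        uint64_t p0, p1, q0, q1, m0, m1;

        gf2m_mul_1x1(&p1, &p0, a0, b0);            /* a0*b0 */
        gf2m_mul_1x1(&q1, &q0, a1, b1);            /* a1*b1 */
        gf2m_mul_1x1(&m1, &m0, a0 ^ a1, b0 ^ b1);  /* (a0+a1)*(b0+b1) */

        r[0] = p0;                                 /* low 128 bits      */
        r[1] = p1 ^ m0 ^ p0 ^ q0;                  /* middle, low word  */
        r[2] = q0 ^ m1 ^ p1 ^ q1;                  /* middle, high word */
        r[3] = q1;                                 /* high 128 bits     */
    }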
diff --git a/main/openssl/crypto/bn/asm/x86_64-gf2m.pl b/main/openssl/crypto/bn/asm/x86_64-gf2m.pl
new file mode 100644
index 00000000..42bbec2f
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-gf2m.pl
@@ -0,0 +1,390 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
+# the time being... Except that it has two code paths: one suitable
+# for any x86_64 CPU and a PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and µ-arch to another.
+# The vanilla code path is at most 20% faster than compiler-generated
+# code [not very impressive], while the PCLMULQDQ one is a whole
+# 85%-160% better on 163- and 571-bit ECDH benchmarks on Intel CPUs.
+# Keep in mind that these coefficients are not the ones for
+# bn_GF2m_mul_2x2 itself, as not all CPU time is burnt in it...
+
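On the PCLMULQDQ path each of the three Karatsuba products costs a single instruction. A hedged intrinsics equivalent of one 1x1 step (illustrative only; compile with -mpclmul, and note that the immediate of `_mm_clmulepi64_si128` selects which 64-bit halves are multiplied):

    #include <stdint.h>
    #include <wmmintrin.h>   /* PCLMULQDQ intrinsics */

    /* One 64x64 -> 128-bit carry-less multiply, as used three times by
     * the fast path of bn_GF2m_mul_2x2 below. */
    static __m128i clmul_1x1(uint64_t a, uint64_t b)
    {
        __m128i va = _mm_cvtsi64_si128((long long)a);
        __m128i vb = _mm_cvtsi64_si128((long long)b);
        return _mm_clmulepi64_si128(va, vb, 0x00);  /* low halves */
    }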
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($lo,$hi)=("%rax","%rdx"); $a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
+($R,$Tx)=("%xmm0","%xmm1");
+
+$code.=<<___;
+.text
+
+.type _mul_1x1,\@abi-omnipotent
+.align 16
+_mul_1x1:
+ sub \$128+8,%rsp
+ mov \$-1,$a1
+ lea ($a,$a),$i0
+ shr \$3,$a1
+ lea (,$a,4),$i1
+ and $a,$a1 # a1=a&0x1fffffffffffffff
+ lea (,$a,8),$a8
+ sar \$63,$a # broadcast 63rd bit
+ lea ($a1,$a1),$a2
+ sar \$63,$i0 # broadcast 62nd bit
+ lea (,$a1,4),$a4
+ and $b,$a
+ sar \$63,$i1 # broadcast 61st bit
+ mov $a,$hi # $a is $lo
+ shl \$63,$lo
+ and $b,$i0
+ shr \$1,$hi
+ mov $i0,$t1
+ shl \$62,$i0
+ and $b,$i1
+ shr \$2,$t1
+ xor $i0,$lo
+ mov $i1,$t0
+ shl \$61,$i1
+ xor $t1,$hi
+ shr \$3,$t0
+ xor $i1,$lo
+ xor $t0,$hi
+
+ mov $a1,$a12
+ movq \$0,0(%rsp) # tab[0]=0
+ xor $a2,$a12 # a1^a2
+ mov $a1,8(%rsp) # tab[1]=a1
+ mov $a4,$a48
+ mov $a2,16(%rsp) # tab[2]=a2
+ xor $a8,$a48 # a4^a8
+ mov $a12,24(%rsp) # tab[3]=a1^a2
+
+ xor $a4,$a1
+ mov $a4,32(%rsp) # tab[4]=a4
+ xor $a4,$a2
+ mov $a1,40(%rsp) # tab[5]=a1^a4
+ xor $a4,$a12
+ mov $a2,48(%rsp) # tab[6]=a2^a4
+ xor $a48,$a1 # a1^a4^a4^a8=a1^a8
+ mov $a12,56(%rsp) # tab[7]=a1^a2^a4
+ xor $a48,$a2 # a2^a4^a4^a8=a2^a8
+
+ mov $a8,64(%rsp) # tab[8]=a8
+ xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
+ mov $a1,72(%rsp) # tab[9]=a1^a8
+ xor $a4,$a1 # a1^a8^a4
+ mov $a2,80(%rsp) # tab[10]=a2^a8
+ xor $a4,$a2 # a2^a8^a4
+ mov $a12,88(%rsp) # tab[11]=a1^a2^a8
+
+ xor $a4,$a12 # a1^a2^a8^a4
+ mov $a48,96(%rsp) # tab[12]=a4^a8
+ mov $mask,$i0
+ mov $a1,104(%rsp) # tab[13]=a1^a4^a8
+ and $b,$i0
+ mov $a2,112(%rsp) # tab[14]=a2^a4^a8
+ shr \$4,$b
+ mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
+ mov $mask,$i1
+ and $b,$i1
+ shr \$4,$b
+
+ movq (%rsp,$i0,8),$R # half of the calculations are done in SSE2
+ mov $mask,$i0
+ and $b,$i0
+ shr \$4,$b
+___
+ for ($n=1;$n<8;$n++) {
+ $code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $mask,$i1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ and $b,$i1
+ movq (%rsp,$i0,8),$Tx
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ pslldq \$$n,$Tx
+ mov $mask,$i0
+ shr \$4,$b
+ xor $t0,$hi
+ and $b,$i0
+ shr \$4,$b
+ pxor $Tx,$R
+___
+ }
+$code.=<<___;
+ mov (%rsp,$i1,8),$t1
+ mov $t1,$t0
+ shl \$`8*$n-4`,$t1
+ movq $R,$i0
+ shr \$`64-(8*$n-4)`,$t0
+ xor $t1,$lo
+ psrldq \$8,$R
+ xor $t0,$hi
+ movq $R,$i1
+ xor $i0,$lo
+ xor $i1,$hi
+
+ add \$128+8,%rsp
+ ret
+.Lend_mul_1x1:
+.size _mul_1x1,.-_mul_1x1
+___
+
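The `tab[]` comments above describe the vanilla path's windowed multiply: sixteen precomputed GF(2) multiples of the low 61 bits of `a` (so no table entry overflows 64 bits), indexed by successive 4-bit nibbles of `b`, with the top three bits of `a` folded in afterwards via the broadcast masks built by the `sar` instructions. The same scheme in C, as a sketch under those assumptions (this is not OpenSSL's actual C fallback):

    #include <stdint.h>

    static void gf2m_mul_1x1_win4(uint64_t *hi, uint64_t *lo,
                                  uint64_t a, uint64_t b)
    {
        uint64_t tab[16], h = 0, l = 0;
        uint64_t a1 = a & 0x1fffffffffffffffULL;  /* a1=a&0x1fff... */
        int i;

        tab[0] = 0;                    /* tab[i] = a1 * i over GF(2) */
        tab[1] = a1;
        for (i = 2; i < 16; i += 2) {
            tab[i]     = tab[i >> 1] << 1;
            tab[i + 1] = tab[i] ^ a1;
        }

        for (i = 0; i < 64; i += 4) {  /* one nibble of b per step */
            uint64_t t = tab[(b >> i) & 0xf];
            l ^= t << i;
            h ^= i ? t >> (64 - i) : 0;
        }

        /* top three bits of a, applied with broadcast masks like the
         * sar/and pairs in the assembly */
        for (i = 61; i < 64; i++) {
            uint64_t m = 0 - ((a >> i) & 1);
            l ^= (b << i) & m;
            h ^= (b >> (64 - i)) & m;
        }

        *hi = h; *lo = l;
    }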
+($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl bn_GF2m_mul_2x2
+.type bn_GF2m_mul_2x2,\@abi-omnipotent
+.align 16
+bn_GF2m_mul_2x2:
+ mov OPENSSL_ia32cap_P(%rip),%rax
+ bt \$33,%rax
+ jnc .Lvanilla_mul_2x2
+
+ movq $a1,%xmm0
+ movq $b1,%xmm1
+ movq $a0,%xmm2
+___
+$code.=<<___ if ($win64);
+ movq 40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+ movq $b0,%xmm3
+___
+$code.=<<___;
+ movdqa %xmm0,%xmm4
+ movdqa %xmm1,%xmm5
+ pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
+ pxor %xmm2,%xmm4
+ pxor %xmm3,%xmm5
+ pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
+ pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
+ xorps %xmm0,%xmm4
+ xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ movdqa %xmm4,%xmm5
+ pslldq \$8,%xmm4
+ psrldq \$8,%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm0
+ movdqu %xmm2,0($rp)
+ movdqu %xmm0,16($rp)
+ ret
+
+.align 16
+.Lvanilla_mul_2x2:
+ lea -8*17(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ mov `8*17+40`(%rsp),$b0
+ mov %rdi,8*15(%rsp)
+ mov %rsi,8*16(%rsp)
+___
+$code.=<<___;
+ mov %r14,8*10(%rsp)
+ mov %r13,8*11(%rsp)
+ mov %r12,8*12(%rsp)
+ mov %rbp,8*13(%rsp)
+ mov %rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+ mov $rp,32(%rsp) # save the arguments
+ mov $a1,40(%rsp)
+ mov $a0,48(%rsp)
+ mov $b1,56(%rsp)
+ mov $b0,64(%rsp)
+
+ mov \$0xf,$mask
+ mov $a1,$a
+ mov $b1,$b
+ call _mul_1x1 # a1·b1
+ mov $lo,16(%rsp)
+ mov $hi,24(%rsp)
+
+ mov 48(%rsp),$a
+ mov 64(%rsp),$b
+ call _mul_1x1 # a0·b0
+ mov $lo,0(%rsp)
+ mov $hi,8(%rsp)
+
+ mov 40(%rsp),$a
+ mov 56(%rsp),$b
+ xor 48(%rsp),$a
+ xor 64(%rsp),$b
+ call _mul_1x1 # (a0+a1)·(b0+b1)
+___
+ @r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+ mov 0(%rsp),@r[0]
+ mov 8(%rsp),@r[1]
+ mov 16(%rsp),@r[2]
+ mov 24(%rsp),@r[3]
+ mov 32(%rsp),%rbp
+
+ xor $hi,$lo
+ xor @r[1],$hi
+ xor @r[0],$lo
+ mov @r[0],0(%rbp)
+ xor @r[2],$hi
+ mov @r[3],24(%rbp)
+ xor @r[3],$lo
+ xor @r[3],$hi
+ xor $hi,$lo
+ mov $hi,16(%rbp)
+ mov $lo,8(%rbp)
+
+ mov 8*10(%rsp),%r14
+ mov 8*11(%rsp),%r13
+ mov 8*12(%rsp),%r12
+ mov 8*13(%rsp),%rbp
+ mov 8*14(%rsp),%rbx
+___
+$code.=<<___ if ($win64);
+ mov 8*15(%rsp),%rdi
+ mov 8*16(%rsp),%rsi
+___
+$code.=<<___;
+ lea 8*17(%rsp),%rsp
+ ret
+.Lend_mul_2x2:
+.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody_mul_2x2(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lin_prologue
+
+ mov 8*10(%rax),%r14 # mimic epilogue
+ mov 8*11(%rax),%r13
+ mov 8*12(%rax),%r12
+ mov 8*13(%rax),%rbp
+ mov 8*14(%rax),%rbx
+ mov 8*15(%rax),%rdi
+ mov 8*16(%rax),%rsi
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+.Lin_prologue:
+ lea 8*17(%rax),%rax
+ mov %rax,152($context) # restore context->Rsp
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva _mul_1x1
+ .rva .Lend_mul_1x1
+ .rva .LSEH_info_1x1
+
+ .rva .Lvanilla_mul_2x2
+ .rva .Lend_mul_2x2
+ .rva .LSEH_info_2x2
+.section .xdata
+.align 8
+.LSEH_info_1x1:
+ .byte 0x01,0x07,0x02,0x00
+ .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
+.LSEH_info_2x2:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/asm/x86_64-mont.S b/main/openssl/crypto/bn/asm/x86_64-mont.S
new file mode 100644
index 00000000..95e29052
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-mont.S
@@ -0,0 +1,1374 @@
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 2(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.type bn_sqr4x_mont,@function
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ movq %rsp,%r11
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,2),%rsp
+ andq $-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdi,32(%rsp)
+ movq %rcx,40(%rsp)
+ movq %r8,48(%rsp)
+ movq %r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,16(%rdi,%rcx,1)
+
+
+ movq 24(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 32(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ leaq 16(%rbp),%rbp
+ movq %r12,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ movq -24(%rdi,%rbp,1),%r10
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ addq -16(%rdi,%rbp,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+ xorq %r12,%r12
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ leaq 16(%rcx),%rcx
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi)
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi)
+
+ movq -8(%rsi),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ movq %rdx,%r13
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi)
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 64(%rsp,%r9,2),%rdi
+ xorq %r10,%r10
+ movq -24(%rdi,%rbp,2),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,-40(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+ movq 40(%rsp),%rsi
+ movq 48(%rsp),%r8
+ xorq %rcx,%rcx
+ movq %r9,0(%rsp)
+ subq %r9,%rcx
+ movq 64(%rsp),%r10
+ movq %r8,%r14
+ leaq 64(%rsp,%r9,2),%rax
+ leaq 64(%rsp,%r9,1),%rdi
+ movq %rax,8(%rsp)
+ leaq (%rsi,%r9,1),%rsi
+ xorq %rbp,%rbp
+
+ movq 0(%rsi,%rcx,1),%rax
+ movq 8(%rsi,%rcx,1),%r9
+ imulq %r10,%r14
+ movq %rax,%rbx
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r8,%r15
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+ imulq %r11,%r15
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq (%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 8(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ cmpq $0,%rcx
+ jne .Lsqr4x_mont_inner
+
+ subq 0(%rsp),%rcx
+ movq %r8,%r14
+
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi)
+
+ xorq %r11,%r11
+ addq (%rdi),%r10
+ adcq $0,%r11
+ movq 0(%rsi,%rcx,1),%rbx
+ addq %rbp,%r10
+ adcq $0,%r11
+
+ imulq 16(%rdi,%rcx,1),%r14
+ xorq %r12,%r12
+ movq 8(%rsi,%rcx,1),%r9
+ addq %r10,%r13
+ movq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi)
+
+ xorq %rbp,%rbp
+ addq 8(%rdi),%r12
+ adcq %rbp,%rbp
+ addq %r11,%r12
+ leaq 16(%rdi),%rdi
+ adcq $0,%rbp
+ movq %r12,-8(%rdi)
+ cmpq 8(%rsp),%rdi
+ jb .Lsqr4x_mont_outer
+
+ movq 0(%rsp),%r9
+ movq %rbp,(%rdi)
+ movq 64(%rsp,%r9,1),%rax
+ leaq 64(%rsp,%r9,1),%rbx
+ movq 40(%rsp),%rsi
+ shrq $5,%r9
+ movq 8(%rbx),%rdx
+ xorq %rbp,%rbp
+
+ movq 32(%rsp),%rdi
+ subq 0(%rsi),%rax
+ movq 16(%rbx),%r10
+ movq 24(%rbx),%r11
+ sbbq 8(%rsi),%rdx
+ leaq -1(%r9),%rcx
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ movq %rax,0(%rdi,%rbp,8)
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq 32(%rbx,%rbp,8),%rax
+ movq 40(%rbx,%rbp,8),%rdx
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+ movq %r11,24(%rdi,%rbp,8)
+ sbbq 32(%rsi,%rbp,8),%rax
+ movq 48(%rbx,%rbp,8),%r10
+ movq 56(%rbx,%rbp,8),%r11
+ sbbq 40(%rsi,%rbp,8),%rdx
+ leaq 4(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %rax,0(%rdi,%rbp,8)
+ movq 32(%rbx,%rbp,8),%rax
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+
+ sbbq $0,%rax
+ movq %r11,24(%rdi,%rbp,8)
+ xorq %rbp,%rbp
+ andq %rax,%rbx
+ notq %rax
+ movq %rdi,%rsi
+ andq %rax,%rsi
+ leaq -1(%r9),%rcx
+ orq %rsi,%rbx
+
+ pxor %xmm0,%xmm0
+ leaq 64(%rsp,%r9,8),%rsi
+ movdqu (%rbx),%xmm1
+ leaq (%rsi,%r9,8),%rsi
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,(%rsi)
+ movdqu %xmm1,(%rdi)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy:
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqu 32(%rbx,%rbp,1),%xmm1
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,96(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqa %xmm0,32(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movdqu %xmm1,32(%rdi,%rbp,1)
+ leaq 32(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_copy
+
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movq 56(%rsp),%rsi
+ movq $1,%rax
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
diff --git a/main/openssl/crypto/bn/asm/x86_64-mont.pl b/main/openssl/crypto/bn/asm/x86_64-mont.pl
index 3b7a6f24..17fb94c8 100755
--- a/main/openssl/crypto/bn/asm/x86_64-mont.pl
+++ b/main/openssl/crypto/bn/asm/x86_64-mont.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -15,6 +15,20 @@
# respectable 50%. It remains to be seen if loop unrolling and
# dedicated squaring routine can provide further improvement...
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but on average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule inner loops in such a manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. Average performance improvement in comparison
+# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
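For orientation, what the `.L1st`/`.Linner` loops below compute is standard word-level Montgomery multiplication: for each word `bp[i]`, accumulate `ap[]*bp[i]` into the temporary, then add `m1*np[]` with `m1 = tp[0]*n0 mod 2^64` so the lowest word cancels and the whole vector shifts down one word. A reference sketch in C (names, the VLA and the 128-bit type are assumptions of this sketch; the final conditional subtraction is left as a comment):

    #include <stdint.h>

    typedef unsigned __int128 u128;   /* gcc/clang extension */

    static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
                             const uint64_t *bp, const uint64_t *np,
                             uint64_t n0, int num)
    {
        uint64_t t[num + 2];          /* tp[], as alloca'd in the asm */
        int i, j;

        for (j = 0; j < num + 2; j++) t[j] = 0;

        for (i = 0; i < num; i++) {
            u128 c = 0;
            for (j = 0; j < num; j++) {           /* t += ap[]*bp[i] */
                c += (u128)ap[j] * bp[i] + t[j];
                t[j] = (uint64_t)c;
                c >>= 64;
            }
            c += t[num];
            t[num] = (uint64_t)c;
            t[num + 1] = (uint64_t)(c >> 64);

            uint64_t m1 = t[0] * n0;              /* kills t[0] mod 2^64 */
            c = 0;
            for (j = 0; j < num; j++) {           /* t += m1*np[] */
                c += (u128)m1 * np[j] + t[j];
                t[j] = (uint64_t)c;
                c >>= 64;
            }
            c += t[num];
            t[num] = (uint64_t)c;
            t[num + 1] += (uint64_t)(c >> 64);

            for (j = 0; j <= num; j++)            /* shift down one word */
                t[j] = t[j + 1];
            t[num + 1] = 0;
        }
        for (j = 0; j < num; j++) rp[j] = t[j];
        /* one conditional subtraction of np[] may still be needed;
         * see the .Lsub/.Lcopy tail */
    }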
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -26,7 +40,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
# int bn_mul_mont(
$rp="%rdi"; # BN_ULONG *rp,
@@ -37,7 +52,6 @@ $n0="%r8"; # const BN_ULONG *n0,
$num="%r9"; # int num);
$lo0="%r10";
$hi0="%r11";
-$bp="%r12"; # reassign $bp
$hi1="%r13";
$i="%r14";
$j="%r15";
@@ -51,6 +65,16 @@ $code=<<___;
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ cmp $ap,$bp
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
push %rbx
push %rbp
push %r12
@@ -66,48 +90,66 @@ bn_mul_mont:
and \$-1024,%rsp # minimize TLB usage
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lprologue:
- mov %rdx,$bp # $bp reassigned, remember?
-
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
- mov ($bp),$m0 # m0=bp[0]
- mov ($ap),%rax
+ mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
- mov %rdx,$hi0
+ mov ($np),%rax
- imulq $n0,%rax # "tp[0]"*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
- mulq ($np) # np[0]*m1
- add $lo0,%rax # discarded
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1
lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
.L1st:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[0]
- add $hi0,%rax
adc \$0,%rdx
- mov %rax,$lo0
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
mov ($np,$j,8),%rax
- mov %rdx,$hi0
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
+ cmp $num,$j
+ jne .L1st
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
- cmp $num,$j
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .L1st
+ mov $lo0,$hi0
xor %rdx,%rdx
add $hi0,$hi1
@@ -116,50 +158,64 @@ bn_mul_mont:
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
-.align 4
+ jmp .Louter
+.align 16
.Louter:
- xor $j,$j # j=0
-
mov ($bp,$i,8),$m0 # m0=bp[i]
- mov ($ap),%rax # ap[0]
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
mulq $m0 # ap[0]*bp[i]
- add (%rsp),%rax # ap[0]*bp[i]+tp[0]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
adc \$0,%rdx
- mov %rax,$lo0
- mov %rdx,$hi0
- imulq $n0,%rax # tp[0]*n0
- mov %rax,$m1
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
- mulq ($np,$j,8) # np[0]*m1
- add $lo0,%rax # discarded
- mov 8(%rsp),$lo0 # tp[1]
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1
lea 1($j),$j # j++
-.align 4
+ jmp .Linner_enter
+
+.align 16
.Linner:
+ add %rax,$hi1
mov ($ap,$j,8),%rax
- mulq $m0 # ap[j]*bp[i]
- add $hi0,%rax
adc \$0,%rdx
- add %rax,$lo0 # ap[j]*bp[i]+tp[j]
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
mulq $m1 # np[j]*m1
- add $hi1,%rax
- lea 1($j),$j # j++
- adc \$0,%rdx
- add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
+ cmp $num,$j
+ jne .Linner
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
- cmp $num,$j
- mov %rax,-16(%rsp,$j,8) # tp[j-1]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
- jl .Linner
xor %rdx,%rdx
add $hi0,$hi1
@@ -173,35 +229,449 @@ bn_mul_mont:
cmp $num,$i
jl .Louter
- lea (%rsp),$ap # borrow ap for tp
- lea -1($num),$j # j=num-1
-
- mov ($ap),%rax # tp[0]
xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
- dec $j # doesn't affect CF!
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
- jge .Lsub
+ dec $j # doesn't affect CF!
+ jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
- lea -1($num),$j
+ mov $num,$j # j=num
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont,\@function,6
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 4($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+$code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
mov ($ap,$j,8),%rax
- mov %rax,($rp,$j,8) # rp[i]=tp[i]
- mov $i,(%rsp,$j,8) # zap temporary vector
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j+=2
+ adc \$0,%rdx
+ mov $N[1],(%rsp) # tp[j-1]
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
dec $j
- jge .Lcopy
+ jnz .Lcopy4x
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax
mov (%rsi),%r15
@@ -211,9 +681,823 @@ bn_mul_mont:
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
ret
-.size bn_mul_mont,.-bn_mul_mont
+.size bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+ {{{
+######################################################################
+# void bn_sqr4x_mont(
+my $rptr="%rdi"; # const BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # not used
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 4 and
+ # not less than 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type bn_sqr4x_mont,\@function,6
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ shl \$3,${num}d # convert $num to bytes
+ xor %r10,%r10
+ mov %rsp,%r11 # put aside %rsp
+ sub $num,%r10 # -$num
+ mov ($n0),$n0 # *n0
+ lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num)
+ and \$-1024,%rsp # minimize TLB usage
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved $rptr
+ # +40 saved $nptr
+ # +48 saved *n0
+ # +56 saved %rsp
+ # +64 t[2*$num]
+ #
+ mov $rptr,32(%rsp) # save $rptr
+ mov $nptr,40(%rsp)
+ mov $n0, 48(%rsp)
+ mov %r11, 56(%rsp) # save original %rsp
+.Lsqr4x_body:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
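In C terms, the two steps just described amount to the following sketch (illustrative only, not the generated code; `u128` is the gcc/clang 128-bit integer extension):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* t[] has 2*num words: a) cross products once, b) double the whole
     * vector by one bit and add the diagonal squares. */
    static void sqr_ref(uint64_t *t, const uint64_t *a, int num)
    {
        int i, j;
        u128 c;

        for (i = 0; i < 2 * num; i++) t[i] = 0;

        for (i = 0; i < num; i++) {          /* a) a[i]*a[j], i < j */
            c = 0;
            for (j = i + 1; j < num; j++) {
                c += (u128)a[i] * a[j] + t[i + j];
                t[i + j] = (uint64_t)c;
                c >>= 64;
            }
            t[i + num] = (uint64_t)c;
        }

        for (i = 2 * num - 1; i > 0; i--)    /* b) shift left by one bit */
            t[i] = (t[i] << 1) | (t[i - 1] >> 63);
        t[0] <<= 1;

        c = 0;                               /* ... and add a[i]*a[i] */
        for (i = 0; i < num; i++) {
            c += (u128)a[i] * a[i] + t[2 * i];
            t[2 * i] = (uint64_t)c;
            c >>= 64;
            c += t[2 * i + 1];
            t[2 * i + 1] = (uint64_t)c;
            c >>= 64;
        }
    }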
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[4]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[4]*a[1]
+ add %rax,$A1[0] # a[4]*a[1]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[0]
+ add %rax,$A0[0] # a[5]*a[0]+a[4]*a[1]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[5]
+
+ mov 16($aptr,$j),$ai # a[6]
+ xor $A1[0],$A1[0]
+ mul $a1 # a[5]*a[1]
+ add %rax,$A1[1] # a[5]*a[1]+t[6]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[6]*a[0]
+ add %rax,$A0[1] # a[6]*a[0]+a[5]*a[1]+t[6]
+ mov $ai,%rax # a[6]
+ adc %rdx,$A0[0]
+ mov $A0[1],16($tptr,$j) # t[6]
+
+
+ mov 24($aptr,$j),$ai # a[7]
+ xor $A1[1],$A1[1]
+ mul $a1 # a[6]*a[1]
+ add %rax,$A1[0] # a[6]*a[1]+t[7]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 32($j),$j
+ adc \$0,$A0[1]
+ mul $a0 # a[7]*a[0]
+ add %rax,$A0[0] # a[7]*a[0]+a[6]*a[1]+t[7]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[7]*a[1]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[8]
+ lea 16($i),$i
+ mov $A1[0],8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer: # comments apply to $num==6 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mov -24($tptr,$i),$A0[0] # t[1]
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ xor $A0[0],$A0[0]
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ lea -16($i),$j # j=-16
+ xor $A1[0],$A1[0]
+
+
+ mov 8($aptr,$j),$ai # a[3]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],8($tptr,$j) # t[3]
+
+ lea 16($j),$j
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ mov ($aptr,$j),$ai # a[4]
+ xor $A1[0],$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ adc %rdx,$A1[0]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ adc %rdx,$A0[0]
+ mov $A0[1],($tptr,$j) # t[4]
+
+ mov 8($aptr,$j),$ai # a[5]
+ xor $A1[1],$A1[1]
+ add 8($tptr,$j),$A1[0]
+ adc \$0,$A1[1]
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A1[1]
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ lea 16($j),$j # j++
+ adc \$0,$A0[1]
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ xor $A0[1],$A0[1]
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ adc %rdx,$A0[1]
+ mov $A0[0],-24($tptr) # t[1]
+
+ xor $A0[0],$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ adc \$0,$A0[0]
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc %rdx,$A0[0]
+ mov $A0[1],-16($tptr) # t[2]
+
+ mov -8($aptr),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+
+ xor $A0[1],$A0[1]
+ add $A1[0],$A0[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A0[1]
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc %rdx,$A0[1]
+ mov $A0[0],-8($tptr) # t[3]
+
+ xor $A1[0],$A1[0]
+ add $A0[1],$A1[1]
+ adc \$0,$A1[0]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1]
+ mov -16($aptr),%rax # a[2]
+ adc %rdx,$A1[0]
+
+ mov $A1[1],($tptr) # t[4]
+ mov $A1[0],8($tptr) # t[5]
+
+ mul $ai # a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+ add \$16,$i
+ xor $shift,$shift
+ sub $num,$i # $i=16-$num
+ xor $carry,$carry
+
+ add $A1[0],%rax # t[5]
+ adc \$0,%rdx
+ mov %rax,8($tptr) # t[5]
+ mov %rdx,16($tptr) # t[6]
+ mov $carry,24($tptr) # t[7]
+
+ mov -16($aptr,$i),%rax # a[0]
+ lea 64(%rsp,$num,2),$tptr
+ xor $A0[0],$A0[0] # t[0]
+ mov -24($tptr,$i,2),$A0[1] # t[1]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+ lea 16($i),$i
+ mov $S[3],-40($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],-24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],-16($tptr,$i,2)
+ adc %rdx,$S[3]
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ mov $S[3],-8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov 8($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[0],0($tptr,$i,2)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+ mov $S[1],8($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[2]
+ mov 16($aptr,$i),%rax # a[i+1] # prefetch
+ mov $S[2],16($tptr,$i,2)
+ adc %rdx,$S[3]
+ mov $S[3],24($tptr,$i,2)
+ sbb $carry,$carry # mov cf,$carry
+ add \$32,$i
+ jnz .Lsqr4x_shift_n_add
+
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[1] # | t[2*i]>>63
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
+ adc %rax,$S[0]
+ mov -8($aptr),%rax # a[i+1] # prefetch
+ mov $S[0],-32($tptr)
+ adc %rdx,$S[1]
+
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+ mov $S[1],-24($tptr)
+ sbb $carry,$carry # mov cf,$carry
+ shr \$63,$A0[0]
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
+ shr \$63,$A0[1]
+ or $A0[0],$S[3] # | t[2*i]>>63
+ mul %rax # a[i]*a[i]
+ neg $carry # mov $carry,cf
+ adc %rax,$S[2]
+ adc %rdx,$S[3]
+ mov $S[2],-16($tptr)
+ mov $S[3],-8($tptr)
+___
+}
+##############################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
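+# In C terms (illustrative sketch only, carries elided, n0 = -n^-1 mod
+# 2^64, u128 = unsigned __int128):
+#
+#	for (i = 0; i < num; i++) {
+#		m = t[i] * n0;			/* mod 2^64 */
+#		for (j = 0; j < num; j++)
+#			t[i+j] += (u128)m * n[j];	/* zeroes t[i] */
+#	}
+#	/* result before final subtraction: t[num..2*num-1] plus $topbit */
+#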
+{
+my ($topbit,$nptr)=("%rbp",$aptr);
+my ($m0,$m1)=($a0,$a1);
+my @Ni=("%rbx","%r9");
+$code.=<<___;
+ mov 40(%rsp),$nptr # restore $nptr
+ mov 48(%rsp),$n0 # restore *n0
+ xor $j,$j
+ mov $num,0(%rsp) # save $num
+ sub $num,$j # $j=-$num
+ mov 64(%rsp),$A0[0] # t[0] # modsched #
+ mov $n0,$m0 # # modsched #
+ lea 64(%rsp,$num,2),%rax # end of t[] buffer
+ lea 64(%rsp,$num),$tptr # end of t[] window
+ mov %rax,8(%rsp) # save end of t[] buffer
+ lea ($nptr,$num),$nptr # end of n[] buffer
+ xor $topbit,$topbit # $topbit=0
+
+ mov 0($nptr,$j),%rax # n[0] # modsched #
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ imulq $A0[0],$m0 # m0=t[0]*n0 # modsched #
+ mov %rax,$Ni[0] # # modsched #
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xor $A0[1],$A0[1]
+ mul $m0 # n[0]*m0
+ add %rax,$A0[0] # n[0]*m0+t[0]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+ mov $n0,$m1
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[1]*m0
+ add %rax,$A0[1] # n[1]*m0+t[1]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+ imulq $A0[1],$m1
+
+ mov 16($nptr,$j),$Ni[0] # n[2]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[0]*m1
+ add %rax,$A1[0] # n[0]*m1+"t[1]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[1]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[2]*m0
+ add %rax,$A0[0] # n[2]*m0+t[2]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[3]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[1]*m1
+ add %rax,$A1[1] # n[1]*m1+"t[2]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[2]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[3]*m0
+ add %rax,$A0[1] # n[3]*m0+t[3]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ mov ($nptr,$j),$Ni[0] # n[4]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[2]*m1
+ add %rax,$A1[0] # n[2]*m1+"t[3]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr,$j) # "t[3]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[4]*m0
+ add %rax,$A0[0] # n[4]*m0+t[4]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 8($nptr,$j),$Ni[1] # n[5]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[3]*m1
+ add %rax,$A1[1] # n[3]*m1+"t[4]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr,$j) # "t[4]"
+
+ xor $A0[0],$A0[0]
+ add 8($tptr,$j),$A0[1]
+ adc \$0,$A0[0]
+ mul $m0 # n[5]*m0
+ add %rax,$A0[1] # n[5]*m0+t[5]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+
+
+ mov 16($nptr,$j),$Ni[0] # n[6]
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[4]*m1
+ add %rax,$A1[0] # n[4]*m1+"t[5]"
+ mov $Ni[0],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],8($tptr,$j) # "t[5]"
+
+ xor $A0[1],$A0[1]
+ add 16($tptr,$j),$A0[0]
+ adc \$0,$A0[1]
+ mul $m0 # n[6]*m0
+ add %rax,$A0[0] # n[6]*m0+t[6]
+ mov $Ni[1],%rax
+ adc %rdx,$A0[1]
+
+ mov 24($nptr,$j),$Ni[1] # n[7]
+ xor $A1[0],$A1[0]
+ add $A0[0],$A1[1]
+ adc \$0,$A1[0]
+ mul $m1 # n[5]*m1
+ add %rax,$A1[1] # n[5]*m1+"t[6]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[0]
+ mov $A1[1],16($tptr,$j) # "t[6]"
+
+ xor $A0[0],$A0[0]
+ add 24($tptr,$j),$A0[1]
+ lea 32($j),$j
+ adc \$0,$A0[0]
+ mul $m0 # n[7]*m0
+ add %rax,$A0[1] # n[7]*m0+t[7]
+ mov $Ni[0],%rax
+ adc %rdx,$A0[0]
+ cmp \$0,$j
+ jne .Lsqr4x_mont_inner
+
+ sub 0(%rsp),$j # $j=-$num # modsched #
+ mov $n0,$m0 # # modsched #
+
+ xor $A1[1],$A1[1]
+ add $A0[1],$A1[0]
+ adc \$0,$A1[1]
+ mul $m1 # n[6]*m1
+ add %rax,$A1[0] # n[6]*m1+"t[7]"
+ mov $Ni[1],%rax
+ adc %rdx,$A1[1]
+ mov $A1[0],-8($tptr) # "t[7]"
+
+ xor $A0[1],$A0[1]
+ add ($tptr),$A0[0] # +t[8]
+ adc \$0,$A0[1]
+ mov 0($nptr,$j),$Ni[0] # n[0] # modsched #
+ add $topbit,$A0[0]
+ adc \$0,$A0[1]
+
+ imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
+ xor $A1[0],$A1[0]
+ mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
+ add $A0[0],$A1[1]
+ mov 16($tptr,$j),$A0[0] # t[0] # modsched #
+ adc \$0,$A1[0]
+ mul $m1 # n[7]*m1
+ add %rax,$A1[1] # n[7]*m1+"t[8]"
+ mov $Ni[0],%rax # # modsched #
+ adc %rdx,$A1[0]
+ mov $A1[1],($tptr) # "t[8]"
+
+ xor $topbit,$topbit
+ add 8($tptr),$A1[0] # +t[9]
+ adc $topbit,$topbit
+ add $A0[1],$A1[0]
+ lea 16($tptr),$tptr # "t[$num]>>128"
+ adc \$0,$topbit
+ mov $A1[0],-8($tptr) # "t[9]"
+ cmp 8(%rsp),$tptr # are we done?
+ jb .Lsqr4x_mont_outer
+
+ mov 0(%rsp),$num # restore $num
+ mov $topbit,($tptr) # save $topbit
+___
+}
+##############################################################
+# Post-condition, 4x unrolled copy from bn_mul_mont
+#
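+# In C terms (illustrative sketch; the select has to stay constant-time):
+#
+#	borrow = sub_words(rp, tp, np, num);	/* rp = tp - np */
+#	mask = 0 - (BN_ULONG)borrow;		/* all-ones iff tp < np */
+#	for (i = 0; i < num; i++) {
+#		rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
+#		tp[i] = 0;			/* zap temporary vector */
+#	}
+#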
+{
+my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
+$code.=<<___;
+ mov 64(%rsp,$num),@ri[0] # tp[0]
+ lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result
+ mov 40(%rsp),$nptr # restore $nptr
+ shr \$5,$num # num/4
+ mov 8($tptr),@ri[1] # t[1]
+ xor $i,$i # i=0 and clear CF!
+
+ mov 32(%rsp),$rptr # restore $rptr
+ sub 0($nptr),@ri[0]
+ mov 16($tptr),@ri[2] # t[2]
+ mov 24($tptr),@ri[3] # t[3]
+ sbb 8($nptr),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($nptr,$i,8),@ri[2]
+ mov 32($tptr,$i,8),@ri[0] # tp[i+1]
+ mov 40($tptr,$i,8),@ri[1]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($nptr,$i,8),@ri[0]
+ mov 48($tptr,$i,8),@ri[2]
+ mov 56($tptr,$i,8),@ri[3]
+ sbb 40($nptr,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsqr4x_sub
+
+ mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($tptr,$i,8),@ri[0] # load overflow bit
+ sbb 16($nptr,$i,8),@ri[2]
+ mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($nptr,$i,8),@ri[3]
+ mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$tptr
+ not @ri[0]
+ mov $rptr,$nptr
+ and @ri[0],$nptr
+ lea -1($num),$j
+ or $nptr,$tptr # tp=borrow?tp:rp
+
+ pxor %xmm0,%xmm0
+ lea 64(%rsp,$num,8),$nptr
+ movdqu ($tptr),%xmm1
+ lea ($nptr,$num,8),$nptr
+ movdqa %xmm0,64(%rsp) # zap lower half of temporary vector
+ movdqa %xmm0,($nptr) # zap upper half of temporary vector
+ movdqu %xmm1,($rptr)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy: # copy or in-place refresh
+ movdqu 16($tptr,$i),%xmm2
+ movdqu 32($tptr,$i),%xmm1
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+ movdqu %xmm1,32($rptr,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lsqr4x_copy
+
+ movdqu 16($tptr,$i),%xmm2
+ movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector
+ movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector
+ movdqu %xmm2,16($rptr,$i)
+___
+}
+$code.=<<___;
+ mov 56(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ ret
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+___
+}}}
+$code.=<<___;
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
@@ -228,9 +1512,9 @@ $disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
+.type mul_handler,\@abi-omnipotent
.align 16
-se_handler:
+mul_handler:
push %rsi
push %rdi
push %rbx
@@ -245,15 +1529,20 @@ se_handler:
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lprologue(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lin_prologue
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
- lea .Lepilogue(%rip),%r10
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lin_prologue
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
@@ -272,7 +1561,53 @@ se_handler:
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
-.Lin_prologue:
+ jmp .Lcommon_seh_tail
+.size mul_handler,.-mul_handler
+
+.type sqr_handler,\@abi-omnipotent
+.align 16
+sqr_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lsqr4x_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lsqr4x_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lsqr4x_epilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lsqr4x_epilogue
+ jae .Lcommon_seh_tail
+
+ mov 56(%rax),%rax # pull saved stack pointer
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
@@ -310,7 +1645,7 @@ se_handler:
pop %rdi
pop %rsi
ret
-.size se_handler,.-se_handler
+.size sqr_handler,.-sqr_handler
.section .pdata
.align 4
@@ -318,11 +1653,27 @@ se_handler:
.rva .LSEH_end_bn_mul_mont
.rva .LSEH_info_bn_mul_mont
+ .rva .LSEH_begin_bn_mul4x_mont
+ .rva .LSEH_end_bn_mul4x_mont
+ .rva .LSEH_info_bn_mul4x_mont
+
+ .rva .LSEH_begin_bn_sqr4x_mont
+ .rva .LSEH_end_bn_sqr4x_mont
+ .rva .LSEH_info_bn_sqr4x_mont
+
.section .xdata
.align 8
.LSEH_info_bn_mul_mont:
.byte 9,0,0,0
- .rva se_handler
+ .rva mul_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
___
}
diff --git a/main/openssl/crypto/bn/asm/x86_64-mont5.S b/main/openssl/crypto/bn/asm/x86_64-mont5.S
new file mode 100644
index 00000000..49ec6ac6
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-mont5.S
@@ -0,0 +1,784 @@
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,@function
+.align 64
+bn_mul_mont_gather5:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 2(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[0])
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[1])
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[i+1])
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3	# rep ret
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type bn_mul4x_mont_gather5,@function
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ movl %r9d,%r9d
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%rax
+ leaq 4(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[0])
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[1])
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ xorq %r15,%r15
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-40(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.byte	102,72,15,126,195	# movq %xmm0,%rbx (gathered bp[i+1])
+ movq %rdi,-16(%rsp,%r15,8)
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3	# rep ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.globl bn_scatter5
+.type bn_scatter5,@function
+.align 16
+bn_scatter5:
+ cmpq $0,%rsi
+ jz .Lscatter_epilogue
+ leaq (%rdx,%rcx,8),%rdx
+.Lscatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subq $1,%rsi
+ jnz .Lscatter
+.Lscatter_epilogue:
+	.byte	0xf3,0xc3	# rep ret
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,@function
+.align 16
+bn_gather5:
+ movq %rcx,%r11
+ shrq $3,%rcx
+ andq $7,%r11
+ notq %rcx
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%rcx
+ leaq 96(%rdx,%r11,8),%rdx
+ movq 0(%rax,%rcx,8),%xmm4
+ movq 8(%rax,%rcx,8),%xmm5
+ movq 16(%rax,%rcx,8),%xmm6
+ movq 24(%rax,%rcx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq -96(%rdx),%xmm0
+ movq -32(%rdx),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%rdx),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%rdx),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%rdx),%rdx
+ por %xmm3,%xmm0
+
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subq $1,%rsi
+ jnz .Lgather
+	.byte	0xf3,0xc3	# rep ret
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+.align 64
+.Lmagic_masks:
+.long 0,0, 0,0, 0,0, -1,-1
+.long 0,0, 0,0, 0,0, 0,0
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/main/openssl/crypto/bn/asm/x86_64-mont5.pl b/main/openssl/crypto/bn/asm/x86_64-mont5.pl
new file mode 100755
index 00000000..dae0fe24
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-mont5.pl
@@ -0,0 +1,1071 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to a powers table computed in BN_mod_exp_mont_consttime.
+# In addition, a subroutine that scatters elements of the powers table
+# is implemented, so that scatter/gather can be tuned without
+# modifying bn_exp.c.
+
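+# Conceptually each gathered word is assembled as (illustrative C
+# sketch, not part of this file):
+#
+#	w = 0;
+#	for (line = 0; line < 4; line++)	/* 4 cache lines per row */
+#		w |= row[line*8 + idx%8] & (line == idx/8 ? ~0UL : 0);
+#
+# so the same four cache lines are read for every index; only the
+# offset within a line depends on the secret, which a cache probe
+# cannot resolve.
+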
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# int bn_mul_mont_gather5(
+$rp="%rdi"; # BN_ULONG *rp,
+$ap="%rsi"; # const BN_ULONG *ap,
+$bp="%rdx"; # const BN_ULONG *bp,
+$np="%rcx"; # const BN_ULONG *np,
+$n0="%r8"; # const BN_ULONG *n0,
+$num="%r9"; # int num,
+			# int idx);	# 0 to 2^5-1, "index" in $bp holding
+			#		  pre-computed powers of a', interlaced
+			#		  in such a manner that b[0] is $bp[idx],
+			#		  b[1] is $bp[2^5+idx], etc.
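+			#		  The scatter side in C terms is
+			#		  (illustrative sketch):
+			#		  for (w = 0; w < num; w++)
+			#			  tbl[32*w+idx] = b[w];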
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl bn_mul_mont_gather5
+.type bn_mul_mont_gather5,\@function,6
+.align 64
+bn_mul_mont_gather5:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 2($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul_body:
+ mov $bp,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
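+	# ($STRIDE = 256 bytes: the 32 candidate words for one position
+	#  sit side by side and span $STRIDE/$N = 4 cache lines; the masks
+	#  loaded below select the right line without forming a
+	#  secret-dependent address)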
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ movq %xmm0,$m0 # bp[1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+.align 16
+.Louter:
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+.Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ movq %xmm0,$m0 # bp[i+1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jl .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+	dec	$j		# doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
+ mov $num,$j # j=num
+ or $np,$ap # ap=borrow?tp:rp
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov ($ap,$i,8),%rax
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul_epilogue:
+ ret
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type bn_mul4x_mont_gather5,\@function,6
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+ mov ${num}d,${num}d
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+.Lmul4x_alloca:
+___
+$code.=<<___;
+ mov %rsp,%rax
+ lea 4($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+$code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter4x:
+ xor $j,$j # j=0
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+	lea	4($j),$j	# j++
+ adc \$0,%rdx
+ mov %rdx,$N[0]
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-40(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jl .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[i+1]
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jl .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+	dec	$j		# doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
+ lea 32($i),$i
+ dec $j
+ jnz .Lcopy4x
+
+ shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps (%rsi),%xmm6
+ movaps 0x10(%rsi),%xmm7
+ lea 0x28(%rsi),%rsi
+___
+$code.=<<___;
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my $out=$inp;
+my $STRIDE=2**5*8;
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl bn_scatter5
+.type bn_scatter5,\@abi-omnipotent
+.align 16
+bn_scatter5:
+ cmp \$0, $num
+ jz .Lscatter_epilogue
+ lea ($tbl,$idx,8),$tbl
+.Lscatter:
+ mov ($inp),%rax
+ lea 8($inp),$inp
+ mov %rax,($tbl)
+ lea 32*8($tbl),$tbl
+ sub \$1,$num
+ jnz .Lscatter
+.Lscatter_epilogue:
+ ret
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type bn_gather5,\@abi-omnipotent
+.align 16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10	#movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov $idx,%r11
+ shr \$`log($N/8)/log(2)`,$idx
+ and \$`$N/8-1`,%r11
+ not $idx
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
+ lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line
+ movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,$idx,8),%xmm5 # cache line contains element
+ movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,$idx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq `0*$STRIDE/4-96`($tbl),%xmm0
+ movq `1*$STRIDE/4-96`($tbl),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($tbl),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($tbl),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($tbl),$tbl
+ por %xmm3,%xmm0
+
+ movq %xmm0,($out) # m0=bp[0]
+ lea 8($out),$out
+ sub \$1,$num
+ jnz .Lgather
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ lea 0x28(%rsp),%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.align 64
+.Lmagic_masks:
+ .long 0,0, 0,0, 0,0, -1,-1
+ .long 0,0, 0,0, 0,0, 0,0
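+# (exactly one of the four 64-bit masks above is all-ones, selected by
+# idx/8, so the gather keeps the element from the one cache line that
+# holds it while still reading all four lines)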
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ lea `40+48`(%rax),%rax
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # end of alloca label
+ cmp %r10,%rbx # context->Rip<end of alloca label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+
+ movaps (%rax),%xmm0
+ movaps 16(%rax),%xmm1
+ lea `40+48`(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+ movups %xmm0,512($context) # restore context->Xmm6
+ movups %xmm1,528($context) # restore context->Xmm7
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size mul_handler,.-mul_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont_gather5
+ .rva .LSEH_end_bn_mul_mont_gather5
+ .rva .LSEH_info_bn_mul_mont_gather5
+
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
+ .rva .LSEH_end_bn_mul4x_mont_gather5
+ .rva .LSEH_info_bn_mul4x_mont_gather5
+
+ .rva .LSEH_begin_bn_gather5
+ .rva .LSEH_end_bn_gather5
+ .rva .LSEH_info_bn_gather5
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+.align 8
+.LSEH_info_bn_gather5:
+ .byte 0x01,0x0d,0x05,0x00
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
+ .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
+.align 8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/bn/bn.h b/main/openssl/crypto/bn/bn.h
index a0bc4783..9281ce59 100644
--- a/main/openssl/crypto/bn/bn.h
+++ b/main/openssl/crypto/bn/bn.h
@@ -558,6 +558,17 @@ int BN_is_prime_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, BN_GENCB *cb);
int BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
int do_trial_division, BN_GENCB *cb);
+int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
+
+int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+ const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
+ const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
+int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+ BIGNUM *Xp1, BIGNUM *Xp2,
+ const BIGNUM *Xp,
+ const BIGNUM *e, BN_CTX *ctx,
+ BN_GENCB *cb);
+
BN_MONT_CTX *BN_MONT_CTX_new(void );
void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
@@ -612,6 +623,8 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
BN_RECP_CTX *recp, BN_CTX *ctx);
+#ifndef OPENSSL_NO_EC2M
+
/* Functions for arithmetic over binary polynomials represented by BIGNUMs.
*
* The BIGNUM::neg property of BIGNUMs representing binary polynomials is
@@ -663,6 +676,8 @@ int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
+#endif
+
/* faster mod functions for the 'NIST primes'
* 0 <= a < p^2 */
int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
@@ -677,6 +692,10 @@ const BIGNUM *BN_get0_nist_prime_256(void);
const BIGNUM *BN_get0_nist_prime_384(void);
const BIGNUM *BN_get0_nist_prime_521(void);
+int BN_generate_dsa_nonce(BIGNUM *out, const BIGNUM *range, const BIGNUM *priv,
+ const unsigned char *message, size_t message_len,
+ BN_CTX *ctx);
+
/* library internal functions */
#define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\
@@ -827,6 +846,7 @@ void ERR_load_BN_strings(void);
#define BN_F_BN_EXP 123
#define BN_F_BN_EXPAND2 108
#define BN_F_BN_EXPAND_INTERNAL 120
+#define BN_F_BN_GENERATE_DSA_NONCE 140
#define BN_F_BN_GF2M_MOD 131
#define BN_F_BN_GF2M_MOD_EXP 132
#define BN_F_BN_GF2M_MOD_MUL 133
@@ -866,6 +886,7 @@ void ERR_load_BN_strings(void);
#define BN_R_NOT_INITIALIZED 107
#define BN_R_NO_INVERSE 108
#define BN_R_NO_SOLUTION 116
+#define BN_R_PRIVATE_KEY_TOO_LARGE 117
#define BN_R_P_IS_NOT_PRIME 112
#define BN_R_TOO_MANY_ITERATIONS 113
#define BN_R_TOO_MANY_TEMPORARY_VARIABLES 109
diff --git a/main/openssl/crypto/bn/bn_blind.c b/main/openssl/crypto/bn/bn_blind.c
index e060592f..9ed8bc2b 100644
--- a/main/openssl/crypto/bn/bn_blind.c
+++ b/main/openssl/crypto/bn/bn_blind.c
@@ -126,7 +126,7 @@ struct bn_blinding_st
* used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
#endif
CRYPTO_THREADID tid;
- unsigned int counter;
+ int counter;
unsigned long flags;
BN_MONT_CTX *m_ctx;
int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
@@ -160,7 +160,10 @@ BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
- ret->counter = BN_BLINDING_COUNTER;
+ /* Set the counter to the special value -1
+ * to indicate that this is never-used fresh blinding
+ * that does not need updating before first use. */
+ ret->counter = -1;
CRYPTO_THREADID_current(&ret->tid);
return(ret);
err:
@@ -190,7 +193,10 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
goto err;
}
- if (--(b->counter) == 0 && b->e != NULL &&
+ if (b->counter == -1)
+ b->counter = 0;
+
+ if (++b->counter == BN_BLINDING_COUNTER && b->e != NULL &&
!(b->flags & BN_BLINDING_NO_RECREATE))
{
/* re-create blinding parameters */
@@ -205,8 +211,8 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
ret=1;
err:
- if (b->counter == 0)
- b->counter = BN_BLINDING_COUNTER;
+ if (b->counter == BN_BLINDING_COUNTER)
+ b->counter = 0;
return(ret);
}
@@ -227,6 +233,12 @@ int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
return(0);
}
+ if (b->counter == -1)
+ /* Fresh blinding, doesn't need updating. */
+ b->counter = 0;
+ else if (!BN_BLINDING_update(b,ctx))
+ return(0);
+
if (r != NULL)
{
if (!BN_copy(r, b->Ai)) ret=0;
@@ -247,22 +259,19 @@ int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *ct
int ret;
bn_check_top(n);
- if ((b->A == NULL) || (b->Ai == NULL))
- {
- BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
- return(0);
- }
if (r != NULL)
ret = BN_mod_mul(n, n, r, b->mod, ctx);
else
- ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
-
- if (ret >= 0)
{
- if (!BN_BLINDING_update(b,ctx))
+ if (b->Ai == NULL)
+ {
+ BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
return(0);
+ }
+ ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
}
+
bn_check_top(n);
return(ret);
}
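The counter change above replaces the old count-down scheme with a count-up one plus a -1 sentinel for never-used blinding, so BN_BLINDING_convert_ex can skip the update on first use. A toy model of the resulting state machine, assuming BN_BLINDING_COUNTER is 32 as in OpenSSL:

#include <stdio.h>

#define BN_BLINDING_COUNTER 32

static int counter = -1;	/* fresh blinding */

static void convert(void)
	{
	if (counter == -1)
		{
		counter = 0;	/* consume the sentinel, skip the update */
		return;
		}
	if (++counter == BN_BLINDING_COUNTER)
		{
		printf("re-create blinding parameters\n");
		counter = 0;
		}
	}

int main(void)
	{
	int i;
	for (i = 0; i < 70; i++)
		convert();	/* prints on the 33rd and 65th use */
	return 0;
	}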
diff --git a/main/openssl/crypto/bn/bn_div.c b/main/openssl/crypto/bn/bn_div.c
index 802a43d6..7b240318 100644
--- a/main/openssl/crypto/bn/bn_div.c
+++ b/main/openssl/crypto/bn/bn_div.c
@@ -141,6 +141,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
*
* <appro@fy.chalmers.se>
*/
+#undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divl %4" \
@@ -155,6 +156,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
* Same story here, but it's 128-bit by 64-bit division. Wow!
* <appro@fy.chalmers.se>
*/
+# undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divq %4" \
@@ -169,15 +171,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
#endif /* OPENSSL_NO_ASM */
-/* BN_div[_no_branch] computes dv := num / divisor, rounding towards
+/* BN_div computes dv := num / divisor, rounding towards
* zero, and sets up rm such that dv*divisor + rm = num holds.
* Thus:
* dv->neg == num->neg ^ divisor->neg (unless the result is zero)
* rm->neg == num->neg (unless the remainder is zero)
* If 'dv' or 'rm' is NULL, the respective value is not returned.
*/
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx);
int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_CTX *ctx)
{
@@ -186,6 +186,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_ULONG *resp,*wnump;
BN_ULONG d0,d1;
int num_n,div_n;
+ int no_branch=0;
/* Invalid zero-padding would have particularly bad consequences
* in the case of 'num', so don't just rely on bn_check_top() for this one
@@ -200,7 +201,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
{
- return BN_div_no_branch(dv, rm, num, divisor, ctx);
+ no_branch=1;
}
bn_check_top(dv);
@@ -214,7 +215,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
return(0);
}
- if (BN_ucmp(num,divisor) < 0)
+ if (!no_branch && BN_ucmp(num,divisor) < 0)
{
if (rm != NULL)
{ if (BN_copy(rm,num) == NULL) return(0); }
@@ -239,242 +240,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
norm_shift+=BN_BITS2;
if (!(BN_lshift(snum,num,norm_shift))) goto err;
snum->neg=0;
- div_n=sdiv->top;
- num_n=snum->top;
- loop=num_n-div_n;
- /* Lets setup a 'window' into snum
- * This is the part that corresponds to the current
- * 'area' being divided */
- wnum.neg = 0;
- wnum.d = &(snum->d[loop]);
- wnum.top = div_n;
- /* only needed when BN_ucmp messes up the values between top and max */
- wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
-
- /* Get the top 2 words of sdiv */
- /* div_n=sdiv->top; */
- d0=sdiv->d[div_n-1];
- d1=(div_n == 1)?0:sdiv->d[div_n-2];
-
- /* pointer to the 'top' of snum */
- wnump= &(snum->d[num_n-1]);
-
- /* Setup to 'res' */
- res->neg= (num->neg^divisor->neg);
- if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop;
- resp= &(res->d[loop-1]);
-
- /* space for temp */
- if (!bn_wexpand(tmp,(div_n+1))) goto err;
- if (BN_ucmp(&wnum,sdiv) >= 0)
+ if (no_branch)
{
- /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
- * bn_pollute) the const bignum arguments =>
- * clean the values between top and max again */
- bn_clear_top2max(&wnum);
- bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
- *resp=1;
- }
- else
- res->top--;
- /* if res->top == 0 then clear the neg value otherwise decrease
- * the resp pointer */
- if (res->top == 0)
- res->neg = 0;
- else
- resp--;
-
- for (i=0; i<loop-1; i++, wnump--, resp--)
- {
- BN_ULONG q,l0;
- /* the first part of the loop uses the top two words of
- * snum and sdiv to calculate a BN_ULONG q such that
- * | wnum - sdiv * q | < sdiv */
-#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
- BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
- q=bn_div_3_words(wnump,d1,d0);
-#else
- BN_ULONG n0,n1,rem=0;
-
- n0=wnump[0];
- n1=wnump[-1];
- if (n0 == d0)
- q=BN_MASK2;
- else /* n0 < d0 */
- {
-#ifdef BN_LLONG
- BN_ULLONG t2;
-
-#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
- q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
-#else
- q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
- fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- /*
- * rem doesn't have to be BN_ULLONG. The least we
- * know it's less that d0, isn't it?
- */
- rem=(n1-q*d0)&BN_MASK2;
-#endif
- t2=(BN_ULLONG)d1*q;
-
- for (;;)
- {
- if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
- break;
- q--;
- rem += d0;
- if (rem < d0) break; /* don't let rem overflow */
- t2 -= d1;
- }
-#else /* !BN_LLONG */
- BN_ULONG t2l,t2h;
-
- q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
- fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
- n0, n1, d0, q);
-#endif
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- rem=(n1-q*d0)&BN_MASK2;
-#endif
-
-#if defined(BN_UMULT_LOHI)
- BN_UMULT_LOHI(t2l,t2h,d1,q);
-#elif defined(BN_UMULT_HIGH)
- t2l = d1 * q;
- t2h = BN_UMULT_HIGH(d1,q);
-#else
+ /* Since we don't know whether snum is larger than sdiv,
+ * we pad snum with enough zeroes without changing its
+ * value.
+ */
+ if (snum->top <= sdiv->top+1)
{
- BN_ULONG ql, qh;
- t2l=LBITS(d1); t2h=HBITS(d1);
- ql =LBITS(q); qh =HBITS(q);
- mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+ if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+ for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+ snum->top = sdiv->top + 2;
}
-#endif
-
- for (;;)
- {
- if ((t2h < rem) ||
- ((t2h == rem) && (t2l <= wnump[-2])))
- break;
- q--;
- rem += d0;
- if (rem < d0) break; /* don't let rem overflow */
- if (t2l < d1) t2h--; t2l -= d1;
- }
-#endif /* !BN_LLONG */
- }
-#endif /* !BN_DIV3W */
-
- l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
- tmp->d[div_n]=l0;
- wnum.d--;
- /* ingore top values of the bignums just sub the two
- * BN_ULONG arrays with bn_sub_words */
- if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+ else
{
- /* Note: As we have considered only the leading
- * two BN_ULONGs in the calculation of q, sdiv * q
- * might be greater than wnum (but then (q-1) * sdiv
- * is less or equal than wnum)
- */
- q--;
- if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
- /* we can't have an overflow here (assuming
- * that q != 0, but if q == 0 then tmp is
- * zero anyway) */
- (*wnump)++;
+ if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+ snum->d[snum->top] = 0;
+ snum->top ++;
}
- /* store part of the result */
- *resp = q;
- }
- bn_correct_top(snum);
- if (rm != NULL)
- {
- /* Keep a copy of the neg flag in num because if rm==num
- * BN_rshift() will overwrite it.
- */
- int neg = num->neg;
- BN_rshift(rm,snum,norm_shift);
- if (!BN_is_zero(rm))
- rm->neg = neg;
- bn_check_top(rm);
- }
- BN_CTX_end(ctx);
- return(1);
-err:
- bn_check_top(rm);
- BN_CTX_end(ctx);
- return(0);
- }
-
-
-/* BN_div_no_branch is a special version of BN_div. It does not contain
- * branches that may leak sensitive information.
- */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
- const BIGNUM *divisor, BN_CTX *ctx)
- {
- int norm_shift,i,loop;
- BIGNUM *tmp,wnum,*snum,*sdiv,*res;
- BN_ULONG *resp,*wnump;
- BN_ULONG d0,d1;
- int num_n,div_n;
-
- bn_check_top(dv);
- bn_check_top(rm);
- /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
- bn_check_top(divisor);
-
- if (BN_is_zero(divisor))
- {
- BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
- return(0);
- }
-
- BN_CTX_start(ctx);
- tmp=BN_CTX_get(ctx);
- snum=BN_CTX_get(ctx);
- sdiv=BN_CTX_get(ctx);
- if (dv == NULL)
- res=BN_CTX_get(ctx);
- else res=dv;
- if (sdiv == NULL || res == NULL) goto err;
-
- /* First we normalise the numbers */
- norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
- if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
- sdiv->neg=0;
- norm_shift+=BN_BITS2;
- if (!(BN_lshift(snum,num,norm_shift))) goto err;
- snum->neg=0;
-
- /* Since we don't know whether snum is larger than sdiv,
- * we pad snum with enough zeroes without changing its
- * value.
- */
- if (snum->top <= sdiv->top+1)
- {
- if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
- for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
- snum->top = sdiv->top + 2;
- }
- else
- {
- if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
- snum->d[snum->top] = 0;
- snum->top ++;
}
div_n=sdiv->top;
@@ -500,12 +284,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
/* Setup to 'res' */
res->neg= (num->neg^divisor->neg);
if (!bn_wexpand(res,(loop+1))) goto err;
- res->top=loop-1;
+ res->top=loop-no_branch;
resp= &(res->d[loop-1]);
/* space for temp */
if (!bn_wexpand(tmp,(div_n+1))) goto err;
+ if (!no_branch)
+ {
+ if (BN_ucmp(&wnum,sdiv) >= 0)
+ {
+ /* If BN_DEBUG_RAND is defined BN_ucmp changes (via
+ * bn_pollute) the const bignum arguments =>
+ * clean the values between top and max again */
+ bn_clear_top2max(&wnum);
+ bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+ *resp=1;
+ }
+ else
+ res->top--;
+ }
+
/* if res->top == 0 then clear the neg value otherwise decrease
* the resp pointer */
if (res->top == 0)
@@ -638,7 +437,7 @@ X) -> 0x%08X\n",
rm->neg = neg;
bn_check_top(rm);
}
- bn_correct_top(res);
+ if (no_branch) bn_correct_top(res);
BN_CTX_end(ctx);
return(1);
err:
@@ -646,5 +445,4 @@ err:
BN_CTX_end(ctx);
return(0);
}
-
#endif
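The merged function keeps the contract stated in the comment: the quotient rounds toward zero and dv*divisor + rm reconstructs num, so the remainder takes num's sign. A tiny self-contained illustration, with plain ints standing in for BIGNUMs:

#include <assert.h>
#include <stdio.h>

int main(void)
	{
	int num = -7, divisor = 2;
	int dv = num / divisor;		/* C99 division also truncates toward zero */
	int rm = num - dv * divisor;

	assert(dv * divisor + rm == num);
	printf("dv=%d rm=%d\n", dv, rm);	/* dv=-3 rm=-1 */
	return 0;
	}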
diff --git a/main/openssl/crypto/bn/bn_err.c b/main/openssl/crypto/bn/bn_err.c
index cfe2eb94..f722b525 100644
--- a/main/openssl/crypto/bn/bn_err.c
+++ b/main/openssl/crypto/bn/bn_err.c
@@ -87,6 +87,7 @@ static ERR_STRING_DATA BN_str_functs[]=
{ERR_FUNC(BN_F_BN_EXP), "BN_exp"},
{ERR_FUNC(BN_F_BN_EXPAND2), "bn_expand2"},
{ERR_FUNC(BN_F_BN_EXPAND_INTERNAL), "BN_EXPAND_INTERNAL"},
+{ERR_FUNC(BN_F_BN_GENERATE_DSA_NONCE), "BN_generate_dsa_nonce"},
{ERR_FUNC(BN_F_BN_GF2M_MOD), "BN_GF2m_mod"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_EXP), "BN_GF2m_mod_exp"},
{ERR_FUNC(BN_F_BN_GF2M_MOD_MUL), "BN_GF2m_mod_mul"},
@@ -129,6 +130,7 @@ static ERR_STRING_DATA BN_str_reasons[]=
{ERR_REASON(BN_R_NOT_INITIALIZED) ,"not initialized"},
{ERR_REASON(BN_R_NO_INVERSE) ,"no inverse"},
{ERR_REASON(BN_R_NO_SOLUTION) ,"no solution"},
+{ERR_REASON(BN_R_PRIVATE_KEY_TOO_LARGE) ,"private key too large"},
{ERR_REASON(BN_R_P_IS_NOT_PRIME) ,"p is not prime"},
{ERR_REASON(BN_R_TOO_MANY_ITERATIONS) ,"too many iterations"},
{ERR_REASON(BN_R_TOO_MANY_TEMPORARY_VARIABLES),"too many temporary variables"},
diff --git a/main/openssl/crypto/bn/bn_exp.c b/main/openssl/crypto/bn/bn_exp.c
index d9b6c737..2abf6fd6 100644
--- a/main/openssl/crypto/bn/bn_exp.c
+++ b/main/openssl/crypto/bn/bn_exp.c
@@ -113,6 +113,18 @@
#include "cryptlib.h"
#include "bn_lcl.h"
+#include <stdlib.h>
+#ifdef _WIN32
+# include <malloc.h>
+# ifndef alloca
+# define alloca _alloca
+# endif
+#elif defined(__GNUC__)
+# ifndef alloca
+# define alloca(s) __builtin_alloca((s))
+# endif
+#endif
+
/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE 32
@@ -522,23 +534,17 @@ err:
* as cache lines are concerned. The following functions are used to transfer a BIGNUM
* from/to that table. */
-static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
{
size_t i, j;
- if (bn_wexpand(b, top) == NULL)
- return 0;
- while (b->top < top)
- {
- b->d[b->top++] = 0;
- }
-
+ if (top > b->top)
+ top = b->top; /* this works because 'buf' is explicitly zeroed */
for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
{
buf[j] = ((unsigned char*)b->d)[i];
}
- bn_correct_top(b);
return 1;
}
@@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
/* Given a pointer value, compute the next address that is a cache line multiple. */
#define MOD_EXP_CTIME_ALIGN(x_) \
- ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+ ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
/* This variant of BN_mod_exp_mont() uses fixed windows and the special
* precomputation memory layout to limit data-dependency to a minimum
@@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
- int i,bits,ret=0,idx,window,wvalue;
+ int i,bits,ret=0,window,wvalue;
int top;
- BIGNUM *r;
- const BIGNUM *aa;
BN_MONT_CTX *mont=NULL;
int numPowers;
unsigned char *powerbufFree=NULL;
int powerbufLen = 0;
unsigned char *powerbuf=NULL;
- BIGNUM *computeTemp=NULL, *am=NULL;
+ BIGNUM tmp, am;
bn_check_top(a);
bn_check_top(p);
@@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
return ret;
}
- /* Initialize BIGNUM context and allocate intermediate result */
BN_CTX_start(ctx);
- r = BN_CTX_get(ctx);
- if (r == NULL) goto err;
/* Allocate a montgomery context if it was not supplied by the caller.
* If this is not done, things will break in the montgomery part.
@@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
/* Get the window size to use with size of p. */
window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+ if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */
+#endif
/* Allocate a buffer large enough to hold all of the pre-computed
- * powers of a.
+ * powers of am, am itself and tmp.
*/
numPowers = 1 << window;
- powerbufLen = sizeof(m->d[0])*top*numPowers;
+ powerbufLen = sizeof(m->d[0])*(top*numPowers +
+ ((2*top)>numPowers?(2*top):numPowers));
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+ else
+#endif
if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
goto err;
powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
memset(powerbuf, 0, powerbufLen);
- /* Initialize the intermediate result. Do this early to save double conversion,
- * once each for a^0 and intermediate result.
- */
- if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+#ifdef alloca
+ if (powerbufLen < 3072)
+ powerbufFree = NULL;
+#endif
- /* Initialize computeTemp as a^1 with montgomery precalcs */
- computeTemp = BN_CTX_get(ctx);
- am = BN_CTX_get(ctx);
- if (computeTemp==NULL || am==NULL) goto err;
+ /* lay down tmp and am right after powers table */
+ tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
+ am.d = tmp.d + top;
+ tmp.top = am.top = 0;
+ tmp.dmax = am.dmax = top;
+ tmp.neg = am.neg = 0;
+ tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+ /* prepare a^0 in Montgomery domain */
+#if 1
+ if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err;
+#else
+ tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */
+ for (i=1;i<top;i++)
+ tmp.d[i] = (~m->d[i])&BN_MASK2;
+ tmp.top = top;
+#endif
+ /* prepare a^1 in Montgomery domain */
if (a->neg || BN_ucmp(a,m) >= 0)
{
- if (!BN_mod(am,a,m,ctx))
- goto err;
- aa= am;
+ if (!BN_mod(&am,a,m,ctx)) goto err;
+ if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err;
}
- else
- aa=a;
- if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
- if (!BN_copy(computeTemp, am)) goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+ else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+ /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+	 * specifically the optimizations of the cache-timing attack
+	 * countermeasures and of the pre-computation. */
+
+ /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+ * 512-bit RSA is hardly relevant, we omit it to spare size... */
+ if (window==5)
+ {
+ void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
+ const void *table,const BN_ULONG *np,
+ const BN_ULONG *n0,int num,int power);
+ void bn_scatter5(const BN_ULONG *inp,size_t num,
+ void *table,size_t power);
+ void bn_gather5(BN_ULONG *out,size_t num,
+ void *table,size_t power);
+
+ BN_ULONG *np=mont->N.d, *n0=mont->n0;
+
+ /* BN_to_montgomery can contaminate words above .top
+ * [in BN_DEBUG[_DEBUG] build]... */
+ for (i=am.top; i<top; i++) am.d[i]=0;
+ for (i=tmp.top; i<top; i++) tmp.d[i]=0;
+
+ bn_scatter5(tmp.d,top,powerbuf,0);
+ bn_scatter5(am.d,am.top,powerbuf,1);
+ bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2);
+
+#if 0
+ for (i=3; i<32; i++)
+ {
+ /* Calculate a^i = a^(i-1) * a */
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#else
+ /* same as above, but uses squaring for 1/2 of operations */
+ for (i=4; i<32; i*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+ for (i=3; i<8; i+=2)
+ {
+ int j;
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ for (j=2*i; j<32; j*=2)
+ {
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,j);
+ }
+ }
+ for (; i<16; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_scatter5(tmp.d,top,powerbuf,2*i);
+ }
+ for (; i<32; i+=2)
+ {
+ bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+ bn_scatter5(tmp.d,top,powerbuf,i);
+ }
+#endif
+ bits--;
+ for (wvalue=0, i=bits%5; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ bn_gather5(tmp.d,top,powerbuf,wvalue);
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
+ {
+ for (wvalue=0, i=0; i<5; i++,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+ bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
+ }
+
+ tmp.top=top;
+ bn_correct_top(&tmp);
+ }
+ else
+#endif
+ {
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err;
/* If the window size is greater than 1, then calculate
* val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
@@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
*/
if (window > 1)
{
- for (i=2; i<numPowers; i++)
+ if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err;
+ for (i=3; i<numPowers; i++)
{
/* Calculate a^i = a^(i-1) * a */
- if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
+ if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
goto err;
- if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
}
}
- /* Adjust the number of bits up to a multiple of the window size.
- * If the exponent length is not a multiple of the window size, then
- * this pads the most significant bits with zeros to normalize the
- * scanning loop to there's no special cases.
- *
- * * NOTE: Making the window size a power of two less than the native
- * * word size ensures that the padded bits won't go past the last
- * * word in the internal BIGNUM structure. Going past the end will
- * * still produce the correct result, but causes a different branch
- * * to be taken in the BN_is_bit_set function.
- */
- bits = ((bits+window-1)/window)*window;
- idx=bits-1; /* The top bit of the window */
-
- /* Scan the exponent one window at a time starting from the most
- * significant bits.
- */
- while (idx >= 0)
+ bits--;
+ for (wvalue=0, i=bits%window; i>=0; i--,bits--)
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (bits >= 0)
{
wvalue=0; /* The 'value' of the window */
/* Scan the window, squaring the result as we go */
- for (i=0; i<window; i++,idx--)
+ for (i=0; i<window; i++,bits--)
{
- if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) goto err;
- wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
+ if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx)) goto err;
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
}
/* Fetch the appropriate pre-computed value from the pre-buf */
- if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err;
/* Multiply the result into the intermediate result */
- if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
+ if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err;
}
+ }
/* Convert the final result from montgomery to standard format */
- if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+ if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
ret=1;
err:
if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
if (powerbuf!=NULL)
{
OPENSSL_cleanse(powerbuf,powerbufLen);
- OPENSSL_free(powerbufFree);
+ if (powerbufFree) OPENSSL_free(powerbufFree);
}
- if (am!=NULL) BN_clear(am);
- if (computeTemp!=NULL) BN_clear(computeTemp);
BN_CTX_end(ctx);
return(ret);
}
@@ -988,4 +1095,3 @@ err:
bn_check_top(r);
return(ret);
}
-
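MOD_EXP_CTIME_COPY_TO_PREBUF above stores table entry idx at byte offsets idx, idx+width, idx+2*width, ..., so fetching any entry touches the same cache lines as fetching any other — the point of the constant-time layout. A minimal sketch with toy sizes (WIDTH plays numPowers, LEN the entry size in bytes):

#include <stdio.h>
#include <string.h>

#define WIDTH 8		/* number of table entries */
#define LEN   16	/* bytes per entry */

static unsigned char table[WIDTH * LEN];

static void scatter(const unsigned char *src, int idx)
	{
	int i, j;
	for (i = 0, j = idx; i < LEN; i++, j += WIDTH)
		table[j] = src[i];	/* interleave entries byte by byte */
	}

static void gather(unsigned char *dst, int idx)
	{
	int i, j;
	for (i = 0, j = idx; i < LEN; i++, j += WIDTH)
		dst[i] = table[j];
	}

int main(void)
	{
	unsigned char in[LEN] = "fifteen bytes!!", out[LEN];
	scatter(in, 3);
	gather(out, 3);
	printf("%s\n", memcmp(in, out, LEN) ? "mismatch" : "roundtrip ok");
	return 0;
	}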
diff --git a/main/openssl/crypto/bn/bn_gcd.c b/main/openssl/crypto/bn/bn_gcd.c
index 4a352119..a808f531 100644
--- a/main/openssl/crypto/bn/bn_gcd.c
+++ b/main/openssl/crypto/bn/bn_gcd.c
@@ -205,6 +205,7 @@ err:
/* solves ax == 1 (mod n) */
static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
+
BIGNUM *BN_mod_inverse(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
{
diff --git a/main/openssl/crypto/bn/bn_gf2m.c b/main/openssl/crypto/bn/bn_gf2m.c
index 432a3aa3..8a4dc20a 100644
--- a/main/openssl/crypto/bn/bn_gf2m.c
+++ b/main/openssl/crypto/bn/bn_gf2m.c
@@ -94,6 +94,8 @@
#include "cryptlib.h"
#include "bn_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
/* Maximum number of iterations before BN_GF2m_mod_solve_quad_arr should fail. */
#define MAX_ITERATIONS 50
@@ -122,6 +124,7 @@ static const BN_ULONG SQR_tb[16] =
SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
#endif
+#if !defined(OPENSSL_BN_ASM_GF2m)
/* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
* result is a polynomial r with degree < 2 * BN_BITS - 1
* The caller MUST ensure that the variables have the right amount
@@ -216,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c
r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */
r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */
}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif
/* Add polynomials a and b and store result in r; r could be a or b, a and b
* could be equal; r is the bitwise XOR of a and b.
@@ -360,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
{
int ret = 0;
- const int max = BN_num_bits(p) + 1;
- int *arr=NULL;
+ int arr[6];
bn_check_top(a);
bn_check_top(p);
- if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
- ret = BN_GF2m_poly2arr(p, arr, max);
- if (!ret || ret > max)
+ ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+ if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
{
BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
- goto err;
+ return 0;
}
ret = BN_GF2m_mod_arr(r, a, arr);
bn_check_top(r);
-err:
- if (arr) OPENSSL_free(arr);
return ret;
}
@@ -521,7 +522,7 @@ err:
*/
int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
{
- BIGNUM *b, *c, *u, *v, *tmp;
+ BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp;
int ret = 0;
bn_check_top(a);
@@ -529,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
BN_CTX_start(ctx);
- b = BN_CTX_get(ctx);
- c = BN_CTX_get(ctx);
- u = BN_CTX_get(ctx);
- v = BN_CTX_get(ctx);
- if (v == NULL) goto err;
+ if ((b = BN_CTX_get(ctx))==NULL) goto err;
+ if ((c = BN_CTX_get(ctx))==NULL) goto err;
+ if ((u = BN_CTX_get(ctx))==NULL) goto err;
+ if ((v = BN_CTX_get(ctx))==NULL) goto err;
- if (!BN_one(b)) goto err;
if (!BN_GF2m_mod(u, a, p)) goto err;
- if (!BN_copy(v, p)) goto err;
-
if (BN_is_zero(u)) goto err;
+ if (!BN_copy(v, p)) goto err;
+#if 0
+ if (!BN_one(b)) goto err;
+
while (1)
{
while (!BN_is_odd(u))
@@ -565,13 +566,89 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
if (!BN_GF2m_add(u, u, v)) goto err;
if (!BN_GF2m_add(b, b, c)) goto err;
}
+#else
+ {
+ int i, ubits = BN_num_bits(u),
+ vbits = BN_num_bits(v), /* v is copy of p */
+ top = p->top;
+ BN_ULONG *udp,*bdp,*vdp,*cdp;
+
+ bn_wexpand(u,top); udp = u->d;
+ for (i=u->top;i<top;i++) udp[i] = 0;
+ u->top = top;
+ bn_wexpand(b,top); bdp = b->d;
+ bdp[0] = 1;
+ for (i=1;i<top;i++) bdp[i] = 0;
+ b->top = top;
+ bn_wexpand(c,top); cdp = c->d;
+ for (i=0;i<top;i++) cdp[i] = 0;
+ c->top = top;
+ vdp = v->d; /* It pays off to "cache" *->d pointers, because
+				 * it allows the optimizer to be more aggressive.
+ * But we don't have to "cache" p->d, because *p
+ * is declared 'const'... */
+ while (1)
+ {
+ while (ubits && !(udp[0]&1))
+ {
+ BN_ULONG u0,u1,b0,b1,mask;
+
+ u0 = udp[0];
+ b0 = bdp[0];
+ mask = (BN_ULONG)0-(b0&1);
+ b0 ^= p->d[0]&mask;
+ for (i=0;i<top-1;i++)
+ {
+ u1 = udp[i+1];
+ udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+ u0 = u1;
+ b1 = bdp[i+1]^(p->d[i+1]&mask);
+ bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+ b0 = b1;
+ }
+ udp[i] = u0>>1;
+ bdp[i] = b0>>1;
+ ubits--;
+ }
+ if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+ if (ubits<vbits)
+ {
+ i = ubits; ubits = vbits; vbits = i;
+ tmp = u; u = v; v = tmp;
+ tmp = b; b = c; c = tmp;
+ udp = vdp; vdp = v->d;
+ bdp = cdp; cdp = c->d;
+ }
+ for(i=0;i<top;i++)
+ {
+ udp[i] ^= vdp[i];
+ bdp[i] ^= cdp[i];
+ }
+ if (ubits==vbits)
+ {
+ BN_ULONG ul;
+ int utop = (ubits-1)/BN_BITS2;
+
+ while ((ul=udp[utop])==0 && utop) utop--;
+ ubits = utop*BN_BITS2 + BN_num_bits_word(ul);
+ }
+ }
+ bn_correct_top(b);
+ }
+#endif
if (!BN_copy(r, b)) goto err;
bn_check_top(r);
ret = 1;
err:
+#ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */
+ bn_correct_top(c);
+ bn_correct_top(u);
+ bn_correct_top(v);
+#endif
BN_CTX_end(ctx);
return ret;
}
@@ -1033,3 +1110,4 @@ int BN_GF2m_arr2poly(const int p[], BIGNUM *a)
return 1;
}
+#endif
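The rewritten BN_GF2m_mod_inv loop works on raw words: addition in GF(2^m) is a word-wise XOR, and dividing an even polynomial by x is a multi-word right shift. A sketch of just these two primitives, assuming 64-bit words:

#include <stdint.h>
#include <stdio.h>

#define TOP 2

static void gf2m_add(uint64_t r[TOP], const uint64_t a[TOP],
	const uint64_t b[TOP])
	{
	int i;
	for (i = 0; i < TOP; i++)
		r[i] = a[i] ^ b[i];	/* no carries in characteristic 2 */
	}

static void div_by_x(uint64_t u[TOP])
	{
	int i;
	for (i = 0; i < TOP - 1; i++)
		u[i] = (u[i] >> 1) | (u[i + 1] << 63);
	u[TOP - 1] >>= 1;
	}

int main(void)
	{
	uint64_t u[TOP] = { 0x6, 0x1 }, v[TOP] = { 0x3, 0x0 }, r[TOP];
	gf2m_add(r, u, v);	/* (x^64+x^2+x) + (x+1) = x^64+x^2+1 */
	div_by_x(u);		/* x^64+x^2+x -> x^63+x+1 */
	printf("%llx %llx / %llx %llx\n",
	       (unsigned long long)r[1], (unsigned long long)r[0],
	       (unsigned long long)u[1], (unsigned long long)u[0]);
	return 0;
	}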
diff --git a/main/openssl/crypto/bn/bn_lcl.h b/main/openssl/crypto/bn/bn_lcl.h
index 8e5e98e3..817c773b 100644
--- a/main/openssl/crypto/bn/bn_lcl.h
+++ b/main/openssl/crypto/bn/bn_lcl.h
@@ -238,7 +238,7 @@ extern "C" {
# if defined(__DECC)
# include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-# elif defined(__GNUC__)
+# elif defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
@@ -247,7 +247,7 @@ extern "C" {
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
@@ -257,7 +257,7 @@ extern "C" {
# endif /* compiler */
# elif (defined(__x86_64) || defined(__x86_64__)) && \
(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret,discard; \
asm ("mulq %3" \
@@ -280,6 +280,26 @@ extern "C" {
# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
# endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+# if defined(__GNUC__) && __GNUC__>=2
+#   if __GNUC__>=4 && __GNUC_MINOR__>=4 /* the "h" constraint is gone since gcc 4.4 */
+# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
+# define BN_UMULT_LOHI(low,high,a,b) ({ \
+ __uint128_t ret=(__uint128_t)(a)*(b); \
+ (high)=ret>>64; (low)=ret; })
+# else
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret; \
+ asm ("dmultu %1,%2" \
+ : "=h"(ret) \
+ : "r"(a), "r"(b) : "l"); \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b)\
+ asm ("dmultu %2,%3" \
+ : "=l"(low),"=h"(high) \
+ : "r"(a), "r"(b));
+# endif
+# endif
# endif /* cpu */
#endif /* OPENSSL_NO_ASM */
@@ -459,6 +479,10 @@ extern "C" {
}
#endif /* !BN_LLONG */
+#if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
+#undef bn_div_words
+#endif
+
void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb);
void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
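The new MIPS path above illustrates the generic trick: when the compiler offers a 128-bit integer type, the high half of a 64x64-bit multiplication is one shift away. A portable sketch of what BN_UMULT_HIGH computes (requires __uint128_t, i.e. gcc/clang on a 64-bit target):

#include <stdint.h>
#include <stdio.h>

static uint64_t umult_high(uint64_t a, uint64_t b)
	{
	return (uint64_t)(((__uint128_t)a * b) >> 64);
	}

int main(void)
	{
	/* 2^63 * 2 = 2^64, so the high half is exactly 1 */
	printf("%llu\n", (unsigned long long)umult_high(1ULL << 63, 2));
	return 0;
	}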
diff --git a/main/openssl/crypto/bn/bn_lib.c b/main/openssl/crypto/bn/bn_lib.c
index 5470fbe6..7a5676de 100644
--- a/main/openssl/crypto/bn/bn_lib.c
+++ b/main/openssl/crypto/bn/bn_lib.c
@@ -139,25 +139,6 @@ const BIGNUM *BN_value_one(void)
return(&const_one);
}
-char *BN_options(void)
- {
- static int init=0;
- static char data[16];
-
- if (!init)
- {
- init++;
-#ifdef BN_LLONG
- BIO_snprintf(data,sizeof data,"bn(%d,%d)",
- (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
-#else
- BIO_snprintf(data,sizeof data,"bn(%d,%d)",
- (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
-#endif
- }
- return(data);
- }
-
int BN_num_bits_word(BN_ULONG l)
{
static const unsigned char bits[256]={
diff --git a/main/openssl/crypto/bn/bn_mont.c b/main/openssl/crypto/bn/bn_mont.c
index 1a866880..427b5cf4 100644
--- a/main/openssl/crypto/bn/bn_mont.c
+++ b/main/openssl/crypto/bn/bn_mont.c
@@ -177,31 +177,26 @@ err:
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
{
BIGNUM *n;
- BN_ULONG *ap,*np,*rp,n0,v,*nrp;
- int al,nl,max,i,x,ri;
+ BN_ULONG *ap,*np,*rp,n0,v,carry;
+ int nl,max,i;
n= &(mont->N);
- /* mont->ri is the size of mont->N in bits (rounded up
- to the word size) */
- al=ri=mont->ri/BN_BITS2;
-
nl=n->top;
- if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+ if (nl == 0) { ret->top=0; return(1); }
- max=(nl+al+1); /* allow for overflow (no?) XXX */
+ max=(2*nl); /* carry is stored separately */
if (bn_wexpand(r,max) == NULL) return(0);
r->neg^=n->neg;
np=n->d;
rp=r->d;
- nrp= &(r->d[nl]);
/* clear the top words of T */
#if 1
for (i=r->top; i<max; i++) /* memset? XXX */
- r->d[i]=0;
+ rp[i]=0;
#else
- memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
+ memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG));
#endif
r->top=max;
@@ -210,7 +205,7 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
#ifdef BN_COUNT
fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
#endif
- for (i=0; i<nl; i++)
+ for (carry=0, i=0; i<nl; i++, rp++)
{
#ifdef __TANDEM
{
@@ -228,61 +223,33 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
#else
v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
#endif
- nrp++;
- rp++;
- if (((nrp[-1]+=v)&BN_MASK2) >= v)
- continue;
- else
- {
- if (((++nrp[0])&BN_MASK2) != 0) continue;
- if (((++nrp[1])&BN_MASK2) != 0) continue;
- for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
- }
- }
- bn_correct_top(r);
-
- /* mont->ri will be a multiple of the word size and below code
- * is kind of BN_rshift(ret,r,mont->ri) equivalent */
- if (r->top <= ri)
- {
- ret->top=0;
- return(1);
+ v = (v+carry+rp[nl])&BN_MASK2;
+ carry |= (v != rp[nl]);
+ carry &= (v <= rp[nl]);
+ rp[nl]=v;
}
- al=r->top-ri;
-#define BRANCH_FREE 1
-#if BRANCH_FREE
- if (bn_wexpand(ret,ri) == NULL) return(0);
- x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
- ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */
+ if (bn_wexpand(ret,nl) == NULL) return(0);
+ ret->top=nl;
ret->neg=r->neg;
rp=ret->d;
- ap=&(r->d[ri]);
+ ap=&(r->d[nl]);
+#define BRANCH_FREE 1
+#if BRANCH_FREE
{
- size_t m1,m2;
-
- v=bn_sub_words(rp,ap,np,ri);
- /* this ----------------^^ works even in al<ri case
- * thanks to zealous zeroing of top of the vector in the
- * beginning. */
+ BN_ULONG *nrp;
+ size_t m;
- /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
- /* in other words if subtraction result is real, then
+ v=bn_sub_words(rp,ap,np,nl)-carry;
+ /* if subtraction result is real, then
* trick unconditional memcpy below to perform in-place
* "refresh" instead of actual copy. */
- m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
- m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
- m1|=m2; /* (al!=ri) */
- m1|=(0-(size_t)v); /* (al!=ri || v) */
- m1&=~m2; /* (al!=ri || v) && !al>ri */
- nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1));
- }
+ m=(0-(size_t)v);
+ nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
- /* 'i<ri' is chosen to eliminate dependency on input data, even
- * though it results in redundant copy in al<ri case. */
- for (i=0,ri-=4; i<ri; i+=4)
+ for (i=0,nl-=4; i<nl; i+=4)
{
BN_ULONG t1,t2,t3,t4;
@@ -295,40 +262,15 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
rp[i+2]=t3;
rp[i+3]=t4;
}
- for (ri+=4; i<ri; i++)
+ for (nl+=4; i<nl; i++)
rp[i]=nrp[i], ap[i]=0;
- bn_correct_top(r);
- bn_correct_top(ret);
+ }
#else
- if (bn_wexpand(ret,al) == NULL) return(0);
- ret->top=al;
- ret->neg=r->neg;
-
- rp=ret->d;
- ap=&(r->d[ri]);
- al-=4;
- for (i=0; i<al; i+=4)
- {
- BN_ULONG t1,t2,t3,t4;
-
- t1=ap[i+0];
- t2=ap[i+1];
- t3=ap[i+2];
- t4=ap[i+3];
- rp[i+0]=t1;
- rp[i+1]=t2;
- rp[i+2]=t3;
- rp[i+3]=t4;
- }
- al+=4;
- for (; i<al; i++)
- rp[i]=ap[i];
-
- if (BN_ucmp(ret, &(mont->N)) >= 0)
- {
- if (!BN_usub(ret,ret,&(mont->N))) return(0);
- }
+ if (bn_sub_words (rp,ap,np,nl)-carry)
+ memcpy(rp,ap,nl*sizeof(BN_ULONG));
#endif
+ bn_correct_top(r);
+ bn_correct_top(ret);
bn_check_top(ret);
return(1);
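The BRANCH_FREE path above turns the final conditional subtraction into a data-independent pointer blend: the borrow bit becomes an all-zero/all-one mask that selects between the reduced and unreduced values. A standalone sketch of the select, with uintptr_t standing in for PTR_SIZE_INT:

#include <stdint.h>
#include <stdio.h>

static const int *ct_select(const int *a, const int *b, uintptr_t borrow)
	{
	uintptr_t m = 0 - borrow;	/* 0 -> 0...0, 1 -> 1...1 */
	return (const int *)(((uintptr_t)a & ~m) | ((uintptr_t)b & m));
	}

int main(void)
	{
	int x = 1, y = 2;
	printf("%d %d\n", *ct_select(&x, &y, 0),
	                  *ct_select(&x, &y, 1));	/* 1 2 */
	return 0;
	}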
diff --git a/main/openssl/crypto/bn/bn_nist.c b/main/openssl/crypto/bn/bn_nist.c
index c6de0326..e22968d4 100644
--- a/main/openssl/crypto/bn/bn_nist.c
+++ b/main/openssl/crypto/bn/bn_nist.c
@@ -286,26 +286,25 @@ const BIGNUM *BN_get0_nist_prime_521(void)
}
-static void nist_cp_bn_0(BN_ULONG *buf, BN_ULONG *a, int top, int max)
+static void nist_cp_bn_0(BN_ULONG *dst, const BN_ULONG *src, int top, int max)
{
int i;
- BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
#ifdef BN_DEBUG
OPENSSL_assert(top <= max);
#endif
- for (i = (top); i != 0; i--)
- *_tmp1++ = *_tmp2++;
- for (i = (max) - (top); i != 0; i--)
- *_tmp1++ = (BN_ULONG) 0;
+ for (i = 0; i < top; i++)
+ dst[i] = src[i];
+ for (; i < max; i++)
+ dst[i] = 0;
}
-static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
+static void nist_cp_bn(BN_ULONG *dst, const BN_ULONG *src, int top)
{
int i;
- BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
- for (i = (top); i != 0; i--)
- *_tmp1++ = *_tmp2++;
+
+ for (i = 0; i < top; i++)
+ dst[i] = src[i];
}
#if BN_BITS2 == 64
@@ -319,6 +318,13 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
:(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
#define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
#define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
+# if defined(L_ENDIAN)
+# if defined(__arch64__)
+# define NIST_INT64 long
+# else
+# define NIST_INT64 long long
+# endif
+# endif
#else
#define bn_cp_64(to, n, from, m) \
{ \
@@ -330,13 +336,15 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
bn_32_set_0(to, (n)*2); \
bn_32_set_0(to, (n)*2+1); \
}
-#if BN_BITS2 == 32
#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
#define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0;
-#endif
+# if defined(_WIN32) && !defined(__GNUC__)
+# define NIST_INT64 __int64
+# elif defined(BN_LLONG)
+# define NIST_INT64 long long
+# endif
#endif /* BN_BITS2 != 64 */
-
#define nist_set_192(to, from, a1, a2, a3) \
{ \
bn_cp_64(to, 0, from, (a3) - 3) \
@@ -350,9 +358,11 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int top = a->top, i;
int carry;
register BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_192_TOP],
- buf[BN_NIST_192_TOP],
- c_d[BN_NIST_192_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_192_TOP];
+ unsigned int ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_192_TOP],
*res;
PTR_SIZE_INT mask;
static const BIGNUM _bignum_nist_p_192_sqr = {
@@ -385,15 +395,48 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[3*2-6];
+ acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[3*2-5];
+ acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[3*2-6];
+ acc += bp[4*2-6];
+ acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[3*2-5];
+ acc += bp[4*2-5];
+ acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[4*2-6];
+ acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[4*2-5];
+ acc += bp[5*2-5]; rp[5] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_192_TOP];
- nist_set_192(t_d, buf, 0, 3, 3);
+ nist_set_192(t_d, buf.bn, 0, 3, 3);
carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- nist_set_192(t_d, buf, 4, 4, 0);
+ nist_set_192(t_d, buf.bn, 4, 4, 0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- nist_set_192(t_d, buf, 5, 5, 5)
+ nist_set_192(t_d, buf.bn, 5, 5, 5)
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-
+ }
+#endif
if (carry > 0)
carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
else
@@ -407,8 +450,9 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
*/
mask = 0-(PTR_SIZE_INT)bn_sub_words(c_d,r_d,_nist_p_192[0],BN_NIST_192_TOP);
mask &= 0-(PTR_SIZE_INT)carry;
+ res = c_d;
res = (BN_ULONG *)
- (((PTR_SIZE_INT)c_d&~mask) | ((PTR_SIZE_INT)r_d&mask));
+ (((PTR_SIZE_INT)res&~mask) | ((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d, res, BN_NIST_192_TOP);
r->top = BN_NIST_192_TOP;
bn_correct_top(r);
@@ -435,9 +479,11 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int top = a->top, i;
int carry;
BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_224_TOP],
- buf[BN_NIST_224_TOP],
- c_d[BN_NIST_224_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_224_TOP];
+ unsigned int ui[BN_NIST_224_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_224_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -474,26 +520,68 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
#if BN_BITS2==64
/* copy upper 256 bits of 448 bit number ... */
- nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
+ nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
/* ... and right shift by 32 to obtain upper 224 bits */
- nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8);
+ nist_set_224(buf.bn, c_d, 14, 13, 12, 11, 10, 9, 8);
/* truncate lower part to 224 bits too */
r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
#else
- nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
#endif
- nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
+
+#if defined(NIST_INT64) && BN_BITS2!=64
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc -= bp[7-7];
+ acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc -= bp[8-7];
+ acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc -= bp[9-7];
+ acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[7-7];
+ acc += bp[11-7];
+ acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32;
+
+ acc += rp[4]; acc += bp[8-7];
+ acc += bp[12-7];
+ acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[9-7];
+ acc += bp[13-7];
+ acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[10-7];
+ acc -= bp[13-7]; rp[6] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+# if BN_BITS2==64
+ rp[7] = carry;
+# endif
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_224_TOP];
+
+ nist_set_224(t_d, buf.bn, 10, 9, 8, 7, 0, 0, 0);
carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
- nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
+ nist_set_224(t_d, buf.bn, 0, 13, 12, 11, 0, 0, 0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
- nist_set_224(t_d, buf, 13, 12, 11, 10, 9, 8, 7);
+ nist_set_224(t_d, buf.bn, 13, 12, 11, 10, 9, 8, 7);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
- nist_set_224(t_d, buf, 0, 0, 0, 0, 13, 12, 11);
+ nist_set_224(t_d, buf.bn, 0, 0, 0, 0, 13, 12, 11);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
#if BN_BITS2==64
carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
#endif
+ }
+#endif
u.f = bn_sub_words;
if (carry > 0)
{
@@ -521,7 +609,8 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
/* otherwise it's effectively same as in BN_nist_mod_192... */
mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_224[0],BN_NIST_224_TOP);
mask &= 0-(PTR_SIZE_INT)carry;
- res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
+ res = c_d;
+ res = (BN_ULONG *)(((PTR_SIZE_INT)res&~mask) |
((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d, res, BN_NIST_224_TOP);
r->top = BN_NIST_224_TOP;
@@ -548,9 +637,11 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int i, top = a->top;
int carry = 0;
register BN_ULONG *a_d = a->d, *r_d;
- BN_ULONG t_d[BN_NIST_256_TOP],
- buf[BN_NIST_256_TOP],
- c_d[BN_NIST_256_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_256_TOP];
+ unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_256_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -584,12 +675,87 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[8-8];
+ acc += bp[9-8];
+ acc -= bp[11-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[9-8];
+ acc += bp[10-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8];
+ acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[10-8];
+ acc += bp[11-8];
+ acc -= bp[13-8];
+ acc -= bp[14-8];
+ acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[11-8];
+ acc += bp[11-8];
+ acc += bp[12-8];
+ acc += bp[12-8];
+ acc += bp[13-8];
+ acc -= bp[15-8];
+ acc -= bp[8-8];
+ acc -= bp[9-8]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[12-8];
+ acc += bp[12-8];
+ acc += bp[13-8];
+ acc += bp[13-8];
+ acc += bp[14-8];
+ acc -= bp[9-8];
+ acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[13-8];
+ acc += bp[13-8];
+ acc += bp[14-8];
+ acc += bp[14-8];
+ acc += bp[15-8];
+ acc -= bp[10-8];
+ acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[14-8];
+ acc += bp[14-8];
+ acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[14-8];
+ acc += bp[13-8];
+ acc -= bp[8-8];
+ acc -= bp[9-8]; rp[6] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[7]; acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[15-8];
+ acc += bp[8 -8];
+ acc -= bp[10-8];
+ acc -= bp[11-8];
+ acc -= bp[12-8];
+ acc -= bp[13-8]; rp[7] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_256_TOP];
/*S1*/
- nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
+ nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0);
/*S2*/
- nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0);
+ nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0);
carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
/* left shift */
{
@@ -607,24 +773,26 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
}
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S3*/
- nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
+ nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S4*/
- nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
+ nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D1*/
- nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
+ nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D2*/
- nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+ nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D3*/
- nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+ nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D4*/
- nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
+ nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
+ }
+#endif
/* see BN_nist_mod_224 for explanation */
u.f = bn_sub_words;
if (carry > 0)
@@ -641,7 +809,8 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_256[0],BN_NIST_256_TOP);
mask &= 0-(PTR_SIZE_INT)carry;
- res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
+ res = c_d;
+ res = (BN_ULONG *)(((PTR_SIZE_INT)res&~mask) |
((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d, res, BN_NIST_256_TOP);
r->top = BN_NIST_256_TOP;
@@ -672,9 +841,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
int i, top = a->top;
int carry = 0;
register BN_ULONG *r_d, *a_d = a->d;
- BN_ULONG t_d[BN_NIST_384_TOP],
- buf[BN_NIST_384_TOP],
- c_d[BN_NIST_384_TOP],
+ union {
+ BN_ULONG bn[BN_NIST_384_TOP];
+ unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+ } buf;
+ BN_ULONG c_d[BN_NIST_384_TOP],
*res;
PTR_SIZE_INT mask;
union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -709,10 +880,100 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
else
r_d = a_d;
- nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+ nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+
+#if defined(NIST_INT64)
+ {
+ NIST_INT64 acc; /* accumulator */
+ unsigned int *rp=(unsigned int *)r_d;
+ const unsigned int *bp=(const unsigned int *)buf.ui;
+
+ acc = rp[0]; acc += bp[12-12];
+ acc += bp[21-12];
+ acc += bp[20-12];
+ acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[1]; acc += bp[13-12];
+ acc += bp[22-12];
+ acc += bp[23-12];
+ acc -= bp[12-12];
+ acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[2]; acc += bp[14-12];
+ acc += bp[23-12];
+ acc -= bp[13-12];
+ acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[3]; acc += bp[15-12];
+ acc += bp[12-12];
+ acc += bp[20-12];
+ acc += bp[21-12];
+ acc -= bp[14-12];
+ acc -= bp[22-12];
+ acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[4]; acc += bp[21-12];
+ acc += bp[21-12];
+ acc += bp[16-12];
+ acc += bp[13-12];
+ acc += bp[12-12];
+ acc += bp[20-12];
+ acc += bp[22-12];
+ acc -= bp[15-12];
+ acc -= bp[23-12];
+ acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[5]; acc += bp[22-12];
+ acc += bp[22-12];
+ acc += bp[17-12];
+ acc += bp[14-12];
+ acc += bp[13-12];
+ acc += bp[21-12];
+ acc += bp[23-12];
+ acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[6]; acc += bp[23-12];
+ acc += bp[23-12];
+ acc += bp[18-12];
+ acc += bp[15-12];
+ acc += bp[14-12];
+ acc += bp[22-12];
+ acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[7]; acc += bp[19-12];
+ acc += bp[16-12];
+ acc += bp[15-12];
+ acc += bp[23-12];
+ acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[8]; acc += bp[20-12];
+ acc += bp[17-12];
+ acc += bp[16-12];
+ acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[9]; acc += bp[21-12];
+ acc += bp[18-12];
+ acc += bp[17-12];
+ acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[10]; acc += bp[22-12];
+ acc += bp[19-12];
+ acc += bp[18-12];
+ acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32;
+
+ acc += rp[11]; acc += bp[23-12];
+ acc += bp[20-12];
+ acc += bp[19-12];
+ acc -= bp[22-12]; rp[11] = (unsigned int)acc;
+
+ carry = (int)(acc>>32);
+ }
+#else
+ {
+ BN_ULONG t_d[BN_NIST_384_TOP];
/*S1*/
- nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
+ nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
/* left shift */
{
register BN_ULONG *ap,t,c;
@@ -729,29 +990,31 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2),
t_d, BN_NIST_256_TOP);
/*S2 */
- carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP);
+ carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP);
/*S3*/
- nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
+ nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S4*/
- nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
+ nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S5*/
- nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+ nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S6*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20);
carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D1*/
- nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
+ nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D2*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D3*/
- nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
+ nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0);
carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
+ }
+#endif
/* see BN_nist_mod_224 for explanation */
u.f = bn_sub_words;
if (carry > 0)
@@ -768,7 +1031,8 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
mask = 0-(PTR_SIZE_INT)(*u.f)(c_d,r_d,_nist_p_384[0],BN_NIST_384_TOP);
mask &= 0-(PTR_SIZE_INT)carry;
- res = (BN_ULONG *)(((PTR_SIZE_INT)c_d&~mask) |
+ res = c_d;
+ res = (BN_ULONG *)(((PTR_SIZE_INT)res&~mask) |
((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d, res, BN_NIST_384_TOP);
r->top = BN_NIST_384_TOP;
@@ -834,7 +1098,8 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
bn_add_words(r_d,r_d,t_d,BN_NIST_521_TOP);
mask = 0-(PTR_SIZE_INT)bn_sub_words(t_d,r_d,_nist_p_521,BN_NIST_521_TOP);
- res = (BN_ULONG *)(((PTR_SIZE_INT)t_d&~mask) |
+ res = t_d;
+ res = (BN_ULONG *)(((PTR_SIZE_INT)res&~mask) |
((PTR_SIZE_INT)r_d&mask));
nist_cp_bn(r_d,res,BN_NIST_521_TOP);
r->top = BN_NIST_521_TOP;
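The NIST_INT64 fast paths added above all follow one pattern: treat the operands as 32-bit limbs, add and subtract limbs into a signed 64-bit accumulator, and propagate the (possibly negative) carry with an arithmetic right shift. A minimal illustration of one carry step:

#include <stdint.h>
#include <stdio.h>

int main(void)
	{
	uint32_t r[2] = { 0xffffffffu, 0 };
	int64_t acc;

	acc = r[0];
	acc += 5;			/* overflows 32 bits */
	r[0] = (uint32_t)acc;
	acc >>= 32;			/* carry = +1 */
	acc += r[1];
	r[1] = (uint32_t)acc;
	printf("%08x %08x carry=%d\n", r[1], r[0], (int)(acc >> 32));
	return 0;
	}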
diff --git a/main/openssl/crypto/bn/bn_print.c b/main/openssl/crypto/bn/bn_print.c
index bebb466d..1743b6a7 100644
--- a/main/openssl/crypto/bn/bn_print.c
+++ b/main/openssl/crypto/bn/bn_print.c
@@ -357,3 +357,22 @@ end:
return(ret);
}
#endif
+
+char *BN_options(void)
+ {
+ static int init=0;
+ static char data[16];
+
+ if (!init)
+ {
+ init++;
+#ifdef BN_LLONG
+ BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+ (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
+#else
+ BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+ (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
+#endif
+ }
+ return(data);
+ }
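
BN_options() simply reports the word sizes the library was compiled with
(the #ifdef picks BN_ULLONG only when BN_LLONG is defined). A hypothetical
use, where the exact string depends on the build:

    #include <stdio.h>
    #include <openssl/bn.h>

    int main(void)
        {
        /* Prints e.g. "bn(64,32)" on a 32-bit BN_LLONG build, or
         * "bn(64,64)" on a typical 64-bit build. */
        printf("%s\n", BN_options());
        return 0;
        }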
diff --git a/main/openssl/crypto/bn/bn_rand.c b/main/openssl/crypto/bn/bn_rand.c
index b376c28f..55676f07 100644
--- a/main/openssl/crypto/bn/bn_rand.c
+++ b/main/openssl/crypto/bn/bn_rand.c
@@ -114,6 +114,7 @@
#include "cryptlib.h"
#include "bn_lcl.h"
#include <openssl/rand.h>
+#include <openssl/sha.h>
static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
{
@@ -303,3 +304,72 @@ int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
{
return bn_rand_range(1, r, range);
}
+
+#ifndef OPENSSL_NO_SHA512
+/* BN_generate_dsa_nonce generates a random number 0 <= out < range. Unlike
+ * BN_rand_range, it also includes the contents of |priv| and |message| in the
+ * generation so that an RNG failure isn't fatal as long as |priv| remains
+ * secret. This is intended for use in DSA and ECDSA where an RNG weakness
+ * leads directly to private key exposure unless this function is used. */
+int BN_generate_dsa_nonce(BIGNUM *out, const BIGNUM *range, const BIGNUM* priv,
+ const unsigned char *message, size_t message_len,
+ BN_CTX *ctx)
+ {
+ SHA512_CTX sha;
+ /* We use 512 bits of random data per iteration to
+ * ensure that we have at least |range| bits of randomness. */
+ unsigned char random_bytes[64];
+ unsigned char digest[SHA512_DIGEST_LENGTH];
+ unsigned done, todo;
+ /* We generate |range|+8 bytes of random output. */
+ const unsigned num_k_bytes = BN_num_bytes(range) + 8;
+ unsigned char private_bytes[96];
+ unsigned char *k_bytes;
+ int ret = 0;
+
+ k_bytes = OPENSSL_malloc(num_k_bytes);
+ if (!k_bytes)
+ goto err;
+
+ /* We copy |priv| into a local buffer to avoid exposing its length. */
+ todo = sizeof(priv->d[0])*priv->top;
+ if (todo > sizeof(private_bytes))
+ {
+ /* No reasonable DSA or ECDSA key should have a private key
+ * this large and we don't handle this case in order to avoid
+ * leaking the length of the private key. */
+ BNerr(BN_F_BN_GENERATE_DSA_NONCE, BN_R_PRIVATE_KEY_TOO_LARGE);
+ goto err;
+ }
+ memcpy(private_bytes, priv->d, todo);
+ memset(private_bytes + todo, 0, sizeof(private_bytes) - todo);
+
+ for (done = 0; done < num_k_bytes;) {
+ if (RAND_bytes(random_bytes, sizeof(random_bytes)) != 1)
+ goto err;
+ SHA512_Init(&sha);
+ SHA512_Update(&sha, &done, sizeof(done));
+ SHA512_Update(&sha, private_bytes, sizeof(private_bytes));
+ SHA512_Update(&sha, message, message_len);
+ SHA512_Update(&sha, random_bytes, sizeof(random_bytes));
+ SHA512_Final(digest, &sha);
+
+ todo = num_k_bytes - done;
+ if (todo > SHA512_DIGEST_LENGTH)
+ todo = SHA512_DIGEST_LENGTH;
+ memcpy(k_bytes + done, digest, todo);
+ done += todo;
+ }
+
+ if (!BN_bin2bn(k_bytes, num_k_bytes, out))
+ goto err;
+ if (BN_mod(out, out, range, ctx) != 1)
+ goto err;
+ ret = 1;
+
+err:
+ if (k_bytes)
+ OPENSSL_free(k_bytes);
+ return ret;
+ }
+#endif /* OPENSSL_NO_SHA512 */
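
A minimal sketch of how an (EC)DSA signer might call the new function;
order, priv_key, digest and digest_len stand in for the caller's values and
are not part of this patch:

    /* Choose the per-signature nonce k, mixing the private key and the
     * message into the derivation so a weak RAND_bytes alone cannot
     * leak the key. Error handling trimmed for brevity. */
    BIGNUM *k = BN_new();
    BN_CTX *bnctx = BN_CTX_new();
    if (k && bnctx &&
        BN_generate_dsa_nonce(k, order, priv_key, digest, digest_len, bnctx))
        {
        /* use k as the nonce, then discard it */
        }
    BN_CTX_free(bnctx);
    BN_clear_free(k);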
diff --git a/main/openssl/crypto/bn/bn_shift.c b/main/openssl/crypto/bn/bn_shift.c
index c4d301af..a6fca2c4 100644
--- a/main/openssl/crypto/bn/bn_shift.c
+++ b/main/openssl/crypto/bn/bn_shift.c
@@ -99,7 +99,7 @@ int BN_lshift1(BIGNUM *r, const BIGNUM *a)
int BN_rshift1(BIGNUM *r, const BIGNUM *a)
{
BN_ULONG *ap,*rp,t,c;
- int i;
+ int i,j;
bn_check_top(r);
bn_check_top(a);
@@ -109,22 +109,25 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a)
BN_zero(r);
return(1);
}
+ i = a->top;
+ ap= a->d;
+ j = i-(ap[i-1]==1);
if (a != r)
{
- if (bn_wexpand(r,a->top) == NULL) return(0);
- r->top=a->top;
+ if (bn_wexpand(r,j) == NULL) return(0);
r->neg=a->neg;
}
- ap=a->d;
rp=r->d;
- c=0;
- for (i=a->top-1; i>=0; i--)
+ t=ap[--i];
+ c=(t&1)?BN_TBIT:0;
+ if (t>>=1) rp[i]=t;
+ while (i>0)
{
- t=ap[i];
+ t=ap[--i];
rp[i]=((t>>1)&BN_MASK2)|c;
c=(t&1)?BN_TBIT:0;
}
- bn_correct_top(r);
+ r->top=j;
bn_check_top(r);
return(1);
}
@@ -182,10 +185,11 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
BN_zero(r);
return(1);
}
+ i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2;
if (r != a)
{
r->neg=a->neg;
- if (bn_wexpand(r,a->top-nw+1) == NULL) return(0);
+ if (bn_wexpand(r,i) == NULL) return(0);
}
else
{
@@ -196,7 +200,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
f= &(a->d[nw]);
t=r->d;
j=a->top-nw;
- r->top=j;
+ r->top=i;
if (rb == 0)
{
@@ -212,9 +216,8 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
l= *(f++);
*(t++) =(tmp|(l<<lb))&BN_MASK2;
}
- *(t++) =(l>>rb)&BN_MASK2;
+ if ((l = (l>>rb)&BN_MASK2)) *(t) = l;
}
- bn_correct_top(r);
bn_check_top(r);
return(1);
}
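
The rewritten shifts size the result up front instead of trimming it
afterwards with bn_correct_top: for BN_rshift1 the word count only shrinks
when the top word is exactly 1, hence j = i-(ap[i-1]==1). A small
illustration, assuming 32-bit BN_ULONG:

    BIGNUM *a = BN_new(), *r = BN_new();
    BN_set_word(a, 1);
    BN_lshift(a, a, 32);   /* a = 2^32: d = {0, 1}, top = 2 */
    BN_rshift1(r, a);      /* r = 2^31: one word, so j = i - 1 */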
diff --git a/main/openssl/crypto/bn/bn_word.c b/main/openssl/crypto/bn/bn_word.c
index ee7b87c4..de83a15b 100644
--- a/main/openssl/crypto/bn/bn_word.c
+++ b/main/openssl/crypto/bn/bn_word.c
@@ -144,26 +144,17 @@ int BN_add_word(BIGNUM *a, BN_ULONG w)
a->neg=!(a->neg);
return(i);
}
- /* Only expand (and risk failing) if it's possibly necessary */
- if (((BN_ULONG)(a->d[a->top - 1] + 1) == 0) &&
- (bn_wexpand(a,a->top+1) == NULL))
- return(0);
- i=0;
- for (;;)
+ for (i=0;w!=0 && i<a->top;i++)
{
- if (i >= a->top)
- l=w;
- else
- l=(a->d[i]+w)&BN_MASK2;
- a->d[i]=l;
- if (w > l)
- w=1;
- else
- break;
- i++;
+ a->d[i] = l = (a->d[i]+w)&BN_MASK2;
+ w = (w>l)?1:0;
}
- if (i >= a->top)
+ if (w && i==a->top)
+ {
+ if (bn_wexpand(a,a->top+1) == NULL) return 0;
a->top++;
+ a->d[i]=w;
+ }
bn_check_top(a);
return(1);
}
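
The reworked BN_add_word only expands once the carry has rippled past every
existing word. An illustration; all-ones words force the worst case:

    BIGNUM *a = BN_new();
    BN_set_word(a, 1);
    BN_lshift(a, a, 128);
    BN_sub_word(a, 1);     /* a = 2^128 - 1: every word is all-ones */
    BN_add_word(a, 1);     /* the carry propagates through each word; w is
                            * still 1 when i == a->top, so bn_wexpand adds
                            * the new top word and a becomes 2^128 */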
diff --git a/main/openssl/crypto/bn/bntest.c b/main/openssl/crypto/bn/bntest.c
index 0cd99c5b..06f5954a 100644
--- a/main/openssl/crypto/bn/bntest.c
+++ b/main/openssl/crypto/bn/bntest.c
@@ -262,7 +262,7 @@ int main(int argc, char *argv[])
message(out,"BN_mod_sqrt");
if (!test_sqrt(out,ctx)) goto err;
(void)BIO_flush(out);
-
+#ifndef OPENSSL_NO_EC2M
message(out,"BN_GF2m_add");
if (!test_gf2m_add(out)) goto err;
(void)BIO_flush(out);
@@ -298,7 +298,7 @@ int main(int argc, char *argv[])
message(out,"BN_GF2m_mod_solve_quad");
if (!test_gf2m_mod_solve_quad(out,ctx)) goto err;
(void)BIO_flush(out);
-
+#endif
BN_CTX_free(ctx);
BIO_free(out);
@@ -1061,7 +1061,7 @@ int test_exp(BIO *bp, BN_CTX *ctx)
BN_free(one);
return(1);
}
-
+#ifndef OPENSSL_NO_EC2M
int test_gf2m_add(BIO *bp)
{
BIGNUM a,b,c;
@@ -1636,7 +1636,7 @@ int test_gf2m_mod_solve_quad(BIO *bp,BN_CTX *ctx)
BN_free(e);
return ret;
}
-
+#endif
static int genprime_cb(int p, int n, BN_GENCB *arg)
{
char c='*';
diff --git a/main/openssl/crypto/buffer/buf_str.c b/main/openssl/crypto/buffer/buf_str.c
new file mode 100644
index 00000000..151f5ea9
--- /dev/null
+++ b/main/openssl/crypto/buffer/buf_str.c
@@ -0,0 +1,119 @@
+/* crypto/buffer/buffer.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/buffer.h>
+
+char *BUF_strdup(const char *str)
+ {
+ if (str == NULL) return(NULL);
+ return BUF_strndup(str, strlen(str));
+ }
+
+char *BUF_strndup(const char *str, size_t siz)
+ {
+ char *ret;
+
+ if (str == NULL) return(NULL);
+
+ ret=OPENSSL_malloc(siz+1);
+ if (ret == NULL)
+ {
+ BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
+ return(NULL);
+ }
+ BUF_strlcpy(ret,str,siz+1);
+ return(ret);
+ }
+
+void *BUF_memdup(const void *data, size_t siz)
+ {
+ void *ret;
+
+ if (data == NULL) return(NULL);
+
+ ret=OPENSSL_malloc(siz);
+ if (ret == NULL)
+ {
+ BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
+ return(NULL);
+ }
+ return memcpy(ret, data, siz);
+ }
+
+size_t BUF_strlcpy(char *dst, const char *src, size_t size)
+ {
+ size_t l = 0;
+ for(; size > 1 && *src; size--)
+ {
+ *dst++ = *src++;
+ l++;
+ }
+ if (size)
+ *dst = '\0';
+ return l + strlen(src);
+ }
+
+size_t BUF_strlcat(char *dst, const char *src, size_t size)
+ {
+ size_t l = 0;
+ for(; size > 0 && *dst; size--, dst++)
+ l++;
+ return l + BUF_strlcpy(dst, src, size);
+ }
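
Like BSD strlcpy/strlcat, these return the length the untruncated result
would have had, so truncation is detected by comparing against the buffer
size. A short example:

    char buf[8];
    size_t n = BUF_strlcpy(buf, "a longer string", sizeof(buf));
    /* buf now holds "a longe" plus NUL; n == 15, the full source
     * length, so n >= sizeof(buf) signals truncation. */
    if (n >= sizeof(buf))
        { /* handle truncation */ }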
diff --git a/main/openssl/crypto/buffer/buffer.c b/main/openssl/crypto/buffer/buffer.c
index 620ea8d5..d4a4ce43 100644
--- a/main/openssl/crypto/buffer/buffer.c
+++ b/main/openssl/crypto/buffer/buffer.c
@@ -60,6 +60,11 @@
#include "cryptlib.h"
#include <openssl/buffer.h>
+/* LIMIT_BEFORE_EXPANSION is the maximum n such that (n+3)/3*4 < 2**31. That
+ * expression is applied in several functions in this file and this limit
+ * ensures that the result fits in an int. */
+#define LIMIT_BEFORE_EXPANSION 0x5ffffffc
+
BUF_MEM *BUF_MEM_new(void)
{
BUF_MEM *ret;
@@ -105,6 +110,12 @@ int BUF_MEM_grow(BUF_MEM *str, size_t len)
str->length=len;
return(len);
}
+ /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+ if (len > LIMIT_BEFORE_EXPANSION)
+ {
+ BUFerr(BUF_F_BUF_MEM_GROW,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
n=(len+3)/3*4;
if (str->data == NULL)
ret=OPENSSL_malloc(n);
@@ -142,6 +153,12 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
str->length=len;
return(len);
}
+ /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+ if (len > LIMIT_BEFORE_EXPANSION)
+ {
+ BUFerr(BUF_F_BUF_MEM_GROW_CLEAN,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
n=(len+3)/3*4;
if (str->data == NULL)
ret=OPENSSL_malloc(n);
@@ -162,72 +179,14 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
return(len);
}
-char *BUF_strdup(const char *str)
- {
- if (str == NULL) return(NULL);
- return BUF_strndup(str, strlen(str));
- }
-
-char *BUF_strndup(const char *str, size_t siz)
- {
- char *ret;
-
- if (str == NULL) return(NULL);
-
- ret=OPENSSL_malloc(siz+1);
- if (ret == NULL)
- {
- BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
- return(NULL);
- }
- BUF_strlcpy(ret,str,siz+1);
- return(ret);
- }
-
-void *BUF_memdup(const void *data, size_t siz)
- {
- void *ret;
-
- if (data == NULL) return(NULL);
-
- ret=OPENSSL_malloc(siz);
- if (ret == NULL)
- {
- BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
- return(NULL);
- }
- return memcpy(ret, data, siz);
- }
-
-size_t BUF_strlcpy(char *dst, const char *src, size_t size)
- {
- size_t l = 0;
- for(; size > 1 && *src; size--)
- {
- *dst++ = *src++;
- l++;
- }
- if (size)
- *dst = '\0';
- return l + strlen(src);
- }
-
-size_t BUF_strlcat(char *dst, const char *src, size_t size)
- {
- size_t l = 0;
- for(; size > 0 && *dst; size--, dst++)
- l++;
- return l + BUF_strlcpy(dst, src, size);
- }
-
-void BUF_reverse(unsigned char *out, unsigned char *in, size_t size)
+void BUF_reverse(unsigned char *out, const unsigned char *in, size_t size)
{
size_t i;
if (in)
{
out += size - 1;
for (i = 0; i < size; i++)
- *in++ = *out--;
+ *out-- = *in++;
}
else
{
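
A quick check of the new bound: the growth target (len+3)/3*4 still fits in
a signed int at LIMIT_BEFORE_EXPANSION and would reach 2**31 one byte above
it, so the limit is exact:

    #include <assert.h>

    int main(void)
        {
        unsigned long n = 0x5ffffffcUL;             /* LIMIT_BEFORE_EXPANSION */
        assert((n + 3) / 3 * 4 == 0x7ffffffcUL);    /* largest value < 2**31 */
        assert((n + 4) / 3 * 4 == 0x80000000UL);    /* n+1 would hit 2**31 */
        return 0;
        }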
diff --git a/main/openssl/crypto/buffer/buffer.h b/main/openssl/crypto/buffer/buffer.h
index 178e4182..f8da32b4 100644
--- a/main/openssl/crypto/buffer/buffer.h
+++ b/main/openssl/crypto/buffer/buffer.h
@@ -88,7 +88,7 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len);
char * BUF_strdup(const char *str);
char * BUF_strndup(const char *str, size_t siz);
void * BUF_memdup(const void *data, size_t siz);
-void BUF_reverse(unsigned char *out, unsigned char *in, size_t siz);
+void BUF_reverse(unsigned char *out, const unsigned char *in, size_t siz);
/* safe string functions */
size_t BUF_strlcpy(char *dst,const char *src,size_t siz);
diff --git a/main/openssl/crypto/cmac/cm_ameth.c b/main/openssl/crypto/cmac/cm_ameth.c
new file mode 100644
index 00000000..0b8e5670
--- /dev/null
+++ b/main/openssl/crypto/cmac/cm_ameth.c
@@ -0,0 +1,97 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "asn1_locl.h"
+
+/* CMAC "ASN1" method. This is just here to indicate the
+ * maximum CMAC output length and to free up a CMAC
+ * key.
+ */
+
+static int cmac_size(const EVP_PKEY *pkey)
+ {
+ return EVP_MAX_BLOCK_LENGTH;
+ }
+
+static void cmac_key_free(EVP_PKEY *pkey)
+ {
+ CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr;
+ if (cmctx)
+ CMAC_CTX_free(cmctx);
+ }
+
+const EVP_PKEY_ASN1_METHOD cmac_asn1_meth =
+ {
+ EVP_PKEY_CMAC,
+ EVP_PKEY_CMAC,
+ 0,
+
+ "CMAC",
+ "OpenSSL CMAC method",
+
+ 0,0,0,0,
+
+ 0,0,0,
+
+ cmac_size,
+ 0,
+ 0,0,0,0,0,0,0,
+
+ cmac_key_free,
+ 0,
+ 0,0
+ };
+
diff --git a/main/openssl/crypto/cmac/cm_pmeth.c b/main/openssl/crypto/cmac/cm_pmeth.c
new file mode 100644
index 00000000..072228ec
--- /dev/null
+++ b/main/openssl/crypto/cmac/cm_pmeth.c
@@ -0,0 +1,224 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "evp_locl.h"
+
+/* The context structure and "key" are simply a CMAC_CTX */
+
+static int pkey_cmac_init(EVP_PKEY_CTX *ctx)
+ {
+ ctx->data = CMAC_CTX_new();
+ if (!ctx->data)
+ return 0;
+ ctx->keygen_info_count = 0;
+ return 1;
+ }
+
+static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
+ {
+ if (!pkey_cmac_init(dst))
+ return 0;
+ if (!CMAC_CTX_copy(dst->data, src->data))
+ return 0;
+ return 1;
+ }
+
+static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx)
+ {
+ CMAC_CTX_free(ctx->data);
+ }
+
+static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
+ {
+ CMAC_CTX *cmkey = CMAC_CTX_new();
+ CMAC_CTX *cmctx = ctx->data;
+ if (!cmkey)
+ return 0;
+ if (!CMAC_CTX_copy(cmkey, cmctx))
+ {
+ CMAC_CTX_free(cmkey);
+ return 0;
+ }
+ EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey);
+
+ return 1;
+ }
+
+static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
+ {
+ if (!CMAC_Update(ctx->pctx->data, data, count))
+ return 0;
+ return 1;
+ }
+
+static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx)
+ {
+ EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT);
+ mctx->update = int_update;
+ return 1;
+ }
+
+static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
+ EVP_MD_CTX *mctx)
+ {
+ return CMAC_Final(ctx->data, sig, siglen);
+ }
+
+static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
+ {
+ CMAC_CTX *cmctx = ctx->data;
+ switch (type)
+ {
+
+ case EVP_PKEY_CTRL_SET_MAC_KEY:
+ if (!p2 || p1 < 0)
+ return 0;
+ if (!CMAC_Init(cmctx, p2, p1, NULL, NULL))
+ return 0;
+ break;
+
+ case EVP_PKEY_CTRL_CIPHER:
+ if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine))
+ return 0;
+ break;
+
+ case EVP_PKEY_CTRL_MD:
+ if (ctx->pkey && !CMAC_CTX_copy(ctx->data,
+ (CMAC_CTX *)ctx->pkey->pkey.ptr))
+ return 0;
+ if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL))
+ return 0;
+ break;
+
+ default:
+ return -2;
+
+ }
+ return 1;
+ }
+
+static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx,
+ const char *type, const char *value)
+ {
+ if (!value)
+ {
+ return 0;
+ }
+ if (!strcmp(type, "key"))
+ {
+ void *p = (void *)value;
+ return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY,
+ strlen(p), p);
+ }
+ if (!strcmp(type, "cipher"))
+ {
+ const EVP_CIPHER *c;
+ c = EVP_get_cipherbyname(value);
+ if (!c)
+ return 0;
+ return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c);
+ }
+ if (!strcmp(type, "hexkey"))
+ {
+ unsigned char *key;
+ int r;
+ long keylen;
+ key = string_to_hex(value, &keylen);
+ if (!key)
+ return 0;
+ r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key);
+ OPENSSL_free(key);
+ return r;
+ }
+ return -2;
+ }
+
+const EVP_PKEY_METHOD cmac_pkey_meth =
+ {
+ EVP_PKEY_CMAC,
+ EVP_PKEY_FLAG_SIGCTX_CUSTOM,
+ pkey_cmac_init,
+ pkey_cmac_copy,
+ pkey_cmac_cleanup,
+
+ 0, 0,
+
+ 0,
+ pkey_cmac_keygen,
+
+ 0, 0,
+
+ 0, 0,
+
+ 0,0,
+
+ cmac_signctx_init,
+ cmac_signctx,
+
+ 0,0,
+
+ 0,0,
+
+ 0,0,
+
+ 0,0,
+
+ pkey_cmac_ctrl,
+ pkey_cmac_ctrl_str
+
+ };
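
The ctrl strings registered in pkey_cmac_ctrl_str() are reachable through
the generic EVP_PKEY_CTX interface; a sketch, assuming the CMAC method is
linked in and with error checks trimmed:

    /* Build an EVP_PKEY-level CMAC key; keygen ends up in
     * pkey_cmac_keygen() above. */
    EVP_PKEY *pkey = NULL;
    EVP_PKEY_CTX *kctx = EVP_PKEY_CTX_new_id(EVP_PKEY_CMAC, NULL);
    EVP_PKEY_keygen_init(kctx);
    EVP_PKEY_CTX_ctrl_str(kctx, "cipher", "aes-128-cbc");
    EVP_PKEY_CTX_ctrl_str(kctx, "hexkey",
                          "000102030405060708090a0b0c0d0e0f");
    EVP_PKEY_keygen(kctx, &pkey);
    EVP_PKEY_CTX_free(kctx);
    /* pkey is now usable with EVP_DigestSignInit()/EVP_DigestSignFinal(). */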
diff --git a/main/openssl/crypto/cmac/cmac.c b/main/openssl/crypto/cmac/cmac.c
new file mode 100644
index 00000000..8b72b096
--- /dev/null
+++ b/main/openssl/crypto/cmac/cmac.c
@@ -0,0 +1,308 @@
+/* crypto/cmac/cmac.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "cryptlib.h"
+#include <openssl/cmac.h>
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
+struct CMAC_CTX_st
+ {
+ /* Cipher context to use */
+ EVP_CIPHER_CTX cctx;
+ /* Keys k1 and k2 */
+ unsigned char k1[EVP_MAX_BLOCK_LENGTH];
+ unsigned char k2[EVP_MAX_BLOCK_LENGTH];
+ /* Temporary block */
+ unsigned char tbl[EVP_MAX_BLOCK_LENGTH];
+ /* Last (possibly partial) block */
+ unsigned char last_block[EVP_MAX_BLOCK_LENGTH];
+ /* Number of bytes in last block: -1 means context not initialised */
+ int nlast_block;
+ };
+
+
+/* Make temporary keys K1 and K2 */
+
+static void make_kn(unsigned char *k1, unsigned char *l, int bl)
+ {
+ int i;
+ /* Shift block to left, including carry */
+ for (i = 0; i < bl; i++)
+ {
+ k1[i] = l[i] << 1;
+ if (i < bl - 1 && l[i + 1] & 0x80)
+ k1[i] |= 1;
+ }
+ /* If MSB set fixup with R */
+ if (l[0] & 0x80)
+ k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b;
+ }
+
+CMAC_CTX *CMAC_CTX_new(void)
+ {
+ CMAC_CTX *ctx;
+ ctx = OPENSSL_malloc(sizeof(CMAC_CTX));
+ if (!ctx)
+ return NULL;
+ EVP_CIPHER_CTX_init(&ctx->cctx);
+ ctx->nlast_block = -1;
+ return ctx;
+ }
+
+void CMAC_CTX_cleanup(CMAC_CTX *ctx)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ {
+ FIPS_cmac_ctx_cleanup(ctx);
+ return;
+ }
+#endif
+ EVP_CIPHER_CTX_cleanup(&ctx->cctx);
+ OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH);
+ OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH);
+ ctx->nlast_block = -1;
+ }
+
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx)
+ {
+ return &ctx->cctx;
+ }
+
+void CMAC_CTX_free(CMAC_CTX *ctx)
+ {
+ CMAC_CTX_cleanup(ctx);
+ OPENSSL_free(ctx);
+ }
+
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
+ {
+ int bl;
+ if (in->nlast_block == -1)
+ return 0;
+ if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx))
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&in->cctx);
+ memcpy(out->k1, in->k1, bl);
+ memcpy(out->k2, in->k2, bl);
+ memcpy(out->tbl, in->tbl, bl);
+ memcpy(out->last_block, in->last_block, bl);
+ out->nlast_block = in->nlast_block;
+ return 1;
+ }
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
+ const EVP_CIPHER *cipher, ENGINE *impl)
+ {
+ static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+		/* If we have an ENGINE we need to allow non-FIPS */
+ if ((impl || ctx->cctx.engine)
+ && !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+
+ {
+ EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
+ return 0;
+ }
+ /* Other algorithm blocking will be done in FIPS_cmac_init,
+ * via FIPS_cipherinit().
+ */
+ if (!impl && !ctx->cctx.engine)
+ return FIPS_cmac_init(ctx, key, keylen, cipher, NULL);
+ }
+#endif
+ /* All zeros means restart */
+ if (!key && !cipher && !impl && keylen == 0)
+ {
+ /* Not initialised */
+ if (ctx->nlast_block == -1)
+ return 0;
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+ return 0;
+ memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx));
+ ctx->nlast_block = 0;
+ return 1;
+ }
+	/* Initialise context */
+ if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL))
+ return 0;
+ /* Non-NULL key means initialisation complete */
+ if (key)
+ {
+ int bl;
+ if (!EVP_CIPHER_CTX_cipher(&ctx->cctx))
+ return 0;
+ if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen))
+ return 0;
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv))
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl))
+ return 0;
+ make_kn(ctx->k1, ctx->tbl, bl);
+ make_kn(ctx->k2, ctx->k1, bl);
+ OPENSSL_cleanse(ctx->tbl, bl);
+ /* Reset context again ready for first data block */
+ if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+ return 0;
+ /* Zero tbl so resume works */
+ memset(ctx->tbl, 0, bl);
+ ctx->nlast_block = 0;
+ }
+ return 1;
+ }
+
+int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen)
+ {
+ const unsigned char *data = in;
+ size_t bl;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ return FIPS_cmac_update(ctx, in, dlen);
+#endif
+ if (ctx->nlast_block == -1)
+ return 0;
+ if (dlen == 0)
+ return 1;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ /* Copy into partial block if we need to */
+ if (ctx->nlast_block > 0)
+ {
+ size_t nleft;
+ nleft = bl - ctx->nlast_block;
+ if (dlen < nleft)
+ nleft = dlen;
+ memcpy(ctx->last_block + ctx->nlast_block, data, nleft);
+ dlen -= nleft;
+ ctx->nlast_block += nleft;
+ /* If no more to process return */
+ if (dlen == 0)
+ return 1;
+ data += nleft;
+ /* Else not final block so encrypt it */
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl))
+ return 0;
+ }
+ /* Encrypt all but one of the complete blocks left */
+ while(dlen > bl)
+ {
+ if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl))
+ return 0;
+ dlen -= bl;
+ data += bl;
+ }
+ /* Copy any data left to last block buffer */
+ memcpy(ctx->last_block, data, dlen);
+ ctx->nlast_block = dlen;
+ return 1;
+
+ }
+
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen)
+ {
+ int i, bl, lb;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->cctx.engine)
+ return FIPS_cmac_final(ctx, out, poutlen);
+#endif
+ if (ctx->nlast_block == -1)
+ return 0;
+ bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+ *poutlen = (size_t)bl;
+ if (!out)
+ return 1;
+ lb = ctx->nlast_block;
+ /* Is last block complete? */
+ if (lb == bl)
+ {
+ for (i = 0; i < bl; i++)
+ out[i] = ctx->last_block[i] ^ ctx->k1[i];
+ }
+ else
+ {
+ ctx->last_block[lb] = 0x80;
+ if (bl - lb > 1)
+ memset(ctx->last_block + lb + 1, 0, bl - lb - 1);
+ for (i = 0; i < bl; i++)
+ out[i] = ctx->last_block[i] ^ ctx->k2[i];
+ }
+ if (!EVP_Cipher(&ctx->cctx, out, out, bl))
+ {
+ OPENSSL_cleanse(out, bl);
+ return 0;
+ }
+ return 1;
+ }
+
+int CMAC_resume(CMAC_CTX *ctx)
+ {
+ if (ctx->nlast_block == -1)
+ return 0;
+	/* The buffer "tbl" contains the last fully encrypted block
+	 * which is the last IV (or all zeroes if no last encrypted block).
+	 * The last block has not been modified since CMAC_Final().
+	 * So reinitialising using the last encrypted block will allow
+	 * CMAC to continue after calling CMAC_Final().
+	 */
+ return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
+ }
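
A minimal sketch tying the whole API together: AES-128 CMAC over msg (key,
msg and msg_len are placeholders for the caller's data):

    unsigned char mac[EVP_MAX_BLOCK_LENGTH];
    size_t maclen;
    CMAC_CTX *cctx = CMAC_CTX_new();
    if (cctx
        && CMAC_Init(cctx, key, 16, EVP_aes_128_cbc(), NULL)
        && CMAC_Update(cctx, msg, msg_len)
        && CMAC_Final(cctx, mac, &maclen))
        {
        /* maclen == 16 for AES; mac holds the tag */
        }
    CMAC_CTX_free(cctx);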
diff --git a/main/openssl/crypto/cmac/cmac.h b/main/openssl/crypto/cmac/cmac.h
new file mode 100644
index 00000000..712e92dc
--- /dev/null
+++ b/main/openssl/crypto/cmac/cmac.h
@@ -0,0 +1,82 @@
+/* crypto/cmac/cmac.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMAC_H
+#define HEADER_CMAC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/evp.h>
+
+/* Opaque */
+typedef struct CMAC_CTX_st CMAC_CTX;
+
+CMAC_CTX *CMAC_CTX_new(void);
+void CMAC_CTX_cleanup(CMAC_CTX *ctx);
+void CMAC_CTX_free(CMAC_CTX *ctx);
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen,
+ const EVP_CIPHER *cipher, ENGINE *impl);
+int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
+int CMAC_resume(CMAC_CTX *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/main/openssl/crypto/cms/cms.h b/main/openssl/crypto/cms/cms.h
new file mode 100644
index 00000000..36994fa6
--- /dev/null
+++ b/main/openssl/crypto/cms/cms.h
@@ -0,0 +1,501 @@
+/* crypto/cms/cms.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMS_H
+#define HEADER_CMS_H
+
+#include <openssl/x509.h>
+
+#ifdef OPENSSL_NO_CMS
+#error CMS is disabled.
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct CMS_ContentInfo_st CMS_ContentInfo;
+typedef struct CMS_SignerInfo_st CMS_SignerInfo;
+typedef struct CMS_CertificateChoices CMS_CertificateChoices;
+typedef struct CMS_RevocationInfoChoice_st CMS_RevocationInfoChoice;
+typedef struct CMS_RecipientInfo_st CMS_RecipientInfo;
+typedef struct CMS_ReceiptRequest_st CMS_ReceiptRequest;
+typedef struct CMS_Receipt_st CMS_Receipt;
+
+DECLARE_STACK_OF(CMS_SignerInfo)
+DECLARE_STACK_OF(GENERAL_NAMES)
+DECLARE_ASN1_FUNCTIONS(CMS_ContentInfo)
+DECLARE_ASN1_FUNCTIONS(CMS_ReceiptRequest)
+DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo)
+
+#define CMS_SIGNERINFO_ISSUER_SERIAL 0
+#define CMS_SIGNERINFO_KEYIDENTIFIER 1
+
+#define CMS_RECIPINFO_TRANS 0
+#define CMS_RECIPINFO_AGREE 1
+#define CMS_RECIPINFO_KEK 2
+#define CMS_RECIPINFO_PASS 3
+#define CMS_RECIPINFO_OTHER 4
+
+/* S/MIME related flags */
+
+#define CMS_TEXT 0x1
+#define CMS_NOCERTS 0x2
+#define CMS_NO_CONTENT_VERIFY 0x4
+#define CMS_NO_ATTR_VERIFY 0x8
+#define CMS_NOSIGS \
+ (CMS_NO_CONTENT_VERIFY|CMS_NO_ATTR_VERIFY)
+#define CMS_NOINTERN 0x10
+#define CMS_NO_SIGNER_CERT_VERIFY 0x20
+#define CMS_NOVERIFY 0x20
+#define CMS_DETACHED 0x40
+#define CMS_BINARY 0x80
+#define CMS_NOATTR 0x100
+#define CMS_NOSMIMECAP 0x200
+#define CMS_NOOLDMIMETYPE 0x400
+#define CMS_CRLFEOL 0x800
+#define CMS_STREAM 0x1000
+#define CMS_NOCRL 0x2000
+#define CMS_PARTIAL 0x4000
+#define CMS_REUSE_DIGEST 0x8000
+#define CMS_USE_KEYID 0x10000
+#define CMS_DEBUG_DECRYPT 0x20000
+
+const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms);
+
+BIO *CMS_dataInit(CMS_ContentInfo *cms, BIO *icont);
+int CMS_dataFinal(CMS_ContentInfo *cms, BIO *bio);
+
+ASN1_OCTET_STRING **CMS_get0_content(CMS_ContentInfo *cms);
+int CMS_is_detached(CMS_ContentInfo *cms);
+int CMS_set_detached(CMS_ContentInfo *cms, int detached);
+
+#ifdef HEADER_PEM_H
+DECLARE_PEM_rw_const(CMS, CMS_ContentInfo)
+#endif
+
+int CMS_stream(unsigned char ***boundary, CMS_ContentInfo *cms);
+CMS_ContentInfo *d2i_CMS_bio(BIO *bp, CMS_ContentInfo **cms);
+int i2d_CMS_bio(BIO *bp, CMS_ContentInfo *cms);
+
+BIO *BIO_new_CMS(BIO *out, CMS_ContentInfo *cms);
+int i2d_CMS_bio_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags);
+int PEM_write_bio_CMS_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags);
+CMS_ContentInfo *SMIME_read_CMS(BIO *bio, BIO **bcont);
+int SMIME_write_CMS(BIO *bio, CMS_ContentInfo *cms, BIO *data, int flags);
+
+int CMS_final(CMS_ContentInfo *cms, BIO *data, BIO *dcont, unsigned int flags);
+
+CMS_ContentInfo *CMS_sign(X509 *signcert, EVP_PKEY *pkey, STACK_OF(X509) *certs,
+ BIO *data, unsigned int flags);
+
+CMS_ContentInfo *CMS_sign_receipt(CMS_SignerInfo *si,
+ X509 *signcert, EVP_PKEY *pkey,
+ STACK_OF(X509) *certs,
+ unsigned int flags);
+
+int CMS_data(CMS_ContentInfo *cms, BIO *out, unsigned int flags);
+CMS_ContentInfo *CMS_data_create(BIO *in, unsigned int flags);
+
+int CMS_digest_verify(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags);
+CMS_ContentInfo *CMS_digest_create(BIO *in, const EVP_MD *md,
+ unsigned int flags);
+
+int CMS_EncryptedData_decrypt(CMS_ContentInfo *cms,
+ const unsigned char *key, size_t keylen,
+ BIO *dcont, BIO *out, unsigned int flags);
+
+CMS_ContentInfo *CMS_EncryptedData_encrypt(BIO *in, const EVP_CIPHER *cipher,
+ const unsigned char *key, size_t keylen,
+ unsigned int flags);
+
+int CMS_EncryptedData_set1_key(CMS_ContentInfo *cms, const EVP_CIPHER *ciph,
+ const unsigned char *key, size_t keylen);
+
+int CMS_verify(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
+ X509_STORE *store, BIO *dcont, BIO *out, unsigned int flags);
+
+int CMS_verify_receipt(CMS_ContentInfo *rcms, CMS_ContentInfo *ocms,
+ STACK_OF(X509) *certs,
+ X509_STORE *store, unsigned int flags);
+
+STACK_OF(X509) *CMS_get0_signers(CMS_ContentInfo *cms);
+
+CMS_ContentInfo *CMS_encrypt(STACK_OF(X509) *certs, BIO *in,
+ const EVP_CIPHER *cipher, unsigned int flags);
+
+int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pkey, X509 *cert,
+ BIO *dcont, BIO *out,
+ unsigned int flags);
+
+int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert);
+int CMS_decrypt_set1_key(CMS_ContentInfo *cms,
+ unsigned char *key, size_t keylen,
+ unsigned char *id, size_t idlen);
+int CMS_decrypt_set1_password(CMS_ContentInfo *cms,
+ unsigned char *pass, ossl_ssize_t passlen);
+
+STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms);
+int CMS_RecipientInfo_type(CMS_RecipientInfo *ri);
+CMS_ContentInfo *CMS_EnvelopedData_create(const EVP_CIPHER *cipher);
+CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms,
+ X509 *recip, unsigned int flags);
+int CMS_RecipientInfo_set0_pkey(CMS_RecipientInfo *ri, EVP_PKEY *pkey);
+int CMS_RecipientInfo_ktri_cert_cmp(CMS_RecipientInfo *ri, X509 *cert);
+int CMS_RecipientInfo_ktri_get0_algs(CMS_RecipientInfo *ri,
+ EVP_PKEY **pk, X509 **recip,
+ X509_ALGOR **palg);
+int CMS_RecipientInfo_ktri_get0_signer_id(CMS_RecipientInfo *ri,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno);
+
+CMS_RecipientInfo *CMS_add0_recipient_key(CMS_ContentInfo *cms, int nid,
+ unsigned char *key, size_t keylen,
+ unsigned char *id, size_t idlen,
+ ASN1_GENERALIZEDTIME *date,
+ ASN1_OBJECT *otherTypeId,
+ ASN1_TYPE *otherType);
+
+int CMS_RecipientInfo_kekri_get0_id(CMS_RecipientInfo *ri,
+ X509_ALGOR **palg,
+ ASN1_OCTET_STRING **pid,
+ ASN1_GENERALIZEDTIME **pdate,
+ ASN1_OBJECT **potherid,
+ ASN1_TYPE **pothertype);
+
+int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri,
+ unsigned char *key, size_t keylen);
+
+int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri,
+ const unsigned char *id, size_t idlen);
+
+int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri,
+ unsigned char *pass,
+ ossl_ssize_t passlen);
+
+CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms,
+ int iter, int wrap_nid, int pbe_nid,
+ unsigned char *pass,
+ ossl_ssize_t passlen,
+ const EVP_CIPHER *kekciph);
+
+int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri);
+
+int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags);
+CMS_ContentInfo *CMS_compress(BIO *in, int comp_nid, unsigned int flags);
+
+int CMS_set1_eContentType(CMS_ContentInfo *cms, const ASN1_OBJECT *oid);
+const ASN1_OBJECT *CMS_get0_eContentType(CMS_ContentInfo *cms);
+
+CMS_CertificateChoices *CMS_add0_CertificateChoices(CMS_ContentInfo *cms);
+int CMS_add0_cert(CMS_ContentInfo *cms, X509 *cert);
+int CMS_add1_cert(CMS_ContentInfo *cms, X509 *cert);
+STACK_OF(X509) *CMS_get1_certs(CMS_ContentInfo *cms);
+
+CMS_RevocationInfoChoice *CMS_add0_RevocationInfoChoice(CMS_ContentInfo *cms);
+int CMS_add0_crl(CMS_ContentInfo *cms, X509_CRL *crl);
+int CMS_add1_crl(CMS_ContentInfo *cms, X509_CRL *crl);
+STACK_OF(X509_CRL) *CMS_get1_crls(CMS_ContentInfo *cms);
+
+int CMS_SignedData_init(CMS_ContentInfo *cms);
+CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
+ X509 *signer, EVP_PKEY *pk, const EVP_MD *md,
+ unsigned int flags);
+STACK_OF(CMS_SignerInfo) *CMS_get0_SignerInfos(CMS_ContentInfo *cms);
+
+void CMS_SignerInfo_set1_signer_cert(CMS_SignerInfo *si, X509 *signer);
+int CMS_SignerInfo_get0_signer_id(CMS_SignerInfo *si,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno);
+int CMS_SignerInfo_cert_cmp(CMS_SignerInfo *si, X509 *cert);
+int CMS_set1_signers_certs(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
+ unsigned int flags);
+void CMS_SignerInfo_get0_algs(CMS_SignerInfo *si, EVP_PKEY **pk, X509 **signer,
+ X509_ALGOR **pdig, X509_ALGOR **psig);
+int CMS_SignerInfo_sign(CMS_SignerInfo *si);
+int CMS_SignerInfo_verify(CMS_SignerInfo *si);
+int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain);
+
+int CMS_add_smimecap(CMS_SignerInfo *si, STACK_OF(X509_ALGOR) *algs);
+int CMS_add_simple_smimecap(STACK_OF(X509_ALGOR) **algs,
+ int algnid, int keysize);
+int CMS_add_standard_smimecap(STACK_OF(X509_ALGOR) **smcap);
+
+int CMS_signed_get_attr_count(const CMS_SignerInfo *si);
+int CMS_signed_get_attr_by_NID(const CMS_SignerInfo *si, int nid,
+ int lastpos);
+int CMS_signed_get_attr_by_OBJ(const CMS_SignerInfo *si, ASN1_OBJECT *obj,
+ int lastpos);
+X509_ATTRIBUTE *CMS_signed_get_attr(const CMS_SignerInfo *si, int loc);
+X509_ATTRIBUTE *CMS_signed_delete_attr(CMS_SignerInfo *si, int loc);
+int CMS_signed_add1_attr(CMS_SignerInfo *si, X509_ATTRIBUTE *attr);
+int CMS_signed_add1_attr_by_OBJ(CMS_SignerInfo *si,
+ const ASN1_OBJECT *obj, int type,
+ const void *bytes, int len);
+int CMS_signed_add1_attr_by_NID(CMS_SignerInfo *si,
+ int nid, int type,
+ const void *bytes, int len);
+int CMS_signed_add1_attr_by_txt(CMS_SignerInfo *si,
+ const char *attrname, int type,
+ const void *bytes, int len);
+void *CMS_signed_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
+ int lastpos, int type);
+
+int CMS_unsigned_get_attr_count(const CMS_SignerInfo *si);
+int CMS_unsigned_get_attr_by_NID(const CMS_SignerInfo *si, int nid,
+ int lastpos);
+int CMS_unsigned_get_attr_by_OBJ(const CMS_SignerInfo *si, ASN1_OBJECT *obj,
+ int lastpos);
+X509_ATTRIBUTE *CMS_unsigned_get_attr(const CMS_SignerInfo *si, int loc);
+X509_ATTRIBUTE *CMS_unsigned_delete_attr(CMS_SignerInfo *si, int loc);
+int CMS_unsigned_add1_attr(CMS_SignerInfo *si, X509_ATTRIBUTE *attr);
+int CMS_unsigned_add1_attr_by_OBJ(CMS_SignerInfo *si,
+ const ASN1_OBJECT *obj, int type,
+ const void *bytes, int len);
+int CMS_unsigned_add1_attr_by_NID(CMS_SignerInfo *si,
+ int nid, int type,
+ const void *bytes, int len);
+int CMS_unsigned_add1_attr_by_txt(CMS_SignerInfo *si,
+ const char *attrname, int type,
+ const void *bytes, int len);
+void *CMS_unsigned_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
+ int lastpos, int type);
+
+#ifdef HEADER_X509V3_H
+
+int CMS_get1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest **prr);
+CMS_ReceiptRequest *CMS_ReceiptRequest_create0(unsigned char *id, int idlen,
+ int allorfirst,
+ STACK_OF(GENERAL_NAMES) *receiptList,
+ STACK_OF(GENERAL_NAMES) *receiptsTo);
+int CMS_add1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest *rr);
+void CMS_ReceiptRequest_get0_values(CMS_ReceiptRequest *rr,
+ ASN1_STRING **pcid,
+ int *pallorfirst,
+ STACK_OF(GENERAL_NAMES) **plist,
+ STACK_OF(GENERAL_NAMES) **prto);
+
+#endif
+
+/* BEGIN ERROR CODES */
+/* The following lines are auto generated by the script mkerr.pl. Any changes
+ * made after this point may be overwritten when the script is next run.
+ */
+void ERR_load_CMS_strings(void);
+
+/* Error codes for the CMS functions. */
+
+/* Function codes. */
+#define CMS_F_CHECK_CONTENT 99
+#define CMS_F_CMS_ADD0_CERT 164
+#define CMS_F_CMS_ADD0_RECIPIENT_KEY 100
+#define CMS_F_CMS_ADD0_RECIPIENT_PASSWORD 165
+#define CMS_F_CMS_ADD1_RECEIPTREQUEST 158
+#define CMS_F_CMS_ADD1_RECIPIENT_CERT 101
+#define CMS_F_CMS_ADD1_SIGNER 102
+#define CMS_F_CMS_ADD1_SIGNINGTIME 103
+#define CMS_F_CMS_COMPRESS 104
+#define CMS_F_CMS_COMPRESSEDDATA_CREATE 105
+#define CMS_F_CMS_COMPRESSEDDATA_INIT_BIO 106
+#define CMS_F_CMS_COPY_CONTENT 107
+#define CMS_F_CMS_COPY_MESSAGEDIGEST 108
+#define CMS_F_CMS_DATA 109
+#define CMS_F_CMS_DATAFINAL 110
+#define CMS_F_CMS_DATAINIT 111
+#define CMS_F_CMS_DECRYPT 112
+#define CMS_F_CMS_DECRYPT_SET1_KEY 113
+#define CMS_F_CMS_DECRYPT_SET1_PASSWORD 166
+#define CMS_F_CMS_DECRYPT_SET1_PKEY 114
+#define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115
+#define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116
+#define CMS_F_CMS_DIGESTEDDATA_DO_FINAL 117
+#define CMS_F_CMS_DIGEST_VERIFY 118
+#define CMS_F_CMS_ENCODE_RECEIPT 161
+#define CMS_F_CMS_ENCRYPT 119
+#define CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO 120
+#define CMS_F_CMS_ENCRYPTEDDATA_DECRYPT 121
+#define CMS_F_CMS_ENCRYPTEDDATA_ENCRYPT 122
+#define CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY 123
+#define CMS_F_CMS_ENVELOPEDDATA_CREATE 124
+#define CMS_F_CMS_ENVELOPEDDATA_INIT_BIO 125
+#define CMS_F_CMS_ENVELOPED_DATA_INIT 126
+#define CMS_F_CMS_FINAL 127
+#define CMS_F_CMS_GET0_CERTIFICATE_CHOICES 128
+#define CMS_F_CMS_GET0_CONTENT 129
+#define CMS_F_CMS_GET0_ECONTENT_TYPE 130
+#define CMS_F_CMS_GET0_ENVELOPED 131
+#define CMS_F_CMS_GET0_REVOCATION_CHOICES 132
+#define CMS_F_CMS_GET0_SIGNED 133
+#define CMS_F_CMS_MSGSIGDIGEST_ADD1 162
+#define CMS_F_CMS_RECEIPTREQUEST_CREATE0 159
+#define CMS_F_CMS_RECEIPT_VERIFY 160
+#define CMS_F_CMS_RECIPIENTINFO_DECRYPT 134
+#define CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT 135
+#define CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT 136
+#define CMS_F_CMS_RECIPIENTINFO_KEKRI_GET0_ID 137
+#define CMS_F_CMS_RECIPIENTINFO_KEKRI_ID_CMP 138
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_CERT_CMP 139
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT 140
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142
+#define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143
+#define CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT 167
+#define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144
+#define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD 168
+#define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145
+#define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146
+#define CMS_F_CMS_SET_DETACHED 147
+#define CMS_F_CMS_SIGN 148
+#define CMS_F_CMS_SIGNED_DATA_INIT 149
+#define CMS_F_CMS_SIGNERINFO_CONTENT_SIGN 150
+#define CMS_F_CMS_SIGNERINFO_SIGN 151
+#define CMS_F_CMS_SIGNERINFO_VERIFY 152
+#define CMS_F_CMS_SIGNERINFO_VERIFY_CERT 153
+#define CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT 154
+#define CMS_F_CMS_SIGN_RECEIPT 163
+#define CMS_F_CMS_STREAM 155
+#define CMS_F_CMS_UNCOMPRESS 156
+#define CMS_F_CMS_VERIFY 157
+
+/* Reason codes. */
+#define CMS_R_ADD_SIGNER_ERROR 99
+#define CMS_R_CERTIFICATE_ALREADY_PRESENT 175
+#define CMS_R_CERTIFICATE_HAS_NO_KEYID 160
+#define CMS_R_CERTIFICATE_VERIFY_ERROR 100
+#define CMS_R_CIPHER_INITIALISATION_ERROR 101
+#define CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR 102
+#define CMS_R_CMS_DATAFINAL_ERROR 103
+#define CMS_R_CMS_LIB 104
+#define CMS_R_CONTENTIDENTIFIER_MISMATCH 170
+#define CMS_R_CONTENT_NOT_FOUND 105
+#define CMS_R_CONTENT_TYPE_MISMATCH 171
+#define CMS_R_CONTENT_TYPE_NOT_COMPRESSED_DATA 106
+#define CMS_R_CONTENT_TYPE_NOT_ENVELOPED_DATA 107
+#define CMS_R_CONTENT_TYPE_NOT_SIGNED_DATA 108
+#define CMS_R_CONTENT_VERIFY_ERROR 109
+#define CMS_R_CTRL_ERROR 110
+#define CMS_R_CTRL_FAILURE 111
+#define CMS_R_DECRYPT_ERROR 112
+#define CMS_R_DIGEST_ERROR 161
+#define CMS_R_ERROR_GETTING_PUBLIC_KEY 113
+#define CMS_R_ERROR_READING_MESSAGEDIGEST_ATTRIBUTE 114
+#define CMS_R_ERROR_SETTING_KEY 115
+#define CMS_R_ERROR_SETTING_RECIPIENTINFO 116
+#define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117
+#define CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER 176
+#define CMS_R_INVALID_KEY_LENGTH 118
+#define CMS_R_MD_BIO_INIT_ERROR 119
+#define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120
+#define CMS_R_MESSAGEDIGEST_WRONG_LENGTH 121
+#define CMS_R_MSGSIGDIGEST_ERROR 172
+#define CMS_R_MSGSIGDIGEST_VERIFICATION_FAILURE 162
+#define CMS_R_MSGSIGDIGEST_WRONG_LENGTH 163
+#define CMS_R_NEED_ONE_SIGNER 164
+#define CMS_R_NOT_A_SIGNED_RECEIPT 165
+#define CMS_R_NOT_ENCRYPTED_DATA 122
+#define CMS_R_NOT_KEK 123
+#define CMS_R_NOT_KEY_TRANSPORT 124
+#define CMS_R_NOT_PWRI 177
+#define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125
+#define CMS_R_NO_CIPHER 126
+#define CMS_R_NO_CONTENT 127
+#define CMS_R_NO_CONTENT_TYPE 173
+#define CMS_R_NO_DEFAULT_DIGEST 128
+#define CMS_R_NO_DIGEST_SET 129
+#define CMS_R_NO_KEY 130
+#define CMS_R_NO_KEY_OR_CERT 174
+#define CMS_R_NO_MATCHING_DIGEST 131
+#define CMS_R_NO_MATCHING_RECIPIENT 132
+#define CMS_R_NO_MATCHING_SIGNATURE 166
+#define CMS_R_NO_MSGSIGDIGEST 167
+#define CMS_R_NO_PASSWORD 178
+#define CMS_R_NO_PRIVATE_KEY 133
+#define CMS_R_NO_PUBLIC_KEY 134
+#define CMS_R_NO_RECEIPT_REQUEST 168
+#define CMS_R_NO_SIGNERS 135
+#define CMS_R_PRIVATE_KEY_DOES_NOT_MATCH_CERTIFICATE 136
+#define CMS_R_RECEIPT_DECODE_ERROR 169
+#define CMS_R_RECIPIENT_ERROR 137
+#define CMS_R_SIGNER_CERTIFICATE_NOT_FOUND 138
+#define CMS_R_SIGNFINAL_ERROR 139
+#define CMS_R_SMIME_TEXT_ERROR 140
+#define CMS_R_STORE_INIT_ERROR 141
+#define CMS_R_TYPE_NOT_COMPRESSED_DATA 142
+#define CMS_R_TYPE_NOT_DATA 143
+#define CMS_R_TYPE_NOT_DIGESTED_DATA 144
+#define CMS_R_TYPE_NOT_ENCRYPTED_DATA 145
+#define CMS_R_TYPE_NOT_ENVELOPED_DATA 146
+#define CMS_R_UNABLE_TO_FINALIZE_CONTEXT 147
+#define CMS_R_UNKNOWN_CIPHER 148
+#define CMS_R_UNKNOWN_DIGEST_ALGORIHM 149
+#define CMS_R_UNKNOWN_ID 150
+#define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151
+#define CMS_R_UNSUPPORTED_CONTENT_TYPE 152
+#define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153
+#define CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM 179
+#define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154
+#define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155
+#define CMS_R_UNSUPPORTED_TYPE 156
+#define CMS_R_UNWRAP_ERROR 157
+#define CMS_R_UNWRAP_FAILURE 180
+#define CMS_R_VERIFICATION_FAILURE 158
+#define CMS_R_WRAP_ERROR 159
+
+#ifdef __cplusplus
+}
+#endif
+#endif
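
The declarations above follow the PKCS#7 pattern; a sketch of a detached
signing round trip, assuming signer_cert, signer_key, in and out are already
set up by the caller and with error handling trimmed:

    CMS_ContentInfo *cms = CMS_sign(signer_cert, signer_key, NULL, in,
                                    CMS_DETACHED | CMS_STREAM);
    if (cms && SMIME_write_CMS(out, cms, in, CMS_DETACHED | CMS_STREAM))
        {
        /* the verifier later calls SMIME_read_CMS() and then
         * CMS_verify(cms, NULL, store, dcont, outbio, 0); */
        }
    CMS_ContentInfo_free(cms);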
diff --git a/main/openssl/crypto/cms/cms_asn1.c b/main/openssl/crypto/cms/cms_asn1.c
new file mode 100644
index 00000000..cfe67fb6
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_asn1.c
@@ -0,0 +1,389 @@
+/* crypto/cms/cms_asn1.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include "cms.h"
+#include "cms_lcl.h"
+
+
+ASN1_SEQUENCE(CMS_IssuerAndSerialNumber) = {
+ ASN1_SIMPLE(CMS_IssuerAndSerialNumber, issuer, X509_NAME),
+ ASN1_SIMPLE(CMS_IssuerAndSerialNumber, serialNumber, ASN1_INTEGER)
+} ASN1_SEQUENCE_END(CMS_IssuerAndSerialNumber)
+
+ASN1_SEQUENCE(CMS_OtherCertificateFormat) = {
+ ASN1_SIMPLE(CMS_OtherCertificateFormat, otherCertFormat, ASN1_OBJECT),
+ ASN1_OPT(CMS_OtherCertificateFormat, otherCert, ASN1_ANY)
+} ASN1_SEQUENCE_END(CMS_OtherCertificateFormat)
+
+ASN1_CHOICE(CMS_CertificateChoices) = {
+ ASN1_SIMPLE(CMS_CertificateChoices, d.certificate, X509),
+ ASN1_IMP(CMS_CertificateChoices, d.extendedCertificate, ASN1_SEQUENCE, 0),
+ ASN1_IMP(CMS_CertificateChoices, d.v1AttrCert, ASN1_SEQUENCE, 1),
+ ASN1_IMP(CMS_CertificateChoices, d.v2AttrCert, ASN1_SEQUENCE, 2),
+ ASN1_IMP(CMS_CertificateChoices, d.other, CMS_OtherCertificateFormat, 3)
+} ASN1_CHOICE_END(CMS_CertificateChoices)
+
+ASN1_CHOICE(CMS_SignerIdentifier) = {
+ ASN1_SIMPLE(CMS_SignerIdentifier, d.issuerAndSerialNumber, CMS_IssuerAndSerialNumber),
+ ASN1_IMP(CMS_SignerIdentifier, d.subjectKeyIdentifier, ASN1_OCTET_STRING, 0)
+} ASN1_CHOICE_END(CMS_SignerIdentifier)
+
+ASN1_NDEF_SEQUENCE(CMS_EncapsulatedContentInfo) = {
+ ASN1_SIMPLE(CMS_EncapsulatedContentInfo, eContentType, ASN1_OBJECT),
+ ASN1_NDEF_EXP_OPT(CMS_EncapsulatedContentInfo, eContent, ASN1_OCTET_STRING_NDEF, 0)
+} ASN1_NDEF_SEQUENCE_END(CMS_EncapsulatedContentInfo)
+
+/* Minor tweak to operation: free up signer key, cert */
+static int cms_si_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+ {
+ if(operation == ASN1_OP_FREE_POST)
+ {
+ CMS_SignerInfo *si = (CMS_SignerInfo *)*pval;
+ if (si->pkey)
+ EVP_PKEY_free(si->pkey);
+ if (si->signer)
+ X509_free(si->signer);
+ }
+ return 1;
+ }
+
+ASN1_SEQUENCE_cb(CMS_SignerInfo, cms_si_cb) = {
+ ASN1_SIMPLE(CMS_SignerInfo, version, LONG),
+ ASN1_SIMPLE(CMS_SignerInfo, sid, CMS_SignerIdentifier),
+ ASN1_SIMPLE(CMS_SignerInfo, digestAlgorithm, X509_ALGOR),
+ ASN1_IMP_SET_OF_OPT(CMS_SignerInfo, signedAttrs, X509_ATTRIBUTE, 0),
+ ASN1_SIMPLE(CMS_SignerInfo, signatureAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_SignerInfo, signature, ASN1_OCTET_STRING),
+ ASN1_IMP_SET_OF_OPT(CMS_SignerInfo, unsignedAttrs, X509_ATTRIBUTE, 1)
+} ASN1_SEQUENCE_END_cb(CMS_SignerInfo, CMS_SignerInfo)
+
+ASN1_SEQUENCE(CMS_OtherRevocationInfoFormat) = {
+ ASN1_SIMPLE(CMS_OtherRevocationInfoFormat, otherRevInfoFormat, ASN1_OBJECT),
+ ASN1_OPT(CMS_OtherRevocationInfoFormat, otherRevInfo, ASN1_ANY)
+} ASN1_SEQUENCE_END(CMS_OtherRevocationInfoFormat)
+
+ASN1_CHOICE(CMS_RevocationInfoChoice) = {
+ ASN1_SIMPLE(CMS_RevocationInfoChoice, d.crl, X509_CRL),
+ ASN1_IMP(CMS_RevocationInfoChoice, d.other, CMS_OtherRevocationInfoFormat, 1)
+} ASN1_CHOICE_END(CMS_RevocationInfoChoice)
+
+ASN1_NDEF_SEQUENCE(CMS_SignedData) = {
+ ASN1_SIMPLE(CMS_SignedData, version, LONG),
+ ASN1_SET_OF(CMS_SignedData, digestAlgorithms, X509_ALGOR),
+ ASN1_SIMPLE(CMS_SignedData, encapContentInfo, CMS_EncapsulatedContentInfo),
+ ASN1_IMP_SET_OF_OPT(CMS_SignedData, certificates, CMS_CertificateChoices, 0),
+ ASN1_IMP_SET_OF_OPT(CMS_SignedData, crls, CMS_RevocationInfoChoice, 1),
+ ASN1_SET_OF(CMS_SignedData, signerInfos, CMS_SignerInfo)
+} ASN1_NDEF_SEQUENCE_END(CMS_SignedData)
+
+ASN1_SEQUENCE(CMS_OriginatorInfo) = {
+ ASN1_IMP_SET_OF_OPT(CMS_OriginatorInfo, certificates, CMS_CertificateChoices, 0),
+ ASN1_IMP_SET_OF_OPT(CMS_OriginatorInfo, crls, CMS_RevocationInfoChoice, 1)
+} ASN1_SEQUENCE_END(CMS_OriginatorInfo)
+
+ASN1_NDEF_SEQUENCE(CMS_EncryptedContentInfo) = {
+ ASN1_SIMPLE(CMS_EncryptedContentInfo, contentType, ASN1_OBJECT),
+ ASN1_SIMPLE(CMS_EncryptedContentInfo, contentEncryptionAlgorithm, X509_ALGOR),
+ ASN1_IMP_OPT(CMS_EncryptedContentInfo, encryptedContent, ASN1_OCTET_STRING_NDEF, 0)
+} ASN1_NDEF_SEQUENCE_END(CMS_EncryptedContentInfo)
+
+ASN1_SEQUENCE(CMS_KeyTransRecipientInfo) = {
+ ASN1_SIMPLE(CMS_KeyTransRecipientInfo, version, LONG),
+ ASN1_SIMPLE(CMS_KeyTransRecipientInfo, rid, CMS_SignerIdentifier),
+ ASN1_SIMPLE(CMS_KeyTransRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_KeyTransRecipientInfo, encryptedKey, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_KeyTransRecipientInfo)
+
+ASN1_SEQUENCE(CMS_OtherKeyAttribute) = {
+ ASN1_SIMPLE(CMS_OtherKeyAttribute, keyAttrId, ASN1_OBJECT),
+ ASN1_OPT(CMS_OtherKeyAttribute, keyAttr, ASN1_ANY)
+} ASN1_SEQUENCE_END(CMS_OtherKeyAttribute)
+
+ASN1_SEQUENCE(CMS_RecipientKeyIdentifier) = {
+ ASN1_SIMPLE(CMS_RecipientKeyIdentifier, subjectKeyIdentifier, ASN1_OCTET_STRING),
+ ASN1_OPT(CMS_RecipientKeyIdentifier, date, ASN1_GENERALIZEDTIME),
+ ASN1_OPT(CMS_RecipientKeyIdentifier, other, CMS_OtherKeyAttribute)
+} ASN1_SEQUENCE_END(CMS_RecipientKeyIdentifier)
+
+ASN1_CHOICE(CMS_KeyAgreeRecipientIdentifier) = {
+ ASN1_SIMPLE(CMS_KeyAgreeRecipientIdentifier, d.issuerAndSerialNumber, CMS_IssuerAndSerialNumber),
+ ASN1_IMP(CMS_KeyAgreeRecipientIdentifier, d.rKeyId, CMS_RecipientKeyIdentifier, 0)
+} ASN1_CHOICE_END(CMS_KeyAgreeRecipientIdentifier)
+
+ASN1_SEQUENCE(CMS_RecipientEncryptedKey) = {
+ ASN1_SIMPLE(CMS_RecipientEncryptedKey, rid, CMS_KeyAgreeRecipientIdentifier),
+ ASN1_SIMPLE(CMS_RecipientEncryptedKey, encryptedKey, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_RecipientEncryptedKey)
+
+ASN1_SEQUENCE(CMS_OriginatorPublicKey) = {
+ ASN1_SIMPLE(CMS_OriginatorPublicKey, algorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_OriginatorPublicKey, publicKey, ASN1_BIT_STRING)
+} ASN1_SEQUENCE_END(CMS_OriginatorPublicKey)
+
+ASN1_CHOICE(CMS_OriginatorIdentifierOrKey) = {
+ ASN1_SIMPLE(CMS_OriginatorIdentifierOrKey, d.issuerAndSerialNumber, CMS_IssuerAndSerialNumber),
+ ASN1_IMP(CMS_OriginatorIdentifierOrKey, d.subjectKeyIdentifier, ASN1_OCTET_STRING, 0),
+ ASN1_IMP(CMS_OriginatorIdentifierOrKey, d.originatorKey, CMS_OriginatorPublicKey, 1)
+} ASN1_CHOICE_END(CMS_OriginatorIdentifierOrKey)
+
+ASN1_SEQUENCE(CMS_KeyAgreeRecipientInfo) = {
+ ASN1_SIMPLE(CMS_KeyAgreeRecipientInfo, version, LONG),
+ ASN1_EXP(CMS_KeyAgreeRecipientInfo, originator, CMS_OriginatorIdentifierOrKey, 0),
+ ASN1_EXP_OPT(CMS_KeyAgreeRecipientInfo, ukm, ASN1_OCTET_STRING, 1),
+ ASN1_SIMPLE(CMS_KeyAgreeRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
+ ASN1_SEQUENCE_OF(CMS_KeyAgreeRecipientInfo, recipientEncryptedKeys, CMS_RecipientEncryptedKey)
+} ASN1_SEQUENCE_END(CMS_KeyAgreeRecipientInfo)
+
+ASN1_SEQUENCE(CMS_KEKIdentifier) = {
+ ASN1_SIMPLE(CMS_KEKIdentifier, keyIdentifier, ASN1_OCTET_STRING),
+ ASN1_OPT(CMS_KEKIdentifier, date, ASN1_GENERALIZEDTIME),
+ ASN1_OPT(CMS_KEKIdentifier, other, CMS_OtherKeyAttribute)
+} ASN1_SEQUENCE_END(CMS_KEKIdentifier)
+
+ASN1_SEQUENCE(CMS_KEKRecipientInfo) = {
+ ASN1_SIMPLE(CMS_KEKRecipientInfo, version, LONG),
+ ASN1_SIMPLE(CMS_KEKRecipientInfo, kekid, CMS_KEKIdentifier),
+ ASN1_SIMPLE(CMS_KEKRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_KEKRecipientInfo, encryptedKey, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_KEKRecipientInfo)
+
+ASN1_SEQUENCE(CMS_PasswordRecipientInfo) = {
+ ASN1_SIMPLE(CMS_PasswordRecipientInfo, version, LONG),
+ ASN1_IMP_OPT(CMS_PasswordRecipientInfo, keyDerivationAlgorithm, X509_ALGOR, 0),
+ ASN1_SIMPLE(CMS_PasswordRecipientInfo, keyEncryptionAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_PasswordRecipientInfo, encryptedKey, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_PasswordRecipientInfo)
+
+ASN1_SEQUENCE(CMS_OtherRecipientInfo) = {
+ ASN1_SIMPLE(CMS_OtherRecipientInfo, oriType, ASN1_OBJECT),
+ ASN1_OPT(CMS_OtherRecipientInfo, oriValue, ASN1_ANY)
+} ASN1_SEQUENCE_END(CMS_OtherRecipientInfo)
+
+/* Free up RecipientInfo additional data */
+static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+ {
+ if(operation == ASN1_OP_FREE_PRE)
+ {
+ CMS_RecipientInfo *ri = (CMS_RecipientInfo *)*pval;
+ if (ri->type == CMS_RECIPINFO_TRANS)
+ {
+ CMS_KeyTransRecipientInfo *ktri = ri->d.ktri;
+ if (ktri->pkey)
+ EVP_PKEY_free(ktri->pkey);
+ if (ktri->recip)
+ X509_free(ktri->recip);
+ }
+ else if (ri->type == CMS_RECIPINFO_KEK)
+ {
+ CMS_KEKRecipientInfo *kekri = ri->d.kekri;
+ if (kekri->key)
+ {
+ OPENSSL_cleanse(kekri->key, kekri->keylen);
+ OPENSSL_free(kekri->key);
+ }
+ }
+ else if (ri->type == CMS_RECIPINFO_PASS)
+ {
+ CMS_PasswordRecipientInfo *pwri = ri->d.pwri;
+ if (pwri->pass)
+ {
+ OPENSSL_cleanse(pwri->pass, pwri->passlen);
+ OPENSSL_free(pwri->pass);
+ }
+ }
+ }
+ return 1;
+ }
+
+ASN1_CHOICE_cb(CMS_RecipientInfo, cms_ri_cb) = {
+ ASN1_SIMPLE(CMS_RecipientInfo, d.ktri, CMS_KeyTransRecipientInfo),
+ ASN1_IMP(CMS_RecipientInfo, d.kari, CMS_KeyAgreeRecipientInfo, 1),
+ ASN1_IMP(CMS_RecipientInfo, d.kekri, CMS_KEKRecipientInfo, 2),
+ ASN1_IMP(CMS_RecipientInfo, d.pwri, CMS_PasswordRecipientInfo, 3),
+ ASN1_IMP(CMS_RecipientInfo, d.ori, CMS_OtherRecipientInfo, 4)
+} ASN1_CHOICE_END_cb(CMS_RecipientInfo, CMS_RecipientInfo, type)
+
+ASN1_NDEF_SEQUENCE(CMS_EnvelopedData) = {
+ ASN1_SIMPLE(CMS_EnvelopedData, version, LONG),
+ ASN1_IMP_OPT(CMS_EnvelopedData, originatorInfo, CMS_OriginatorInfo, 0),
+ ASN1_SET_OF(CMS_EnvelopedData, recipientInfos, CMS_RecipientInfo),
+ ASN1_SIMPLE(CMS_EnvelopedData, encryptedContentInfo, CMS_EncryptedContentInfo),
+ ASN1_IMP_SET_OF_OPT(CMS_EnvelopedData, unprotectedAttrs, X509_ATTRIBUTE, 1)
+} ASN1_NDEF_SEQUENCE_END(CMS_EnvelopedData)
+
+ASN1_NDEF_SEQUENCE(CMS_DigestedData) = {
+ ASN1_SIMPLE(CMS_DigestedData, version, LONG),
+ ASN1_SIMPLE(CMS_DigestedData, digestAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_DigestedData, encapContentInfo, CMS_EncapsulatedContentInfo),
+ ASN1_SIMPLE(CMS_DigestedData, digest, ASN1_OCTET_STRING)
+} ASN1_NDEF_SEQUENCE_END(CMS_DigestedData)
+
+ASN1_NDEF_SEQUENCE(CMS_EncryptedData) = {
+ ASN1_SIMPLE(CMS_EncryptedData, version, LONG),
+ ASN1_SIMPLE(CMS_EncryptedData, encryptedContentInfo, CMS_EncryptedContentInfo),
+ ASN1_IMP_SET_OF_OPT(CMS_EncryptedData, unprotectedAttrs, X509_ATTRIBUTE, 1)
+} ASN1_NDEF_SEQUENCE_END(CMS_EncryptedData)
+
+ASN1_NDEF_SEQUENCE(CMS_AuthenticatedData) = {
+ ASN1_SIMPLE(CMS_AuthenticatedData, version, LONG),
+ ASN1_IMP_OPT(CMS_AuthenticatedData, originatorInfo, CMS_OriginatorInfo, 0),
+ ASN1_SET_OF(CMS_AuthenticatedData, recipientInfos, CMS_RecipientInfo),
+ ASN1_SIMPLE(CMS_AuthenticatedData, macAlgorithm, X509_ALGOR),
+ ASN1_IMP(CMS_AuthenticatedData, digestAlgorithm, X509_ALGOR, 1),
+ ASN1_SIMPLE(CMS_AuthenticatedData, encapContentInfo, CMS_EncapsulatedContentInfo),
+ ASN1_IMP_SET_OF_OPT(CMS_AuthenticatedData, authAttrs, X509_ALGOR, 2),
+ ASN1_SIMPLE(CMS_AuthenticatedData, mac, ASN1_OCTET_STRING),
+ ASN1_IMP_SET_OF_OPT(CMS_AuthenticatedData, unauthAttrs, X509_ALGOR, 3)
+} ASN1_NDEF_SEQUENCE_END(CMS_AuthenticatedData)
+
+ASN1_NDEF_SEQUENCE(CMS_CompressedData) = {
+ ASN1_SIMPLE(CMS_CompressedData, version, LONG),
+ ASN1_SIMPLE(CMS_CompressedData, compressionAlgorithm, X509_ALGOR),
+ ASN1_SIMPLE(CMS_CompressedData, encapContentInfo, CMS_EncapsulatedContentInfo),
+} ASN1_NDEF_SEQUENCE_END(CMS_CompressedData)
+
+/* This is the ANY DEFINED BY table for the top level ContentInfo structure */
+
+ASN1_ADB_TEMPLATE(cms_default) = ASN1_EXP(CMS_ContentInfo, d.other, ASN1_ANY, 0);
+
+ASN1_ADB(CMS_ContentInfo) = {
+ ADB_ENTRY(NID_pkcs7_data, ASN1_NDEF_EXP(CMS_ContentInfo, d.data, ASN1_OCTET_STRING_NDEF, 0)),
+ ADB_ENTRY(NID_pkcs7_signed, ASN1_NDEF_EXP(CMS_ContentInfo, d.signedData, CMS_SignedData, 0)),
+ ADB_ENTRY(NID_pkcs7_enveloped, ASN1_NDEF_EXP(CMS_ContentInfo, d.envelopedData, CMS_EnvelopedData, 0)),
+ ADB_ENTRY(NID_pkcs7_digest, ASN1_NDEF_EXP(CMS_ContentInfo, d.digestedData, CMS_DigestedData, 0)),
+ ADB_ENTRY(NID_pkcs7_encrypted, ASN1_NDEF_EXP(CMS_ContentInfo, d.encryptedData, CMS_EncryptedData, 0)),
+ ADB_ENTRY(NID_id_smime_ct_authData, ASN1_NDEF_EXP(CMS_ContentInfo, d.authenticatedData, CMS_AuthenticatedData, 0)),
+ ADB_ENTRY(NID_id_smime_ct_compressedData, ASN1_NDEF_EXP(CMS_ContentInfo, d.compressedData, CMS_CompressedData, 0)),
+} ASN1_ADB_END(CMS_ContentInfo, 0, contentType, 0, &cms_default_tt, NULL);
+
+/* CMS streaming support */
+static int cms_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
+ void *exarg)
+ {
+ ASN1_STREAM_ARG *sarg = exarg;
+ CMS_ContentInfo *cms = NULL;
+ if (pval)
+ cms = (CMS_ContentInfo *)*pval;
+ else
+ return 1;
+ switch(operation)
+ {
+
+ case ASN1_OP_STREAM_PRE:
+ if (CMS_stream(&sarg->boundary, cms) <= 0)
+ return 0;
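+		/* Fall through: streaming also needs the content BIO
+		 * set up below.
+		 */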
+ case ASN1_OP_DETACHED_PRE:
+ sarg->ndef_bio = CMS_dataInit(cms, sarg->out);
+ if (!sarg->ndef_bio)
+ return 0;
+ break;
+
+ case ASN1_OP_STREAM_POST:
+ case ASN1_OP_DETACHED_POST:
+ if (CMS_dataFinal(cms, sarg->ndef_bio) <= 0)
+ return 0;
+ break;
+
+ }
+ return 1;
+ }
+
+ASN1_NDEF_SEQUENCE_cb(CMS_ContentInfo, cms_cb) = {
+ ASN1_SIMPLE(CMS_ContentInfo, contentType, ASN1_OBJECT),
+ ASN1_ADB_OBJECT(CMS_ContentInfo)
+} ASN1_NDEF_SEQUENCE_END_cb(CMS_ContentInfo, CMS_ContentInfo)
+
+/* Specials for signed attributes */
+
+/* When signing attributes we want to reorder them to match the sorted
+ * encoding.
+ */
+
+ASN1_ITEM_TEMPLATE(CMS_Attributes_Sign) =
+ ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SET_ORDER, 0, CMS_ATTRIBUTES, X509_ATTRIBUTE)
+ASN1_ITEM_TEMPLATE_END(CMS_Attributes_Sign)
+
+/* When verifying attributes we need to use the received order, so
+ * we use SEQUENCE OF and tag it as SET OF.
+ */
+
+ASN1_ITEM_TEMPLATE(CMS_Attributes_Verify) =
+ ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SEQUENCE_OF | ASN1_TFLG_IMPTAG | ASN1_TFLG_UNIVERSAL,
+ V_ASN1_SET, CMS_ATTRIBUTES, X509_ATTRIBUTE)
+ASN1_ITEM_TEMPLATE_END(CMS_Attributes_Verify)
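+
+/* Editor's sketch, not part of the imported sources: these two
+ * templates are what the signing and verification code (cms_sd.c)
+ * passes to ASN1_item_i2d() when encoding signedAttrs.  Signing must
+ * emit the canonical sorted SET OF; verification must reproduce the
+ * attribute bytes exactly as received.  The helper below is
+ * illustrative only and error handling is elided.
+ */
+static int sketch_encode_signed_attrs(CMS_SignerInfo *si, int verify,
+					unsigned char **pbuf)
+	{
+	const ASN1_ITEM *it = verify ? ASN1_ITEM_rptr(CMS_Attributes_Verify)
+				     : ASN1_ITEM_rptr(CMS_Attributes_Sign);
+	/* i2d of the attribute stack via the selected template */
+	return ASN1_item_i2d((ASN1_VALUE *)si->signedAttrs, pbuf, it);
+	}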
+
+
+
+ASN1_CHOICE(CMS_ReceiptsFrom) = {
+ ASN1_IMP(CMS_ReceiptsFrom, d.allOrFirstTier, LONG, 0),
+ ASN1_IMP_SEQUENCE_OF(CMS_ReceiptsFrom, d.receiptList, GENERAL_NAMES, 1)
+} ASN1_CHOICE_END(CMS_ReceiptsFrom)
+
+ASN1_SEQUENCE(CMS_ReceiptRequest) = {
+ ASN1_SIMPLE(CMS_ReceiptRequest, signedContentIdentifier, ASN1_OCTET_STRING),
+ ASN1_SIMPLE(CMS_ReceiptRequest, receiptsFrom, CMS_ReceiptsFrom),
+ ASN1_SEQUENCE_OF(CMS_ReceiptRequest, receiptsTo, GENERAL_NAMES)
+} ASN1_SEQUENCE_END(CMS_ReceiptRequest)
+
+ASN1_SEQUENCE(CMS_Receipt) = {
+ ASN1_SIMPLE(CMS_Receipt, version, LONG),
+ ASN1_SIMPLE(CMS_Receipt, contentType, ASN1_OBJECT),
+ ASN1_SIMPLE(CMS_Receipt, signedContentIdentifier, ASN1_OCTET_STRING),
+ ASN1_SIMPLE(CMS_Receipt, originatorSignatureValue, ASN1_OCTET_STRING)
+} ASN1_SEQUENCE_END(CMS_Receipt)
+
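+/* Editor's sketch, not part of the imported sources: the cms_cb()
+ * streaming callback above is what lets one-pass writers such as
+ * i2d_CMS_bio_stream() work.  "in" and "out" are assumed caller
+ * BIOs and "cms" a structure created with the CMS_STREAM flag;
+ * error handling is elided.
+ */
+static int sketch_stream_out(BIO *out, CMS_ContentInfo *cms, BIO *in)
+	{
+	/* Triggers ASN1_OP_STREAM_PRE/POST through cms_cb() */
+	return i2d_CMS_bio_stream(out, cms, in, CMS_STREAM);
+	}
+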
diff --git a/main/openssl/crypto/cms/cms_att.c b/main/openssl/crypto/cms/cms_att.c
new file mode 100644
index 00000000..5b71722e
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_att.c
@@ -0,0 +1,195 @@
+/* crypto/cms/cms_att.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include "cms.h"
+#include "cms_lcl.h"
+
+/* CMS SignedData Attribute utilities */
+
+int CMS_signed_get_attr_count(const CMS_SignerInfo *si)
+{
+ return X509at_get_attr_count(si->signedAttrs);
+}
+
+int CMS_signed_get_attr_by_NID(const CMS_SignerInfo *si, int nid,
+ int lastpos)
+{
+ return X509at_get_attr_by_NID(si->signedAttrs, nid, lastpos);
+}
+
+int CMS_signed_get_attr_by_OBJ(const CMS_SignerInfo *si, ASN1_OBJECT *obj,
+ int lastpos)
+{
+ return X509at_get_attr_by_OBJ(si->signedAttrs, obj, lastpos);
+}
+
+X509_ATTRIBUTE *CMS_signed_get_attr(const CMS_SignerInfo *si, int loc)
+{
+ return X509at_get_attr(si->signedAttrs, loc);
+}
+
+X509_ATTRIBUTE *CMS_signed_delete_attr(CMS_SignerInfo *si, int loc)
+{
+ return X509at_delete_attr(si->signedAttrs, loc);
+}
+
+int CMS_signed_add1_attr(CMS_SignerInfo *si, X509_ATTRIBUTE *attr)
+{
+ if(X509at_add1_attr(&si->signedAttrs, attr)) return 1;
+ return 0;
+}
+
+int CMS_signed_add1_attr_by_OBJ(CMS_SignerInfo *si,
+ const ASN1_OBJECT *obj, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_OBJ(&si->signedAttrs, obj,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+int CMS_signed_add1_attr_by_NID(CMS_SignerInfo *si,
+ int nid, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_NID(&si->signedAttrs, nid,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+int CMS_signed_add1_attr_by_txt(CMS_SignerInfo *si,
+ const char *attrname, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_txt(&si->signedAttrs, attrname,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+void *CMS_signed_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
+ int lastpos, int type)
+{
+ return X509at_get0_data_by_OBJ(si->signedAttrs, oid, lastpos, type);
+}
+
+int CMS_unsigned_get_attr_count(const CMS_SignerInfo *si)
+{
+ return X509at_get_attr_count(si->unsignedAttrs);
+}
+
+int CMS_unsigned_get_attr_by_NID(const CMS_SignerInfo *si, int nid,
+ int lastpos)
+{
+ return X509at_get_attr_by_NID(si->unsignedAttrs, nid, lastpos);
+}
+
+int CMS_unsigned_get_attr_by_OBJ(const CMS_SignerInfo *si, ASN1_OBJECT *obj,
+ int lastpos)
+{
+ return X509at_get_attr_by_OBJ(si->unsignedAttrs, obj, lastpos);
+}
+
+X509_ATTRIBUTE *CMS_unsigned_get_attr(const CMS_SignerInfo *si, int loc)
+{
+ return X509at_get_attr(si->unsignedAttrs, loc);
+}
+
+X509_ATTRIBUTE *CMS_unsigned_delete_attr(CMS_SignerInfo *si, int loc)
+{
+ return X509at_delete_attr(si->unsignedAttrs, loc);
+}
+
+int CMS_unsigned_add1_attr(CMS_SignerInfo *si, X509_ATTRIBUTE *attr)
+{
+ if(X509at_add1_attr(&si->unsignedAttrs, attr)) return 1;
+ return 0;
+}
+
+int CMS_unsigned_add1_attr_by_OBJ(CMS_SignerInfo *si,
+ const ASN1_OBJECT *obj, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_OBJ(&si->unsignedAttrs, obj,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+int CMS_unsigned_add1_attr_by_NID(CMS_SignerInfo *si,
+ int nid, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_NID(&si->unsignedAttrs, nid,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+int CMS_unsigned_add1_attr_by_txt(CMS_SignerInfo *si,
+ const char *attrname, int type,
+ const void *bytes, int len)
+{
+ if(X509at_add1_attr_by_txt(&si->unsignedAttrs, attrname,
+ type, bytes, len)) return 1;
+ return 0;
+}
+
+void *CMS_unsigned_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
+ int lastpos, int type)
+{
+ return X509at_get0_data_by_OBJ(si->unsignedAttrs, oid, lastpos, type);
+}
+
+/* Specific attribute cases */
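+
+/* Editor's sketch, not part of the imported sources: typical use of
+ * the wrappers above, mirroring how a signing-time attribute is
+ * attached; "si" is an assumed CMS_SignerInfo and error handling is
+ * elided.
+ */
+static int sketch_add_signing_time(CMS_SignerInfo *si)
+	{
+	ASN1_TIME *t = X509_gmtime_adj(NULL, 0);
+	int r = 0;
+	if (t)
+		{
+		r = CMS_signed_add1_attr_by_NID(si, NID_pkcs9_signingTime,
+						t->type, t, -1);
+		ASN1_TIME_free(t);
+		}
+	return r;
+	}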
diff --git a/main/openssl/crypto/cms/cms_cd.c b/main/openssl/crypto/cms/cms_cd.c
new file mode 100644
index 00000000..20216881
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_cd.c
@@ -0,0 +1,136 @@
+/* crypto/cms/cms_cd.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/bio.h>
+#ifndef OPENSSL_NO_COMP
+#include <openssl/comp.h>
+#endif
+#include "cms_lcl.h"
+
+DECLARE_ASN1_ITEM(CMS_CompressedData)
+
+#ifdef ZLIB
+
+/* CMS CompressedData Utilities */
+
+CMS_ContentInfo *cms_CompressedData_create(int comp_nid)
+ {
+ CMS_ContentInfo *cms;
+ CMS_CompressedData *cd;
+	/* This will need something cleverer if there is ever more than
+	 * one compression algorithm, or if the parameters acquire some
+	 * meaning...
+	 */
+ if (comp_nid != NID_zlib_compression)
+ {
+ CMSerr(CMS_F_CMS_COMPRESSEDDATA_CREATE,
+ CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM);
+ return NULL;
+ }
+ cms = CMS_ContentInfo_new();
+ if (!cms)
+ return NULL;
+
+ cd = M_ASN1_new_of(CMS_CompressedData);
+
+ if (!cd)
+ goto err;
+
+ cms->contentType = OBJ_nid2obj(NID_id_smime_ct_compressedData);
+ cms->d.compressedData = cd;
+
+ cd->version = 0;
+
+ X509_ALGOR_set0(cd->compressionAlgorithm,
+ OBJ_nid2obj(NID_zlib_compression),
+ V_ASN1_UNDEF, NULL);
+
+ cd->encapContentInfo->eContentType = OBJ_nid2obj(NID_pkcs7_data);
+
+ return cms;
+
+ err:
+
+ if (cms)
+ CMS_ContentInfo_free(cms);
+
+ return NULL;
+ }
+
+BIO *cms_CompressedData_init_bio(CMS_ContentInfo *cms)
+ {
+ CMS_CompressedData *cd;
+ ASN1_OBJECT *compoid;
+ if (OBJ_obj2nid(cms->contentType) != NID_id_smime_ct_compressedData)
+ {
+ CMSerr(CMS_F_CMS_COMPRESSEDDATA_INIT_BIO,
+ CMS_R_CONTENT_TYPE_NOT_COMPRESSED_DATA);
+ return NULL;
+ }
+ cd = cms->d.compressedData;
+ X509_ALGOR_get0(&compoid, NULL, NULL, cd->compressionAlgorithm);
+ if (OBJ_obj2nid(compoid) != NID_zlib_compression)
+ {
+ CMSerr(CMS_F_CMS_COMPRESSEDDATA_INIT_BIO,
+ CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM);
+ return NULL;
+ }
+ return BIO_new(BIO_f_zlib());
+ }
+
+#endif
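+
+/* Editor's sketch, not part of the imported sources: the public
+ * entry point built on the helpers above (it needs a ZLIB-enabled
+ * build to succeed); "in" is an assumed data BIO and error handling
+ * is elided.
+ */
+static CMS_ContentInfo *sketch_compress(BIO *in)
+	{
+	/* NID_zlib_compression is the only algorithm accepted above */
+	return CMS_compress(in, NID_zlib_compression, CMS_BINARY);
+	}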
diff --git a/main/openssl/crypto/cms/cms_dd.c b/main/openssl/crypto/cms/cms_dd.c
new file mode 100644
index 00000000..8919c15b
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_dd.c
@@ -0,0 +1,148 @@
+/* crypto/cms/cms_dd.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include "cms_lcl.h"
+
+DECLARE_ASN1_ITEM(CMS_DigestedData)
+
+/* CMS DigestedData Utilities */
+
+CMS_ContentInfo *cms_DigestedData_create(const EVP_MD *md)
+ {
+ CMS_ContentInfo *cms;
+ CMS_DigestedData *dd;
+ cms = CMS_ContentInfo_new();
+ if (!cms)
+ return NULL;
+
+ dd = M_ASN1_new_of(CMS_DigestedData);
+
+ if (!dd)
+ goto err;
+
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_digest);
+ cms->d.digestedData = dd;
+
+ dd->version = 0;
+ dd->encapContentInfo->eContentType = OBJ_nid2obj(NID_pkcs7_data);
+
+ cms_DigestAlgorithm_set(dd->digestAlgorithm, md);
+
+ return cms;
+
+ err:
+
+ if (cms)
+ CMS_ContentInfo_free(cms);
+
+ return NULL;
+ }
+
+BIO *cms_DigestedData_init_bio(CMS_ContentInfo *cms)
+ {
+ CMS_DigestedData *dd;
+ dd = cms->d.digestedData;
+ return cms_DigestAlgorithm_init_bio(dd->digestAlgorithm);
+ }
+
+int cms_DigestedData_do_final(CMS_ContentInfo *cms, BIO *chain, int verify)
+ {
+ EVP_MD_CTX mctx;
+ unsigned char md[EVP_MAX_MD_SIZE];
+ unsigned int mdlen;
+ int r = 0;
+ CMS_DigestedData *dd;
+ EVP_MD_CTX_init(&mctx);
+
+ dd = cms->d.digestedData;
+
+ if (!cms_DigestAlgorithm_find_ctx(&mctx, chain, dd->digestAlgorithm))
+ goto err;
+
+ if (EVP_DigestFinal_ex(&mctx, md, &mdlen) <= 0)
+ goto err;
+
+ if (verify)
+ {
+ if (mdlen != (unsigned int)dd->digest->length)
+ {
+ CMSerr(CMS_F_CMS_DIGESTEDDATA_DO_FINAL,
+ CMS_R_MESSAGEDIGEST_WRONG_LENGTH);
+ goto err;
+ }
+
+ if (memcmp(md, dd->digest->data, mdlen))
+ CMSerr(CMS_F_CMS_DIGESTEDDATA_DO_FINAL,
+ CMS_R_VERIFICATION_FAILURE);
+ else
+ r = 1;
+ }
+ else
+ {
+ if (!ASN1_STRING_set(dd->digest, md, mdlen))
+ goto err;
+ r = 1;
+ }
+
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+
+ return r;
+
+ }
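+
+/* Editor's sketch, not part of the imported sources: a DigestedData
+ * round trip through the public API that drives the helpers above;
+ * "in"/"out" are assumed BIOs and error handling is elided.
+ */
+static int sketch_digest_roundtrip(BIO *in, BIO *out)
+	{
+	CMS_ContentInfo *cms = CMS_digest_create(in, EVP_sha256(),
+							CMS_BINARY);
+	int r = cms && CMS_digest_verify(cms, NULL, out, 0);
+	CMS_ContentInfo_free(cms);
+	return r;
+	}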
diff --git a/main/openssl/crypto/cms/cms_enc.c b/main/openssl/crypto/cms/cms_enc.c
new file mode 100644
index 00000000..bebeaf29
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_enc.c
@@ -0,0 +1,294 @@
+/* crypto/cms/cms_enc.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/rand.h>
+#include "cms_lcl.h"
+
+/* CMS EncryptedData Utilities */
+
+DECLARE_ASN1_ITEM(CMS_EncryptedData)
+
+/* Return BIO based on EncryptedContentInfo and key */
+
+BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec)
+ {
+ BIO *b;
+ EVP_CIPHER_CTX *ctx;
+ const EVP_CIPHER *ciph;
+ X509_ALGOR *calg = ec->contentEncryptionAlgorithm;
+ unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL;
+ unsigned char *tkey = NULL;
+ size_t tkeylen = 0;
+
+ int ok = 0;
+
+ int enc, keep_key = 0;
+
+ enc = ec->cipher ? 1 : 0;
+
+ b = BIO_new(BIO_f_cipher());
+ if (!b)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+ BIO_get_cipher_ctx(b, &ctx);
+
+ if (enc)
+ {
+ ciph = ec->cipher;
+		/* Once a key is present, clear the cipher so that
+		 * subsequent calls on this structure decrypt rather
+		 * than encrypt.
+		 */
+ if (ec->key)
+ ec->cipher = NULL;
+ }
+ else
+ {
+ ciph = EVP_get_cipherbyobj(calg->algorithm);
+
+ if (!ciph)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_UNKNOWN_CIPHER);
+ goto err;
+ }
+ }
+
+ if (EVP_CipherInit_ex(ctx, ciph, NULL, NULL, NULL, enc) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_CIPHER_INITIALISATION_ERROR);
+ goto err;
+ }
+
+ if (enc)
+ {
+ int ivlen;
+ calg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(ctx));
+ /* Generate a random IV if we need one */
+ ivlen = EVP_CIPHER_CTX_iv_length(ctx);
+ if (ivlen > 0)
+ {
+ if (RAND_pseudo_bytes(iv, ivlen) <= 0)
+ goto err;
+ piv = iv;
+ }
+ }
+ else if (EVP_CIPHER_asn1_to_param(ctx, calg->parameter) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+ goto err;
+ }
+ tkeylen = EVP_CIPHER_CTX_key_length(ctx);
+ /* Generate random session key */
+ if (!enc || !ec->key)
+ {
+ tkey = OPENSSL_malloc(tkeylen);
+ if (!tkey)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0)
+ goto err;
+ }
+
+ if (!ec->key)
+ {
+ ec->key = tkey;
+ ec->keylen = tkeylen;
+ tkey = NULL;
+ if (enc)
+ keep_key = 1;
+ else
+ ERR_clear_error();
+
+ }
+
+ if (ec->keylen != tkeylen)
+ {
+ /* If necessary set key length */
+ if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0)
+ {
+			/* Only reveal the failure when debugging, so we
+			 * don't leak information that could help an MMA
+			 * (Bleichenbacher-style million message attack).
+			 */
+ if (enc || ec->debug)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_INVALID_KEY_LENGTH);
+ goto err;
+ }
+ else
+ {
+ /* Use random key */
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ ec->key = tkey;
+ ec->keylen = tkeylen;
+ tkey = NULL;
+ ERR_clear_error();
+ }
+ }
+ }
+
+ if (EVP_CipherInit_ex(ctx, NULL, NULL, ec->key, piv, enc) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_CIPHER_INITIALISATION_ERROR);
+ goto err;
+ }
+
+ if (piv)
+ {
+ calg->parameter = ASN1_TYPE_new();
+ if (!calg->parameter)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (EVP_CIPHER_param_to_asn1(ctx, calg->parameter) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+ CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+ goto err;
+ }
+ }
+ ok = 1;
+
+ err:
+ if (ec->key && !keep_key)
+ {
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ ec->key = NULL;
+ }
+ if (tkey)
+ {
+ OPENSSL_cleanse(tkey, tkeylen);
+ OPENSSL_free(tkey);
+ }
+ if (ok)
+ return b;
+ BIO_free(b);
+ return NULL;
+ }
+
+int cms_EncryptedContent_init(CMS_EncryptedContentInfo *ec,
+ const EVP_CIPHER *cipher,
+ const unsigned char *key, size_t keylen)
+ {
+ ec->cipher = cipher;
+ if (key)
+ {
+ ec->key = OPENSSL_malloc(keylen);
+ if (!ec->key)
+ return 0;
+ memcpy(ec->key, key, keylen);
+ }
+ ec->keylen = keylen;
+ if (cipher)
+ ec->contentType = OBJ_nid2obj(NID_pkcs7_data);
+ return 1;
+ }
+
+int CMS_EncryptedData_set1_key(CMS_ContentInfo *cms, const EVP_CIPHER *ciph,
+ const unsigned char *key, size_t keylen)
+ {
+ CMS_EncryptedContentInfo *ec;
+ if (!key || !keylen)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY, CMS_R_NO_KEY);
+ return 0;
+ }
+ if (ciph)
+ {
+ cms->d.encryptedData = M_ASN1_new_of(CMS_EncryptedData);
+ if (!cms->d.encryptedData)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY,
+ ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_encrypted);
+ cms->d.encryptedData->version = 0;
+ }
+ else if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_encrypted)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY,
+ CMS_R_NOT_ENCRYPTED_DATA);
+ return 0;
+ }
+ ec = cms->d.encryptedData->encryptedContentInfo;
+ return cms_EncryptedContent_init(ec, ciph, key, keylen);
+ }
+
+BIO *cms_EncryptedData_init_bio(CMS_ContentInfo *cms)
+ {
+ CMS_EncryptedData *enc = cms->d.encryptedData;
+ if (enc->encryptedContentInfo->cipher && enc->unprotectedAttrs)
+ enc->version = 2;
+ return cms_EncryptedContent_init_bio(enc->encryptedContentInfo);
+ }
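+
+/* Editor's sketch, not part of the imported sources: a pre-shared key
+ * EncryptedData round trip through the public API built on the
+ * helpers above.  "key" must match the cipher (16 bytes here for
+ * AES-128-CBC); the BIOs are assumptions and error handling is
+ * elided.
+ */
+static int sketch_encrypteddata(BIO *in, BIO *out,
+				const unsigned char *key, size_t keylen)
+	{
+	CMS_ContentInfo *cms = CMS_EncryptedData_encrypt(in,
+				EVP_aes_128_cbc(), key, keylen, CMS_BINARY);
+	int r = cms && CMS_EncryptedData_decrypt(cms, key, keylen,
+							NULL, out, 0);
+	CMS_ContentInfo_free(cms);
+	return r;
+	}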
diff --git a/main/openssl/crypto/cms/cms_env.c b/main/openssl/crypto/cms/cms_env.c
new file mode 100644
index 00000000..be20b1c0
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_env.c
@@ -0,0 +1,876 @@
+/* crypto/cms/cms_env.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/rand.h>
+#include <openssl/aes.h>
+#include "cms_lcl.h"
+#include "asn1_locl.h"
+
+/* CMS EnvelopedData Utilities */
+
+DECLARE_ASN1_ITEM(CMS_EnvelopedData)
+DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo)
+DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo)
+DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute)
+
+DECLARE_STACK_OF(CMS_RecipientInfo)
+
+CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms)
+ {
+ if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped)
+ {
+ CMSerr(CMS_F_CMS_GET0_ENVELOPED,
+ CMS_R_CONTENT_TYPE_NOT_ENVELOPED_DATA);
+ return NULL;
+ }
+ return cms->d.envelopedData;
+ }
+
+static CMS_EnvelopedData *cms_enveloped_data_init(CMS_ContentInfo *cms)
+ {
+ if (cms->d.other == NULL)
+ {
+ cms->d.envelopedData = M_ASN1_new_of(CMS_EnvelopedData);
+ if (!cms->d.envelopedData)
+ {
+ CMSerr(CMS_F_CMS_ENVELOPED_DATA_INIT,
+ ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+ cms->d.envelopedData->version = 0;
+ cms->d.envelopedData->encryptedContentInfo->contentType =
+ OBJ_nid2obj(NID_pkcs7_data);
+ ASN1_OBJECT_free(cms->contentType);
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_enveloped);
+ return cms->d.envelopedData;
+ }
+ return cms_get0_enveloped(cms);
+ }
+
+STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms)
+ {
+ CMS_EnvelopedData *env;
+ env = cms_get0_enveloped(cms);
+ if (!env)
+ return NULL;
+ return env->recipientInfos;
+ }
+
+int CMS_RecipientInfo_type(CMS_RecipientInfo *ri)
+ {
+ return ri->type;
+ }
+
+CMS_ContentInfo *CMS_EnvelopedData_create(const EVP_CIPHER *cipher)
+ {
+ CMS_ContentInfo *cms;
+ CMS_EnvelopedData *env;
+ cms = CMS_ContentInfo_new();
+ if (!cms)
+ goto merr;
+ env = cms_enveloped_data_init(cms);
+ if (!env)
+ goto merr;
+ if (!cms_EncryptedContent_init(env->encryptedContentInfo,
+ cipher, NULL, 0))
+ goto merr;
+ return cms;
+ merr:
+ if (cms)
+ CMS_ContentInfo_free(cms);
+ CMSerr(CMS_F_CMS_ENVELOPEDDATA_CREATE, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+
+/* Key Transport Recipient Info (KTRI) routines */
+
+/* Add a recipient certificate. For now only key transport is handled;
+ * if key agreement is ever supported this will need updating.
+ */
+
+CMS_RecipientInfo *CMS_add1_recipient_cert(CMS_ContentInfo *cms,
+ X509 *recip, unsigned int flags)
+ {
+ CMS_RecipientInfo *ri = NULL;
+ CMS_KeyTransRecipientInfo *ktri;
+ CMS_EnvelopedData *env;
+ EVP_PKEY *pk = NULL;
+ int i, type;
+ env = cms_get0_enveloped(cms);
+ if (!env)
+ goto err;
+
+ /* Initialize recipient info */
+ ri = M_ASN1_new_of(CMS_RecipientInfo);
+ if (!ri)
+ goto merr;
+
+ /* Initialize and add key transport recipient info */
+
+ ri->d.ktri = M_ASN1_new_of(CMS_KeyTransRecipientInfo);
+ if (!ri->d.ktri)
+ goto merr;
+ ri->type = CMS_RECIPINFO_TRANS;
+
+ ktri = ri->d.ktri;
+
+ X509_check_purpose(recip, -1, -1);
+ pk = X509_get_pubkey(recip);
+ if (!pk)
+ {
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT,
+ CMS_R_ERROR_GETTING_PUBLIC_KEY);
+ goto err;
+ }
+ CRYPTO_add(&recip->references, 1, CRYPTO_LOCK_X509);
+ ktri->pkey = pk;
+ ktri->recip = recip;
+
+ if (flags & CMS_USE_KEYID)
+ {
+ ktri->version = 2;
+ type = CMS_RECIPINFO_KEYIDENTIFIER;
+ }
+ else
+ {
+ ktri->version = 0;
+ type = CMS_RECIPINFO_ISSUER_SERIAL;
+ }
+
+ /* Not a typo: RecipientIdentifier and SignerIdentifier are the
+ * same structure.
+ */
+
+ if (!cms_set1_SignerIdentifier(ktri->rid, recip, type))
+ goto err;
+
+ if (pk->ameth && pk->ameth->pkey_ctrl)
+ {
+ i = pk->ameth->pkey_ctrl(pk, ASN1_PKEY_CTRL_CMS_ENVELOPE,
+ 0, ri);
+ if (i == -2)
+ {
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT,
+ CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
+ goto err;
+ }
+ if (i <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT,
+ CMS_R_CTRL_FAILURE);
+ goto err;
+ }
+ }
+
+ if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri))
+ goto merr;
+
+ return ri;
+
+ merr:
+ CMSerr(CMS_F_CMS_ADD1_RECIPIENT_CERT, ERR_R_MALLOC_FAILURE);
+ err:
+ if (ri)
+ M_ASN1_free_of(ri, CMS_RecipientInfo);
+ return NULL;
+
+ }
+
+int CMS_RecipientInfo_ktri_get0_algs(CMS_RecipientInfo *ri,
+ EVP_PKEY **pk, X509 **recip,
+ X509_ALGOR **palg)
+ {
+ CMS_KeyTransRecipientInfo *ktri;
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return 0;
+ }
+
+ ktri = ri->d.ktri;
+
+ if (pk)
+ *pk = ktri->pkey;
+ if (recip)
+ *recip = ktri->recip;
+ if (palg)
+ *palg = ktri->keyEncryptionAlgorithm;
+ return 1;
+ }
+
+int CMS_RecipientInfo_ktri_get0_signer_id(CMS_RecipientInfo *ri,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno)
+ {
+ CMS_KeyTransRecipientInfo *ktri;
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return 0;
+ }
+ ktri = ri->d.ktri;
+
+ return cms_SignerIdentifier_get0_signer_id(ktri->rid,
+ keyid, issuer, sno);
+ }
+
+int CMS_RecipientInfo_ktri_cert_cmp(CMS_RecipientInfo *ri, X509 *cert)
+ {
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_CERT_CMP,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return -2;
+ }
+ return cms_SignerIdentifier_cert_cmp(ri->d.ktri->rid, cert);
+ }
+
+int CMS_RecipientInfo_set0_pkey(CMS_RecipientInfo *ri, EVP_PKEY *pkey)
+ {
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return 0;
+ }
+ ri->d.ktri->pkey = pkey;
+ return 1;
+ }
+
+/* Encrypt content key in key transport recipient info */
+
+static int cms_RecipientInfo_ktri_encrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+ {
+ CMS_KeyTransRecipientInfo *ktri;
+ CMS_EncryptedContentInfo *ec;
+ EVP_PKEY_CTX *pctx = NULL;
+ unsigned char *ek = NULL;
+ size_t eklen;
+
+ int ret = 0;
+
+ if (ri->type != CMS_RECIPINFO_TRANS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT,
+ CMS_R_NOT_KEY_TRANSPORT);
+ return 0;
+ }
+ ktri = ri->d.ktri;
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ pctx = EVP_PKEY_CTX_new(ktri->pkey, NULL);
+ if (!pctx)
+ return 0;
+
+ if (EVP_PKEY_encrypt_init(pctx) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_ENCRYPT,
+ EVP_PKEY_CTRL_CMS_ENCRYPT, 0, ri) <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT, CMS_R_CTRL_ERROR);
+ goto err;
+ }
+
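+	/* First call with a NULL buffer obtains the required output
+	 * length; the second call below does the actual encryption.
+	 */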
+ if (EVP_PKEY_encrypt(pctx, NULL, &eklen, ec->key, ec->keylen) <= 0)
+ goto err;
+
+ ek = OPENSSL_malloc(eklen);
+
+ if (ek == NULL)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (EVP_PKEY_encrypt(pctx, ek, &eklen, ec->key, ec->keylen) <= 0)
+ goto err;
+
+ ASN1_STRING_set0(ktri->encryptedKey, ek, eklen);
+ ek = NULL;
+
+ ret = 1;
+
+ err:
+ if (pctx)
+ EVP_PKEY_CTX_free(pctx);
+ if (ek)
+ OPENSSL_free(ek);
+ return ret;
+
+ }
+
+/* Decrypt content key from KTRI */
+
+static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+ {
+ CMS_KeyTransRecipientInfo *ktri = ri->d.ktri;
+ EVP_PKEY_CTX *pctx = NULL;
+ unsigned char *ek = NULL;
+ size_t eklen;
+ int ret = 0;
+ CMS_EncryptedContentInfo *ec;
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ if (ktri->pkey == NULL)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT,
+ CMS_R_NO_PRIVATE_KEY);
+ return 0;
+ }
+
+ pctx = EVP_PKEY_CTX_new(ktri->pkey, NULL);
+ if (!pctx)
+ return 0;
+
+ if (EVP_PKEY_decrypt_init(pctx) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_DECRYPT,
+ EVP_PKEY_CTRL_CMS_DECRYPT, 0, ri) <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT, CMS_R_CTRL_ERROR);
+ goto err;
+ }
+
+ if (EVP_PKEY_decrypt(pctx, NULL, &eklen,
+ ktri->encryptedKey->data,
+ ktri->encryptedKey->length) <= 0)
+ goto err;
+
+ ek = OPENSSL_malloc(eklen);
+
+ if (ek == NULL)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (EVP_PKEY_decrypt(pctx, ek, &eklen,
+ ktri->encryptedKey->data,
+ ktri->encryptedKey->length) <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT, CMS_R_CMS_LIB);
+ goto err;
+ }
+
+ ret = 1;
+
+ if (ec->key)
+ {
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ }
+
+ ec->key = ek;
+ ec->keylen = eklen;
+
+ err:
+ if (pctx)
+ EVP_PKEY_CTX_free(pctx);
+ if (!ret && ek)
+ OPENSSL_free(ek);
+
+ return ret;
+ }
+
+/* Key Encrypted Key (KEK) RecipientInfo routines */
+
+int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri,
+ const unsigned char *id, size_t idlen)
+ {
+ ASN1_OCTET_STRING tmp_os;
+ CMS_KEKRecipientInfo *kekri;
+ if (ri->type != CMS_RECIPINFO_KEK)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ID_CMP, CMS_R_NOT_KEK);
+ return -2;
+ }
+ kekri = ri->d.kekri;
+ tmp_os.type = V_ASN1_OCTET_STRING;
+ tmp_os.flags = 0;
+ tmp_os.data = (unsigned char *)id;
+ tmp_os.length = (int)idlen;
+ return ASN1_OCTET_STRING_cmp(&tmp_os, kekri->kekid->keyIdentifier);
+ }
+
+/* For now, hard-code the AES key wrap information */
+
+static size_t aes_wrap_keylen(int nid)
+ {
+ switch (nid)
+ {
+ case NID_id_aes128_wrap:
+ return 16;
+
+ case NID_id_aes192_wrap:
+ return 24;
+
+ case NID_id_aes256_wrap:
+ return 32;
+
+ default:
+ return 0;
+ }
+ }
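+
+/* Editor's note: AES key wrap (RFC 3394) adds an 8-byte integrity
+ * block, so a wrapped key is always the key length plus 8 bytes; the
+ * kekri encrypt/decrypt routines below size their buffers accordingly.
+ */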
+
+CMS_RecipientInfo *CMS_add0_recipient_key(CMS_ContentInfo *cms, int nid,
+ unsigned char *key, size_t keylen,
+ unsigned char *id, size_t idlen,
+ ASN1_GENERALIZEDTIME *date,
+ ASN1_OBJECT *otherTypeId,
+ ASN1_TYPE *otherType)
+ {
+ CMS_RecipientInfo *ri = NULL;
+ CMS_EnvelopedData *env;
+ CMS_KEKRecipientInfo *kekri;
+ env = cms_get0_enveloped(cms);
+ if (!env)
+ goto err;
+
+ if (nid == NID_undef)
+ {
+ switch (keylen)
+ {
+ case 16:
+ nid = NID_id_aes128_wrap;
+ break;
+
+ case 24:
+ nid = NID_id_aes192_wrap;
+ break;
+
+ case 32:
+ nid = NID_id_aes256_wrap;
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_KEY,
+ CMS_R_INVALID_KEY_LENGTH);
+ goto err;
+ }
+
+ }
+ else
+ {
+
+ size_t exp_keylen = aes_wrap_keylen(nid);
+
+ if (!exp_keylen)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_KEY,
+ CMS_R_UNSUPPORTED_KEK_ALGORITHM);
+ goto err;
+ }
+
+ if (keylen != exp_keylen)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_KEY,
+ CMS_R_INVALID_KEY_LENGTH);
+ goto err;
+ }
+
+ }
+
+ /* Initialize recipient info */
+ ri = M_ASN1_new_of(CMS_RecipientInfo);
+ if (!ri)
+ goto merr;
+
+ ri->d.kekri = M_ASN1_new_of(CMS_KEKRecipientInfo);
+ if (!ri->d.kekri)
+ goto merr;
+ ri->type = CMS_RECIPINFO_KEK;
+
+ kekri = ri->d.kekri;
+
+ if (otherTypeId)
+ {
+ kekri->kekid->other = M_ASN1_new_of(CMS_OtherKeyAttribute);
+ if (kekri->kekid->other == NULL)
+ goto merr;
+ }
+
+ if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri))
+ goto merr;
+
+
+ /* After this point no calls can fail */
+
+ kekri->version = 4;
+
+ kekri->key = key;
+ kekri->keylen = keylen;
+
+ ASN1_STRING_set0(kekri->kekid->keyIdentifier, id, idlen);
+
+ kekri->kekid->date = date;
+
+ if (kekri->kekid->other)
+ {
+ kekri->kekid->other->keyAttrId = otherTypeId;
+ kekri->kekid->other->keyAttr = otherType;
+ }
+
+ X509_ALGOR_set0(kekri->keyEncryptionAlgorithm,
+ OBJ_nid2obj(nid), V_ASN1_UNDEF, NULL);
+
+ return ri;
+
+ merr:
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_KEY, ERR_R_MALLOC_FAILURE);
+ err:
+ if (ri)
+ M_ASN1_free_of(ri, CMS_RecipientInfo);
+ return NULL;
+
+ }
+
+int CMS_RecipientInfo_kekri_get0_id(CMS_RecipientInfo *ri,
+ X509_ALGOR **palg,
+ ASN1_OCTET_STRING **pid,
+ ASN1_GENERALIZEDTIME **pdate,
+ ASN1_OBJECT **potherid,
+ ASN1_TYPE **pothertype)
+ {
+ CMS_KEKIdentifier *rkid;
+ if (ri->type != CMS_RECIPINFO_KEK)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_GET0_ID, CMS_R_NOT_KEK);
+ return 0;
+ }
+ rkid = ri->d.kekri->kekid;
+ if (palg)
+ *palg = ri->d.kekri->keyEncryptionAlgorithm;
+ if (pid)
+ *pid = rkid->keyIdentifier;
+ if (pdate)
+ *pdate = rkid->date;
+ if (potherid)
+ {
+ if (rkid->other)
+ *potherid = rkid->other->keyAttrId;
+ else
+ *potherid = NULL;
+ }
+ if (pothertype)
+ {
+ if (rkid->other)
+ *pothertype = rkid->other->keyAttr;
+ else
+ *pothertype = NULL;
+ }
+ return 1;
+ }
+
+int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri,
+ unsigned char *key, size_t keylen)
+ {
+ CMS_KEKRecipientInfo *kekri;
+ if (ri->type != CMS_RECIPINFO_KEK)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_KEY, CMS_R_NOT_KEK);
+ return 0;
+ }
+
+ kekri = ri->d.kekri;
+ kekri->key = key;
+ kekri->keylen = keylen;
+ return 1;
+ }
+
+
+/* Encrypt content key in KEK recipient info */
+
+static int cms_RecipientInfo_kekri_encrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+ {
+ CMS_EncryptedContentInfo *ec;
+ CMS_KEKRecipientInfo *kekri;
+ AES_KEY actx;
+ unsigned char *wkey = NULL;
+ int wkeylen;
+ int r = 0;
+
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ kekri = ri->d.kekri;
+
+ if (!kekri->key)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT, CMS_R_NO_KEY);
+ return 0;
+ }
+
+ if (AES_set_encrypt_key(kekri->key, kekri->keylen << 3, &actx))
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT,
+ CMS_R_ERROR_SETTING_KEY);
+ goto err;
+ }
+
+ wkey = OPENSSL_malloc(ec->keylen + 8);
+
+ if (!wkey)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ wkeylen = AES_wrap_key(&actx, NULL, wkey, ec->key, ec->keylen);
+
+ if (wkeylen <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT, CMS_R_WRAP_ERROR);
+ goto err;
+ }
+
+ ASN1_STRING_set0(kekri->encryptedKey, wkey, wkeylen);
+
+ r = 1;
+
+ err:
+
+ if (!r && wkey)
+ OPENSSL_free(wkey);
+ OPENSSL_cleanse(&actx, sizeof(actx));
+
+ return r;
+
+ }
+
+/* Decrypt content key in KEK recipient info */
+
+static int cms_RecipientInfo_kekri_decrypt(CMS_ContentInfo *cms,
+ CMS_RecipientInfo *ri)
+ {
+ CMS_EncryptedContentInfo *ec;
+ CMS_KEKRecipientInfo *kekri;
+ AES_KEY actx;
+ unsigned char *ukey = NULL;
+ int ukeylen;
+ int r = 0, wrap_nid;
+
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ kekri = ri->d.kekri;
+
+ if (!kekri->key)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT, CMS_R_NO_KEY);
+ return 0;
+ }
+
+ wrap_nid = OBJ_obj2nid(kekri->keyEncryptionAlgorithm->algorithm);
+ if (aes_wrap_keylen(wrap_nid) != kekri->keylen)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ CMS_R_INVALID_KEY_LENGTH);
+ return 0;
+ }
+
+	/* If the encrypted key length is invalid, don't bother */
+
+ if (kekri->encryptedKey->length < 16)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ CMS_R_INVALID_ENCRYPTED_KEY_LENGTH);
+ goto err;
+ }
+
+ if (AES_set_decrypt_key(kekri->key, kekri->keylen << 3, &actx))
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ CMS_R_ERROR_SETTING_KEY);
+ goto err;
+ }
+
+ ukey = OPENSSL_malloc(kekri->encryptedKey->length - 8);
+
+ if (!ukey)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ ukeylen = AES_unwrap_key(&actx, NULL, ukey,
+ kekri->encryptedKey->data,
+ kekri->encryptedKey->length);
+
+ if (ukeylen <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT,
+ CMS_R_UNWRAP_ERROR);
+ goto err;
+ }
+
+ ec->key = ukey;
+ ec->keylen = ukeylen;
+
+ r = 1;
+
+ err:
+
+ if (!r && ukey)
+ OPENSSL_free(ukey);
+ OPENSSL_cleanse(&actx, sizeof(actx));
+
+ return r;
+
+ }
+
+int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri)
+ {
+ switch(ri->type)
+ {
+ case CMS_RECIPINFO_TRANS:
+ return cms_RecipientInfo_ktri_decrypt(cms, ri);
+
+ case CMS_RECIPINFO_KEK:
+ return cms_RecipientInfo_kekri_decrypt(cms, ri);
+
+ case CMS_RECIPINFO_PASS:
+ return cms_RecipientInfo_pwri_crypt(cms, ri, 0);
+
+ default:
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT,
+ CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE);
+ return 0;
+ }
+ }
+
+BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms)
+ {
+ CMS_EncryptedContentInfo *ec;
+ STACK_OF(CMS_RecipientInfo) *rinfos;
+ CMS_RecipientInfo *ri;
+ int i, r, ok = 0;
+ BIO *ret;
+
+ /* Get BIO first to set up key */
+
+ ec = cms->d.envelopedData->encryptedContentInfo;
+ ret = cms_EncryptedContent_init_bio(ec);
+
+	/* If error or no cipher, end of processing */
+
+ if (!ret || !ec->cipher)
+ return ret;
+
+ /* Now encrypt content key according to each RecipientInfo type */
+
+ rinfos = cms->d.envelopedData->recipientInfos;
+
+ for (i = 0; i < sk_CMS_RecipientInfo_num(rinfos); i++)
+ {
+ ri = sk_CMS_RecipientInfo_value(rinfos, i);
+
+ switch (ri->type)
+ {
+ case CMS_RECIPINFO_TRANS:
+ r = cms_RecipientInfo_ktri_encrypt(cms, ri);
+ break;
+
+ case CMS_RECIPINFO_KEK:
+ r = cms_RecipientInfo_kekri_encrypt(cms, ri);
+ break;
+
+ case CMS_RECIPINFO_PASS:
+ r = cms_RecipientInfo_pwri_crypt(cms, ri, 1);
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO,
+ CMS_R_UNSUPPORTED_RECIPIENT_TYPE);
+ goto err;
+ }
+
+ if (r <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO,
+ CMS_R_ERROR_SETTING_RECIPIENTINFO);
+ goto err;
+ }
+ }
+
+ ok = 1;
+
+ err:
+ ec->cipher = NULL;
+ if (ec->key)
+ {
+ OPENSSL_cleanse(ec->key, ec->keylen);
+ OPENSSL_free(ec->key);
+ ec->key = NULL;
+ ec->keylen = 0;
+ }
+ if (ok)
+ return ret;
+ BIO_free(ret);
+ return NULL;
+
+ }
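
The KEK routines above are normally driven through the public CMS API rather
than called directly. A rough usage sketch (not part of this commit; the
helper names, the 16-byte KEK and its identifier are illustrative
assumptions) might look like this:

#include <openssl/bio.h>
#include <openssl/cms.h>
#include <openssl/evp.h>
#include <openssl/objects.h>

/* Hypothetical sketch: envelope data for a symmetric KEK recipient using
 * AES-128 key wrap. CMS_PARTIAL defers processing so recipients can be
 * added before CMS_final() runs the data through the BIO chain. */
static CMS_ContentInfo *kek_encrypt(BIO *in, unsigned char *kek,
				    unsigned char *id, size_t idlen)
	{
	CMS_ContentInfo *cms;
	cms = CMS_encrypt(NULL, in, EVP_aes_128_cbc(),
			  CMS_BINARY | CMS_PARTIAL);
	if (!cms)
		return NULL;
	if (CMS_add0_recipient_key(cms, NID_id_aes128_wrap, kek, 16,
				   id, idlen, NULL, NULL, NULL)
	    && CMS_final(cms, in, NULL, CMS_BINARY))
		return cms;
	CMS_ContentInfo_free(cms);
	return NULL;
	}

/* Decrypt side: supply the same KEK, which ends up in kekri->key via
 * CMS_RecipientInfo_set0_key(), then decrypt the content. */
static int kek_decrypt(CMS_ContentInfo *cms, BIO *out, unsigned char *kek,
		       unsigned char *id, size_t idlen)
	{
	if (!CMS_decrypt_set1_key(cms, kek, 16, id, idlen))
		return 0;
	return CMS_decrypt(cms, NULL, NULL, NULL, out, CMS_BINARY);
	}
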
diff --git a/main/openssl/crypto/cms/cms_err.c b/main/openssl/crypto/cms/cms_err.c
new file mode 100644
index 00000000..8330ead7
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_err.c
@@ -0,0 +1,245 @@
+/* crypto/cms/cms_err.c */
+/* ====================================================================
+ * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/* NOTE: this file was auto generated by the mkerr.pl script: any changes
+ * made to it will be overwritten when the script next updates this file,
+ * only reason strings will be preserved.
+ */
+
+#include <stdio.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+
+/* BEGIN ERROR CODES */
+#ifndef OPENSSL_NO_ERR
+
+#define ERR_FUNC(func) ERR_PACK(ERR_LIB_CMS,func,0)
+#define ERR_REASON(reason) ERR_PACK(ERR_LIB_CMS,0,reason)
+
+static ERR_STRING_DATA CMS_str_functs[]=
+ {
+{ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"},
+{ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"},
+{ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"},
+{ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD), "CMS_add0_recipient_password"},
+{ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"},
+{ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"},
+{ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"},
+{ERR_FUNC(CMS_F_CMS_ADD1_SIGNINGTIME), "CMS_ADD1_SIGNINGTIME"},
+{ERR_FUNC(CMS_F_CMS_COMPRESS), "CMS_compress"},
+{ERR_FUNC(CMS_F_CMS_COMPRESSEDDATA_CREATE), "cms_CompressedData_create"},
+{ERR_FUNC(CMS_F_CMS_COMPRESSEDDATA_INIT_BIO), "cms_CompressedData_init_bio"},
+{ERR_FUNC(CMS_F_CMS_COPY_CONTENT), "CMS_COPY_CONTENT"},
+{ERR_FUNC(CMS_F_CMS_COPY_MESSAGEDIGEST), "CMS_COPY_MESSAGEDIGEST"},
+{ERR_FUNC(CMS_F_CMS_DATA), "CMS_data"},
+{ERR_FUNC(CMS_F_CMS_DATAFINAL), "CMS_dataFinal"},
+{ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PASSWORD), "CMS_decrypt_set1_password"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"},
+{ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"},
+{ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"},
+{ERR_FUNC(CMS_F_CMS_DIGESTEDDATA_DO_FINAL), "cms_DigestedData_do_final"},
+{ERR_FUNC(CMS_F_CMS_DIGEST_VERIFY), "CMS_digest_verify"},
+{ERR_FUNC(CMS_F_CMS_ENCODE_RECEIPT), "cms_encode_Receipt"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPT), "CMS_encrypt"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO), "cms_EncryptedContent_init_bio"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPTEDDATA_DECRYPT), "CMS_EncryptedData_decrypt"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPTEDDATA_ENCRYPT), "CMS_EncryptedData_encrypt"},
+{ERR_FUNC(CMS_F_CMS_ENCRYPTEDDATA_SET1_KEY), "CMS_EncryptedData_set1_key"},
+{ERR_FUNC(CMS_F_CMS_ENVELOPEDDATA_CREATE), "CMS_EnvelopedData_create"},
+{ERR_FUNC(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO), "cms_EnvelopedData_init_bio"},
+{ERR_FUNC(CMS_F_CMS_ENVELOPED_DATA_INIT), "CMS_ENVELOPED_DATA_INIT"},
+{ERR_FUNC(CMS_F_CMS_FINAL), "CMS_final"},
+{ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"},
+{ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"},
+{ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"},
+{ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "cms_get0_enveloped"},
+{ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"},
+{ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"},
+{ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"},
+{ERR_FUNC(CMS_F_CMS_RECEIPTREQUEST_CREATE0), "CMS_ReceiptRequest_create0"},
+{ERR_FUNC(CMS_F_CMS_RECEIPT_VERIFY), "cms_Receipt_verify"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_DECRYPT), "CMS_RecipientInfo_decrypt"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_DECRYPT), "CMS_RECIPIENTINFO_KEKRI_DECRYPT"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_ENCRYPT), "CMS_RECIPIENTINFO_KEKRI_ENCRYPT"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_GET0_ID), "CMS_RecipientInfo_kekri_get0_id"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KEKRI_ID_CMP), "CMS_RecipientInfo_kekri_id_cmp"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_CERT_CMP), "CMS_RecipientInfo_ktri_cert_cmp"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT), "CMS_RECIPIENTINFO_KTRI_DECRYPT"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT), "cms_RecipientInfo_pwri_crypt"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD), "CMS_RecipientInfo_set0_password"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"},
+{ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"},
+{ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"},
+{ERR_FUNC(CMS_F_CMS_SIGN), "CMS_sign"},
+{ERR_FUNC(CMS_F_CMS_SIGNED_DATA_INIT), "CMS_SIGNED_DATA_INIT"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN), "CMS_SIGNERINFO_CONTENT_SIGN"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_SIGN), "CMS_SignerInfo_sign"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_VERIFY), "CMS_SignerInfo_verify"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_VERIFY_CERT), "CMS_SIGNERINFO_VERIFY_CERT"},
+{ERR_FUNC(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT), "CMS_SignerInfo_verify_content"},
+{ERR_FUNC(CMS_F_CMS_SIGN_RECEIPT), "CMS_sign_receipt"},
+{ERR_FUNC(CMS_F_CMS_STREAM), "CMS_stream"},
+{ERR_FUNC(CMS_F_CMS_UNCOMPRESS), "CMS_uncompress"},
+{ERR_FUNC(CMS_F_CMS_VERIFY), "CMS_verify"},
+{0,NULL}
+ };
+
+static ERR_STRING_DATA CMS_str_reasons[]=
+ {
+{ERR_REASON(CMS_R_ADD_SIGNER_ERROR) ,"add signer error"},
+{ERR_REASON(CMS_R_CERTIFICATE_ALREADY_PRESENT),"certificate already present"},
+{ERR_REASON(CMS_R_CERTIFICATE_HAS_NO_KEYID),"certificate has no keyid"},
+{ERR_REASON(CMS_R_CERTIFICATE_VERIFY_ERROR),"certificate verify error"},
+{ERR_REASON(CMS_R_CIPHER_INITIALISATION_ERROR),"cipher initialisation error"},
+{ERR_REASON(CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR),"cipher parameter initialisation error"},
+{ERR_REASON(CMS_R_CMS_DATAFINAL_ERROR) ,"cms datafinal error"},
+{ERR_REASON(CMS_R_CMS_LIB) ,"cms lib"},
+{ERR_REASON(CMS_R_CONTENTIDENTIFIER_MISMATCH),"contentidentifier mismatch"},
+{ERR_REASON(CMS_R_CONTENT_NOT_FOUND) ,"content not found"},
+{ERR_REASON(CMS_R_CONTENT_TYPE_MISMATCH) ,"content type mismatch"},
+{ERR_REASON(CMS_R_CONTENT_TYPE_NOT_COMPRESSED_DATA),"content type not compressed data"},
+{ERR_REASON(CMS_R_CONTENT_TYPE_NOT_ENVELOPED_DATA),"content type not enveloped data"},
+{ERR_REASON(CMS_R_CONTENT_TYPE_NOT_SIGNED_DATA),"content type not signed data"},
+{ERR_REASON(CMS_R_CONTENT_VERIFY_ERROR) ,"content verify error"},
+{ERR_REASON(CMS_R_CTRL_ERROR) ,"ctrl error"},
+{ERR_REASON(CMS_R_CTRL_FAILURE) ,"ctrl failure"},
+{ERR_REASON(CMS_R_DECRYPT_ERROR) ,"decrypt error"},
+{ERR_REASON(CMS_R_DIGEST_ERROR) ,"digest error"},
+{ERR_REASON(CMS_R_ERROR_GETTING_PUBLIC_KEY),"error getting public key"},
+{ERR_REASON(CMS_R_ERROR_READING_MESSAGEDIGEST_ATTRIBUTE),"error reading messagedigest attribute"},
+{ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"},
+{ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"},
+{ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"},
+{ERR_REASON(CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER),"invalid key encryption parameter"},
+{ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"},
+{ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"},
+{ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"},
+{ERR_REASON(CMS_R_MESSAGEDIGEST_WRONG_LENGTH),"messagedigest wrong length"},
+{ERR_REASON(CMS_R_MSGSIGDIGEST_ERROR) ,"msgsigdigest error"},
+{ERR_REASON(CMS_R_MSGSIGDIGEST_VERIFICATION_FAILURE),"msgsigdigest verification failure"},
+{ERR_REASON(CMS_R_MSGSIGDIGEST_WRONG_LENGTH),"msgsigdigest wrong length"},
+{ERR_REASON(CMS_R_NEED_ONE_SIGNER) ,"need one signer"},
+{ERR_REASON(CMS_R_NOT_A_SIGNED_RECEIPT) ,"not a signed receipt"},
+{ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"},
+{ERR_REASON(CMS_R_NOT_KEK) ,"not kek"},
+{ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"},
+{ERR_REASON(CMS_R_NOT_PWRI) ,"not pwri"},
+{ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"},
+{ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"},
+{ERR_REASON(CMS_R_NO_CONTENT) ,"no content"},
+{ERR_REASON(CMS_R_NO_CONTENT_TYPE) ,"no content type"},
+{ERR_REASON(CMS_R_NO_DEFAULT_DIGEST) ,"no default digest"},
+{ERR_REASON(CMS_R_NO_DIGEST_SET) ,"no digest set"},
+{ERR_REASON(CMS_R_NO_KEY) ,"no key"},
+{ERR_REASON(CMS_R_NO_KEY_OR_CERT) ,"no key or cert"},
+{ERR_REASON(CMS_R_NO_MATCHING_DIGEST) ,"no matching digest"},
+{ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"},
+{ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"},
+{ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"},
+{ERR_REASON(CMS_R_NO_PASSWORD) ,"no password"},
+{ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"},
+{ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"},
+{ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"},
+{ERR_REASON(CMS_R_NO_SIGNERS) ,"no signers"},
+{ERR_REASON(CMS_R_PRIVATE_KEY_DOES_NOT_MATCH_CERTIFICATE),"private key does not match certificate"},
+{ERR_REASON(CMS_R_RECEIPT_DECODE_ERROR) ,"receipt decode error"},
+{ERR_REASON(CMS_R_RECIPIENT_ERROR) ,"recipient error"},
+{ERR_REASON(CMS_R_SIGNER_CERTIFICATE_NOT_FOUND),"signer certificate not found"},
+{ERR_REASON(CMS_R_SIGNFINAL_ERROR) ,"signfinal error"},
+{ERR_REASON(CMS_R_SMIME_TEXT_ERROR) ,"smime text error"},
+{ERR_REASON(CMS_R_STORE_INIT_ERROR) ,"store init error"},
+{ERR_REASON(CMS_R_TYPE_NOT_COMPRESSED_DATA),"type not compressed data"},
+{ERR_REASON(CMS_R_TYPE_NOT_DATA) ,"type not data"},
+{ERR_REASON(CMS_R_TYPE_NOT_DIGESTED_DATA),"type not digested data"},
+{ERR_REASON(CMS_R_TYPE_NOT_ENCRYPTED_DATA),"type not encrypted data"},
+{ERR_REASON(CMS_R_TYPE_NOT_ENVELOPED_DATA),"type not enveloped data"},
+{ERR_REASON(CMS_R_UNABLE_TO_FINALIZE_CONTEXT),"unable to finalize context"},
+{ERR_REASON(CMS_R_UNKNOWN_CIPHER) ,"unknown cipher"},
+{ERR_REASON(CMS_R_UNKNOWN_DIGEST_ALGORIHM),"unknown digest algorihm"},
+{ERR_REASON(CMS_R_UNKNOWN_ID) ,"unknown id"},
+{ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"},
+{ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"},
+{ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"},
+{ERR_REASON(CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM),"unsupported key encryption algorithm"},
+{ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"},
+{ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"},
+{ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"},
+{ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"},
+{ERR_REASON(CMS_R_UNWRAP_FAILURE) ,"unwrap failure"},
+{ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"},
+{ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"},
+{0,NULL}
+ };
+
+#endif
+
+void ERR_load_CMS_strings(void)
+ {
+#ifndef OPENSSL_NO_ERR
+
+ if (ERR_func_error_string(CMS_str_functs[0].error) == NULL)
+ {
+ ERR_load_strings(0,CMS_str_functs);
+ ERR_load_strings(0,CMS_str_reasons);
+ }
+#endif
+ }
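
Once registered, these tables are consulted by the generic ERR machinery. A
minimal sketch of how an application might surface them (hypothetical
example, not from this commit):

#include <stdio.h>
#include <openssl/err.h>
#include <openssl/cms.h>

int main(void)
	{
	/* Register the CMS tables above; ERR_load_crypto_strings() would
	 * register every crypto library's table, including this one. */
	ERR_load_CMS_strings();
	/* Raise a sample error so there is something to print */
	CMSerr(CMS_F_CMS_DECRYPT, CMS_R_NO_KEY);
	/* Prints the function and reason strings from the tables */
	ERR_print_errors_fp(stderr);
	return 0;
	}
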
diff --git a/main/openssl/crypto/cms/cms_ess.c b/main/openssl/crypto/cms/cms_ess.c
new file mode 100644
index 00000000..90c0b82f
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_ess.c
@@ -0,0 +1,420 @@
+/* crypto/cms/cms_ess.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/rand.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include "cms_lcl.h"
+
+DECLARE_ASN1_ITEM(CMS_ReceiptRequest)
+DECLARE_ASN1_ITEM(CMS_Receipt)
+
+IMPLEMENT_ASN1_FUNCTIONS(CMS_ReceiptRequest)
+
+/* ESS services: for now just Signed Receipt related */
+
+int CMS_get1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest **prr)
+ {
+ ASN1_STRING *str;
+ CMS_ReceiptRequest *rr = NULL;
+ if (prr)
+ *prr = NULL;
+ str = CMS_signed_get0_data_by_OBJ(si,
+ OBJ_nid2obj(NID_id_smime_aa_receiptRequest),
+ -3, V_ASN1_SEQUENCE);
+ if (!str)
+ return 0;
+
+ rr = ASN1_item_unpack(str, ASN1_ITEM_rptr(CMS_ReceiptRequest));
+ if (!rr)
+ return -1;
+ if (prr)
+ *prr = rr;
+ else
+ CMS_ReceiptRequest_free(rr);
+ return 1;
+ }
+
+CMS_ReceiptRequest *CMS_ReceiptRequest_create0(unsigned char *id, int idlen,
+ int allorfirst,
+ STACK_OF(GENERAL_NAMES) *receiptList,
+ STACK_OF(GENERAL_NAMES) *receiptsTo)
+ {
+ CMS_ReceiptRequest *rr = NULL;
+
+ rr = CMS_ReceiptRequest_new();
+ if (!rr)
+ goto merr;
+ if (id)
+ ASN1_STRING_set0(rr->signedContentIdentifier, id, idlen);
+ else
+ {
+ if (!ASN1_STRING_set(rr->signedContentIdentifier, NULL, 32))
+ goto merr;
+ if (RAND_pseudo_bytes(rr->signedContentIdentifier->data, 32)
+ <= 0)
+ goto err;
+ }
+
+ sk_GENERAL_NAMES_pop_free(rr->receiptsTo, GENERAL_NAMES_free);
+ rr->receiptsTo = receiptsTo;
+
+ if (receiptList)
+ {
+ rr->receiptsFrom->type = 1;
+ rr->receiptsFrom->d.receiptList = receiptList;
+ }
+ else
+ {
+ rr->receiptsFrom->type = 0;
+ rr->receiptsFrom->d.allOrFirstTier = allorfirst;
+ }
+
+ return rr;
+
+ merr:
+ CMSerr(CMS_F_CMS_RECEIPTREQUEST_CREATE0, ERR_R_MALLOC_FAILURE);
+
+ err:
+ if (rr)
+ CMS_ReceiptRequest_free(rr);
+
+ return NULL;
+
+ }
+
+int CMS_add1_ReceiptRequest(CMS_SignerInfo *si, CMS_ReceiptRequest *rr)
+ {
+ unsigned char *rrder = NULL;
+ int rrderlen, r = 0;
+
+ rrderlen = i2d_CMS_ReceiptRequest(rr, &rrder);
+ if (rrderlen < 0)
+ goto merr;
+
+ if (!CMS_signed_add1_attr_by_NID(si, NID_id_smime_aa_receiptRequest,
+ V_ASN1_SEQUENCE, rrder, rrderlen))
+ goto merr;
+
+ r = 1;
+
+ merr:
+ if (!r)
+ CMSerr(CMS_F_CMS_ADD1_RECEIPTREQUEST, ERR_R_MALLOC_FAILURE);
+
+ if (rrder)
+ OPENSSL_free(rrder);
+
+ return r;
+
+ }
+
+void CMS_ReceiptRequest_get0_values(CMS_ReceiptRequest *rr,
+ ASN1_STRING **pcid,
+ int *pallorfirst,
+ STACK_OF(GENERAL_NAMES) **plist,
+ STACK_OF(GENERAL_NAMES) **prto)
+ {
+ if (pcid)
+ *pcid = rr->signedContentIdentifier;
+ if (rr->receiptsFrom->type == 0)
+ {
+ if (pallorfirst)
+ *pallorfirst = (int)rr->receiptsFrom->d.allOrFirstTier;
+ if (plist)
+ *plist = NULL;
+ }
+ else
+ {
+ if (pallorfirst)
+ *pallorfirst = -1;
+ if (plist)
+ *plist = rr->receiptsFrom->d.receiptList;
+ }
+ if (prto)
+ *prto = rr->receiptsTo;
+ }
+
+/* Digest a SignerInfo structure for msgSigDigest attribute processing */
+
+static int cms_msgSigDigest(CMS_SignerInfo *si,
+ unsigned char *dig, unsigned int *diglen)
+ {
+ const EVP_MD *md;
+ md = EVP_get_digestbyobj(si->digestAlgorithm->algorithm);
+ if (md == NULL)
+ return 0;
+ if (!ASN1_item_digest(ASN1_ITEM_rptr(CMS_Attributes_Verify), md,
+ si->signedAttrs, dig, diglen))
+ return 0;
+ return 1;
+ }
+
+/* Add a msgSigDigest attribute to a SignerInfo */
+
+int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src)
+ {
+ unsigned char dig[EVP_MAX_MD_SIZE];
+ unsigned int diglen;
+ if (!cms_msgSigDigest(src, dig, &diglen))
+ {
+ CMSerr(CMS_F_CMS_MSGSIGDIGEST_ADD1, CMS_R_MSGSIGDIGEST_ERROR);
+ return 0;
+ }
+ if (!CMS_signed_add1_attr_by_NID(dest, NID_id_smime_aa_msgSigDigest,
+ V_ASN1_OCTET_STRING, dig, diglen))
+ {
+ CMSerr(CMS_F_CMS_MSGSIGDIGEST_ADD1, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ return 1;
+ }
+
+/* Verify signed receipt after it has already passed normal CMS verify */
+
+int cms_Receipt_verify(CMS_ContentInfo *cms, CMS_ContentInfo *req_cms)
+ {
+ int r = 0, i;
+ CMS_ReceiptRequest *rr = NULL;
+ CMS_Receipt *rct = NULL;
+ STACK_OF(CMS_SignerInfo) *sis, *osis;
+ CMS_SignerInfo *si, *osi = NULL;
+ ASN1_OCTET_STRING *msig, **pcont;
+ ASN1_OBJECT *octype;
+ unsigned char dig[EVP_MAX_MD_SIZE];
+ unsigned int diglen;
+
+ /* Get SignerInfos, also checks SignedData content type */
+ osis = CMS_get0_SignerInfos(req_cms);
+ sis = CMS_get0_SignerInfos(cms);
+ if (!osis || !sis)
+ goto err;
+
+ if (sk_CMS_SignerInfo_num(sis) != 1)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NEED_ONE_SIGNER);
+ goto err;
+ }
+
+ /* Check receipt content type */
+ if (OBJ_obj2nid(CMS_get0_eContentType(cms)) != NID_id_smime_ct_receipt)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NOT_A_SIGNED_RECEIPT);
+ goto err;
+ }
+
+ /* Extract and decode receipt content */
+ pcont = CMS_get0_content(cms);
+ if (!pcont || !*pcont)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_CONTENT);
+ goto err;
+ }
+
+ rct = ASN1_item_unpack(*pcont, ASN1_ITEM_rptr(CMS_Receipt));
+
+ if (!rct)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_RECEIPT_DECODE_ERROR);
+ goto err;
+ }
+
+ /* Locate original request */
+
+ for (i = 0; i < sk_CMS_SignerInfo_num(osis); i++)
+ {
+ osi = sk_CMS_SignerInfo_value(osis, i);
+ if (!ASN1_STRING_cmp(osi->signature,
+ rct->originatorSignatureValue))
+ break;
+ }
+
+ if (i == sk_CMS_SignerInfo_num(osis))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_MATCHING_SIGNATURE);
+ goto err;
+ }
+
+ si = sk_CMS_SignerInfo_value(sis, 0);
+
+ /* Get msgSigDigest value and compare */
+
+ msig = CMS_signed_get0_data_by_OBJ(si,
+ OBJ_nid2obj(NID_id_smime_aa_msgSigDigest),
+ -3, V_ASN1_OCTET_STRING);
+
+ if (!msig)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_MSGSIGDIGEST);
+ goto err;
+ }
+
+ if (!cms_msgSigDigest(osi, dig, &diglen))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_MSGSIGDIGEST_ERROR);
+ goto err;
+ }
+
+ if (diglen != (unsigned int)msig->length)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY,
+ CMS_R_MSGSIGDIGEST_WRONG_LENGTH);
+ goto err;
+ }
+
+ if (memcmp(dig, msig->data, diglen))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY,
+ CMS_R_MSGSIGDIGEST_VERIFICATION_FAILURE);
+ goto err;
+ }
+
+ /* Compare content types */
+
+ octype = CMS_signed_get0_data_by_OBJ(osi,
+ OBJ_nid2obj(NID_pkcs9_contentType),
+ -3, V_ASN1_OBJECT);
+ if (!octype)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_CONTENT_TYPE);
+ goto err;
+ }
+
+ /* Compare details in receipt request */
+
+ if (OBJ_cmp(octype, rct->contentType))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_CONTENT_TYPE_MISMATCH);
+ goto err;
+ }
+
+ /* Get original receipt request details */
+
+ if (CMS_get1_ReceiptRequest(osi, &rr) <= 0)
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY, CMS_R_NO_RECEIPT_REQUEST);
+ goto err;
+ }
+
+ if (ASN1_STRING_cmp(rr->signedContentIdentifier,
+ rct->signedContentIdentifier))
+ {
+ CMSerr(CMS_F_CMS_RECEIPT_VERIFY,
+ CMS_R_CONTENTIDENTIFIER_MISMATCH);
+ goto err;
+ }
+
+ r = 1;
+
+ err:
+ if (rr)
+ CMS_ReceiptRequest_free(rr);
+ if (rct)
+ M_ASN1_free_of(rct, CMS_Receipt);
+
+ return r;
+
+ }
+
+/* Encode a Receipt into an OCTET STRING ready for inclusion in the
+ * content of a SignedData ContentInfo.
+ */
+
+ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si)
+ {
+ CMS_Receipt rct;
+ CMS_ReceiptRequest *rr = NULL;
+ ASN1_OBJECT *ctype;
+ ASN1_OCTET_STRING *os = NULL;
+
+	/* Get original receipt request details */
+
+ if (CMS_get1_ReceiptRequest(si, &rr) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ENCODE_RECEIPT, CMS_R_NO_RECEIPT_REQUEST);
+ goto err;
+ }
+
+ /* Get original content type */
+
+ ctype = CMS_signed_get0_data_by_OBJ(si,
+ OBJ_nid2obj(NID_pkcs9_contentType),
+ -3, V_ASN1_OBJECT);
+ if (!ctype)
+ {
+ CMSerr(CMS_F_CMS_ENCODE_RECEIPT, CMS_R_NO_CONTENT_TYPE);
+ goto err;
+ }
+
+ rct.version = 1;
+ rct.contentType = ctype;
+ rct.signedContentIdentifier = rr->signedContentIdentifier;
+ rct.originatorSignatureValue = si->signature;
+
+ os = ASN1_item_pack(&rct, ASN1_ITEM_rptr(CMS_Receipt), NULL);
+
+ err:
+ if (rr)
+ CMS_ReceiptRequest_free(rr);
+
+ return os;
+
+ }
+
+
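A plausible caller-side sketch for the receipt-request helpers above
(hypothetical function, not from this commit). Note that
CMS_ReceiptRequest_create0() takes ownership of the receiptsTo stack, and a
NULL id asks the library to generate a random 32-byte
signedContentIdentifier, as seen in the function body earlier:

#include <openssl/cms.h>

static int request_receipt(CMS_SignerInfo *si,
			   STACK_OF(GENERAL_NAMES) *receiptsTo)
	{
	CMS_ReceiptRequest *rr;
	int r;
	/* allorfirst == 0 requests receipts from all recipients */
	rr = CMS_ReceiptRequest_create0(NULL, -1, 0, NULL, receiptsTo);
	if (!rr)
		return 0;
	/* add1: the request is DER-encoded into a signed attribute,
	 * so rr can be freed afterwards */
	r = CMS_add1_ReceiptRequest(si, rr);
	CMS_ReceiptRequest_free(rr);
	return r;
	}
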
diff --git a/main/openssl/crypto/cms/cms_io.c b/main/openssl/crypto/cms/cms_io.c
new file mode 100644
index 00000000..1cb0264c
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_io.c
@@ -0,0 +1,133 @@
+/* crypto/cms/cms_io.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/err.h>
+#include <openssl/pem.h>
+#include "cms.h"
+#include "cms_lcl.h"
+
+int CMS_stream(unsigned char ***boundary, CMS_ContentInfo *cms)
+ {
+ ASN1_OCTET_STRING **pos;
+ pos = CMS_get0_content(cms);
+ if (!pos)
+ return 0;
+ if (!*pos)
+ *pos = ASN1_OCTET_STRING_new();
+ if (*pos)
+ {
+ (*pos)->flags |= ASN1_STRING_FLAG_NDEF;
+ (*pos)->flags &= ~ASN1_STRING_FLAG_CONT;
+ *boundary = &(*pos)->data;
+ return 1;
+ }
+ CMSerr(CMS_F_CMS_STREAM, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+
+CMS_ContentInfo *d2i_CMS_bio(BIO *bp, CMS_ContentInfo **cms)
+ {
+ return ASN1_item_d2i_bio(ASN1_ITEM_rptr(CMS_ContentInfo), bp, cms);
+ }
+
+int i2d_CMS_bio(BIO *bp, CMS_ContentInfo *cms)
+ {
+ return ASN1_item_i2d_bio(ASN1_ITEM_rptr(CMS_ContentInfo), bp, cms);
+ }
+
+IMPLEMENT_PEM_rw_const(CMS, CMS_ContentInfo, PEM_STRING_CMS, CMS_ContentInfo)
+
+BIO *BIO_new_CMS(BIO *out, CMS_ContentInfo *cms)
+ {
+ return BIO_new_NDEF(out, (ASN1_VALUE *)cms,
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
+/* CMS wrappers round generalised stream and MIME routines */
+
+int i2d_CMS_bio_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags)
+ {
+ return i2d_ASN1_bio_stream(out, (ASN1_VALUE *)cms, in, flags,
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
+int PEM_write_bio_CMS_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags)
+ {
+ return PEM_write_bio_ASN1_stream(out, (ASN1_VALUE *) cms, in, flags,
+ "CMS",
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
+int SMIME_write_CMS(BIO *bio, CMS_ContentInfo *cms, BIO *data, int flags)
+ {
+ STACK_OF(X509_ALGOR) *mdalgs;
+ int ctype_nid = OBJ_obj2nid(cms->contentType);
+ int econt_nid = OBJ_obj2nid(CMS_get0_eContentType(cms));
+ if (ctype_nid == NID_pkcs7_signed)
+ mdalgs = cms->d.signedData->digestAlgorithms;
+ else
+ mdalgs = NULL;
+
+ return SMIME_write_ASN1(bio, (ASN1_VALUE *)cms, data, flags,
+ ctype_nid, econt_nid, mdalgs,
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
+CMS_ContentInfo *SMIME_read_CMS(BIO *bio, BIO **bcont)
+ {
+ return (CMS_ContentInfo *)SMIME_read_ASN1(bio, bcont,
+ ASN1_ITEM_rptr(CMS_ContentInfo));
+ }
+
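The wrappers above make a full S/MIME round trip short. A minimal sketch
(hypothetical helper, not from this commit), writing a CMS structure out as
S/MIME and parsing it back from a memory BIO:

#include <openssl/bio.h>
#include <openssl/cms.h>

static CMS_ContentInfo *smime_roundtrip(CMS_ContentInfo *cms, BIO *data)
	{
	BIO *tmp = BIO_new(BIO_s_mem());
	BIO *bcont = NULL;
	CMS_ContentInfo *parsed = NULL;
	if (!tmp)
		return NULL;
	if (SMIME_write_CMS(tmp, cms, data, CMS_STREAM))
		parsed = SMIME_read_CMS(tmp, &bcont);
	BIO_free(bcont);	/* non-NULL only for detached content */
	BIO_free(tmp);
	return parsed;
	}
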
diff --git a/main/openssl/crypto/cms/cms_lcl.h b/main/openssl/crypto/cms/cms_lcl.h
new file mode 100644
index 00000000..a9f97301
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_lcl.h
@@ -0,0 +1,473 @@
+/* crypto/cms/cms_lcl.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#ifndef HEADER_CMS_LCL_H
+#define HEADER_CMS_LCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/x509.h>
+
+/* Cryptographic message syntax (CMS) structures: taken
+ * from RFC3852
+ */
+
+/* Forward references */
+
+typedef struct CMS_IssuerAndSerialNumber_st CMS_IssuerAndSerialNumber;
+typedef struct CMS_EncapsulatedContentInfo_st CMS_EncapsulatedContentInfo;
+typedef struct CMS_SignerIdentifier_st CMS_SignerIdentifier;
+typedef struct CMS_SignedData_st CMS_SignedData;
+typedef struct CMS_OtherRevocationInfoFormat_st CMS_OtherRevocationInfoFormat;
+typedef struct CMS_OriginatorInfo_st CMS_OriginatorInfo;
+typedef struct CMS_EncryptedContentInfo_st CMS_EncryptedContentInfo;
+typedef struct CMS_EnvelopedData_st CMS_EnvelopedData;
+typedef struct CMS_DigestedData_st CMS_DigestedData;
+typedef struct CMS_EncryptedData_st CMS_EncryptedData;
+typedef struct CMS_AuthenticatedData_st CMS_AuthenticatedData;
+typedef struct CMS_CompressedData_st CMS_CompressedData;
+typedef struct CMS_OtherCertificateFormat_st CMS_OtherCertificateFormat;
+typedef struct CMS_KeyTransRecipientInfo_st CMS_KeyTransRecipientInfo;
+typedef struct CMS_OriginatorPublicKey_st CMS_OriginatorPublicKey;
+typedef struct CMS_OriginatorIdentifierOrKey_st CMS_OriginatorIdentifierOrKey;
+typedef struct CMS_KeyAgreeRecipientInfo_st CMS_KeyAgreeRecipientInfo;
+typedef struct CMS_OtherKeyAttribute_st CMS_OtherKeyAttribute;
+typedef struct CMS_RecipientKeyIdentifier_st CMS_RecipientKeyIdentifier;
+typedef struct CMS_KeyAgreeRecipientIdentifier_st CMS_KeyAgreeRecipientIdentifier;
+typedef struct CMS_RecipientEncryptedKey_st CMS_RecipientEncryptedKey;
+typedef struct CMS_KEKIdentifier_st CMS_KEKIdentifier;
+typedef struct CMS_KEKRecipientInfo_st CMS_KEKRecipientInfo;
+typedef struct CMS_PasswordRecipientInfo_st CMS_PasswordRecipientInfo;
+typedef struct CMS_OtherRecipientInfo_st CMS_OtherRecipientInfo;
+typedef struct CMS_ReceiptsFrom_st CMS_ReceiptsFrom;
+
+struct CMS_ContentInfo_st
+ {
+ ASN1_OBJECT *contentType;
+ union {
+ ASN1_OCTET_STRING *data;
+ CMS_SignedData *signedData;
+ CMS_EnvelopedData *envelopedData;
+ CMS_DigestedData *digestedData;
+ CMS_EncryptedData *encryptedData;
+ CMS_AuthenticatedData *authenticatedData;
+ CMS_CompressedData *compressedData;
+ ASN1_TYPE *other;
+ /* Other types ... */
+ void *otherData;
+ } d;
+ };
+
+struct CMS_SignedData_st
+ {
+ long version;
+ STACK_OF(X509_ALGOR) *digestAlgorithms;
+ CMS_EncapsulatedContentInfo *encapContentInfo;
+ STACK_OF(CMS_CertificateChoices) *certificates;
+ STACK_OF(CMS_RevocationInfoChoice) *crls;
+ STACK_OF(CMS_SignerInfo) *signerInfos;
+ };
+
+struct CMS_EncapsulatedContentInfo_st
+ {
+ ASN1_OBJECT *eContentType;
+ ASN1_OCTET_STRING *eContent;
+	/* Set to 1 if the structure is incomplete, i.e. only partly set up */
+ int partial;
+ };
+
+struct CMS_SignerInfo_st
+ {
+ long version;
+ CMS_SignerIdentifier *sid;
+ X509_ALGOR *digestAlgorithm;
+ STACK_OF(X509_ATTRIBUTE) *signedAttrs;
+ X509_ALGOR *signatureAlgorithm;
+ ASN1_OCTET_STRING *signature;
+ STACK_OF(X509_ATTRIBUTE) *unsignedAttrs;
+ /* Signing certificate and key */
+ X509 *signer;
+ EVP_PKEY *pkey;
+ };
+
+struct CMS_SignerIdentifier_st
+ {
+ int type;
+ union {
+ CMS_IssuerAndSerialNumber *issuerAndSerialNumber;
+ ASN1_OCTET_STRING *subjectKeyIdentifier;
+ } d;
+ };
+
+struct CMS_EnvelopedData_st
+ {
+ long version;
+ CMS_OriginatorInfo *originatorInfo;
+ STACK_OF(CMS_RecipientInfo) *recipientInfos;
+ CMS_EncryptedContentInfo *encryptedContentInfo;
+ STACK_OF(X509_ATTRIBUTE) *unprotectedAttrs;
+ };
+
+struct CMS_OriginatorInfo_st
+ {
+ STACK_OF(CMS_CertificateChoices) *certificates;
+ STACK_OF(CMS_RevocationInfoChoice) *crls;
+ };
+
+struct CMS_EncryptedContentInfo_st
+ {
+ ASN1_OBJECT *contentType;
+ X509_ALGOR *contentEncryptionAlgorithm;
+ ASN1_OCTET_STRING *encryptedContent;
+ /* Content encryption algorithm and key */
+ const EVP_CIPHER *cipher;
+ unsigned char *key;
+ size_t keylen;
+ /* Set to 1 if we are debugging decrypt and don't fake keys for MMA */
+ int debug;
+ };
+
+struct CMS_RecipientInfo_st
+ {
+ int type;
+ union {
+ CMS_KeyTransRecipientInfo *ktri;
+ CMS_KeyAgreeRecipientInfo *kari;
+ CMS_KEKRecipientInfo *kekri;
+ CMS_PasswordRecipientInfo *pwri;
+ CMS_OtherRecipientInfo *ori;
+ } d;
+ };
+
+typedef CMS_SignerIdentifier CMS_RecipientIdentifier;
+
+struct CMS_KeyTransRecipientInfo_st
+ {
+ long version;
+ CMS_RecipientIdentifier *rid;
+ X509_ALGOR *keyEncryptionAlgorithm;
+ ASN1_OCTET_STRING *encryptedKey;
+ /* Recipient Key and cert */
+ X509 *recip;
+ EVP_PKEY *pkey;
+ };
+
+struct CMS_KeyAgreeRecipientInfo_st
+ {
+ long version;
+ CMS_OriginatorIdentifierOrKey *originator;
+ ASN1_OCTET_STRING *ukm;
+ X509_ALGOR *keyEncryptionAlgorithm;
+ STACK_OF(CMS_RecipientEncryptedKey) *recipientEncryptedKeys;
+ };
+
+struct CMS_OriginatorIdentifierOrKey_st
+ {
+ int type;
+ union {
+ CMS_IssuerAndSerialNumber *issuerAndSerialNumber;
+ ASN1_OCTET_STRING *subjectKeyIdentifier;
+ CMS_OriginatorPublicKey *originatorKey;
+ } d;
+ };
+
+struct CMS_OriginatorPublicKey_st
+ {
+ X509_ALGOR *algorithm;
+ ASN1_BIT_STRING *publicKey;
+ };
+
+struct CMS_RecipientEncryptedKey_st
+ {
+ CMS_KeyAgreeRecipientIdentifier *rid;
+ ASN1_OCTET_STRING *encryptedKey;
+ };
+
+struct CMS_KeyAgreeRecipientIdentifier_st
+ {
+ int type;
+ union {
+ CMS_IssuerAndSerialNumber *issuerAndSerialNumber;
+ CMS_RecipientKeyIdentifier *rKeyId;
+ } d;
+ };
+
+struct CMS_RecipientKeyIdentifier_st
+ {
+ ASN1_OCTET_STRING *subjectKeyIdentifier;
+ ASN1_GENERALIZEDTIME *date;
+ CMS_OtherKeyAttribute *other;
+ };
+
+struct CMS_KEKRecipientInfo_st
+ {
+ long version;
+ CMS_KEKIdentifier *kekid;
+ X509_ALGOR *keyEncryptionAlgorithm;
+ ASN1_OCTET_STRING *encryptedKey;
+ /* Extra info: symmetric key to use */
+ unsigned char *key;
+ size_t keylen;
+ };
+
+struct CMS_KEKIdentifier_st
+ {
+ ASN1_OCTET_STRING *keyIdentifier;
+ ASN1_GENERALIZEDTIME *date;
+ CMS_OtherKeyAttribute *other;
+ };
+
+struct CMS_PasswordRecipientInfo_st
+ {
+ long version;
+ X509_ALGOR *keyDerivationAlgorithm;
+ X509_ALGOR *keyEncryptionAlgorithm;
+ ASN1_OCTET_STRING *encryptedKey;
+ /* Extra info: password to use */
+ unsigned char *pass;
+ size_t passlen;
+ };
+
+struct CMS_OtherRecipientInfo_st
+ {
+ ASN1_OBJECT *oriType;
+ ASN1_TYPE *oriValue;
+ };
+
+struct CMS_DigestedData_st
+ {
+ long version;
+ X509_ALGOR *digestAlgorithm;
+ CMS_EncapsulatedContentInfo *encapContentInfo;
+ ASN1_OCTET_STRING *digest;
+ };
+
+struct CMS_EncryptedData_st
+ {
+ long version;
+ CMS_EncryptedContentInfo *encryptedContentInfo;
+ STACK_OF(X509_ATTRIBUTE) *unprotectedAttrs;
+ };
+
+struct CMS_AuthenticatedData_st
+ {
+ long version;
+ CMS_OriginatorInfo *originatorInfo;
+ STACK_OF(CMS_RecipientInfo) *recipientInfos;
+ X509_ALGOR *macAlgorithm;
+ X509_ALGOR *digestAlgorithm;
+ CMS_EncapsulatedContentInfo *encapContentInfo;
+ STACK_OF(X509_ATTRIBUTE) *authAttrs;
+ ASN1_OCTET_STRING *mac;
+ STACK_OF(X509_ATTRIBUTE) *unauthAttrs;
+ };
+
+struct CMS_CompressedData_st
+ {
+ long version;
+ X509_ALGOR *compressionAlgorithm;
+ STACK_OF(CMS_RecipientInfo) *recipientInfos;
+ CMS_EncapsulatedContentInfo *encapContentInfo;
+ };
+
+struct CMS_RevocationInfoChoice_st
+ {
+ int type;
+ union {
+ X509_CRL *crl;
+ CMS_OtherRevocationInfoFormat *other;
+ } d;
+ };
+
+#define CMS_REVCHOICE_CRL 0
+#define CMS_REVCHOICE_OTHER 1
+
+struct CMS_OtherRevocationInfoFormat_st
+ {
+ ASN1_OBJECT *otherRevInfoFormat;
+ ASN1_TYPE *otherRevInfo;
+ };
+
+struct CMS_CertificateChoices
+ {
+ int type;
+ union {
+ X509 *certificate;
+ ASN1_STRING *extendedCertificate; /* Obsolete */
+ ASN1_STRING *v1AttrCert; /* Left encoded for now */
+ ASN1_STRING *v2AttrCert; /* Left encoded for now */
+ CMS_OtherCertificateFormat *other;
+ } d;
+ };
+
+#define CMS_CERTCHOICE_CERT 0
+#define CMS_CERTCHOICE_EXCERT 1
+#define CMS_CERTCHOICE_V1ACERT 2
+#define CMS_CERTCHOICE_V2ACERT 3
+#define CMS_CERTCHOICE_OTHER 4
+
+struct CMS_OtherCertificateFormat_st
+ {
+ ASN1_OBJECT *otherCertFormat;
+ ASN1_TYPE *otherCert;
+ };
+
+/* This is also defined in pkcs7.h but we duplicate it
+ * to allow the CMS code to be independent of PKCS#7
+ */
+
+struct CMS_IssuerAndSerialNumber_st
+ {
+ X509_NAME *issuer;
+ ASN1_INTEGER *serialNumber;
+ };
+
+struct CMS_OtherKeyAttribute_st
+ {
+ ASN1_OBJECT *keyAttrId;
+ ASN1_TYPE *keyAttr;
+ };
+
+/* ESS structures */
+
+#ifdef HEADER_X509V3_H
+
+struct CMS_ReceiptRequest_st
+ {
+ ASN1_OCTET_STRING *signedContentIdentifier;
+ CMS_ReceiptsFrom *receiptsFrom;
+ STACK_OF(GENERAL_NAMES) *receiptsTo;
+ };
+
+
+struct CMS_ReceiptsFrom_st
+ {
+ int type;
+ union
+ {
+ long allOrFirstTier;
+ STACK_OF(GENERAL_NAMES) *receiptList;
+ } d;
+ };
+#endif
+
+struct CMS_Receipt_st
+ {
+ long version;
+ ASN1_OBJECT *contentType;
+ ASN1_OCTET_STRING *signedContentIdentifier;
+ ASN1_OCTET_STRING *originatorSignatureValue;
+ };
+
+DECLARE_ASN1_FUNCTIONS(CMS_ContentInfo)
+DECLARE_ASN1_ITEM(CMS_SignerInfo)
+DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber)
+DECLARE_ASN1_ITEM(CMS_Attributes_Sign)
+DECLARE_ASN1_ITEM(CMS_Attributes_Verify)
+DECLARE_ASN1_ITEM(CMS_RecipientInfo)
+DECLARE_ASN1_ITEM(CMS_PasswordRecipientInfo)
+DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber)
+
+#define CMS_SIGNERINFO_ISSUER_SERIAL 0
+#define CMS_SIGNERINFO_KEYIDENTIFIER 1
+
+#define CMS_RECIPINFO_ISSUER_SERIAL 0
+#define CMS_RECIPINFO_KEYIDENTIFIER 1
+
+BIO *cms_content_bio(CMS_ContentInfo *cms);
+
+CMS_ContentInfo *cms_Data_create(void);
+
+CMS_ContentInfo *cms_DigestedData_create(const EVP_MD *md);
+BIO *cms_DigestedData_init_bio(CMS_ContentInfo *cms);
+int cms_DigestedData_do_final(CMS_ContentInfo *cms, BIO *chain, int verify);
+
+BIO *cms_SignedData_init_bio(CMS_ContentInfo *cms);
+int cms_SignedData_final(CMS_ContentInfo *cms, BIO *chain);
+int cms_set1_SignerIdentifier(CMS_SignerIdentifier *sid, X509 *cert, int type);
+int cms_SignerIdentifier_get0_signer_id(CMS_SignerIdentifier *sid,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno);
+int cms_SignerIdentifier_cert_cmp(CMS_SignerIdentifier *sid, X509 *cert);
+
+CMS_ContentInfo *cms_CompressedData_create(int comp_nid);
+BIO *cms_CompressedData_init_bio(CMS_ContentInfo *cms);
+
+void cms_DigestAlgorithm_set(X509_ALGOR *alg, const EVP_MD *md);
+BIO *cms_DigestAlgorithm_init_bio(X509_ALGOR *digestAlgorithm);
+int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain,
+ X509_ALGOR *mdalg);
+
+BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec);
+BIO *cms_EncryptedData_init_bio(CMS_ContentInfo *cms);
+int cms_EncryptedContent_init(CMS_EncryptedContentInfo *ec,
+ const EVP_CIPHER *cipher,
+ const unsigned char *key, size_t keylen);
+
+int cms_Receipt_verify(CMS_ContentInfo *cms, CMS_ContentInfo *req_cms);
+int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src);
+ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si);
+
+BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms);
+CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms);
+
+/* PWRI routines */
+int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
+ int en_de);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
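
Every CHOICE type in this header follows the same shape: an explicit type
discriminator beside a union, so code dispatches on the discriminator before
touching a branch, as CMS_RecipientInfo_decrypt() does. A sketch of the
idiom (hypothetical accessor; it only compiles inside the library, since
these structs are opaque to applications):

#include <openssl/cms.h>
#include "cms_lcl.h"

static CMS_KEKRecipientInfo *ri_get0_kekri(CMS_RecipientInfo *ri)
	{
	if (ri->type != CMS_RECIPINFO_KEK)
		return NULL;	/* wrong branch: d.kekri would be garbage */
	return ri->d.kekri;
	}
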
diff --git a/main/openssl/crypto/cms/cms_lib.c b/main/openssl/crypto/cms/cms_lib.c
new file mode 100644
index 00000000..b62d1bfa
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_lib.c
@@ -0,0 +1,624 @@
+/* crypto/cms/cms_lib.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/err.h>
+#include <openssl/pem.h>
+#include <openssl/bio.h>
+#include <openssl/asn1.h>
+#include "cms.h"
+#include "cms_lcl.h"
+
+IMPLEMENT_ASN1_FUNCTIONS(CMS_ContentInfo)
+IMPLEMENT_ASN1_PRINT_FUNCTION(CMS_ContentInfo)
+
+DECLARE_ASN1_ITEM(CMS_CertificateChoices)
+DECLARE_ASN1_ITEM(CMS_RevocationInfoChoice)
+DECLARE_STACK_OF(CMS_CertificateChoices)
+DECLARE_STACK_OF(CMS_RevocationInfoChoice)
+
+const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms)
+ {
+ return cms->contentType;
+ }
+
+CMS_ContentInfo *cms_Data_create(void)
+ {
+ CMS_ContentInfo *cms;
+ cms = CMS_ContentInfo_new();
+ if (cms)
+ {
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_data);
+ /* Never detached */
+ CMS_set_detached(cms, 0);
+ }
+ return cms;
+ }
+
+BIO *cms_content_bio(CMS_ContentInfo *cms)
+ {
+ ASN1_OCTET_STRING **pos = CMS_get0_content(cms);
+ if (!pos)
+ return NULL;
+	/* If content is detached, data goes nowhere: create a null BIO */
+ if (!*pos)
+ return BIO_new(BIO_s_null());
+	/* If content is not detached and was created (not read in),
+	 * return a memory BIO */
+ if (!*pos || ((*pos)->flags == ASN1_STRING_FLAG_CONT))
+ return BIO_new(BIO_s_mem());
+ /* Else content was read in: return read only BIO for it */
+ return BIO_new_mem_buf((*pos)->data, (*pos)->length);
+ }
+
+BIO *CMS_dataInit(CMS_ContentInfo *cms, BIO *icont)
+ {
+ BIO *cmsbio, *cont;
+ if (icont)
+ cont = icont;
+ else
+ cont = cms_content_bio(cms);
+ if (!cont)
+ {
+ CMSerr(CMS_F_CMS_DATAINIT, CMS_R_NO_CONTENT);
+ return NULL;
+ }
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_data:
+ return cont;
+
+ case NID_pkcs7_signed:
+ cmsbio = cms_SignedData_init_bio(cms);
+ break;
+
+ case NID_pkcs7_digest:
+ cmsbio = cms_DigestedData_init_bio(cms);
+ break;
+#ifdef ZLIB
+ case NID_id_smime_ct_compressedData:
+ cmsbio = cms_CompressedData_init_bio(cms);
+ break;
+#endif
+
+ case NID_pkcs7_encrypted:
+ cmsbio = cms_EncryptedData_init_bio(cms);
+ break;
+
+ case NID_pkcs7_enveloped:
+ cmsbio = cms_EnvelopedData_init_bio(cms);
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_DATAINIT, CMS_R_UNSUPPORTED_TYPE);
+ return NULL;
+ }
+
+ if (cmsbio)
+ return BIO_push(cmsbio, cont);
+
+ if (!icont)
+ BIO_free(cont);
+ return NULL;
+
+ }
+
+int CMS_dataFinal(CMS_ContentInfo *cms, BIO *cmsbio)
+ {
+ ASN1_OCTET_STRING **pos = CMS_get0_content(cms);
+ if (!pos)
+ return 0;
+	/* If embedded content, find the memory BIO and set content */
+ if (*pos && ((*pos)->flags & ASN1_STRING_FLAG_CONT))
+ {
+ BIO *mbio;
+ unsigned char *cont;
+ long contlen;
+ mbio = BIO_find_type(cmsbio, BIO_TYPE_MEM);
+ if (!mbio)
+ {
+ CMSerr(CMS_F_CMS_DATAFINAL, CMS_R_CONTENT_NOT_FOUND);
+ return 0;
+ }
+ contlen = BIO_get_mem_data(mbio, &cont);
+ /* Set bio as read only so its content can't be clobbered */
+ BIO_set_flags(mbio, BIO_FLAGS_MEM_RDONLY);
+ BIO_set_mem_eof_return(mbio, 0);
+ ASN1_STRING_set0(*pos, cont, contlen);
+ (*pos)->flags &= ~ASN1_STRING_FLAG_CONT;
+ }
+
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_data:
+ case NID_pkcs7_enveloped:
+ case NID_pkcs7_encrypted:
+ case NID_id_smime_ct_compressedData:
+ /* Nothing to do */
+ return 1;
+
+ case NID_pkcs7_signed:
+ return cms_SignedData_final(cms, cmsbio);
+
+ case NID_pkcs7_digest:
+ return cms_DigestedData_do_final(cms, cmsbio, 0);
+
+ default:
+ CMSerr(CMS_F_CMS_DATAFINAL, CMS_R_UNSUPPORTED_TYPE);
+ return 0;
+ }
+ }
+
+/* Return an OCTET STRING pointer to content. This allows it to
+ * be accessed or set later.
+ */
+
+ASN1_OCTET_STRING **CMS_get0_content(CMS_ContentInfo *cms)
+ {
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_data:
+ return &cms->d.data;
+
+ case NID_pkcs7_signed:
+ return &cms->d.signedData->encapContentInfo->eContent;
+
+ case NID_pkcs7_enveloped:
+ return &cms->d.envelopedData->encryptedContentInfo->encryptedContent;
+
+ case NID_pkcs7_digest:
+ return &cms->d.digestedData->encapContentInfo->eContent;
+
+ case NID_pkcs7_encrypted:
+ return &cms->d.encryptedData->encryptedContentInfo->encryptedContent;
+
+ case NID_id_smime_ct_authData:
+ return &cms->d.authenticatedData->encapContentInfo->eContent;
+
+ case NID_id_smime_ct_compressedData:
+ return &cms->d.compressedData->encapContentInfo->eContent;
+
+ default:
+ if (cms->d.other->type == V_ASN1_OCTET_STRING)
+ return &cms->d.other->value.octet_string;
+ CMSerr(CMS_F_CMS_GET0_CONTENT, CMS_R_UNSUPPORTED_CONTENT_TYPE);
+ return NULL;
+
+ }
+ }
+
+/* Return an ASN1_OBJECT pointer to content type. This allows it to
+ * be accessed or set later.
+ */
+
+static ASN1_OBJECT **cms_get0_econtent_type(CMS_ContentInfo *cms)
+ {
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_signed:
+ return &cms->d.signedData->encapContentInfo->eContentType;
+
+ case NID_pkcs7_enveloped:
+ return &cms->d.envelopedData->encryptedContentInfo->contentType;
+
+ case NID_pkcs7_digest:
+ return &cms->d.digestedData->encapContentInfo->eContentType;
+
+ case NID_pkcs7_encrypted:
+ return &cms->d.encryptedData->encryptedContentInfo->contentType;
+
+ case NID_id_smime_ct_authData:
+ return &cms->d.authenticatedData->encapContentInfo->eContentType;
+
+ case NID_id_smime_ct_compressedData:
+ return &cms->d.compressedData->encapContentInfo->eContentType;
+
+ default:
+ CMSerr(CMS_F_CMS_GET0_ECONTENT_TYPE,
+ CMS_R_UNSUPPORTED_CONTENT_TYPE);
+ return NULL;
+
+ }
+ }
+
+const ASN1_OBJECT *CMS_get0_eContentType(CMS_ContentInfo *cms)
+ {
+ ASN1_OBJECT **petype;
+ petype = cms_get0_econtent_type(cms);
+ if (petype)
+ return *petype;
+ return NULL;
+ }
+
+int CMS_set1_eContentType(CMS_ContentInfo *cms, const ASN1_OBJECT *oid)
+ {
+ ASN1_OBJECT **petype, *etype;
+ petype = cms_get0_econtent_type(cms);
+ if (!petype)
+ return 0;
+ if (!oid)
+ return 1;
+ etype = OBJ_dup(oid);
+ if (!etype)
+ return 0;
+ ASN1_OBJECT_free(*petype);
+ *petype = etype;
+ return 1;
+ }
+
+int CMS_is_detached(CMS_ContentInfo *cms)
+ {
+ ASN1_OCTET_STRING **pos;
+ pos = CMS_get0_content(cms);
+ if (!pos)
+ return -1;
+ if (*pos)
+ return 0;
+ return 1;
+ }
+
+int CMS_set_detached(CMS_ContentInfo *cms, int detached)
+ {
+ ASN1_OCTET_STRING **pos;
+ pos = CMS_get0_content(cms);
+ if (!pos)
+ return 0;
+ if (detached)
+ {
+ if (*pos)
+ {
+ ASN1_OCTET_STRING_free(*pos);
+ *pos = NULL;
+ }
+ return 1;
+ }
+ if (!*pos)
+ *pos = ASN1_OCTET_STRING_new();
+ if (*pos)
+ {
+ /* NB: special flag to show content is created and not
+ * read in.
+ */
+ (*pos)->flags |= ASN1_STRING_FLAG_CONT;
+ return 1;
+ }
+ CMSerr(CMS_F_CMS_SET_DETACHED, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void cms_DigestAlgorithm_set(X509_ALGOR *alg, const EVP_MD *md)
+ {
+ int param_type;
+
+ if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+ param_type = V_ASN1_UNDEF;
+ else
+ param_type = V_ASN1_NULL;
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+ }
+
+/* Create a digest BIO from an X509_ALGOR structure */
+
+BIO *cms_DigestAlgorithm_init_bio(X509_ALGOR *digestAlgorithm)
+ {
+ BIO *mdbio = NULL;
+ ASN1_OBJECT *digestoid;
+ const EVP_MD *digest;
+ X509_ALGOR_get0(&digestoid, NULL, NULL, digestAlgorithm);
+ digest = EVP_get_digestbyobj(digestoid);
+ if (!digest)
+ {
+ CMSerr(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO,
+ CMS_R_UNKNOWN_DIGEST_ALGORIHM);
+ goto err;
+ }
+ mdbio = BIO_new(BIO_f_md());
+ if (!mdbio || !BIO_set_md(mdbio, digest))
+ {
+ CMSerr(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO,
+ CMS_R_MD_BIO_INIT_ERROR);
+ goto err;
+ }
+ return mdbio;
+ err:
+ if (mdbio)
+ BIO_free(mdbio);
+ return NULL;
+ }
+
+/* Locate a message digest context in a BIO chain matching the given
+ * digest algorithm */
+
+int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain,
+ X509_ALGOR *mdalg)
+ {
+ int nid;
+ ASN1_OBJECT *mdoid;
+ X509_ALGOR_get0(&mdoid, NULL, NULL, mdalg);
+ nid = OBJ_obj2nid(mdoid);
+ /* Look for digest type to match signature */
+ for (;;)
+ {
+ EVP_MD_CTX *mtmp;
+ chain = BIO_find_type(chain, BIO_TYPE_MD);
+ if (chain == NULL)
+ {
+ CMSerr(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX,
+ CMS_R_NO_MATCHING_DIGEST);
+ return 0;
+ }
+ BIO_get_md_ctx(chain, &mtmp);
+ if (EVP_MD_CTX_type(mtmp) == nid
+ /* Workaround for broken implementations that use signature
+ * algorithm OID instead of digest.
+ */
+ || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid)
+ return EVP_MD_CTX_copy_ex(mctx, mtmp);
+ chain = BIO_next(chain);
+ }
+ }
+
+static STACK_OF(CMS_CertificateChoices) **cms_get0_certificate_choices(CMS_ContentInfo *cms)
+ {
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_signed:
+ return &cms->d.signedData->certificates;
+
+ case NID_pkcs7_enveloped:
+ return &cms->d.envelopedData->originatorInfo->certificates;
+
+ default:
+ CMSerr(CMS_F_CMS_GET0_CERTIFICATE_CHOICES,
+ CMS_R_UNSUPPORTED_CONTENT_TYPE);
+ return NULL;
+
+ }
+ }
+
+CMS_CertificateChoices *CMS_add0_CertificateChoices(CMS_ContentInfo *cms)
+ {
+ STACK_OF(CMS_CertificateChoices) **pcerts;
+ CMS_CertificateChoices *cch;
+ pcerts = cms_get0_certificate_choices(cms);
+ if (!pcerts)
+ return NULL;
+ if (!*pcerts)
+ *pcerts = sk_CMS_CertificateChoices_new_null();
+ if (!*pcerts)
+ return NULL;
+ cch = M_ASN1_new_of(CMS_CertificateChoices);
+ if (!cch)
+ return NULL;
+ if (!sk_CMS_CertificateChoices_push(*pcerts, cch))
+ {
+ M_ASN1_free_of(cch, CMS_CertificateChoices);
+ return NULL;
+ }
+ return cch;
+ }
+
+int CMS_add0_cert(CMS_ContentInfo *cms, X509 *cert)
+ {
+ CMS_CertificateChoices *cch;
+ STACK_OF(CMS_CertificateChoices) **pcerts;
+ int i;
+ pcerts = cms_get0_certificate_choices(cms);
+ if (!pcerts)
+ return 0;
+ for (i = 0; i < sk_CMS_CertificateChoices_num(*pcerts); i++)
+ {
+ cch = sk_CMS_CertificateChoices_value(*pcerts, i);
+ if (cch->type == CMS_CERTCHOICE_CERT)
+ {
+ if (!X509_cmp(cch->d.certificate, cert))
+ {
+ CMSerr(CMS_F_CMS_ADD0_CERT,
+ CMS_R_CERTIFICATE_ALREADY_PRESENT);
+ return 0;
+ }
+ }
+ }
+ cch = CMS_add0_CertificateChoices(cms);
+ if (!cch)
+ return 0;
+ cch->type = CMS_CERTCHOICE_CERT;
+ cch->d.certificate = cert;
+ return 1;
+ }
+
+int CMS_add1_cert(CMS_ContentInfo *cms, X509 *cert)
+ {
+ int r;
+ r = CMS_add0_cert(cms, cert);
+ if (r > 0)
+ CRYPTO_add(&cert->references, 1, CRYPTO_LOCK_X509);
+ return r;
+ }
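+
+/* Sketch for illustration of the add0/add1 ownership convention: add0
+ * transfers ownership of "cert" to the CMS structure, while add1 bumps
+ * the reference count so the caller keeps its own reference. Here the
+ * caller is assumed to be finished with cert after the call.
+ */
+static int example_keep_own_ref(CMS_ContentInfo *cms, X509 *cert)
+ {
+ if (!CMS_add1_cert(cms, cert))
+ return 0;
+ /* The caller still owns its reference and releases it here */
+ X509_free(cert);
+ return 1;
+ }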
+
+static STACK_OF(CMS_RevocationInfoChoice) **cms_get0_revocation_choices(CMS_ContentInfo *cms)
+ {
+ switch (OBJ_obj2nid(cms->contentType))
+ {
+
+ case NID_pkcs7_signed:
+ return &cms->d.signedData->crls;
+
+ case NID_pkcs7_enveloped:
+ return &cms->d.envelopedData->originatorInfo->crls;
+
+ default:
+ CMSerr(CMS_F_CMS_GET0_REVOCATION_CHOICES,
+ CMS_R_UNSUPPORTED_CONTENT_TYPE);
+ return NULL;
+
+ }
+ }
+
+CMS_RevocationInfoChoice *CMS_add0_RevocationInfoChoice(CMS_ContentInfo *cms)
+ {
+ STACK_OF(CMS_RevocationInfoChoice) **pcrls;
+ CMS_RevocationInfoChoice *rch;
+ pcrls = cms_get0_revocation_choices(cms);
+ if (!pcrls)
+ return NULL;
+ if (!*pcrls)
+ *pcrls = sk_CMS_RevocationInfoChoice_new_null();
+ if (!*pcrls)
+ return NULL;
+ rch = M_ASN1_new_of(CMS_RevocationInfoChoice);
+ if (!rch)
+ return NULL;
+ if (!sk_CMS_RevocationInfoChoice_push(*pcrls, rch))
+ {
+ M_ASN1_free_of(rch, CMS_RevocationInfoChoice);
+ return NULL;
+ }
+ return rch;
+ }
+
+int CMS_add0_crl(CMS_ContentInfo *cms, X509_CRL *crl)
+ {
+ CMS_RevocationInfoChoice *rch;
+ rch = CMS_add0_RevocationInfoChoice(cms);
+ if (!rch)
+ return 0;
+ rch->type = CMS_REVCHOICE_CRL;
+ rch->d.crl = crl;
+ return 1;
+ }
+
+int CMS_add1_crl(CMS_ContentInfo *cms, X509_CRL *crl)
+ {
+ int r;
+ r = CMS_add0_crl(cms, crl);
+ if (r > 0)
+ CRYPTO_add(&crl->references, 1, CRYPTO_LOCK_X509_CRL);
+ return r;
+ }
+
+STACK_OF(X509) *CMS_get1_certs(CMS_ContentInfo *cms)
+ {
+ STACK_OF(X509) *certs = NULL;
+ CMS_CertificateChoices *cch;
+ STACK_OF(CMS_CertificateChoices) **pcerts;
+ int i;
+ pcerts = cms_get0_certificate_choices(cms);
+ if (!pcerts)
+ return NULL;
+ for (i = 0; i < sk_CMS_CertificateChoices_num(*pcerts); i++)
+ {
+ cch = sk_CMS_CertificateChoices_value(*pcerts, i);
+ if (cch->type == 0)
+ {
+ if (!certs)
+ {
+ certs = sk_X509_new_null();
+ if (!certs)
+ return NULL;
+ }
+ if (!sk_X509_push(certs, cch->d.certificate))
+ {
+ sk_X509_pop_free(certs, X509_free);
+ return NULL;
+ }
+ CRYPTO_add(&cch->d.certificate->references,
+ 1, CRYPTO_LOCK_X509);
+ }
+ }
+ return certs;
+
+ }
+
+STACK_OF(X509_CRL) *CMS_get1_crls(CMS_ContentInfo *cms)
+ {
+ STACK_OF(X509_CRL) *crls = NULL;
+ STACK_OF(CMS_RevocationInfoChoice) **pcrls;
+ CMS_RevocationInfoChoice *rch;
+ int i;
+ pcrls = cms_get0_revocation_choices(cms);
+ if (!pcrls)
+ return NULL;
+ for (i = 0; i < sk_CMS_RevocationInfoChoice_num(*pcrls); i++)
+ {
+ rch = sk_CMS_RevocationInfoChoice_value(*pcrls, i);
+ if (rch->type == 0)
+ {
+ if (!crls)
+ {
+ crls = sk_X509_CRL_new_null();
+ if (!crls)
+ return NULL;
+ }
+ if (!sk_X509_CRL_push(crls, rch->d.crl))
+ {
+ sk_X509_CRL_pop_free(crls, X509_CRL_free);
+ return NULL;
+ }
+ CRYPTO_add(&rch->d.crl->references,
+ 1, CRYPTO_LOCK_X509_CRL);
+ }
+ }
+ return crls;
+ }
diff --git a/main/openssl/crypto/cms/cms_pwri.c b/main/openssl/crypto/cms/cms_pwri.c
new file mode 100644
index 00000000..b79612a1
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_pwri.c
@@ -0,0 +1,454 @@
+/* crypto/cms/cms_pwri.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2009 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/rand.h>
+#include <openssl/aes.h>
+#include "cms_lcl.h"
+#include "asn1_locl.h"
+
+int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri,
+ unsigned char *pass, ossl_ssize_t passlen)
+ {
+ CMS_PasswordRecipientInfo *pwri;
+ if (ri->type != CMS_RECIPINFO_PASS)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD, CMS_R_NOT_PWRI);
+ return 0;
+ }
+
+ pwri = ri->d.pwri;
+ pwri->pass = pass;
+ if (pass && passlen < 0)
+ passlen = strlen((char *)pass);
+ pwri->passlen = passlen;
+ return 1;
+ }
+
+CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms,
+ int iter, int wrap_nid, int pbe_nid,
+ unsigned char *pass,
+ ossl_ssize_t passlen,
+ const EVP_CIPHER *kekciph)
+ {
+ CMS_RecipientInfo *ri = NULL;
+ CMS_EnvelopedData *env;
+ CMS_PasswordRecipientInfo *pwri;
+ EVP_CIPHER_CTX ctx;
+ X509_ALGOR *encalg = NULL;
+ unsigned char iv[EVP_MAX_IV_LENGTH];
+ int ivlen;
+ env = cms_get0_enveloped(cms);
+ if (!env)
+ goto err;
+
+ if (wrap_nid <= 0)
+ wrap_nid = NID_id_alg_PWRI_KEK;
+
+ if (pbe_nid <= 0)
+ pbe_nid = NID_id_pbkdf2;
+
+ /* Get from enveloped data */
+ if (kekciph == NULL)
+ kekciph = env->encryptedContentInfo->cipher;
+
+ if (kekciph == NULL)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, CMS_R_NO_CIPHER);
+ return NULL;
+ }
+ if (wrap_nid != NID_id_alg_PWRI_KEK)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+ CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM);
+ return NULL;
+ }
+
+ /* Setup algorithm identifier for cipher */
+ encalg = X509_ALGOR_new();
+ EVP_CIPHER_CTX_init(&ctx);
+
+ if (EVP_EncryptInit_ex(&ctx, kekciph, NULL, NULL, NULL) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_EVP_LIB);
+ goto err;
+ }
+
+ ivlen = EVP_CIPHER_CTX_iv_length(&ctx);
+
+ if (ivlen > 0)
+ {
+ if (RAND_pseudo_bytes(iv, ivlen) <= 0)
+ goto err;
+ if (EVP_EncryptInit_ex(&ctx, NULL, NULL, NULL, iv) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+ ERR_R_EVP_LIB);
+ goto err;
+ }
+ encalg->parameter = ASN1_TYPE_new();
+ if (!encalg->parameter)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (EVP_CIPHER_param_to_asn1(&ctx, encalg->parameter) <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+ CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+ goto err;
+ }
+ }
+
+
+ encalg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(&ctx));
+
+ EVP_CIPHER_CTX_cleanup(&ctx);
+
+ /* Initialize recipient info */
+ ri = M_ASN1_new_of(CMS_RecipientInfo);
+ if (!ri)
+ goto merr;
+
+ ri->d.pwri = M_ASN1_new_of(CMS_PasswordRecipientInfo);
+ if (!ri->d.pwri)
+ goto merr;
+ ri->type = CMS_RECIPINFO_PASS;
+
+ pwri = ri->d.pwri;
+ /* Since this is overwritten, free up empty structure already there */
+ X509_ALGOR_free(pwri->keyEncryptionAlgorithm);
+ pwri->keyEncryptionAlgorithm = X509_ALGOR_new();
+ if (!pwri->keyEncryptionAlgorithm)
+ goto merr;
+ pwri->keyEncryptionAlgorithm->algorithm = OBJ_nid2obj(wrap_nid);
+ pwri->keyEncryptionAlgorithm->parameter = ASN1_TYPE_new();
+ if (!pwri->keyEncryptionAlgorithm->parameter)
+ goto merr;
+
+ if(!ASN1_item_pack(encalg, ASN1_ITEM_rptr(X509_ALGOR),
+ &pwri->keyEncryptionAlgorithm->parameter->value.sequence))
+ goto merr;
+ pwri->keyEncryptionAlgorithm->parameter->type = V_ASN1_SEQUENCE;
+
+ X509_ALGOR_free(encalg);
+ encalg = NULL;
+
+ /* Setup PBE algorithm */
+
+ pwri->keyDerivationAlgorithm = PKCS5_pbkdf2_set(iter, NULL, 0, -1, -1);
+
+ if (!pwri->keyDerivationAlgorithm)
+ goto err;
+
+ CMS_RecipientInfo_set0_password(ri, pass, passlen);
+ pwri->version = 0;
+
+ if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri))
+ goto merr;
+
+ return ri;
+
+ merr:
+ CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_MALLOC_FAILURE);
+ err:
+ EVP_CIPHER_CTX_cleanup(&ctx);
+ if (ri)
+ M_ASN1_free_of(ri, CMS_RecipientInfo);
+ if (encalg)
+ X509_ALGOR_free(encalg);
+ return NULL;
+
+ }
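+
+/* Sketch for illustration: attaching a password recipient to an
+ * enveloped-data structure created with CMS_PARTIAL. The -1 arguments
+ * select the PWRI-KEK and PBKDF2 defaults; the iteration count of 10000
+ * is an arbitrary illustrative choice. Note the set0 semantics: the
+ * password buffer is handed over to the structure.
+ */
+static CMS_RecipientInfo *example_add_pwri(CMS_ContentInfo *cms,
+ unsigned char *pass)
+ {
+ return CMS_add0_recipient_password(cms, 10000, -1, -1,
+ pass, -1, NULL);
+ }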
+
+/* This is an implementation of the key wrapping mechanism in RFC 3211;
+ * at some point this should go into EVP.
+ */
+
+static int kek_unwrap_key(unsigned char *out, size_t *outlen,
+ const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx)
+ {
+ size_t blocklen = EVP_CIPHER_CTX_block_size(ctx);
+ unsigned char *tmp;
+ int outl, rv = 0;
+ if (inlen < 2 * blocklen)
+ {
+ /* too small */
+ return 0;
+ }
+ if (inlen % blocklen)
+ {
+ /* Invalid size */
+ return 0;
+ }
+ tmp = OPENSSL_malloc(inlen);
+ if (!tmp)
+ return 0;
+ /* setup IV by decrypting last two blocks */
+ EVP_DecryptUpdate(ctx, tmp + inlen - 2 * blocklen, &outl,
+ in + inlen - 2 * blocklen, blocklen * 2);
+ /* Do a decrypt of the last decrypted block to set the IV to the
+ * correct value; output it to the start of the buffer so we don't
+ * corrupt the decrypted block. This works because the buffer is at
+ * least two block lengths long.
+ */
+ EVP_DecryptUpdate(ctx, tmp, &outl,
+ tmp + inlen - blocklen, blocklen);
+ /* Can now decrypt first n - 1 blocks */
+ EVP_DecryptUpdate(ctx, tmp, &outl, in, inlen - blocklen);
+
+ /* Reset IV to original value */
+ EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, NULL);
+ /* Decrypt again */
+ EVP_DecryptUpdate(ctx, tmp, &outl, tmp, inlen);
+ /* Verify the check bytes */
+ if (((tmp[1] ^ tmp[4]) & (tmp[2] ^ tmp[5]) & (tmp[3] ^ tmp[6])) != 0xff)
+ {
+ /* Check byte failure */
+ goto err;
+ }
+ if (inlen < (size_t)(tmp[0] - 4))
+ {
+ /* Invalid length value */
+ goto err;
+ }
+ *outlen = (size_t)tmp[0];
+ memcpy(out, tmp + 4, *outlen);
+ rv = 1;
+ err:
+ OPENSSL_cleanse(tmp, inlen);
+ OPENSSL_free(tmp);
+ return rv;
+
+ }
+
+static int kek_wrap_key(unsigned char *out, size_t *outlen,
+ const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx)
+ {
+ size_t blocklen = EVP_CIPHER_CTX_block_size(ctx);
+ size_t olen;
+ int dummy;
+ /* First decide length of output buffer: need header and round up to
+ * multiple of block length.
+ */
+ olen = (inlen + 4 + blocklen - 1)/blocklen;
+ olen *= blocklen;
+ if (olen < 2 * blocklen)
+ {
+ /* Key too small */
+ return 0;
+ }
+ if (inlen > 0xFF)
+ {
+ /* Key too large */
+ return 0;
+ }
+ if (out)
+ {
+ /* Set header */
+ out[0] = (unsigned char)inlen;
+ out[1] = in[0] ^ 0xFF;
+ out[2] = in[1] ^ 0xFF;
+ out[3] = in[2] ^ 0xFF;
+ memcpy(out + 4, in, inlen);
+ /* Add random padding to end */
+ if (olen > inlen + 4)
+ RAND_pseudo_bytes(out + 4 + inlen, olen - 4 - inlen);
+ /* Encrypt twice */
+ EVP_EncryptUpdate(ctx, out, &dummy, out, olen);
+ EVP_EncryptUpdate(ctx, out, &dummy, out, olen);
+ }
+
+ *outlen = olen;
+
+ return 1;
+ }
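+
+/* For illustration, the formatted block built above for a 16 byte key
+ * and a 16 byte block cipher looks like:
+ *
+ * byte 0 key length (0x10)
+ * bytes 1-3 check bytes: complement of key[0], key[1], key[2]
+ * bytes 4-19 the key itself
+ * bytes 20-31 random padding up to the next block boundary
+ *
+ * The buffer is then CBC encrypted twice so that every output block
+ * depends on every input block, as required by RFC 3211.
+ */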
+
+/* Encrypt/Decrypt content key in PWRI recipient info */
+
+int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
+ int en_de)
+ {
+ CMS_EncryptedContentInfo *ec;
+ CMS_PasswordRecipientInfo *pwri;
+ const unsigned char *p = NULL;
+ int plen;
+ int r = 0;
+ X509_ALGOR *algtmp, *kekalg = NULL;
+ EVP_CIPHER_CTX kekctx;
+ const EVP_CIPHER *kekcipher;
+ unsigned char *key = NULL;
+ size_t keylen;
+
+ ec = cms->d.envelopedData->encryptedContentInfo;
+
+ pwri = ri->d.pwri;
+ EVP_CIPHER_CTX_init(&kekctx);
+
+ if (!pwri->pass)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, CMS_R_NO_PASSWORD);
+ return 0;
+ }
+ algtmp = pwri->keyEncryptionAlgorithm;
+
+ if (!algtmp || OBJ_obj2nid(algtmp->algorithm) != NID_id_alg_PWRI_KEK)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM);
+ return 0;
+ }
+
+ if (algtmp->parameter->type == V_ASN1_SEQUENCE)
+ {
+ p = algtmp->parameter->value.sequence->data;
+ plen = algtmp->parameter->value.sequence->length;
+ kekalg = d2i_X509_ALGOR(NULL, &p, plen);
+ }
+ if (kekalg == NULL)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER);
+ return 0;
+ }
+
+ kekcipher = EVP_get_cipherbyobj(kekalg->algorithm);
+
+ if(!kekcipher)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_UNKNOWN_CIPHER);
+ goto err;
+ }
+
+ /* Fixup cipher based on AlgorithmIdentifier to set IV etc */
+ if (!EVP_CipherInit_ex(&kekctx, kekcipher, NULL, NULL, NULL, en_de))
+ goto err;
+ EVP_CIPHER_CTX_set_padding(&kekctx, 0);
+ if(EVP_CIPHER_asn1_to_param(&kekctx, kekalg->parameter) < 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+ goto err;
+ }
+
+ algtmp = pwri->keyDerivationAlgorithm;
+
+ /* Finish password based key derivation to setup key in "ctx" */
+
+ if (EVP_PBE_CipherInit(algtmp->algorithm,
+ (char *)pwri->pass, pwri->passlen,
+ algtmp->parameter, &kekctx, en_de) < 0)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, ERR_R_EVP_LIB);
+ goto err;
+ }
+
+ /* Finally wrap/unwrap the key */
+
+ if (en_de)
+ {
+
+ if (!kek_wrap_key(NULL, &keylen, ec->key, ec->keylen, &kekctx))
+ goto err;
+
+ key = OPENSSL_malloc(keylen);
+
+ if (!key)
+ goto err;
+
+ if (!kek_wrap_key(key, &keylen, ec->key, ec->keylen, &kekctx))
+ goto err;
+ pwri->encryptedKey->data = key;
+ pwri->encryptedKey->length = keylen;
+ }
+ else
+ {
+ key = OPENSSL_malloc(pwri->encryptedKey->length);
+
+ if (!key)
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (!kek_unwrap_key(key, &keylen,
+ pwri->encryptedKey->data,
+ pwri->encryptedKey->length, &kekctx))
+ {
+ CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+ CMS_R_UNWRAP_FAILURE);
+ goto err;
+ }
+
+ ec->key = key;
+ ec->keylen = keylen;
+
+ }
+
+ r = 1;
+
+ err:
+
+ EVP_CIPHER_CTX_cleanup(&kekctx);
+
+ if (!r && key)
+ OPENSSL_free(key);
+ X509_ALGOR_free(kekalg);
+
+ return r;
+
+ }
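+
+/* Sketch for illustration: the public entry point that drives the code
+ * above is CMS_decrypt_set1_password() (see cms_smime.c). A typical
+ * decrypt of a password-protected S/MIME message:
+ */
+static int example_pwri_decrypt(BIO *in, BIO *out, unsigned char *pass)
+ {
+ int r = 0;
+ CMS_ContentInfo *cms = SMIME_read_CMS(in, NULL);
+ if (!cms)
+ return 0;
+ if (CMS_decrypt_set1_password(cms, pass, -1)
+ && CMS_decrypt(cms, NULL, NULL, NULL, out, 0))
+ r = 1;
+ CMS_ContentInfo_free(cms);
+ return r;
+ }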
diff --git a/main/openssl/crypto/cms/cms_sd.c b/main/openssl/crypto/cms/cms_sd.c
new file mode 100644
index 00000000..77fbd135
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_sd.c
@@ -0,0 +1,985 @@
+/* crypto/cms/cms_sd.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include "cms_lcl.h"
+#include "asn1_locl.h"
+
+/* CMS SignedData Utilities */
+
+DECLARE_ASN1_ITEM(CMS_SignedData)
+
+static CMS_SignedData *cms_get0_signed(CMS_ContentInfo *cms)
+ {
+ if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_signed)
+ {
+ CMSerr(CMS_F_CMS_GET0_SIGNED, CMS_R_CONTENT_TYPE_NOT_SIGNED_DATA);
+ return NULL;
+ }
+ return cms->d.signedData;
+ }
+
+static CMS_SignedData *cms_signed_data_init(CMS_ContentInfo *cms)
+ {
+ if (cms->d.other == NULL)
+ {
+ cms->d.signedData = M_ASN1_new_of(CMS_SignedData);
+ if (!cms->d.signedData)
+ {
+ CMSerr(CMS_F_CMS_SIGNED_DATA_INIT, ERR_R_MALLOC_FAILURE);
+ return NULL;
+ }
+ cms->d.signedData->version = 1;
+ cms->d.signedData->encapContentInfo->eContentType =
+ OBJ_nid2obj(NID_pkcs7_data);
+ cms->d.signedData->encapContentInfo->partial = 1;
+ ASN1_OBJECT_free(cms->contentType);
+ cms->contentType = OBJ_nid2obj(NID_pkcs7_signed);
+ return cms->d.signedData;
+ }
+ return cms_get0_signed(cms);
+ }
+
+/* Just initialize SignedData e.g. for certs only structure */
+
+int CMS_SignedData_init(CMS_ContentInfo *cms)
+ {
+ if (cms_signed_data_init(cms))
+ return 1;
+ else
+ return 0;
+ }
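+
+/* Sketch for illustration: a certs-only message uses exactly this entry
+ * point, an empty SignedData carrying certificates but no signers.
+ * Finalisation and output (e.g. SMIME_write_CMS) are omitted here.
+ */
+static CMS_ContentInfo *example_certs_only(STACK_OF(X509) *certs)
+ {
+ int i;
+ CMS_ContentInfo *cms = CMS_ContentInfo_new();
+ if (!cms || !CMS_SignedData_init(cms))
+ goto err;
+ for (i = 0; i < sk_X509_num(certs); i++)
+ {
+ if (!CMS_add1_cert(cms, sk_X509_value(certs, i)))
+ goto err;
+ }
+ return cms;
+ err:
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }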
+
+/* Check structures and fixup version numbers (if necessary) */
+
+static void cms_sd_set_version(CMS_SignedData *sd)
+ {
+ int i;
+ CMS_CertificateChoices *cch;
+ CMS_RevocationInfoChoice *rch;
+ CMS_SignerInfo *si;
+
+ for (i = 0; i < sk_CMS_CertificateChoices_num(sd->certificates); i++)
+ {
+ cch = sk_CMS_CertificateChoices_value(sd->certificates, i);
+ if (cch->type == CMS_CERTCHOICE_OTHER)
+ {
+ if (sd->version < 5)
+ sd->version = 5;
+ }
+ else if (cch->type == CMS_CERTCHOICE_V2ACERT)
+ {
+ if (sd->version < 4)
+ sd->version = 4;
+ }
+ else if (cch->type == CMS_CERTCHOICE_V1ACERT)
+ {
+ if (sd->version < 3)
+ sd->version = 3;
+ }
+ }
+
+ for (i = 0; i < sk_CMS_RevocationInfoChoice_num(sd->crls); i++)
+ {
+ rch = sk_CMS_RevocationInfoChoice_value(sd->crls, i);
+ if (rch->type == CMS_REVCHOICE_OTHER)
+ {
+ if (sd->version < 5)
+ sd->version = 5;
+ }
+ }
+
+ if ((OBJ_obj2nid(sd->encapContentInfo->eContentType) != NID_pkcs7_data)
+ && (sd->version < 3))
+ sd->version = 3;
+
+ for (i = 0; i < sk_CMS_SignerInfo_num(sd->signerInfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sd->signerInfos, i);
+ if (si->sid->type == CMS_SIGNERINFO_KEYIDENTIFIER)
+ {
+ if (si->version < 3)
+ si->version = 3;
+ if (sd->version < 3)
+ sd->version = 3;
+ }
+ else
+ sd->version = 1;
+ }
+
+ if (sd->version < 1)
+ sd->version = 1;
+
+ }
+
+/* Copy an existing messageDigest value */
+
+static int cms_copy_messageDigest(CMS_ContentInfo *cms, CMS_SignerInfo *si)
+ {
+ STACK_OF(CMS_SignerInfo) *sinfos;
+ CMS_SignerInfo *sitmp;
+ int i;
+ sinfos = CMS_get0_SignerInfos(cms);
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ ASN1_OCTET_STRING *messageDigest;
+ sitmp = sk_CMS_SignerInfo_value(sinfos, i);
+ if (sitmp == si)
+ continue;
+ if (CMS_signed_get_attr_count(sitmp) < 0)
+ continue;
+ if (OBJ_cmp(si->digestAlgorithm->algorithm,
+ sitmp->digestAlgorithm->algorithm))
+ continue;
+ messageDigest = CMS_signed_get0_data_by_OBJ(sitmp,
+ OBJ_nid2obj(NID_pkcs9_messageDigest),
+ -3, V_ASN1_OCTET_STRING);
+ if (!messageDigest)
+ {
+ CMSerr(CMS_F_CMS_COPY_MESSAGEDIGEST,
+ CMS_R_ERROR_READING_MESSAGEDIGEST_ATTRIBUTE);
+ return 0;
+ }
+
+ if (CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest,
+ V_ASN1_OCTET_STRING,
+ messageDigest, -1))
+ return 1;
+ else
+ return 0;
+ }
+ CMSerr(CMS_F_CMS_COPY_MESSAGEDIGEST, CMS_R_NO_MATCHING_DIGEST);
+ return 0;
+ }
+
+int cms_set1_SignerIdentifier(CMS_SignerIdentifier *sid, X509 *cert, int type)
+ {
+ switch(type)
+ {
+ case CMS_SIGNERINFO_ISSUER_SERIAL:
+ sid->d.issuerAndSerialNumber =
+ M_ASN1_new_of(CMS_IssuerAndSerialNumber);
+ if (!sid->d.issuerAndSerialNumber)
+ goto merr;
+ if (!X509_NAME_set(&sid->d.issuerAndSerialNumber->issuer,
+ X509_get_issuer_name(cert)))
+ goto merr;
+ if (!ASN1_STRING_copy(
+ sid->d.issuerAndSerialNumber->serialNumber,
+ X509_get_serialNumber(cert)))
+ goto merr;
+ break;
+
+ case CMS_SIGNERINFO_KEYIDENTIFIER:
+ if (!cert->skid)
+ {
+ CMSerr(CMS_F_CMS_SET1_SIGNERIDENTIFIER,
+ CMS_R_CERTIFICATE_HAS_NO_KEYID);
+ return 0;
+ }
+ sid->d.subjectKeyIdentifier = ASN1_STRING_dup(cert->skid);
+ if (!sid->d.subjectKeyIdentifier)
+ goto merr;
+ break;
+
+ default:
+ CMSerr(CMS_F_CMS_SET1_SIGNERIDENTIFIER, CMS_R_UNKNOWN_ID);
+ return 0;
+ }
+
+ sid->type = type;
+
+ return 1;
+
+ merr:
+ CMSerr(CMS_F_CMS_SET1_SIGNERIDENTIFIER, ERR_R_MALLOC_FAILURE);
+ return 0;
+
+ }
+
+int cms_SignerIdentifier_get0_signer_id(CMS_SignerIdentifier *sid,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno)
+ {
+ if (sid->type == CMS_SIGNERINFO_ISSUER_SERIAL)
+ {
+ if (issuer)
+ *issuer = sid->d.issuerAndSerialNumber->issuer;
+ if (sno)
+ *sno = sid->d.issuerAndSerialNumber->serialNumber;
+ }
+ else if (sid->type == CMS_SIGNERINFO_KEYIDENTIFIER)
+ {
+ if (keyid)
+ *keyid = sid->d.subjectKeyIdentifier;
+ }
+ else
+ return 0;
+ return 1;
+ }
+
+int cms_SignerIdentifier_cert_cmp(CMS_SignerIdentifier *sid, X509 *cert)
+ {
+ int ret;
+ if (sid->type == CMS_SIGNERINFO_ISSUER_SERIAL)
+ {
+ ret = X509_NAME_cmp(sid->d.issuerAndSerialNumber->issuer,
+ X509_get_issuer_name(cert));
+ if (ret)
+ return ret;
+ return ASN1_INTEGER_cmp(sid->d.issuerAndSerialNumber->serialNumber,
+ X509_get_serialNumber(cert));
+ }
+ else if (sid->type == CMS_SIGNERINFO_KEYIDENTIFIER)
+ {
+ X509_check_purpose(cert, -1, -1);
+ if (!cert->skid)
+ return -1;
+ return ASN1_OCTET_STRING_cmp(sid->d.subjectKeyIdentifier,
+ cert->skid);
+ }
+ else
+ return -1;
+ }
+
+CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
+ X509 *signer, EVP_PKEY *pk, const EVP_MD *md,
+ unsigned int flags)
+ {
+ CMS_SignedData *sd;
+ CMS_SignerInfo *si = NULL;
+ X509_ALGOR *alg;
+ int i, type;
+ if(!X509_check_private_key(signer, pk))
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER,
+ CMS_R_PRIVATE_KEY_DOES_NOT_MATCH_CERTIFICATE);
+ return NULL;
+ }
+ sd = cms_signed_data_init(cms);
+ if (!sd)
+ goto err;
+ si = M_ASN1_new_of(CMS_SignerInfo);
+ if (!si)
+ goto merr;
+ X509_check_purpose(signer, -1, -1);
+
+ CRYPTO_add(&pk->references, 1, CRYPTO_LOCK_EVP_PKEY);
+ CRYPTO_add(&signer->references, 1, CRYPTO_LOCK_X509);
+
+ si->pkey = pk;
+ si->signer = signer;
+
+ if (flags & CMS_USE_KEYID)
+ {
+ si->version = 3;
+ if (sd->version < 3)
+ sd->version = 3;
+ type = CMS_SIGNERINFO_KEYIDENTIFIER;
+ }
+ else
+ {
+ type = CMS_SIGNERINFO_ISSUER_SERIAL;
+ si->version = 1;
+ }
+
+ if (!cms_set1_SignerIdentifier(si->sid, signer, type))
+ goto err;
+
+ if (md == NULL)
+ {
+ int def_nid;
+ if (EVP_PKEY_get_default_digest_nid(pk, &def_nid) <= 0)
+ goto err;
+ md = EVP_get_digestbynid(def_nid);
+ if (md == NULL)
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER, CMS_R_NO_DEFAULT_DIGEST);
+ goto err;
+ }
+ }
+
+ if (!md)
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER, CMS_R_NO_DIGEST_SET);
+ goto err;
+ }
+
+ cms_DigestAlgorithm_set(si->digestAlgorithm, md);
+
+ /* See if digest is present in digestAlgorithms */
+ for (i = 0; i < sk_X509_ALGOR_num(sd->digestAlgorithms); i++)
+ {
+ ASN1_OBJECT *aoid;
+ alg = sk_X509_ALGOR_value(sd->digestAlgorithms, i);
+ X509_ALGOR_get0(&aoid, NULL, NULL, alg);
+ if (OBJ_obj2nid(aoid) == EVP_MD_type(md))
+ break;
+ }
+
+ if (i == sk_X509_ALGOR_num(sd->digestAlgorithms))
+ {
+ alg = X509_ALGOR_new();
+ if (!alg)
+ goto merr;
+ cms_DigestAlgorithm_set(alg, md);
+ if (!sk_X509_ALGOR_push(sd->digestAlgorithms, alg))
+ {
+ X509_ALGOR_free(alg);
+ goto merr;
+ }
+ }
+
+ if (pk->ameth && pk->ameth->pkey_ctrl)
+ {
+ i = pk->ameth->pkey_ctrl(pk, ASN1_PKEY_CTRL_CMS_SIGN,
+ 0, si);
+ if (i == -2)
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER,
+ CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE);
+ goto err;
+ }
+ if (i <= 0)
+ {
+ CMSerr(CMS_F_CMS_ADD1_SIGNER, CMS_R_CTRL_FAILURE);
+ goto err;
+ }
+ }
+
+ if (!(flags & CMS_NOATTR))
+ {
+ /* Initialize the signed attributes structure so that other
+ * attributes, such as signing time, can be added later even
+ * if we add none here.
+ */
+ if (!si->signedAttrs)
+ {
+ si->signedAttrs = sk_X509_ATTRIBUTE_new_null();
+ if (!si->signedAttrs)
+ goto merr;
+ }
+
+ if (!(flags & CMS_NOSMIMECAP))
+ {
+ STACK_OF(X509_ALGOR) *smcap = NULL;
+ i = CMS_add_standard_smimecap(&smcap);
+ if (i)
+ i = CMS_add_smimecap(si, smcap);
+ sk_X509_ALGOR_pop_free(smcap, X509_ALGOR_free);
+ if (!i)
+ goto merr;
+ }
+ if (flags & CMS_REUSE_DIGEST)
+ {
+ if (!cms_copy_messageDigest(cms, si))
+ goto err;
+ if (!(flags & CMS_PARTIAL) &&
+ !CMS_SignerInfo_sign(si))
+ goto err;
+ }
+ }
+
+ if (!(flags & CMS_NOCERTS))
+ {
+ /* NB ignore -1 return for duplicate cert */
+ if (!CMS_add1_cert(cms, signer))
+ goto merr;
+ }
+
+ if (!sd->signerInfos)
+ sd->signerInfos = sk_CMS_SignerInfo_new_null();
+ if (!sd->signerInfos ||
+ !sk_CMS_SignerInfo_push(sd->signerInfos, si))
+ goto merr;
+
+ return si;
+
+ merr:
+ CMSerr(CMS_F_CMS_ADD1_SIGNER, ERR_R_MALLOC_FAILURE);
+ err:
+ if (si)
+ M_ASN1_free_of(si, CMS_SignerInfo);
+ return NULL;
+
+ }
+
+static int cms_add1_signingTime(CMS_SignerInfo *si, ASN1_TIME *t)
+ {
+ ASN1_TIME *tt;
+ int r = 0;
+ if (t)
+ tt = t;
+ else
+ tt = X509_gmtime_adj(NULL, 0);
+
+ if (!tt)
+ goto merr;
+
+ if (CMS_signed_add1_attr_by_NID(si, NID_pkcs9_signingTime,
+ tt->type, tt, -1) <= 0)
+ goto merr;
+
+ r = 1;
+
+ merr:
+
+ if (!t)
+ ASN1_TIME_free(tt);
+
+ if (!r)
+ CMSerr(CMS_F_CMS_ADD1_SIGNINGTIME, ERR_R_MALLOC_FAILURE);
+
+ return r;
+
+ }
+
+STACK_OF(CMS_SignerInfo) *CMS_get0_SignerInfos(CMS_ContentInfo *cms)
+ {
+ CMS_SignedData *sd;
+ sd = cms_get0_signed(cms);
+ if (!sd)
+ return NULL;
+ return sd->signerInfos;
+ }
+
+STACK_OF(X509) *CMS_get0_signers(CMS_ContentInfo *cms)
+ {
+ STACK_OF(X509) *signers = NULL;
+ STACK_OF(CMS_SignerInfo) *sinfos;
+ CMS_SignerInfo *si;
+ int i;
+ sinfos = CMS_get0_SignerInfos(cms);
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (si->signer)
+ {
+ if (!signers)
+ {
+ signers = sk_X509_new_null();
+ if (!signers)
+ return NULL;
+ }
+ if (!sk_X509_push(signers, si->signer))
+ {
+ sk_X509_free(signers);
+ return NULL;
+ }
+ }
+ }
+ return signers;
+ }
+
+void CMS_SignerInfo_set1_signer_cert(CMS_SignerInfo *si, X509 *signer)
+ {
+ if (signer)
+ {
+ CRYPTO_add(&signer->references, 1, CRYPTO_LOCK_X509);
+ if (si->pkey)
+ EVP_PKEY_free(si->pkey);
+ si->pkey = X509_get_pubkey(signer);
+ }
+ if (si->signer)
+ X509_free(si->signer);
+ si->signer = signer;
+ }
+
+int CMS_SignerInfo_get0_signer_id(CMS_SignerInfo *si,
+ ASN1_OCTET_STRING **keyid,
+ X509_NAME **issuer, ASN1_INTEGER **sno)
+ {
+ return cms_SignerIdentifier_get0_signer_id(si->sid, keyid, issuer, sno);
+ }
+
+int CMS_SignerInfo_cert_cmp(CMS_SignerInfo *si, X509 *cert)
+ {
+ return cms_SignerIdentifier_cert_cmp(si->sid, cert);
+ }
+
+int CMS_set1_signers_certs(CMS_ContentInfo *cms, STACK_OF(X509) *scerts,
+ unsigned int flags)
+ {
+ CMS_SignedData *sd;
+ CMS_SignerInfo *si;
+ CMS_CertificateChoices *cch;
+ STACK_OF(CMS_CertificateChoices) *certs;
+ X509 *x;
+ int i, j;
+ int ret = 0;
+ sd = cms_get0_signed(cms);
+ if (!sd)
+ return -1;
+ certs = sd->certificates;
+ for (i = 0; i < sk_CMS_SignerInfo_num(sd->signerInfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sd->signerInfos, i);
+ if (si->signer)
+ continue;
+
+ for (j = 0; j < sk_X509_num(scerts); j++)
+ {
+ x = sk_X509_value(scerts, j);
+ if (CMS_SignerInfo_cert_cmp(si, x) == 0)
+ {
+ CMS_SignerInfo_set1_signer_cert(si, x);
+ ret++;
+ break;
+ }
+ }
+
+ if (si->signer || (flags & CMS_NOINTERN))
+ continue;
+
+ for (j = 0; j < sk_CMS_CertificateChoices_num(certs); j++)
+ {
+ cch = sk_CMS_CertificateChoices_value(certs, j);
+ if (cch->type != 0)
+ continue;
+ x = cch->d.certificate;
+ if (CMS_SignerInfo_cert_cmp(si, x) == 0)
+ {
+ CMS_SignerInfo_set1_signer_cert(si, x);
+ ret++;
+ break;
+ }
+ }
+ }
+ return ret;
+ }
+
+void CMS_SignerInfo_get0_algs(CMS_SignerInfo *si, EVP_PKEY **pk, X509 **signer,
+ X509_ALGOR **pdig, X509_ALGOR **psig)
+ {
+ if (pk)
+ *pk = si->pkey;
+ if (signer)
+ *signer = si->signer;
+ if (pdig)
+ *pdig = si->digestAlgorithm;
+ if (psig)
+ *psig = si->signatureAlgorithm;
+ }
+
+static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
+ CMS_SignerInfo *si, BIO *chain)
+ {
+ EVP_MD_CTX mctx;
+ int r = 0;
+ EVP_MD_CTX_init(&mctx);
+
+
+ if (!si->pkey)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN, CMS_R_NO_PRIVATE_KEY);
+ return 0;
+ }
+
+ if (!cms_DigestAlgorithm_find_ctx(&mctx, chain, si->digestAlgorithm))
+ goto err;
+
+ /* If any signed attributes calculate and add messageDigest attribute */
+
+ if (CMS_signed_get_attr_count(si) >= 0)
+ {
+ ASN1_OBJECT *ctype =
+ cms->d.signedData->encapContentInfo->eContentType;
+ unsigned char md[EVP_MAX_MD_SIZE];
+ unsigned int mdlen;
+ if (!EVP_DigestFinal_ex(&mctx, md, &mdlen))
+ goto err;
+ if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest,
+ V_ASN1_OCTET_STRING,
+ md, mdlen))
+ goto err;
+ /* Copy content type across */
+ if (CMS_signed_add1_attr_by_NID(si, NID_pkcs9_contentType,
+ V_ASN1_OBJECT, ctype, -1) <= 0)
+ goto err;
+ if (!CMS_SignerInfo_sign(si))
+ goto err;
+ }
+ else
+ {
+ unsigned char *sig;
+ unsigned int siglen;
+ sig = OPENSSL_malloc(EVP_PKEY_size(si->pkey));
+ if (!sig)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN,
+ ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (!EVP_SignFinal(&mctx, sig, &siglen, si->pkey))
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_CONTENT_SIGN,
+ CMS_R_SIGNFINAL_ERROR);
+ OPENSSL_free(sig);
+ goto err;
+ }
+ ASN1_STRING_set0(si->signature, sig, siglen);
+ }
+
+ r = 1;
+
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+ return r;
+
+ }
+
+int cms_SignedData_final(CMS_ContentInfo *cms, BIO *chain)
+ {
+ STACK_OF(CMS_SignerInfo) *sinfos;
+ CMS_SignerInfo *si;
+ int i;
+ sinfos = CMS_get0_SignerInfos(cms);
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (!cms_SignerInfo_content_sign(cms, si, chain))
+ return 0;
+ }
+ cms->d.signedData->encapContentInfo->partial = 0;
+ return 1;
+ }
+
+int CMS_SignerInfo_sign(CMS_SignerInfo *si)
+ {
+ EVP_MD_CTX mctx;
+ EVP_PKEY_CTX *pctx;
+ unsigned char *abuf = NULL;
+ int alen;
+ size_t siglen;
+ const EVP_MD *md = NULL;
+
+ md = EVP_get_digestbyobj(si->digestAlgorithm->algorithm);
+ if (md == NULL)
+ return 0;
+
+ EVP_MD_CTX_init(&mctx);
+
+ if (CMS_signed_get_attr_by_NID(si, NID_pkcs9_signingTime, -1) < 0)
+ {
+ if (!cms_add1_signingTime(si, NULL))
+ goto err;
+ }
+
+ if (EVP_DigestSignInit(&mctx, &pctx, md, NULL, si->pkey) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_SIGN,
+ EVP_PKEY_CTRL_CMS_SIGN, 0, si) <= 0)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_SIGN, CMS_R_CTRL_ERROR);
+ goto err;
+ }
+
+ alen = ASN1_item_i2d((ASN1_VALUE *)si->signedAttrs,&abuf,
+ ASN1_ITEM_rptr(CMS_Attributes_Sign));
+ if(!abuf)
+ goto err;
+ if (EVP_DigestSignUpdate(&mctx, abuf, alen) <= 0)
+ goto err;
+ if (EVP_DigestSignFinal(&mctx, NULL, &siglen) <= 0)
+ goto err;
+ OPENSSL_free(abuf);
+ abuf = OPENSSL_malloc(siglen);
+ if(!abuf)
+ goto err;
+ if (EVP_DigestSignFinal(&mctx, abuf, &siglen) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_ctrl(pctx, -1, EVP_PKEY_OP_SIGN,
+ EVP_PKEY_CTRL_CMS_SIGN, 1, si) <= 0)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_SIGN, CMS_R_CTRL_ERROR);
+ goto err;
+ }
+
+ EVP_MD_CTX_cleanup(&mctx);
+
+ ASN1_STRING_set0(si->signature, abuf, siglen);
+
+ return 1;
+
+ err:
+ if (abuf)
+ OPENSSL_free(abuf);
+ EVP_MD_CTX_cleanup(&mctx);
+ return 0;
+
+ }
+
+int CMS_SignerInfo_verify(CMS_SignerInfo *si)
+ {
+ EVP_MD_CTX mctx;
+ EVP_PKEY_CTX *pctx;
+ unsigned char *abuf = NULL;
+ int alen, r = -1;
+ const EVP_MD *md = NULL;
+
+ if (!si->pkey)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY, CMS_R_NO_PUBLIC_KEY);
+ return -1;
+ }
+
+ md = EVP_get_digestbyobj(si->digestAlgorithm->algorithm);
+ if (md == NULL)
+ return -1;
+ EVP_MD_CTX_init(&mctx);
+ if (EVP_DigestVerifyInit(&mctx, &pctx, md, NULL, si->pkey) <= 0)
+ goto err;
+
+ alen = ASN1_item_i2d((ASN1_VALUE *)si->signedAttrs,&abuf,
+ ASN1_ITEM_rptr(CMS_Attributes_Verify));
+ if(!abuf)
+ goto err;
+ r = EVP_DigestVerifyUpdate(&mctx, abuf, alen);
+ OPENSSL_free(abuf);
+ if (r <= 0)
+ {
+ r = -1;
+ goto err;
+ }
+ r = EVP_DigestVerifyFinal(&mctx,
+ si->signature->data, si->signature->length);
+ if (r <= 0)
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY, CMS_R_VERIFICATION_FAILURE);
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+ return r;
+ }
+
+/* Create a chain of digest BIOs from a CMS ContentInfo */
+
+BIO *cms_SignedData_init_bio(CMS_ContentInfo *cms)
+ {
+ int i;
+ CMS_SignedData *sd;
+ BIO *chain = NULL;
+ sd = cms_get0_signed(cms);
+ if (!sd)
+ return NULL;
+ if (cms->d.signedData->encapContentInfo->partial)
+ cms_sd_set_version(sd);
+ for (i = 0; i < sk_X509_ALGOR_num(sd->digestAlgorithms); i++)
+ {
+ X509_ALGOR *digestAlgorithm;
+ BIO *mdbio;
+ digestAlgorithm = sk_X509_ALGOR_value(sd->digestAlgorithms, i);
+ mdbio = cms_DigestAlgorithm_init_bio(digestAlgorithm);
+ if (!mdbio)
+ goto err;
+ if (chain)
+ BIO_push(chain, mdbio);
+ else
+ chain = mdbio;
+ }
+ return chain;
+ err:
+ if (chain)
+ BIO_free_all(chain);
+ return NULL;
+ }
+
+int CMS_SignerInfo_verify_content(CMS_SignerInfo *si, BIO *chain)
+ {
+ ASN1_OCTET_STRING *os = NULL;
+ EVP_MD_CTX mctx;
+ int r = -1;
+ EVP_MD_CTX_init(&mctx);
+ /* If we have any signed attributes look for messageDigest value */
+ if (CMS_signed_get_attr_count(si) >= 0)
+ {
+ os = CMS_signed_get0_data_by_OBJ(si,
+ OBJ_nid2obj(NID_pkcs9_messageDigest),
+ -3, V_ASN1_OCTET_STRING);
+ if (!os)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_ERROR_READING_MESSAGEDIGEST_ATTRIBUTE);
+ goto err;
+ }
+ }
+
+ if (!cms_DigestAlgorithm_find_ctx(&mctx, chain, si->digestAlgorithm))
+ goto err;
+
+ /* If messageDigest found compare it */
+
+ if (os)
+ {
+ unsigned char mval[EVP_MAX_MD_SIZE];
+ unsigned int mlen;
+ if (EVP_DigestFinal_ex(&mctx, mval, &mlen) <= 0)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_UNABLE_TO_FINALIZE_CONTEXT);
+ goto err;
+ }
+ if (mlen != (unsigned int)os->length)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH);
+ goto err;
+ }
+
+ if (memcmp(mval, os->data, mlen))
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_VERIFICATION_FAILURE);
+ r = 0;
+ }
+ else
+ r = 1;
+ }
+ else
+ {
+ r = EVP_VerifyFinal(&mctx, si->signature->data,
+ si->signature->length, si->pkey);
+ if (r <= 0)
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT,
+ CMS_R_VERIFICATION_FAILURE);
+ r = 0;
+ }
+ }
+
+ err:
+ EVP_MD_CTX_cleanup(&mctx);
+ return r;
+
+ }
+
+int CMS_add_smimecap(CMS_SignerInfo *si, STACK_OF(X509_ALGOR) *algs)
+ {
+ unsigned char *smder = NULL;
+ int smderlen, r;
+ smderlen = i2d_X509_ALGORS(algs, &smder);
+ if (smderlen <= 0)
+ return 0;
+ r = CMS_signed_add1_attr_by_NID(si, NID_SMIMECapabilities,
+ V_ASN1_SEQUENCE, smder, smderlen);
+ OPENSSL_free(smder);
+ return r;
+ }
+
+int CMS_add_simple_smimecap(STACK_OF(X509_ALGOR) **algs,
+ int algnid, int keysize)
+ {
+ X509_ALGOR *alg;
+ ASN1_INTEGER *key = NULL;
+ if (keysize > 0)
+ {
+ key = ASN1_INTEGER_new();
+ if (!key || !ASN1_INTEGER_set(key, keysize))
+ return 0;
+ }
+ alg = X509_ALGOR_new();
+ if (!alg)
+ {
+ if (key)
+ ASN1_INTEGER_free(key);
+ return 0;
+ }
+
+ X509_ALGOR_set0(alg, OBJ_nid2obj(algnid),
+ key ? V_ASN1_INTEGER : V_ASN1_UNDEF, key);
+ if (!*algs)
+ *algs = sk_X509_ALGOR_new_null();
+ if (!*algs || !sk_X509_ALGOR_push(*algs, alg))
+ {
+ X509_ALGOR_free(alg);
+ return 0;
+ }
+ return 1;
+ }
+
+/* Check to see if a cipher exists and if so add S/MIME capabilities */
+
+static int cms_add_cipher_smcap(STACK_OF(X509_ALGOR) **sk, int nid, int arg)
+ {
+ if (EVP_get_cipherbynid(nid))
+ return CMS_add_simple_smimecap(sk, nid, arg);
+ return 1;
+ }
+
+static int cms_add_digest_smcap(STACK_OF(X509_ALGOR) **sk, int nid, int arg)
+ {
+ if (EVP_get_digestbynid(nid))
+ return CMS_add_simple_smimecap(sk, nid, arg);
+ return 1;
+ }
+
+int CMS_add_standard_smimecap(STACK_OF(X509_ALGOR) **smcap)
+ {
+ if (!cms_add_cipher_smcap(smcap, NID_aes_256_cbc, -1)
+ || !cms_add_digest_smcap(smcap, NID_id_GostR3411_94, -1)
+ || !cms_add_cipher_smcap(smcap, NID_id_Gost28147_89, -1)
+ || !cms_add_cipher_smcap(smcap, NID_aes_192_cbc, -1)
+ || !cms_add_cipher_smcap(smcap, NID_aes_128_cbc, -1)
+ || !cms_add_cipher_smcap(smcap, NID_des_ede3_cbc, -1)
+ || !cms_add_cipher_smcap(smcap, NID_rc2_cbc, 128)
+ || !cms_add_cipher_smcap(smcap, NID_rc2_cbc, 64)
+ || !cms_add_cipher_smcap(smcap, NID_des_cbc, -1)
+ || !cms_add_cipher_smcap(smcap, NID_rc2_cbc, 40))
+ return 0;
+ return 1;
+ }
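+
+/* Sketch for illustration: how the helpers above are typically driven
+ * when a signer is added (compare CMS_add1_signer above): build the
+ * default capability list, attach it as a signed attribute, then free
+ * the local copy.
+ */
+static int example_smimecap(CMS_SignerInfo *si)
+ {
+ int r = 0;
+ STACK_OF(X509_ALGOR) *smcap = NULL;
+ if (CMS_add_standard_smimecap(&smcap))
+ r = CMS_add_smimecap(si, smcap);
+ sk_X509_ALGOR_pop_free(smcap, X509_ALGOR_free);
+ return r;
+ }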
diff --git a/main/openssl/crypto/cms/cms_smime.c b/main/openssl/crypto/cms/cms_smime.c
new file mode 100644
index 00000000..8c56e3a8
--- /dev/null
+++ b/main/openssl/crypto/cms/cms_smime.c
@@ -0,0 +1,850 @@
+/* crypto/cms/cms_smime.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include "cms_lcl.h"
+
+static int cms_copy_content(BIO *out, BIO *in, unsigned int flags)
+ {
+ unsigned char buf[4096];
+ int r = 0, i;
+ BIO *tmpout = NULL;
+
+ if (out == NULL)
+ tmpout = BIO_new(BIO_s_null());
+ else if (flags & CMS_TEXT)
+ {
+ tmpout = BIO_new(BIO_s_mem());
+ BIO_set_mem_eof_return(tmpout, 0);
+ }
+ else
+ tmpout = out;
+
+ if(!tmpout)
+ {
+ CMSerr(CMS_F_CMS_COPY_CONTENT,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ /* Read all content through chain to process digest, decrypt etc */
+ for (;;)
+ {
+ i = BIO_read(in, buf, sizeof(buf));
+ if (i <= 0)
+ {
+ if (BIO_method_type(in) == BIO_TYPE_CIPHER)
+ {
+ if (!BIO_get_cipher_status(in))
+ goto err;
+ }
+ if (i < 0)
+ goto err;
+ break;
+ }
+
+ if (tmpout && (BIO_write(tmpout, buf, i) != i))
+ goto err;
+ }
+
+ if(flags & CMS_TEXT)
+ {
+ if(!SMIME_text(tmpout, out))
+ {
+ CMSerr(CMS_F_CMS_COPY_CONTENT,CMS_R_SMIME_TEXT_ERROR);
+ goto err;
+ }
+ }
+
+ r = 1;
+
+ err:
+ if (tmpout && (tmpout != out))
+ BIO_free(tmpout);
+ return r;
+
+ }
+
+static int check_content(CMS_ContentInfo *cms)
+ {
+ ASN1_OCTET_STRING **pos = CMS_get0_content(cms);
+ if (!pos || !*pos)
+ {
+ CMSerr(CMS_F_CHECK_CONTENT, CMS_R_NO_CONTENT);
+ return 0;
+ }
+ return 1;
+ }
+
+static void do_free_upto(BIO *f, BIO *upto)
+ {
+ if (upto)
+ {
+ BIO *tbio;
+ do
+ {
+ tbio = BIO_pop(f);
+ BIO_free(f);
+ f = tbio;
+ }
+ while (f != upto);
+ }
+ else
+ BIO_free_all(f);
+ }
+
+int CMS_data(CMS_ContentInfo *cms, BIO *out, unsigned int flags)
+ {
+ BIO *cont;
+ int r;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_pkcs7_data)
+ {
+ CMSerr(CMS_F_CMS_DATA, CMS_R_TYPE_NOT_DATA);
+ return 0;
+ }
+ cont = CMS_dataInit(cms, NULL);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ BIO_free_all(cont);
+ return r;
+ }
+
+CMS_ContentInfo *CMS_data_create(BIO *in, unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ cms = cms_Data_create();
+ if (!cms)
+ return NULL;
+
+ if ((flags & CMS_STREAM) || CMS_final(cms, in, NULL, flags))
+ return cms;
+
+ CMS_ContentInfo_free(cms);
+
+ return NULL;
+ }
+
+int CMS_digest_verify(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags)
+ {
+ BIO *cont;
+ int r;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_pkcs7_digest)
+ {
+ CMSerr(CMS_F_CMS_DIGEST_VERIFY, CMS_R_TYPE_NOT_DIGESTED_DATA);
+ return 0;
+ }
+
+ if (!dcont && !check_content(cms))
+ return 0;
+
+ cont = CMS_dataInit(cms, dcont);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ if (r)
+ r = cms_DigestedData_do_final(cms, cont, 1);
+ do_free_upto(cont, dcont);
+ return r;
+ }
+
+CMS_ContentInfo *CMS_digest_create(BIO *in, const EVP_MD *md,
+ unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ if (!md)
+ md = EVP_sha1();
+ cms = cms_DigestedData_create(md);
+ if (!cms)
+ return NULL;
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & CMS_STREAM) || CMS_final(cms, in, NULL, flags))
+ return cms;
+
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+int CMS_EncryptedData_decrypt(CMS_ContentInfo *cms,
+ const unsigned char *key, size_t keylen,
+ BIO *dcont, BIO *out, unsigned int flags)
+ {
+ BIO *cont;
+ int r;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_pkcs7_encrypted)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_DECRYPT,
+ CMS_R_TYPE_NOT_ENCRYPTED_DATA);
+ return 0;
+ }
+
+ if (!dcont && !check_content(cms))
+ return 0;
+
+ if (CMS_EncryptedData_set1_key(cms, NULL, key, keylen) <= 0)
+ return 0;
+ cont = CMS_dataInit(cms, dcont);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ do_free_upto(cont, dcont);
+ return r;
+ }
+
+CMS_ContentInfo *CMS_EncryptedData_encrypt(BIO *in, const EVP_CIPHER *cipher,
+ const unsigned char *key, size_t keylen,
+ unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ if (!cipher)
+ {
+ CMSerr(CMS_F_CMS_ENCRYPTEDDATA_ENCRYPT, CMS_R_NO_CIPHER);
+ return NULL;
+ }
+ cms = CMS_ContentInfo_new();
+ if (!cms)
+ return NULL;
+ if (!CMS_EncryptedData_set1_key(cms, cipher, key, keylen))
+ {
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & (CMS_STREAM|CMS_PARTIAL))
+ || CMS_final(cms, in, NULL, flags))
+ return cms;
+
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+static int cms_signerinfo_verify_cert(CMS_SignerInfo *si,
+ X509_STORE *store,
+ STACK_OF(X509) *certs,
+ STACK_OF(X509_CRL) *crls,
+ unsigned int flags)
+ {
+ X509_STORE_CTX ctx;
+ X509 *signer;
+ int i, j, r = 0;
+ CMS_SignerInfo_get0_algs(si, NULL, &signer, NULL, NULL);
+ if (!X509_STORE_CTX_init(&ctx, store, signer, certs))
+ {
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CERT,
+ CMS_R_STORE_INIT_ERROR);
+ goto err;
+ }
+ X509_STORE_CTX_set_default(&ctx, "smime_sign");
+ if (crls)
+ X509_STORE_CTX_set0_crls(&ctx, crls);
+
+ i = X509_verify_cert(&ctx);
+ if (i <= 0)
+ {
+ j = X509_STORE_CTX_get_error(&ctx);
+ CMSerr(CMS_F_CMS_SIGNERINFO_VERIFY_CERT,
+ CMS_R_CERTIFICATE_VERIFY_ERROR);
+ ERR_add_error_data(2, "Verify error:",
+ X509_verify_cert_error_string(j));
+ goto err;
+ }
+ r = 1;
+ err:
+ X509_STORE_CTX_cleanup(&ctx);
+ return r;
+
+ }
+
+int CMS_verify(CMS_ContentInfo *cms, STACK_OF(X509) *certs,
+ X509_STORE *store, BIO *dcont, BIO *out, unsigned int flags)
+ {
+ CMS_SignerInfo *si;
+ STACK_OF(CMS_SignerInfo) *sinfos;
+ STACK_OF(X509) *cms_certs = NULL;
+ STACK_OF(X509_CRL) *crls = NULL;
+ X509 *signer;
+ int i, scount = 0, ret = 0;
+ BIO *cmsbio = NULL, *tmpin = NULL;
+
+ if (!dcont && !check_content(cms))
+ return 0;
+
+ /* Attempt to find all signer certificates */
+
+ sinfos = CMS_get0_SignerInfos(cms);
+
+ if (sk_CMS_SignerInfo_num(sinfos) <= 0)
+ {
+ CMSerr(CMS_F_CMS_VERIFY, CMS_R_NO_SIGNERS);
+ goto err;
+ }
+
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ CMS_SignerInfo_get0_algs(si, NULL, &signer, NULL, NULL);
+ if (signer)
+ scount++;
+ }
+
+ if (scount != sk_CMS_SignerInfo_num(sinfos))
+ scount += CMS_set1_signers_certs(cms, certs, flags);
+
+ if (scount != sk_CMS_SignerInfo_num(sinfos))
+ {
+ CMSerr(CMS_F_CMS_VERIFY, CMS_R_SIGNER_CERTIFICATE_NOT_FOUND);
+ goto err;
+ }
+
+ /* Attempt to verify all signers certs */
+
+ if (!(flags & CMS_NO_SIGNER_CERT_VERIFY))
+ {
+ cms_certs = CMS_get1_certs(cms);
+ if (!(flags & CMS_NOCRL))
+ crls = CMS_get1_crls(cms);
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (!cms_signerinfo_verify_cert(si, store,
+ cms_certs, crls, flags))
+ goto err;
+ }
+ }
+
+ /* Attempt to verify all SignerInfo signed attribute signatures */
+
+ if (!(flags & CMS_NO_ATTR_VERIFY))
+ {
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (CMS_signed_get_attr_count(si) < 0)
+ continue;
+ if (CMS_SignerInfo_verify(si) <= 0)
+ goto err;
+ }
+ }
+
+ /* Performance optimization: if the content is a memory BIO then
+ * store its contents in a temporary read only memory BIO. This
+ * avoids potentially large numbers of slow copies of data which will
+ * occur when reading from a read write memory BIO when signatures
+ * are calculated.
+ */
+
+ if (dcont && (BIO_method_type(dcont) == BIO_TYPE_MEM))
+ {
+ char *ptr;
+ long len;
+ len = BIO_get_mem_data(dcont, &ptr);
+ tmpin = BIO_new_mem_buf(ptr, len);
+ if (tmpin == NULL)
+ {
+ CMSerr(CMS_F_CMS_VERIFY,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ }
+ else
+ tmpin = dcont;
+
+
+ cmsbio = CMS_dataInit(cms, tmpin);
+ if (!cmsbio)
+ goto err;
+
+ if (!cms_copy_content(out, cmsbio, flags))
+ goto err;
+
+ if (!(flags & CMS_NO_CONTENT_VERIFY))
+ {
+ for (i = 0; i < sk_CMS_SignerInfo_num(sinfos); i++)
+ {
+ si = sk_CMS_SignerInfo_value(sinfos, i);
+ if (CMS_SignerInfo_verify_content(si, cmsbio) <= 0)
+ {
+ CMSerr(CMS_F_CMS_VERIFY,
+ CMS_R_CONTENT_VERIFY_ERROR);
+ goto err;
+ }
+ }
+ }
+
+ ret = 1;
+
+ err:
+
+ if (dcont && (tmpin == dcont))
+ do_free_upto(cmsbio, dcont);
+ else
+ BIO_free_all(cmsbio);
+
+ if (cms_certs)
+ sk_X509_pop_free(cms_certs, X509_free);
+ if (crls)
+ sk_X509_CRL_pop_free(crls, X509_CRL_free);
+
+ return ret;
+ }
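+
+/* Sketch for illustration: verifying an S/MIME signed message with the
+ * function above. "store" is assumed to hold the trusted CA
+ * certificates; "cont" receives the detached content BIO, if any.
+ */
+static int example_verify(BIO *in, BIO *out, X509_STORE *store)
+ {
+ int r = 0;
+ BIO *cont = NULL;
+ CMS_ContentInfo *cms = SMIME_read_CMS(in, &cont);
+ if (cms)
+ r = CMS_verify(cms, NULL, store, cont, out, 0);
+ CMS_ContentInfo_free(cms);
+ BIO_free(cont);
+ return r;
+ }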
+
+int CMS_verify_receipt(CMS_ContentInfo *rcms, CMS_ContentInfo *ocms,
+ STACK_OF(X509) *certs,
+ X509_STORE *store, unsigned int flags)
+ {
+ int r;
+ flags &= ~(CMS_DETACHED|CMS_TEXT);
+ r = CMS_verify(rcms, certs, store, NULL, NULL, flags);
+ if (r <= 0)
+ return r;
+ return cms_Receipt_verify(rcms, ocms);
+ }
+
+CMS_ContentInfo *CMS_sign(X509 *signcert, EVP_PKEY *pkey, STACK_OF(X509) *certs,
+ BIO *data, unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ int i;
+
+ cms = CMS_ContentInfo_new();
+ if (!cms || !CMS_SignedData_init(cms))
+ goto merr;
+
+ if (pkey && !CMS_add1_signer(cms, signcert, pkey, NULL, flags))
+ {
+ CMSerr(CMS_F_CMS_SIGN, CMS_R_ADD_SIGNER_ERROR);
+ goto err;
+ }
+
+ for (i = 0; i < sk_X509_num(certs); i++)
+ {
+ X509 *x = sk_X509_value(certs, i);
+ if (!CMS_add1_cert(cms, x))
+ goto merr;
+ }
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & (CMS_STREAM|CMS_PARTIAL))
+ || CMS_final(cms, data, NULL, flags))
+ return cms;
+ else
+ goto err;
+
+ merr:
+ CMSerr(CMS_F_CMS_SIGN, ERR_R_MALLOC_FAILURE);
+
+ err:
+ if (cms)
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
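+
+/* Sketch for illustration: one-shot S/MIME signing built on the function
+ * above; signcert and pkey are assumed to be loaded elsewhere. With
+ * CMS_STREAM the content is read and the signature computed during
+ * SMIME_write_CMS().
+ */
+static int example_sign(X509 *signcert, EVP_PKEY *pkey, BIO *in, BIO *out)
+ {
+ int r = 0;
+ CMS_ContentInfo *cms = CMS_sign(signcert, pkey, NULL, in, CMS_STREAM);
+ if (cms)
+ r = SMIME_write_CMS(out, cms, in, CMS_STREAM);
+ CMS_ContentInfo_free(cms);
+ return r;
+ }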
+
+CMS_ContentInfo *CMS_sign_receipt(CMS_SignerInfo *si,
+ X509 *signcert, EVP_PKEY *pkey,
+ STACK_OF(X509) *certs,
+ unsigned int flags)
+ {
+ CMS_SignerInfo *rct_si;
+ CMS_ContentInfo *cms = NULL;
+ ASN1_OCTET_STRING **pos, *os;
+ BIO *rct_cont = NULL;
+ int r = 0;
+
+ flags &= ~(CMS_STREAM|CMS_TEXT);
+ /* Not really detached but avoids content being allocated */
+ flags |= CMS_PARTIAL|CMS_BINARY|CMS_DETACHED;
+ if (!pkey || !signcert)
+ {
+ CMSerr(CMS_F_CMS_SIGN_RECEIPT, CMS_R_NO_KEY_OR_CERT);
+ return NULL;
+ }
+
+ /* Initialize signed data */
+
+ cms = CMS_sign(NULL, NULL, certs, NULL, flags);
+ if (!cms)
+ goto err;
+
+ /* Set inner content type to signed receipt */
+ if (!CMS_set1_eContentType(cms, OBJ_nid2obj(NID_id_smime_ct_receipt)))
+ goto err;
+
+ rct_si = CMS_add1_signer(cms, signcert, pkey, NULL, flags);
+ if (!rct_si)
+ {
+ CMSerr(CMS_F_CMS_SIGN_RECEIPT, CMS_R_ADD_SIGNER_ERROR);
+ goto err;
+ }
+
+ os = cms_encode_Receipt(si);
+
+ if (!os)
+ goto err;
+
+ /* Set content to digest */
+ rct_cont = BIO_new_mem_buf(os->data, os->length);
+ if (!rct_cont)
+ goto err;
+
+ /* Add msgSigDigest attribute */
+
+ if (!cms_msgSigDigest_add1(rct_si, si))
+ goto err;
+
+ /* Finalize structure */
+ if (!CMS_final(cms, rct_cont, NULL, flags))
+ goto err;
+
+ /* Set embedded content */
+ pos = CMS_get0_content(cms);
+ *pos = os;
+
+ r = 1;
+
+ err:
+ if (rct_cont)
+ BIO_free(rct_cont);
+ if (r)
+ return cms;
+ CMS_ContentInfo_free(cms);
+ return NULL;
+
+ }
+
+CMS_ContentInfo *CMS_encrypt(STACK_OF(X509) *certs, BIO *data,
+ const EVP_CIPHER *cipher, unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ int i;
+ X509 *recip;
+ cms = CMS_EnvelopedData_create(cipher);
+ if (!cms)
+ goto merr;
+ for (i = 0; i < sk_X509_num(certs); i++)
+ {
+ recip = sk_X509_value(certs, i);
+ if (!CMS_add1_recipient_cert(cms, recip, flags))
+ {
+ CMSerr(CMS_F_CMS_ENCRYPT, CMS_R_RECIPIENT_ERROR);
+ goto err;
+ }
+ }
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & (CMS_STREAM|CMS_PARTIAL))
+ || CMS_final(cms, data, NULL, flags))
+ return cms;
+ else
+ goto err;
+
+ merr:
+ CMSerr(CMS_F_CMS_ENCRYPT, ERR_R_MALLOC_FAILURE);
+ err:
+ if (cms)
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert)
+ {
+ STACK_OF(CMS_RecipientInfo) *ris;
+ CMS_RecipientInfo *ri;
+ int i, r;
+ int debug = 0;
+ ris = CMS_get0_RecipientInfos(cms);
+ if (ris)
+ debug = cms->d.envelopedData->encryptedContentInfo->debug;
+ for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++)
+ {
+ ri = sk_CMS_RecipientInfo_value(ris, i);
+ if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_TRANS)
+ continue;
+ /* If we have a cert try matching RecipientInfo
+ * otherwise try them all.
+ */
+ if (!cert || (CMS_RecipientInfo_ktri_cert_cmp(ri, cert) == 0))
+ {
+ CMS_RecipientInfo_set0_pkey(ri, pk);
+ r = CMS_RecipientInfo_decrypt(cms, ri);
+ CMS_RecipientInfo_set0_pkey(ri, NULL);
+ if (cert)
+ {
+ /* If not debugging clear any error and
+ * return success to avoid leaking of
+ * information useful to MMA
+ */
+ if (!debug)
+ {
+ ERR_clear_error();
+ return 1;
+ }
+ if (r > 0)
+ return 1;
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY,
+ CMS_R_DECRYPT_ERROR);
+ return 0;
+ }
+ /* If no cert and not debugging don't leave loop
+ * after first successful decrypt. Always attempt
+ * to decrypt all recipients to avoid leaking timing
+ * of a successful decrypt.
+ */
+ else if (r > 0 && debug)
+ return 1;
+ }
+ }
+ /* If no cert and not debugging always return success */
+ if (!cert && !debug)
+ {
+ ERR_clear_error();
+ return 1;
+ }
+
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY, CMS_R_NO_MATCHING_RECIPIENT);
+ return 0;
+
+ }
+
+int CMS_decrypt_set1_key(CMS_ContentInfo *cms,
+ unsigned char *key, size_t keylen,
+ unsigned char *id, size_t idlen)
+ {
+ STACK_OF(CMS_RecipientInfo) *ris;
+ CMS_RecipientInfo *ri;
+ int i, r;
+ ris = CMS_get0_RecipientInfos(cms);
+ for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++)
+ {
+ ri = sk_CMS_RecipientInfo_value(ris, i);
+ if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_KEK)
+ continue;
+
+ /* If we have an id try matching RecipientInfo
+ * otherwise try them all.
+ */
+ if (!id || (CMS_RecipientInfo_kekri_id_cmp(ri, id, idlen) == 0))
+ {
+ CMS_RecipientInfo_set0_key(ri, key, keylen);
+ r = CMS_RecipientInfo_decrypt(cms, ri);
+ CMS_RecipientInfo_set0_key(ri, NULL, 0);
+ if (r > 0)
+ return 1;
+ if (id)
+ {
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_KEY,
+ CMS_R_DECRYPT_ERROR);
+ return 0;
+ }
+ ERR_clear_error();
+ }
+ }
+
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_KEY, CMS_R_NO_MATCHING_RECIPIENT);
+ return 0;
+
+ }
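
A KEKRecipientInfo round trip, sketched under the assumption of a pre-shared 128-bit KEK; the helper names and the use of AES key wrap are illustrative:

    #include <openssl/cms.h>
    #include <openssl/evp.h>
    #include <openssl/objects.h>

    /* 'kek' must stay alive as long as cms does: add0 keeps no copy. */
    CMS_ContentInfo *kek_encrypt(BIO *in, unsigned char *kek,
                                 unsigned char *kid, size_t kidlen)
    {
        CMS_ContentInfo *cms = CMS_encrypt(NULL, NULL, EVP_aes_128_cbc(),
                                           CMS_PARTIAL);

        if (cms == NULL)
            return NULL;
        if (!CMS_add0_recipient_key(cms, NID_id_aes128_wrap, kek, 16,
                                    kid, kidlen, NULL, NULL, NULL)
                || !CMS_final(cms, in, NULL, 0)) {
            CMS_ContentInfo_free(cms);
            return NULL;
        }
        return cms;
    }

    /* Decrypt side: install the KEK, then run the normal content path. */
    int kek_decrypt(CMS_ContentInfo *cms, unsigned char *kek, BIO *out)
    {
        return CMS_decrypt_set1_key(cms, kek, 16, NULL, 0)
            && CMS_decrypt(cms, NULL, NULL, NULL, out, 0);
    }
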
+
+int CMS_decrypt_set1_password(CMS_ContentInfo *cms,
+ unsigned char *pass, ossl_ssize_t passlen)
+ {
+ STACK_OF(CMS_RecipientInfo) *ris;
+ CMS_RecipientInfo *ri;
+ int i, r;
+ ris = CMS_get0_RecipientInfos(cms);
+ for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++)
+ {
+ ri = sk_CMS_RecipientInfo_value(ris, i);
+ if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_PASS)
+ continue;
+ CMS_RecipientInfo_set0_password(ri, pass, passlen);
+ r = CMS_RecipientInfo_decrypt(cms, ri);
+ CMS_RecipientInfo_set0_password(ri, NULL, 0);
+ if (r > 0)
+ return 1;
+ }
+
+ CMSerr(CMS_F_CMS_DECRYPT_SET1_PASSWORD, CMS_R_NO_MATCHING_RECIPIENT);
+ return 0;
+
+ }
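
The password (PWRI) variant follows the same pattern; a short sketch with default PBE parameters (iter = -1 and NID_undef let the library choose), headers as in the previous sketch:

    /* Sketch: add a password recipient, then finalize. 'pass' is a
     * NUL-terminated passphrase (passlen -1 means use strlen). */
    CMS_ContentInfo *pw_encrypt(BIO *in, unsigned char *pass)
    {
        CMS_ContentInfo *cms = CMS_encrypt(NULL, NULL, EVP_aes_128_cbc(),
                                           CMS_PARTIAL);

        if (cms == NULL)
            return NULL;
        if (!CMS_add0_recipient_password(cms, -1, NID_undef, NID_undef,
                                         pass, -1, NULL)
                || !CMS_final(cms, in, NULL, 0)) {
            CMS_ContentInfo_free(cms);
            return NULL;
        }
        return cms;
    }

    /* Decrypt: CMS_decrypt_set1_password(cms, pass, -1) followed by
     * CMS_decrypt(cms, NULL, NULL, NULL, out, 0), as above. */
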
+
+int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert,
+ BIO *dcont, BIO *out,
+ unsigned int flags)
+ {
+ int r;
+ BIO *cont;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_pkcs7_enveloped)
+ {
+ CMSerr(CMS_F_CMS_DECRYPT, CMS_R_TYPE_NOT_ENVELOPED_DATA);
+ return 0;
+ }
+ if (!dcont && !check_content(cms))
+ return 0;
+ if (flags & CMS_DEBUG_DECRYPT)
+ cms->d.envelopedData->encryptedContentInfo->debug = 1;
+ else
+ cms->d.envelopedData->encryptedContentInfo->debug = 0;
+ if (!pk && !cert && !dcont && !out)
+ return 1;
+ if (pk && !CMS_decrypt_set1_pkey(cms, pk, cert))
+ return 0;
+ cont = CMS_dataInit(cms, dcont);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ do_free_upto(cont, dcont);
+ return r;
+ }
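
And the matching consumer side, as a minimal sketch; file names are illustrative:

    #include <openssl/cms.h>
    #include <openssl/pem.h>

    /* Read an S/MIME enveloped message and decrypt with our key pair. */
    int decrypt_smime(EVP_PKEY *pkey, X509 *cert)
    {
        int ok = 0;
        BIO *in = BIO_new_file("enveloped.eml", "r");
        BIO *out = BIO_new_file("plain.txt", "w");
        CMS_ContentInfo *cms = in ? SMIME_read_CMS(in, NULL) : NULL;

        if (cms && out && CMS_decrypt(cms, pkey, cert, NULL, out, 0))
            ok = 1;
        CMS_ContentInfo_free(cms);
        BIO_free(in);
        BIO_free(out);
        return ok;
    }
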
+
+int CMS_final(CMS_ContentInfo *cms, BIO *data, BIO *dcont, unsigned int flags)
+ {
+ BIO *cmsbio;
+ int ret = 0;
+ if (!(cmsbio = CMS_dataInit(cms, dcont)))
+ {
+ CMSerr(CMS_F_CMS_FINAL,ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+
+ SMIME_crlf_copy(data, cmsbio, flags);
+
+ (void)BIO_flush(cmsbio);
+
+
+ if (!CMS_dataFinal(cms, cmsbio))
+ {
+ CMSerr(CMS_F_CMS_FINAL,CMS_R_CMS_DATAFINAL_ERROR);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ do_free_upto(cmsbio, dcont);
+
+ return ret;
+
+ }
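
CMS_final is the common finalization step: every one-shot constructor in this file defers to it unless CMS_STREAM or CMS_PARTIAL is set. The two-step pattern that CMS_PARTIAL enables, sketched with calls already shown above:

    #include <openssl/cms.h>
    #include <openssl/evp.h>

    /* Build a partial structure, adjust it (extra recipients,
     * unprotected attributes, ...), then bind the content with
     * CMS_final, exactly as the one-shot paths do internally. */
    CMS_ContentInfo *encrypt_two_step(STACK_OF(X509) *recips, BIO *data)
    {
        CMS_ContentInfo *cms = CMS_encrypt(recips, NULL, EVP_aes_128_cbc(),
                                           CMS_PARTIAL);

        if (cms == NULL)
            return NULL;
        /* ... adjustments to the partial structure go here ... */
        if (!CMS_final(cms, data, NULL, 0)) {
            CMS_ContentInfo_free(cms);
            return NULL;
        }
        return cms;
    }
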
+
+#ifdef ZLIB
+
+int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags)
+ {
+ BIO *cont;
+ int r;
+ if (OBJ_obj2nid(CMS_get0_type(cms)) != NID_id_smime_ct_compressedData)
+ {
+ CMSerr(CMS_F_CMS_UNCOMPRESS,
+ CMS_R_TYPE_NOT_COMPRESSED_DATA);
+ return 0;
+ }
+
+ if (!dcont && !check_content(cms))
+ return 0;
+
+ cont = CMS_dataInit(cms, dcont);
+ if (!cont)
+ return 0;
+ r = cms_copy_content(out, cont, flags);
+ do_free_upto(cont, dcont);
+ return r;
+ }
+
+CMS_ContentInfo *CMS_compress(BIO *in, int comp_nid, unsigned int flags)
+ {
+ CMS_ContentInfo *cms;
+ if (comp_nid <= 0)
+ comp_nid = NID_zlib_compression;
+ cms = cms_CompressedData_create(comp_nid);
+ if (!cms)
+ return NULL;
+
+ if(!(flags & CMS_DETACHED))
+ CMS_set_detached(cms, 0);
+
+ if ((flags & CMS_STREAM) || CMS_final(cms, in, NULL, flags))
+ return cms;
+
+ CMS_ContentInfo_free(cms);
+ return NULL;
+ }
+
+#else
+
+int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
+ unsigned int flags)
+ {
+ CMSerr(CMS_F_CMS_UNCOMPRESS, CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM);
+ return 0;
+ }
+
+CMS_ContentInfo *CMS_compress(BIO *in, int comp_nid, unsigned int flags)
+ {
+ CMSerr(CMS_F_CMS_COMPRESS, CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM);
+ return NULL;
+ }
+
+#endif
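
A compress/uncompress round trip, assuming a zlib-enabled build (otherwise both calls fail with UNSUPPORTED_COMPRESSION_ALGORITHM, per the stubs above); 'der' is assumed to be a memory BIO so it can be written and then read back:

    #include <openssl/cms.h>
    #include <openssl/objects.h>

    int compress_roundtrip(BIO *in, BIO *der, BIO *out)
    {
        int ok = 0;
        CMS_ContentInfo *cms = CMS_compress(in, NID_zlib_compression, 0);
        CMS_ContentInfo *back = NULL;

        if (cms && i2d_CMS_bio(der, cms)
                && (back = d2i_CMS_bio(der, NULL)) != NULL
                && CMS_uncompress(back, NULL, out, 0))
            ok = 1;
        CMS_ContentInfo_free(cms);
        CMS_ContentInfo_free(back);
        return ok;
    }
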
diff --git a/main/openssl/crypto/comp/c_rle.c b/main/openssl/crypto/comp/c_rle.c
index 18bceae5..47dfb67f 100644
--- a/main/openssl/crypto/comp/c_rle.c
+++ b/main/openssl/crypto/comp/c_rle.c
@@ -30,7 +30,7 @@ static int rle_compress_block(COMP_CTX *ctx, unsigned char *out,
{
/* int i; */
- if (olen < (ilen+1))
+ if (ilen == 0 || olen < (ilen-1))
{
/* ZZZZZZZZZZZZZZZZZZZZZZ */
return(-1);
@@ -46,7 +46,7 @@ static int rle_expand_block(COMP_CTX *ctx, unsigned char *out,
{
int i;
- if (ilen == 0 || olen < (ilen-1))
+ if (olen < (ilen-1))
{
/* ZZZZZZZZZZZZZZZZZZZZZZ */
return(-1);
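
The reshuffled bounds checks turn on one detail: olen and ilen are unsigned, so ilen - 1 wraps to UINT_MAX when ilen is 0, and any copy length or bound derived from it is meaningless. A standalone illustration (not OpenSSL code) of the wrap that the explicit ilen == 0 test guards against:

    #include <stdio.h>

    int main(void)
    {
        unsigned int ilen = 0;
        /* 0u - 1 wraps: an ilen - 1 copy length would be enormous */
        printf("ilen - 1 = %u\n", ilen - 1);   /* prints 4294967295 */
        return 0;
    }
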
diff --git a/main/openssl/crypto/conf/conf_mall.c b/main/openssl/crypto/conf/conf_mall.c
index c6f4cb2d..213890e0 100644
--- a/main/openssl/crypto/conf/conf_mall.c
+++ b/main/openssl/crypto/conf/conf_mall.c
@@ -76,5 +76,6 @@ void OPENSSL_load_builtin_modules(void)
#ifndef OPENSSL_NO_ENGINE
ENGINE_add_conf_module();
#endif
+ EVP_add_alg_module();
}
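
EVP_add_alg_module registers the algorithm configuration module, which is what lets an openssl.cnf algorithm section (including the FIPS plumbing added elsewhere in this commit) take effect. A minimal sketch of loading it from application code; the section names in the comment are illustrative:

    #include <openssl/conf.h>

    /* Load the default config file; with the module registered above,
     * an alg_section such as:
     *     [openssl_init_section]
     *     alg_section = algs
     *     [algs]
     *     fips_mode = yes
     * is now honoured. */
    void init_from_config(void)
    {
        OPENSSL_config(NULL);   /* wraps CONF_modules_load_file() */
    }
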
diff --git a/main/openssl/crypto/cpt_err.c b/main/openssl/crypto/cpt_err.c
index 139b9284..289005f6 100644
--- a/main/openssl/crypto/cpt_err.c
+++ b/main/openssl/crypto/cpt_err.c
@@ -1,6 +1,6 @@
/* crypto/cpt_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
{ERR_FUNC(CRYPTO_F_CRYPTO_SET_EX_DATA), "CRYPTO_set_ex_data"},
{ERR_FUNC(CRYPTO_F_DEF_ADD_INDEX), "DEF_ADD_INDEX"},
{ERR_FUNC(CRYPTO_F_DEF_GET_CLASS), "DEF_GET_CLASS"},
+{ERR_FUNC(CRYPTO_F_FIPS_MODE_SET), "FIPS_mode_set"},
{ERR_FUNC(CRYPTO_F_INT_DUP_EX_DATA), "INT_DUP_EX_DATA"},
{ERR_FUNC(CRYPTO_F_INT_FREE_EX_DATA), "INT_FREE_EX_DATA"},
{ERR_FUNC(CRYPTO_F_INT_NEW_EX_DATA), "INT_NEW_EX_DATA"},
@@ -84,6 +85,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
static ERR_STRING_DATA CRYPTO_str_reasons[]=
{
+{ERR_REASON(CRYPTO_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"},
{ERR_REASON(CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK),"no dynlock create callback"},
{0,NULL}
};
diff --git a/main/openssl/crypto/cryptlib.c b/main/openssl/crypto/cryptlib.c
index 24fe123e..304c6b70 100644
--- a/main/openssl/crypto/cryptlib.c
+++ b/main/openssl/crypto/cryptlib.c
@@ -409,6 +409,10 @@ int (*CRYPTO_get_add_lock_callback(void))(int *num,int mount,int type,
void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
const char *file,int line))
{
+ /* Calling this here ensures initialisation before any threads
+ * are started.
+ */
+ OPENSSL_init();
locking_callback=func;
}
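
The comment explains why OPENSSL_init() is forced here: the locking callback is the pre-1.1.0 thread-safety hook and must be installed before other threads touch libcrypto. A minimal pthread-based sketch of application setup code (not part of this patch):

    #include <pthread.h>
    #include <openssl/crypto.h>

    static pthread_mutex_t *lock_cs;

    static void locking_cb(int mode, int type, const char *file, int line)
    {
        if (mode & CRYPTO_LOCK)
            pthread_mutex_lock(&lock_cs[type]);
        else
            pthread_mutex_unlock(&lock_cs[type]);
    }

    void thread_setup(void)
    {
        int i;

        lock_cs = OPENSSL_malloc(CRYPTO_num_locks()
                                 * sizeof(pthread_mutex_t));
        for (i = 0; i < CRYPTO_num_locks(); i++)
            pthread_mutex_init(&lock_cs[i], NULL);
        /* Must run before any other thread uses libcrypto; the
         * OPENSSL_init() call above makes that ordering safe. */
        CRYPTO_set_locking_callback(locking_cb);
    }
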
@@ -500,7 +504,7 @@ void CRYPTO_THREADID_current(CRYPTO_THREADID *id)
CRYPTO_THREADID_set_numeric(id, (unsigned long)find_thread(NULL));
#else
/* For everything else, default to using the address of 'errno' */
- CRYPTO_THREADID_set_pointer(id, &errno);
+ CRYPTO_THREADID_set_pointer(id, (void*)&errno);
#endif
}
@@ -661,28 +665,53 @@ const char *CRYPTO_get_lock_name(int type)
defined(__INTEL__) || \
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
-unsigned long OPENSSL_ia32cap_P=0;
-unsigned long *OPENSSL_ia32cap_loc(void) { return &OPENSSL_ia32cap_P; }
+unsigned int OPENSSL_ia32cap_P[2];
+unsigned long *OPENSSL_ia32cap_loc(void)
+{ if (sizeof(long)==4)
+	/*
+	 * If a 32-bit application pulls the address of OPENSSL_ia32cap_P[0],
+	 * clear the second element to maintain the illusion that the
+	 * vector is 32-bit.
+	 */
+ OPENSSL_ia32cap_P[1]=0;
+ return (unsigned long *)OPENSSL_ia32cap_P;
+}
#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY)
#define OPENSSL_CPUID_SETUP
+#if defined(_WIN32)
+typedef unsigned __int64 IA32CAP;
+#else
+typedef unsigned long long IA32CAP;
+#endif
void OPENSSL_cpuid_setup(void)
{ static int trigger=0;
- unsigned long OPENSSL_ia32_cpuid(void);
+ IA32CAP OPENSSL_ia32_cpuid(void);
+ IA32CAP vec;
char *env;
if (trigger) return;
trigger=1;
- if ((env=getenv("OPENSSL_ia32cap")))
- OPENSSL_ia32cap_P = strtoul(env,NULL,0)|(1<<10);
+ if ((env=getenv("OPENSSL_ia32cap"))) {
+ int off = (env[0]=='~')?1:0;
+#if defined(_WIN32)
+ if (!sscanf(env+off,"%I64i",&vec)) vec = strtoul(env+off,NULL,0);
+#else
+ if (!sscanf(env+off,"%lli",(long long *)&vec)) vec = strtoul(env+off,NULL,0);
+#endif
+ if (off) vec = OPENSSL_ia32_cpuid()&~vec;
+ }
else
- OPENSSL_ia32cap_P = OPENSSL_ia32_cpuid()|(1<<10);
+ vec = OPENSSL_ia32_cpuid();
+
	/*
	 * |(1<<10) sets a reserved bit to signal that the variable was
	 * already initialized; this avoids interference with cpuid
	 * snippets in the ELF .init segment.
	 */
+ OPENSSL_ia32cap_P[0] = (unsigned int)vec|(1<<10);
+ OPENSSL_ia32cap_P[1] = (unsigned int)(vec>>32);
}
#endif
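
The environment override parsed above also accepts a leading '~', which masks capabilities off rather than setting them (vec = cpuid & ~vec). A sketch of inspecting the vector through the OPENSSL_ia32cap macro declared in crypto.h; the only capability bit assumed here is 26 (SSE2, mirroring CPUID.1:EDX):

    #include <stdio.h>
    #include <openssl/crypto.h>

    int main(void)
    {
        /* Assumes libcrypto has populated the vector (its startup
         * hooks run OPENSSL_cpuid_setup). Launching the process with
         * OPENSSL_ia32cap=~0x4000000 would clear the bit tested here. */
        printf("SSE2 %savailable to OpenSSL\n",
               (OPENSSL_ia32cap & (1UL << 26)) ? "" : "not ");
        return 0;
    }
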
@@ -896,3 +925,16 @@ void OpenSSLDie(const char *file,int line,const char *assertion)
}
void *OPENSSL_stderr(void) { return stderr; }
+
+int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len)
+ {
+ size_t i;
+ const unsigned char *a = in_a;
+ const unsigned char *b = in_b;
+ unsigned char x = 0;
+
+ for (i = 0; i < len; i++)
+ x |= a[i] ^ b[i];
+
+ return x;
+ }
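
CRYPTO_memcmp exists because plain memcmp returns at the first differing byte, so comparing a computed MAC against an attacker-supplied one leaks how many leading bytes matched. Typical use, as a minimal sketch:

    #include <openssl/crypto.h>

    /* Constant-time tag check: 1 on match, 0 otherwise. The result only
     * answers equal/not-equal; it imposes no ordering. */
    int tag_equal(const unsigned char *expect, const unsigned char *got,
                  size_t len)
    {
        return CRYPTO_memcmp(expect, got, len) == 0;
    }
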
diff --git a/main/openssl/crypto/cryptlib.h b/main/openssl/crypto/cryptlib.h
index fc249c57..d26f9630 100644
--- a/main/openssl/crypto/cryptlib.h
+++ b/main/openssl/crypto/cryptlib.h
@@ -99,8 +99,8 @@ extern "C" {
#define HEX_SIZE(type) (sizeof(type)*2)
void OPENSSL_cpuid_setup(void);
-extern unsigned long OPENSSL_ia32cap_P;
-void OPENSSL_showfatal(const char *,...);
+extern unsigned int OPENSSL_ia32cap_P[];
+void OPENSSL_showfatal(const char *fmta,...);
void *OPENSSL_stderr(void);
extern int OPENSSL_NONPIC_relocated;
diff --git a/main/openssl/crypto/crypto.h b/main/openssl/crypto/crypto.h
index b0360cec..f92fc518 100644
--- a/main/openssl/crypto/crypto.h
+++ b/main/openssl/crypto/crypto.h
@@ -488,10 +488,10 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
long (**go)(void));
void *CRYPTO_malloc_locked(int num, const char *file, int line);
-void CRYPTO_free_locked(void *);
+void CRYPTO_free_locked(void *ptr);
void *CRYPTO_malloc(int num, const char *file, int line);
char *CRYPTO_strdup(const char *str, const char *file, int line);
-void CRYPTO_free(void *);
+void CRYPTO_free(void *ptr);
void *CRYPTO_realloc(void *addr,int num, const char *file, int line);
void *CRYPTO_realloc_clean(void *addr,int old_num,int num,const char *file,
int line);
@@ -547,6 +547,40 @@ unsigned long *OPENSSL_ia32cap_loc(void);
#define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
int OPENSSL_isservice(void);
+int FIPS_mode(void);
+int FIPS_mode_set(int r);
+
+void OPENSSL_init(void);
+
+#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
+
+#ifdef OPENSSL_FIPS
+#define fips_md_init_ctx(alg, cx) \
+ int alg##_Init(cx##_CTX *c) \
+ { \
+ if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+ "Low level API call to digest " #alg " forbidden in FIPS mode!"); \
+ return private_##alg##_Init(c); \
+ } \
+ int private_##alg##_Init(cx##_CTX *c)
+
+#define fips_cipher_abort(alg) \
+ if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+ "Low level API call to cipher " #alg " forbidden in FIPS mode!")
+
+#else
+#define fips_md_init_ctx(alg, cx) \
+ int alg##_Init(cx##_CTX *c)
+#define fips_cipher_abort(alg) while(0)
+#endif
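
Spelled out for one digest, the expansion these macros produce in an OPENSSL_FIPS build; this is just the preprocessor output of fips_md_init(MD5), not new code:

    int MD5_Init(MD5_CTX *c)
    {
        if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__,
            "Low level API call to digest MD5 forbidden in FIPS mode!");
        return private_MD5_Init(c);
    }
    int private_MD5_Init(MD5_CTX *c)
    /* ...the function body in md5_dgst.c attaches here, so the real
     * implementation becomes private_MD5_Init and the public entry
     * point gains the FIPS guard. */
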
+
+/* CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. It
+ * takes an amount of time dependent on |len|, but independent of the contents
+ * of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a
+ * defined order, because the return value when a != b is undefined other
+ * than being non-zero. */
+int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -562,11 +596,13 @@ void ERR_load_CRYPTO_strings(void);
#define CRYPTO_F_CRYPTO_SET_EX_DATA 102
#define CRYPTO_F_DEF_ADD_INDEX 104
#define CRYPTO_F_DEF_GET_CLASS 105
+#define CRYPTO_F_FIPS_MODE_SET 109
#define CRYPTO_F_INT_DUP_EX_DATA 106
#define CRYPTO_F_INT_FREE_EX_DATA 107
#define CRYPTO_F_INT_NEW_EX_DATA 108
/* Reason codes. */
+#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED 101
#define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK 100
#ifdef __cplusplus
diff --git a/main/openssl/crypto/des/asm/crypt586.S b/main/openssl/crypto/des/asm/crypt586.S
new file mode 100644
index 00000000..fb321ba9
--- /dev/null
+++ b/main/openssl/crypto/des/asm/crypt586.S
@@ -0,0 +1,879 @@
+.file "crypt586.s"
+.text
+.globl fcrypt_body
+.type fcrypt_body,@function
+.align 16
+fcrypt_body:
+.L_fcrypt_body_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+
+ xorl %edi,%edi
+ xorl %esi,%esi
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %edx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
+ movl DES_SPtrans@GOT(%edx),%edx
+ pushl %edx
+ movl 28(%esp),%ebp
+ pushl $25
+.L001start:
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl (%ebp),%ebx
+ xorl %ebx,%eax
+ movl 4(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 8(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 12(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 16(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 20(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 24(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 28(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 32(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 36(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 40(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 44(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 48(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 52(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 56(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 60(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 64(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 68(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 72(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 76(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 80(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 84(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 88(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 92(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 96(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 100(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 104(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 108(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %esi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %esi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 112(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 116(%ebp),%ecx
+ xorl %esi,%eax
+ xorl %esi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%edi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%edi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%edi
+ movl 32(%esp),%ebp
+
+
+ movl 36(%esp),%eax
+ movl %edi,%edx
+ shrl $16,%edx
+ movl 40(%esp),%ecx
+ xorl %edi,%edx
+ andl %edx,%eax
+ andl %ecx,%edx
+ movl %eax,%ebx
+ shll $16,%ebx
+ movl %edx,%ecx
+ shll $16,%ecx
+ xorl %ebx,%eax
+ xorl %ecx,%edx
+ movl 120(%ebp),%ebx
+ xorl %ebx,%eax
+ movl 124(%ebp),%ecx
+ xorl %edi,%eax
+ xorl %edi,%edx
+ xorl %ecx,%edx
+ andl $0xfcfcfcfc,%eax
+ xorl %ebx,%ebx
+ andl $0xcfcfcfcf,%edx
+ xorl %ecx,%ecx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ movl 4(%esp),%ebp
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ movl 0x600(%ebp,%ebx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x700(%ebp,%ecx,1),%ebx
+ xorl %ebx,%esi
+ movl 0x400(%ebp,%eax,1),%ebx
+ xorl %ebx,%esi
+ movl 0x500(%ebp,%edx,1),%ebx
+ xorl %ebx,%esi
+ movl 32(%esp),%ebp
+ movl (%esp),%ebx
+ movl %edi,%eax
+ decl %ebx
+ movl %esi,%edi
+ movl %eax,%esi
+ movl %ebx,(%esp)
+ jnz .L001start
+
+
+ movl 28(%esp),%edx
+ rorl $1,%edi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0xaaaaaaaa,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $23,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $10,%esi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0x33333333,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $18,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xfff0000f,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ roll $12,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xf0f0f0f0,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%edx)
+ movl %edi,4(%edx)
+ addl $8,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size fcrypt_body,.-.L_fcrypt_body_begin
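
crypt586.S is perlasm output: fcrypt_body is the crypt(3) core, sixteen unrolled DES rounds per pass, looped 25 times (the pushl $25 counter above). Each repeated block is one round done as eight S-box lookups in DES_SPtrans (the tables indexed at offsets 0x000-0x700). Roughly the following C, modelled on the D_ENCRYPT macro in OpenSSL's des_locl.h; the salt-dependent bit swap that crypt adds (the 36(%esp)/40(%esp) masks) is omitted here:

    typedef unsigned int u32;
    #define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

    /* One DES round: mix R with two round-key words, cut into 6-bit
     * indices, XOR eight SP-table words into L. The asm's 0xfcfcfcfc /
     * 0xcfcfcfcf masks let it index by whole bytes; the shift-and-0x3f
     * form below is equivalent. */
    static void des_round(u32 *L, u32 R, const u32 k[2],
                          const u32 SP[8][64])
    {
        u32 u = R ^ k[0];
        u32 t = ROTR(R ^ k[1], 4);

        *L ^= SP[0][(u >>  2) & 0x3f] ^ SP[2][(u >> 10) & 0x3f] ^
              SP[4][(u >> 18) & 0x3f] ^ SP[6][(u >> 26) & 0x3f] ^
              SP[1][(t >>  2) & 0x3f] ^ SP[3][(t >> 10) & 0x3f] ^
              SP[5][(t >> 18) & 0x3f] ^ SP[7][(t >> 26) & 0x3f];
    }
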
diff --git a/main/openssl/crypto/des/asm/des-586.S b/main/openssl/crypto/des/asm/des-586.S
new file mode 100644
index 00000000..2fbd340d
--- /dev/null
+++ b/main/openssl/crypto/des/asm/des-586.S
@@ -0,0 +1,1837 @@
+.file "des-586.s"
+.text
+.globl DES_SPtrans
+.type _x86_DES_encrypt,@function
+.align 16
+_x86_DES_encrypt:
+ pushl %ecx
+
+ movl (%ecx),%eax
+ xorl %ebx,%ebx
+ movl 4(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 8(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 12(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 16(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 20(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 24(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 28(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 32(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 36(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 40(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 44(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 48(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 52(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 56(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 60(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 64(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 68(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 72(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 76(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 80(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 84(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 88(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 92(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 96(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 100(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 104(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 108(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 112(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 116(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 120(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 124(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+ addl $4,%esp
+ ret
+.size _x86_DES_encrypt,.-_x86_DES_encrypt
+.type _x86_DES_decrypt,@function
+.align 16
+_x86_DES_decrypt:
+ pushl %ecx
+
+ movl 120(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 124(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 112(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 116(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 104(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 108(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 96(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 100(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 88(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 92(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 80(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 84(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 72(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 76(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 64(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 68(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 56(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 60(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 48(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 52(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 40(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 44(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 32(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 36(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 24(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 28(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl 16(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 20(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+
+ movl 8(%ecx),%eax
+ xorl %ebx,%ebx
+ movl 12(%ecx),%edx
+ xorl %esi,%eax
+ xorl %ecx,%ecx
+ xorl %esi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%edi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%edi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%edi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%edi
+ xorl 0x700(%ebp,%ecx,1),%edi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%edi
+ xorl 0x500(%ebp,%edx,1),%edi
+
+ movl (%ecx),%eax
+ xorl %ebx,%ebx
+ movl 4(%ecx),%edx
+ xorl %edi,%eax
+ xorl %ecx,%ecx
+ xorl %edi,%edx
+ andl $0xfcfcfcfc,%eax
+ andl $0xcfcfcfcf,%edx
+ movb %al,%bl
+ movb %ah,%cl
+ rorl $4,%edx
+ xorl (%ebp,%ebx,1),%esi
+ movb %dl,%bl
+ xorl 0x200(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ shrl $16,%eax
+ xorl 0x100(%ebp,%ebx,1),%esi
+ movb %ah,%bl
+ shrl $16,%edx
+ xorl 0x300(%ebp,%ecx,1),%esi
+ movb %dh,%cl
+ andl $0xff,%eax
+ andl $0xff,%edx
+ xorl 0x600(%ebp,%ebx,1),%esi
+ xorl 0x700(%ebp,%ecx,1),%esi
+ movl (%esp),%ecx
+ xorl 0x400(%ebp,%eax,1),%esi
+ xorl 0x500(%ebp,%edx,1),%esi
+ addl $4,%esp
+ ret
+.size _x86_DES_decrypt,.-_x86_DES_decrypt
+.globl DES_encrypt1
+.type DES_encrypt1,@function
+.align 16
+DES_encrypt1:
+.L_DES_encrypt1_begin:
+ pushl %esi
+ pushl %edi
+
+
+ movl 12(%esp),%esi
+ xorl %ecx,%ecx
+ pushl %ebx
+ pushl %ebp
+ movl (%esi),%eax
+ movl 28(%esp),%ebx
+ movl 4(%esi),%edi
+
+
+ roll $4,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0xf0f0f0f0,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $20,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xfff0000f,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $14,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x33333333,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $22,%esi
+ movl %esi,%eax
+ xorl %edi,%esi
+ andl $0x03fc03fc,%esi
+ xorl %esi,%eax
+ xorl %esi,%edi
+
+ roll $9,%eax
+ movl %eax,%esi
+ xorl %edi,%eax
+ andl $0xaaaaaaaa,%eax
+ xorl %eax,%esi
+ xorl %eax,%edi
+
+ roll $1,%edi
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal DES_SPtrans-.L000pic_point(%ebp),%ebp
+ movl 24(%esp),%ecx
+ cmpl $0,%ebx
+ je .L001decrypt
+ call _x86_DES_encrypt
+ jmp .L002done
+.L001decrypt:
+ call _x86_DES_decrypt
+.L002done:
+
+
+ movl 20(%esp),%edx
+ rorl $1,%esi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%edx)
+ movl %esi,4(%edx)
+ popl %ebp
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+.size DES_encrypt1,.-.L_DES_encrypt1_begin
+.globl DES_encrypt2
+.type DES_encrypt2,@function
+.align 16
+DES_encrypt2:
+.L_DES_encrypt2_begin:
+ pushl %esi
+ pushl %edi
+
+
+ movl 12(%esp),%eax
+ xorl %ecx,%ecx
+ pushl %ebx
+ pushl %ebp
+ movl (%eax),%esi
+ movl 28(%esp),%ebx
+ roll $3,%esi
+ movl 4(%eax),%edi
+ roll $3,%edi
+ call .L003pic_point
+.L003pic_point:
+ popl %ebp
+ leal DES_SPtrans-.L003pic_point(%ebp),%ebp
+ movl 24(%esp),%ecx
+ cmpl $0,%ebx
+ je .L004decrypt
+ call _x86_DES_encrypt
+ jmp .L005done
+.L004decrypt:
+ call _x86_DES_decrypt
+.L005done:
+
+
+ rorl $3,%edi
+ movl 20(%esp),%eax
+ rorl $3,%esi
+ movl %edi,(%eax)
+ movl %esi,4(%eax)
+ popl %ebp
+ popl %ebx
+ popl %edi
+ popl %esi
+ ret
+.size DES_encrypt2,.-.L_DES_encrypt2_begin
+.globl DES_encrypt3
+.type DES_encrypt3,@function
+.align 16
+DES_encrypt3:
+.L_DES_encrypt3_begin:
+ pushl %ebx
+ movl 8(%esp),%ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ subl $12,%esp
+
+
+ roll $4,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ roll $20,%esi
+ movl %esi,%edi
+ xorl %edx,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%edx
+
+ roll $14,%edi
+ movl %edi,%esi
+ xorl %edx,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%esi
+ xorl %edi,%edx
+
+ roll $22,%edx
+ movl %edx,%edi
+ xorl %esi,%edx
+ andl $0x03fc03fc,%edx
+ xorl %edx,%edi
+ xorl %edx,%esi
+
+ roll $9,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ rorl $3,%edx
+ rorl $2,%esi
+ movl %esi,4(%ebx)
+ movl 36(%esp),%eax
+ movl %edx,(%ebx)
+ movl 40(%esp),%edi
+ movl 44(%esp),%esi
+ movl $1,8(%esp)
+ movl %eax,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $0,8(%esp)
+ movl %edi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $1,8(%esp)
+ movl %esi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ addl $12,%esp
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+
+
+ roll $2,%esi
+ roll $3,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%ebx)
+ movl %esi,4(%ebx)
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+.size DES_encrypt3,.-.L_DES_encrypt3_begin
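
DES_encrypt3 above is the EDE driver: the $1 / $0 / $1 flags pushed before the three .L_DES_encrypt2_begin calls select encrypt, decrypt, encrypt with the three key schedules, the same shape as the portable C path:

    /* Triple DES core as the asm sequences it; the initial and final
     * permutations are done once, outside these calls. */
    DES_encrypt2(data, ks1, DES_ENCRYPT);
    DES_encrypt2(data, ks2, DES_DECRYPT);
    DES_encrypt2(data, ks3, DES_ENCRYPT);
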
+.globl DES_decrypt3
+.type DES_decrypt3,@function
+.align 16
+DES_decrypt3:
+.L_DES_decrypt3_begin:
+ pushl %ebx
+ movl 8(%esp),%ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+
+
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+ subl $12,%esp
+
+
+ roll $4,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ roll $20,%esi
+ movl %esi,%edi
+ xorl %edx,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%edx
+
+ roll $14,%edi
+ movl %edi,%esi
+ xorl %edx,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%esi
+ xorl %edi,%edx
+
+ roll $22,%edx
+ movl %edx,%edi
+ xorl %esi,%edx
+ andl $0x03fc03fc,%edx
+ xorl %edx,%edi
+ xorl %edx,%esi
+
+ roll $9,%edi
+ movl %edi,%edx
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%edx
+ xorl %edi,%esi
+
+ rorl $3,%edx
+ rorl $2,%esi
+ movl %esi,4(%ebx)
+ movl 36(%esp),%esi
+ movl %edx,(%ebx)
+ movl 40(%esp),%edi
+ movl 44(%esp),%eax
+ movl $0,8(%esp)
+ movl %eax,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $1,8(%esp)
+ movl %edi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ movl $0,8(%esp)
+ movl %esi,4(%esp)
+ movl %ebx,(%esp)
+ call .L_DES_encrypt2_begin
+ addl $12,%esp
+ movl (%ebx),%edi
+ movl 4(%ebx),%esi
+
+
+ roll $2,%esi
+ roll $3,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0xaaaaaaaa,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $23,%eax
+ movl %eax,%edi
+ xorl %esi,%eax
+ andl $0x03fc03fc,%eax
+ xorl %eax,%edi
+ xorl %eax,%esi
+
+ roll $10,%edi
+ movl %edi,%eax
+ xorl %esi,%edi
+ andl $0x33333333,%edi
+ xorl %edi,%eax
+ xorl %edi,%esi
+
+ roll $18,%esi
+ movl %esi,%edi
+ xorl %eax,%esi
+ andl $0xfff0000f,%esi
+ xorl %esi,%edi
+ xorl %esi,%eax
+
+ roll $12,%edi
+ movl %edi,%esi
+ xorl %eax,%edi
+ andl $0xf0f0f0f0,%edi
+ xorl %edi,%esi
+ xorl %edi,%eax
+
+ rorl $4,%eax
+ movl %eax,(%ebx)
+ movl %esi,4(%ebx)
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+.size DES_decrypt3,.-.L_DES_decrypt3_begin
+.globl DES_ncbc_encrypt
+.type DES_ncbc_encrypt,@function
+.align 16
+DES_ncbc_encrypt:
+.L_DES_ncbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 36(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 56(%esp),%ecx
+
+ pushl %ecx
+
+ movl 52(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L006decrypt
+ andl $4294967288,%ebp
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ jz .L007encrypt_finish
+.L008encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L008encrypt_loop
+.L007encrypt_finish:
+ movl 56(%esp),%ebp
+ andl $7,%ebp
+ jz .L009finish
+ call .L010PIC_point
+.L010PIC_point:
+ popl %edx
+ leal .L011cbc_enc_jmp_table-.L010PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L012ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L013ej6:
+ movb 5(%esi),%dh
+.L014ej5:
+ movb 4(%esi),%dl
+.L015ej4:
+ movl (%esi),%ecx
+ jmp .L016ejend
+.L017ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L018ej2:
+ movb 1(%esi),%ch
+.L019ej1:
+ movb (%esi),%cl
+.L016ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L009finish
+.L006decrypt:
+ andl $4294967288,%ebp
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ jz .L020decrypt_finish
+.L021decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl 20(%esp),%ecx
+ movl 24(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,20(%esp)
+ movl %ebx,24(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L021decrypt_loop
+.L020decrypt_finish:
+ movl 56(%esp),%ebp
+ andl $7,%ebp
+ jz .L009finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,12(%esp)
+ movl %ebx,16(%esp)
+ call .L_DES_encrypt1_begin
+ movl 12(%esp),%eax
+ movl 16(%esp),%ebx
+ movl 20(%esp),%ecx
+ movl 24(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L022dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L023dj6:
+ movb %dh,5(%edi)
+.L024dj5:
+ movb %dl,4(%edi)
+.L025dj4:
+ movl %ecx,(%edi)
+ jmp .L026djend
+.L027dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L028dj2:
+ movb %ch,1(%esi)
+.L029dj1:
+ movb %cl,(%esi)
+.L026djend:
+ jmp .L009finish
+.L009finish:
+ movl 64(%esp),%ecx
+ addl $28,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L011cbc_enc_jmp_table:
+.long 0
+.long .L019ej1-.L010PIC_point
+.long .L018ej2-.L010PIC_point
+.long .L017ej3-.L010PIC_point
+.long .L015ej4-.L010PIC_point
+.long .L014ej5-.L010PIC_point
+.long .L013ej6-.L010PIC_point
+.long .L012ej7-.L010PIC_point
+.align 64
+.size DES_ncbc_encrypt,.-.L_DES_ncbc_encrypt_begin
+.globl DES_ede3_cbc_encrypt
+.type DES_ede3_cbc_encrypt,@function
+.align 16
+DES_ede3_cbc_encrypt:
+.L_DES_ede3_cbc_encrypt_begin:
+
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 28(%esp),%ebp
+
+ movl 44(%esp),%ebx
+ movl (%ebx),%esi
+ movl 4(%ebx),%edi
+ pushl %edi
+ pushl %esi
+ pushl %edi
+ pushl %esi
+ movl %esp,%ebx
+ movl 36(%esp),%esi
+ movl 40(%esp),%edi
+
+ movl 64(%esp),%ecx
+
+ movl 56(%esp),%eax
+ pushl %eax
+
+ movl 56(%esp),%eax
+ pushl %eax
+
+ movl 56(%esp),%eax
+ pushl %eax
+ pushl %ebx
+ cmpl $0,%ecx
+ jz .L030decrypt
+ andl $4294967288,%ebp
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ jz .L031encrypt_finish
+.L032encrypt_loop:
+ movl (%esi),%ecx
+ movl 4(%esi),%edx
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_encrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L032encrypt_loop
+.L031encrypt_finish:
+ movl 60(%esp),%ebp
+ andl $7,%ebp
+ jz .L033finish
+ call .L034PIC_point
+.L034PIC_point:
+ popl %edx
+ leal .L035cbc_enc_jmp_table-.L034PIC_point(%edx),%ecx
+ movl (%ecx,%ebp,4),%ebp
+ addl %edx,%ebp
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ jmp *%ebp
+.L036ej7:
+ movb 6(%esi),%dh
+ shll $8,%edx
+.L037ej6:
+ movb 5(%esi),%dh
+.L038ej5:
+ movb 4(%esi),%dl
+.L039ej4:
+ movl (%esi),%ecx
+ jmp .L040ejend
+.L041ej3:
+ movb 2(%esi),%ch
+ shll $8,%ecx
+.L042ej2:
+ movb 1(%esi),%ch
+.L043ej1:
+ movb (%esi),%cl
+.L040ejend:
+ xorl %ecx,%eax
+ xorl %edx,%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_encrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ jmp .L033finish
+.L030decrypt:
+ andl $4294967288,%ebp
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ jz .L044decrypt_finish
+.L045decrypt_loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_decrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %ecx,(%edi)
+ movl %edx,4(%edi)
+ movl %eax,24(%esp)
+ movl %ebx,28(%esp)
+ addl $8,%esi
+ addl $8,%edi
+ subl $8,%ebp
+ jnz .L045decrypt_loop
+.L044decrypt_finish:
+ movl 60(%esp),%ebp
+ andl $7,%ebp
+ jz .L033finish
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ call .L_DES_decrypt3_begin
+ movl 16(%esp),%eax
+ movl 20(%esp),%ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edx
+ xorl %eax,%ecx
+ xorl %ebx,%edx
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+.L046dj7:
+ rorl $16,%edx
+ movb %dl,6(%edi)
+ shrl $16,%edx
+.L047dj6:
+ movb %dh,5(%edi)
+.L048dj5:
+ movb %dl,4(%edi)
+.L049dj4:
+ movl %ecx,(%edi)
+ jmp .L050djend
+.L051dj3:
+ rorl $16,%ecx
+ movb %cl,2(%edi)
+ shll $16,%ecx
+.L052dj2:
+ movb %ch,1(%esi)
+.L053dj1:
+ movb %cl,(%esi)
+.L050djend:
+ jmp .L033finish
+.L033finish:
+ movl 76(%esp),%ecx
+ addl $32,%esp
+ movl %eax,(%ecx)
+ movl %ebx,4(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L035cbc_enc_jmp_table:
+.long 0
+.long .L043ej1-.L034PIC_point
+.long .L042ej2-.L034PIC_point
+.long .L041ej3-.L034PIC_point
+.long .L039ej4-.L034PIC_point
+.long .L038ej5-.L034PIC_point
+.long .L037ej6-.L034PIC_point
+.long .L036ej7-.L034PIC_point
+.align 64
+.size DES_ede3_cbc_encrypt,.-.L_DES_ede3_cbc_encrypt_begin
+.align 64
+DES_SPtrans:
+.long 34080768,524288,33554434,34080770
+.long 33554432,526338,524290,33554434
+.long 526338,34080768,34078720,2050
+.long 33556482,33554432,0,524290
+.long 524288,2,33556480,526336
+.long 34080770,34078720,2050,33556480
+.long 2,2048,526336,34078722
+.long 2048,33556482,34078722,0
+.long 0,34080770,33556480,524290
+.long 34080768,524288,2050,33556480
+.long 34078722,2048,526336,33554434
+.long 526338,2,33554434,34078720
+.long 34080770,526336,34078720,33556482
+.long 33554432,2050,524290,0
+.long 524288,33554432,33556482,34080768
+.long 2,34078722,2048,526338
+.long 1074823184,0,1081344,1074790400
+.long 1073741840,32784,1073774592,1081344
+.long 32768,1074790416,16,1073774592
+.long 1048592,1074823168,1074790400,16
+.long 1048576,1073774608,1074790416,32768
+.long 1081360,1073741824,0,1048592
+.long 1073774608,1081360,1074823168,1073741840
+.long 1073741824,1048576,32784,1074823184
+.long 1048592,1074823168,1073774592,1081360
+.long 1074823184,1048592,1073741840,0
+.long 1073741824,32784,1048576,1074790416
+.long 32768,1073741824,1081360,1073774608
+.long 1074823168,32768,0,1073741840
+.long 16,1074823184,1081344,1074790400
+.long 1074790416,1048576,32784,1073774592
+.long 1073774608,16,1074790400,1081344
+.long 67108865,67371264,256,67109121
+.long 262145,67108864,67109121,262400
+.long 67109120,262144,67371008,1
+.long 67371265,257,1,67371009
+.long 0,262145,67371264,256
+.long 257,67371265,262144,67108865
+.long 67371009,67109120,262401,67371008
+.long 262400,0,67108864,262401
+.long 67371264,256,1,262144
+.long 257,262145,67371008,67109121
+.long 0,67371264,262400,67371009
+.long 262145,67108864,67371265,1
+.long 262401,67108865,67108864,67371265
+.long 262144,67109120,67109121,262400
+.long 67109120,0,67371009,257
+.long 67108865,262401,256,67371008
+.long 4198408,268439552,8,272633864
+.long 0,272629760,268439560,4194312
+.long 272633856,268435464,268435456,4104
+.long 268435464,4198408,4194304,268435456
+.long 272629768,4198400,4096,8
+.long 4198400,268439560,272629760,4096
+.long 4104,0,4194312,272633856
+.long 268439552,272629768,272633864,4194304
+.long 272629768,4104,4194304,268435464
+.long 4198400,268439552,8,272629760
+.long 268439560,0,4096,4194312
+.long 0,272629768,272633856,4096
+.long 268435456,272633864,4198408,4194304
+.long 272633864,8,268439552,4198408
+.long 4194312,4198400,272629760,268439560
+.long 4104,268435456,268435464,272633856
+.long 134217728,65536,1024,134284320
+.long 134283296,134218752,66592,134283264
+.long 65536,32,134217760,66560
+.long 134218784,134283296,134284288,0
+.long 66560,134217728,65568,1056
+.long 134218752,66592,0,134217760
+.long 32,134218784,134284320,65568
+.long 134283264,1024,1056,134284288
+.long 134284288,134218784,65568,134283264
+.long 65536,32,134217760,134218752
+.long 134217728,66560,134284320,0
+.long 66592,134217728,1024,65568
+.long 134218784,1024,0,134284320
+.long 134283296,134284288,1056,65536
+.long 66560,134283296,134218752,1056
+.long 32,66592,134283264,134217760
+.long 2147483712,2097216,0,2149588992
+.long 2097216,8192,2147491904,2097152
+.long 8256,2149589056,2105344,2147483648
+.long 2147491840,2147483712,2149580800,2105408
+.long 2097152,2147491904,2149580864,0
+.long 8192,64,2149588992,2149580864
+.long 2149589056,2149580800,2147483648,8256
+.long 64,2105344,2105408,2147491840
+.long 8256,2147483648,2147491840,2105408
+.long 2149588992,2097216,0,2147491840
+.long 2147483648,8192,2149580864,2097152
+.long 2097216,2149589056,2105344,64
+.long 2149589056,2105344,2097152,2147491904
+.long 2147483712,2149580800,2105408,0
+.long 8192,2147483712,2147491904,2149588992
+.long 2149580800,8256,64,2149580864
+.long 16384,512,16777728,16777220
+.long 16794116,16388,16896,0
+.long 16777216,16777732,516,16793600
+.long 4,16794112,16793600,516
+.long 16777732,16384,16388,16794116
+.long 0,16777728,16777220,16896
+.long 16793604,16900,16794112,4
+.long 16900,16793604,512,16777216
+.long 16900,16793600,16793604,516
+.long 16384,512,16777216,16793604
+.long 16777732,16900,16896,0
+.long 512,16777220,4,16777728
+.long 0,16777732,16777728,16896
+.long 516,16384,16794116,16777216
+.long 16794112,4,16388,16794116
+.long 16777220,16794112,16793600,16388
+.long 545259648,545390592,131200,0
+.long 537001984,8388736,545259520,545390720
+.long 128,536870912,8519680,131200
+.long 8519808,537002112,536871040,545259520
+.long 131072,8519808,8388736,537001984
+.long 545390720,536871040,0,8519680
+.long 536870912,8388608,537002112,545259648
+.long 8388608,131072,545390592,128
+.long 8388608,131072,536871040,545390720
+.long 131200,536870912,0,8519680
+.long 545259648,537002112,537001984,8388736
+.long 545390592,128,8388736,537001984
+.long 545390720,8388608,545259520,536871040
+.long 8519680,131200,537002112,545259520
+.long 128,545390592,8519808,0
+.long 536870912,545259648,131072,8519808
diff --git a/main/openssl/crypto/des/des.h b/main/openssl/crypto/des/des.h
index 92b66635..1eaedcbd 100644
--- a/main/openssl/crypto/des/des.h
+++ b/main/openssl/crypto/des/des.h
@@ -224,6 +224,9 @@ int DES_set_key(const_DES_cblock *key,DES_key_schedule *schedule);
int DES_key_sched(const_DES_cblock *key,DES_key_schedule *schedule);
int DES_set_key_checked(const_DES_cblock *key,DES_key_schedule *schedule);
void DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#ifdef OPENSSL_FIPS
+void private_DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#endif
void DES_string_to_key(const char *str,DES_cblock *key);
void DES_string_to_2keys(const char *str,DES_cblock *key1,DES_cblock *key2);
void DES_cfb64_encrypt(const unsigned char *in,unsigned char *out,long length,
diff --git a/main/openssl/crypto/des/set_key.c b/main/openssl/crypto/des/set_key.c
index 3004cc3a..da4d62e1 100644
--- a/main/openssl/crypto/des/set_key.c
+++ b/main/openssl/crypto/des/set_key.c
@@ -63,6 +63,7 @@
* 1.1 added norm_expand_bits
* 1.0 First working version
*/
+#include <openssl/crypto.h>
#include "des_locl.h"
OPENSSL_IMPLEMENT_GLOBAL(int,DES_check_key,0) /* defaults to false */
@@ -335,6 +336,13 @@ int DES_set_key_checked(const_DES_cblock *key, DES_key_schedule *schedule)
}
void DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(DES);
+ private_DES_set_key_unchecked(key, schedule);
+ }
+void private_DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#endif
{
static const int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0};
register DES_LONG c,d,t,s,t2;
diff --git a/main/openssl/crypto/des/str2key.c b/main/openssl/crypto/des/str2key.c
index 9c2054bd..1077f99d 100644
--- a/main/openssl/crypto/des/str2key.c
+++ b/main/openssl/crypto/des/str2key.c
@@ -56,8 +56,8 @@
* [including the GNU Public Licence.]
*/
-#include "des_locl.h"
#include <openssl/crypto.h>
+#include "des_locl.h"
void DES_string_to_key(const char *str, DES_cblock *key)
{
diff --git a/main/openssl/crypto/dh/dh.h b/main/openssl/crypto/dh/dh.h
index 849309a4..ea59e610 100644
--- a/main/openssl/crypto/dh/dh.h
+++ b/main/openssl/crypto/dh/dh.h
@@ -86,6 +86,21 @@
* be used for all exponents.
*/
+/* If this flag is set, the DH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods, it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define DH_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set, the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DH_FLAG_NON_FIPS_ALLOW 0x0400
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -230,6 +245,9 @@ void ERR_load_DH_strings(void);
#define DH_F_COMPUTE_KEY 102
#define DH_F_DHPARAMS_PRINT_FP 101
#define DH_F_DH_BUILTIN_GENPARAMS 106
+#define DH_F_DH_COMPUTE_KEY 114
+#define DH_F_DH_GENERATE_KEY 115
+#define DH_F_DH_GENERATE_PARAMETERS_EX 116
#define DH_F_DH_NEW_METHOD 105
#define DH_F_DH_PARAM_DECODE 107
#define DH_F_DH_PRIV_DECODE 110
@@ -249,7 +267,9 @@ void ERR_load_DH_strings(void);
#define DH_R_DECODE_ERROR 104
#define DH_R_INVALID_PUBKEY 102
#define DH_R_KEYS_NOT_SET 108
+#define DH_R_KEY_SIZE_TOO_SMALL 110
#define DH_R_MODULUS_TOO_LARGE 103
+#define DH_R_NON_FIPS_METHOD 111
#define DH_R_NO_PARAMETERS_SET 107
#define DH_R_NO_PRIVATE_VALUE 100
#define DH_R_PARAMETER_ENCODING_ERROR 105
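
A minimal usage sketch of the DH_FLAG_NON_FIPS_ALLOW flag added above, assuming a plain OpenSSL 1.0.1 build; the helper name, the 2048-bit size and generator 2 are illustrative only:

#include <openssl/dh.h>

/* Sketch: opt this key out of the FIPS gate before the gated call
 * (here, parameter generation). New DH objects never inherit the
 * flag, per the dh_lib.c masking change below. */
static DH *make_dh_params(void)
	{
	DH *dh = DH_new();
	if (dh == NULL)
		return NULL;
	dh->flags |= DH_FLAG_NON_FIPS_ALLOW;
	if (!DH_generate_parameters_ex(dh, 2048, DH_GENERATOR_2, NULL))
		{
		DH_free(dh);
		return NULL;
		}
	return dh;
	}
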
diff --git a/main/openssl/crypto/dh/dh_ameth.c b/main/openssl/crypto/dh/dh_ameth.c
index 377caf96..02ec2d47 100644
--- a/main/openssl/crypto/dh/dh_ameth.c
+++ b/main/openssl/crypto/dh/dh_ameth.c
@@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth =
dh_copy_parameters,
dh_cmp_parameters,
dh_param_print,
+ 0,
int_dh_free,
0
diff --git a/main/openssl/crypto/dh/dh_err.c b/main/openssl/crypto/dh/dh_err.c
index d5cf0c22..56d3df73 100644
--- a/main/openssl/crypto/dh/dh_err.c
+++ b/main/openssl/crypto/dh/dh_err.c
@@ -1,6 +1,6 @@
/* crypto/dh/dh_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -73,6 +73,9 @@ static ERR_STRING_DATA DH_str_functs[]=
{ERR_FUNC(DH_F_COMPUTE_KEY), "COMPUTE_KEY"},
{ERR_FUNC(DH_F_DHPARAMS_PRINT_FP), "DHparams_print_fp"},
{ERR_FUNC(DH_F_DH_BUILTIN_GENPARAMS), "DH_BUILTIN_GENPARAMS"},
+{ERR_FUNC(DH_F_DH_COMPUTE_KEY), "DH_compute_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_KEY), "DH_generate_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_PARAMETERS_EX), "DH_generate_parameters_ex"},
{ERR_FUNC(DH_F_DH_NEW_METHOD), "DH_new_method"},
{ERR_FUNC(DH_F_DH_PARAM_DECODE), "DH_PARAM_DECODE"},
{ERR_FUNC(DH_F_DH_PRIV_DECODE), "DH_PRIV_DECODE"},
@@ -95,7 +98,9 @@ static ERR_STRING_DATA DH_str_reasons[]=
{ERR_REASON(DH_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(DH_R_INVALID_PUBKEY) ,"invalid public key"},
{ERR_REASON(DH_R_KEYS_NOT_SET) ,"keys not set"},
+{ERR_REASON(DH_R_KEY_SIZE_TOO_SMALL) ,"key size too small"},
{ERR_REASON(DH_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(DH_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(DH_R_NO_PARAMETERS_SET) ,"no parameters set"},
{ERR_REASON(DH_R_NO_PRIVATE_VALUE) ,"no private value"},
{ERR_REASON(DH_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
diff --git a/main/openssl/crypto/dh/dh_gen.c b/main/openssl/crypto/dh/dh_gen.c
index cfd5b118..7b1fe9c9 100644
--- a/main/openssl/crypto/dh/dh_gen.c
+++ b/main/openssl/crypto/dh/dh_gen.c
@@ -66,12 +66,29 @@
#include <openssl/bn.h>
#include <openssl/dh.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
static int dh_builtin_genparams(DH *ret, int prime_len, int generator, BN_GENCB *cb);
int DH_generate_parameters_ex(DH *ret, int prime_len, int generator, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ret->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(ret->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_GENERATE_PARAMETERS_EX, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
if(ret->meth->generate_params)
return ret->meth->generate_params(ret, prime_len, generator, cb);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dh_generate_parameters_ex(ret, prime_len,
+ generator, cb);
+#endif
return dh_builtin_genparams(ret, prime_len, generator, cb);
}
diff --git a/main/openssl/crypto/dh/dh_key.c b/main/openssl/crypto/dh/dh_key.c
index e7db4403..89a74db4 100644
--- a/main/openssl/crypto/dh/dh_key.c
+++ b/main/openssl/crypto/dh/dh_key.c
@@ -73,11 +73,27 @@ static int dh_finish(DH *dh);
int DH_generate_key(DH *dh)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_GENERATE_KEY, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
return dh->meth->generate_key(dh);
}
int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+ && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+ {
+ DHerr(DH_F_DH_COMPUTE_KEY, DH_R_NON_FIPS_METHOD);
+ return 0;
+ }
+#endif
return dh->meth->compute_key(key, pub_key, dh);
}
@@ -138,8 +154,21 @@ static int generate_key(DH *dh)
if (generate_new_key)
{
- l = dh->length ? dh->length : BN_num_bits(dh->p)-1; /* secret exponent length */
- if (!BN_rand(priv_key, l, 0, 0)) goto err;
+ if (dh->q)
+ {
+ do
+ {
+ if (!BN_rand_range(priv_key, dh->q))
+ goto err;
+ }
+ while (BN_is_zero(priv_key) || BN_is_one(priv_key));
+ }
+ else
+ {
+ /* secret exponent length */
+ l = dh->length ? dh->length : BN_num_bits(dh->p)-1;
+ if (!BN_rand(priv_key, l, 0, 0)) goto err;
+ }
}
{
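
The generate_key hunk above draws the private value by rejection sampling whenever a subgroup order q is available; a standalone sketch of that loop (helper name illustrative), where BN_rand_range(r, q) yields 0 <= r < q and the values 0 and 1 are rejected:

#include <openssl/bn.h>

/* Sketch: pick priv_key uniformly from [2, q-1], mirroring the
 * dh_key.c change above. */
static int pick_private_value(BIGNUM *priv_key, const BIGNUM *q)
	{
	do
		{
		if (!BN_rand_range(priv_key, q))
			return 0;
		}
	while (BN_is_zero(priv_key) || BN_is_one(priv_key));
	return 1;
	}
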
diff --git a/main/openssl/crypto/dh/dh_lib.c b/main/openssl/crypto/dh/dh_lib.c
index 7aef080e..00218f2b 100644
--- a/main/openssl/crypto/dh/dh_lib.c
+++ b/main/openssl/crypto/dh/dh_lib.c
@@ -64,6 +64,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char DH_version[]="Diffie-Hellman" OPENSSL_VERSION_PTEXT;
static const DH_METHOD *default_DH_method = NULL;
@@ -76,7 +80,16 @@ void DH_set_default_method(const DH_METHOD *meth)
const DH_METHOD *DH_get_default_method(void)
{
if(!default_DH_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dh_openssl();
+ else
+ return DH_OpenSSL();
+#else
default_DH_method = DH_OpenSSL();
+#endif
+ }
return default_DH_method;
}
@@ -156,7 +169,7 @@ DH *DH_new_method(ENGINE *engine)
ret->counter = NULL;
ret->method_mont_p=NULL;
ret->references = 1;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~DH_FLAG_NON_FIPS_ALLOW;
CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DH, ret, &ret->ex_data);
if ((ret->meth->init != NULL) && !ret->meth->init(ret))
{
diff --git a/main/openssl/crypto/dsa/dsa.h b/main/openssl/crypto/dsa/dsa.h
index ac50a5c8..7531c653 100644
--- a/main/openssl/crypto/dsa/dsa.h
+++ b/main/openssl/crypto/dsa/dsa.h
@@ -96,6 +96,25 @@
* faster variable sliding window method to
* be used for all exponents.
*/
+#define DSA_FLAG_NONCE_FROM_HASH 0x04 /* Causes the DSA nonce to be calculated
+ from SHA512(private_key + H(message) +
+ random). This strengthens DSA against a
+ weak PRNG. */
+
+/* If this flag is set, the DSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods, it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define DSA_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set, the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DSA_FLAG_NON_FIPS_ALLOW 0x0400
#ifdef __cplusplus
extern "C" {
@@ -115,8 +134,9 @@ struct dsa_method
{
const char *name;
DSA_SIG * (*dsa_do_sign)(const unsigned char *dgst, int dlen, DSA *dsa);
- int (*dsa_sign_setup)(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp,
- BIGNUM **rp);
+ int (*dsa_sign_setup)(DSA *dsa, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen);
int (*dsa_do_verify)(const unsigned char *dgst, int dgst_len,
DSA_SIG *sig, DSA *dsa);
int (*dsa_mod_exp)(DSA *dsa, BIGNUM *rr, BIGNUM *a1, BIGNUM *p1,
@@ -272,6 +292,8 @@ void ERR_load_DSA_strings(void);
#define DSA_F_DSAPARAMS_PRINT_FP 101
#define DSA_F_DSA_DO_SIGN 112
#define DSA_F_DSA_DO_VERIFY 113
+#define DSA_F_DSA_GENERATE_KEY 124
+#define DSA_F_DSA_GENERATE_PARAMETERS_EX 123
#define DSA_F_DSA_NEW_METHOD 103
#define DSA_F_DSA_PARAM_DECODE 119
#define DSA_F_DSA_PRINT_FP 105
@@ -282,6 +304,7 @@ void ERR_load_DSA_strings(void);
#define DSA_F_DSA_SIGN 106
#define DSA_F_DSA_SIGN_SETUP 107
#define DSA_F_DSA_SIG_NEW 109
+#define DSA_F_DSA_SIG_PRINT 125
#define DSA_F_DSA_VERIFY 108
#define DSA_F_I2D_DSA_SIG 111
#define DSA_F_OLD_DSA_PRIV_DECODE 122
@@ -298,6 +321,9 @@ void ERR_load_DSA_strings(void);
#define DSA_R_INVALID_DIGEST_TYPE 106
#define DSA_R_MISSING_PARAMETERS 101
#define DSA_R_MODULUS_TOO_LARGE 103
+#define DSA_R_NEED_NEW_SETUP_VALUES 110
+#define DSA_R_NONCE_CANNOT_BE_PRECOMPUTED 112
+#define DSA_R_NON_FIPS_DSA_METHOD 111
#define DSA_R_NO_PARAMETERS_SET 107
#define DSA_R_PARAMETER_ENCODING_ERROR 105
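
A sketch of enabling the hashed-nonce behaviour declared above on a DSA key that already carries parameters and a private key; the helper name and the choice of SHA-1 for the message digest are illustrative:

#include <openssl/dsa.h>
#include <openssl/sha.h>

/* Sketch: with DSA_FLAG_NONCE_FROM_HASH set, the signing nonce k is
 * derived from SHA512(private_key + H(message) + random) instead of
 * the PRNG alone. The type argument of DSA_sign is ignored. */
static int sign_hardened(DSA *dsa, const unsigned char *msg, size_t msg_len,
			 unsigned char *sig, unsigned int *sig_len)
	{
	unsigned char dgst[SHA_DIGEST_LENGTH];

	dsa->flags |= DSA_FLAG_NONCE_FROM_HASH;
	SHA1(msg, msg_len, dgst);
	return DSA_sign(0, dgst, sizeof(dgst), sig, sig_len, dsa);
	}
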
diff --git a/main/openssl/crypto/dsa/dsa_ameth.c b/main/openssl/crypto/dsa/dsa_ameth.c
index 6413aae4..376156ec 100644
--- a/main/openssl/crypto/dsa/dsa_ameth.c
+++ b/main/openssl/crypto/dsa/dsa_ameth.c
@@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder)
return i2d_DSAPrivateKey(pkey->pkey.dsa, pder);
}
+static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+ const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx)
+ {
+ DSA_SIG *dsa_sig;
+ const unsigned char *p;
+ if (!sig)
+ {
+ if (BIO_puts(bp, "\n") <= 0)
+ return 0;
+ else
+ return 1;
+ }
+ p = sig->data;
+ dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length);
+ if (dsa_sig)
+ {
+ int rv = 0;
+ size_t buf_len = 0;
+ unsigned char *m=NULL;
+ update_buflen(dsa_sig->r, &buf_len);
+ update_buflen(dsa_sig->s, &buf_len);
+ m = OPENSSL_malloc(buf_len+10);
+ if (m == NULL)
+ {
+ DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (BIO_write(bp, "\n", 1) != 1)
+ goto err;
+
+ if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent))
+ goto err;
+ if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent))
+ goto err;
+ rv = 1;
+ err:
+ if (m)
+ OPENSSL_free(m);
+ DSA_SIG_free(dsa_sig);
+ return rv;
+ }
+ return X509_signature_dump(bp, sig, indent);
+ }
+
static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
{
switch (op)
@@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] =
dsa_copy_parameters,
dsa_cmp_parameters,
dsa_param_print,
+ dsa_sig_print,
int_dsa_free,
dsa_pkey_ctrl,
diff --git a/main/openssl/crypto/dsa/dsa_asn1.c b/main/openssl/crypto/dsa/dsa_asn1.c
index c37460b2..60585343 100644
--- a/main/openssl/crypto/dsa/dsa_asn1.c
+++ b/main/openssl/crypto/dsa/dsa_asn1.c
@@ -61,6 +61,7 @@
#include <openssl/dsa.h>
#include <openssl/asn1.h>
#include <openssl/asn1t.h>
+#include <openssl/rand.h>
/* Override the default new methods */
static int sig_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -87,7 +88,7 @@ ASN1_SEQUENCE_cb(DSA_SIG, sig_cb) = {
ASN1_SIMPLE(DSA_SIG, s, CBIGNUM)
} ASN1_SEQUENCE_END_cb(DSA_SIG, DSA_SIG)
-IMPLEMENT_ASN1_FUNCTIONS_const(DSA_SIG)
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(DSA_SIG, DSA_SIG, DSA_SIG)
/* Override the default free and new methods */
static int dsa_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -148,3 +149,40 @@ DSA *DSAparams_dup(DSA *dsa)
{
return ASN1_item_dup(ASN1_ITEM_rptr(DSAparams), dsa);
}
+
+int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
+ unsigned int *siglen, DSA *dsa)
+ {
+ DSA_SIG *s;
+ RAND_seed(dgst, dlen);
+ s=DSA_do_sign(dgst,dlen,dsa);
+ if (s == NULL)
+ {
+ *siglen=0;
+ return(0);
+ }
+ *siglen=i2d_DSA_SIG(s,&sig);
+ DSA_SIG_free(s);
+ return(1);
+ }
+
+/* data has already been hashed (probably with SHA or SHA-1). */
+/* returns
+ * 1: correct signature
+ * 0: incorrect signature
+ * -1: error
+ */
+int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
+ const unsigned char *sigbuf, int siglen, DSA *dsa)
+ {
+ DSA_SIG *s;
+ int ret=-1;
+
+ s = DSA_SIG_new();
+ if (s == NULL) return(ret);
+ if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
+ ret=DSA_do_verify(dgst,dgst_len,s,dsa);
+err:
+ DSA_SIG_free(s);
+ return(ret);
+ }
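
The DSA_verify contract above distinguishes a bad signature (0) from an internal error (-1); a caller sketch (helper name illustrative) that keeps the two cases apart:

#include <openssl/dsa.h>

/* Sketch: verify a DER-encoded signature over an already-hashed
 * message; only an explicit 1 from DSA_verify counts as valid. */
static int check_sig(DSA *dsa, const unsigned char *dgst, int dgst_len,
		     const unsigned char *sig, int sig_len)
	{
	int rc = DSA_verify(0, dgst, dgst_len, sig, sig_len, dsa);
	if (rc < 0)
		return 0;	/* decode/bignum failure, not a mere mismatch */
	return rc;
	}
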
diff --git a/main/openssl/crypto/dsa/dsa_err.c b/main/openssl/crypto/dsa/dsa_err.c
index bba984e9..e6171cce 100644
--- a/main/openssl/crypto/dsa/dsa_err.c
+++ b/main/openssl/crypto/dsa/dsa_err.c
@@ -1,6 +1,6 @@
/* crypto/dsa/dsa_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -76,6 +76,8 @@ static ERR_STRING_DATA DSA_str_functs[]=
{ERR_FUNC(DSA_F_DSAPARAMS_PRINT_FP), "DSAparams_print_fp"},
{ERR_FUNC(DSA_F_DSA_DO_SIGN), "DSA_do_sign"},
{ERR_FUNC(DSA_F_DSA_DO_VERIFY), "DSA_do_verify"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_KEY), "DSA_generate_key"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_PARAMETERS_EX), "DSA_generate_parameters_ex"},
{ERR_FUNC(DSA_F_DSA_NEW_METHOD), "DSA_new_method"},
{ERR_FUNC(DSA_F_DSA_PARAM_DECODE), "DSA_PARAM_DECODE"},
{ERR_FUNC(DSA_F_DSA_PRINT_FP), "DSA_print_fp"},
@@ -86,6 +88,7 @@ static ERR_STRING_DATA DSA_str_functs[]=
{ERR_FUNC(DSA_F_DSA_SIGN), "DSA_sign"},
{ERR_FUNC(DSA_F_DSA_SIGN_SETUP), "DSA_sign_setup"},
{ERR_FUNC(DSA_F_DSA_SIG_NEW), "DSA_SIG_new"},
+{ERR_FUNC(DSA_F_DSA_SIG_PRINT), "DSA_SIG_PRINT"},
{ERR_FUNC(DSA_F_DSA_VERIFY), "DSA_verify"},
{ERR_FUNC(DSA_F_I2D_DSA_SIG), "i2d_DSA_SIG"},
{ERR_FUNC(DSA_F_OLD_DSA_PRIV_DECODE), "OLD_DSA_PRIV_DECODE"},
@@ -105,6 +108,9 @@ static ERR_STRING_DATA DSA_str_reasons[]=
{ERR_REASON(DSA_R_INVALID_DIGEST_TYPE) ,"invalid digest type"},
{ERR_REASON(DSA_R_MISSING_PARAMETERS) ,"missing parameters"},
{ERR_REASON(DSA_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(DSA_R_NEED_NEW_SETUP_VALUES) ,"need new setup values"},
+{ERR_REASON(DSA_R_NONCE_CANNOT_BE_PRECOMPUTED),"nonce cannot be precomputed"},
+{ERR_REASON(DSA_R_NON_FIPS_DSA_METHOD) ,"non fips dsa method"},
{ERR_REASON(DSA_R_NO_PARAMETERS_SET) ,"no parameters set"},
{ERR_REASON(DSA_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
{0,NULL}
diff --git a/main/openssl/crypto/dsa/dsa_gen.c b/main/openssl/crypto/dsa/dsa_gen.c
index cb0b4538..c398761d 100644
--- a/main/openssl/crypto/dsa/dsa_gen.c
+++ b/main/openssl/crypto/dsa/dsa_gen.c
@@ -81,13 +81,33 @@
#include <openssl/sha.h>
#include "dsa_locl.h"
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
int DSA_generate_parameters_ex(DSA *ret, int bits,
const unsigned char *seed_in, int seed_len,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ret->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(ret->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_GENERATE_PARAMETERS_EX, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
+ }
+#endif
if(ret->meth->dsa_paramgen)
return ret->meth->dsa_paramgen(ret, bits, seed_in, seed_len,
counter_ret, h_ret, cb);
+#ifdef OPENSSL_FIPS
+ else if (FIPS_mode())
+ {
+ return FIPS_dsa_generate_parameters_ex(ret, bits,
+ seed_in, seed_len,
+ counter_ret, h_ret, cb);
+ }
+#endif
else
{
const EVP_MD *evpmd;
@@ -105,12 +125,13 @@ int DSA_generate_parameters_ex(DSA *ret, int bits,
}
return dsa_builtin_paramgen(ret, bits, qbits, evpmd,
- seed_in, seed_len, counter_ret, h_ret, cb);
+ seed_in, seed_len, NULL, counter_ret, h_ret, cb);
}
}
int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
{
int ok=0;
@@ -201,8 +222,10 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
}
/* step 2 */
- EVP_Digest(seed, qsize, md, NULL, evpmd, NULL);
- EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL);
+ if (!EVP_Digest(seed, qsize, md, NULL, evpmd, NULL))
+ goto err;
+ if (!EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL))
+ goto err;
for (i = 0; i < qsize; i++)
md[i]^=buf2[i];
@@ -251,7 +274,9 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
break;
}
- EVP_Digest(buf, qsize, md ,NULL, evpmd, NULL);
+ if (!EVP_Digest(buf, qsize, md ,NULL, evpmd,
+ NULL))
+ goto err;
/* step 8 */
if (!BN_bin2bn(md, qsize, r0))
@@ -332,6 +357,8 @@ err:
}
if (counter_ret != NULL) *counter_ret=counter;
if (h_ret != NULL) *h_ret=h;
+ if (seed_out)
+ memcpy(seed_out, seed, qsize);
}
if(ctx)
{
diff --git a/main/openssl/crypto/dsa/dsa_key.c b/main/openssl/crypto/dsa/dsa_key.c
index c4aa86bc..9cf669b9 100644
--- a/main/openssl/crypto/dsa/dsa_key.c
+++ b/main/openssl/crypto/dsa/dsa_key.c
@@ -64,12 +64,28 @@
#include <openssl/dsa.h>
#include <openssl/rand.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
static int dsa_builtin_keygen(DSA *dsa);
int DSA_generate_key(DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_GENERATE_KEY, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
+ }
+#endif
if(dsa->meth->dsa_keygen)
return dsa->meth->dsa_keygen(dsa);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dsa_generate_key(dsa);
+#endif
return dsa_builtin_keygen(dsa);
}
diff --git a/main/openssl/crypto/dsa/dsa_lib.c b/main/openssl/crypto/dsa/dsa_lib.c
index e9b75902..96d8d0c4 100644
--- a/main/openssl/crypto/dsa/dsa_lib.c
+++ b/main/openssl/crypto/dsa/dsa_lib.c
@@ -70,6 +70,10 @@
#include <openssl/dh.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char DSA_version[]="DSA" OPENSSL_VERSION_PTEXT;
static const DSA_METHOD *default_DSA_method = NULL;
@@ -82,7 +86,16 @@ void DSA_set_default_method(const DSA_METHOD *meth)
const DSA_METHOD *DSA_get_default_method(void)
{
if(!default_DSA_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_dsa_openssl();
+ else
+ return DSA_OpenSSL();
+#else
default_DSA_method = DSA_OpenSSL();
+#endif
+ }
return default_DSA_method;
}
@@ -163,7 +176,7 @@ DSA *DSA_new_method(ENGINE *engine)
ret->method_mont_p=NULL;
ret->references=1;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~DSA_FLAG_NON_FIPS_ALLOW;
CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DSA, ret, &ret->ex_data);
if ((ret->meth->init != NULL) && !ret->meth->init(ret))
{
@@ -276,7 +289,8 @@ void *DSA_get_ex_data(DSA *d, int idx)
DH *DSA_dup_DH(const DSA *r)
{
/* DSA has p, q, g, optional pub_key, optional priv_key.
- * DH has p, optional length, g, optional pub_key, optional priv_key.
+ * DH has p, optional length, g, optional pub_key, optional priv_key,
+ * optional q.
*/
DH *ret = NULL;
@@ -290,7 +304,11 @@ DH *DSA_dup_DH(const DSA *r)
if ((ret->p = BN_dup(r->p)) == NULL)
goto err;
if (r->q != NULL)
+ {
ret->length = BN_num_bits(r->q);
+ if ((ret->q = BN_dup(r->q)) == NULL)
+ goto err;
+ }
if (r->g != NULL)
if ((ret->g = BN_dup(r->g)) == NULL)
goto err;
diff --git a/main/openssl/crypto/dsa/dsa_locl.h b/main/openssl/crypto/dsa/dsa_locl.h
index 2b8cfee3..21e2e452 100644
--- a/main/openssl/crypto/dsa/dsa_locl.h
+++ b/main/openssl/crypto/dsa/dsa_locl.h
@@ -56,4 +56,5 @@
int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+ unsigned char *seed_out,
int *counter_ret, unsigned long *h_ret, BN_GENCB *cb);
diff --git a/main/openssl/crypto/dsa/dsa_ossl.c b/main/openssl/crypto/dsa/dsa_ossl.c
index a3ddd7d2..177fc545 100644
--- a/main/openssl/crypto/dsa/dsa_ossl.c
+++ b/main/openssl/crypto/dsa/dsa_ossl.c
@@ -67,7 +67,9 @@
#include <openssl/asn1.h>
static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa);
-static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp);
+static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen);
static int dsa_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig,
DSA *dsa);
static int dsa_init(DSA *dsa);
@@ -136,6 +138,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
BN_CTX *ctx=NULL;
int reason=ERR_R_BN_LIB;
DSA_SIG *ret=NULL;
+ int noredo = 0;
BN_init(&m);
BN_init(&xr);
@@ -150,10 +153,11 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
if (s == NULL) goto err;
ctx=BN_CTX_new();
if (ctx == NULL) goto err;
-
+redo:
if ((dsa->kinv == NULL) || (dsa->r == NULL))
{
- if (!DSA_sign_setup(dsa,ctx,&kinv,&r)) goto err;
+ if (!dsa->meth->dsa_sign_setup(dsa,ctx,&kinv,&r,dgst,dlen))
+ goto err;
}
else
{
@@ -161,6 +165,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
dsa->kinv=NULL;
r=dsa->r;
dsa->r=NULL;
+ noredo = 1;
}
@@ -181,6 +186,18 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
ret=DSA_SIG_new();
if (ret == NULL) goto err;
+ /* Redo if r or s is zero as required by FIPS 186-3: this is
+ * very unlikely.
+ */
+ if (BN_is_zero(r) || BN_is_zero(s))
+ {
+ if (noredo)
+ {
+ reason = DSA_R_NEED_NEW_SETUP_VALUES;
+ goto err;
+ }
+ goto redo;
+ }
ret->r = r;
ret->s = s;
@@ -199,7 +216,9 @@ err:
return(ret);
}
-static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
+static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen)
{
BN_CTX *ctx;
BIGNUM k,kq,*K,*kinv=NULL,*r=NULL;
@@ -225,8 +244,21 @@ static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
/* Get random k */
do
- if (!BN_rand_range(&k, dsa->q)) goto err;
- while (BN_is_zero(&k));
+ {
+#ifndef OPENSSL_NO_SHA512
+ if (dsa->flags & DSA_FLAG_NONCE_FROM_HASH)
+ {
+ /* If DSA_FLAG_NONCE_FROM_HASH is set then we calculate k from
+ * SHA512(private_key + H(message) + random). This protects the
+ * private key from a weak PRNG. */
+ if (!BN_generate_dsa_nonce(&k, dsa->q, dsa->priv_key, dgst,
+ dlen, ctx))
+ goto err;
+ }
+ else
+#endif
+ if (!BN_rand_range(&k, dsa->q)) goto err;
+ } while (BN_is_zero(&k));
if ((dsa->flags & DSA_FLAG_NO_EXP_CONSTTIME) == 0)
{
BN_set_flags(&k, BN_FLG_CONSTTIME);
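
The FIPS 186-3 redo logic above makes precomputed (kinv, r) pairs single-use: if r or s comes out zero, cached values cannot be retried and signing fails with DSA_R_NEED_NEW_SETUP_VALUES. A sketch of the precompute pattern that the noredo branch guards, assuming a key without DSA_FLAG_NONCE_FROM_HASH (which DSA_sign_setup now rejects, see dsa_sign.c below); the helper name is illustrative:

#include <openssl/dsa.h>

/* Sketch: stash a precomputed nonce pair on the key; dsa_do_sign
 * consumes dsa->kinv/dsa->r and sets noredo, so a zero r or s is
 * reported rather than silently recomputed. */
static DSA_SIG *sign_with_precompute(DSA *dsa, const unsigned char *dgst,
				     int dlen)
	{
	if (!DSA_sign_setup(dsa, NULL, &dsa->kinv, &dsa->r))
		return NULL;
	return DSA_do_sign(dgst, dlen, dsa);
	}
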
diff --git a/main/openssl/crypto/dsa/dsa_pmeth.c b/main/openssl/crypto/dsa/dsa_pmeth.c
index e2df54fe..715d8d67 100644
--- a/main/openssl/crypto/dsa/dsa_pmeth.c
+++ b/main/openssl/crypto/dsa/dsa_pmeth.c
@@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
EVP_MD_type((const EVP_MD *)p2) != NID_dsa &&
EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
- EVP_MD_type((const EVP_MD *)p2) != NID_sha256)
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_sha512)
{
DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE);
return 0;
@@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
if (!dsa)
return 0;
ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd,
- NULL, 0, NULL, NULL, pcb);
+ NULL, 0, NULL, NULL, NULL, pcb);
if (ret)
EVP_PKEY_assign_DSA(pkey, dsa);
else
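
With SHA-384 and SHA-512 now accepted by pkey_dsa_ctrl, they can be selected through the EVP_PKEY interface; a sketch (helper name illustrative) of signing a precomputed digest with SHA-512 as the declared digest:

#include <openssl/evp.h>

/* Sketch: one-shot DSA signature over a digest via EVP_PKEY_sign;
 * before the hunk above, EVP_PKEY_CTX_set_signature_md would have
 * rejected EVP_sha512() for DSA keys. */
static int dsa_sign_evp(EVP_PKEY *pkey, const unsigned char *dgst,
			size_t dgst_len, unsigned char *sig, size_t *sig_len)
	{
	int ok = 0;
	EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(pkey, NULL);
	if (ctx == NULL)
		return 0;
	if (EVP_PKEY_sign_init(ctx) > 0
	    && EVP_PKEY_CTX_set_signature_md(ctx, EVP_sha512()) > 0
	    && EVP_PKEY_sign(ctx, sig, sig_len, dgst, dgst_len) > 0)
		ok = 1;
	EVP_PKEY_CTX_free(ctx);
	return ok;
	}
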
diff --git a/main/openssl/crypto/dsa/dsa_sign.c b/main/openssl/crypto/dsa/dsa_sign.c
index 17555e58..8ace300a 100644
--- a/main/openssl/crypto/dsa/dsa_sign.c
+++ b/main/openssl/crypto/dsa/dsa_sign.c
@@ -61,30 +61,61 @@
#include "cryptlib.h"
#include <openssl/dsa.h>
#include <openssl/rand.h>
+#include <openssl/bn.h>
DSA_SIG * DSA_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_DO_SIGN, DSA_R_NON_FIPS_DSA_METHOD);
+ return NULL;
+ }
+#endif
return dsa->meth->dsa_do_sign(dgst, dlen, dsa);
}
-int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
- unsigned int *siglen, DSA *dsa)
+int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
{
- DSA_SIG *s;
- RAND_seed(dgst, dlen);
- s=DSA_do_sign(dgst,dlen,dsa);
- if (s == NULL)
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
{
- *siglen=0;
- return(0);
+ DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NON_FIPS_DSA_METHOD);
+ return 0;
}
- *siglen=i2d_DSA_SIG(s,&sig);
- DSA_SIG_free(s);
- return(1);
+#endif
+ if (dsa->flags & DSA_FLAG_NONCE_FROM_HASH)
+ {
+ /* You cannot precompute the DSA nonce if it is required to
+ * depend on the message. */
+ DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NONCE_CANNOT_BE_PRECOMPUTED);
+ return 0;
+ }
+ return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp, NULL, 0);
}
-int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
+DSA_SIG *DSA_SIG_new(void)
{
- return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
+ DSA_SIG *sig;
+ sig = OPENSSL_malloc(sizeof(DSA_SIG));
+ if (!sig)
+ return NULL;
+ sig->r = NULL;
+ sig->s = NULL;
+ return sig;
+ }
+
+void DSA_SIG_free(DSA_SIG *sig)
+ {
+ if (sig)
+ {
+ if (sig->r)
+ BN_free(sig->r);
+ if (sig->s)
+ BN_free(sig->s);
+ OPENSSL_free(sig);
+ }
}
diff --git a/main/openssl/crypto/dsa/dsa_vrf.c b/main/openssl/crypto/dsa/dsa_vrf.c
index 226a75ff..674cb5fa 100644
--- a/main/openssl/crypto/dsa/dsa_vrf.c
+++ b/main/openssl/crypto/dsa/dsa_vrf.c
@@ -64,26 +64,13 @@
int DSA_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig,
DSA *dsa)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+ && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+ {
+ DSAerr(DSA_F_DSA_DO_VERIFY, DSA_R_NON_FIPS_DSA_METHOD);
+ return -1;
+ }
+#endif
return dsa->meth->dsa_do_verify(dgst, dgst_len, sig, dsa);
}
-
-/* data has already been hashed (probably with SHA or SHA-1). */
-/* returns
- * 1: correct signature
- * 0: incorrect signature
- * -1: error
- */
-int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
- const unsigned char *sigbuf, int siglen, DSA *dsa)
- {
- DSA_SIG *s;
- int ret=-1;
-
- s = DSA_SIG_new();
- if (s == NULL) return(ret);
- if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
- ret=DSA_do_verify(dgst,dgst_len,s,dsa);
-err:
- DSA_SIG_free(s);
- return(ret);
- }
diff --git a/main/openssl/crypto/dso/dso_dlfcn.c b/main/openssl/crypto/dso/dso_dlfcn.c
index c2bc6176..5f225480 100644
--- a/main/openssl/crypto/dso/dso_dlfcn.c
+++ b/main/openssl/crypto/dso/dso_dlfcn.c
@@ -86,7 +86,8 @@ DSO_METHOD *DSO_METHOD_dlfcn(void)
# if defined(_AIX) || defined(__CYGWIN__) || \
defined(__SCO_VERSION__) || defined(_SCO_ELF) || \
(defined(__osf__) && !defined(RTLD_NEXT)) || \
- (defined(__OpenBSD__) && !defined(RTLD_SELF))
+ (defined(__OpenBSD__) && !defined(RTLD_SELF)) || \
+ defined(__ANDROID__)
# undef HAVE_DLINFO
# endif
#endif
diff --git a/main/openssl/crypto/dso/dso_lib.c b/main/openssl/crypto/dso/dso_lib.c
index 8a15b794..78015298 100644
--- a/main/openssl/crypto/dso/dso_lib.c
+++ b/main/openssl/crypto/dso/dso_lib.c
@@ -237,11 +237,19 @@ DSO *DSO_load(DSO *dso, const char *filename, DSO_METHOD *meth, int flags)
if(ret->meth->dso_load == NULL)
{
DSOerr(DSO_F_DSO_LOAD,DSO_R_UNSUPPORTED);
+ /* Make sure we unset the filename on failure, because we use
+ * this to determine when the DSO has been loaded above. */
+ OPENSSL_free(ret->filename);
+ ret->filename = NULL;
goto err;
}
if(!ret->meth->dso_load(ret))
{
DSOerr(DSO_F_DSO_LOAD,DSO_R_LOAD_FAILED);
+ /* Make sure we unset the filename on failure, because we use
+ * this to determine when the DSO has been loaded above. */
+ OPENSSL_free(ret->filename);
+ ret->filename = NULL;
goto err;
}
/* Load succeeded */
diff --git a/main/openssl/crypto/ec/ec.h b/main/openssl/crypto/ec/ec.h
index ee707813..d008a0da 100644
--- a/main/openssl/crypto/ec/ec.h
+++ b/main/openssl/crypto/ec/ec.h
@@ -151,7 +151,24 @@ const EC_METHOD *EC_GFp_mont_method(void);
*/
const EC_METHOD *EC_GFp_nist_method(void);
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/** Returns 64-bit optimized methods for nistp224
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp224_method(void);
+
+/** Returns 64-bit optimized methods for nistp256
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp256_method(void);
+
+/** Returns 64-bit optimized methods for nistp521
+ * \return EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp521_method(void);
+#endif
+#ifndef OPENSSL_NO_EC2M
/********************************************************************/
/* EC_METHOD for curves over GF(2^m) */
/********************************************************************/
@@ -161,6 +178,8 @@ const EC_METHOD *EC_GFp_nist_method(void);
*/
const EC_METHOD *EC_GF2m_simple_method(void);
+#endif
+
/********************************************************************/
/* EC_GROUP functions */
@@ -255,10 +274,10 @@ int EC_GROUP_get_curve_name(const EC_GROUP *group);
void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag);
int EC_GROUP_get_asn1_flag(const EC_GROUP *group);
-void EC_GROUP_set_point_conversion_form(EC_GROUP *, point_conversion_form_t);
+void EC_GROUP_set_point_conversion_form(EC_GROUP *group, point_conversion_form_t form);
point_conversion_form_t EC_GROUP_get_point_conversion_form(const EC_GROUP *);
-unsigned char *EC_GROUP_get0_seed(const EC_GROUP *);
+unsigned char *EC_GROUP_get0_seed(const EC_GROUP *x);
size_t EC_GROUP_get_seed_len(const EC_GROUP *);
size_t EC_GROUP_set_seed(EC_GROUP *, const unsigned char *, size_t len);
@@ -282,6 +301,7 @@ int EC_GROUP_set_curve_GFp(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, co
*/
int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
+#ifndef OPENSSL_NO_EC2M
/** Sets the parameters of an ec curve over GF2m defined by y^2 + x*y = x^3 + a*x^2 + b
* \param group EC_GROUP object
* \param p BIGNUM with the polynomial defining the underlying field
@@ -301,7 +321,7 @@ int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, c
 * \return 1 on success and 0 if an error occurred
*/
int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
-
+#endif
/** Returns the number of bits needed to represent a field element
* \param group EC_GROUP object
* \return number of bits needed to represent a field element
@@ -342,7 +362,7 @@ int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx);
* \return newly created EC_GROUP object with the specified parameters
*/
EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
/** Creates a new EC_GROUP object with the specified parameters defined
* over GF2m (defined by the equation y^2 + x*y = x^3 + a*x^2 + b)
* \param p BIGNUM with the polynomial defining the underlying field
@@ -352,7 +372,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
* \return newly created EC_GROUP object with the specified parameters
*/
EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#endif
/** Creates an EC_GROUP object with a curve specified by a NID
* \param nid NID of the OID of the curve name
* \return newly created EC_GROUP object with specified curve or NULL
@@ -481,7 +501,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group,
*/
int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
/** Sets the affine coordinates of an EC_POINT over GF2m
* \param group underlying EC_GROUP object
* \param p EC_POINT object
@@ -514,7 +534,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group,
*/
int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#endif
/** Encodes an EC_POINT object to an octet string
* \param group underlying EC_GROUP object
* \param p EC_POINT object
@@ -606,8 +626,8 @@ int EC_POINT_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_CTX *c
*/
int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx);
-int EC_POINT_make_affine(const EC_GROUP *, EC_POINT *, BN_CTX *);
-int EC_POINTs_make_affine(const EC_GROUP *, size_t num, EC_POINT *[], BN_CTX *);
+int EC_POINT_make_affine(const EC_GROUP *group, EC_POINT *point, BN_CTX *ctx);
+int EC_POINTs_make_affine(const EC_GROUP *group, size_t num, EC_POINT *points[], BN_CTX *ctx);
/** Computes r = generator * n + sum_{i=0}^{num-1} p[i] * m[i]
* \param group underlying EC_GROUP object
@@ -653,9 +673,11 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group);
/* EC_GROUP_get_basis_type() returns the NID of the basis type
* used to represent the field elements */
int EC_GROUP_get_basis_type(const EC_GROUP *);
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1,
unsigned int *k2, unsigned int *k3);
+#endif
#define OPENSSL_EC_NAMED_CURVE 0x001
@@ -689,11 +711,21 @@ typedef struct ec_key_st EC_KEY;
#define EC_PKEY_NO_PARAMETERS 0x001
#define EC_PKEY_NO_PUBKEY 0x002
+/* some values for the flags field */
+#define EC_FLAG_NON_FIPS_ALLOW 0x1
+#define EC_FLAG_FIPS_CHECKED 0x2
+
/** Creates a new EC_KEY object.
* \return EC_KEY object or NULL if an error occurred.
*/
EC_KEY *EC_KEY_new(void);
+int EC_KEY_get_flags(const EC_KEY *key);
+
+void EC_KEY_set_flags(EC_KEY *key, int flags);
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags);
+
/** Creates a new EC_KEY object using a named curve as underlying
* EC_GROUP object.
* \param nid NID of the named curve.
@@ -768,16 +800,35 @@ const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key);
int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub);
unsigned EC_KEY_get_enc_flags(const EC_KEY *key);
-void EC_KEY_set_enc_flags(EC_KEY *, unsigned int);
-point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *);
-void EC_KEY_set_conv_form(EC_KEY *, point_conversion_form_t);
+void EC_KEY_set_enc_flags(EC_KEY *eckey, unsigned int flags);
+point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key);
+void EC_KEY_set_conv_form(EC_KEY *eckey, point_conversion_form_t cform);
/* functions to set/get method specific data */
-void *EC_KEY_get_key_method_data(EC_KEY *,
+void *EC_KEY_get_key_method_data(EC_KEY *key,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
-void EC_KEY_insert_key_method_data(EC_KEY *, void *data,
+/** Sets the key method data of an EC_KEY object, if none has yet been set.
+ * \param key EC_KEY object
+ * \param data opaque data to install.
+ * \param dup_func a function that duplicates |data|.
+ * \param free_func a function that frees |data|.
+ * \param clear_free_func a function that wipes and frees |data|.
+ * \return the previously set data pointer, or NULL if |data| was inserted.
+ */
+void *EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *));
/* wrapper functions for the underlying EC_GROUP object */
-void EC_KEY_set_asn1_flag(EC_KEY *, int);
+void EC_KEY_set_asn1_flag(EC_KEY *eckey, int asn1_flag);
+
+/** Sets whether ECDSA operations with the given key will calculate their k
+ * value from SHA512(private_key + H(message) + random) in order to protect
+ * against a weak PRNG.
+ * \param on Whether to calculate k from a hash or not
+ */
+void EC_KEY_set_nonce_from_hash(EC_KEY *key, int on);
+
+/** Returns the value of nonce_from_hash
+ */
+int EC_KEY_get_nonce_from_hash(const EC_KEY *key);
/** Creates a table of pre-computed multiples of the generator to
* accelerate further EC_KEY operations.
@@ -799,6 +850,15 @@ int EC_KEY_generate_key(EC_KEY *key);
*/
int EC_KEY_check_key(const EC_KEY *key);
+/** Sets a public key from affine coordinates, performing the
+ * necessary NIST PKV tests.
+ * \param key the EC_KEY object
+ * \param x public key x coordinate
+ * \param y public key y coordinate
+ * \return 1 on success and 0 otherwise.
+ */
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+
/********************************************************************/
/* de- and encoding functions for SEC1 ECPrivateKey */
@@ -926,6 +986,7 @@ void ERR_load_EC_strings(void);
/* Error codes for the EC functions. */
/* Function codes. */
+#define EC_F_BN_TO_FELEM 224
#define EC_F_COMPUTE_WNAF 143
#define EC_F_D2I_ECPARAMETERS 144
#define EC_F_D2I_ECPKPARAMETERS 145
@@ -968,6 +1029,15 @@ void ERR_load_EC_strings(void);
#define EC_F_EC_GFP_MONT_FIELD_SQR 132
#define EC_F_EC_GFP_MONT_GROUP_SET_CURVE 189
#define EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP 135
+#define EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE 225
+#define EC_F_EC_GFP_NISTP224_POINTS_MUL 228
+#define EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES 226
+#define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE 230
+#define EC_F_EC_GFP_NISTP256_POINTS_MUL 231
+#define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232
+#define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE 233
+#define EC_F_EC_GFP_NISTP521_POINTS_MUL 234
+#define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235
#define EC_F_EC_GFP_NIST_FIELD_MUL 200
#define EC_F_EC_GFP_NIST_FIELD_SQR 201
#define EC_F_EC_GFP_NIST_GROUP_SET_CURVE 202
@@ -1010,6 +1080,7 @@ void ERR_load_EC_strings(void);
#define EC_F_EC_KEY_NEW 182
#define EC_F_EC_KEY_PRINT 180
#define EC_F_EC_KEY_PRINT_FP 181
+#define EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES 229
#define EC_F_EC_POINTS_MAKE_AFFINE 136
#define EC_F_EC_POINT_ADD 112
#define EC_F_EC_POINT_CMP 113
@@ -1040,6 +1111,9 @@ void ERR_load_EC_strings(void);
#define EC_F_I2D_ECPKPARAMETERS 191
#define EC_F_I2D_ECPRIVATEKEY 192
#define EC_F_I2O_ECPUBLICKEY 151
+#define EC_F_NISTP224_PRE_COMP_NEW 227
+#define EC_F_NISTP256_PRE_COMP_NEW 236
+#define EC_F_NISTP521_PRE_COMP_NEW 237
#define EC_F_O2I_ECPUBLICKEY 152
#define EC_F_OLD_EC_PRIV_DECODE 222
#define EC_F_PKEY_EC_CTRL 197
@@ -1052,12 +1126,15 @@ void ERR_load_EC_strings(void);
/* Reason codes. */
#define EC_R_ASN1_ERROR 115
#define EC_R_ASN1_UNKNOWN_FIELD 116
+#define EC_R_BIGNUM_OUT_OF_RANGE 144
#define EC_R_BUFFER_TOO_SMALL 100
+#define EC_R_COORDINATES_OUT_OF_RANGE 146
#define EC_R_D2I_ECPKPARAMETERS_FAILURE 117
#define EC_R_DECODE_ERROR 142
#define EC_R_DISCRIMINANT_IS_ZERO 118
#define EC_R_EC_GROUP_NEW_BY_NAME_FAILURE 119
#define EC_R_FIELD_TOO_LARGE 143
+#define EC_R_GF2M_NOT_SUPPORTED 147
#define EC_R_GROUP2PKPARAMETERS_FAILURE 120
#define EC_R_I2D_ECPKPARAMETERS_FAILURE 121
#define EC_R_INCOMPATIBLE_OBJECTS 101
@@ -1092,6 +1169,7 @@ void ERR_load_EC_strings(void);
#define EC_R_UNKNOWN_GROUP 129
#define EC_R_UNKNOWN_ORDER 114
#define EC_R_UNSUPPORTED_FIELD 131
+#define EC_R_WRONG_CURVE_PARAMETERS 145
#define EC_R_WRONG_ORDER 130
#ifdef __cplusplus
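
A sketch of the nonce-from-hash knob declared above applied to ECDSA, assuming an illustrative P-256 key; ECDSA_do_sign comes from ecdsa.h and the helper name is made up:

#include <openssl/ec.h>
#include <openssl/ecdsa.h>
#include <openssl/obj_mac.h>

/* Sketch: fresh P-256 key with hashed-nonce ECDSA enabled, so k is
 * derived from SHA512(private_key + H(message) + random) as above. */
static ECDSA_SIG *sign_p256(const unsigned char *dgst, int dgst_len)
	{
	ECDSA_SIG *sig = NULL;
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
	if (key != NULL && EC_KEY_generate_key(key))
		{
		EC_KEY_set_nonce_from_hash(key, 1);
		sig = ECDSA_do_sign(dgst, dgst_len, key);
		}
	EC_KEY_free(key);
	return sig;
	}
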
diff --git a/main/openssl/crypto/ec/ec2_mult.c b/main/openssl/crypto/ec/ec2_mult.c
index e12b9b28..26f4a783 100644
--- a/main/openssl/crypto/ec/ec2_mult.c
+++ b/main/openssl/crypto/ec/ec2_mult.c
@@ -71,6 +71,8 @@
#include "ec_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
/* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective
* coordinates.
@@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group)
{
return ec_wNAF_have_precompute_mult(group);
}
+
+#endif
diff --git a/main/openssl/crypto/ec/ec2_oct.c b/main/openssl/crypto/ec/ec2_oct.c
new file mode 100644
index 00000000..f1d75e5d
--- /dev/null
+++ b/main/openssl/crypto/ec/ec2_oct.c
@@ -0,0 +1,407 @@
+/* crypto/ec/ec2_oct.c */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * The Elliptic Curve Public-Key Crypto Library (ECC Code) included
+ * herein is developed by SUN MICROSYSTEMS, INC., and is contributed
+ * to the OpenSSL project.
+ *
+ * The ECC Code is licensed pursuant to the OpenSSL open source
+ * license provided below.
+ *
+ * The software is originally written by Sheueling Chang Shantz and
+ * Douglas Stebila of Sun Microsystems Laboratories.
+ *
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <openssl/err.h>
+
+#include "ec_lcl.h"
+
+#ifndef OPENSSL_NO_EC2M
+
+/* Calculates and sets the affine coordinates of an EC_POINT from the given
+ * compressed coordinates. Uses algorithm 2.3.4 of SEC 1.
+ * Note that the simple implementation only uses affine coordinates.
+ *
+ * The method is from the following publication:
+ *
+ * Harper, Menezes, Vanstone:
+ * "Public-Key Cryptosystems with Very Small Key Lengths",
+ * EUROCRYPT '92, Springer-Verlag LNCS 658,
+ * published February 1993
+ *
+ * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
+ * the same method, but claim no priority date earlier than July 29, 1994
+ * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
+ */
+int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+ {
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *tmp, *x, *y, *z;
+ int ret = 0, z0;
+
+ /* clear error queue */
+ ERR_clear_error();
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ y_bit = (y_bit != 0) ? 1 : 0;
+
+ BN_CTX_start(ctx);
+ tmp = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ z = BN_CTX_get(ctx);
+ if (z == NULL) goto err;
+
+ if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
+ if (BN_is_zero(x))
+ {
+ if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
+ }
+ else
+ {
+ if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
+ if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
+ if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
+ if (!BN_GF2m_add(tmp, x, tmp)) goto err;
+ if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
+ {
+ unsigned long err = ERR_peek_last_error();
+
+ if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
+ {
+ ERR_clear_error();
+ ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ }
+ else
+ ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+ goto err;
+ }
+ z0 = (BN_is_odd(z)) ? 1 : 0;
+ if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
+ if (z0 != y_bit)
+ {
+ if (!BN_GF2m_add(y, y, x)) goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
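
A sketch of driving the routine above through its public wrapper, with sect163k1 as an illustrative curve and the helper name made up:

#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/obj_mac.h>

/* Sketch: recover an affine point from a compressed x plus parity bit.
 * Internally this solves z^2 + z = x + a + b/x^2 and picks the root
 * whose parity matches y_bit, as implemented above. */
static int check_compressed(const BIGNUM *x, int y_bit)
	{
	int ok = 0;
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_sect163k1);
	EC_POINT *point = group ? EC_POINT_new(group) : NULL;
	if (point != NULL)
		ok = EC_POINT_set_compressed_coordinates_GF2m(group, point,
							      x, y_bit, NULL);
	EC_POINT_free(point);
	EC_GROUP_free(group);
	return ok;
	}
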
+
+/* Converts an EC_POINT to an octet string.
+ * If buf is NULL, the encoded length will be returned.
+ * If the length len of buf is smaller than required, an error will be returned.
+ */
+size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ size_t ret;
+ BN_CTX *new_ctx = NULL;
+ int used_ctx = 0;
+ BIGNUM *x, *y, *yxi;
+ size_t field_len, i, skip;
+
+ if ((form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+ goto err;
+ }
+
+ if (EC_POINT_is_at_infinity(group, point))
+ {
+ /* encodes to a single 0 octet */
+ if (buf != NULL)
+ {
+ if (len < 1)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ buf[0] = 0;
+ }
+ return 1;
+ }
+
+
+ /* ret := required output buffer length */
+ field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+ ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ /* if 'buf' is NULL, just return required length */
+ if (buf != NULL)
+ {
+ if (len < ret)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ used_ctx = 1;
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ yxi = BN_CTX_get(ctx);
+ if (yxi == NULL) goto err;
+
+ if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+ buf[0] = form;
+ if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
+ {
+ if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+ if (BN_is_odd(yxi)) buf[0]++;
+ }
+
+ i = 1;
+
+ skip = field_len - BN_num_bytes(x);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(x, buf + i);
+ i += skip;
+ if (i != 1 + field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+ {
+ skip = field_len - BN_num_bytes(y);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(y, buf + i);
+ i += skip;
+ }
+
+ if (i != ret)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ }
+
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+
+ err:
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return 0;
+ }
+
+
+/* Converts an octet string representation to an EC_POINT.
+ * Note that the simple implementation only uses affine coordinates.
+ */
+int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ point_conversion_form_t form;
+ int y_bit;
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *x, *y, *yxi;
+ size_t field_len, enc_len;
+ int ret = 0;
+
+ if (len == 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ form = buf[0];
+ y_bit = form & 1;
+ form = form & ~1U;
+ if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+ if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (form == 0)
+ {
+ if (len != 1)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ return EC_POINT_set_to_infinity(group, point);
+ }
+
+ field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+ enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ if (len != enc_len)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ yxi = BN_CTX_get(ctx);
+ if (yxi == NULL) goto err;
+
+ if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+ if (BN_ucmp(x, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_COMPRESSED)
+ {
+ if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+ if (BN_ucmp(y, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ if (form == POINT_CONVERSION_HYBRID)
+ {
+ if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+ if (y_bit != BN_is_odd(yxi))
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+ }
+
+ if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+ {
+ ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+#endif
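A usage sketch for the two conversions above (not part of the patch; the curve choice, and an EC2M-enabled build, are assumptions). It exercises the NULL-buffer length query documented on point2oct, then round-trips the group generator:

#include <openssl/ec.h>
#include <openssl/obj_mac.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
	{
	EC_GROUP *group = EC_GROUP_new_by_curve_name(NID_sect163k1);
	const EC_POINT *gen;
	EC_POINT *copy = NULL;
	unsigned char *buf = NULL;
	size_t len;
	int ok = 0;

	if (group == NULL)
		return 1;
	gen = EC_GROUP_get0_generator(group);
	copy = EC_POINT_new(group);

	/* With buf == NULL, point2oct only reports the required length. */
	len = EC_POINT_point2oct(group, gen, POINT_CONVERSION_COMPRESSED,
				NULL, 0, NULL);
	if (len != 0 && copy != NULL && (buf = malloc(len)) != NULL
	    && EC_POINT_point2oct(group, gen, POINT_CONVERSION_COMPRESSED,
				buf, len, NULL) == len
	    && EC_POINT_oct2point(group, copy, buf, len, NULL)
	    && EC_POINT_cmp(group, gen, copy, NULL) == 0)
		ok = 1;

	printf("round trip: %s\n", ok ? "ok" : "failed");
	free(buf);
	EC_POINT_free(copy);
	EC_GROUP_free(group);
	return ok ? 0 : 1;
	}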
diff --git a/main/openssl/crypto/ec/ec2_smpl.c b/main/openssl/crypto/ec/ec2_smpl.c
index af94458c..e0e59c7d 100644
--- a/main/openssl/crypto/ec/ec2_smpl.c
+++ b/main/openssl/crypto/ec/ec2_smpl.c
@@ -71,10 +71,20 @@
#include "ec_lcl.h"
+#ifndef OPENSSL_NO_EC2M
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const EC_METHOD *EC_GF2m_simple_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gf2m_simple_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_characteristic_two_field,
ec_GF2m_simple_group_init,
ec_GF2m_simple_group_finish,
@@ -93,9 +103,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
0 /* get_Jprojective_coordinates_GFp */,
ec_GF2m_simple_point_set_affine_coordinates,
ec_GF2m_simple_point_get_affine_coordinates,
- ec_GF2m_simple_set_compressed_coordinates,
- ec_GF2m_simple_point2oct,
- ec_GF2m_simple_oct2point,
+ 0,0,0,
ec_GF2m_simple_add,
ec_GF2m_simple_dbl,
ec_GF2m_simple_invert,
@@ -118,6 +126,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
@@ -405,340 +414,6 @@ int ec_GF2m_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_
return ret;
}
-
-/* Calculates and sets the affine coordinates of an EC_POINT from the given
- * compressed coordinates. Uses algorithm 2.3.4 of SEC 1.
- * Note that the simple implementation only uses affine coordinates.
- *
- * The method is from the following publication:
- *
- * Harper, Menezes, Vanstone:
- * "Public-Key Cryptosystems with Very Small Key Lengths",
- * EUROCRYPT '92, Springer-Verlag LNCS 658,
- * published February 1993
- *
- * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
- * the same method, but claim no priority date earlier than July 29, 1994
- * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
- */
-int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x_, int y_bit, BN_CTX *ctx)
- {
- BN_CTX *new_ctx = NULL;
- BIGNUM *tmp, *x, *y, *z;
- int ret = 0, z0;
-
- /* clear error queue */
- ERR_clear_error();
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- y_bit = (y_bit != 0) ? 1 : 0;
-
- BN_CTX_start(ctx);
- tmp = BN_CTX_get(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- z = BN_CTX_get(ctx);
- if (z == NULL) goto err;
-
- if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
- if (BN_is_zero(x))
- {
- if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
- }
- else
- {
- if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
- if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
- if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
- if (!BN_GF2m_add(tmp, x, tmp)) goto err;
- if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
- {
- unsigned long err = ERR_peek_last_error();
-
- if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
- {
- ERR_clear_error();
- ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- }
- else
- ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
- goto err;
- }
- z0 = (BN_is_odd(z)) ? 1 : 0;
- if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
- if (z0 != y_bit)
- {
- if (!BN_GF2m_add(y, y, x)) goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
-/* Converts an EC_POINT to an octet string.
- * If buf is NULL, the encoded length will be returned.
- * If the length len of buf is smaller than required an error will be returned.
- */
-size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- size_t ret;
- BN_CTX *new_ctx = NULL;
- int used_ctx = 0;
- BIGNUM *x, *y, *yxi;
- size_t field_len, i, skip;
-
- if ((form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
- goto err;
- }
-
- if (EC_POINT_is_at_infinity(group, point))
- {
- /* encodes to a single 0 octet */
- if (buf != NULL)
- {
- if (len < 1)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- buf[0] = 0;
- }
- return 1;
- }
-
-
- /* ret := required output buffer length */
- field_len = (EC_GROUP_get_degree(group) + 7) / 8;
- ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- /* if 'buf' is NULL, just return required length */
- if (buf != NULL)
- {
- if (len < ret)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- goto err;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- used_ctx = 1;
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- yxi = BN_CTX_get(ctx);
- if (yxi == NULL) goto err;
-
- if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
- buf[0] = form;
- if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
- {
- if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
- if (BN_is_odd(yxi)) buf[0]++;
- }
-
- i = 1;
-
- skip = field_len - BN_num_bytes(x);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(x, buf + i);
- i += skip;
- if (i != 1 + field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
- {
- skip = field_len - BN_num_bytes(y);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(y, buf + i);
- i += skip;
- }
-
- if (i != ret)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- }
-
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
-
- err:
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return 0;
- }
-
-
-/* Converts an octet string representation to an EC_POINT.
- * Note that the simple implementation only uses affine coordinates.
- */
-int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- point_conversion_form_t form;
- int y_bit;
- BN_CTX *new_ctx = NULL;
- BIGNUM *x, *y, *yxi;
- size_t field_len, enc_len;
- int ret = 0;
-
- if (len == 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- form = buf[0];
- y_bit = form & 1;
- form = form & ~1U;
- if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
- if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (form == 0)
- {
- if (len != 1)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- return EC_POINT_set_to_infinity(group, point);
- }
-
- field_len = (EC_GROUP_get_degree(group) + 7) / 8;
- enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- if (len != enc_len)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- yxi = BN_CTX_get(ctx);
- if (yxi == NULL) goto err;
-
- if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
- if (BN_ucmp(x, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
-
- if (form == POINT_CONVERSION_COMPRESSED)
- {
- if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
- }
- else
- {
- if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
- if (BN_ucmp(y, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- if (form == POINT_CONVERSION_HYBRID)
- {
- if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
- if (y_bit != BN_is_odd(yxi))
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
- }
-
- if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
- {
- ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
- goto err;
- }
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
/* Computes a + b and stores the result in r. r could be a or b, a could be b.
* Uses algorithm A.10.2 of IEEE P1363.
*/
@@ -887,7 +562,7 @@ int ec_GF2m_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_
field_sqr = group->meth->field_sqr;
/* only support affine coordinates */
- if (!point->Z_is_one) goto err;
+ if (!point->Z_is_one) return -1;
if (ctx == NULL)
{
@@ -1040,3 +715,5 @@ int ec_GF2m_simple_field_div(const EC_GROUP *group, BIGNUM *r, const BIGNUM *a,
{
return BN_GF2m_mod_div(r, a, b, &group->field, ctx);
}
+
+#endif
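The three slots zeroed above ("0,0,0") are what the new EC_FLAGS_DEFAULT_OCT flag compensates for: the public entry points now fall back to the shared simple implementations when the flag is set. A simplified sketch of that dispatch, paraphrasing the wrapper in the new ec_oct.c rather than quoting it:

size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point,
	point_conversion_form_t form, unsigned char *buf, size_t len, BN_CTX *ctx)
	{
	if (group->meth->flags & EC_FLAGS_DEFAULT_OCT)
		{
		/* method left its oct handlers at 0; use the default code */
		if (group->meth->field_type == NID_X9_62_characteristic_two_field)
			return ec_GF2m_simple_point2oct(group, point, form,
							buf, len, ctx);
		return ec_GFp_simple_point2oct(group, point, form,
						buf, len, ctx);
		}
	if (group->meth->point2oct == 0)
		{
		ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
		return 0;
		}
	return group->meth->point2oct(group, point, form, buf, len, ctx);
	}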
diff --git a/main/openssl/crypto/ec/ec_ameth.c b/main/openssl/crypto/ec/ec_ameth.c
index c00f7d74..0ce45240 100644
--- a/main/openssl/crypto/ec/ec_ameth.c
+++ b/main/openssl/crypto/ec/ec_ameth.c
@@ -88,7 +88,7 @@ static int eckey_param2type(int *pptype, void **ppval, EC_KEY *ec_key)
if (!pstr)
return 0;
pstr->length = i2d_ECParameters(ec_key, &pstr->data);
- if (pstr->length < 0)
+ if (pstr->length <= 0)
{
ASN1_STRING_free(pstr);
ECerr(EC_F_ECKEY_PARAM2TYPE, ERR_R_EC_LIB);
@@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth =
ec_copy_parameters,
ec_cmp_parameters,
eckey_param_print,
+ 0,
int_ec_free,
ec_pkey_ctrl,
diff --git a/main/openssl/crypto/ec/ec_asn1.c b/main/openssl/crypto/ec/ec_asn1.c
index ae555398..145807b6 100644
--- a/main/openssl/crypto/ec/ec_asn1.c
+++ b/main/openssl/crypto/ec/ec_asn1.c
@@ -83,13 +83,14 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group)
/* everything else is currently not supported */
return 0;
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
{
if (group == NULL)
return 0;
- if (EC_GROUP_method_of(group)->group_set_curve != ec_GF2m_simple_group_set_curve
+ if (EC_METHOD_get_field_type(EC_GROUP_method_of(group)) !=
+ NID_X9_62_characteristic_two_field
|| !((group->poly[0] != 0) && (group->poly[1] != 0) && (group->poly[2] == 0)))
{
ECerr(EC_F_EC_GROUP_GET_TRINOMIAL_BASIS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
@@ -101,14 +102,14 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
return 1;
}
-
int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
unsigned int *k2, unsigned int *k3)
{
if (group == NULL)
return 0;
- if (EC_GROUP_method_of(group)->group_set_curve != ec_GF2m_simple_group_set_curve
+ if (EC_METHOD_get_field_type(EC_GROUP_method_of(group)) !=
+ NID_X9_62_characteristic_two_field
|| !((group->poly[0] != 0) && (group->poly[1] != 0) && (group->poly[2] != 0) && (group->poly[3] != 0) && (group->poly[4] == 0)))
{
ECerr(EC_F_EC_GROUP_GET_PENTANOMIAL_BASIS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
@@ -124,7 +125,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
return 1;
}
-
+#endif
/* some structures needed for the asn1 encoding */
@@ -340,6 +341,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
}
}
else /* nid == NID_X9_62_characteristic_two_field */
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED);
+ goto err;
+ }
+#else
{
int field_type;
X9_62_CHARACTERISTIC_TWO *char_two;
@@ -419,6 +426,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
}
}
}
+#endif
ok = 1;
@@ -456,6 +464,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* nid == NID_X9_62_characteristic_two_field */
{
if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL))
@@ -464,7 +473,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
goto err;
}
}
-
+#endif
len_1 = (size_t)BN_num_bytes(tmp_1);
len_2 = (size_t)BN_num_bytes(tmp_2);
@@ -775,8 +784,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
/* get the field parameters */
tmp = OBJ_obj2nid(params->fieldID->fieldType);
-
if (tmp == NID_X9_62_characteristic_two_field)
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED);
+ goto err;
+ }
+#else
{
X9_62_CHARACTERISTIC_TWO *char_two;
@@ -862,6 +876,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
/* create the EC_GROUP structure */
ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL);
}
+#endif
else if (tmp == NID_X9_62_prime_field)
{
/* we have a curve over a prime field */
@@ -1065,6 +1080,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len)
if ((group = ec_asn1_pkparameters2group(params)) == NULL)
{
ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE);
+ ECPKPARAMETERS_free(params);
return NULL;
}
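The trinomial/pentanomial tests above lean on the group->poly convention from ec_lcl.h, which we read as: the array lists the exponents of the reduction polynomial in decreasing order, ending with the constant term 0. Under that assumption:

/* trinomial   t^m + t^k + 1             -> poly = { m, k, 0, ... }
 *                                          hence poly[1] != 0, poly[2] == 0
 * pentanomial t^m + t^k3 + t^k2 + t^k1 + 1
 *                                       -> poly = { m, k3, k2, k1, 0 }
 *                                          hence poly[3] != 0, poly[4] == 0 */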
diff --git a/main/openssl/crypto/ec/ec_curve.c b/main/openssl/crypto/ec/ec_curve.c
index 23274e40..c72fb269 100644
--- a/main/openssl/crypto/ec/ec_curve.c
+++ b/main/openssl/crypto/ec/ec_curve.c
@@ -3,7 +3,7 @@
* Written by Nils Larsch for the OpenSSL project.
*/
/* ====================================================================
- * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -72,6 +72,7 @@
#include "ec_lcl.h"
#include <openssl/err.h>
#include <openssl/obj_mac.h>
+#include <openssl/opensslconf.h>
typedef struct {
int field_type, /* either NID_X9_62_prime_field or
@@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; }
0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D }
};
+#ifndef OPENSSL_NO_EC2M
+
/* characteristic two curves */
static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; }
_EC_SECG_CHAR2_113R1 = {
@@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; }
{ 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */
0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD,
- 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */
+ 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
0x07,
0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */
@@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; }
0xBA,0xFC,0xA7,0x5E }
};
+#endif
+
typedef struct _ec_list_element_st {
int nid;
const EC_CURVE_DATA *data;
+ const EC_METHOD *(*meth)(void);
const char *comment;
} ec_list_element;
static const ec_list_element curve_list[] = {
- /* prime field curves */
+ /* prime field curves */
/* secg curves */
- { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
- { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"},
- { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"},
- { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"},
- { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"},
- { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"},
- { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
+ { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+ { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" },
+ { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" },
+ { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" },
+ { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" },
+ { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" },
+ { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
/* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */
- { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"},
- { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"},
- { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"},
- { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"},
+ { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" },
+ { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" },
+#else
+ { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" },
+#endif
+ { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" },
/* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */
- { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"},
- { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"},
+ { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" },
+#else
+ { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" },
+#endif
/* X9.62 curves */
- { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"},
- { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"},
- { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"},
- { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"},
- { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"},
+ { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" },
+ { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" },
+ { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" },
+ { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" },
+ { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" },
+ { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" },
+#else
+ { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" },
+#endif
+#ifndef OPENSSL_NO_EC2M
/* characteristic two field curves */
/* NIST/SECG curves */
- { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
- { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"},
- { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"},
- { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"},
- { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" },
- { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"},
- { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" },
- { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"},
- { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"},
- { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" },
- { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" },
- { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"},
- { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" },
- { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" },
- { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" },
- { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" },
- { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" },
- { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" },
+ { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" },
+ { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" },
+ { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+ { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" },
+ { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" },
+ { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" },
+ { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" },
+ { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" },
+ { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+ { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+ { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+ { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+ { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" },
+ { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" },
/* X9.62 curves */
- { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"},
- { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"},
- { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"},
- { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"},
- { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"},
- { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"},
- { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"},
- { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"},
- { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"},
- { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"},
+ { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" },
+ { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" },
+ { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" },
+ { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" },
+ { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" },
+ { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" },
+ { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" },
+ { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" },
+ { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" },
+ { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" },
/* the WAP/WTLS curves
* [unlike SECG, spec has its own OIDs for curves from X9.62] */
- { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"},
- { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" },
- { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
- { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"},
+ { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+#endif
+ { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" },
+ { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
+ { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+ { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+#endif
+	{ NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
/* IPSec curves */
- { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
- { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
+ { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n"
+ "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+ { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n"
+ "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+#endif
};
#define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element))
-static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
+static EC_GROUP *ec_group_new_from_data(const ec_list_element curve)
{
EC_GROUP *group=NULL;
EC_POINT *P=NULL;
BN_CTX *ctx=NULL;
- BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
+ BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
int ok=0;
int seed_len,param_len;
+ const EC_METHOD *meth;
+ const EC_CURVE_DATA *data;
const unsigned char *params;
if ((ctx = BN_CTX_new()) == NULL)
@@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
+ data = curve.data;
seed_len = data->seed_len;
param_len = data->param_len;
- params = (const unsigned char *)(data+1); /* skip header */
- params += seed_len; /* skip seed */
+ params = (const unsigned char *)(data+1); /* skip header */
+ params += seed_len; /* skip seed */
if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL))
|| !(a = BN_bin2bn(params+1*param_len, param_len, NULL))
@@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
- if (data->field_type == NID_X9_62_prime_field)
+ if (curve.meth != 0)
+ {
+ meth = curve.meth();
+ if (((group = EC_GROUP_new(meth)) == NULL) ||
+ (!(group->meth->group_set_curve(group, p, a, b, ctx))))
+ {
+ ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
+ goto err;
+ }
+ }
+ else if (data->field_type == NID_X9_62_prime_field)
{
if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL)
{
@@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* field_type == NID_X9_62_characteristic_two_field */
{
if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL)
@@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
goto err;
}
}
+#endif
if ((P = EC_POINT_new(group)) == NULL)
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
goto err;
}
-
+
if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL))
|| !(y = BN_bin2bn(params+4*param_len, param_len, NULL)))
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB);
goto err;
}
- if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx))
+ if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx))
{
ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
goto err;
@@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid)
for (i=0; i<curve_list_length; i++)
if (curve_list[i].nid == nid)
{
- ret = ec_group_new_from_data(curve_list[i].data);
+ ret = ec_group_new_from_data(curve_list[i]);
break;
}
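From the caller's side the new meth column is invisible; a minimal sketch, assuming a build with the nistp 64_GCC_128 code enabled:

#include <openssl/ec.h>
#include <openssl/obj_mac.h>

/* Illustrative only: name-based construction is unchanged, but the table
 * above now silently selects EC_GFp_nistp224_method() for this NID when
 * compiled in, and the generic prime-field method otherwise. */
static EC_GROUP *new_p224(void)
	{
	return EC_GROUP_new_by_curve_name(NID_secp224r1);
	}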
diff --git a/main/openssl/crypto/ec/ec_cvt.c b/main/openssl/crypto/ec/ec_cvt.c
index d45640ba..bfcbab35 100644
--- a/main/openssl/crypto/ec/ec_cvt.c
+++ b/main/openssl/crypto/ec/ec_cvt.c
@@ -78,7 +78,32 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
const EC_METHOD *meth;
EC_GROUP *ret;
+#if defined(OPENSSL_BN_ASM_MONT)
+ /*
+ * This might appear controversial, but the fact is that generic
+ * prime method was observed to deliver better performance even
+ * for NIST primes on a range of platforms, e.g.: 60%-15%
+ * improvement on IA-64, ~25% on ARM, 30%-90% on P4, 20%-25%
+ * in 32-bit build and 35%-12% in 64-bit build on Core2...
+ * Coefficients are relative to optimized bn_nist.c for most
+ * intensive ECDSA verify and ECDH operations for 192- and 521-
+ * bit keys respectively. Choice of these boundary values is
+ * arguable, because the dependency of improvement coefficient
+ * from key length is not a "monotone" curve. For example while
+ * 571-bit result is 23% on ARM, 384-bit one is -1%. But it's
+ * generally faster, sometimes "respectfully" faster, sometimes
+ * "tolerably" slower... What effectively happens is that loop
+ * with bn_mul_add_words is put against bn_mul_mont, and the
+ * latter "wins" on short vectors. Correct solution should be
+ * implementing dedicated NxN multiplication subroutines for
+ * small N. But till it materializes, let's stick to generic
+ * prime method...
+ * <appro>
+ */
+ meth = EC_GFp_mont_method();
+#else
meth = EC_GFp_nist_method();
+#endif
ret = EC_GROUP_new(meth);
if (ret == NULL)
@@ -122,7 +147,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
return ret;
}
-
+#ifndef OPENSSL_NO_EC2M
EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
const EC_METHOD *meth;
@@ -142,3 +167,4 @@ EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM
return ret;
}
+#endif
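For groups built from explicit parameters the same selection happens inside EC_GROUP_new_curve_GFp, so callers are unaffected. A hedged sketch; the hex constants are the NIST P-192 parameters, quoted from memory for illustration:

#include <openssl/bn.h>
#include <openssl/ec.h>

static EC_GROUP *p192_from_params(void)
	{
	BIGNUM *p = NULL, *a = NULL, *b = NULL;
	EC_GROUP *group = NULL;

	if (BN_hex2bn(&p, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFF")
	    && BN_hex2bn(&a, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFC")
	    && BN_hex2bn(&b, "64210519E59C80E70FA7E9AB72243049FEB8DEECC146B9B1"))
		/* Montgomery method if OPENSSL_BN_ASM_MONT, NIST otherwise */
		group = EC_GROUP_new_curve_GFp(p, a, b, NULL);

	BN_free(p);
	BN_free(a);
	BN_free(b);
	return group;	/* NULL on failure */
	}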
diff --git a/main/openssl/crypto/ec/ec_err.c b/main/openssl/crypto/ec/ec_err.c
index 84b48333..0d193987 100644
--- a/main/openssl/crypto/ec/ec_err.c
+++ b/main/openssl/crypto/ec/ec_err.c
@@ -1,6 +1,6 @@
/* crypto/ec/ec_err.c */
/* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA EC_str_functs[]=
{
+{ERR_FUNC(EC_F_BN_TO_FELEM), "BN_TO_FELEM"},
{ERR_FUNC(EC_F_COMPUTE_WNAF), "COMPUTE_WNAF"},
{ERR_FUNC(EC_F_D2I_ECPARAMETERS), "d2i_ECParameters"},
{ERR_FUNC(EC_F_D2I_ECPKPARAMETERS), "d2i_ECPKParameters"},
@@ -112,6 +113,15 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_EC_GFP_MONT_FIELD_SQR), "ec_GFp_mont_field_sqr"},
{ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE), "ec_GFp_mont_group_set_curve"},
{ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP), "EC_GFP_MONT_GROUP_SET_CURVE_GFP"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE), "ec_GFp_nistp224_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINTS_MUL), "ec_GFp_nistp224_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp224_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE), "ec_GFp_nistp256_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINTS_MUL), "ec_GFp_nistp256_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp256_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE), "ec_GFp_nistp521_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINTS_MUL), "ec_GFp_nistp521_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp521_point_get_affine_coordinates"},
{ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_MUL), "ec_GFp_nist_field_mul"},
{ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_SQR), "ec_GFp_nist_field_sqr"},
{ERR_FUNC(EC_F_EC_GFP_NIST_GROUP_SET_CURVE), "ec_GFp_nist_group_set_curve"},
@@ -154,6 +164,7 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_EC_KEY_NEW), "EC_KEY_new"},
{ERR_FUNC(EC_F_EC_KEY_PRINT), "EC_KEY_print"},
{ERR_FUNC(EC_F_EC_KEY_PRINT_FP), "EC_KEY_print_fp"},
+{ERR_FUNC(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES), "EC_KEY_set_public_key_affine_coordinates"},
{ERR_FUNC(EC_F_EC_POINTS_MAKE_AFFINE), "EC_POINTs_make_affine"},
{ERR_FUNC(EC_F_EC_POINT_ADD), "EC_POINT_add"},
{ERR_FUNC(EC_F_EC_POINT_CMP), "EC_POINT_cmp"},
@@ -184,6 +195,9 @@ static ERR_STRING_DATA EC_str_functs[]=
{ERR_FUNC(EC_F_I2D_ECPKPARAMETERS), "i2d_ECPKParameters"},
{ERR_FUNC(EC_F_I2D_ECPRIVATEKEY), "i2d_ECPrivateKey"},
{ERR_FUNC(EC_F_I2O_ECPUBLICKEY), "i2o_ECPublicKey"},
+{ERR_FUNC(EC_F_NISTP224_PRE_COMP_NEW), "NISTP224_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP256_PRE_COMP_NEW), "NISTP256_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP521_PRE_COMP_NEW), "NISTP521_PRE_COMP_NEW"},
{ERR_FUNC(EC_F_O2I_ECPUBLICKEY), "o2i_ECPublicKey"},
{ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE), "OLD_EC_PRIV_DECODE"},
{ERR_FUNC(EC_F_PKEY_EC_CTRL), "PKEY_EC_CTRL"},
@@ -199,12 +213,15 @@ static ERR_STRING_DATA EC_str_reasons[]=
{
{ERR_REASON(EC_R_ASN1_ERROR) ,"asn1 error"},
{ERR_REASON(EC_R_ASN1_UNKNOWN_FIELD) ,"asn1 unknown field"},
+{ERR_REASON(EC_R_BIGNUM_OUT_OF_RANGE) ,"bignum out of range"},
{ERR_REASON(EC_R_BUFFER_TOO_SMALL) ,"buffer too small"},
+{ERR_REASON(EC_R_COORDINATES_OUT_OF_RANGE),"coordinates out of range"},
{ERR_REASON(EC_R_D2I_ECPKPARAMETERS_FAILURE),"d2i ecpkparameters failure"},
{ERR_REASON(EC_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(EC_R_DISCRIMINANT_IS_ZERO) ,"discriminant is zero"},
{ERR_REASON(EC_R_EC_GROUP_NEW_BY_NAME_FAILURE),"ec group new by name failure"},
{ERR_REASON(EC_R_FIELD_TOO_LARGE) ,"field too large"},
+{ERR_REASON(EC_R_GF2M_NOT_SUPPORTED) ,"gf2m not supported"},
{ERR_REASON(EC_R_GROUP2PKPARAMETERS_FAILURE),"group2pkparameters failure"},
{ERR_REASON(EC_R_I2D_ECPKPARAMETERS_FAILURE),"i2d ecpkparameters failure"},
{ERR_REASON(EC_R_INCOMPATIBLE_OBJECTS) ,"incompatible objects"},
@@ -239,6 +256,7 @@ static ERR_STRING_DATA EC_str_reasons[]=
{ERR_REASON(EC_R_UNKNOWN_GROUP) ,"unknown group"},
{ERR_REASON(EC_R_UNKNOWN_ORDER) ,"unknown order"},
{ERR_REASON(EC_R_UNSUPPORTED_FIELD) ,"unsupported field"},
+{ERR_REASON(EC_R_WRONG_CURVE_PARAMETERS) ,"wrong curve parameters"},
{ERR_REASON(EC_R_WRONG_ORDER) ,"wrong order"},
{0,NULL}
};
diff --git a/main/openssl/crypto/ec/ec_key.c b/main/openssl/crypto/ec/ec_key.c
index 522802c0..73dd7b97 100644
--- a/main/openssl/crypto/ec/ec_key.c
+++ b/main/openssl/crypto/ec/ec_key.c
@@ -64,7 +64,9 @@
#include <string.h>
#include "ec_lcl.h"
#include <openssl/err.h>
-#include <string.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
EC_KEY *EC_KEY_new(void)
{
@@ -78,10 +80,12 @@ EC_KEY *EC_KEY_new(void)
}
ret->version = 1;
+ ret->flags = 0;
ret->group = NULL;
ret->pub_key = NULL;
ret->priv_key= NULL;
ret->enc_flag= 0;
+ ret->nonce_from_hash_flag = 0;
ret->conv_form = POINT_CONVERSION_UNCOMPRESSED;
ret->references= 1;
ret->method_data = NULL;
@@ -195,8 +199,10 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src)
/* copy the rest */
dest->enc_flag = src->enc_flag;
+ dest->nonce_from_hash_flag = src->nonce_from_hash_flag;
dest->conv_form = src->conv_form;
dest->version = src->version;
+ dest->flags = src->flags;
return dest;
}
@@ -237,6 +243,11 @@ int EC_KEY_generate_key(EC_KEY *eckey)
BIGNUM *priv_key = NULL, *order = NULL;
EC_POINT *pub_key = NULL;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ec_key_generate_key(eckey);
+#endif
+
if (!eckey || !eckey->group)
{
ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER);
@@ -371,6 +382,82 @@ err:
return(ok);
}
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y)
+ {
+ BN_CTX *ctx = NULL;
+ BIGNUM *tx, *ty;
+ EC_POINT *point = NULL;
+ int ok = 0, tmp_nid, is_char_two = 0;
+
+ if (!key || !key->group || !x || !y)
+ {
+ ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+ ERR_R_PASSED_NULL_PARAMETER);
+ return 0;
+ }
+ ctx = BN_CTX_new();
+ if (!ctx)
+ goto err;
+
+ point = EC_POINT_new(key->group);
+
+ if (!point)
+ goto err;
+
+ tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group));
+
+ if (tmp_nid == NID_X9_62_characteristic_two_field)
+ is_char_two = 1;
+
+ tx = BN_CTX_get(ctx);
+ ty = BN_CTX_get(ctx);
+#ifndef OPENSSL_NO_EC2M
+ if (is_char_two)
+ {
+ if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point,
+ x, y, ctx))
+ goto err;
+ if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point,
+ tx, ty, ctx))
+ goto err;
+ }
+ else
+#endif
+ {
+ if (!EC_POINT_set_affine_coordinates_GFp(key->group, point,
+ x, y, ctx))
+ goto err;
+ if (!EC_POINT_get_affine_coordinates_GFp(key->group, point,
+ tx, ty, ctx))
+ goto err;
+ }
+ /* Check if retrieved coordinates match originals: if not values
+ * are out of range.
+ */
+ if (BN_cmp(x, tx) || BN_cmp(y, ty))
+ {
+ ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+ EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+
+ if (!EC_KEY_set_public_key(key, point))
+ goto err;
+
+ if (EC_KEY_check_key(key) == 0)
+ goto err;
+
+ ok = 1;
+
+ err:
+ if (ctx)
+ BN_CTX_free(ctx);
+ if (point)
+ EC_POINT_free(point);
+ return ok;
+
+ }
+
const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key)
{
return key->group;
@@ -420,6 +507,16 @@ void EC_KEY_set_enc_flags(EC_KEY *key, unsigned int flags)
key->enc_flag = flags;
}
+int EC_KEY_get_nonce_from_hash(const EC_KEY *key)
+ {
+ return key->nonce_from_hash_flag;
+ }
+
+void EC_KEY_set_nonce_from_hash(EC_KEY *key, int on)
+ {
+ key->nonce_from_hash_flag = on != 0;
+ }
+
point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key)
{
return key->conv_form;
@@ -435,18 +532,27 @@ void EC_KEY_set_conv_form(EC_KEY *key, point_conversion_form_t cform)
void *EC_KEY_get_key_method_data(EC_KEY *key,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *))
{
- return EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
+ void *ret;
+
+ CRYPTO_r_lock(CRYPTO_LOCK_EC);
+ ret = EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
+ CRYPTO_r_unlock(CRYPTO_LOCK_EC);
+
+ return ret;
}
-void EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
+void *EC_KEY_insert_key_method_data(EC_KEY *key, void *data,
void *(*dup_func)(void *), void (*free_func)(void *), void (*clear_free_func)(void *))
{
EC_EXTRA_DATA *ex_data;
+
CRYPTO_w_lock(CRYPTO_LOCK_EC);
ex_data = EC_EX_DATA_get_data(key->method_data, dup_func, free_func, clear_free_func);
if (ex_data == NULL)
EC_EX_DATA_set_data(&key->method_data, data, dup_func, free_func, clear_free_func);
CRYPTO_w_unlock(CRYPTO_LOCK_EC);
+
+ return ex_data;
}
void EC_KEY_set_asn1_flag(EC_KEY *key, int flag)
@@ -461,3 +567,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx)
return 0;
return EC_GROUP_precompute_mult(key->group, ctx);
}
+
+int EC_KEY_get_flags(const EC_KEY *key)
+ {
+ return key->flags;
+ }
+
+void EC_KEY_set_flags(EC_KEY *key, int flags)
+ {
+ key->flags |= flags;
+ }
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags)
+ {
+ key->flags &= ~flags;
+ }
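A usage sketch for the new coordinate-validation helper (curve choice and surrounding error handling are our assumptions, not part of the patch):

#include <openssl/ec.h>
#include <openssl/obj_mac.h>

/* x and y are assumed to come from an untrusted peer; the helper rejects
 * values that are out of range or not on the curve, per the BN_cmp
 * round-trip check above. */
static EC_KEY *load_peer_key(BIGNUM *x, BIGNUM *y)
	{
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);

	if (key == NULL)
		return NULL;
	if (!EC_KEY_set_public_key_affine_coordinates(key, x, y))
		{
		EC_KEY_free(key);
		return NULL;
		}
	return key;
	}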
diff --git a/main/openssl/crypto/ec/ec_lcl.h b/main/openssl/crypto/ec/ec_lcl.h
index 3e2c34b0..6f714c75 100644
--- a/main/openssl/crypto/ec/ec_lcl.h
+++ b/main/openssl/crypto/ec/ec_lcl.h
@@ -3,7 +3,7 @@
* Originally written by Bodo Moeller for the OpenSSL project.
*/
/* ====================================================================
- * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -82,10 +82,15 @@
# endif
#endif
+/* Use default functions for point2oct, oct2point and compressed coordinates */
+#define EC_FLAGS_DEFAULT_OCT 0x1
+
/* Structure details are not part of the exported interface,
* so all this may change in future versions. */
struct ec_method_st {
+ /* Various method flags */
+ int flags;
/* used by EC_METHOD_get_field_type: */
int field_type; /* a NID */
@@ -241,9 +246,11 @@ struct ec_key_st {
BIGNUM *priv_key;
unsigned int enc_flag;
+ char nonce_from_hash_flag;
point_conversion_form_t conv_form;
int references;
+ int flags;
EC_EXTRA_DATA *method_data;
} /* EC_KEY */;
@@ -391,3 +398,50 @@ int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ec2_mult.c */
+int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
+ size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* method functions in ecp_nistp224.c */
+int ec_GFp_nistp224_group_init(EC_GROUP *group);
+int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp224_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp256.c */
+int ec_GFp_nistp256_group_init(EC_GROUP *group);
+int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp256_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp521.c */
+int ec_GFp_nistp521_group_init(EC_GROUP *group);
+int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp521_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group);
+
+/* utility functions in ecp_nistputil.c */
+void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
+ size_t felem_size, void *tmp_felems,
+ void (*felem_one)(void *out),
+ int (*felem_is_zero)(const void *in),
+ void (*felem_assign)(void *out, const void *in),
+ void (*felem_square)(void *out, const void *in),
+ void (*felem_mul)(void *out, const void *in1, const void *in2),
+ void (*felem_inv)(void *out, const void *in),
+ void (*felem_contract)(void *out, const void *in));
+void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in);
+#endif
diff --git a/main/openssl/crypto/ec/ec_lib.c b/main/openssl/crypto/ec/ec_lib.c
index dd7da0fc..de9a0cc2 100644
--- a/main/openssl/crypto/ec/ec_lib.c
+++ b/main/openssl/crypto/ec/ec_lib.c
@@ -425,7 +425,7 @@ int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *
return group->meth->group_get_curve(group, p, a, b, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
if (group->meth->group_set_curve == 0)
@@ -446,7 +446,7 @@ int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM
}
return group->meth->group_get_curve(group, p, a, b, ctx);
}
-
+#endif
int EC_GROUP_get_degree(const EC_GROUP *group)
{
@@ -480,10 +480,10 @@ int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx)
if (EC_METHOD_get_field_type(EC_GROUP_method_of(a)) !=
EC_METHOD_get_field_type(EC_GROUP_method_of(b)))
return 1;
- /* compare the curve name (if present) */
+ /* compare the curve name (if present in both) */
if (EC_GROUP_get_curve_name(a) && EC_GROUP_get_curve_name(b) &&
- EC_GROUP_get_curve_name(a) == EC_GROUP_get_curve_name(b))
- return 0;
+ EC_GROUP_get_curve_name(a) != EC_GROUP_get_curve_name(b))
+ return 1;
if (!ctx)
ctx_new = ctx = BN_CTX_new();
@@ -856,7 +856,7 @@ int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx)
{
@@ -872,7 +872,7 @@ int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
}
return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
}
-
+#endif
int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *point,
BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
@@ -890,7 +890,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *p
return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
}
-
+#ifndef OPENSSL_NO_EC2M
int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *point,
BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
{
@@ -906,75 +906,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *
}
return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
}
-
-
-int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x, int y_bit, BN_CTX *ctx)
- {
- if (group->meth->point_set_compressed_coordinates == 0)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
- }
-
-
-int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x, int y_bit, BN_CTX *ctx)
- {
- if (group->meth->point_set_compressed_coordinates == 0)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
- }
-
-
-size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- if (group->meth->point2oct == 0)
- {
- ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->point2oct(group, point, form, buf, len, ctx);
- }
-
-
-int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- if (group->meth->oct2point == 0)
- {
- ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
- }
- if (group->meth != point->meth)
- {
- ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
- }
- return group->meth->oct2point(group, point, buf, len, ctx);
- }
-
+#endif
int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
{
@@ -1061,12 +993,12 @@ int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, BN
if (group->meth->point_cmp == 0)
{
ECerr(EC_F_EC_POINT_CMP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
- return 0;
+ return -1;
}
if ((group->meth != a->meth) || (a->meth != b->meth))
{
ECerr(EC_F_EC_POINT_CMP, EC_R_INCOMPATIBLE_OBJECTS);
- return 0;
+ return -1;
}
return group->meth->point_cmp(group, a, b, ctx);
}
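The two hunks above tighten the comparison API contracts: EC_GROUP_cmp()
now uses the curve NID only as a fast path for inequality (two groups with
different names cannot be equal, while identical names still fall through
to the full field/curve comparison), and EC_POINT_cmp() reports failure as
-1 instead of 0, since 0 already means "equal". A minimal caller sketch
(not part of the patch) showing the three-way result that code built on
this diff should expect:

    int r = EC_POINT_cmp(group, a, b, ctx);
    if (r == -1)
            ;       /* error, e.g. incompatible objects */
    else if (r == 0)
            ;       /* points are equal */
    else
            ;       /* points differ */

A bare "if (!EC_POINT_cmp(...))" would have treated the old error return
as a match, which is exactly the ambiguity the hunk removes.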
diff --git a/main/openssl/crypto/ec/ec_oct.c b/main/openssl/crypto/ec/ec_oct.c
new file mode 100644
index 00000000..fd9db079
--- /dev/null
+++ b/main/openssl/crypto/ec/ec_oct.c
@@ -0,0 +1,199 @@
+/* crypto/ec/ec_oct.c */
+/*
+ * Originally written by Bodo Moeller for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Binary polynomial ECC support in OpenSSL originally developed by
+ * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project.
+ */
+
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/opensslv.h>
+
+#include "ec_lcl.h"
+
+int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x, int y_bit, BN_CTX *ctx)
+ {
+ if (group->meth->point_set_compressed_coordinates == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+#endif
+ }
+ return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+ }
+
+#ifndef OPENSSL_NO_EC2M
+int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x, int y_bit, BN_CTX *ctx)
+ {
+ if (group->meth->point_set_compressed_coordinates == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ else
+ return ec_GF2m_simple_set_compressed_coordinates(
+ group, point, x, y_bit, ctx);
+ }
+ return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+ }
+#endif
+
+size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ if (group->meth->point2oct == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_point2oct(group, point,
+ form, buf, len, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_point2oct(group, point,
+ form, buf, len, ctx);
+#endif
+ }
+
+ return group->meth->point2oct(group, point, form, buf, len, ctx);
+ }
+
+
+int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ if (group->meth->oct2point == 0
+ && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return 0;
+ }
+ if (group->meth != point->meth)
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
+ return 0;
+ }
+ if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+ {
+ if (group->meth->field_type == NID_X9_62_prime_field)
+ return ec_GFp_simple_oct2point(group, point,
+ buf, len, ctx);
+ else
+#ifdef OPENSSL_NO_EC2M
+ {
+ ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED);
+ return 0;
+ }
+#else
+ return ec_GF2m_simple_oct2point(group, point,
+ buf, len, ctx);
+#endif
+ }
+ return group->meth->oct2point(group, point, buf, len, ctx);
+ }
+
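ec_oct.c collects the octet-string entry points that used to live in
ec_lib.c. The new wrinkle is the EC_FLAGS_DEFAULT_OCT method flag: when a
method sets it, these wrappers dispatch straight to the shared
ec_GFp_simple_* (or ec_GF2m_simple_*) routines, which is why the method
tables in ecp_mont.c, ecp_nist.c and ecp_smpl.c below can leave their
point_set_compressed_coordinates/point2oct/oct2point slots zeroed. A
hedged round-trip sketch (not part of the patch) exercising the
compressed encoding through this path:

    #include <openssl/ec.h>
    #include <openssl/obj_mac.h>

    int roundtrip_compressed(void)
        {
        EC_GROUP *g = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
        EC_POINT *p = g ? EC_POINT_new(g) : NULL;
        unsigned char buf[1 + 32];  /* 1 + field_len for P-256 */
        size_t len;
        int ok = 0;

        if (p != NULL)
            {
            /* encode the generator, then decode it back */
            len = EC_POINT_point2oct(g, EC_GROUP_get0_generator(g),
                    POINT_CONVERSION_COMPRESSED, buf, sizeof(buf), NULL);
            if (len != 0 && EC_POINT_oct2point(g, p, buf, len, NULL))
                ok = (EC_POINT_cmp(g, p,
                        EC_GROUP_get0_generator(g), NULL) == 0);
            }
        EC_POINT_free(p);
        EC_GROUP_free(g);
        return ok;
        }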
diff --git a/main/openssl/crypto/ec/ec_pmeth.c b/main/openssl/crypto/ec/ec_pmeth.c
index f433076c..66ee397d 100644
--- a/main/openssl/crypto/ec/ec_pmeth.c
+++ b/main/openssl/crypto/ec/ec_pmeth.c
@@ -188,7 +188,7 @@ static int pkey_ec_derive(EVP_PKEY_CTX *ctx, unsigned char *key, size_t *keylen)
pubkey = EC_KEY_get0_public_key(ctx->peerkey->pkey.ec);
- /* NB: unlike PKS#3 DH, if *outlen is less than maximum size this is
+ /* NB: unlike PKCS#3 DH, if *outlen is less than maximum size this is
* not an error, the result is truncated.
*/
@@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
case EVP_PKEY_CTRL_MD:
if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 &&
+ EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
diff --git a/main/openssl/crypto/ec/eck_prn.c b/main/openssl/crypto/ec/eck_prn.c
index 7d3e175a..06de8f39 100644
--- a/main/openssl/crypto/ec/eck_prn.c
+++ b/main/openssl/crypto/ec/eck_prn.c
@@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
reason = ERR_R_MALLOC_FAILURE;
goto err;
}
-
+#ifndef OPENSSL_NO_EC2M
if (is_char_two)
{
if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx))
@@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
}
}
else /* prime field */
+#endif
{
if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx))
{
diff --git a/main/openssl/crypto/ec/ecp_mont.c b/main/openssl/crypto/ec/ecp_mont.c
index 9fc4a466..f04f132c 100644
--- a/main/openssl/crypto/ec/ecp_mont.c
+++ b/main/openssl/crypto/ec/ecp_mont.c
@@ -63,12 +63,20 @@
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
#include "ec_lcl.h"
const EC_METHOD *EC_GFp_mont_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_mont_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_mont_group_init,
ec_GFp_mont_group_finish,
@@ -87,9 +95,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -109,6 +115,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
ec_GFp_mont_field_set_to_one };
return &ret;
+#endif
}
diff --git a/main/openssl/crypto/ec/ecp_nist.c b/main/openssl/crypto/ec/ecp_nist.c
index 2a5682ea..aad2d5f4 100644
--- a/main/openssl/crypto/ec/ecp_nist.c
+++ b/main/openssl/crypto/ec/ecp_nist.c
@@ -67,9 +67,17 @@
#include <openssl/obj_mac.h>
#include "ec_lcl.h"
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const EC_METHOD *EC_GFp_nist_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_nist_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_simple_group_init,
ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
int ec_GFp_nist_group_copy(EC_GROUP *dest, const EC_GROUP *src)
diff --git a/main/openssl/crypto/ec/ecp_oct.c b/main/openssl/crypto/ec/ecp_oct.c
new file mode 100644
index 00000000..374a0ee7
--- /dev/null
+++ b/main/openssl/crypto/ec/ecp_oct.c
@@ -0,0 +1,433 @@
+/* crypto/ec/ecp_oct.c */
+/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * for the OpenSSL project.
+ * Includes code written by Bodo Moeller for the OpenSSL project.
+*/
+/* ====================================================================
+ * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Portions of this software developed by SUN MICROSYSTEMS, INC.,
+ * and contributed to the OpenSSL project.
+ */
+
+#include <openssl/err.h>
+#include <openssl/symhacks.h>
+
+#include "ec_lcl.h"
+
+int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+ const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+ {
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *tmp1, *tmp2, *x, *y;
+ int ret = 0;
+
+ /* clear error queue*/
+ ERR_clear_error();
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ y_bit = (y_bit != 0);
+
+ BN_CTX_start(ctx);
+ tmp1 = BN_CTX_get(ctx);
+ tmp2 = BN_CTX_get(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ /* Recover y. We have a Weierstrass equation
+ * y^2 = x^3 + a*x + b,
+ * so y is one of the square roots of x^3 + a*x + b.
+ */
+
+ /* tmp1 := x^3 */
+ if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
+ if (group->meth->field_decode == 0)
+ {
+ /* field_{sqr,mul} work on standard representation */
+ if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
+ if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
+ if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
+ }
+
+ /* tmp1 := tmp1 + a*x */
+ if (group->a_is_minus3)
+ {
+ if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
+ if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
+ if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+ else
+ {
+ if (group->meth->field_decode)
+ {
+ if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
+ if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
+ }
+ else
+ {
+ /* field_mul works on standard representation */
+ if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
+ }
+
+ if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+
+ /* tmp1 := tmp1 + b */
+ if (group->meth->field_decode)
+ {
+ if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
+ if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+ }
+ else
+ {
+ if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
+ }
+
+ if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
+ {
+ unsigned long err = ERR_peek_last_error();
+
+ if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
+ {
+ ERR_clear_error();
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ }
+ else
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+ goto err;
+ }
+
+ if (y_bit != BN_is_odd(y))
+ {
+ if (BN_is_zero(y))
+ {
+ int kron;
+
+ kron = BN_kronecker(x, &group->field, ctx);
+ if (kron == -2) goto err;
+
+ if (kron == 1)
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
+ else
+				/* BN_mod_sqrt() should have caught this error (not a square) */
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+ goto err;
+ }
+ if (!BN_usub(y, &group->field, y)) goto err;
+ }
+ if (y_bit != BN_is_odd(y))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
+
+size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+ unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ size_t ret;
+ BN_CTX *new_ctx = NULL;
+ int used_ctx = 0;
+ BIGNUM *x, *y;
+ size_t field_len, i, skip;
+
+ if ((form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+ goto err;
+ }
+
+ if (EC_POINT_is_at_infinity(group, point))
+ {
+ /* encodes to a single 0 octet */
+ if (buf != NULL)
+ {
+ if (len < 1)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ buf[0] = 0;
+ }
+ return 1;
+ }
+
+
+ /* ret := required output buffer length */
+ field_len = BN_num_bytes(&group->field);
+ ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ /* if 'buf' is NULL, just return required length */
+ if (buf != NULL)
+ {
+ if (len < ret)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+ goto err;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ used_ctx = 1;
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+ if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
+ buf[0] = form + 1;
+ else
+ buf[0] = form;
+
+ i = 1;
+
+ skip = field_len - BN_num_bytes(x);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(x, buf + i);
+ i += skip;
+ if (i != 1 + field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+ {
+ skip = field_len - BN_num_bytes(y);
+ if (skip > field_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ while (skip > 0)
+ {
+ buf[i++] = 0;
+ skip--;
+ }
+ skip = BN_bn2bin(y, buf + i);
+ i += skip;
+ }
+
+ if (i != ret)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+ goto err;
+ }
+ }
+
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+
+ err:
+ if (used_ctx)
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return 0;
+ }
+
+
+int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+ const unsigned char *buf, size_t len, BN_CTX *ctx)
+ {
+ point_conversion_form_t form;
+ int y_bit;
+ BN_CTX *new_ctx = NULL;
+ BIGNUM *x, *y;
+ size_t field_len, enc_len;
+ int ret = 0;
+
+ if (len == 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+ return 0;
+ }
+ form = buf[0];
+ y_bit = form & 1;
+ form = form & ~1U;
+ if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
+ && (form != POINT_CONVERSION_UNCOMPRESSED)
+ && (form != POINT_CONVERSION_HYBRID))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+ if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (form == 0)
+ {
+ if (len != 1)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ return EC_POINT_set_to_infinity(group, point);
+ }
+
+ field_len = BN_num_bytes(&group->field);
+ enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+ if (len != enc_len)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ return 0;
+ }
+
+ if (ctx == NULL)
+ {
+ ctx = new_ctx = BN_CTX_new();
+ if (ctx == NULL)
+ return 0;
+ }
+
+ BN_CTX_start(ctx);
+ x = BN_CTX_get(ctx);
+ y = BN_CTX_get(ctx);
+ if (y == NULL) goto err;
+
+ if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+ if (BN_ucmp(x, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+
+ if (form == POINT_CONVERSION_COMPRESSED)
+ {
+ if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
+ }
+ else
+ {
+ if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+ if (BN_ucmp(y, &group->field) >= 0)
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ if (form == POINT_CONVERSION_HYBRID)
+ {
+ if (y_bit != BN_is_odd(y))
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+ goto err;
+ }
+ }
+
+ if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+ }
+
+ if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+ {
+ ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+ goto err;
+ }
+
+ ret = 1;
+
+ err:
+ BN_CTX_end(ctx);
+ if (new_ctx != NULL)
+ BN_CTX_free(new_ctx);
+ return ret;
+ }
+
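Two notes on the routines above. ec_GFp_simple_set_compressed_coordinates()
recovers y from the curve equation: over GF(p), y^2 = x^3 + a*x + b, so y
is one of the two square roots of the right-hand side, and the encoding's
parity bit (y_bit) picks between y and p - y. The encodings themselves
follow X9.62: a lone 0x00 octet for the point at infinity, 0x02/0x03
followed by x (field_len bytes) for compressed form, 0x04 followed by x
and y for uncompressed, 0x06/0x07 for hybrid; hence the expected lengths
are 1, 1 + field_len and 1 + 2*field_len. A short sketch (not from the
patch) of the length-query idiom the code supports when buf is NULL:

    size_t need = EC_POINT_point2oct(group, point,
            POINT_CONVERSION_UNCOMPRESSED, NULL, 0, ctx);
    unsigned char *buf = OPENSSL_malloc(need);  /* 1 + 2*field_len */
    if (buf != NULL
        && EC_POINT_point2oct(group, point, POINT_CONVERSION_UNCOMPRESSED,
                buf, need, ctx) == need)
        {
        /* buf[0] == 0x04; X and Y follow, field_len bytes each */
        }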
diff --git a/main/openssl/crypto/ec/ecp_smpl.c b/main/openssl/crypto/ec/ecp_smpl.c
index 66a92e2a..7cbb321f 100644
--- a/main/openssl/crypto/ec/ecp_smpl.c
+++ b/main/openssl/crypto/ec/ecp_smpl.c
@@ -65,11 +65,19 @@
#include <openssl/err.h>
#include <openssl/symhacks.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
#include "ec_lcl.h"
const EC_METHOD *EC_GFp_simple_method(void)
{
+#ifdef OPENSSL_FIPS
+ return fips_ec_gfp_simple_method();
+#else
static const EC_METHOD ret = {
+ EC_FLAGS_DEFAULT_OCT,
NID_X9_62_prime_field,
ec_GFp_simple_group_init,
ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
ec_GFp_simple_get_Jprojective_coordinates_GFp,
ec_GFp_simple_point_set_affine_coordinates,
ec_GFp_simple_point_get_affine_coordinates,
- ec_GFp_simple_set_compressed_coordinates,
- ec_GFp_simple_point2oct,
- ec_GFp_simple_oct2point,
+ 0,0,0,
ec_GFp_simple_add,
ec_GFp_simple_dbl,
ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
0 /* field_set_to_one */ };
return &ret;
+#endif
}
@@ -633,372 +640,6 @@ int ec_GFp_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_P
return ret;
}
-
-int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
- const BIGNUM *x_, int y_bit, BN_CTX *ctx)
- {
- BN_CTX *new_ctx = NULL;
- BIGNUM *tmp1, *tmp2, *x, *y;
- int ret = 0;
-
- /* clear error queue*/
- ERR_clear_error();
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- y_bit = (y_bit != 0);
-
- BN_CTX_start(ctx);
- tmp1 = BN_CTX_get(ctx);
- tmp2 = BN_CTX_get(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- /* Recover y. We have a Weierstrass equation
- * y^2 = x^3 + a*x + b,
- * so y is one of the square roots of x^3 + a*x + b.
- */
-
- /* tmp1 := x^3 */
- if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
- if (group->meth->field_decode == 0)
- {
- /* field_{sqr,mul} work on standard representation */
- if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
- if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
- }
- else
- {
- if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
- if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
- }
-
- /* tmp1 := tmp1 + a*x */
- if (group->a_is_minus3)
- {
- if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
- if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
- if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
- else
- {
- if (group->meth->field_decode)
- {
- if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
- if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
- }
- else
- {
- /* field_mul works on standard representation */
- if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
- }
-
- if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
-
- /* tmp1 := tmp1 + b */
- if (group->meth->field_decode)
- {
- if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
- if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
- }
- else
- {
- if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
- }
-
- if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
- {
- unsigned long err = ERR_peek_last_error();
-
- if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
- {
- ERR_clear_error();
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- }
- else
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
- goto err;
- }
-
- if (y_bit != BN_is_odd(y))
- {
- if (BN_is_zero(y))
- {
- int kron;
-
- kron = BN_kronecker(x, &group->field, ctx);
- if (kron == -2) goto err;
-
- if (kron == 1)
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
- else
- /* BN_mod_sqrt() should have cought this error (not a square) */
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
- goto err;
- }
- if (!BN_usub(y, &group->field, y)) goto err;
- }
- if (y_bit != BN_is_odd(y))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
-size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
- unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- size_t ret;
- BN_CTX *new_ctx = NULL;
- int used_ctx = 0;
- BIGNUM *x, *y;
- size_t field_len, i, skip;
-
- if ((form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
- goto err;
- }
-
- if (EC_POINT_is_at_infinity(group, point))
- {
- /* encodes to a single 0 octet */
- if (buf != NULL)
- {
- if (len < 1)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- buf[0] = 0;
- }
- return 1;
- }
-
-
- /* ret := required output buffer length */
- field_len = BN_num_bytes(&group->field);
- ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- /* if 'buf' is NULL, just return required length */
- if (buf != NULL)
- {
- if (len < ret)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
- goto err;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- used_ctx = 1;
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
- if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
- buf[0] = form + 1;
- else
- buf[0] = form;
-
- i = 1;
-
- skip = field_len - BN_num_bytes(x);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(x, buf + i);
- i += skip;
- if (i != 1 + field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
-
- if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
- {
- skip = field_len - BN_num_bytes(y);
- if (skip > field_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- while (skip > 0)
- {
- buf[i++] = 0;
- skip--;
- }
- skip = BN_bn2bin(y, buf + i);
- i += skip;
- }
-
- if (i != ret)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
- goto err;
- }
- }
-
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
-
- err:
- if (used_ctx)
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return 0;
- }
-
-
-int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
- const unsigned char *buf, size_t len, BN_CTX *ctx)
- {
- point_conversion_form_t form;
- int y_bit;
- BN_CTX *new_ctx = NULL;
- BIGNUM *x, *y;
- size_t field_len, enc_len;
- int ret = 0;
-
- if (len == 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
- return 0;
- }
- form = buf[0];
- y_bit = form & 1;
- form = form & ~1U;
- if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED)
- && (form != POINT_CONVERSION_UNCOMPRESSED)
- && (form != POINT_CONVERSION_HYBRID))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
- if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (form == 0)
- {
- if (len != 1)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- return EC_POINT_set_to_infinity(group, point);
- }
-
- field_len = BN_num_bytes(&group->field);
- enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
- if (len != enc_len)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- return 0;
- }
-
- if (ctx == NULL)
- {
- ctx = new_ctx = BN_CTX_new();
- if (ctx == NULL)
- return 0;
- }
-
- BN_CTX_start(ctx);
- x = BN_CTX_get(ctx);
- y = BN_CTX_get(ctx);
- if (y == NULL) goto err;
-
- if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
- if (BN_ucmp(x, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
-
- if (form == POINT_CONVERSION_COMPRESSED)
- {
- if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
- }
- else
- {
- if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
- if (BN_ucmp(y, &group->field) >= 0)
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- if (form == POINT_CONVERSION_HYBRID)
- {
- if (y_bit != BN_is_odd(y))
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
- goto err;
- }
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
- }
-
- if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
- {
- ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
- goto err;
- }
-
- ret = 1;
-
- err:
- BN_CTX_end(ctx);
- if (new_ctx != NULL)
- BN_CTX_free(new_ctx);
- return ret;
- }
-
-
int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
{
int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *, BN_CTX *);
diff --git a/main/openssl/crypto/ec/ectest.c b/main/openssl/crypto/ec/ectest.c
index 7509cb9c..102eaa9b 100644
--- a/main/openssl/crypto/ec/ectest.c
+++ b/main/openssl/crypto/ec/ectest.c
@@ -94,6 +94,7 @@ int main(int argc, char * argv[]) { puts("Elliptic curves are disabled."); retur
#include <openssl/objects.h>
#include <openssl/rand.h>
#include <openssl/bn.h>
+#include <openssl/opensslconf.h>
#if defined(_MSC_VER) && defined(_MIPS_) && (_MSC_VER/100==12)
/* suppress "too big too optimize" warning */
@@ -107,10 +108,6 @@ int main(int argc, char * argv[]) { puts("Elliptic curves are disabled."); retur
EXIT(1); \
} while (0)
-void prime_field_tests(void);
-void char2_field_tests(void);
-void internal_curve_test(void);
-
#define TIMING_BASE_PT 0
#define TIMING_RAND_PT 1
#define TIMING_SIMUL 2
@@ -195,8 +192,51 @@ static void timings(EC_GROUP *group, int type, BN_CTX *ctx)
}
#endif
-void prime_field_tests()
- {
+/* test multiplication with group order, long and negative scalars */
+static void group_order_tests(EC_GROUP *group)
+ {
+ BIGNUM *n1, *n2, *order;
+ EC_POINT *P = EC_POINT_new(group);
+ EC_POINT *Q = EC_POINT_new(group);
+ BN_CTX *ctx = BN_CTX_new();
+
+ n1 = BN_new(); n2 = BN_new(); order = BN_new();
+ fprintf(stdout, "verify group order ...");
+ fflush(stdout);
+ if (!EC_GROUP_get_order(group, order, ctx)) ABORT;
+ if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+ if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+ fprintf(stdout, ".");
+ fflush(stdout);
+ if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
+ if (!EC_POINT_mul(group, Q, order, NULL, NULL, ctx)) ABORT;
+ if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
+ fprintf(stdout, " ok\n");
+ fprintf(stdout, "long/negative scalar tests ... ");
+ if (!BN_one(n1)) ABORT;
+ /* n1 = 1 - order */
+ if (!BN_sub(n1, n1, order)) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n1, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ /* n2 = 1 + order */
+ if (!BN_add(n2, order, BN_value_one())) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ /* n2 = (1 - order) * (1 + order) */
+ if (!BN_mul(n2, n1, n2, ctx)) ABORT;
+ if(!EC_POINT_mul(group, Q, NULL, P, n2, ctx)) ABORT;
+ if (0 != EC_POINT_cmp(group, Q, P, ctx)) ABORT;
+ fprintf(stdout, "ok\n");
+ EC_POINT_free(P);
+ EC_POINT_free(Q);
+ BN_free(n1);
+ BN_free(n2);
+ BN_free(order);
+ BN_CTX_free(ctx);
+ }
+
+static void prime_field_tests(void)
+ {
BN_CTX *ctx = NULL;
BIGNUM *p, *a, *b;
EC_GROUP *group;
@@ -321,21 +361,21 @@ void prime_field_tests()
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "Generator as octect string, compressed form:\n ");
+ fprintf(stdout, "Generator as octet string, compressed form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_UNCOMPRESSED, buf, sizeof buf, ctx);
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "\nGenerator as octect string, uncompressed form:\n ");
+ fprintf(stdout, "\nGenerator as octet string, uncompressed form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
len = EC_POINT_point2oct(group, Q, POINT_CONVERSION_HYBRID, buf, sizeof buf, ctx);
if (len == 0) ABORT;
if (!EC_POINT_oct2point(group, P, buf, len, ctx)) ABORT;
if (0 != EC_POINT_cmp(group, P, Q, ctx)) ABORT;
- fprintf(stdout, "\nGenerator as octect string, hybrid form:\n ");
+ fprintf(stdout, "\nGenerator as octet string, hybrid form:\n ");
for (i = 0; i < len; i++) fprintf(stdout, "%02X", buf[i]);
if (!EC_POINT_get_Jprojective_coordinates_GFp(group, R, x, y, z, ctx)) ABORT;
@@ -381,17 +421,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 160) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_160 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_160, group)) ABORT;
@@ -425,17 +455,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 192) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_192 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_192, group)) ABORT;
@@ -469,17 +489,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 224) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_224 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_224, group)) ABORT;
@@ -514,17 +524,7 @@ void prime_field_tests()
if (EC_GROUP_get_degree(group) != 256) ABORT;
fprintf(stdout, " ok\n");
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+ group_order_tests(group);
if (!(P_256 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_256, group)) ABORT;
@@ -563,18 +563,8 @@ void prime_field_tests()
fprintf(stdout, "verify degree ...");
if (EC_GROUP_get_degree(group) != 384) ABORT;
fprintf(stdout, " ok\n");
-
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+
+ group_order_tests(group);
if (!(P_384 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_384, group)) ABORT;
@@ -619,18 +609,8 @@ void prime_field_tests()
fprintf(stdout, "verify degree ...");
if (EC_GROUP_get_degree(group) != 521) ABORT;
fprintf(stdout, " ok\n");
-
- fprintf(stdout, "verify group order ...");
- fflush(stdout);
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, ".");
- fflush(stdout);
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT;
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT;
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT;
- fprintf(stdout, " ok\n");
+
+ group_order_tests(group);
if (!(P_521 = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT;
if (!EC_GROUP_copy(P_521, group)) ABORT;
@@ -659,6 +639,7 @@ void prime_field_tests()
points[2] = Q;
points[3] = Q;
+ if (!EC_GROUP_get_order(group, z, ctx)) ABORT;
if (!BN_add(y, z, BN_value_one())) ABORT;
if (BN_is_odd(y)) ABORT;
if (!BN_rshift1(y, y)) ABORT;
@@ -792,22 +773,14 @@ void prime_field_tests()
fprintf(stdout, "verify degree ..."); \
if (EC_GROUP_get_degree(group) != _degree) ABORT; \
fprintf(stdout, " ok\n"); \
- fprintf(stdout, "verify group order ..."); \
- fflush(stdout); \
- if (!EC_GROUP_get_order(group, z, ctx)) ABORT; \
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
- fprintf(stdout, "."); \
- fflush(stdout); \
- if (!EC_GROUP_precompute_mult(group, ctx)) ABORT; \
- if (!EC_POINT_mul(group, Q, z, NULL, NULL, ctx)) ABORT; \
- if (!EC_POINT_is_at_infinity(group, Q)) ABORT; \
- fprintf(stdout, " ok\n"); \
+ group_order_tests(group); \
if (!(_variable = EC_GROUP_new(EC_GROUP_method_of(group)))) ABORT; \
- if (!EC_GROUP_copy(_variable, group)) ABORT;
+ if (!EC_GROUP_copy(_variable, group)) ABORT; \
-void char2_field_tests()
- {
+#ifndef OPENSSL_NO_EC2M
+
+static void char2_field_tests(void)
+ {
BN_CTX *ctx = NULL;
BIGNUM *p, *a, *b;
EC_GROUP *group;
@@ -1239,8 +1212,9 @@ void char2_field_tests()
if (C2_B571) EC_GROUP_free(C2_B571);
}
+#endif
-void internal_curve_test(void)
+static void internal_curve_test(void)
{
EC_builtin_curve *curves = NULL;
size_t crv_len = 0, n = 0;
@@ -1287,13 +1261,189 @@ void internal_curve_test(void)
EC_GROUP_free(group);
}
if (ok)
- fprintf(stdout, " ok\n");
+ fprintf(stdout, " ok\n\n");
else
- fprintf(stdout, " failed\n");
+ {
+ fprintf(stdout, " failed\n\n");
+ ABORT;
+ }
OPENSSL_free(curves);
return;
}
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/* nistp_test_params contains magic numbers for testing our optimized
+ * implementations of several NIST curves with characteristic > 3. */
+struct nistp_test_params
+ {
+ const EC_METHOD* (*meth) ();
+ int degree;
+ /* Qx, Qy and D are taken from
+ * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/ECDSA_Prime.pdf
+	 * Otherwise, values are standard curve parameters from FIPS 186-3 */
+ const char *p, *a, *b, *Qx, *Qy, *Gx, *Gy, *order, *d;
+ };
+
+static const struct nistp_test_params nistp_tests_params[] =
+ {
+ {
+ /* P-224 */
+ EC_GFp_nistp224_method,
+ 224,
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF000000000000000000000001", /* p */
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFE", /* a */
+ "B4050A850C04B3ABF54132565044B0B7D7BFD8BA270B39432355FFB4", /* b */
+ "E84FB0B8E7000CB657D7973CF6B42ED78B301674276DF744AF130B3E", /* Qx */
+ "4376675C6FC5612C21A0FF2D2A89D2987DF7A2BC52183B5982298555", /* Qy */
+ "B70E0CBD6BB4BF7F321390B94A03C1D356C21122343280D6115C1D21", /* Gx */
+ "BD376388B5F723FB4C22DFE6CD4375A05A07476444D5819985007E34", /* Gy */
+ "FFFFFFFFFFFFFFFFFFFFFFFFFFFF16A2E0B8F03E13DD29455C5C2A3D", /* order */
+ "3F0C488E987C80BE0FEE521F8D90BE6034EC69AE11CA72AA777481E8", /* d */
+ },
+ {
+ /* P-256 */
+ EC_GFp_nistp256_method,
+ 256,
+ "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", /* p */
+ "ffffffff00000001000000000000000000000000fffffffffffffffffffffffc", /* a */
+ "5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", /* b */
+ "b7e08afdfe94bad3f1dc8c734798ba1c62b3a0ad1e9ea2a38201cd0889bc7a19", /* Qx */
+ "3603f747959dbf7a4bb226e41928729063adc7ae43529e61b563bbc606cc5e09", /* Qy */
+ "6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", /* Gx */
+ "4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", /* Gy */
+ "ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", /* order */
+ "c477f9f65c22cce20657faa5b2d1d8122336f851a508a1ed04e479c34985bf96", /* d */
+ },
+ {
+ /* P-521 */
+ EC_GFp_nistp521_method,
+ 521,
+ "1ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", /* p */
+ "1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffc", /* a */
+ "051953eb9618e1c9a1f929a21a0b68540eea2da725b99b315f3b8b489918ef109e156193951ec7e937b1652c0bd3bb1bf073573df883d2c34f1ef451fd46b503f00", /* b */
+ "0098e91eef9a68452822309c52fab453f5f117c1da8ed796b255e9ab8f6410cca16e59df403a6bdc6ca467a37056b1e54b3005d8ac030decfeb68df18b171885d5c4", /* Qx */
+ "0164350c321aecfc1cca1ba4364c9b15656150b4b78d6a48d7d28e7f31985ef17be8554376b72900712c4b83ad668327231526e313f5f092999a4632fd50d946bc2e", /* Qy */
+ "c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66", /* Gx */
+ "11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650", /* Gy */
+ "1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409", /* order */
+ "0100085f47b8e1b8b11b7eb33028c0b2888e304bfc98501955b45bba1478dc184eeedf09b86a5f7c21994406072787205e69a63709fe35aa93ba333514b24f961722", /* d */
+ },
+ };
+
+void nistp_single_test(const struct nistp_test_params *test)
+ {
+ BN_CTX *ctx;
+ BIGNUM *p, *a, *b, *x, *y, *n, *m, *order;
+ EC_GROUP *NISTP;
+ EC_POINT *G, *P, *Q, *Q_CHECK;
+
+ fprintf(stdout, "\nNIST curve P-%d (optimised implementation):\n", test->degree);
+ ctx = BN_CTX_new();
+ p = BN_new();
+ a = BN_new();
+ b = BN_new();
+ x = BN_new(); y = BN_new();
+ m = BN_new(); n = BN_new(); order = BN_new();
+
+ NISTP = EC_GROUP_new(test->meth());
+ if(!NISTP) ABORT;
+ if (!BN_hex2bn(&p, test->p)) ABORT;
+ if (1 != BN_is_prime_ex(p, BN_prime_checks, ctx, NULL)) ABORT;
+ if (!BN_hex2bn(&a, test->a)) ABORT;
+ if (!BN_hex2bn(&b, test->b)) ABORT;
+ if (!EC_GROUP_set_curve_GFp(NISTP, p, a, b, ctx)) ABORT;
+ G = EC_POINT_new(NISTP);
+ P = EC_POINT_new(NISTP);
+ Q = EC_POINT_new(NISTP);
+ Q_CHECK = EC_POINT_new(NISTP);
+ if(!BN_hex2bn(&x, test->Qx)) ABORT;
+ if(!BN_hex2bn(&y, test->Qy)) ABORT;
+ if(!EC_POINT_set_affine_coordinates_GFp(NISTP, Q_CHECK, x, y, ctx)) ABORT;
+ if (!BN_hex2bn(&x, test->Gx)) ABORT;
+ if (!BN_hex2bn(&y, test->Gy)) ABORT;
+ if (!EC_POINT_set_affine_coordinates_GFp(NISTP, G, x, y, ctx)) ABORT;
+ if (!BN_hex2bn(&order, test->order)) ABORT;
+ if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+
+ fprintf(stdout, "verify degree ... ");
+ if (EC_GROUP_get_degree(NISTP) != test->degree) ABORT;
+ fprintf(stdout, "ok\n");
+
+ fprintf(stdout, "NIST test vectors ... ");
+ if (!BN_hex2bn(&n, test->d)) ABORT;
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* set generator to P = 2*G, where G is the standard generator */
+ if (!EC_POINT_dbl(NISTP, P, G, ctx)) ABORT;
+ if (!EC_GROUP_set_generator(NISTP, P, order, BN_value_one())) ABORT;
+ /* set the scalar to m=n/2, where n is the NIST test scalar */
+ if (!BN_rshift(m, n, 1)) ABORT;
+
+ /* test the non-standard generator */
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* now repeat all tests with precomputation */
+ if (!EC_GROUP_precompute_mult(NISTP, ctx)) ABORT;
+
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, P, m, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ /* reset generator */
+ if (!EC_GROUP_set_generator(NISTP, G, order, BN_value_one())) ABORT;
+ /* fixed point multiplication */
+ EC_POINT_mul(NISTP, Q, n, NULL, NULL, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+ /* random point multiplication */
+ EC_POINT_mul(NISTP, Q, NULL, G, n, ctx);
+ if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT;
+
+ fprintf(stdout, "ok\n");
+ group_order_tests(NISTP);
+#if 0
+ timings(NISTP, TIMING_BASE_PT, ctx);
+ timings(NISTP, TIMING_RAND_PT, ctx);
+#endif
+ EC_GROUP_free(NISTP);
+ EC_POINT_free(G);
+ EC_POINT_free(P);
+ EC_POINT_free(Q);
+ EC_POINT_free(Q_CHECK);
+ BN_free(n);
+ BN_free(m);
+ BN_free(p);
+ BN_free(a);
+ BN_free(b);
+ BN_free(x);
+ BN_free(y);
+ BN_free(order);
+ BN_CTX_free(ctx);
+ }
+
+void nistp_tests()
+ {
+ unsigned i;
+
+ for (i = 0; i < sizeof(nistp_tests_params) / sizeof(struct nistp_test_params); i++)
+ {
+ nistp_single_test(&nistp_tests_params[i]);
+ }
+ }
+#endif
+
static const char rnd_seed[] = "string to make the random number generator think it has entropy";
int main(int argc, char *argv[])
@@ -1317,7 +1467,12 @@ int main(int argc, char *argv[])
prime_field_tests();
puts("");
+#ifndef OPENSSL_NO_EC2M
char2_field_tests();
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+ nistp_tests();
+#endif
/* test the internal curves */
internal_curve_test();
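The consolidated group_order_tests() relies on a small identity worth
spelling out: with n the group order, n*P = O (the point at infinity), so
(1 - n)*P = P - n*P = P and (1 + n)*P = P + n*P = P; the product
(1 - n)*(1 + n) = 1 - n^2 is likewise congruent to 1 modulo n, so it too
must map P back to itself. Each EC_POINT_cmp(group, Q, P, ctx) == 0 check
in the helper verifies one of these cases, both before and after
EC_GROUP_precompute_mult().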
diff --git a/main/openssl/crypto/ecdh/ecdh.h b/main/openssl/crypto/ecdh/ecdh.h
index b4b58ee6..8887102c 100644
--- a/main/openssl/crypto/ecdh/ecdh.h
+++ b/main/openssl/crypto/ecdh/ecdh.h
@@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void);
/* Error codes for the ECDH functions. */
/* Function codes. */
+#define ECDH_F_ECDH_CHECK 102
#define ECDH_F_ECDH_COMPUTE_KEY 100
#define ECDH_F_ECDH_DATA_NEW_METHOD 101
/* Reason codes. */
#define ECDH_R_KDF_FAILED 102
+#define ECDH_R_NON_FIPS_METHOD 103
#define ECDH_R_NO_PRIVATE_VALUE 100
#define ECDH_R_POINT_ARITHMETIC_FAILURE 101
diff --git a/main/openssl/crypto/ecdh/ecdhtest.c b/main/openssl/crypto/ecdh/ecdhtest.c
index 212a87ef..823d7baa 100644
--- a/main/openssl/crypto/ecdh/ecdhtest.c
+++ b/main/openssl/crypto/ecdh/ecdhtest.c
@@ -158,11 +158,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
if (!EC_POINT_get_affine_coordinates_GFp(group,
EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
EC_KEY_get0_public_key(a), x_a, y_a, ctx)) goto err;
}
+#endif
#ifdef NOISY
BIO_puts(out," pri 1=");
BN_print(out,a->priv_key);
@@ -183,11 +185,13 @@ static int test_ecdh_curve(int nid, const char *text, BN_CTX *ctx, BIO *out)
if (!EC_POINT_get_affine_coordinates_GFp(group,
EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
EC_KEY_get0_public_key(b), x_b, y_b, ctx)) goto err;
}
+#endif
#ifdef NOISY
BIO_puts(out," pri 2=");
@@ -324,6 +328,7 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_X9_62_prime256v1, "NIST Prime-Curve P-256", ctx, out)) goto err;
if (!test_ecdh_curve(NID_secp384r1, "NIST Prime-Curve P-384", ctx, out)) goto err;
if (!test_ecdh_curve(NID_secp521r1, "NIST Prime-Curve P-521", ctx, out)) goto err;
+#ifndef OPENSSL_NO_EC2M
/* NIST BINARY CURVES TESTS */
if (!test_ecdh_curve(NID_sect163k1, "NIST Binary-Curve K-163", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect163r2, "NIST Binary-Curve B-163", ctx, out)) goto err;
@@ -335,6 +340,7 @@ int main(int argc, char *argv[])
if (!test_ecdh_curve(NID_sect409r1, "NIST Binary-Curve B-409", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect571k1, "NIST Binary-Curve K-571", ctx, out)) goto err;
if (!test_ecdh_curve(NID_sect571r1, "NIST Binary-Curve B-571", ctx, out)) goto err;
+#endif
ret = 0;
diff --git a/main/openssl/crypto/ecdh/ech_err.c b/main/openssl/crypto/ecdh/ech_err.c
index 6f4b0c99..3bd24739 100644
--- a/main/openssl/crypto/ecdh/ech_err.c
+++ b/main/openssl/crypto/ecdh/ech_err.c
@@ -1,6 +1,6 @@
/* crypto/ecdh/ech_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA ECDH_str_functs[]=
{
+{ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"},
{ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"},
{ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"},
{0,NULL}
@@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]=
static ERR_STRING_DATA ECDH_str_reasons[]=
{
{ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"},
+{ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"},
{ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"},
{0,NULL}
diff --git a/main/openssl/crypto/ecdh/ech_key.c b/main/openssl/crypto/ecdh/ech_key.c
index f44da929..2988899e 100644
--- a/main/openssl/crypto/ecdh/ech_key.c
+++ b/main/openssl/crypto/ecdh/ech_key.c
@@ -68,9 +68,6 @@
*/
#include "ech_locl.h"
-#ifndef OPENSSL_NO_ENGINE
-#include <openssl/engine.h>
-#endif
int ECDH_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
EC_KEY *eckey,
diff --git a/main/openssl/crypto/ecdh/ech_lib.c b/main/openssl/crypto/ecdh/ech_lib.c
index 4d8ea03d..0644431b 100644
--- a/main/openssl/crypto/ecdh/ech_lib.c
+++ b/main/openssl/crypto/ecdh/ech_lib.c
@@ -73,6 +73,9 @@
#include <openssl/engine.h>
#endif
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT;
@@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth)
const ECDH_METHOD *ECDH_get_default_method(void)
{
if(!default_ECDH_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ecdh_openssl();
+ else
+ return ECDH_OpenSSL();
+#else
default_ECDH_method = ECDH_OpenSSL();
+#endif
+ }
return default_ECDH_method;
}
@@ -210,11 +222,26 @@ ECDH_DATA *ecdh_check(EC_KEY *key)
ecdh_data = (ECDH_DATA *)ecdh_data_new();
if (ecdh_data == NULL)
return NULL;
- EC_KEY_insert_key_method_data(key, (void *)ecdh_data,
- ecdh_data_dup, ecdh_data_free, ecdh_data_free);
+ data = EC_KEY_insert_key_method_data(key, (void *)ecdh_data,
+ ecdh_data_dup, ecdh_data_free, ecdh_data_free);
+ if (data != NULL)
+ {
+ /* Another thread raced us to install the key_method
+ * data and won. */
+ ecdh_data_free(ecdh_data);
+ ecdh_data = (ECDH_DATA *)data;
+ }
}
else
ecdh_data = (ECDH_DATA *)data;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD)
+ && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+ {
+ ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD);
+ return NULL;
+ }
+#endif
return ecdh_data;
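The ecdh_check() hunk closes a race: two threads could each find no
key-method data, each allocate an ECDH_DATA, and each try to install it.
EC_KEY_insert_key_method_data() now reports the previously installed
pointer, so the loser frees its own copy and adopts the winner's. A
hedged sketch of the idiom with illustrative names (make_data,
publish_once and free_data are hypothetical, not OpenSSL API):

    void *mine = make_data();               /* our candidate object */
    void *prior = publish_once(obj, mine);  /* returns earlier value, if any */
    if (prior != NULL)
        {
        free_data(mine);                    /* we lost the race */
        mine = prior;                       /* use the winner's object */
        }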
diff --git a/main/openssl/crypto/ecdh/ech_locl.h b/main/openssl/crypto/ecdh/ech_locl.h
index f658526a..f6cad6a8 100644
--- a/main/openssl/crypto/ecdh/ech_locl.h
+++ b/main/openssl/crypto/ecdh/ech_locl.h
@@ -75,6 +75,14 @@ struct ecdh_method
char *app_data;
};
+/* If this flag is set the ECDH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDH_FLAG_FIPS_METHOD 0x1
+
typedef struct ecdh_data_st {
/* EC_KEY_METH_DATA part */
int (*init)(EC_KEY *);
diff --git a/main/openssl/crypto/ecdh/ech_ossl.c b/main/openssl/crypto/ecdh/ech_ossl.c
index 2a40ff12..4a30628f 100644
--- a/main/openssl/crypto/ecdh/ech_ossl.c
+++ b/main/openssl/crypto/ecdh/ech_ossl.c
@@ -157,6 +157,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else
{
if (!EC_POINT_get_affine_coordinates_GF2m(group, tmp, x, y, ctx))
@@ -165,6 +166,7 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
goto err;
}
}
+#endif
buflen = (EC_GROUP_get_degree(group) + 7)/8;
len = BN_num_bytes(x);
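
/* Worked note, not part of the patch: the buflen computation above rounds
 * the field size in bits up to whole bytes with (bits + 7) / 8, e.g. a
 * 163-bit binary curve needs 21 bytes and a 256-bit prime curve 32. */
#include <stddef.h>
#include <assert.h>

static size_t bits_to_bytes(size_t bits)
	{
	return (bits + 7) / 8;
	}

int main(void)
	{
	assert(bits_to_bytes(163) == 21);
	assert(bits_to_bytes(256) == 32);
	return 0;
	}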
diff --git a/main/openssl/crypto/ecdsa/ecdsa.h b/main/openssl/crypto/ecdsa/ecdsa.h
index e61c5398..dc6a36b1 100644
--- a/main/openssl/crypto/ecdsa/ecdsa.h
+++ b/main/openssl/crypto/ecdsa/ecdsa.h
@@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void);
/* Error codes for the ECDSA functions. */
/* Function codes. */
+#define ECDSA_F_ECDSA_CHECK 104
#define ECDSA_F_ECDSA_DATA_NEW_METHOD 100
#define ECDSA_F_ECDSA_DO_SIGN 101
#define ECDSA_F_ECDSA_DO_VERIFY 102
@@ -249,6 +250,8 @@ void ERR_load_ECDSA_strings(void);
#define ECDSA_R_ERR_EC_LIB 102
#define ECDSA_R_MISSING_PARAMETERS 103
#define ECDSA_R_NEED_NEW_SETUP_VALUES 106
+#define ECDSA_R_NONCE_CANNOT_BE_PRECOMPUTED 108
+#define ECDSA_R_NON_FIPS_METHOD 107
#define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104
#define ECDSA_R_SIGNATURE_MALLOC_FAILED 105
diff --git a/main/openssl/crypto/ecdsa/ecdsatest.c b/main/openssl/crypto/ecdsa/ecdsatest.c
index 26a4a9ee..537bb303 100644
--- a/main/openssl/crypto/ecdsa/ecdsatest.c
+++ b/main/openssl/crypto/ecdsa/ecdsatest.c
@@ -262,6 +262,7 @@ int x9_62_tests(BIO *out)
"3238135532097973577080787768312505059318910517550078427819"
"78505179448783"))
goto x962_err;
+#ifndef OPENSSL_NO_EC2M
if (!x9_62_test_internal(out, NID_X9_62_c2tnb191v1,
"87194383164871543355722284926904419997237591535066528048",
"308992691965804947361541664549085895292153777025772063598"))
@@ -272,7 +273,7 @@ int x9_62_tests(BIO *out)
"1970303740007316867383349976549972270528498040721988191026"
"49413465737174"))
goto x962_err;
-
+#endif
ret = 1;
x962_err:
if (!restore_rand())
@@ -286,9 +287,13 @@ int test_builtin(BIO *out)
size_t crv_len = 0, n = 0;
EC_KEY *eckey = NULL, *wrong_eckey = NULL;
EC_GROUP *group;
+ ECDSA_SIG *ecdsa_sig = NULL;
unsigned char digest[20], wrong_digest[20];
- unsigned char *signature = NULL;
- unsigned int sig_len;
+ unsigned char *signature = NULL;
+ const unsigned char *sig_ptr;
+ unsigned char *sig_ptr2;
+ unsigned char *raw_buf = NULL;
+ unsigned int sig_len, degree, r_len, s_len, bn_len, buf_len;
int nid, ret = 0;
/* fill digest values with some random data */
@@ -338,7 +343,8 @@ int test_builtin(BIO *out)
if (EC_KEY_set_group(eckey, group) == 0)
goto builtin_err;
EC_GROUP_free(group);
- if (EC_GROUP_get_degree(EC_KEY_get0_group(eckey)) < 160)
+ degree = EC_GROUP_get_degree(EC_KEY_get0_group(eckey));
+ if (degree < 160)
/* drop the curve */
{
EC_KEY_free(eckey);
@@ -414,26 +420,89 @@ int test_builtin(BIO *out)
}
BIO_printf(out, ".");
(void)BIO_flush(out);
- /* modify a single byte of the signature */
- offset = signature[10] % sig_len;
- dirt = signature[11];
- signature[offset] ^= dirt ? dirt : 1;
+ /* wrong length */
+ if (ECDSA_verify(0, digest, 20, signature, sig_len - 1,
+ eckey) == 1)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+ BIO_printf(out, ".");
+ (void)BIO_flush(out);
+
+ /* Modify a single byte of the signature: to ensure we don't
+ * garble the ASN1 structure, we read the raw signature and
+ * modify a byte in one of the bignums directly. */
+ sig_ptr = signature;
+ if ((ecdsa_sig = d2i_ECDSA_SIG(NULL, &sig_ptr, sig_len)) == NULL)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+
+ /* Store the two BIGNUMs in raw_buf. */
+ r_len = BN_num_bytes(ecdsa_sig->r);
+ s_len = BN_num_bytes(ecdsa_sig->s);
+ bn_len = (degree + 7) / 8;
+ if ((r_len > bn_len) || (s_len > bn_len))
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
+ buf_len = 2 * bn_len;
+ if ((raw_buf = OPENSSL_malloc(buf_len)) == NULL)
+ goto builtin_err;
+ /* Pad the bignums with leading zeroes. */
+ memset(raw_buf, 0, buf_len);
+ BN_bn2bin(ecdsa_sig->r, raw_buf + bn_len - r_len);
+ BN_bn2bin(ecdsa_sig->s, raw_buf + buf_len - s_len);
+
+ /* Modify a single byte in the buffer. */
+ offset = raw_buf[10] % buf_len;
+ dirt = raw_buf[11] ? raw_buf[11] : 1;
+ raw_buf[offset] ^= dirt;
+ /* Now read the BIGNUMs back in from raw_buf. */
+ if ((BN_bin2bn(raw_buf, bn_len, ecdsa_sig->r) == NULL) ||
+ (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL))
+ goto builtin_err;
+
+ sig_ptr2 = signature;
+ sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2);
if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) == 1)
{
BIO_printf(out, " failed\n");
goto builtin_err;
}
+ /* Sanity check: undo the modification and verify signature. */
+ raw_buf[offset] ^= dirt;
+ if ((BN_bin2bn(raw_buf, bn_len, ecdsa_sig->r) == NULL) ||
+ (BN_bin2bn(raw_buf + bn_len, bn_len, ecdsa_sig->s) == NULL))
+ goto builtin_err;
+
+ sig_ptr2 = signature;
+ sig_len = i2d_ECDSA_SIG(ecdsa_sig, &sig_ptr2);
+ if (ECDSA_verify(0, digest, 20, signature, sig_len, eckey) != 1)
+ {
+ BIO_printf(out, " failed\n");
+ goto builtin_err;
+ }
BIO_printf(out, ".");
(void)BIO_flush(out);
BIO_printf(out, " ok\n");
/* cleanup */
+ /* clean bogus errors */
+ ERR_clear_error();
OPENSSL_free(signature);
signature = NULL;
EC_KEY_free(eckey);
eckey = NULL;
EC_KEY_free(wrong_eckey);
wrong_eckey = NULL;
+ ECDSA_SIG_free(ecdsa_sig);
+ ecdsa_sig = NULL;
+ OPENSSL_free(raw_buf);
+ raw_buf = NULL;
}
ret = 1;
@@ -442,8 +511,12 @@ builtin_err:
EC_KEY_free(eckey);
if (wrong_eckey)
EC_KEY_free(wrong_eckey);
+ if (ecdsa_sig)
+ ECDSA_SIG_free(ecdsa_sig);
if (signature)
OPENSSL_free(signature);
+ if (raw_buf)
+ OPENSSL_free(raw_buf);
if (curves)
OPENSSL_free(curves);
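
/* Condensed sketch, not part of the patch, of the tamper round-trip the
 * test above performs: decode the DER signature, left-pad r and s into a
 * fixed-width buffer, flip one value byte, re-encode. This corrupts the
 * signature value itself rather than the ASN.1 framing. tamper_sig() is a
 * hypothetical helper; `degree` is the curve's field size in bits and
 * `der` must be writable and large enough for the re-encoding. */
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/bn.h>
#include <openssl/ecdsa.h>

int tamper_sig(unsigned char *der, int *der_len, int degree)
	{
	const unsigned char *p = der;
	unsigned char *q, *buf = NULL;
	int bn_len = (degree + 7) / 8, buf_len, ok = 0;
	ECDSA_SIG *sig = d2i_ECDSA_SIG(NULL, &p, *der_len);

	if (sig == NULL)
		return 0;
	buf_len = 2 * bn_len;
	if (BN_num_bytes(sig->r) > bn_len || BN_num_bytes(sig->s) > bn_len)
		goto err;
	if ((buf = OPENSSL_malloc(buf_len)) == NULL)
		goto err;
	memset(buf, 0, buf_len);
	/* left-pad with zeroes so each bignum spans exactly bn_len bytes */
	BN_bn2bin(sig->r, buf + bn_len - BN_num_bytes(sig->r));
	BN_bn2bin(sig->s, buf + buf_len - BN_num_bytes(sig->s));
	buf[buf[0] % buf_len] ^= buf[1] ? buf[1] : 1; /* flip one byte */
	if (BN_bin2bn(buf, bn_len, sig->r) == NULL ||
	    BN_bin2bn(buf + bn_len, bn_len, sig->s) == NULL)
		goto err;
	q = der;
	*der_len = i2d_ECDSA_SIG(sig, &q); /* re-encode over the input */
	ok = 1;
err:
	if (buf)
		OPENSSL_free(buf);
	ECDSA_SIG_free(sig);
	return ok;
	}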
diff --git a/main/openssl/crypto/ecdsa/ecs_err.c b/main/openssl/crypto/ecdsa/ecs_err.c
index 98e38d53..7406c6d8 100644
--- a/main/openssl/crypto/ecdsa/ecs_err.c
+++ b/main/openssl/crypto/ecdsa/ecs_err.c
@@ -1,6 +1,6 @@
/* crypto/ecdsa/ecs_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
static ERR_STRING_DATA ECDSA_str_functs[]=
{
+{ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"},
{ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"},
{ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"},
@@ -84,6 +85,8 @@ static ERR_STRING_DATA ECDSA_str_reasons[]=
{ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"},
{ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"},
{ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"},
+{ERR_REASON(ECDSA_R_NONCE_CANNOT_BE_PRECOMPUTED),"nonce cannot be precomputed"},
+{ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"},
{ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"},
{ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"},
{0,NULL}
diff --git a/main/openssl/crypto/ecdsa/ecs_lib.c b/main/openssl/crypto/ecdsa/ecs_lib.c
index 2ebae3aa..814a6bf4 100644
--- a/main/openssl/crypto/ecdsa/ecs_lib.c
+++ b/main/openssl/crypto/ecdsa/ecs_lib.c
@@ -60,6 +60,9 @@
#endif
#include <openssl/err.h>
#include <openssl/bn.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT;
@@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth)
const ECDSA_METHOD *ECDSA_get_default_method(void)
{
if(!default_ECDSA_method)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_ecdsa_openssl();
+ else
+ return ECDSA_OpenSSL();
+#else
default_ECDSA_method = ECDSA_OpenSSL();
+#endif
+ }
return default_ECDSA_method;
}
@@ -188,12 +200,26 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key)
ecdsa_data = (ECDSA_DATA *)ecdsa_data_new();
if (ecdsa_data == NULL)
return NULL;
- EC_KEY_insert_key_method_data(key, (void *)ecdsa_data,
- ecdsa_data_dup, ecdsa_data_free, ecdsa_data_free);
+ data = EC_KEY_insert_key_method_data(key, (void *)ecdsa_data,
+ ecdsa_data_dup, ecdsa_data_free, ecdsa_data_free);
+ if (data != NULL)
+ {
+ /* Another thread raced us to install the key_method
+ * data and won. */
+ ecdsa_data_free(ecdsa_data);
+ ecdsa_data = (ECDSA_DATA *)data;
+ }
}
else
ecdsa_data = (ECDSA_DATA *)data;
-
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD)
+ && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+ {
+ ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD);
+ return NULL;
+ }
+#endif
return ecdsa_data;
}
diff --git a/main/openssl/crypto/ecdsa/ecs_locl.h b/main/openssl/crypto/ecdsa/ecs_locl.h
index 3a69a840..46f7ad91 100644
--- a/main/openssl/crypto/ecdsa/ecs_locl.h
+++ b/main/openssl/crypto/ecdsa/ecs_locl.h
@@ -70,8 +70,9 @@ struct ecdsa_method
const char *name;
ECDSA_SIG *(*ecdsa_do_sign)(const unsigned char *dgst, int dgst_len,
const BIGNUM *inv, const BIGNUM *rp, EC_KEY *eckey);
- int (*ecdsa_sign_setup)(EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv,
- BIGNUM **r);
+ int (*ecdsa_sign_setup)(EC_KEY *eckey, BN_CTX *ctx,
+ BIGNUM **kinv, BIGNUM **r,
+ const unsigned char *dgst, int dlen);
int (*ecdsa_do_verify)(const unsigned char *dgst, int dgst_len,
const ECDSA_SIG *sig, EC_KEY *eckey);
#if 0
@@ -82,6 +83,14 @@ struct ecdsa_method
char *app_data;
};
+/* If this flag is set the ECDSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDSA_FLAG_FIPS_METHOD 0x1
+
typedef struct ecdsa_data_st {
/* EC_KEY_METH_DATA part */
int (*init)(EC_KEY *);
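
/* Sketch, not part of the patch: the shape of a setup callback under the
 * widened signature above. struct ecdsa_method is private to the library
 * (this header), so only built-in or FIPS methods implement it; dgst and
 * dlen may be NULL/0 when a caller precomputes via ECDSA_sign_setup(). */
#include <openssl/ec.h>
#include <openssl/bn.h>

static int example_sign_setup(EC_KEY *eckey, BN_CTX *ctx,
			      BIGNUM **kinvp, BIGNUM **rp,
			      const unsigned char *dgst, int dlen)
	{
	/* Derive the nonce k here, optionally mixing in dgst/dlen, then
	 * compute r = x([k]G) mod n and kinv = k^-1 mod n and hand both
	 * back through kinvp/rp; returning 0 signals failure. */
	(void)eckey; (void)ctx; (void)kinvp; (void)rp; (void)dgst; (void)dlen;
	return 0; /* placeholder: a real method fills kinvp and rp */
	}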
diff --git a/main/openssl/crypto/ecdsa/ecs_ossl.c b/main/openssl/crypto/ecdsa/ecs_ossl.c
index 1bbf328d..325aca8c 100644
--- a/main/openssl/crypto/ecdsa/ecs_ossl.c
+++ b/main/openssl/crypto/ecdsa/ecs_ossl.c
@@ -60,11 +60,13 @@
#include <openssl/err.h>
#include <openssl/obj_mac.h>
#include <openssl/bn.h>
+#include <openssl/rand.h>
static ECDSA_SIG *ecdsa_do_sign(const unsigned char *dgst, int dlen,
const BIGNUM *, const BIGNUM *, EC_KEY *eckey);
-static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
- BIGNUM **rp);
+static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen);
static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
const ECDSA_SIG *sig, EC_KEY *eckey);
@@ -86,8 +88,9 @@ const ECDSA_METHOD *ECDSA_OpenSSL(void)
return &openssl_ecdsa_meth;
}
-static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
- BIGNUM **rp)
+static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
+ BIGNUM **kinvp, BIGNUM **rp,
+ const unsigned char *dgst, int dlen)
{
BN_CTX *ctx = NULL;
BIGNUM *k = NULL, *r = NULL, *order = NULL, *X = NULL;
@@ -136,11 +139,28 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
{
/* get random k */
do
- if (!BN_rand_range(k, order))
+#ifndef OPENSSL_NO_SHA512
+ if (EC_KEY_get_nonce_from_hash(eckey))
{
- ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP,
- ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED);
- goto err;
+ if (!BN_generate_dsa_nonce(
+ k, order,
+ EC_KEY_get0_private_key(eckey),
+ dgst, dlen, ctx))
+ {
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP,
+ ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED);
+ goto err;
+ }
+ }
+ else
+#endif
+ {
+ if (!BN_rand_range(k, order))
+ {
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP,
+ ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED);
+ goto err;
+ }
}
while (BN_is_zero(k));
@@ -167,6 +187,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* NID_X9_62_characteristic_two_field */
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -176,6 +197,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
goto err;
}
}
+#endif
if (!BN_nnmod(r, X, order, ctx))
{
ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
@@ -280,7 +302,7 @@ static ECDSA_SIG *ecdsa_do_sign(const unsigned char *dgst, int dgst_len,
{
if (in_kinv == NULL || in_r == NULL)
{
- if (!ECDSA_sign_setup(eckey, ctx, &kinv, &ret->r))
+ if (!ecdsa->meth->ecdsa_sign_setup(eckey, ctx, &kinv, &ret->r, dgst, dgst_len))
{
ECDSAerr(ECDSA_F_ECDSA_DO_SIGN,ERR_R_ECDSA_LIB);
goto err;
@@ -454,6 +476,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
goto err;
}
}
+#ifndef OPENSSL_NO_EC2M
else /* NID_X9_62_characteristic_two_field */
{
if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -463,7 +486,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
goto err;
}
}
-
+#endif
if (!BN_nnmod(u1, X, order, ctx))
{
ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);
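
/* Usage sketch, not part of the patch. ecdsa_sign_setup() above derives k
 * via BN_generate_dsa_nonce() from the private key and the message digest
 * when the key opts in, so a weak RNG alone no longer leaks the key.
 * Assumes the EC_KEY_set_nonce_from_hash() setter added to ec.h elsewhere
 * in this patch series; `sig` must hold ECDSA_size(key) bytes. */
#include <openssl/ec.h>
#include <openssl/ecdsa.h>
#include <openssl/obj_mac.h>

int sign_with_hashed_nonce(const unsigned char dgst[20],
			   unsigned char *sig, unsigned int *siglen)
	{
	int ok = 0;
	EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);

	if (key == NULL || !EC_KEY_generate_key(key))
		goto err;
	EC_KEY_set_nonce_from_hash(key, 1); /* k = f(priv, dgst, rand) */
	ok = ECDSA_sign(0, dgst, 20, sig, siglen, key);
err:
	EC_KEY_free(key);
	return ok;
	}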
diff --git a/main/openssl/crypto/ecdsa/ecs_sign.c b/main/openssl/crypto/ecdsa/ecs_sign.c
index 353d5af5..ea79a24b 100644
--- a/main/openssl/crypto/ecdsa/ecs_sign.c
+++ b/main/openssl/crypto/ecdsa/ecs_sign.c
@@ -58,6 +58,7 @@
#include <openssl/engine.h>
#endif
#include <openssl/rand.h>
+#include <openssl/err.h>
ECDSA_SIG *ECDSA_do_sign(const unsigned char *dgst, int dlen, EC_KEY *eckey)
{
@@ -102,5 +103,12 @@ int ECDSA_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
ECDSA_DATA *ecdsa = ecdsa_check(eckey);
if (ecdsa == NULL)
return 0;
- return ecdsa->meth->ecdsa_sign_setup(eckey, ctx_in, kinvp, rp);
+ if (EC_KEY_get_nonce_from_hash(eckey))
+ {
+ /* You cannot precompute the ECDSA nonce if it is required to
+ * depend on the message. */
+ ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ECDSA_R_NONCE_CANNOT_BE_PRECOMPUTED);
+ return 0;
+ }
+ return ecdsa->meth->ecdsa_sign_setup(eckey, ctx_in, kinvp, rp, NULL, 0);
}

diff --git a/main/openssl/crypto/engine/eng_all.c b/main/openssl/crypto/engine/eng_all.c
index 22c12045..6093376d 100644
--- a/main/openssl/crypto/engine/eng_all.c
+++ b/main/openssl/crypto/engine/eng_all.c
@@ -61,6 +61,8 @@
void ENGINE_load_builtin_engines(void)
{
+ /* Some ENGINEs need this */
+ OPENSSL_cpuid_setup();
#if 0
/* There's no longer any need for an "openssl" ENGINE unless, one day,
* it is the *only* way for standard builtin implementations to be
@@ -71,6 +73,12 @@ void ENGINE_load_builtin_engines(void)
#if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV))
ENGINE_load_cryptodev();
#endif
+#ifndef OPENSSL_NO_RSAX
+ ENGINE_load_rsax();
+#endif
+#ifndef OPENSSL_NO_RDRAND
+ ENGINE_load_rdrand();
+#endif
ENGINE_load_dynamic();
#ifndef OPENSSL_NO_STATIC_ENGINE
#ifndef OPENSSL_NO_HW
@@ -112,6 +120,7 @@ void ENGINE_load_builtin_engines(void)
ENGINE_load_capi();
#endif
#endif
+ ENGINE_register_all_complete();
}
#if defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)
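
/* Usage sketch, not part of the patch: with the hunks above, one call now
 * probes the CPU (OPENSSL_cpuid_setup), loads the builtin engines
 * including rsax/rdrand where available, and registers them as defaults
 * via ENGINE_register_all_complete(). */
#include <openssl/engine.h>

void init_engines(void)
	{
	ENGINE *e;

	ENGINE_load_builtin_engines(); /* also registers defaults now */
	e = ENGINE_by_id("rdrand");    /* present only on capable CPUs */
	if (e != NULL)
		{
		/* structural reference only; release it when done */
		ENGINE_free(e);
		}
	}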
diff --git a/main/openssl/crypto/engine/eng_cryptodev.c b/main/openssl/crypto/engine/eng_cryptodev.c
index 52f4ca39..5a715aca 100644
--- a/main/openssl/crypto/engine/eng_cryptodev.c
+++ b/main/openssl/crypto/engine/eng_cryptodev.c
@@ -79,8 +79,6 @@ struct dev_crypto_state {
unsigned char digest_res[HASH_MAX_LEN];
char *mac_data;
int mac_len;
-
- int copy;
#endif
};
@@ -200,6 +198,7 @@ get_dev_crypto(void)
if ((fd = open_dev_crypto()) == -1)
return (-1);
+#ifndef CRIOGET_NOT_NEEDED
if (ioctl(fd, CRIOGET, &retfd) == -1)
return (-1);
@@ -208,9 +207,19 @@ get_dev_crypto(void)
close(retfd);
return (-1);
}
+#else
+ retfd = fd;
+#endif
return (retfd);
}
+static void put_dev_crypto(int fd)
+{
+#ifndef CRIOGET_NOT_NEEDED
+ close(fd);
+#endif
+}
+
/* Caching version for asym operations */
static int
get_asym_dev_crypto(void)
@@ -252,7 +261,7 @@ get_cryptodev_ciphers(const int **cnids)
ioctl(fd, CIOCFSESSION, &sess.ses) != -1)
nids[count++] = ciphers[i].nid;
}
- close(fd);
+ put_dev_crypto(fd);
if (count > 0)
*cnids = nids;
@@ -291,7 +300,7 @@ get_cryptodev_digests(const int **cnids)
ioctl(fd, CIOCFSESSION, &sess.ses) != -1)
nids[count++] = digests[i].nid;
}
- close(fd);
+ put_dev_crypto(fd);
if (count > 0)
*cnids = nids;
@@ -436,7 +445,7 @@ cryptodev_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
sess->cipher = cipher;
if (ioctl(state->d_fd, CIOCGSESSION, sess) == -1) {
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (0);
}
@@ -473,7 +482,7 @@ cryptodev_cleanup(EVP_CIPHER_CTX *ctx)
} else {
ret = 1;
}
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (ret);
@@ -686,7 +695,7 @@ static int cryptodev_digest_init(EVP_MD_CTX *ctx)
sess->mac = digest;
if (ioctl(state->d_fd, CIOCGSESSION, sess) < 0) {
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
printf("cryptodev_digest_init: Open session failed\n");
return (0);
@@ -758,14 +767,12 @@ static int cryptodev_digest_final(EVP_MD_CTX *ctx, unsigned char *md)
if (! (ctx->flags & EVP_MD_CTX_FLAG_ONESHOT) ) {
/* if application doesn't support one buffer */
memset(&cryp, 0, sizeof(cryp));
-
cryp.ses = sess->ses;
cryp.flags = 0;
cryp.len = state->mac_len;
cryp.src = state->mac_data;
cryp.dst = NULL;
cryp.mac = (caddr_t)md;
-
if (ioctl(state->d_fd, CIOCCRYPT, &cryp) < 0) {
printf("cryptodev_digest_final: digest failed\n");
return (0);
@@ -786,6 +793,9 @@ static int cryptodev_digest_cleanup(EVP_MD_CTX *ctx)
struct dev_crypto_state *state = ctx->md_data;
struct session_op *sess = &state->d_sess;
+ if (state == NULL)
+ return 0;
+
if (state->d_fd < 0) {
printf("cryptodev_digest_cleanup: illegal input\n");
return (0);
@@ -797,16 +807,13 @@ static int cryptodev_digest_cleanup(EVP_MD_CTX *ctx)
state->mac_len = 0;
}
- if (state->copy)
- return 1;
-
if (ioctl(state->d_fd, CIOCFSESSION, &sess->ses) < 0) {
printf("cryptodev_digest_cleanup: failed to close session\n");
ret = 0;
} else {
ret = 1;
}
- close(state->d_fd);
+ put_dev_crypto(state->d_fd);
state->d_fd = -1;
return (ret);
@@ -816,15 +823,39 @@ static int cryptodev_digest_copy(EVP_MD_CTX *to,const EVP_MD_CTX *from)
{
struct dev_crypto_state *fstate = from->md_data;
struct dev_crypto_state *dstate = to->md_data;
+ struct session_op *sess;
+ int digest;
- memcpy(dstate, fstate, sizeof(struct dev_crypto_state));
+ if (dstate == NULL || fstate == NULL)
+ return 1;
- if (fstate->mac_len != 0) {
- dstate->mac_data = OPENSSL_malloc(fstate->mac_len);
- memcpy(dstate->mac_data, fstate->mac_data, fstate->mac_len);
+ memcpy(dstate, fstate, sizeof(struct dev_crypto_state));
+
+ sess = &dstate->d_sess;
+
+ digest = digest_nid_to_cryptodev(to->digest->type);
+
+ sess->mackey = dstate->dummy_mac_key;
+ sess->mackeylen = digest_key_length(to->digest->type);
+ sess->mac = digest;
+
+ dstate->d_fd = get_dev_crypto();
+
+ if (ioctl(dstate->d_fd, CIOCGSESSION, sess) < 0) {
+ put_dev_crypto(dstate->d_fd);
+ dstate->d_fd = -1;
+ printf("cryptodev_digest_init: Open session failed\n");
+ return (0);
}
- dstate->copy = 1;
+ if (fstate->mac_len != 0) {
+ if (fstate->mac_data != NULL)
+ {
+ dstate->mac_data = OPENSSL_malloc(fstate->mac_len);
+ memcpy(dstate->mac_data, fstate->mac_data, fstate->mac_len);
+ dstate->mac_len = fstate->mac_len;
+ }
+ }
return 1;
}
@@ -1347,11 +1378,11 @@ ENGINE_load_cryptodev(void)
* find out what asymmetric crypto algorithms we support
*/
if (ioctl(fd, CIOCASYMFEAT, &cryptodev_asymfeat) == -1) {
- close(fd);
+ put_dev_crypto(fd);
ENGINE_free(engine);
return;
}
- close(fd);
+ put_dev_crypto(fd);
if (!ENGINE_set_id(engine, "cryptodev") ||
!ENGINE_set_name(engine, "BSD cryptodev engine") ||
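
/* Sketch, not part of the patch: the put_dev_crypto() pairing above keeps
 * the "clone a per-session fd via CRIOGET" vs. "share one device fd"
 * policy in exactly two functions, so no call site hard-codes a close().
 * CRIOGET comes from the platform's cryptodev header; this mirrors the
 * patched logic rather than adding anything new. */
#include <unistd.h>
#include <sys/ioctl.h>
#include <crypto/cryptodev.h> /* platform-dependent location */

static int acquire_crypto_fd(int devfd)
	{
#ifndef CRIOGET_NOT_NEEDED
	int fd;

	if (ioctl(devfd, CRIOGET, &fd) == -1) /* clone a private fd */
		return -1;
	return fd;
#else
	return devfd; /* the shared device fd is used directly */
#endif
	}

static void release_crypto_fd(int fd)
	{
#ifndef CRIOGET_NOT_NEEDED
	close(fd); /* only clones may be closed */
#else
	(void)fd;  /* the shared fd stays open */
#endif
	}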
diff --git a/main/openssl/crypto/engine/eng_dyn.c b/main/openssl/crypto/engine/eng_dyn.c
index 807da7a5..8fb8634e 100644
--- a/main/openssl/crypto/engine/eng_dyn.c
+++ b/main/openssl/crypto/engine/eng_dyn.c
@@ -408,7 +408,7 @@ static int int_load(dynamic_data_ctx *ctx)
int num, loop;
/* Unless told not to, try a direct load */
if((ctx->dir_load != 2) && (DSO_load(ctx->dynamic_dso,
- ctx->DYNAMIC_LIBNAME, NULL, 0)) != NULL)
+ ctx->DYNAMIC_LIBNAME, NULL, 0) != NULL))
return 1;
/* If we're not allowed to use 'dirs' or we have none, fail */
if(!ctx->dir_load || (num = sk_OPENSSL_STRING_num(ctx->dirs)) < 1)
@@ -423,6 +423,9 @@ static int int_load(dynamic_data_ctx *ctx)
{
/* Found what we're looking for */
OPENSSL_free(merge);
+ /* Previous failed loop iterations, if any, will have resulted in
+ * errors. Clear them out before returning success. */
+ ERR_clear_error();
return 1;
}
OPENSSL_free(merge);
diff --git a/main/openssl/crypto/engine/eng_fat.c b/main/openssl/crypto/engine/eng_fat.c
index db66e623..789b8d57 100644
--- a/main/openssl/crypto/engine/eng_fat.c
+++ b/main/openssl/crypto/engine/eng_fat.c
@@ -176,6 +176,7 @@ int ENGINE_register_all_complete(void)
ENGINE *e;
for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
- ENGINE_register_complete(e);
+ if (!(e->flags & ENGINE_FLAGS_NO_REGISTER_ALL))
+ ENGINE_register_complete(e);
return 1;
}
diff --git a/main/openssl/crypto/engine/engine.h b/main/openssl/crypto/engine/engine.h
index 943aeae2..f8be4977 100644
--- a/main/openssl/crypto/engine/engine.h
+++ b/main/openssl/crypto/engine/engine.h
@@ -141,6 +141,13 @@ extern "C" {
* the existing ENGINE's structural reference count. */
#define ENGINE_FLAGS_BY_ID_COPY (int)0x0004
+/* This flag is for an ENGINE that does not want its methods registered as
+ * part of ENGINE_register_all_complete() for example if the methods are
+ * not usable as default methods.
+ */
+
+#define ENGINE_FLAGS_NO_REGISTER_ALL (int)0x0008
+
/* ENGINEs can support their own command types, and these flags are used in
* ENGINE_CTRL_GET_CMD_FLAGS to indicate to the caller what kind of input each
* command expects. Currently only numeric and string input is supported. If a
@@ -344,6 +351,8 @@ void ENGINE_load_gost(void);
#endif
#endif
void ENGINE_load_cryptodev(void);
+void ENGINE_load_rsax(void);
+void ENGINE_load_rdrand(void);
void ENGINE_load_builtin_engines(void);
/* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation
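
/* Usage sketch, not part of the patch: an ENGINE whose methods should
 * never become process-wide defaults opts out of the register-all pass
 * (see the eng_fat.c hunk above) with the new flag. */
#include <openssl/engine.h>

ENGINE *make_private_engine(void)
	{
	ENGINE *e = ENGINE_new();

	if (e == NULL)
		return NULL;
	if (!ENGINE_set_id(e, "myeng") ||
	    !ENGINE_set_name(e, "example engine") ||
	    !ENGINE_set_flags(e, ENGINE_FLAGS_NO_REGISTER_ALL))
		{
		ENGINE_free(e);
		return NULL;
		}
	return e; /* reachable via explicit ENGINE_by_id()/ENGINE_init() only */
	}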
diff --git a/main/openssl/crypto/engine/tb_asnmth.c b/main/openssl/crypto/engine/tb_asnmth.c
new file mode 100644
index 00000000..75090339
--- /dev/null
+++ b/main/openssl/crypto/engine/tb_asnmth.c
@@ -0,0 +1,246 @@
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "eng_int.h"
+#include "asn1_locl.h"
+#include <openssl/evp.h>
+
+/* If this symbol is defined then ENGINE_get_pkey_asn1_meth_engine(), the
+ * function that is used by EVP to hook in pkey_asn1_meth code and cache
+ * defaults (etc), will display brief debugging summaries to stderr with the
+ * 'nid'. */
+/* #define ENGINE_PKEY_ASN1_METH_DEBUG */
+
+static ENGINE_TABLE *pkey_asn1_meth_table = NULL;
+
+void ENGINE_unregister_pkey_asn1_meths(ENGINE *e)
+ {
+ engine_table_unregister(&pkey_asn1_meth_table, e);
+ }
+
+static void engine_unregister_all_pkey_asn1_meths(void)
+ {
+ engine_table_cleanup(&pkey_asn1_meth_table);
+ }
+
+int ENGINE_register_pkey_asn1_meths(ENGINE *e)
+ {
+ if(e->pkey_asn1_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_asn1_meth_table,
+ engine_unregister_all_pkey_asn1_meths, e, nids,
+ num_nids, 0);
+ }
+ return 1;
+ }
+
+void ENGINE_register_all_pkey_asn1_meths(void)
+ {
+ ENGINE *e;
+
+ for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
+ ENGINE_register_pkey_asn1_meths(e);
+ }
+
+int ENGINE_set_default_pkey_asn1_meths(ENGINE *e)
+ {
+ if(e->pkey_asn1_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_asn1_meth_table,
+ engine_unregister_all_pkey_asn1_meths, e, nids,
+ num_nids, 1);
+ }
+ return 1;
+ }
+
+/* Exposed API function to get a functional reference from the implementation
+ * table (i.e. try to get a functional reference from the tabled structural
+ * references) for a given pkey_asn1_meth 'nid' */
+ENGINE *ENGINE_get_pkey_asn1_meth_engine(int nid)
+ {
+ return engine_table_select(&pkey_asn1_meth_table, nid);
+ }
+
+/* Obtains a pkey_asn1_meth implementation from an ENGINE functional reference */
+const EVP_PKEY_ASN1_METHOD *ENGINE_get_pkey_asn1_meth(ENGINE *e, int nid)
+ {
+ EVP_PKEY_ASN1_METHOD *ret;
+ ENGINE_PKEY_ASN1_METHS_PTR fn = ENGINE_get_pkey_asn1_meths(e);
+ if(!fn || !fn(e, &ret, NULL, nid))
+ {
+ ENGINEerr(ENGINE_F_ENGINE_GET_PKEY_ASN1_METH,
+ ENGINE_R_UNIMPLEMENTED_PUBLIC_KEY_METHOD);
+ return NULL;
+ }
+ return ret;
+ }
+
+/* Gets the pkey_asn1_meth callback from an ENGINE structure */
+ENGINE_PKEY_ASN1_METHS_PTR ENGINE_get_pkey_asn1_meths(const ENGINE *e)
+ {
+ return e->pkey_asn1_meths;
+ }
+
+/* Sets the pkey_asn1_meth callback in an ENGINE structure */
+int ENGINE_set_pkey_asn1_meths(ENGINE *e, ENGINE_PKEY_ASN1_METHS_PTR f)
+ {
+ e->pkey_asn1_meths = f;
+ return 1;
+ }
+
+/* Internal function to free up EVP_PKEY_ASN1_METHOD structures before an
+ * ENGINE is destroyed
+ */
+
+void engine_pkey_asn1_meths_free(ENGINE *e)
+ {
+ int i;
+ EVP_PKEY_ASN1_METHOD *pkm;
+ if (e->pkey_asn1_meths)
+ {
+ const int *pknids;
+ int npknids;
+ npknids = e->pkey_asn1_meths(e, NULL, &pknids, 0);
+ for (i = 0; i < npknids; i++)
+ {
+ if (e->pkey_asn1_meths(e, &pkm, NULL, pknids[i]))
+ {
+ EVP_PKEY_asn1_free(pkm);
+ }
+ }
+ }
+ }
+
+/* Find a method based on a string. This does a linear search through
+ * all implemented algorithms. This is OK in practice because only
+ * a small number of algorithms are likely to be implemented in an engine
+ * and it is not used for speed-critical operations.
+ */
+
+const EVP_PKEY_ASN1_METHOD *ENGINE_get_pkey_asn1_meth_str(ENGINE *e,
+ const char *str, int len)
+ {
+ int i, nidcount;
+ const int *nids;
+ EVP_PKEY_ASN1_METHOD *ameth;
+ if (!e->pkey_asn1_meths)
+ return NULL;
+ if (len == -1)
+ len = strlen(str);
+ nidcount = e->pkey_asn1_meths(e, NULL, &nids, 0);
+ for (i = 0; i < nidcount; i++)
+ {
+ e->pkey_asn1_meths(e, &ameth, NULL, nids[i]);
+ if (((int)strlen(ameth->pem_str) == len) &&
+ !strncasecmp(ameth->pem_str, str, len))
+ return ameth;
+ }
+ return NULL;
+ }
+
+typedef struct
+ {
+ ENGINE *e;
+ const EVP_PKEY_ASN1_METHOD *ameth;
+ const char *str;
+ int len;
+ } ENGINE_FIND_STR;
+
+static void look_str_cb(int nid, STACK_OF(ENGINE) *sk, ENGINE *def, void *arg)
+ {
+ ENGINE_FIND_STR *lk = arg;
+ int i;
+ if (lk->ameth)
+ return;
+ for (i = 0; i < sk_ENGINE_num(sk); i++)
+ {
+ ENGINE *e = sk_ENGINE_value(sk, i);
+ EVP_PKEY_ASN1_METHOD *ameth;
+ e->pkey_asn1_meths(e, &ameth, NULL, nid);
+ if (((int)strlen(ameth->pem_str) == lk->len) &&
+ !strncasecmp(ameth->pem_str, lk->str, lk->len))
+ {
+ lk->e = e;
+ lk->ameth = ameth;
+ return;
+ }
+ }
+ }
+
+const EVP_PKEY_ASN1_METHOD *ENGINE_pkey_asn1_find_str(ENGINE **pe,
+ const char *str, int len)
+ {
+ ENGINE_FIND_STR fstr;
+ fstr.e = NULL;
+ fstr.ameth = NULL;
+ fstr.str = str;
+ fstr.len = len;
+ CRYPTO_w_lock(CRYPTO_LOCK_ENGINE);
+ engine_table_doall(pkey_asn1_meth_table, look_str_cb, &fstr);
+ /* If found obtain a structural reference to engine */
+ if (fstr.e)
+ {
+ fstr.e->struct_ref++;
+ engine_ref_debug(fstr.e, 0, 1)
+ }
+ *pe = fstr.e;
+ CRYPTO_w_unlock(CRYPTO_LOCK_ENGINE);
+ return fstr.ameth;
+ }
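
/* Usage sketch, not part of the patch: resolve an engine-supplied ASN.1
 * method by its PEM string via the lookup implemented above. On success
 * *pe holds a structural reference to the owning ENGINE; release it with
 * ENGINE_free() when done. find_engine_ameth() is a hypothetical helper. */
#include <string.h>
#include <openssl/engine.h>
#include <openssl/evp.h>

const EVP_PKEY_ASN1_METHOD *find_engine_ameth(const char *pem_str,
					      ENGINE **pe)
	{
	return ENGINE_pkey_asn1_find_str(pe, pem_str,
					 (int)strlen(pem_str));
	}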
diff --git a/main/openssl/crypto/engine/tb_pkmeth.c b/main/openssl/crypto/engine/tb_pkmeth.c
new file mode 100644
index 00000000..1cdb967f
--- /dev/null
+++ b/main/openssl/crypto/engine/tb_pkmeth.c
@@ -0,0 +1,167 @@
+/* ====================================================================
+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include "eng_int.h"
+#include <openssl/evp.h>
+
+/* If this symbol is defined then ENGINE_get_pkey_meth_engine(), the function
+ * that is used by EVP to hook in pkey_meth code and cache defaults (etc), will
+ * display brief debugging summaries to stderr with the 'nid'. */
+/* #define ENGINE_PKEY_METH_DEBUG */
+
+static ENGINE_TABLE *pkey_meth_table = NULL;
+
+void ENGINE_unregister_pkey_meths(ENGINE *e)
+ {
+ engine_table_unregister(&pkey_meth_table, e);
+ }
+
+static void engine_unregister_all_pkey_meths(void)
+ {
+ engine_table_cleanup(&pkey_meth_table);
+ }
+
+int ENGINE_register_pkey_meths(ENGINE *e)
+ {
+ if(e->pkey_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_meth_table,
+ engine_unregister_all_pkey_meths, e, nids,
+ num_nids, 0);
+ }
+ return 1;
+ }
+
+void ENGINE_register_all_pkey_meths(void)
+ {
+ ENGINE *e;
+
+ for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
+ ENGINE_register_pkey_meths(e);
+ }
+
+int ENGINE_set_default_pkey_meths(ENGINE *e)
+ {
+ if(e->pkey_meths)
+ {
+ const int *nids;
+ int num_nids = e->pkey_meths(e, NULL, &nids, 0);
+ if(num_nids > 0)
+ return engine_table_register(&pkey_meth_table,
+ engine_unregister_all_pkey_meths, e, nids,
+ num_nids, 1);
+ }
+ return 1;
+ }
+
+/* Exposed API function to get a functional reference from the implementation
+ * table (i.e. try to get a functional reference from the tabled structural
+ * references) for a given pkey_meth 'nid' */
+ENGINE *ENGINE_get_pkey_meth_engine(int nid)
+ {
+ return engine_table_select(&pkey_meth_table, nid);
+ }
+
+/* Obtains a pkey_meth implementation from an ENGINE functional reference */
+const EVP_PKEY_METHOD *ENGINE_get_pkey_meth(ENGINE *e, int nid)
+ {
+ EVP_PKEY_METHOD *ret;
+ ENGINE_PKEY_METHS_PTR fn = ENGINE_get_pkey_meths(e);
+ if(!fn || !fn(e, &ret, NULL, nid))
+ {
+ ENGINEerr(ENGINE_F_ENGINE_GET_PKEY_METH,
+ ENGINE_R_UNIMPLEMENTED_PUBLIC_KEY_METHOD);
+ return NULL;
+ }
+ return ret;
+ }
+
+/* Gets the pkey_meth callback from an ENGINE structure */
+ENGINE_PKEY_METHS_PTR ENGINE_get_pkey_meths(const ENGINE *e)
+ {
+ return e->pkey_meths;
+ }
+
+/* Sets the pkey_meth callback in an ENGINE structure */
+int ENGINE_set_pkey_meths(ENGINE *e, ENGINE_PKEY_METHS_PTR f)
+ {
+ e->pkey_meths = f;
+ return 1;
+ }
+
+/* Internal function to free up EVP_PKEY_METHOD structures before an
+ * ENGINE is destroyed
+ */
+
+void engine_pkey_meths_free(ENGINE *e)
+ {
+ int i;
+ EVP_PKEY_METHOD *pkm;
+ if (e->pkey_meths)
+ {
+ const int *pknids;
+ int npknids;
+ npknids = e->pkey_meths(e, NULL, &pknids, 0);
+ for (i = 0; i < npknids; i++)
+ {
+ if (e->pkey_meths(e, &pkm, NULL, pknids[i]))
+ {
+ EVP_PKEY_meth_free(pkm);
+ }
+ }
+ }
+ }
diff --git a/main/openssl/crypto/err/err.c b/main/openssl/crypto/err/err.c
index 69713a6e..fcdb2440 100644
--- a/main/openssl/crypto/err/err.c
+++ b/main/openssl/crypto/err/err.c
@@ -1066,6 +1066,13 @@ void ERR_set_error_data(char *data, int flags)
void ERR_add_error_data(int num, ...)
{
va_list args;
+ va_start(args, num);
+ ERR_add_error_vdata(num, args);
+ va_end(args);
+ }
+
+void ERR_add_error_vdata(int num, va_list args)
+ {
int i,n,s;
char *str,*p,*a;
@@ -1074,7 +1081,6 @@ void ERR_add_error_data(int num, ...)
if (str == NULL) return;
str[0]='\0';
- va_start(args, num);
n=0;
for (i=0; i<num; i++)
{
@@ -1090,7 +1096,7 @@ void ERR_add_error_data(int num, ...)
if (p == NULL)
{
OPENSSL_free(str);
- goto err;
+ return;
}
else
str=p;
@@ -1099,9 +1105,6 @@ void ERR_add_error_data(int num, ...)
}
}
ERR_set_error_data(str,ERR_TXT_MALLOCED|ERR_TXT_STRING);
-
-err:
- va_end(args);
}
int ERR_set_mark(void)
diff --git a/main/openssl/crypto/err/err.h b/main/openssl/crypto/err/err.h
index b9f8c16d..974cc9cc 100644
--- a/main/openssl/crypto/err/err.h
+++ b/main/openssl/crypto/err/err.h
@@ -344,8 +344,9 @@ void ERR_print_errors_fp(FILE *fp);
#endif
#ifndef OPENSSL_NO_BIO
void ERR_print_errors(BIO *bp);
-void ERR_add_error_data(int num, ...);
#endif
+void ERR_add_error_data(int num, ...);
+void ERR_add_error_vdata(int num, va_list args);
void ERR_load_strings(int lib,ERR_STRING_DATA str[]);
void ERR_unload_strings(int lib,ERR_STRING_DATA str[]);
void ERR_load_ERR_strings(void);
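
/* Sketch, not part of the patch: ERR_add_error_data() is now a thin
 * va_start/va_end shim over the new ERR_add_error_vdata(), so any custom
 * variadic wrapper can forward its argument list instead of duplicating
 * the formatting logic. my_add_error_data() is a hypothetical wrapper. */
#include <stdarg.h>
#include <openssl/err.h>

void my_add_error_data(int num, ...)
	{
	va_list args;

	va_start(args, num);
	ERR_add_error_vdata(num, args);
	va_end(args);
	}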
diff --git a/main/openssl/crypto/err/err_all.c b/main/openssl/crypto/err/err_all.c
index fc049e8e..8eb547d9 100644
--- a/main/openssl/crypto/err/err_all.c
+++ b/main/openssl/crypto/err/err_all.c
@@ -64,7 +64,9 @@
#endif
#include <openssl/buffer.h>
#include <openssl/bio.h>
+#ifndef OPENSSL_NO_COMP
#include <openssl/comp.h>
+#endif
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
@@ -95,6 +97,9 @@
#include <openssl/ui.h>
#include <openssl/ocsp.h>
#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
#include <openssl/ts.h>
#ifndef OPENSSL_NO_CMS
#include <openssl/cms.h>
@@ -102,7 +107,6 @@
#ifndef OPENSSL_NO_JPAKE
#include <openssl/jpake.h>
#endif
-#include <openssl/comp.h>
void ERR_load_crypto_strings(void)
{
@@ -126,7 +130,9 @@ void ERR_load_crypto_strings(void)
ERR_load_ASN1_strings();
ERR_load_CONF_strings();
ERR_load_CRYPTO_strings();
+#ifndef OPENSSL_NO_COMP
ERR_load_COMP_strings();
+#endif
#ifndef OPENSSL_NO_EC
ERR_load_EC_strings();
#endif
@@ -149,12 +155,14 @@ void ERR_load_crypto_strings(void)
#endif
ERR_load_OCSP_strings();
ERR_load_UI_strings();
+#ifdef OPENSSL_FIPS
+ ERR_load_FIPS_strings();
+#endif
#ifndef OPENSSL_NO_CMS
ERR_load_CMS_strings();
#endif
#ifndef OPENSSL_NO_JPAKE
ERR_load_JPAKE_strings();
#endif
- ERR_load_COMP_strings();
#endif
}
diff --git a/main/openssl/crypto/evp/bio_md.c b/main/openssl/crypto/evp/bio_md.c
index 9841e32e..144fdfd5 100644
--- a/main/openssl/crypto/evp/bio_md.c
+++ b/main/openssl/crypto/evp/bio_md.c
@@ -153,8 +153,12 @@ static int md_write(BIO *b, const char *in, int inl)
{
if (ret > 0)
{
- EVP_DigestUpdate(ctx,(const unsigned char *)in,
- (unsigned int)ret);
+ if (!EVP_DigestUpdate(ctx,(const unsigned char *)in,
+ (unsigned int)ret))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
}
}
if(b->next_bio != NULL)
@@ -220,7 +224,8 @@ static long md_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_CTRL_DUP:
dbio=ptr;
dctx=dbio->ptr;
- EVP_MD_CTX_copy_ex(dctx,ctx);
+ if (!EVP_MD_CTX_copy_ex(dctx,ctx))
+ return 0;
b->init=1;
break;
default:
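
/* Usage sketch, not part of the patch: a digest filter BIO. With the fix
 * above, a failing EVP_DigestUpdate() (possible in FIPS mode) surfaces as
 * a short write instead of being silently ignored. digest_buf() is a
 * hypothetical helper returning the digest length or -1; `md` must hold
 * EVP_MAX_MD_SIZE bytes. */
#include <openssl/bio.h>
#include <openssl/evp.h>

int digest_buf(const unsigned char *in, int inl, unsigned char *md)
	{
	int mdlen = -1;
	BIO *bmd = BIO_new(BIO_f_md());
	BIO *sink = BIO_new(BIO_s_null());

	if (bmd != NULL && sink != NULL)
		{
		BIO_set_md(bmd, EVP_sha1());
		BIO_push(bmd, sink); /* chain: bmd -> sink */
		if (BIO_write(bmd, in, inl) == inl)
			mdlen = BIO_gets(bmd, (char *)md, EVP_MAX_MD_SIZE);
		BIO_free_all(bmd); /* frees sink too */
		}
	else
		{
		BIO_free(bmd);
		BIO_free(sink);
		}
	return mdlen;
	}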
diff --git a/main/openssl/crypto/evp/bio_ok.c b/main/openssl/crypto/evp/bio_ok.c
index 98bc1ab4..e6433535 100644
--- a/main/openssl/crypto/evp/bio_ok.c
+++ b/main/openssl/crypto/evp/bio_ok.c
@@ -133,10 +133,10 @@ static int ok_new(BIO *h);
static int ok_free(BIO *data);
static long ok_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp);
-static void sig_out(BIO* b);
-static void sig_in(BIO* b);
-static void block_out(BIO* b);
-static void block_in(BIO* b);
+static int sig_out(BIO* b);
+static int sig_in(BIO* b);
+static int block_out(BIO* b);
+static int block_in(BIO* b);
#define OK_BLOCK_SIZE (1024*4)
#define OK_BLOCK_BLOCK 4
#define IOBS (OK_BLOCK_SIZE+ OK_BLOCK_BLOCK+ 3*EVP_MAX_MD_SIZE)
@@ -266,10 +266,24 @@ static int ok_read(BIO *b, char *out, int outl)
ctx->buf_len+= i;
/* no signature yet -- check if we got one */
- if (ctx->sigio == 1) sig_in(b);
+ if (ctx->sigio == 1)
+ {
+ if (!sig_in(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ }
/* signature ok -- check if we got block */
- if (ctx->sigio == 0) block_in(b);
+ if (ctx->sigio == 0)
+ {
+ if (!block_in(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
+ }
/* invalid block -- cancel */
if (ctx->cont <= 0) break;
@@ -293,7 +307,8 @@ static int ok_write(BIO *b, const char *in, int inl)
if ((ctx == NULL) || (b->next_bio == NULL) || (b->init == 0)) return(0);
- if(ctx->sigio) sig_out(b);
+ if(ctx->sigio && !sig_out(b))
+ return 0;
do{
BIO_clear_retry_flags(b);
@@ -332,7 +347,11 @@ static int ok_write(BIO *b, const char *in, int inl)
if(ctx->buf_len >= OK_BLOCK_SIZE+ OK_BLOCK_BLOCK)
{
- block_out(b);
+ if (!block_out(b))
+ {
+ BIO_clear_retry_flags(b);
+ return 0;
+ }
}
}while(inl > 0);
@@ -379,7 +398,8 @@ static long ok_ctrl(BIO *b, int cmd, long num, void *ptr)
case BIO_CTRL_FLUSH:
/* do a final write */
if(ctx->blockout == 0)
- block_out(b);
+ if (!block_out(b))
+ return 0;
while (ctx->blockout)
{
@@ -408,7 +428,8 @@ static long ok_ctrl(BIO *b, int cmd, long num, void *ptr)
break;
case BIO_C_SET_MD:
md=ptr;
- EVP_DigestInit_ex(&ctx->md, md, NULL);
+ if (!EVP_DigestInit_ex(&ctx->md, md, NULL))
+ return 0;
b->init=1;
break;
case BIO_C_GET_MD:
@@ -455,7 +476,7 @@ static void longswap(void *_ptr, size_t len)
}
}
-static void sig_out(BIO* b)
+static int sig_out(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -463,9 +484,10 @@ static void sig_out(BIO* b)
ctx=b->ptr;
md=&ctx->md;
- if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return;
+ if(ctx->buf_len+ 2* md->digest->md_size > OK_BLOCK_SIZE) return 1;
- EVP_DigestInit_ex(md, md->digest, NULL);
+ if (!EVP_DigestInit_ex(md, md->digest, NULL))
+ goto berr;
/* FIXME: there's absolutely no guarantee this makes any sense at all,
* particularly now EVP_MD_CTX has been restructured.
*/
@@ -474,14 +496,20 @@ static void sig_out(BIO* b)
longswap(&(ctx->buf[ctx->buf_len]), md->digest->md_size);
ctx->buf_len+= md->digest->md_size;
- EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
- EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+ if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+ goto berr;
ctx->buf_len+= md->digest->md_size;
ctx->blockout= 1;
ctx->sigio= 0;
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void sig_in(BIO* b)
+static int sig_in(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -491,15 +519,18 @@ static void sig_in(BIO* b)
ctx=b->ptr;
md=&ctx->md;
- if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return;
+ if((int)(ctx->buf_len-ctx->buf_off) < 2*md->digest->md_size) return 1;
- EVP_DigestInit_ex(md, md->digest, NULL);
+ if (!EVP_DigestInit_ex(md, md->digest, NULL))
+ goto berr;
memcpy(md->md_data, &(ctx->buf[ctx->buf_off]), md->digest->md_size);
longswap(md->md_data, md->digest->md_size);
ctx->buf_off+= md->digest->md_size;
- EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN));
- EVP_DigestFinal_ex(md, tmp, NULL);
+ if (!EVP_DigestUpdate(md, WELLKNOWN, strlen(WELLKNOWN)))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, tmp, NULL))
+ goto berr;
ret= memcmp(&(ctx->buf[ctx->buf_off]), tmp, md->digest->md_size) == 0;
ctx->buf_off+= md->digest->md_size;
if(ret == 1)
@@ -516,9 +547,13 @@ static void sig_in(BIO* b)
{
ctx->cont= 0;
}
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void block_out(BIO* b)
+static int block_out(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -532,13 +567,20 @@ static void block_out(BIO* b)
ctx->buf[1]=(unsigned char)(tl>>16);
ctx->buf[2]=(unsigned char)(tl>>8);
ctx->buf[3]=(unsigned char)(tl);
- EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
- EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL);
+ if (!EVP_DigestUpdate(md,
+ (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, &(ctx->buf[ctx->buf_len]), NULL))
+ goto berr;
ctx->buf_len+= md->digest->md_size;
ctx->blockout= 1;
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
-static void block_in(BIO* b)
+static int block_in(BIO* b)
{
BIO_OK_CTX *ctx;
EVP_MD_CTX *md;
@@ -554,10 +596,13 @@ static void block_in(BIO* b)
tl|=ctx->buf[2]; tl<<=8;
tl|=ctx->buf[3];
- if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return;
+ if (ctx->buf_len < tl+ OK_BLOCK_BLOCK+ md->digest->md_size) return 1;
- EVP_DigestUpdate(md, (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl);
- EVP_DigestFinal_ex(md, tmp, NULL);
+ if (!EVP_DigestUpdate(md,
+ (unsigned char*) &(ctx->buf[OK_BLOCK_BLOCK]), tl))
+ goto berr;
+ if (!EVP_DigestFinal_ex(md, tmp, NULL))
+ goto berr;
if(memcmp(&(ctx->buf[tl+ OK_BLOCK_BLOCK]), tmp, md->digest->md_size) == 0)
{
/* there might be parts from next block lurking around ! */
@@ -571,5 +616,9 @@ static void block_in(BIO* b)
{
ctx->cont= 0;
}
+ return 1;
+ berr:
+ BIO_clear_retry_flags(b);
+ return 0;
}
diff --git a/main/openssl/crypto/evp/c_allc.c b/main/openssl/crypto/evp/c_allc.c
index c5f92683..2a45d435 100644
--- a/main/openssl/crypto/evp/c_allc.c
+++ b/main/openssl/crypto/evp/c_allc.c
@@ -98,6 +98,9 @@ void OpenSSL_add_all_ciphers(void)
#ifndef OPENSSL_NO_RC4
EVP_add_cipher(EVP_rc4());
EVP_add_cipher(EVP_rc4_40());
+#ifndef OPENSSL_NO_MD5
+ EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
#endif
#ifndef OPENSSL_NO_IDEA
@@ -166,9 +169,9 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_128_cfb1());
EVP_add_cipher(EVP_aes_128_cfb8());
EVP_add_cipher(EVP_aes_128_ofb());
-#if 0
EVP_add_cipher(EVP_aes_128_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_128_gcm());
+ EVP_add_cipher(EVP_aes_128_xts());
EVP_add_cipher_alias(SN_aes_128_cbc,"AES128");
EVP_add_cipher_alias(SN_aes_128_cbc,"aes128");
EVP_add_cipher(EVP_aes_192_ecb());
@@ -177,9 +180,8 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_192_cfb1());
EVP_add_cipher(EVP_aes_192_cfb8());
EVP_add_cipher(EVP_aes_192_ofb());
-#if 0
EVP_add_cipher(EVP_aes_192_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_192_gcm());
EVP_add_cipher_alias(SN_aes_192_cbc,"AES192");
EVP_add_cipher_alias(SN_aes_192_cbc,"aes192");
EVP_add_cipher(EVP_aes_256_ecb());
@@ -188,11 +190,15 @@ void OpenSSL_add_all_ciphers(void)
EVP_add_cipher(EVP_aes_256_cfb1());
EVP_add_cipher(EVP_aes_256_cfb8());
EVP_add_cipher(EVP_aes_256_ofb());
-#if 0
EVP_add_cipher(EVP_aes_256_ctr());
-#endif
+ EVP_add_cipher(EVP_aes_256_gcm());
+ EVP_add_cipher(EVP_aes_256_xts());
EVP_add_cipher_alias(SN_aes_256_cbc,"AES256");
EVP_add_cipher_alias(SN_aes_256_cbc,"aes256");
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+ EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+ EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
#endif
#ifndef OPENSSL_NO_CAMELLIA
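
/* Usage sketch, not part of the patch: with the registrations above, CTR
 * is enabled and GCM/XTS are newly available, so after
 * OpenSSL_add_all_ciphers() AES-GCM resolves both directly and by name. */
#include <openssl/evp.h>

int gcm_available(void)
	{
	OpenSSL_add_all_ciphers();
	return EVP_get_cipherbyname("id-aes128-GCM") != NULL
		&& EVP_aes_128_gcm() != NULL;
	}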
diff --git a/main/openssl/crypto/evp/digest.c b/main/openssl/crypto/evp/digest.c
index 982ba2b1..d14e8e48 100644
--- a/main/openssl/crypto/evp/digest.c
+++ b/main/openssl/crypto/evp/digest.c
@@ -117,6 +117,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
void EVP_MD_CTX_init(EVP_MD_CTX *ctx)
{
memset(ctx,'\0',sizeof *ctx);
@@ -225,12 +229,26 @@ skip_to_init:
}
if (ctx->flags & EVP_MD_CTX_FLAG_NO_INIT)
return 1;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+ if (FIPS_digestinit(ctx, type))
+ return 1;
+ OPENSSL_free(ctx->md_data);
+ ctx->md_data = NULL;
+ return 0;
+ }
+#endif
return ctx->digest->init(ctx);
}
int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t count)
{
+#ifdef OPENSSL_FIPS
+ return FIPS_digestupdate(ctx, data, count);
+#else
return ctx->update(ctx,data,count);
+#endif
}
/* The caller can assume that this removes any secret data from the context */
@@ -245,6 +263,9 @@ int EVP_DigestFinal(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
/* The caller can assume that this removes any secret data from the context */
int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
{
+#ifdef OPENSSL_FIPS
+ return FIPS_digestfinal(ctx, md, size);
+#else
int ret;
OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
@@ -258,6 +279,7 @@ int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
}
memset(ctx->md_data,0,ctx->digest->ctx_size);
return ret;
+#endif
}
int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in)
@@ -344,13 +366,17 @@ int EVP_Digest(const void *data, size_t count,
void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx)
{
- EVP_MD_CTX_cleanup(ctx);
- OPENSSL_free(ctx);
+ if (ctx)
+ {
+ EVP_MD_CTX_cleanup(ctx);
+ OPENSSL_free(ctx);
+ }
}
/* This call frees resources associated with the context */
int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
{
+#ifndef OPENSSL_FIPS
/* Don't assume ctx->md_data was cleaned in EVP_Digest_Final,
* because sometimes only copies of the context are ever finalised.
*/
@@ -363,6 +389,7 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
OPENSSL_cleanse(ctx->md_data,ctx->digest->ctx_size);
OPENSSL_free(ctx->md_data);
}
+#endif
if (ctx->pctx)
EVP_PKEY_CTX_free(ctx->pctx);
#ifndef OPENSSL_NO_ENGINE
@@ -371,6 +398,9 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
* functional reference we held for this reason. */
ENGINE_finish(ctx->engine);
#endif
+#ifdef OPENSSL_FIPS
+ FIPS_md_ctx_cleanup(ctx);
+#endif
memset(ctx,'\0',sizeof *ctx);
return 1;
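
/* Usage sketch, not part of the patch: with the changes above,
 * EVP_MD_CTX_destroy(NULL) is safe and every step's return value matters,
 * since in FIPS mode init/update/final are routed through FIPS_digest*()
 * and can fail for a non-approved digest. */
#include <openssl/evp.h>

int sha256_digest(const void *data, size_t len,
		  unsigned char *md, unsigned int *mdlen)
	{
	EVP_MD_CTX *ctx = EVP_MD_CTX_create();
	int ok = ctx != NULL
		&& EVP_DigestInit_ex(ctx, EVP_sha256(), NULL)
		&& EVP_DigestUpdate(ctx, data, len)
		&& EVP_DigestFinal_ex(ctx, md, mdlen);

	EVP_MD_CTX_destroy(ctx); /* NULL-safe after this patch */
	return ok;
	}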
diff --git a/main/openssl/crypto/evp/e_aes.c b/main/openssl/crypto/evp/e_aes.c
index bd6c0a3a..c7869b69 100644
--- a/main/openssl/crypto/evp/e_aes.c
+++ b/main/openssl/crypto/evp/e_aes.c
@@ -1,5 +1,5 @@
/* ====================================================================
- * Copyright (c) 2001 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -56,57 +56,511 @@
#include <assert.h>
#include <openssl/aes.h>
#include "evp_locl.h"
-
-static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
- const unsigned char *iv, int enc);
+#ifndef OPENSSL_FIPS
+#include "modes_lcl.h"
+#include <openssl/rand.h>
typedef struct
{
AES_KEY ks;
+ block128_f block;
+ union {
+ cbc128_f cbc;
+ ctr128_f ctr;
+ } stream;
} EVP_AES_KEY;
-#define data(ctx) EVP_C_DATA(EVP_AES_KEY,ctx)
-
-IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY,
- NID_aes_128, 16, 16, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_192, ks, AES, EVP_AES_KEY,
- NID_aes_192, 16, 24, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_256, ks, AES, EVP_AES_KEY,
- NID_aes_256, 16, 32, 16, 128,
- 0, aes_init_key, NULL,
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
-
-#define IMPLEMENT_AES_CFBR(ksize,cbits) IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
-
-IMPLEMENT_AES_CFBR(128,1)
-IMPLEMENT_AES_CFBR(192,1)
-IMPLEMENT_AES_CFBR(256,1)
-
-IMPLEMENT_AES_CFBR(128,8)
-IMPLEMENT_AES_CFBR(192,8)
-IMPLEMENT_AES_CFBR(256,8)
+typedef struct
+ {
+ AES_KEY ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ GCM128_CONTEXT gcm;
+ unsigned char *iv; /* Temporary IV store */
+ int ivlen; /* IV length */
+ int taglen;
+ int iv_gen; /* It is OK to generate IVs */
+ int tls_aad_len; /* TLS AAD length */
+ ctr128_f ctr;
+ } EVP_AES_GCM_CTX;
+
+typedef struct
+ {
+ AES_KEY ks1, ks2; /* AES key schedules to use */
+ XTS128_CONTEXT xts;
+ void (*stream)(const unsigned char *in,
+ unsigned char *out, size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+ } EVP_AES_XTS_CTX;
+
+typedef struct
+ {
+ AES_KEY ks; /* AES key schedule to use */
+ int key_set; /* Set if key initialised */
+ int iv_set; /* Set if an iv is set */
+ int tag_set; /* Set if tag is valid */
+ int len_set; /* Set if message length set */
+ int L, M; /* L and M parameters from RFC3610 */
+ CCM128_CONTEXT ccm;
+ ccm128_f str;
+ } EVP_AES_CCM_CTX;
+
+#define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4))
+
+#ifdef VPAES_ASM
+int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void vpaes_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void vpaes_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+#endif
+#ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const AES_KEY *key,
+ unsigned char ivec[16], int enc);
+void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const AES_KEY *key,
+ const unsigned char ivec[16]);
+void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const AES_KEY *key1,
+ const AES_KEY *key2, const unsigned char iv[16]);
+#endif
+#ifdef AES_CTR_ASM
+void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+ size_t blocks, const AES_KEY *key,
+ const unsigned char ivec[AES_BLOCK_SIZE]);
+#endif
+#ifdef AES_XTS_ASM
+void AES_xts_encrypt(const char *inp,char *out,size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+void AES_xts_decrypt(const char *inp,char *out,size_t len,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+#endif
+
+#if defined(AES_ASM) && !defined(I386_ONLY) && ( \
+ ((defined(__i386) || defined(__i386__) || \
+ defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+#ifdef VPAES_ASM
+#define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+#endif
+#ifdef BSAES_ASM
+#define BSAES_CAPABLE VPAES_CAPABLE
+#endif
+/*
+ * AES-NI section
+ */
+#define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_encrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+void aesni_decrypt(const unsigned char *in, unsigned char *out,
+ const AES_KEY *key);
+
+void aesni_ecb_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ int enc);
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char *ivec);
+
+void aesni_xts_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_xts_decrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key1, const AES_KEY *key2,
+ const unsigned char iv[16]);
+
+void aesni_ccm64_encrypt_blocks (const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+void aesni_ccm64_decrypt_blocks (const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const void *key,
+ const unsigned char ivec[16],
+ unsigned char cmac[16]);
+
+static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc)
+ {
+ ret = aesni_set_decrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+ dat->block = (block128_f)aesni_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)aesni_cbc_encrypt :
+ NULL;
+ }
+ else {
+ ret = aesni_set_encrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+ dat->block = (block128_f)aesni_encrypt;
+ if (mode==EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f)aesni_cbc_encrypt;
+ else if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+ else
+ dat->stream.cbc = NULL;
+ }
+
+ if(ret < 0)
+ {
+ EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
+ return 0;
+ }
+
+ return 1;
+ }
+
+static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ aesni_cbc_encrypt(in,out,len,ctx->cipher_data,ctx->iv,ctx->encrypt);
+
+ return 1;
+}
+
+static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+
+ if (len<bl) return 1;
+
+ aesni_ecb_encrypt(in,out,len,ctx->cipher_data,ctx->encrypt);
+
+ return 1;
+}
+
+#define aesni_ofb_cipher aes_ofb_cipher
+static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb_cipher aes_cfb_cipher
+static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb8_cipher aes_cfb8_cipher
+static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_cfb1_cipher aes_cfb1_cipher
+static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len);
+
+#define aesni_ctr_cipher aes_ctr_cipher
+static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+ (block128_f)aesni_encrypt);
+ gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+ /* If we have an IV we can set it directly, otherwise use the
+ * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv)
+ {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ }
+ else
+ {
+ /* If the key is set, use the IV now; otherwise save a copy for later */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+ }
+
+#define aesni_gcm_cipher aes_gcm_cipher
+static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key)
+ {
+ /* key_len is two AES keys */
+ if (enc)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)aesni_encrypt;
+ xctx->stream = aesni_xts_encrypt;
+ }
+ else
+ {
+ aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)aesni_decrypt;
+ xctx->stream = aesni_xts_decrypt;
+ }
+
+ aesni_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)aesni_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ }
+
+ if (iv)
+ {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+ }
+
+#define aesni_xts_cipher aes_xts_cipher
+static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ {
+ aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)aesni_encrypt);
+ cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
+ (ccm128_f)aesni_ccm64_decrypt_blocks;
+ cctx->key_set = 1;
+ }
+ if (iv)
+ {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+ }
+
+#define aesni_ccm_cipher aes_ccm_cipher
+static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len);
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_init_key, \
+ aesni_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize, \
+ keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
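+
+/* For illustration: an instantiation such as
+ * BLOCK_CIPHER_generic(NID_aes,128,16,16,cbc,cbc,CBC,0) expands to roughly
+ *
+ *	static const EVP_CIPHER aesni_128_cbc = {
+ *		NID_aes_128_cbc, 16, 128/8, 16, 0|EVP_CIPH_CBC_MODE,
+ *		aesni_init_key, aesni_cbc_cipher, NULL,
+ *		sizeof(EVP_AES_KEY), NULL, NULL, NULL, NULL };
+ *	static const EVP_CIPHER aes_128_cbc = { ...same, with aes_* hooks... };
+ *	const EVP_CIPHER *EVP_aes_128_cbc(void)
+ *	{ return AESNI_CAPABLE ? &aesni_128_cbc : &aes_128_cbc; }
+ *
+ * so the AES-NI/software choice is made once per accessor call, not per block.
+ */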
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aesni_##mode##_init_key, \
+ aesni_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#else
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_init_key, \
+ aes_##mode##_cipher, \
+ NULL, \
+ sizeof(EVP_AES_KEY), \
+ NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+ nid##_##keylen##_##mode,blocksize, \
+ (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+ flags|EVP_CIPH_##MODE##_MODE, \
+ aes_##mode##_init_key, \
+ aes_##mode##_cipher, \
+ aes_##mode##_cleanup, \
+ sizeof(EVP_AES_##MODE##_CTX), \
+ NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+#endif
+
+#define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags) \
+ BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags)
static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
{
- int ret;
+ int ret, mode;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
- if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
- || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
- || enc)
- ret=AES_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+ mode = ctx->cipher->flags & EVP_CIPH_MODE;
+ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+ && !enc)
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+ {
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)vpaes_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+ NULL;
+ }
+ else
+#endif
+ {
+ ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_decrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+ NULL;
+ }
else
- ret=AES_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
+ {
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)vpaes_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)vpaes_cbc_encrypt :
+ NULL;
+ }
+ else
+#endif
+ {
+ ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+ dat->block = (block128_f)AES_encrypt;
+ dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ?
+ (cbc128_f)AES_cbc_encrypt :
+ NULL;
+#ifdef AES_CTR_ASM
+ if (mode==EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f)AES_ctr32_encrypt;
+#endif
+ }
if(ret < 0)
{
@@ -117,4 +571,750 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
return 1;
}
+static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (dat->stream.cbc)
+ (*dat->stream.cbc)(in,out,len,&dat->ks,ctx->iv,ctx->encrypt);
+ else if (ctx->encrypt)
+ CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+ else
+ CRYPTO_cbc128_decrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+
+ return 1;
+}
+
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ size_t bl = ctx->cipher->block_size;
+ size_t i;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (len<bl) return 1;
+
+ for (i=0,len-=bl;i<=len;i+=bl)
+ (*dat->block)(in+i,out+i,&dat->ks);
+
+ return 1;
+}
+
+static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_ofb128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,dat->block);
+ return 1;
+}
+
+static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_cfb128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+}
+
+static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ CRYPTO_cfb128_8_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+}
+
+static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+ const unsigned char *in,size_t len)
+{
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (ctx->flags&EVP_CIPH_FLAG_LENGTH_BITS) {
+ CRYPTO_cfb128_1_encrypt(in,out,len,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ return 1;
+ }
+
+ while (len>=MAXBITCHUNK) {
+ CRYPTO_cfb128_1_encrypt(in,out,MAXBITCHUNK*8,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+ len-=MAXBITCHUNK;
+ }
+ if (len)
+ CRYPTO_cfb128_1_encrypt(in,out,len*8,&dat->ks,
+ ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+
+ return 1;
+}
+
+static int aes_ctr_cipher (EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ unsigned int num = ctx->num;
+ EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+ if (dat->stream.ctr)
+ CRYPTO_ctr128_encrypt_ctr32(in,out,len,&dat->ks,
+ ctx->iv,ctx->buf,&num,dat->stream.ctr);
+ else
+ CRYPTO_ctr128_encrypt(in,out,len,&dat->ks,
+ ctx->iv,ctx->buf,&num,dat->block);
+ ctx->num = (size_t)num;
+ return 1;
+}
+
+BLOCK_CIPHER_generic_pack(NID_aes,128,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,192,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,256,EVP_CIPH_FLAG_FIPS)
+
+static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
+ {
+ EVP_AES_GCM_CTX *gctx = c->cipher_data;
+ OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
+ if (gctx->iv != c->iv)
+ OPENSSL_free(gctx->iv);
+ return 1;
+ }
+
+/* increment counter (64-bit int) by 1 */
+static void ctr64_inc(unsigned char *counter) {
+ int n=8;
+ unsigned char c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_GCM_CTX *gctx = c->cipher_data;
+ switch (type)
+ {
+ case EVP_CTRL_INIT:
+ gctx->key_set = 0;
+ gctx->iv_set = 0;
+ gctx->ivlen = c->cipher->iv_len;
+ gctx->iv = c->iv;
+ gctx->taglen = -1;
+ gctx->iv_gen = 0;
+ gctx->tls_aad_len = -1;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IVLEN:
+ if (arg <= 0)
+ return 0;
+#ifdef OPENSSL_FIPS
+ if (FIPS_module_mode() && !(c->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)
+ && arg < 12)
+ return 0;
+#endif
+ /* Allocate memory for IV if needed */
+ if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen))
+ {
+ if (gctx->iv != c->iv)
+ OPENSSL_free(gctx->iv);
+ gctx->iv = OPENSSL_malloc(arg);
+ if (!gctx->iv)
+ return 0;
+ }
+ gctx->ivlen = arg;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_TAG:
+ if (arg <= 0 || arg > 16 || c->encrypt)
+ return 0;
+ memcpy(c->buf, ptr, arg);
+ gctx->taglen = arg;
+ return 1;
+
+ case EVP_CTRL_GCM_GET_TAG:
+ if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0)
+ return 0;
+ memcpy(ptr, c->buf, arg);
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IV_FIXED:
+ /* Special case: -1 length restores whole IV */
+ if (arg == -1)
+ {
+ memcpy(gctx->iv, ptr, gctx->ivlen);
+ gctx->iv_gen = 1;
+ return 1;
+ }
+ /* Fixed field must be at least 4 bytes and invocation field
+ * at least 8.
+ */
+ if ((arg < 4) || (gctx->ivlen - arg) < 8)
+ return 0;
+ if (arg)
+ memcpy(gctx->iv, ptr, arg);
+ if (c->encrypt &&
+ RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0)
+ return 0;
+ gctx->iv_gen = 1;
+ return 1;
+
+ case EVP_CTRL_GCM_IV_GEN:
+ if (gctx->iv_gen == 0 || gctx->key_set == 0)
+ return 0;
+ CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+ if (arg <= 0 || arg > gctx->ivlen)
+ arg = gctx->ivlen;
+ memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
+ /* Invocation field will be at least 8 bytes in size and
+ * so no need to check wrap around or increment more than
+ * last 8 bytes.
+ */
+ ctr64_inc(gctx->iv + gctx->ivlen - 8);
+ gctx->iv_set = 1;
+ return 1;
+
+ case EVP_CTRL_GCM_SET_IV_INV:
+ if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt)
+ return 0;
+ memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
+ CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ return 1;
+
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ /* Save the AAD for later use */
+ if (arg != 13)
+ return 0;
+ memcpy(c->buf, ptr, arg);
+ gctx->tls_aad_len = arg;
+ {
+ unsigned int len=c->buf[arg-2]<<8|c->buf[arg-1];
+ /* Correct length for explicit IV */
+ len -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ /* If decrypting correct for tag too */
+ if (!c->encrypt)
+ len -= EVP_GCM_TLS_TAG_LEN;
+ c->buf[arg-2] = len>>8;
+ c->buf[arg-1] = len & 0xff;
+ }
+ /* Extra padding: tag appended to record */
+ return EVP_GCM_TLS_TAG_LEN;
+
+ default:
+ return -1;
+
+ }
+ }
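+
+/* For illustration, a minimal encrypt-side caller of the ctrls above
+ * (hedged sketch: key/iv/aad/in/out supplied by the caller, all return
+ * values folded into one flag, 1.0.1-era EVP API assumed):
+ *
+ *	EVP_CIPHER_CTX c;
+ *	int outl, tmpl, ok = 1;
+ *	EVP_CIPHER_CTX_init(&c);
+ *	ok &= EVP_EncryptInit_ex(&c, EVP_aes_128_gcm(), NULL, NULL, NULL);
+ *	ok &= EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_GCM_SET_IVLEN, 12, NULL);
+ *	ok &= EVP_EncryptInit_ex(&c, NULL, NULL, key, iv);
+ *	ok &= EVP_EncryptUpdate(&c, NULL, &outl, aad, aadlen);	(AAD: out == NULL)
+ *	ok &= EVP_EncryptUpdate(&c, out, &outl, in, inlen);
+ *	ok &= EVP_EncryptFinal_ex(&c, out + outl, &tmpl);
+ *	ok &= EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_GCM_GET_TAG, 16, tag);
+ *	EVP_CIPHER_CTX_cleanup(&c);
+ */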
+
+static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key)
+ { do {
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ {
+ AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)AES_encrypt);
+ gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+ break;
+ }
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+ (block128_f)vpaes_encrypt);
+ gctx->ctr = NULL;
+ break;
+ }
+ else
+#endif
+ (void)0; /* terminate potentially open 'else' */
+
+ AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+ CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
+#ifdef AES_CTR_ASM
+ gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
+#else
+ gctx->ctr = NULL;
+#endif
+ } while (0);
+
+ /* If we have an IV we can set it directly, otherwise use the
+ * saved IV.
+ */
+ if (iv == NULL && gctx->iv_set)
+ iv = gctx->iv;
+ if (iv)
+ {
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ }
+ gctx->key_set = 1;
+ }
+ else
+ {
+ /* If the key is set, use the IV now; otherwise save a copy for later */
+ if (gctx->key_set)
+ CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+ else
+ memcpy(gctx->iv, iv, gctx->ivlen);
+ gctx->iv_set = 1;
+ gctx->iv_gen = 0;
+ }
+ return 1;
+ }
+
+/* Handle TLS GCM packet format. This consists of the last portion of the IV
+ * followed by the payload and finally the tag. On encrypt generate IV,
+ * encrypt payload and write the tag. On verify retrieve IV, decrypt payload
+ * and verify tag.
+ */
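+/* With the evp.h constants added below, a record is laid out as
+ *	[ explicit IV : EVP_GCM_TLS_EXPLICIT_IV_LEN = 8 bytes ]
+ *	[ ciphertext  : len - 8 - 16 bytes                    ]
+ *	[ tag         : EVP_GCM_TLS_TAG_LEN = 16 bytes        ]
+ */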
+
+static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ int rv = -1;
+ /* Encrypt/decrypt must be performed in place */
+ if (out != in || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN+EVP_GCM_TLS_TAG_LEN))
+ return -1;
+ /* Set IV from start of buffer or generate IV and write to start
+ * of buffer.
+ */
+ if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ?
+ EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV,
+ EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0)
+ goto err;
+ /* Use saved AAD */
+ if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len))
+ goto err;
+ /* Fix buffer and length to point to payload */
+ in += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ out += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+ len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+ if (ctx->encrypt)
+ {
+ /* Encrypt payload */
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ goto err;
+ }
+ else {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ goto err;
+ }
+ out += len;
+ /* Finally write tag */
+ CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN);
+ rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+ }
+ else
+ {
+ /* Decrypt */
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ goto err;
+ }
+ else {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ goto err;
+ }
+ /* Retrieve tag */
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf,
+ EVP_GCM_TLS_TAG_LEN);
+ /* If tag mismatch wipe buffer */
+ if (memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN))
+ {
+ OPENSSL_cleanse(out, len);
+ goto err;
+ }
+ rv = len;
+ }
+
+ err:
+ gctx->iv_set = 0;
+ gctx->tls_aad_len = -1;
+ return rv;
+ }
+
+static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+ /* If not set up, return error */
+ if (!gctx->key_set)
+ return -1;
+
+ if (gctx->tls_aad_len >= 0)
+ return aes_gcm_tls_cipher(ctx, out, in, len);
+
+ if (!gctx->iv_set)
+ return -1;
+ if (in)
+ {
+ if (out == NULL)
+ {
+ if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
+ return -1;
+ }
+ else if (ctx->encrypt)
+ {
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ return -1;
+ }
+ else {
+ if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+ return -1;
+ }
+ }
+ else
+ {
+ if (gctx->ctr)
+ {
+ if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+ in, out, len,
+ gctx->ctr))
+ return -1;
+ }
+ else {
+ if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+ return -1;
+ }
+ }
+ return len;
+ }
+ else
+ {
+ if (!ctx->encrypt)
+ {
+ if (gctx->taglen < 0)
+ return -1;
+ if (CRYPTO_gcm128_finish(&gctx->gcm,
+ ctx->buf, gctx->taglen) != 0)
+ return -1;
+ gctx->iv_set = 0;
+ return 0;
+ }
+ CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
+ gctx->taglen = 16;
+ /* Don't reuse the IV */
+ gctx->iv_set = 0;
+ return 0;
+ }
+
+ }
+
+#define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \
+ | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM,
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+
+static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_XTS_CTX *xctx = c->cipher_data;
+ if (type != EVP_CTRL_INIT)
+ return -1;
+ /* key1 and key2 are used as an indicator both key and IV are set */
+ xctx->xts.key1 = NULL;
+ xctx->xts.key2 = NULL;
+ return 1;
+ }
+
+static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+
+ if (key) do
+ {
+#ifdef AES_XTS_ASM
+ xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
+#else
+ xctx->stream = NULL;
+#endif
+ /* key_len is two AES keys */
+#ifdef BSAES_CAPABLE
+ if (BSAES_CAPABLE)
+ xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
+ else
+#endif
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ if (enc)
+ {
+ vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)vpaes_encrypt;
+ }
+ else
+ {
+ vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)vpaes_decrypt;
+ }
+
+ vpaes_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)vpaes_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ break;
+ }
+ else
+#endif
+ (void)0; /* terminate potentially open 'else' */
+
+ if (enc)
+ {
+ AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)AES_encrypt;
+ }
+ else
+ {
+ AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+ xctx->xts.block1 = (block128_f)AES_decrypt;
+ }
+
+ AES_set_encrypt_key(key + ctx->key_len/2,
+ ctx->key_len * 4, &xctx->ks2);
+ xctx->xts.block2 = (block128_f)AES_encrypt;
+
+ xctx->xts.key1 = &xctx->ks1;
+ } while (0);
+
+ if (iv)
+ {
+ xctx->xts.key2 = &xctx->ks2;
+ memcpy(ctx->iv, iv, 16);
+ }
+
+ return 1;
+ }
+
+static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+ if (!xctx->xts.key1 || !xctx->xts.key2)
+ return 0;
+ if (!out || !in || len<AES_BLOCK_SIZE)
+ return 0;
+#ifdef OPENSSL_FIPS
+ /* Requirement of SP800-38E */
+ if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
+ (len > (1UL<<20)*16))
+ {
+ EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
+ return 0;
+ }
+#endif
+ if (xctx->stream)
+ (*xctx->stream)(in, out, len,
+ xctx->xts.key1, xctx->xts.key2, ctx->iv);
+ else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+ ctx->encrypt))
+ return 0;
+ return 1;
+ }
+
+#define aes_xts_cleanup NULL
+
+#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
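+
+/* For illustration: the XTS doubling in BLOCK_CIPHER_custom means
+ * EVP_aes_128_xts reports a 32-byte key (key1||key2) and EVP_aes_256_xts a
+ * 64-byte one, which is why EVP_MAX_KEY_LENGTH grows to 64 in evp.h below.
+ * A hedged sketch of a caller, with the 16-byte IV carrying the tweak
+ * (e.g. a sector number):
+ *
+ *	unsigned char key[32], tweak[16];
+ *	EVP_CIPHER_CTX c;
+ *	int outl;
+ *	EVP_CIPHER_CTX_init(&c);
+ *	EVP_EncryptInit_ex(&c, EVP_aes_128_xts(), NULL, key, tweak);
+ *	EVP_EncryptUpdate(&c, out, &outl, sector, 512);
+ *	EVP_CIPHER_CTX_cleanup(&c);
+ */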
+
+static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+ {
+ EVP_AES_CCM_CTX *cctx = c->cipher_data;
+ switch (type)
+ {
+ case EVP_CTRL_INIT:
+ cctx->key_set = 0;
+ cctx->iv_set = 0;
+ cctx->L = 8;
+ cctx->M = 12;
+ cctx->tag_set = 0;
+ cctx->len_set = 0;
+ return 1;
+
+ case EVP_CTRL_CCM_SET_IVLEN:
+ arg = 15 - arg;
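+ /* fall through: L and the nonce length are related by ivlen = 15 - L */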
+ case EVP_CTRL_CCM_SET_L:
+ if (arg < 2 || arg > 8)
+ return 0;
+ cctx->L = arg;
+ return 1;
+
+ case EVP_CTRL_CCM_SET_TAG:
+ if ((arg & 1) || arg < 4 || arg > 16)
+ return 0;
+ if ((c->encrypt && ptr) || (!c->encrypt && !ptr))
+ return 0;
+ if (ptr)
+ {
+ cctx->tag_set = 1;
+ memcpy(c->buf, ptr, arg);
+ }
+ cctx->M = arg;
+ return 1;
+
+ case EVP_CTRL_CCM_GET_TAG:
+ if (!c->encrypt || !cctx->tag_set)
+ return 0;
+ if(!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg))
+ return 0;
+ cctx->tag_set = 0;
+ cctx->iv_set = 0;
+ cctx->len_set = 0;
+ return 1;
+
+ default:
+ return -1;
+
+ }
+ }
+
+static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ if (!iv && !key)
+ return 1;
+ if (key) do
+ {
+#ifdef VPAES_CAPABLE
+ if (VPAES_CAPABLE)
+ {
+ vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)vpaes_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ break;
+ }
+#endif
+ AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+ CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+ &cctx->ks, (block128_f)AES_encrypt);
+ cctx->str = NULL;
+ cctx->key_set = 1;
+ } while (0);
+ if (iv)
+ {
+ memcpy(ctx->iv, iv, 15 - cctx->L);
+ cctx->iv_set = 1;
+ }
+ return 1;
+ }
+
+static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+ CCM128_CONTEXT *ccm = &cctx->ccm;
+ /* If not set up, return error */
+ if (!cctx->iv_set && !cctx->key_set)
+ return -1;
+ if (!ctx->encrypt && !cctx->tag_set)
+ return -1;
+ if (!out)
+ {
+ if (!in)
+ {
+ if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L,len))
+ return -1;
+ cctx->len_set = 1;
+ return len;
+ }
+ /* If we have AAD, the total message length must already be set */
+ if (!cctx->len_set && len)
+ return -1;
+ CRYPTO_ccm128_aad(ccm, in, len);
+ return len;
+ }
+ /* EVP_*Final() doesn't return any data */
+ if (!in)
+ return 0;
+ /* If the length has not been set yet, set it now */
+ if (!cctx->len_set)
+ {
+ if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+ return -1;
+ cctx->len_set = 1;
+ }
+ if (ctx->encrypt)
+ {
+ if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len,
+ cctx->str) :
+ CRYPTO_ccm128_encrypt(ccm, in, out, len))
+ return -1;
+ cctx->tag_set = 1;
+ return len;
+ }
+ else
+ {
+ int rv = -1;
+ if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len,
+ cctx->str) :
+ !CRYPTO_ccm128_decrypt(ccm, in, out, len))
+ {
+ unsigned char tag[16];
+ if (CRYPTO_ccm128_tag(ccm, tag, cctx->M))
+ {
+ if (!memcmp(tag, ctx->buf, cctx->M))
+ rv = len;
+ }
+ }
+ if (rv == -1)
+ OPENSSL_cleanse(out, len);
+ cctx->iv_set = 0;
+ cctx->tag_set = 0;
+ cctx->len_set = 0;
+ return rv;
+ }
+
+ }
+
+#define aes_ccm_cleanup NULL
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
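+
+/* For illustration, the encrypt-side CCM sequence the code above implies
+ * (hedged sketch; CCM needs the total length before any AAD, and on
+ * encrypt EVP_CTRL_CCM_SET_TAG takes a NULL ptr and only fixes M):
+ *
+ *	EVP_CIPHER_CTX_init(&c);
+ *	EVP_EncryptInit_ex(&c, EVP_aes_128_ccm(), NULL, NULL, NULL);
+ *	EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_CCM_SET_IVLEN, 7, NULL);	(L = 8)
+ *	EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_CCM_SET_TAG, 12, NULL);
+ *	EVP_EncryptInit_ex(&c, NULL, NULL, key, nonce);	(7-byte nonce)
+ *	EVP_EncryptUpdate(&c, NULL, &outl, NULL, inlen);	(set total length)
+ *	EVP_EncryptUpdate(&c, NULL, &outl, aad, aadlen);
+ *	EVP_EncryptUpdate(&c, out, &outl, in, inlen);
+ *	EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_CCM_GET_TAG, 12, tag);
+ */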
+
+#endif
#endif
diff --git a/main/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c b/main/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c
new file mode 100644
index 00000000..fb2c884a
--- /dev/null
+++ b/main/openssl/crypto/evp/e_aes_cbc_hmac_sha1.c
@@ -0,0 +1,581 @@
+/* ====================================================================
+ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/aes.h>
+#include <openssl/sha.h>
+#include "evp_locl.h"
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD 0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+#endif
+
+#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+#define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+#endif
+
+#define TLS1_1_VERSION 0x0302
+
+typedef struct
+ {
+ AES_KEY ks;
+ SHA_CTX head,tail,md;
+ size_t payload_length; /* AAD length in decrypt case */
+ union {
+ unsigned int tls_ver;
+ unsigned char tls_aad[16]; /* 13 used */
+ } aux;
+ } EVP_AES_HMAC_SHA1;
+
+#define NO_PAYLOAD_LENGTH ((size_t)-1)
+
+#if defined(AES_ASM) && ( \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) )
+
+#if defined(__GNUC__) && __GNUC__>=2 && !defined(PEDANTIC)
+# define BSWAP(x) ({ unsigned int r=(x); asm ("bswapl %0":"=r"(r):"0"(r)); r; })
+#endif
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+#define AESNI_CAPABLE (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+ AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+ unsigned char *out,
+ size_t length,
+ const AES_KEY *key,
+ unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks,
+ const AES_KEY *key, unsigned char iv[16],
+ SHA_CTX *ctx,const void *in0);
+
+#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
+
+static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
+ const unsigned char *inkey,
+ const unsigned char *iv, int enc)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+ int ret;
+
+ if (enc)
+ ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks);
+ else
+ ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks);
+
+ SHA1_Init(&key->head); /* handy when benchmarking */
+ key->tail = key->head;
+ key->md = key->head;
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return ret<0?0:1;
+ }
+
+#define STITCHED_CALL
+
+#if !defined(STITCHED_CALL)
+#define aes_off 0
+#endif
+
+void sha1_block_data_order (void *c,const void *p,size_t len);
+
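+/* Hand-rolled SHA1_Update replacement: the unaligned head and tail go
+ * through SHA1_Update, whole blocks go straight to the compression
+ * function, and the Nh/Nl bit count is maintained by hand. */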
+static void sha1_update(SHA_CTX *c,const void *data,size_t len)
+{ const unsigned char *ptr = data;
+ size_t res;
+
+ if ((res = c->num)) {
+ res = SHA_CBLOCK-res;
+ if (len<res) res=len;
+ SHA1_Update (c,ptr,res);
+ ptr += res;
+ len -= res;
+ }
+
+ res = len % SHA_CBLOCK;
+ len -= res;
+
+ if (len) {
+ sha1_block_data_order(c,ptr,len/SHA_CBLOCK);
+
+ ptr += len;
+ c->Nh += len>>29;
+ c->Nl += len<<=3;
+ if (c->Nl<(unsigned int)len) c->Nh++;
+ }
+
+ if (res)
+ SHA1_Update(c,ptr,res);
+}
+
+#ifdef SHA1_Update
+#undef SHA1_Update
+#endif
+#define SHA1_Update sha1_update
+
+static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+ unsigned int l;
+ size_t plen = key->payload_length,
+ iv = 0, /* explicit IV in TLS 1.1 and later */
+ sha_off = 0;
+#if defined(STITCHED_CALL)
+ size_t aes_off = 0,
+ blocks;
+
+ sha_off = SHA_CBLOCK-key->md.num;
+#endif
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ if (len%AES_BLOCK_SIZE) return 0;
+
+ if (ctx->encrypt) {
+ if (plen==NO_PAYLOAD_LENGTH)
+ plen = len;
+ else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE))
+ return 0;
+ else if (key->aux.tls_ver >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+#if defined(STITCHED_CALL)
+ if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) {
+ SHA1_Update(&key->md,in+iv,sha_off);
+
+ aesni_cbc_sha1_enc(in,out,blocks,&key->ks,
+ ctx->iv,&key->md,in+iv+sha_off);
+ blocks *= SHA_CBLOCK;
+ aes_off += blocks;
+ sha_off += blocks;
+ key->md.Nh += blocks>>29;
+ key->md.Nl += blocks<<=3;
+ if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+ } else {
+ sha_off = 0;
+ }
+#endif
+ sha_off += iv;
+ SHA1_Update(&key->md,in+sha_off,plen-sha_off);
+
+ if (plen!=len) { /* "TLS" mode of operation */
+ if (in!=out)
+ memcpy(out+aes_off,in+aes_off,plen-aes_off);
+
+ /* calculate HMAC and append it to payload */
+ SHA1_Final(out+plen,&key->md);
+ key->md = key->tail;
+ SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH);
+ SHA1_Final(out+plen,&key->md);
+
+ /* pad the payload|hmac */
+ plen += SHA_DIGEST_LENGTH;
+ for (l=len-plen-1;plen<len;plen++) out[plen]=l;
+ /* encrypt HMAC|padding at once */
+ aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off,
+ &key->ks,ctx->iv,1);
+ } else {
+ aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off,
+ &key->ks,ctx->iv,1);
+ }
+ } else {
+ union { unsigned int u[SHA_DIGEST_LENGTH/sizeof(unsigned int)];
+ unsigned char c[32+SHA_DIGEST_LENGTH]; } mac, *pmac;
+
+ /* arrange cache line alignment */
+ pmac = (void *)(((size_t)mac.c+31)&((size_t)0-32));
+
+ /* decrypt HMAC|padding at once */
+ aesni_cbc_encrypt(in,out,len,
+ &key->ks,ctx->iv,0);
+
+ if (plen) { /* "TLS" mode of operation */
+ size_t inp_len, mask, j, i;
+ unsigned int res, maxpad, pad, bitlen;
+ int ret = 1;
+ union { unsigned int u[SHA_LBLOCK];
+ unsigned char c[SHA_CBLOCK]; }
+ *data = (void *)key->md.data;
+
+ if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3])
+ >= TLS1_1_VERSION)
+ iv = AES_BLOCK_SIZE;
+
+ if (len<(iv+SHA_DIGEST_LENGTH+1))
+ return 0;
+
+ /* omit explicit iv */
+ out += iv;
+ len -= iv;
+
+ /* figure out payload length */
+ pad = out[len-1];
+ maxpad = len-(SHA_DIGEST_LENGTH+1);
+ maxpad |= (255-maxpad)>>(sizeof(maxpad)*8-8);
+ maxpad &= 255;
+
+ inp_len = len - (SHA_DIGEST_LENGTH+pad+1);
+ mask = (0-((inp_len-len)>>(sizeof(inp_len)*8-1)));
+ inp_len &= mask;
+ ret &= (int)mask;
+
+ key->aux.tls_aad[plen-2] = inp_len>>8;
+ key->aux.tls_aad[plen-1] = inp_len;
+
+ /* calculate HMAC */
+ key->md = key->head;
+ SHA1_Update(&key->md,key->aux.tls_aad,plen);
+
+#if 1
+ len -= SHA_DIGEST_LENGTH; /* amend mac */
+ if (len>=(256+SHA_CBLOCK)) {
+ j = (len-(256+SHA_CBLOCK))&(0-SHA_CBLOCK);
+ j += SHA_CBLOCK-key->md.num;
+ SHA1_Update(&key->md,out,j);
+ out += j;
+ len -= j;
+ inp_len -= j;
+ }
+
+ /* but pretend as if we hashed padded payload */
+ bitlen = key->md.Nl+(inp_len<<3); /* at most 18 bits */
+#ifdef BSWAP
+ bitlen = BSWAP(bitlen);
+#else
+ mac.c[0] = 0;
+ mac.c[1] = (unsigned char)(bitlen>>16);
+ mac.c[2] = (unsigned char)(bitlen>>8);
+ mac.c[3] = (unsigned char)bitlen;
+ bitlen = mac.u[0];
+#endif
+
+ pmac->u[0]=0;
+ pmac->u[1]=0;
+ pmac->u[2]=0;
+ pmac->u[3]=0;
+ pmac->u[4]=0;
+
+ for (res=key->md.num, j=0;j<len;j++) {
+ size_t c = out[j];
+ mask = (j-inp_len)>>(sizeof(j)*8-8);
+ c &= mask;
+ c |= 0x80&~mask&~((inp_len-j)>>(sizeof(j)*8-8));
+ data->c[res++]=(unsigned char)c;
+
+ if (res!=SHA_CBLOCK) continue;
+
+ /* j is not incremented yet */
+ mask = 0-((inp_len+7-j)>>(sizeof(j)*8-1));
+ data->u[SHA_LBLOCK-1] |= bitlen&mask;
+ sha1_block_data_order(&key->md,data,1);
+ mask &= 0-((j-inp_len-72)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+ res=0;
+ }
+
+ for(i=res;i<SHA_CBLOCK;i++,j++) data->c[i]=0;
+
+ if (res>SHA_CBLOCK-8) {
+ mask = 0-((inp_len+8-j)>>(sizeof(j)*8-1));
+ data->u[SHA_LBLOCK-1] |= bitlen&mask;
+ sha1_block_data_order(&key->md,data,1);
+ mask &= 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+
+ memset(data,0,SHA_CBLOCK);
+ j+=64;
+ }
+ data->u[SHA_LBLOCK-1] = bitlen;
+ sha1_block_data_order(&key->md,data,1);
+ mask = 0-((j-inp_len-73)>>(sizeof(j)*8-1));
+ pmac->u[0] |= key->md.h0 & mask;
+ pmac->u[1] |= key->md.h1 & mask;
+ pmac->u[2] |= key->md.h2 & mask;
+ pmac->u[3] |= key->md.h3 & mask;
+ pmac->u[4] |= key->md.h4 & mask;
+
+#ifdef BSWAP
+ pmac->u[0] = BSWAP(pmac->u[0]);
+ pmac->u[1] = BSWAP(pmac->u[1]);
+ pmac->u[2] = BSWAP(pmac->u[2]);
+ pmac->u[3] = BSWAP(pmac->u[3]);
+ pmac->u[4] = BSWAP(pmac->u[4]);
+#else
+ for (i=0;i<5;i++) {
+ res = pmac->u[i];
+ pmac->c[4*i+0]=(unsigned char)(res>>24);
+ pmac->c[4*i+1]=(unsigned char)(res>>16);
+ pmac->c[4*i+2]=(unsigned char)(res>>8);
+ pmac->c[4*i+3]=(unsigned char)res;
+ }
+#endif
+ len += SHA_DIGEST_LENGTH;
+#else
+ SHA1_Update(&key->md,out,inp_len);
+ res = key->md.num;
+ SHA1_Final(pmac->c,&key->md);
+
+ {
+ unsigned int inp_blocks, pad_blocks;
+
+ /* but pretend as if we hashed padded payload */
+ inp_blocks = 1+((SHA_CBLOCK-9-res)>>(sizeof(res)*8-1));
+ res += (unsigned int)(len-inp_len);
+ pad_blocks = res / SHA_CBLOCK;
+ res %= SHA_CBLOCK;
+ pad_blocks += 1+((SHA_CBLOCK-9-res)>>(sizeof(res)*8-1));
+ for (;inp_blocks<pad_blocks;inp_blocks++)
+ sha1_block_data_order(&key->md,data,1);
+ }
+#endif
+ key->md = key->tail;
+ SHA1_Update(&key->md,pmac->c,SHA_DIGEST_LENGTH);
+ SHA1_Final(pmac->c,&key->md);
+
+ /* verify HMAC */
+ out += inp_len;
+ len -= inp_len;
+#if 1
+ {
+ unsigned char *p = out+len-1-maxpad-SHA_DIGEST_LENGTH;
+ size_t off = out-p;
+ unsigned int c, cmask;
+
+ maxpad += SHA_DIGEST_LENGTH;
+ for (res=0,i=0,j=0;j<maxpad;j++) {
+ c = p[j];
+ cmask = ((int)(j-off-SHA_DIGEST_LENGTH))>>(sizeof(int)*8-1);
+ res |= (c^pad)&~cmask; /* ... and padding */
+ cmask &= ((int)(off-1-j))>>(sizeof(int)*8-1);
+ res |= (c^pmac->c[i])&cmask;
+ i += 1&cmask;
+ }
+ maxpad -= SHA_DIGEST_LENGTH;
+
+ res = 0-((0-res)>>(sizeof(res)*8-1));
+ ret &= (int)~res;
+ }
+#else
+ for (res=0,i=0;i<SHA_DIGEST_LENGTH;i++)
+ res |= out[i]^pmac->c[i];
+ res = 0-((0-res)>>(sizeof(res)*8-1));
+ ret &= (int)~res;
+
+ /* verify padding */
+ pad = (pad&~res) | (maxpad&res);
+ out = out+len-1-pad;
+ for (res=0,i=0;i<pad;i++)
+ res |= out[i]^pad;
+
+ res = (0-res)>>(sizeof(res)*8-1);
+ ret &= (int)~res;
+#endif
+ return ret;
+ } else {
+ SHA1_Update(&key->md,out,len);
+ }
+ }
+
+ return 1;
+ }
+
+static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+ {
+ EVP_AES_HMAC_SHA1 *key = data(ctx);
+
+ switch (type)
+ {
+ case EVP_CTRL_AEAD_SET_MAC_KEY:
+ {
+ unsigned int i;
+ unsigned char hmac_key[64];
+
+ memset (hmac_key,0,sizeof(hmac_key));
+
+ if (arg > (int)sizeof(hmac_key)) {
+ SHA1_Init(&key->head);
+ SHA1_Update(&key->head,ptr,arg);
+ SHA1_Final(hmac_key,&key->head);
+ } else {
+ memcpy(hmac_key,ptr,arg);
+ }
+
+ for (i=0;i<sizeof(hmac_key);i++)
+ hmac_key[i] ^= 0x36; /* ipad */
+ SHA1_Init(&key->head);
+ SHA1_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+ for (i=0;i<sizeof(hmac_key);i++)
+ hmac_key[i] ^= 0x36^0x5c; /* opad */
+ SHA1_Init(&key->tail);
+ SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+ OPENSSL_cleanse(hmac_key,sizeof(hmac_key));
+
+ return 1;
+ }
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ {
+ unsigned char *p=ptr;
+ unsigned int len=p[arg-2]<<8|p[arg-1];
+
+ if (ctx->encrypt)
+ {
+ key->payload_length = len;
+ if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) {
+ len -= AES_BLOCK_SIZE;
+ p[arg-2] = len>>8;
+ p[arg-1] = len;
+ }
+ key->md = key->head;
+ SHA1_Update(&key->md,p,arg);
+
+ return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)
+ - len);
+ }
+ else
+ {
+ if (arg>13) arg = 13;
+ memcpy(key->aux.tls_aad,ptr,arg);
+ key->payload_length = arg;
+
+ return SHA_DIGEST_LENGTH;
+ }
+ }
+ default:
+ return -1;
+ }
+ }
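+
+/* For illustration, the TLS-side call pattern these ctrls expect
+ * (hedged sketch): after keying the cipher, the record layer sets the
+ * MAC key, feeds the 13-byte AAD, and learns how much room to reserve:
+ *
+ *	EVP_EncryptInit_ex(&c, EVP_aes_128_cbc_hmac_sha1(), NULL, key, iv);
+ *	EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_AEAD_SET_MAC_KEY, maclen, mac_key);
+ *	pad = EVP_CIPHER_CTX_ctrl(&c, EVP_CTRL_AEAD_TLS1_AAD, 13, aad);
+ *
+ * The record buffer then grows by pad bytes and a single EVP_Cipher()
+ * call encrypts the payload and appends the HMAC and CBC padding.
+ */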
+
+static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher =
+ {
+#ifdef NID_aes_128_cbc_hmac_sha1
+ NID_aes_128_cbc_hmac_sha1,
+#else
+ NID_undef,
+#endif
+ 16,16,16,
+ EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+ aesni_cbc_hmac_sha1_init_key,
+ aesni_cbc_hmac_sha1_cipher,
+ NULL,
+ sizeof(EVP_AES_HMAC_SHA1),
+ EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+ EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+ aesni_cbc_hmac_sha1_ctrl,
+ NULL
+ };
+
+static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher =
+ {
+#ifdef NID_aes_256_cbc_hmac_sha1
+ NID_aes_256_cbc_hmac_sha1,
+#else
+ NID_undef,
+#endif
+ 16,32,16,
+ EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+ aesni_cbc_hmac_sha1_init_key,
+ aesni_cbc_hmac_sha1_cipher,
+ NULL,
+ sizeof(EVP_AES_HMAC_SHA1),
+ EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+ EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+ aesni_cbc_hmac_sha1_ctrl,
+ NULL
+ };
+
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+ {
+ return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+ &aesni_128_cbc_hmac_sha1_cipher:NULL);
+ }
+
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+ {
+ return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+ &aesni_256_cbc_hmac_sha1_cipher:NULL);
+ }
+#else
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+ {
+ return NULL;
+ }
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+ {
+ return NULL;
+ }
+#endif
+#endif
diff --git a/main/openssl/crypto/evp/e_des3.c b/main/openssl/crypto/evp/e_des3.c
index 3232cfe0..8d7b7de2 100644
--- a/main/openssl/crypto/evp/e_des3.c
+++ b/main/openssl/crypto/evp/e_des3.c
@@ -65,6 +65,8 @@
#include <openssl/des.h>
#include <openssl/rand.h>
+#ifndef OPENSSL_FIPS
+
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv,int enc);
@@ -99,7 +101,7 @@ static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
static int des_ede_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
- if (inl>=EVP_MAXCHUNK)
+ while (inl>=EVP_MAXCHUNK)
{
DES_ede3_ofb64_encrypt(in, out, (long)EVP_MAXCHUNK,
&data(ctx)->ks1, &data(ctx)->ks2, &data(ctx)->ks3,
@@ -130,7 +132,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
printf("\n");
}
#endif /* KSSL_DEBUG */
- if (inl>=EVP_MAXCHUNK)
+ while (inl>=EVP_MAXCHUNK)
{
DES_ede3_cbc_encrypt(in, out, (long)EVP_MAXCHUNK,
&data(ctx)->ks1, &data(ctx)->ks2, &data(ctx)->ks3,
@@ -149,7 +151,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
static int des_ede_cfb64_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
const unsigned char *in, size_t inl)
{
- if (inl>=EVP_MAXCHUNK)
+ while (inl>=EVP_MAXCHUNK)
{
DES_ede3_cfb64_encrypt(in, out, (long)EVP_MAXCHUNK,
&data(ctx)->ks1, &data(ctx)->ks2, &data(ctx)->ks3,
@@ -311,3 +313,4 @@ const EVP_CIPHER *EVP_des_ede3(void)
return &des_ede3_ecb;
}
#endif
+#endif
diff --git a/main/openssl/crypto/evp/e_null.c b/main/openssl/crypto/evp/e_null.c
index 7cf50e14..f0c1f78b 100644
--- a/main/openssl/crypto/evp/e_null.c
+++ b/main/openssl/crypto/evp/e_null.c
@@ -61,6 +61,8 @@
#include <openssl/evp.h>
#include <openssl/objects.h>
+#ifndef OPENSSL_FIPS
+
static int null_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv,int enc);
static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
@@ -99,4 +101,4 @@ static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
memcpy((char *)out,(const char *)in,inl);
return 1;
}
-
+#endif
diff --git a/main/openssl/crypto/evp/e_rc2.c b/main/openssl/crypto/evp/e_rc2.c
index f78d7811..d4c33b58 100644
--- a/main/openssl/crypto/evp/e_rc2.c
+++ b/main/openssl/crypto/evp/e_rc2.c
@@ -183,7 +183,8 @@ static int rc2_get_asn1_type_and_iv(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
key_bits =rc2_magic_to_meth((int)num);
if (!key_bits)
return(-1);
- if(i > 0) EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1);
+ if(i > 0 && !EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1))
+ return -1;
EVP_CIPHER_CTX_ctrl(c, EVP_CTRL_SET_RC2_KEY_BITS, key_bits, NULL);
EVP_CIPHER_CTX_set_key_length(c, key_bits / 8);
}
diff --git a/main/openssl/crypto/evp/e_rc4.c b/main/openssl/crypto/evp/e_rc4.c
index 8b5175e0..b4f6bda8 100644
--- a/main/openssl/crypto/evp/e_rc4.c
+++ b/main/openssl/crypto/evp/e_rc4.c
@@ -62,6 +62,7 @@
#ifndef OPENSSL_NO_RC4
#include <openssl/evp.h>
+#include "evp_locl.h"
#include <openssl/objects.h>
#include <openssl/rc4.h>
diff --git a/main/openssl/crypto/evp/e_rc4_hmac_md5.c b/main/openssl/crypto/evp/e_rc4_hmac_md5.c
new file mode 100644
index 00000000..56563191
--- /dev/null
+++ b/main/openssl/crypto/evp/e_rc4_hmac_md5.c
@@ -0,0 +1,298 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/rc4.h>
+#include <openssl/md5.h>
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD 0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+#endif
+
+/* FIXME: surely this is available elsewhere? */
+#define EVP_RC4_KEY_SIZE 16
+
+typedef struct
+ {
+ RC4_KEY ks;
+ MD5_CTX head,tail,md;
+ size_t payload_length;
+ } EVP_RC4_HMAC_MD5;
+
+#define NO_PAYLOAD_LENGTH ((size_t)-1)
+
+void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out,
+ MD5_CTX *ctx,const void *inp,size_t blocks);
+
+#define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data)
+
+static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx,
+ const unsigned char *inkey,
+ const unsigned char *iv, int enc)
+ {
+ EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+ RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx),
+ inkey);
+
+ MD5_Init(&key->head); /* handy when benchmarking */
+ key->tail = key->head;
+ key->md = key->head;
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return 1;
+ }
+
+#if !defined(OPENSSL_NO_ASM) && ( \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__INTEL__) ) && \
+ !(defined(__APPLE__) && defined(__MACH__))
+#define STITCHED_CALL
+#endif
+
+#if !defined(STITCHED_CALL)
+#define rc4_off 0
+#define md5_off 0
+#endif
+
+static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ EVP_RC4_HMAC_MD5 *key = data(ctx);
+#if defined(STITCHED_CALL)
+ size_t rc4_off = 32-1-(key->ks.x&(32-1)), /* 32 is $MOD from rc4_md5-x86_64.pl */
+ md5_off = MD5_CBLOCK-key->md.num,
+ blocks;
+ unsigned int l;
+ extern unsigned int OPENSSL_ia32cap_P[];
+#endif
+ size_t plen = key->payload_length;
+
+ if (plen!=NO_PAYLOAD_LENGTH && len!=(plen+MD5_DIGEST_LENGTH)) return 0;
+
+ if (ctx->encrypt) {
+ if (plen==NO_PAYLOAD_LENGTH) plen = len;
+#if defined(STITCHED_CALL)
+ /* cipher has to "fall behind" */
+ if (rc4_off>md5_off) md5_off+=MD5_CBLOCK;
+
+ if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK) &&
+ (OPENSSL_ia32cap_P[0]&(1<<20))==0) {
+ MD5_Update(&key->md,in,md5_off);
+ RC4(&key->ks,rc4_off,in,out);
+
+ rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+ &key->md,in+md5_off,blocks);
+ blocks *= MD5_CBLOCK;
+ rc4_off += blocks;
+ md5_off += blocks;
+ key->md.Nh += blocks>>29;
+ key->md.Nl += blocks<<=3;
+ if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+ } else {
+ rc4_off = 0;
+ md5_off = 0;
+ }
+#endif
+ MD5_Update(&key->md,in+md5_off,plen-md5_off);
+
+ if (plen!=len) { /* "TLS" mode of operation */
+ if (in!=out)
+ memcpy(out+rc4_off,in+rc4_off,plen-rc4_off);
+
+ /* calculate HMAC and append it to payload */
+ MD5_Final(out+plen,&key->md);
+ key->md = key->tail;
+ MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH);
+ MD5_Final(out+plen,&key->md);
+ /* encrypt HMAC at once */
+ RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off);
+ } else {
+ RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+ }
+ } else {
+ unsigned char mac[MD5_DIGEST_LENGTH];
+#if defined(STITCHED_CALL)
+ /* digest has to "fall behind" */
+ if (md5_off>rc4_off) rc4_off += 2*MD5_CBLOCK;
+ else rc4_off += MD5_CBLOCK;
+
+ if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK) &&
+ (OPENSSL_ia32cap_P[0]&(1<<20))==0) {
+ RC4(&key->ks,rc4_off,in,out);
+ MD5_Update(&key->md,out,md5_off);
+
+ rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+ &key->md,out+md5_off,blocks);
+ blocks *= MD5_CBLOCK;
+ rc4_off += blocks;
+ md5_off += blocks;
+ l = (key->md.Nl+(blocks<<3))&0xffffffffU;
+ if (l<key->md.Nl) key->md.Nh++;
+ key->md.Nl = l;
+ key->md.Nh += blocks>>29;
+ } else {
+ md5_off=0;
+ rc4_off=0;
+ }
+#endif
+ /* decrypt HMAC at once */
+ RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+ if (plen!=NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */
+ MD5_Update(&key->md,out+md5_off,plen-md5_off);
+
+ /* calculate HMAC and verify it */
+ MD5_Final(mac,&key->md);
+ key->md = key->tail;
+ MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH);
+ MD5_Final(mac,&key->md);
+
+ if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH))
+ return 0;
+ } else {
+ MD5_Update(&key->md,out+md5_off,len-md5_off);
+ }
+ }
+
+ key->payload_length = NO_PAYLOAD_LENGTH;
+
+ return 1;
+ }
+
+static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+ {
+ EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+ switch (type)
+ {
+ case EVP_CTRL_AEAD_SET_MAC_KEY:
+ {
+ unsigned int i;
+ unsigned char hmac_key[64];
+
+ memset (hmac_key,0,sizeof(hmac_key));
+
+ if (arg > (int)sizeof(hmac_key)) {
+ MD5_Init(&key->head);
+ MD5_Update(&key->head,ptr,arg);
+ MD5_Final(hmac_key,&key->head);
+ } else {
+ memcpy(hmac_key,ptr,arg);
+ }
+
+ for (i=0;i<sizeof(hmac_key);i++)
+ hmac_key[i] ^= 0x36; /* ipad */
+ MD5_Init(&key->head);
+ MD5_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+ for (i=0;i<sizeof(hmac_key);i++)
+ hmac_key[i] ^= 0x36^0x5c; /* opad */
+ MD5_Init(&key->tail);
+ MD5_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+ return 1;
+ }
+ case EVP_CTRL_AEAD_TLS1_AAD:
+ {
+ unsigned char *p=ptr;
+ unsigned int len=p[arg-2]<<8|p[arg-1];
+
+ if (!ctx->encrypt)
+ {
+ len -= MD5_DIGEST_LENGTH;
+ p[arg-2] = len>>8;
+ p[arg-1] = len;
+ }
+ key->payload_length=len;
+ key->md = key->head;
+ MD5_Update(&key->md,p,arg);
+
+ return MD5_DIGEST_LENGTH;
+ }
+ default:
+ return -1;
+ }
+ }
+
+static EVP_CIPHER r4_hmac_md5_cipher=
+ {
+#ifdef NID_rc4_hmac_md5
+ NID_rc4_hmac_md5,
+#else
+ NID_undef,
+#endif
+ 1,EVP_RC4_KEY_SIZE,0,
+ EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER,
+ rc4_hmac_md5_init_key,
+ rc4_hmac_md5_cipher,
+ NULL,
+ sizeof(EVP_RC4_HMAC_MD5),
+ NULL,
+ NULL,
+ rc4_hmac_md5_ctrl,
+ NULL
+ };
+
+const EVP_CIPHER *EVP_rc4_hmac_md5(void)
+ {
+ return(&r4_hmac_md5_cipher);
+ }
+#endif
diff --git a/main/openssl/crypto/evp/evp.h b/main/openssl/crypto/evp/evp.h
index 9f9795e2..e43a58e6 100644
--- a/main/openssl/crypto/evp/evp.h
+++ b/main/openssl/crypto/evp/evp.h
@@ -83,7 +83,7 @@
#define EVP_RC5_32_12_16_KEY_SIZE 16
*/
#define EVP_MAX_MD_SIZE 64 /* longest known is SHA512 */
-#define EVP_MAX_KEY_LENGTH 32
+#define EVP_MAX_KEY_LENGTH 64
#define EVP_MAX_IV_LENGTH 16
#define EVP_MAX_BLOCK_LENGTH 32
@@ -116,6 +116,7 @@
#define EVP_PKEY_DH NID_dhKeyAgreement
#define EVP_PKEY_EC NID_X9_62_id_ecPublicKey
#define EVP_PKEY_HMAC NID_hmac
+#define EVP_PKEY_CMAC NID_cmac
#ifdef __cplusplus
extern "C" {
@@ -216,6 +217,8 @@ typedef int evp_verify_method(int type,const unsigned char *m,
#define EVP_MD_FLAG_DIGALGID_CUSTOM 0x0018
+#define EVP_MD_FLAG_FIPS 0x0400 /* Note if suitable for use in FIPS mode */
+
/* Digest ctrls */
#define EVP_MD_CTRL_DIGALGID 0x1
@@ -325,6 +328,10 @@ struct evp_cipher_st
#define EVP_CIPH_CBC_MODE 0x2
#define EVP_CIPH_CFB_MODE 0x3
#define EVP_CIPH_OFB_MODE 0x4
+#define EVP_CIPH_CTR_MODE 0x5
+#define EVP_CIPH_GCM_MODE 0x6
+#define EVP_CIPH_CCM_MODE 0x7
+#define EVP_CIPH_XTS_MODE 0x10001
#define EVP_CIPH_MODE 0xF0007
/* Set if variable length cipher */
#define EVP_CIPH_VARIABLE_LENGTH 0x8
@@ -346,6 +353,15 @@ struct evp_cipher_st
#define EVP_CIPH_FLAG_DEFAULT_ASN1 0x1000
/* Buffer length in bits not bytes: CFB1 mode only */
#define EVP_CIPH_FLAG_LENGTH_BITS 0x2000
+/* Note if suitable for use in FIPS mode */
+#define EVP_CIPH_FLAG_FIPS 0x4000
+/* Allow non FIPS cipher in FIPS mode */
+#define EVP_CIPH_FLAG_NON_FIPS_ALLOW 0x8000
+/* Cipher handles any and all padding logic as well
+ * as finalisation.
+ */
+#define EVP_CIPH_FLAG_CUSTOM_CIPHER 0x100000
+#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000
/* ctrl() values */
@@ -358,6 +374,33 @@ struct evp_cipher_st
#define EVP_CTRL_RAND_KEY 0x6
#define EVP_CTRL_PBE_PRF_NID 0x7
#define EVP_CTRL_COPY 0x8
+#define EVP_CTRL_GCM_SET_IVLEN 0x9
+#define EVP_CTRL_GCM_GET_TAG 0x10
+#define EVP_CTRL_GCM_SET_TAG 0x11
+#define EVP_CTRL_GCM_SET_IV_FIXED 0x12
+#define EVP_CTRL_GCM_IV_GEN 0x13
+#define EVP_CTRL_CCM_SET_IVLEN EVP_CTRL_GCM_SET_IVLEN
+#define EVP_CTRL_CCM_GET_TAG EVP_CTRL_GCM_GET_TAG
+#define EVP_CTRL_CCM_SET_TAG EVP_CTRL_GCM_SET_TAG
+#define EVP_CTRL_CCM_SET_L 0x14
+#define EVP_CTRL_CCM_SET_MSGLEN 0x15
+/* AEAD cipher deduces payload length and returns number of bytes
+ * required to store MAC and eventual padding. Subsequent call to
+ * EVP_Cipher even appends/verifies MAC.
+ */
+#define EVP_CTRL_AEAD_TLS1_AAD 0x16
+/* Used by composite AEAD ciphers, no-op in GCM, CCM... */
+#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17
+/* Set the GCM invocation field, decrypt only */
+#define EVP_CTRL_GCM_SET_IV_INV 0x18
+
+/* GCM TLS constants */
+/* Length of fixed part of IV derived from PRF */
+#define EVP_GCM_TLS_FIXED_IV_LEN 4
+/* Length of explicit part of IV part of TLS records */
+#define EVP_GCM_TLS_EXPLICIT_IV_LEN 8
+/* Length of tag for TLS */
+#define EVP_GCM_TLS_TAG_LEN 16
typedef struct evp_cipher_info_st
{
@@ -375,7 +418,7 @@ struct evp_cipher_ctx_st
unsigned char oiv[EVP_MAX_IV_LENGTH]; /* original iv */
unsigned char iv[EVP_MAX_IV_LENGTH]; /* working iv */
unsigned char buf[EVP_MAX_BLOCK_LENGTH];/* saved partial block */
- int num; /* used by cfb/ofb mode */
+ int num; /* used by cfb/ofb/ctr mode */
void *app_data; /* application stuff */
int key_len; /* May change for variable length cipher */
@@ -695,6 +738,9 @@ const EVP_MD *EVP_dev_crypto_md5(void);
#ifndef OPENSSL_NO_RC4
const EVP_CIPHER *EVP_rc4(void);
const EVP_CIPHER *EVP_rc4_40(void);
+#ifndef OPENSSL_NO_MD5
+const EVP_CIPHER *EVP_rc4_hmac_md5(void);
+#endif
#endif
#ifndef OPENSSL_NO_IDEA
const EVP_CIPHER *EVP_idea_ecb(void);
@@ -741,9 +787,10 @@ const EVP_CIPHER *EVP_aes_128_cfb8(void);
const EVP_CIPHER *EVP_aes_128_cfb128(void);
# define EVP_aes_128_cfb EVP_aes_128_cfb128
const EVP_CIPHER *EVP_aes_128_ofb(void);
-#if 0
const EVP_CIPHER *EVP_aes_128_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_128_ccm(void);
+const EVP_CIPHER *EVP_aes_128_gcm(void);
+const EVP_CIPHER *EVP_aes_128_xts(void);
const EVP_CIPHER *EVP_aes_192_ecb(void);
const EVP_CIPHER *EVP_aes_192_cbc(void);
const EVP_CIPHER *EVP_aes_192_cfb1(void);
@@ -751,9 +798,9 @@ const EVP_CIPHER *EVP_aes_192_cfb8(void);
const EVP_CIPHER *EVP_aes_192_cfb128(void);
# define EVP_aes_192_cfb EVP_aes_192_cfb128
const EVP_CIPHER *EVP_aes_192_ofb(void);
-#if 0
const EVP_CIPHER *EVP_aes_192_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_192_ccm(void);
+const EVP_CIPHER *EVP_aes_192_gcm(void);
const EVP_CIPHER *EVP_aes_256_ecb(void);
const EVP_CIPHER *EVP_aes_256_cbc(void);
const EVP_CIPHER *EVP_aes_256_cfb1(void);
@@ -761,8 +808,13 @@ const EVP_CIPHER *EVP_aes_256_cfb8(void);
const EVP_CIPHER *EVP_aes_256_cfb128(void);
# define EVP_aes_256_cfb EVP_aes_256_cfb128
const EVP_CIPHER *EVP_aes_256_ofb(void);
-#if 0
const EVP_CIPHER *EVP_aes_256_ctr(void);
+const EVP_CIPHER *EVP_aes_256_ccm(void);
+const EVP_CIPHER *EVP_aes_256_gcm(void);
+const EVP_CIPHER *EVP_aes_256_xts(void);
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void);
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void);
#endif
#endif
#ifndef OPENSSL_NO_CAMELLIA
@@ -869,6 +921,7 @@ struct ec_key_st *EVP_PKEY_get1_EC_KEY(EVP_PKEY *pkey);
#endif
EVP_PKEY * EVP_PKEY_new(void);
+EVP_PKEY * EVP_PKEY_dup(EVP_PKEY *pkey);
void EVP_PKEY_free(EVP_PKEY *pkey);
EVP_PKEY * d2i_PublicKey(int type,EVP_PKEY **a, const unsigned char **pp,
@@ -1047,13 +1100,22 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
#define EVP_PKEY_CTRL_CMS_DECRYPT 10
#define EVP_PKEY_CTRL_CMS_SIGN 11
+#define EVP_PKEY_CTRL_CIPHER 12
+
#define EVP_PKEY_ALG_CTRL 0x1000
#define EVP_PKEY_FLAG_AUTOARGLEN 2
+/* Method handles all operations: don't assume any digest related
+ * defaults.
+ */
+#define EVP_PKEY_FLAG_SIGCTX_CUSTOM 4
const EVP_PKEY_METHOD *EVP_PKEY_meth_find(int type);
EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags);
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+ const EVP_PKEY_METHOD *meth);
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src);
void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth);
int EVP_PKEY_meth_add0(const EVP_PKEY_METHOD *pmeth);
@@ -1071,7 +1133,7 @@ int EVP_PKEY_CTX_get_operation(EVP_PKEY_CTX *ctx);
void EVP_PKEY_CTX_set0_keygen_info(EVP_PKEY_CTX *ctx, int *dat, int datlen);
EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
- unsigned char *key, int keylen);
+ const unsigned char *key, int keylen);
void EVP_PKEY_CTX_set_data(EVP_PKEY_CTX *ctx, void *data);
void *EVP_PKEY_CTX_get_data(EVP_PKEY_CTX *ctx);
@@ -1181,6 +1243,8 @@ void EVP_PKEY_meth_set_ctrl(EVP_PKEY_METHOD *pmeth,
int (*ctrl_str)(EVP_PKEY_CTX *ctx,
const char *type, const char *value));
+void EVP_add_alg_module(void);
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -1190,8 +1254,14 @@ void ERR_load_EVP_strings(void);
/* Error codes for the EVP functions. */
/* Function codes. */
+#define EVP_F_AESNI_INIT_KEY 165
+#define EVP_F_AESNI_XTS_CIPHER 176
#define EVP_F_AES_INIT_KEY 133
+#define EVP_F_AES_XTS 172
+#define EVP_F_AES_XTS_CIPHER 175
+#define EVP_F_ALG_MODULE_INIT 177
#define EVP_F_CAMELLIA_INIT_KEY 159
+#define EVP_F_CMAC_INIT 173
#define EVP_F_D2I_PKEY 100
#define EVP_F_DO_SIGVER_INIT 161
#define EVP_F_DSAPKEY2PKCS8 134
@@ -1246,15 +1316,24 @@ void ERR_load_EVP_strings(void);
#define EVP_F_EVP_RIJNDAEL 126
#define EVP_F_EVP_SIGNFINAL 107
#define EVP_F_EVP_VERIFYFINAL 108
+#define EVP_F_FIPS_CIPHERINIT 166
+#define EVP_F_FIPS_CIPHER_CTX_COPY 170
+#define EVP_F_FIPS_CIPHER_CTX_CTRL 167
+#define EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH 171
+#define EVP_F_FIPS_DIGESTINIT 168
+#define EVP_F_FIPS_MD_CTX_COPY 169
+#define EVP_F_HMAC_INIT_EX 174
#define EVP_F_INT_CTX_NEW 157
#define EVP_F_PKCS5_PBE_KEYIVGEN 117
#define EVP_F_PKCS5_V2_PBE_KEYIVGEN 118
+#define EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN 164
#define EVP_F_PKCS8_SET_BROKEN 112
#define EVP_F_PKEY_SET_TYPE 158
#define EVP_F_RC2_MAGIC_TO_METH 109
#define EVP_F_RC5_CTRL 125
/* Reason codes. */
+#define EVP_R_AES_IV_SETUP_FAILED 162
#define EVP_R_AES_KEY_SETUP_FAILED 143
#define EVP_R_ASN1_LIB 140
#define EVP_R_BAD_BLOCK_LENGTH 136
@@ -1272,16 +1351,21 @@ void ERR_load_EVP_strings(void);
#define EVP_R_DECODE_ERROR 114
#define EVP_R_DIFFERENT_KEY_TYPES 101
#define EVP_R_DIFFERENT_PARAMETERS 153
+#define EVP_R_DISABLED_FOR_FIPS 163
#define EVP_R_ENCODE_ERROR 115
+#define EVP_R_ERROR_LOADING_SECTION 165
+#define EVP_R_ERROR_SETTING_FIPS_MODE 166
#define EVP_R_EVP_PBE_CIPHERINIT_ERROR 119
#define EVP_R_EXPECTING_AN_RSA_KEY 127
#define EVP_R_EXPECTING_A_DH_KEY 128
#define EVP_R_EXPECTING_A_DSA_KEY 129
#define EVP_R_EXPECTING_A_ECDSA_KEY 141
#define EVP_R_EXPECTING_A_EC_KEY 142
+#define EVP_R_FIPS_MODE_NOT_SUPPORTED 167
#define EVP_R_INITIALIZATION_ERROR 134
#define EVP_R_INPUT_NOT_INITIALIZED 111
#define EVP_R_INVALID_DIGEST 152
+#define EVP_R_INVALID_FIPS_MODE 168
#define EVP_R_INVALID_KEY_LENGTH 130
#define EVP_R_INVALID_OPERATION 148
#define EVP_R_IV_TOO_LARGE 102
@@ -1303,8 +1387,10 @@ void ERR_load_EVP_strings(void);
#define EVP_R_PRIVATE_KEY_DECODE_ERROR 145
#define EVP_R_PRIVATE_KEY_ENCODE_ERROR 146
#define EVP_R_PUBLIC_KEY_NOT_RSA 106
+#define EVP_R_TOO_LARGE 164
#define EVP_R_UNKNOWN_CIPHER 160
#define EVP_R_UNKNOWN_DIGEST 161
+#define EVP_R_UNKNOWN_OPTION 169
#define EVP_R_UNKNOWN_PBE_ALGORITHM 121
#define EVP_R_UNSUPORTED_NUMBER_OF_ROUNDS 135
#define EVP_R_UNSUPPORTED_ALGORITHM 156
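
The GCM ctrls and TLS length constants added above follow a two-stage init pattern: select the cipher first, adjust the IV length, then supply key and IV. A minimal encryption sketch under those assumptions (key, IV and AAD buffers are caller-supplied placeholders):

/* Sketch only: AES-128-GCM encryption with the ctrls declared above. */
#include <openssl/evp.h>

static int gcm_encrypt(const unsigned char key[16],
		       const unsigned char iv[12],
		       const unsigned char *aad, int aadlen,
		       const unsigned char *pt, int ptlen,
		       unsigned char *ct,
		       unsigned char tag[EVP_GCM_TLS_TAG_LEN])
	{
	int len, ok = 0;
	EVP_CIPHER_CTX ctx;
	EVP_CIPHER_CTX_init(&ctx);
	if (!EVP_EncryptInit_ex(&ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
		goto done;
	if (EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_GCM_SET_IVLEN, 12, NULL) <= 0)
		goto done;
	if (!EVP_EncryptInit_ex(&ctx, NULL, NULL, key, iv))
		goto done;
	/* AAD pass: output pointer is NULL */
	if (aadlen > 0 && !EVP_EncryptUpdate(&ctx, NULL, &len, aad, aadlen))
		goto done;
	if (!EVP_EncryptUpdate(&ctx, ct, &len, pt, ptlen))
		goto done;
	if (!EVP_EncryptFinal_ex(&ctx, ct + len, &len))
		goto done;
	if (EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_GCM_GET_TAG,
				EVP_GCM_TLS_TAG_LEN, tag) <= 0)
		goto done;
	ok = 1;
done:
	EVP_CIPHER_CTX_cleanup(&ctx);
	return ok;
	}
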
diff --git a/main/openssl/crypto/evp/evp_cnf.c b/main/openssl/crypto/evp/evp_cnf.c
new file mode 100644
index 00000000..2e4db302
--- /dev/null
+++ b/main/openssl/crypto/evp/evp_cnf.c
@@ -0,0 +1,125 @@
+/* evp_cnf.c */
+/* Written by Stephen Henson (steve@openssl.org) for the OpenSSL
+ * project 2007.
+ */
+/* ====================================================================
+ * Copyright (c) 2007 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <openssl/crypto.h>
+#include "cryptlib.h"
+#include <openssl/conf.h>
+#include <openssl/dso.h>
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
+
+/* Algorithm configuration module. */
+
+static int alg_module_init(CONF_IMODULE *md, const CONF *cnf)
+ {
+ int i;
+ const char *oid_section;
+ STACK_OF(CONF_VALUE) *sktmp;
+ CONF_VALUE *oval;
+ oid_section = CONF_imodule_get_value(md);
+ if(!(sktmp = NCONF_get_section(cnf, oid_section)))
+ {
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_ERROR_LOADING_SECTION);
+ return 0;
+ }
+ for(i = 0; i < sk_CONF_VALUE_num(sktmp); i++)
+ {
+ oval = sk_CONF_VALUE_value(sktmp, i);
+ if (!strcmp(oval->name, "fips_mode"))
+ {
+ int m;
+ if (!X509V3_get_value_bool(oval, &m))
+ {
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_INVALID_FIPS_MODE);
+ return 0;
+ }
+ if (m > 0)
+ {
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode() && !FIPS_mode_set(1))
+ {
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_ERROR_SETTING_FIPS_MODE);
+ return 0;
+ }
+#else
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_FIPS_MODE_NOT_SUPPORTED);
+ return 0;
+#endif
+ }
+ }
+ else
+ {
+ EVPerr(EVP_F_ALG_MODULE_INIT, EVP_R_UNKNOWN_OPTION);
+ ERR_add_error_data(4, "name=", oval->name,
+ ", value=", oval->value);
+ }
+
+ }
+ return 1;
+ }
+
+void EVP_add_alg_module(void)
+ {
+ CONF_module_add("alg_section", alg_module_init, 0);
+ }
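
alg_module_init() above is registered under the name "alg_section" by EVP_add_alg_module(), so it only runs when a config file points at it. A hedged sketch of the wiring — the section and file names here are illustrative, not mandated by the patch:

/* Sketch: exercising the new alg_section config module.
 *
 * openssl.cnf fragment (illustrative):
 *     openssl_conf = openssl_init
 *     [openssl_init]
 *     alg_section = evp_settings
 *     [evp_settings]
 *     fips_mode = yes
 */
#include <openssl/conf.h>
#include <openssl/evp.h>

static int load_alg_config(const char *path)
	{
	EVP_add_alg_module();	/* registers the "alg_section" handler */
	/* Unknown names inside the section are reported through
	 * EVP_R_UNKNOWN_OPTION, as in alg_module_init() above. */
	return CONF_modules_load_file(path, NULL, 0);
	}
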
diff --git a/main/openssl/crypto/evp/evp_enc.c b/main/openssl/crypto/evp/evp_enc.c
index c268d25c..0c54f05e 100644
--- a/main/openssl/crypto/evp/evp_enc.c
+++ b/main/openssl/crypto/evp/evp_enc.c
@@ -64,8 +64,18 @@
#ifndef OPENSSL_NO_ENGINE
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
#include "evp_locl.h"
+#ifdef OPENSSL_FIPS
+#define M_do_cipher(ctx, out, in, inl) FIPS_cipher(ctx, out, in, inl)
+#else
+#define M_do_cipher(ctx, out, in, inl) ctx->cipher->do_cipher(ctx, out, in, inl)
+#endif
+
+
const char EVP_version[]="EVP" OPENSSL_VERSION_PTEXT;
void EVP_CIPHER_CTX_init(EVP_CIPHER_CTX *ctx)
@@ -115,10 +125,14 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
/* Ensure a context left lying around from last time is cleared
* (the previous check attempted to avoid this if the same
* ENGINE and EVP_CIPHER could be used). */
- EVP_CIPHER_CTX_cleanup(ctx);
-
- /* Restore encrypt field: it is zeroed by cleanup */
- ctx->encrypt = enc;
+ if (ctx->cipher)
+ {
+ unsigned long flags = ctx->flags;
+ EVP_CIPHER_CTX_cleanup(ctx);
+ /* Restore encrypt and flags */
+ ctx->encrypt = enc;
+ ctx->flags = flags;
+ }
#ifndef OPENSSL_NO_ENGINE
if(impl)
{
@@ -155,6 +169,10 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
ctx->engine = NULL;
#endif
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_cipherinit(ctx, cipher, key, iv, enc);
+#endif
ctx->cipher=cipher;
if (ctx->cipher->ctx_size)
{
@@ -188,6 +206,10 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
#ifndef OPENSSL_NO_ENGINE
skip_to_init:
#endif
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_cipherinit(ctx, cipher, key, iv, enc);
+#endif
/* we assume block size is a power of 2 in *cryptUpdate */
OPENSSL_assert(ctx->cipher->block_size == 1
|| ctx->cipher->block_size == 8
@@ -214,6 +236,13 @@ skip_to_init:
memcpy(ctx->iv, ctx->oiv, EVP_CIPHER_CTX_iv_length(ctx));
break;
+ case EVP_CIPH_CTR_MODE:
+ ctx->num = 0;
+ /* Don't reuse IV for CTR mode */
+ if(iv)
+ memcpy(ctx->iv, iv, EVP_CIPHER_CTX_iv_length(ctx));
+ break;
+
default:
return 0;
break;
@@ -280,6 +309,16 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
{
int i,j,bl;
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+ {
+ i = M_do_cipher(ctx, out, in, inl);
+ if (i < 0)
+ return 0;
+ else
+ *outl = i;
+ return 1;
+ }
+
if (inl <= 0)
{
*outl = 0;
@@ -288,7 +327,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
if(ctx->buf_len == 0 && (inl&(ctx->block_mask)) == 0)
{
- if(ctx->cipher->do_cipher(ctx,out,in,inl))
+ if(M_do_cipher(ctx,out,in,inl))
{
*outl=inl;
return 1;
@@ -315,7 +354,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
{
j=bl-i;
memcpy(&(ctx->buf[i]),in,j);
- if(!ctx->cipher->do_cipher(ctx,out,ctx->buf,bl)) return 0;
+ if(!M_do_cipher(ctx,out,ctx->buf,bl)) return 0;
inl-=j;
in+=j;
out+=bl;
@@ -328,7 +367,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
inl-=i;
if (inl > 0)
{
- if(!ctx->cipher->do_cipher(ctx,out,in,inl)) return 0;
+ if(!M_do_cipher(ctx,out,in,inl)) return 0;
*outl+=inl;
}
@@ -350,6 +389,16 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
int n,ret;
unsigned int i, b, bl;
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+ {
+ ret = M_do_cipher(ctx, out, NULL, 0);
+ if (ret < 0)
+ return 0;
+ else
+ *outl = ret;
+ return 1;
+ }
+
b=ctx->cipher->block_size;
OPENSSL_assert(b <= sizeof ctx->buf);
if (b == 1)
@@ -372,7 +421,7 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
n=b-bl;
for (i=bl; i<b; i++)
ctx->buf[i]=n;
- ret=ctx->cipher->do_cipher(ctx,out,ctx->buf,b);
+ ret=M_do_cipher(ctx,out,ctx->buf,b);
if(ret)
@@ -387,6 +436,19 @@ int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
int fix_len;
unsigned int b;
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+ {
+ fix_len = M_do_cipher(ctx, out, in, inl);
+ if (fix_len < 0)
+ {
+ *outl = 0;
+ return 0;
+ }
+ else
+ *outl = fix_len;
+ return 1;
+ }
+
if (inl <= 0)
{
*outl = 0;
@@ -440,8 +502,18 @@ int EVP_DecryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
{
int i,n;
unsigned int b;
-
*outl=0;
+
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+ {
+ i = M_do_cipher(ctx, out, NULL, 0);
+ if (i < 0)
+ return 0;
+ else
+ *outl = i;
+ return 1;
+ }
+
b=ctx->cipher->block_size;
if (ctx->flags & EVP_CIPH_NO_PADDING)
{
@@ -496,6 +568,7 @@ void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx)
int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c)
{
+#ifndef OPENSSL_FIPS
if (c->cipher != NULL)
{
if(c->cipher->cleanup && !c->cipher->cleanup(c))
@@ -506,12 +579,16 @@ int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c)
}
if (c->cipher_data)
OPENSSL_free(c->cipher_data);
+#endif
#ifndef OPENSSL_NO_ENGINE
if (c->engine)
/* The EVP_CIPHER we used belongs to an ENGINE, release the
* functional reference we held for this reason. */
ENGINE_finish(c->engine);
#endif
+#ifdef OPENSSL_FIPS
+ FIPS_cipher_ctx_cleanup(c);
+#endif
memset(c,0,sizeof(EVP_CIPHER_CTX));
return 1;
}
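
The EVP_CIPH_FLAG_CUSTOM_CIPHER branches added above change the do_cipher() contract: the callback is handed the raw input (NULL input on the Final call), returns the number of bytes it wrote, and signals failure with a negative value. A skeleton honouring that contract, as a sketch only:

/* Sketch: do_cipher() shape for a cipher that sets
 * EVP_CIPH_FLAG_CUSTOM_CIPHER; the transform itself is elided. */
static int custom_do_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
			    const unsigned char *in, size_t inl)
	{
	if (in == NULL)
		{
		/* Final call from EVP_{En,De}cryptFinal_ex: append or
		 * verify the MAC, flush padding, etc. */
		return 0;	/* bytes emitted by finalisation */
		}
	/* ... process inl bytes from in into out ... */
	return (int)inl;	/* bytes written, or a negative value on error */
	}
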
diff --git a/main/openssl/crypto/evp/evp_err.c b/main/openssl/crypto/evp/evp_err.c
index d8bfec09..08eab988 100644
--- a/main/openssl/crypto/evp/evp_err.c
+++ b/main/openssl/crypto/evp/evp_err.c
@@ -1,6 +1,6 @@
/* crypto/evp/evp_err.c */
/* ====================================================================
- * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -70,8 +70,14 @@
static ERR_STRING_DATA EVP_str_functs[]=
{
+{ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"},
+{ERR_FUNC(EVP_F_AESNI_XTS_CIPHER), "AESNI_XTS_CIPHER"},
{ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"},
+{ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
+{ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
+{ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
{ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY), "CAMELLIA_INIT_KEY"},
+{ERR_FUNC(EVP_F_CMAC_INIT), "CMAC_INIT"},
{ERR_FUNC(EVP_F_D2I_PKEY), "D2I_PKEY"},
{ERR_FUNC(EVP_F_DO_SIGVER_INIT), "DO_SIGVER_INIT"},
{ERR_FUNC(EVP_F_DSAPKEY2PKCS8), "DSAPKEY2PKCS8"},
@@ -86,7 +92,7 @@ static ERR_STRING_DATA EVP_str_functs[]=
{ERR_FUNC(EVP_F_EVP_DIGESTINIT_EX), "EVP_DigestInit_ex"},
{ERR_FUNC(EVP_F_EVP_ENCRYPTFINAL_EX), "EVP_EncryptFinal_ex"},
{ERR_FUNC(EVP_F_EVP_MD_CTX_COPY_EX), "EVP_MD_CTX_copy_ex"},
-{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_SIZE"},
+{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_size"},
{ERR_FUNC(EVP_F_EVP_OPENINIT), "EVP_OpenInit"},
{ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD), "EVP_PBE_alg_add"},
{ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD_TYPE), "EVP_PBE_alg_add_type"},
@@ -126,9 +132,17 @@ static ERR_STRING_DATA EVP_str_functs[]=
{ERR_FUNC(EVP_F_EVP_RIJNDAEL), "EVP_RIJNDAEL"},
{ERR_FUNC(EVP_F_EVP_SIGNFINAL), "EVP_SignFinal"},
{ERR_FUNC(EVP_F_EVP_VERIFYFINAL), "EVP_VerifyFinal"},
+{ERR_FUNC(EVP_F_FIPS_CIPHERINIT), "FIPS_CIPHERINIT"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_COPY), "FIPS_CIPHER_CTX_COPY"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_CTRL), "FIPS_CIPHER_CTX_CTRL"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH), "FIPS_CIPHER_CTX_SET_KEY_LENGTH"},
+{ERR_FUNC(EVP_F_FIPS_DIGESTINIT), "FIPS_DIGESTINIT"},
+{ERR_FUNC(EVP_F_FIPS_MD_CTX_COPY), "FIPS_MD_CTX_COPY"},
+{ERR_FUNC(EVP_F_HMAC_INIT_EX), "HMAC_Init_ex"},
{ERR_FUNC(EVP_F_INT_CTX_NEW), "INT_CTX_NEW"},
{ERR_FUNC(EVP_F_PKCS5_PBE_KEYIVGEN), "PKCS5_PBE_keyivgen"},
{ERR_FUNC(EVP_F_PKCS5_V2_PBE_KEYIVGEN), "PKCS5_v2_PBE_keyivgen"},
+{ERR_FUNC(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN), "PKCS5_V2_PBKDF2_KEYIVGEN"},
{ERR_FUNC(EVP_F_PKCS8_SET_BROKEN), "PKCS8_set_broken"},
{ERR_FUNC(EVP_F_PKEY_SET_TYPE), "PKEY_SET_TYPE"},
{ERR_FUNC(EVP_F_RC2_MAGIC_TO_METH), "RC2_MAGIC_TO_METH"},
@@ -138,6 +152,7 @@ static ERR_STRING_DATA EVP_str_functs[]=
static ERR_STRING_DATA EVP_str_reasons[]=
{
+{ERR_REASON(EVP_R_AES_IV_SETUP_FAILED) ,"aes iv setup failed"},
{ERR_REASON(EVP_R_AES_KEY_SETUP_FAILED) ,"aes key setup failed"},
{ERR_REASON(EVP_R_ASN1_LIB) ,"asn1 lib"},
{ERR_REASON(EVP_R_BAD_BLOCK_LENGTH) ,"bad block length"},
@@ -155,16 +170,21 @@ static ERR_STRING_DATA EVP_str_reasons[]=
{ERR_REASON(EVP_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(EVP_R_DIFFERENT_KEY_TYPES) ,"different key types"},
{ERR_REASON(EVP_R_DIFFERENT_PARAMETERS) ,"different parameters"},
+{ERR_REASON(EVP_R_DISABLED_FOR_FIPS) ,"disabled for fips"},
{ERR_REASON(EVP_R_ENCODE_ERROR) ,"encode error"},
+{ERR_REASON(EVP_R_ERROR_LOADING_SECTION) ,"error loading section"},
+{ERR_REASON(EVP_R_ERROR_SETTING_FIPS_MODE),"error setting fips mode"},
{ERR_REASON(EVP_R_EVP_PBE_CIPHERINIT_ERROR),"evp pbe cipherinit error"},
{ERR_REASON(EVP_R_EXPECTING_AN_RSA_KEY) ,"expecting an rsa key"},
{ERR_REASON(EVP_R_EXPECTING_A_DH_KEY) ,"expecting a dh key"},
{ERR_REASON(EVP_R_EXPECTING_A_DSA_KEY) ,"expecting a dsa key"},
{ERR_REASON(EVP_R_EXPECTING_A_ECDSA_KEY) ,"expecting a ecdsa key"},
{ERR_REASON(EVP_R_EXPECTING_A_EC_KEY) ,"expecting a ec key"},
+{ERR_REASON(EVP_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"},
{ERR_REASON(EVP_R_INITIALIZATION_ERROR) ,"initialization error"},
{ERR_REASON(EVP_R_INPUT_NOT_INITIALIZED) ,"input not initialized"},
{ERR_REASON(EVP_R_INVALID_DIGEST) ,"invalid digest"},
+{ERR_REASON(EVP_R_INVALID_FIPS_MODE) ,"invalid fips mode"},
{ERR_REASON(EVP_R_INVALID_KEY_LENGTH) ,"invalid key length"},
{ERR_REASON(EVP_R_INVALID_OPERATION) ,"invalid operation"},
{ERR_REASON(EVP_R_IV_TOO_LARGE) ,"iv too large"},
@@ -186,8 +206,10 @@ static ERR_STRING_DATA EVP_str_reasons[]=
{ERR_REASON(EVP_R_PRIVATE_KEY_DECODE_ERROR),"private key decode error"},
{ERR_REASON(EVP_R_PRIVATE_KEY_ENCODE_ERROR),"private key encode error"},
{ERR_REASON(EVP_R_PUBLIC_KEY_NOT_RSA) ,"public key not rsa"},
+{ERR_REASON(EVP_R_TOO_LARGE) ,"too large"},
{ERR_REASON(EVP_R_UNKNOWN_CIPHER) ,"unknown cipher"},
{ERR_REASON(EVP_R_UNKNOWN_DIGEST) ,"unknown digest"},
+{ERR_REASON(EVP_R_UNKNOWN_OPTION) ,"unknown option"},
{ERR_REASON(EVP_R_UNKNOWN_PBE_ALGORITHM) ,"unknown pbe algorithm"},
{ERR_REASON(EVP_R_UNSUPORTED_NUMBER_OF_ROUNDS),"unsuported number of rounds"},
{ERR_REASON(EVP_R_UNSUPPORTED_ALGORITHM) ,"unsupported algorithm"},
diff --git a/main/openssl/crypto/evp/evp_key.c b/main/openssl/crypto/evp/evp_key.c
index 839d6a3a..7961fbeb 100644
--- a/main/openssl/crypto/evp/evp_key.c
+++ b/main/openssl/crypto/evp/evp_key.c
@@ -120,7 +120,7 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
unsigned char md_buf[EVP_MAX_MD_SIZE];
int niv,nkey,addmd=0;
unsigned int mds=0,i;
-
+ int rv = 0;
nkey=type->key_len;
niv=type->iv_len;
OPENSSL_assert(nkey <= EVP_MAX_KEY_LENGTH);
@@ -134,17 +134,24 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
if (!EVP_DigestInit_ex(&c,md, NULL))
return 0;
if (addmd++)
- EVP_DigestUpdate(&c,&(md_buf[0]),mds);
- EVP_DigestUpdate(&c,data,datal);
+ if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds))
+ goto err;
+ if (!EVP_DigestUpdate(&c,data,datal))
+ goto err;
if (salt != NULL)
- EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN);
- EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds);
+ if (!EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN))
+ goto err;
+ if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds))
+ goto err;
for (i=1; i<(unsigned int)count; i++)
{
- EVP_DigestInit_ex(&c,md, NULL);
- EVP_DigestUpdate(&c,&(md_buf[0]),mds);
- EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds);
+ if (!EVP_DigestInit_ex(&c,md, NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds))
+ goto err;
+ if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds))
+ goto err;
}
i=0;
if (nkey)
@@ -173,8 +180,10 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
}
if ((nkey == 0) && (niv == 0)) break;
}
+ rv = type->key_len;
+ err:
EVP_MD_CTX_cleanup(&c);
OPENSSL_cleanse(&(md_buf[0]),EVP_MAX_MD_SIZE);
- return(type->key_len);
+ return rv;
}
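
Because EVP_BytesToKey() can now return 0 when a digest operation fails, callers should stop assuming the return value always equals the cipher's key length. A checked-use sketch (the cipher and digest choices are illustrative):

/* Sketch: EVP_BytesToKey with the return value actually checked. */
#include <openssl/evp.h>

static int derive_key_iv(const unsigned char *pass, int passlen,
			 const unsigned char salt[PKCS5_SALT_LEN],
			 unsigned char key[EVP_MAX_KEY_LENGTH],
			 unsigned char iv[EVP_MAX_IV_LENGTH])
	{
	int keylen = EVP_BytesToKey(EVP_aes_256_cbc(), EVP_sha1(), salt,
				    pass, passlen, 1, key, iv);
	return keylen == EVP_CIPHER_key_length(EVP_aes_256_cbc());
	}
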
diff --git a/main/openssl/crypto/evp/evp_lib.c b/main/openssl/crypto/evp/evp_lib.c
index 40951a04..b180e482 100644
--- a/main/openssl/crypto/evp/evp_lib.c
+++ b/main/openssl/crypto/evp/evp_lib.c
@@ -67,6 +67,8 @@ int EVP_CIPHER_param_to_asn1(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
if (c->cipher->set_asn1_parameters != NULL)
ret=c->cipher->set_asn1_parameters(c,type);
+ else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+ ret=EVP_CIPHER_set_asn1_iv(c, type);
else
ret=-1;
return(ret);
@@ -78,6 +80,8 @@ int EVP_CIPHER_asn1_to_param(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
if (c->cipher->get_asn1_parameters != NULL)
ret=c->cipher->get_asn1_parameters(c,type);
+ else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+ ret=EVP_CIPHER_get_asn1_iv(c, type);
else
ret=-1;
return(ret);
diff --git a/main/openssl/crypto/evp/evp_locl.h b/main/openssl/crypto/evp/evp_locl.h
index 292d74c1..08c0a66d 100644
--- a/main/openssl/crypto/evp/evp_locl.h
+++ b/main/openssl/crypto/evp/evp_locl.h
@@ -343,3 +343,43 @@ struct evp_pkey_method_st
} /* EVP_PKEY_METHOD */;
void evp_pkey_set_cb_translate(BN_GENCB *cb, EVP_PKEY_CTX *ctx);
+
+int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
+ ASN1_TYPE *param,
+ const EVP_CIPHER *c, const EVP_MD *md, int en_de);
+
+#ifdef OPENSSL_FIPS
+
+#ifdef OPENSSL_DOING_MAKEDEPEND
+#undef SHA1_Init
+#undef SHA1_Update
+#undef SHA224_Init
+#undef SHA256_Init
+#undef SHA384_Init
+#undef SHA512_Init
+#undef DES_set_key_unchecked
+#endif
+
+#define RIPEMD160_Init private_RIPEMD160_Init
+#define WHIRLPOOL_Init private_WHIRLPOOL_Init
+#define MD5_Init private_MD5_Init
+#define MD4_Init private_MD4_Init
+#define MD2_Init private_MD2_Init
+#define MDC2_Init private_MDC2_Init
+#define SHA_Init private_SHA_Init
+#define SHA1_Init private_SHA1_Init
+#define SHA224_Init private_SHA224_Init
+#define SHA256_Init private_SHA256_Init
+#define SHA384_Init private_SHA384_Init
+#define SHA512_Init private_SHA512_Init
+
+#define BF_set_key private_BF_set_key
+#define CAST_set_key private_CAST_set_key
+#define idea_set_encrypt_key private_idea_set_encrypt_key
+#define SEED_set_key private_SEED_set_key
+#define RC2_set_key private_RC2_set_key
+#define RC4_set_key private_RC4_set_key
+#define DES_set_key_unchecked private_DES_set_key_unchecked
+#define Camellia_set_key private_Camellia_set_key
+
+#endif
diff --git a/main/openssl/crypto/evp/evp_pbe.c b/main/openssl/crypto/evp/evp_pbe.c
index c9d932d2..f8c32d82 100644
--- a/main/openssl/crypto/evp/evp_pbe.c
+++ b/main/openssl/crypto/evp/evp_pbe.c
@@ -61,6 +61,7 @@
#include <openssl/evp.h>
#include <openssl/pkcs12.h>
#include <openssl/x509.h>
+#include "evp_locl.h"
/* Password based encryption (PBE) functions */
@@ -87,6 +88,10 @@ static const EVP_PBE_CTL builtin_pbe[] =
{EVP_PBE_TYPE_OUTER, NID_pbeWithSHA1AndRC2_CBC,
NID_rc2_64_cbc, NID_sha1, PKCS5_PBE_keyivgen},
+#ifndef OPENSSL_NO_HMAC
+ {EVP_PBE_TYPE_OUTER, NID_id_pbkdf2, -1, -1, PKCS5_v2_PBKDF2_keyivgen},
+#endif
+
{EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And128BitRC4,
NID_rc4, NID_sha1, PKCS12_PBE_keyivgen},
{EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And40BitRC4,
diff --git a/main/openssl/crypto/evp/evptests.txt b/main/openssl/crypto/evp/evptests.txt
index beb12144..c273707c 100644
--- a/main/openssl/crypto/evp/evptests.txt
+++ b/main/openssl/crypto/evp/evptests.txt
@@ -158,6 +158,19 @@ AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:B7B
AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:E1C656305ED1A7A6563805746FE03EDC:30C81C46A35CE411E5FBC1191A0A52EF:71AB47A086E86EEDF39D1C5BBA97C408:0
AES-256-OFB:603DEB1015CA71BE2B73AEF0857D77811F352C073B6108D72D9810A30914DFF4:41635BE625B48AFC1666DD42A09D96E7:F69F2445DF4F9B17AD2B417BE66C3710:0126141D67F37BE8538F5A8BE740E484:0
+# AES Counter test vectors from RFC3686
+aes-128-ctr:AE6852F8121067CC4BF7A5765577F39E:00000030000000000000000000000001:53696E676C6520626C6F636B206D7367:E4095D4FB7A7B3792D6175A3261311B8:1
+aes-128-ctr:7E24067817FAE0D743D6CE1F32539163:006CB6DBC0543B59DA48D90B00000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:5104A106168A72D9790D41EE8EDAD388EB2E1EFC46DA57C8FCE630DF9141BE28:1
+aes-128-ctr:7691BE035E5020A8AC6E618529F9A0DC:00E0017B27777F3F4A1786F000000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:C1CF48A89F2FFDD9CF4652E9EFDB72D74540A42BDE6D7836D59A5CEAAEF3105325B2072F:1
+
+aes-192-ctr:16AF5B145FC9F579C175F93E3BFB0EED863D06CCFDB78515:0000004836733C147D6D93CB00000001:53696E676C6520626C6F636B206D7367:4B55384FE259C9C84E7935A003CBE928:1
+aes-192-ctr:7C5CB2401B3DC33C19E7340819E0F69C678C3DB8E6F6A91A:0096B03B020C6EADC2CB500D00000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:453243FC609B23327EDFAAFA7131CD9F8490701C5AD4A79CFC1FE0FF42F4FB00:1
+aes-192-ctr:02BF391EE8ECB159B959617B0965279BF59B60A786D3E0FE:0007BDFD5CBD60278DCC091200000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:96893FC55E5C722F540B7DD1DDF7E758D288BC95C69165884536C811662F2188ABEE0935:1
+
+aes-256-ctr:776BEFF2851DB06F4C8A0542C8696F6C6A81AF1EEC96B4D37FC1D689E6C1C104:00000060DB5672C97AA8F0B200000001:53696E676C6520626C6F636B206D7367:145AD01DBF824EC7560863DC71E3E0C0:1
+aes-256-ctr:F6D66D6BD52D59BB0796365879EFF886C66DD51A5B6A99744B50590C87A23884:00FAAC24C1585EF15A43D87500000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F:F05E231B3894612C49EE000B804EB2A9B8306B508F839D6A5530831D9344AF1C:1
+aes-256-ctr:FF7A617CE69148E4F1726E2F43581DE2AA62D9F805532EDFF1EED687FB54153D:001CC5B751A51D70A1C1114800000001:000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F20212223:EB6C52821D0BBBF7CE7594462ACA4FAAB407DF866569FD07F48CC0B583D6071F1EC0E6B8:1
+
# DES ECB tests (from destest)
DES-ECB:0000000000000000::0000000000000000:8CA64DE9C1B123A7
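
The new RFC 3686 vectors above can be replayed through the now-exported EVP_aes_128_ctr() (see the evp.h hunk). A self-contained sketch checking the first vector; the key, IV, plaintext and ciphertext bytes are copied from the vector line itself:

/* Sketch: verify the first aes-128-ctr vector with the EVP interface. */
#include <stdio.h>
#include <string.h>
#include <openssl/evp.h>

int main(void)
	{
	static const unsigned char key[16] = {
		0xAE,0x68,0x52,0xF8,0x12,0x10,0x67,0xCC,
		0x4B,0xF7,0xA5,0x76,0x55,0x77,0xF3,0x9E };
	static const unsigned char iv[16] = {
		0x00,0x00,0x00,0x30,0x00,0x00,0x00,0x00,
		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01 };
	static const unsigned char pt[16] = "Single block msg";
	static const unsigned char ct[16] = {
		0xE4,0x09,0x5D,0x4F,0xB7,0xA7,0xB3,0x79,
		0x2D,0x61,0x75,0xA3,0x26,0x13,0x11,0xB8 };
	unsigned char out[16];
	int len, ok;
	EVP_CIPHER_CTX ctx;
	EVP_CIPHER_CTX_init(&ctx);
	ok = EVP_EncryptInit_ex(&ctx, EVP_aes_128_ctr(), NULL, key, iv)
	     && EVP_EncryptUpdate(&ctx, out, &len, pt, sizeof(pt))
	     && len == 16 && memcmp(out, ct, 16) == 0;
	EVP_CIPHER_CTX_cleanup(&ctx);
	printf("aes-128-ctr RFC3686 vector: %s\n", ok ? "ok" : "FAILED");
	return ok ? 0 : 1;
	}
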
diff --git a/main/openssl/crypto/evp/m_dss.c b/main/openssl/crypto/evp/m_dss.c
index 48c26895..6fb7e9a8 100644
--- a/main/openssl/crypto/evp/m_dss.c
+++ b/main/openssl/crypto/evp/m_dss.c
@@ -60,12 +60,13 @@
#include "cryptlib.h"
#include <openssl/evp.h>
#include <openssl/objects.h>
-#include <openssl/x509.h>
+#include <openssl/sha.h>
#ifndef OPENSSL_NO_DSA
#include <openssl/dsa.h>
#endif
#ifndef OPENSSL_NO_SHA
+#ifndef OPENSSL_FIPS
static int init(EVP_MD_CTX *ctx)
{ return SHA1_Init(ctx->md_data); }
@@ -97,3 +98,4 @@ const EVP_MD *EVP_dss(void)
return(&dsa_md);
}
#endif
+#endif
diff --git a/main/openssl/crypto/evp/m_dss1.c b/main/openssl/crypto/evp/m_dss1.c
index 4f03fb70..2df362a6 100644
--- a/main/openssl/crypto/evp/m_dss1.c
+++ b/main/openssl/crypto/evp/m_dss1.c
@@ -63,11 +63,13 @@
#include <openssl/evp.h>
#include <openssl/objects.h>
-#include <openssl/x509.h>
+#include <openssl/sha.h>
#ifndef OPENSSL_NO_DSA
#include <openssl/dsa.h>
#endif
+#ifndef OPENSSL_FIPS
+
static int init(EVP_MD_CTX *ctx)
{ return SHA1_Init(ctx->md_data); }
@@ -98,3 +100,4 @@ const EVP_MD *EVP_dss1(void)
return(&dss1_md);
}
#endif
+#endif
diff --git a/main/openssl/crypto/evp/m_ecdsa.c b/main/openssl/crypto/evp/m_ecdsa.c
index 8d87a49e..4b15fb0f 100644
--- a/main/openssl/crypto/evp/m_ecdsa.c
+++ b/main/openssl/crypto/evp/m_ecdsa.c
@@ -116,6 +116,8 @@
#include <openssl/x509.h>
#ifndef OPENSSL_NO_SHA
+#ifndef OPENSSL_FIPS
+
static int init(EVP_MD_CTX *ctx)
{ return SHA1_Init(ctx->md_data); }
@@ -146,3 +148,4 @@ const EVP_MD *EVP_ecdsa(void)
return(&ecdsa_md);
}
#endif
+#endif
diff --git a/main/openssl/crypto/evp/m_md4.c b/main/openssl/crypto/evp/m_md4.c
index 1e0b7c5b..6d47f61b 100644
--- a/main/openssl/crypto/evp/m_md4.c
+++ b/main/openssl/crypto/evp/m_md4.c
@@ -69,6 +69,8 @@
#include <openssl/rsa.h>
#endif
+#include "evp_locl.h"
+
static int init(EVP_MD_CTX *ctx)
{ return MD4_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/m_md5.c b/main/openssl/crypto/evp/m_md5.c
index 63c14211..9a8bae02 100644
--- a/main/openssl/crypto/evp/m_md5.c
+++ b/main/openssl/crypto/evp/m_md5.c
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
{ return MD5_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/m_mdc2.c b/main/openssl/crypto/evp/m_mdc2.c
index b08d5598..3602bed3 100644
--- a/main/openssl/crypto/evp/m_mdc2.c
+++ b/main/openssl/crypto/evp/m_mdc2.c
@@ -69,6 +69,8 @@
#include <openssl/rsa.h>
#endif
+#include "evp_locl.h"
+
static int init(EVP_MD_CTX *ctx)
{ return MDC2_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/m_ripemd.c b/main/openssl/crypto/evp/m_ripemd.c
index a1d60ee7..7bf4804c 100644
--- a/main/openssl/crypto/evp/m_ripemd.c
+++ b/main/openssl/crypto/evp/m_ripemd.c
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
{ return RIPEMD160_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/m_sha1.c b/main/openssl/crypto/evp/m_sha1.c
index 9a2790fd..bd0c01ad 100644
--- a/main/openssl/crypto/evp/m_sha1.c
+++ b/main/openssl/crypto/evp/m_sha1.c
@@ -59,15 +59,18 @@
#include <stdio.h>
#include "cryptlib.h"
+#ifndef OPENSSL_FIPS
+
#ifndef OPENSSL_NO_SHA
#include <openssl/evp.h>
#include <openssl/objects.h>
-#include <openssl/x509.h>
+#include <openssl/sha.h>
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
#endif
+
static int init(EVP_MD_CTX *ctx)
{ return SHA1_Init(ctx->md_data); }
@@ -202,3 +205,5 @@ static const EVP_MD sha512_md=
const EVP_MD *EVP_sha512(void)
{ return(&sha512_md); }
#endif /* ifndef OPENSSL_NO_SHA512 */
+
+#endif
diff --git a/main/openssl/crypto/evp/m_wp.c b/main/openssl/crypto/evp/m_wp.c
index 1ce47c04..c51bc2d5 100644
--- a/main/openssl/crypto/evp/m_wp.c
+++ b/main/openssl/crypto/evp/m_wp.c
@@ -9,6 +9,7 @@
#include <openssl/objects.h>
#include <openssl/x509.h>
#include <openssl/whrlpool.h>
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
{ return WHIRLPOOL_Init(ctx->md_data); }
diff --git a/main/openssl/crypto/evp/names.c b/main/openssl/crypto/evp/names.c
index f2869f5c..6311ad7c 100644
--- a/main/openssl/crypto/evp/names.c
+++ b/main/openssl/crypto/evp/names.c
@@ -66,6 +66,10 @@ int EVP_add_cipher(const EVP_CIPHER *c)
{
int r;
+ if (c == NULL) return 0;
+
+ OPENSSL_init();
+
r=OBJ_NAME_add(OBJ_nid2sn(c->nid),OBJ_NAME_TYPE_CIPHER_METH,(const char *)c);
if (r == 0) return(0);
check_defer(c->nid);
@@ -78,6 +82,7 @@ int EVP_add_digest(const EVP_MD *md)
{
int r;
const char *name;
+ OPENSSL_init();
name=OBJ_nid2sn(md->type);
r=OBJ_NAME_add(name,OBJ_NAME_TYPE_MD_METH,(const char *)md);
diff --git a/main/openssl/crypto/evp/p5_crpt.c b/main/openssl/crypto/evp/p5_crpt.c
index 7ecfa8da..294cc90d 100644
--- a/main/openssl/crypto/evp/p5_crpt.c
+++ b/main/openssl/crypto/evp/p5_crpt.c
@@ -82,6 +82,8 @@ int PKCS5_PBE_keyivgen(EVP_CIPHER_CTX *cctx, const char *pass, int passlen,
unsigned char *salt;
const unsigned char *pbuf;
int mdsize;
+ int rv = 0;
+ EVP_MD_CTX_init(&ctx);
/* Extract useful info from parameter */
if (param == NULL || param->type != V_ASN1_SEQUENCE ||
@@ -104,29 +106,38 @@ int PKCS5_PBE_keyivgen(EVP_CIPHER_CTX *cctx, const char *pass, int passlen,
if(!pass) passlen = 0;
else if(passlen == -1) passlen = strlen(pass);
- EVP_MD_CTX_init(&ctx);
- EVP_DigestInit_ex(&ctx, md, NULL);
- EVP_DigestUpdate(&ctx, pass, passlen);
- EVP_DigestUpdate(&ctx, salt, saltlen);
+ if (!EVP_DigestInit_ex(&ctx, md, NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&ctx, pass, passlen))
+ goto err;
+ if (!EVP_DigestUpdate(&ctx, salt, saltlen))
+ goto err;
PBEPARAM_free(pbe);
- EVP_DigestFinal_ex(&ctx, md_tmp, NULL);
+ if (!EVP_DigestFinal_ex(&ctx, md_tmp, NULL))
+ goto err;
mdsize = EVP_MD_size(md);
if (mdsize < 0)
return 0;
for (i = 1; i < iter; i++) {
- EVP_DigestInit_ex(&ctx, md, NULL);
- EVP_DigestUpdate(&ctx, md_tmp, mdsize);
- EVP_DigestFinal_ex (&ctx, md_tmp, NULL);
+ if (!EVP_DigestInit_ex(&ctx, md, NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&ctx, md_tmp, mdsize))
+ goto err;
+ if (!EVP_DigestFinal_ex (&ctx, md_tmp, NULL))
+ goto err;
}
- EVP_MD_CTX_cleanup(&ctx);
OPENSSL_assert(EVP_CIPHER_key_length(cipher) <= (int)sizeof(md_tmp));
memcpy(key, md_tmp, EVP_CIPHER_key_length(cipher));
OPENSSL_assert(EVP_CIPHER_iv_length(cipher) <= 16);
memcpy(iv, md_tmp + (16 - EVP_CIPHER_iv_length(cipher)),
EVP_CIPHER_iv_length(cipher));
- EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de);
+ if (!EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de))
+ goto err;
OPENSSL_cleanse(md_tmp, EVP_MAX_MD_SIZE);
OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH);
OPENSSL_cleanse(iv, EVP_MAX_IV_LENGTH);
- return 1;
+ rv = 1;
+ err:
+ EVP_MD_CTX_cleanup(&ctx);
+ return rv;
}
diff --git a/main/openssl/crypto/evp/p5_crpt2.c b/main/openssl/crypto/evp/p5_crpt2.c
index 334379f3..fe3c6c88 100644
--- a/main/openssl/crypto/evp/p5_crpt2.c
+++ b/main/openssl/crypto/evp/p5_crpt2.c
@@ -62,6 +62,7 @@
#include <openssl/x509.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>
+#include "evp_locl.h"
/* set this to print out info about the keygen algorithm */
/* #define DEBUG_PKCS5V2 */
@@ -84,19 +85,24 @@ int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
unsigned char digtmp[EVP_MAX_MD_SIZE], *p, itmp[4];
int cplen, j, k, tkeylen, mdlen;
unsigned long i = 1;
- HMAC_CTX hctx;
+ HMAC_CTX hctx_tpl, hctx;
mdlen = EVP_MD_size(digest);
if (mdlen < 0)
return 0;
- HMAC_CTX_init(&hctx);
+ HMAC_CTX_init(&hctx_tpl);
p = out;
tkeylen = keylen;
if(!pass)
passlen = 0;
else if(passlen == -1)
passlen = strlen(pass);
+ if (!HMAC_Init_ex(&hctx_tpl, pass, passlen, digest, NULL))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ return 0;
+ }
while(tkeylen)
{
if(tkeylen > mdlen)
@@ -110,15 +116,36 @@ int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
itmp[1] = (unsigned char)((i >> 16) & 0xff);
itmp[2] = (unsigned char)((i >> 8) & 0xff);
itmp[3] = (unsigned char)(i & 0xff);
- HMAC_Init_ex(&hctx, pass, passlen, digest, NULL);
- HMAC_Update(&hctx, salt, saltlen);
- HMAC_Update(&hctx, itmp, 4);
- HMAC_Final(&hctx, digtmp, NULL);
+ if (!HMAC_CTX_copy(&hctx, &hctx_tpl))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ return 0;
+ }
+ if (!HMAC_Update(&hctx, salt, saltlen)
+ || !HMAC_Update(&hctx, itmp, 4)
+ || !HMAC_Final(&hctx, digtmp, NULL))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ HMAC_CTX_cleanup(&hctx);
+ return 0;
+ }
+ HMAC_CTX_cleanup(&hctx);
memcpy(p, digtmp, cplen);
for(j = 1; j < iter; j++)
{
- HMAC(digest, pass, passlen,
- digtmp, mdlen, digtmp, NULL);
+ if (!HMAC_CTX_copy(&hctx, &hctx_tpl))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ return 0;
+ }
+ if (!HMAC_Update(&hctx, digtmp, mdlen)
+ || !HMAC_Final(&hctx, digtmp, NULL))
+ {
+ HMAC_CTX_cleanup(&hctx_tpl);
+ HMAC_CTX_cleanup(&hctx);
+ return 0;
+ }
+ HMAC_CTX_cleanup(&hctx);
for(k = 0; k < cplen; k++)
p[k] ^= digtmp[k];
}
@@ -126,7 +153,7 @@ int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
i++;
p+= cplen;
}
- HMAC_CTX_cleanup(&hctx);
+ HMAC_CTX_cleanup(&hctx_tpl);
#ifdef DEBUG_PKCS5V2
fprintf(stderr, "Password:\n");
h__dump (pass, passlen);
@@ -168,27 +195,24 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
ASN1_TYPE *param, const EVP_CIPHER *c, const EVP_MD *md,
int en_de)
{
- unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
const unsigned char *pbuf;
- int saltlen, iter, plen;
- unsigned int keylen;
+ int plen;
PBE2PARAM *pbe2 = NULL;
const EVP_CIPHER *cipher;
- PBKDF2PARAM *kdf = NULL;
- const EVP_MD *prfmd;
- int prf_nid, hmac_md_nid;
+
+ int rv = 0;
if (param == NULL || param->type != V_ASN1_SEQUENCE ||
param->value.sequence == NULL) {
EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
- return 0;
+ goto err;
}
pbuf = param->value.sequence->data;
plen = param->value.sequence->length;
if(!(pbe2 = d2i_PBE2PARAM(NULL, &pbuf, plen))) {
EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
- return 0;
+ goto err;
}
/* See if we recognise the key derivation function */
@@ -211,38 +235,63 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
}
/* Fixup cipher based on AlgorithmIdentifier */
- EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de);
+ if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de))
+ goto err;
if(EVP_CIPHER_asn1_to_param(ctx, pbe2->encryption->parameter) < 0) {
EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
EVP_R_CIPHER_PARAMETER_ERROR);
goto err;
}
+ rv = PKCS5_v2_PBKDF2_keyivgen(ctx, pass, passlen,
+ pbe2->keyfunc->parameter, c, md, en_de);
+ err:
+ PBE2PARAM_free(pbe2);
+ return rv;
+}
+
+int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
+ ASN1_TYPE *param,
+ const EVP_CIPHER *c, const EVP_MD *md, int en_de)
+{
+ unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
+ const unsigned char *pbuf;
+ int saltlen, iter, plen;
+ int rv = 0;
+ unsigned int keylen = 0;
+ int prf_nid, hmac_md_nid;
+ PBKDF2PARAM *kdf = NULL;
+ const EVP_MD *prfmd;
+
+ if (EVP_CIPHER_CTX_cipher(ctx) == NULL)
+ {
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_NO_CIPHER_SET);
+ goto err;
+ }
keylen = EVP_CIPHER_CTX_key_length(ctx);
OPENSSL_assert(keylen <= sizeof key);
- /* Now decode key derivation function */
+ /* Decode parameter */
- if(!pbe2->keyfunc->parameter ||
- (pbe2->keyfunc->parameter->type != V_ASN1_SEQUENCE))
+ if(!param || (param->type != V_ASN1_SEQUENCE))
{
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR);
goto err;
}
- pbuf = pbe2->keyfunc->parameter->value.sequence->data;
- plen = pbe2->keyfunc->parameter->value.sequence->length;
+ pbuf = param->value.sequence->data;
+ plen = param->value.sequence->length;
+
if(!(kdf = d2i_PBKDF2PARAM(NULL, &pbuf, plen)) ) {
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR);
goto err;
}
- PBE2PARAM_free(pbe2);
- pbe2 = NULL;
+ keylen = EVP_CIPHER_CTX_key_length(ctx);
/* Now check the parameters of the kdf */
if(kdf->keylength && (ASN1_INTEGER_get(kdf->keylength) != (int)keylen)){
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,
EVP_R_UNSUPPORTED_KEYLENGTH);
goto err;
}
@@ -254,19 +303,19 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
if (!EVP_PBE_find(EVP_PBE_TYPE_PRF, prf_nid, NULL, &hmac_md_nid, 0))
{
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
goto err;
}
prfmd = EVP_get_digestbynid(hmac_md_nid);
if (prfmd == NULL)
{
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
goto err;
}
if(kdf->salt->type != V_ASN1_OCTET_STRING) {
- EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
+ EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,
EVP_R_UNSUPPORTED_SALT_TYPE);
goto err;
}
@@ -278,15 +327,11 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
if(!PKCS5_PBKDF2_HMAC(pass, passlen, salt, saltlen, iter, prfmd,
keylen, key))
goto err;
- EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
- OPENSSL_cleanse(key, keylen);
- PBKDF2PARAM_free(kdf);
- return 1;
-
+ rv = EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
err:
- PBE2PARAM_free(pbe2);
+ OPENSSL_cleanse(key, keylen);
PBKDF2PARAM_free(kdf);
- return 0;
+ return rv;
}
#ifdef DEBUG_PKCS5V2
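
The rewritten PKCS5_PBKDF2_HMAC() above keys an HMAC template once and clones it per block, instead of re-running the key schedule on every iteration. Applications that just need a PBKDF2 key can call it directly; a sketch with illustrative parameters (iteration count, PRF and key size are the caller's choice):

/* Sketch: direct PBKDF2 derivation via PKCS5_PBKDF2_HMAC. */
#include <openssl/evp.h>

static int pbkdf2_key(const char *pass,
		      const unsigned char *salt, int saltlen,
		      unsigned char key[32])
	{
	/* passlen of -1 means "use strlen(pass)", as in the code above. */
	return PKCS5_PBKDF2_HMAC(pass, -1, salt, saltlen, 10000,
				 EVP_sha256(), 32, key);
	}
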
diff --git a/main/openssl/crypto/evp/p_lib.c b/main/openssl/crypto/evp/p_lib.c
index e26ccd0d..bd1977d7 100644
--- a/main/openssl/crypto/evp/p_lib.c
+++ b/main/openssl/crypto/evp/p_lib.c
@@ -200,6 +200,12 @@ EVP_PKEY *EVP_PKEY_new(void)
return(ret);
}
+EVP_PKEY *EVP_PKEY_dup(EVP_PKEY *pkey)
+ {
+ CRYPTO_add(&pkey->references, 1, CRYPTO_LOCK_EVP_PKEY);
+ return pkey;
+ }
+
/* Setup a public key ASN1 method and ENGINE from a NID or a string.
* If pkey is NULL just return 1 or 0 if the algorithm exists.
*/
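
Note that the EVP_PKEY_dup() added above is a reference-count bump, not a deep copy: both pointers name the same object, and each reference must eventually be released with EVP_PKEY_free(). A short sketch of the ownership pattern:

/* Sketch: EVP_PKEY_dup aliases the key; free each reference once. */
static EVP_PKEY *share_key(EVP_PKEY *pkey)
	{
	EVP_PKEY *ref = EVP_PKEY_dup(pkey);	/* refcount + 1 */
	return ref;	/* the new owner calls EVP_PKEY_free(ref) */
	}
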
diff --git a/main/openssl/crypto/evp/p_open.c b/main/openssl/crypto/evp/p_open.c
index 53a59a29..c748fbea 100644
--- a/main/openssl/crypto/evp/p_open.c
+++ b/main/openssl/crypto/evp/p_open.c
@@ -115,7 +115,8 @@ int EVP_OpenFinal(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
int i;
i=EVP_DecryptFinal_ex(ctx,out,outl);
- EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL);
+ if (i)
+ i = EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL);
return(i);
}
#else /* !OPENSSL_NO_RSA */
diff --git a/main/openssl/crypto/evp/p_seal.c b/main/openssl/crypto/evp/p_seal.c
index d8324526..e5919b0f 100644
--- a/main/openssl/crypto/evp/p_seal.c
+++ b/main/openssl/crypto/evp/p_seal.c
@@ -110,6 +110,7 @@ int EVP_SealFinal(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
{
int i;
i = EVP_EncryptFinal_ex(ctx,out,outl);
- EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL);
+ if (i)
+ i = EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL);
return i;
}
diff --git a/main/openssl/crypto/evp/p_sign.c b/main/openssl/crypto/evp/p_sign.c
index bb893f5b..8afb6643 100644
--- a/main/openssl/crypto/evp/p_sign.c
+++ b/main/openssl/crypto/evp/p_sign.c
@@ -80,18 +80,20 @@ int EVP_SignFinal(EVP_MD_CTX *ctx, unsigned char *sigret, unsigned int *siglen,
{
unsigned char m[EVP_MAX_MD_SIZE];
unsigned int m_len;
- int i,ok=0,v;
+ int i = 0,ok = 0,v;
EVP_MD_CTX tmp_ctx;
+ EVP_PKEY_CTX *pkctx = NULL;
*siglen=0;
EVP_MD_CTX_init(&tmp_ctx);
- EVP_MD_CTX_copy_ex(&tmp_ctx,ctx);
- EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len);
+ if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx))
+ goto err;
+ if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len))
+ goto err;
EVP_MD_CTX_cleanup(&tmp_ctx);
if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
{
- EVP_PKEY_CTX *pkctx = NULL;
size_t sltmp = (size_t)EVP_PKEY_size(pkey);
i = 0;
pkctx = EVP_PKEY_CTX_new(pkey, NULL);
diff --git a/main/openssl/crypto/evp/p_verify.c b/main/openssl/crypto/evp/p_verify.c
index 41d4b671..c66d63cc 100644
--- a/main/openssl/crypto/evp/p_verify.c
+++ b/main/openssl/crypto/evp/p_verify.c
@@ -67,17 +67,19 @@ int EVP_VerifyFinal(EVP_MD_CTX *ctx, const unsigned char *sigbuf,
{
unsigned char m[EVP_MAX_MD_SIZE];
unsigned int m_len;
- int i,ok=0,v;
+ int i = 0,ok = 0,v;
EVP_MD_CTX tmp_ctx;
+ EVP_PKEY_CTX *pkctx = NULL;
EVP_MD_CTX_init(&tmp_ctx);
- EVP_MD_CTX_copy_ex(&tmp_ctx,ctx);
- EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len);
+ if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx))
+ goto err;
+ if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len))
+ goto err;
EVP_MD_CTX_cleanup(&tmp_ctx);
if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
{
- EVP_PKEY_CTX *pkctx = NULL;
i = -1;
pkctx = EVP_PKEY_CTX_new(pkey, NULL);
if (!pkctx)
diff --git a/main/openssl/crypto/evp/pmeth_gn.c b/main/openssl/crypto/evp/pmeth_gn.c
index 5d74161a..4651c813 100644
--- a/main/openssl/crypto/evp/pmeth_gn.c
+++ b/main/openssl/crypto/evp/pmeth_gn.c
@@ -199,7 +199,7 @@ int EVP_PKEY_CTX_get_keygen_info(EVP_PKEY_CTX *ctx, int idx)
}
EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
- unsigned char *key, int keylen)
+ const unsigned char *key, int keylen)
{
EVP_PKEY_CTX *mac_ctx = NULL;
EVP_PKEY *mac_key = NULL;
@@ -209,7 +209,8 @@ EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
if (EVP_PKEY_keygen_init(mac_ctx) <= 0)
goto merr;
if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN,
- EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0)
+ EVP_PKEY_CTRL_SET_MAC_KEY,
+ keylen, (void *)key) <= 0)
goto merr;
if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0)
goto merr;
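
With the key argument now const-qualified, EVP_PKEY_new_mac_key() can take read-only key material directly. A sketch of the common pairing with the EVP_DigestSign interface for HMAC (the digest choice is illustrative):

/* Sketch: HMAC-SHA256 through EVP_PKEY_new_mac_key + EVP_DigestSign. */
#include <openssl/evp.h>

static int hmac_sha256(const unsigned char *key, int keylen,
		       const unsigned char *msg, size_t msglen,
		       unsigned char *mac, size_t *maclen)
	{
	int ok = 0;
	EVP_MD_CTX ctx;
	EVP_PKEY *pkey = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, NULL,
					      key, keylen);
	EVP_MD_CTX_init(&ctx);
	if (pkey != NULL
	    && EVP_DigestSignInit(&ctx, NULL, EVP_sha256(), NULL, pkey) > 0
	    && EVP_DigestSignUpdate(&ctx, msg, msglen) > 0
	    && EVP_DigestSignFinal(&ctx, mac, maclen) > 0)
		ok = 1;
	EVP_MD_CTX_cleanup(&ctx);
	EVP_PKEY_free(pkey);
	return ok;
	}
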
diff --git a/main/openssl/crypto/evp/pmeth_lib.c b/main/openssl/crypto/evp/pmeth_lib.c
index 5481d4b8..acfa7b6f 100644
--- a/main/openssl/crypto/evp/pmeth_lib.c
+++ b/main/openssl/crypto/evp/pmeth_lib.c
@@ -73,7 +73,7 @@ DECLARE_STACK_OF(EVP_PKEY_METHOD)
STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL;
extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth;
-extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth;
+extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth;
static const EVP_PKEY_METHOD *standard_methods[] =
{
@@ -90,6 +90,7 @@ static const EVP_PKEY_METHOD *standard_methods[] =
&ec_pkey_meth,
#endif
&hmac_pkey_meth,
+ &cmac_pkey_meth
};
DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *,
@@ -203,6 +204,8 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags)
if (!pmeth)
return NULL;
+ memset(pmeth, 0, sizeof(EVP_PKEY_METHOD));
+
pmeth->pkey_id = id;
pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC;
@@ -235,6 +238,56 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags)
return pmeth;
}
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+ const EVP_PKEY_METHOD *meth)
+ {
+ if (ppkey_id)
+ *ppkey_id = meth->pkey_id;
+ if (pflags)
+ *pflags = meth->flags;
+ }
+
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src)
+ {
+
+ dst->init = src->init;
+ dst->copy = src->copy;
+ dst->cleanup = src->cleanup;
+
+ dst->paramgen_init = src->paramgen_init;
+ dst->paramgen = src->paramgen;
+
+ dst->keygen_init = src->keygen_init;
+ dst->keygen = src->keygen;
+
+ dst->sign_init = src->sign_init;
+ dst->sign = src->sign;
+
+ dst->verify_init = src->verify_init;
+ dst->verify = src->verify;
+
+ dst->verify_recover_init = src->verify_recover_init;
+ dst->verify_recover = src->verify_recover;
+
+ dst->signctx_init = src->signctx_init;
+ dst->signctx = src->signctx;
+
+ dst->verifyctx_init = src->verifyctx_init;
+ dst->verifyctx = src->verifyctx;
+
+ dst->encrypt_init = src->encrypt_init;
+ dst->encrypt = src->encrypt;
+
+ dst->decrypt_init = src->decrypt_init;
+ dst->decrypt = src->decrypt;
+
+ dst->derive_init = src->derive_init;
+ dst->derive = src->derive;
+
+ dst->ctrl = src->ctrl;
+ dst->ctrl_str = src->ctrl_str;
+ }
+
void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth)
{
if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC))
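
The new EVP_PKEY_meth_copy() makes it straightforward to start a custom method from a built-in one and override only selected callbacks. A hedged sketch (error handling abbreviated; the overridden callback is left as a comment):

/* Sketch: clone the built-in RSA method, then patch callbacks. */
#include <openssl/evp.h>

static EVP_PKEY_METHOD *clone_rsa_meth(void)
	{
	const EVP_PKEY_METHOD *builtin = EVP_PKEY_meth_find(EVP_PKEY_RSA);
	EVP_PKEY_METHOD *mine = EVP_PKEY_meth_new(EVP_PKEY_RSA, 0);
	if (builtin == NULL || mine == NULL)
		return NULL;
	EVP_PKEY_meth_copy(mine, builtin);
	/* ... EVP_PKEY_meth_set_sign(mine, my_sign_init, my_sign); ... */
	return mine;	/* register with EVP_PKEY_meth_add0() */
	}
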
diff --git a/main/openssl/crypto/hmac/hm_ameth.c b/main/openssl/crypto/hmac/hm_ameth.c
index 6d8a8914..e03f24ae 100644
--- a/main/openssl/crypto/hmac/hm_ameth.c
+++ b/main/openssl/crypto/hmac/hm_ameth.c
@@ -153,7 +153,7 @@ const EVP_PKEY_ASN1_METHOD hmac_asn1_meth =
hmac_size,
0,
- 0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,
hmac_key_free,
hmac_pkey_ctrl,
diff --git a/main/openssl/crypto/hmac/hm_pmeth.c b/main/openssl/crypto/hmac/hm_pmeth.c
index 71e8567a..0daa4451 100644
--- a/main/openssl/crypto/hmac/hm_pmeth.c
+++ b/main/openssl/crypto/hmac/hm_pmeth.c
@@ -100,7 +100,8 @@ static int pkey_hmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
dctx = dst->data;
dctx->md = sctx->md;
HMAC_CTX_init(&dctx->ctx);
- HMAC_CTX_copy(&dctx->ctx, &sctx->ctx);
+ if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx))
+ return 0;
if (sctx->ktmp.data)
{
if (!ASN1_OCTET_STRING_set(&dctx->ktmp,
@@ -141,7 +142,8 @@ static int pkey_hmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
{
HMAC_PKEY_CTX *hctx = ctx->pctx->data;
- HMAC_Update(&hctx->ctx, data, count);
+ if (!HMAC_Update(&hctx->ctx, data, count))
+ return 0;
return 1;
}
@@ -167,7 +169,8 @@ static int hmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
if (!sig)
return 1;
- HMAC_Final(&hctx->ctx, sig, &hlen);
+ if (!HMAC_Final(&hctx->ctx, sig, &hlen))
+ return 0;
*siglen = (size_t)hlen;
return 1;
}
@@ -192,8 +195,9 @@ static int pkey_hmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
case EVP_PKEY_CTRL_DIGESTINIT:
key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr;
- HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md,
- ctx->engine);
+ if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md,
+ ctx->engine))
+ return 0;
break;
default:
diff --git a/main/openssl/crypto/hmac/hmac.c b/main/openssl/crypto/hmac/hmac.c
index 6c98fc43..ba27cbf5 100644
--- a/main/openssl/crypto/hmac/hmac.c
+++ b/main/openssl/crypto/hmac/hmac.c
@@ -61,12 +61,34 @@
#include "cryptlib.h"
#include <openssl/hmac.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, int len,
const EVP_MD *md, ENGINE *impl)
{
int i,j,reset=0;
unsigned char pad[HMAC_MAX_MD_CBLOCK];
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ {
+ /* If we have an ENGINE need to allow non FIPS */
+ if ((impl || ctx->i_ctx.engine)
+ && !(ctx->i_ctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+ {
+ EVPerr(EVP_F_HMAC_INIT_EX, EVP_R_DISABLED_FOR_FIPS);
+ return 0;
+ }
+ /* Other algorithm blocking will be done in FIPS_cmac_init,
+ * via FIPS_hmac_init_ex().
+ */
+ if (!impl && !ctx->i_ctx.engine)
+ return FIPS_hmac_init_ex(ctx, key, len, md, NULL);
+ }
+#endif
+
if (md != NULL)
{
reset=1;
@@ -133,6 +155,10 @@ int HMAC_Init(HMAC_CTX *ctx, const void *key, int len, const EVP_MD *md)
int HMAC_Update(HMAC_CTX *ctx, const unsigned char *data, size_t len)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->i_ctx.engine)
+ return FIPS_hmac_update(ctx, data, len);
+#endif
return EVP_DigestUpdate(&ctx->md_ctx,data,len);
}
@@ -140,6 +166,10 @@ int HMAC_Final(HMAC_CTX *ctx, unsigned char *md, unsigned int *len)
{
unsigned int i;
unsigned char buf[EVP_MAX_MD_SIZE];
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->i_ctx.engine)
+ return FIPS_hmac_final(ctx, md, len);
+#endif
if (!EVP_DigestFinal_ex(&ctx->md_ctx,buf,&i))
goto err;
@@ -179,6 +209,13 @@ int HMAC_CTX_copy(HMAC_CTX *dctx, HMAC_CTX *sctx)
void HMAC_CTX_cleanup(HMAC_CTX *ctx)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !ctx->i_ctx.engine)
+ {
+ FIPS_hmac_ctx_cleanup(ctx);
+ return;
+ }
+#endif
EVP_MD_CTX_cleanup(&ctx->i_ctx);
EVP_MD_CTX_cleanup(&ctx->o_ctx);
EVP_MD_CTX_cleanup(&ctx->md_ctx);
diff --git a/main/openssl/crypto/ia64cpuid.S b/main/openssl/crypto/ia64cpuid.S
index d705fff7..7832b9b6 100644
--- a/main/openssl/crypto/ia64cpuid.S
+++ b/main/openssl/crypto/ia64cpuid.S
@@ -26,7 +26,7 @@ OPENSSL_atomic_add:
{ .mii; mov ar.ccv=r2
add r8=r2,r33
mov r3=r2 };;
-{ .mmi; mf
+{ .mmi; mf;;
cmpxchg4.acq r2=[r32],r8,ar.ccv
nop.i 0 };;
{ .mib; cmp.ne p6,p0=r2,r3
diff --git a/main/openssl/crypto/md4/md4.h b/main/openssl/crypto/md4/md4.h
index c3ed9b3f..a55368a7 100644
--- a/main/openssl/crypto/md4/md4.h
+++ b/main/openssl/crypto/md4/md4.h
@@ -105,6 +105,9 @@ typedef struct MD4state_st
unsigned int num;
} MD4_CTX;
+#ifdef OPENSSL_FIPS
+int private_MD4_Init(MD4_CTX *c);
+#endif
int MD4_Init(MD4_CTX *c);
int MD4_Update(MD4_CTX *c, const void *data, size_t len);
int MD4_Final(unsigned char *md, MD4_CTX *c);
diff --git a/main/openssl/crypto/md4/md4_dgst.c b/main/openssl/crypto/md4/md4_dgst.c
index e0c42e85..b5b165b0 100644
--- a/main/openssl/crypto/md4/md4_dgst.c
+++ b/main/openssl/crypto/md4/md4_dgst.c
@@ -57,8 +57,9 @@
*/
#include <stdio.h>
-#include "md4_locl.h"
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
+#include "md4_locl.h"
const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT;
@@ -70,7 +71,7 @@ const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT;
#define INIT_DATA_C (unsigned long)0x98badcfeL
#define INIT_DATA_D (unsigned long)0x10325476L
-int MD4_Init(MD4_CTX *c)
+fips_md_init(MD4)
{
memset (c,0,sizeof(*c));
c->A=INIT_DATA_A;
@@ -105,22 +106,23 @@ void md4_block_data_order (MD4_CTX *c, const void *data_, size_t num)
for (;num--;)
{
- HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
+ (void)HOST_c2l(data,l); X( 0)=l;
+ (void)HOST_c2l(data,l); X( 1)=l;
/* Round 0 */
- R0(A,B,C,D,X( 0), 3,0); HOST_c2l(data,l); X( 2)=l;
- R0(D,A,B,C,X( 1), 7,0); HOST_c2l(data,l); X( 3)=l;
- R0(C,D,A,B,X( 2),11,0); HOST_c2l(data,l); X( 4)=l;
- R0(B,C,D,A,X( 3),19,0); HOST_c2l(data,l); X( 5)=l;
- R0(A,B,C,D,X( 4), 3,0); HOST_c2l(data,l); X( 6)=l;
- R0(D,A,B,C,X( 5), 7,0); HOST_c2l(data,l); X( 7)=l;
- R0(C,D,A,B,X( 6),11,0); HOST_c2l(data,l); X( 8)=l;
- R0(B,C,D,A,X( 7),19,0); HOST_c2l(data,l); X( 9)=l;
- R0(A,B,C,D,X( 8), 3,0); HOST_c2l(data,l); X(10)=l;
- R0(D,A,B,C,X( 9), 7,0); HOST_c2l(data,l); X(11)=l;
- R0(C,D,A,B,X(10),11,0); HOST_c2l(data,l); X(12)=l;
- R0(B,C,D,A,X(11),19,0); HOST_c2l(data,l); X(13)=l;
- R0(A,B,C,D,X(12), 3,0); HOST_c2l(data,l); X(14)=l;
- R0(D,A,B,C,X(13), 7,0); HOST_c2l(data,l); X(15)=l;
+ R0(A,B,C,D,X( 0), 3,0); (void)HOST_c2l(data,l); X( 2)=l;
+ R0(D,A,B,C,X( 1), 7,0); (void)HOST_c2l(data,l); X( 3)=l;
+ R0(C,D,A,B,X( 2),11,0); (void)HOST_c2l(data,l); X( 4)=l;
+ R0(B,C,D,A,X( 3),19,0); (void)HOST_c2l(data,l); X( 5)=l;
+ R0(A,B,C,D,X( 4), 3,0); (void)HOST_c2l(data,l); X( 6)=l;
+ R0(D,A,B,C,X( 5), 7,0); (void)HOST_c2l(data,l); X( 7)=l;
+ R0(C,D,A,B,X( 6),11,0); (void)HOST_c2l(data,l); X( 8)=l;
+ R0(B,C,D,A,X( 7),19,0); (void)HOST_c2l(data,l); X( 9)=l;
+ R0(A,B,C,D,X( 8), 3,0); (void)HOST_c2l(data,l); X(10)=l;
+ R0(D,A,B,C,X( 9), 7,0); (void)HOST_c2l(data,l); X(11)=l;
+ R0(C,D,A,B,X(10),11,0); (void)HOST_c2l(data,l); X(12)=l;
+ R0(B,C,D,A,X(11),19,0); (void)HOST_c2l(data,l); X(13)=l;
+ R0(A,B,C,D,X(12), 3,0); (void)HOST_c2l(data,l); X(14)=l;
+ R0(D,A,B,C,X(13), 7,0); (void)HOST_c2l(data,l); X(15)=l;
R0(C,D,A,B,X(14),11,0);
R0(B,C,D,A,X(15),19,0);
/* Round 1 */
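
The hunk above swaps the open-coded MD4_Init for the fips_md_init() macro, which pairs with the private_MD4_Init prototype added to md4.h. From memory of the 1.0.1 tree (crypto/crypto.h) the macro expands roughly as below; treat this as an approximation and check the header itself:

/* Approximate definition of fips_md_init (quoted from memory). */
#ifdef OPENSSL_FIPS
#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
#define fips_md_init_ctx(alg, cx) \
	int alg##_Init(cx##_CTX *c) \
	{ \
	if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
		"Low level API call to digest " #alg " forbidden in FIPS mode!"); \
	return private_##alg##_Init(c); \
	} \
	int private_##alg##_Init(cx##_CTX *c)
#else
#define fips_md_init(alg) int alg##_Init(alg##_CTX *c)
#endif
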
diff --git a/main/openssl/crypto/md4/md4_locl.h b/main/openssl/crypto/md4/md4_locl.h
index c8085b0e..99c3e500 100644
--- a/main/openssl/crypto/md4/md4_locl.h
+++ b/main/openssl/crypto/md4/md4_locl.h
@@ -77,10 +77,10 @@ void md4_block_data_order (MD4_CTX *c, const void *p,size_t num);
#define HASH_FINAL MD4_Final
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
- ll=(c)->A; HOST_l2c(ll,(s)); \
- ll=(c)->B; HOST_l2c(ll,(s)); \
- ll=(c)->C; HOST_l2c(ll,(s)); \
- ll=(c)->D; HOST_l2c(ll,(s)); \
+ ll=(c)->A; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->B; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->C; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->D; (void)HOST_l2c(ll,(s)); \
} while (0)
#define HASH_BLOCK_DATA_ORDER md4_block_data_order
diff --git a/main/openssl/crypto/md5/asm/md5-586.S b/main/openssl/crypto/md5/asm/md5-586.S
new file mode 100644
index 00000000..23e4de7d
--- /dev/null
+++ b/main/openssl/crypto/md5/asm/md5-586.S
@@ -0,0 +1,679 @@
+.file "crypto/md5/asm/md5-586.s"
+.text
+.globl md5_block_asm_data_order
+.type md5_block_asm_data_order,@function
+.align 16
+md5_block_asm_data_order:
+.L_md5_block_asm_data_order_begin:
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%edi
+ movl 16(%esp),%esi
+ movl 20(%esp),%ecx
+ pushl %ebp
+ shll $6,%ecx
+ pushl %ebx
+ addl %esi,%ecx
+ subl $64,%ecx
+ movl (%edi),%eax
+ pushl %ecx
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+.L000start:
+
+
+ movl %ecx,%edi
+ movl (%esi),%ebp
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 3614090360(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 4(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 3905402710(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 8(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 606105819(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 12(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 3250441966(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 16(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 4118548399(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 20(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 1200080426(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 24(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2821735955(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 28(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 4249261313(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 32(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1770035416(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 36(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 2336552879(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 40(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 4294925233(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 44(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 2304563134(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 48(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1804603682(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 52(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 4254626195(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 56(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2792965006(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 60(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 1236535329(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 4(%esi),%ebp
+ addl %ecx,%ebx
+
+
+
+ leal 4129170786(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 24(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 3225465664(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 44(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 643717713(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl (%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 3921069994(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 3593408605(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 40(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 38016083(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 60(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 3634488961(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 16(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 3889429448(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 36(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 568446438(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 56(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 3275163606(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 12(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 4107603335(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 32(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 1163531501(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 52(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 2850285829(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 8(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 4243563512(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 28(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 1735328473(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 48(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 2368359562(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 4294588738(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 32(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 2272392833(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 1839030562(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 56(%esi),%ebp
+ movl %edx,%edi
+
+ leal 4259657740(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 2763975236(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 16(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 1272893353(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 4139469664(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 40(%esi),%ebp
+ movl %edx,%edi
+
+ leal 3200236656(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 681279174(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl (%esi),%ebp
+ movl %ebx,%edi
+
+ leal 3936430074(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 3572445317(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 24(%esi),%ebp
+ movl %edx,%edi
+
+ leal 76029189(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 3654602809(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 48(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 3873151461(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 530742520(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 8(%esi),%ebp
+ movl %edx,%edi
+
+ leal 3299628645(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl (%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+
+
+ xorl %edx,%edi
+ orl %ebx,%edi
+ leal 4096336452(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 1126891415(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 56(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 2878612391(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 20(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 4237533241(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 48(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 1700485571(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 2399980690(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 40(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 4293915773(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 2240044497(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 32(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 1873313359(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 4264355552(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 24(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 2734768916(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 1309151649(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 16(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 4149444226(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 3174756917(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 8(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 718787259(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 3951481745(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 24(%esp),%ebp
+ addl %edi,%ebx
+ addl $64,%esi
+ roll $21,%ebx
+ movl (%ebp),%edi
+ addl %ecx,%ebx
+ addl %edi,%eax
+ movl 4(%ebp),%edi
+ addl %edi,%ebx
+ movl 8(%ebp),%edi
+ addl %edi,%ecx
+ movl 12(%ebp),%edi
+ addl %edi,%edx
+ movl %eax,(%ebp)
+ movl %ebx,4(%ebp)
+ movl (%esp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ cmpl %esi,%edi
+ jae .L000start
+ popl %eax
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin
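Each unrolled step of the first sixteen rounds above follows one fixed pattern: xorl/andl/xorl evaluates the round function F, leal folds in the message word and the additive constant (for example, 3614090360 is 0xd76aa478, MD5's T[1]), roll rotates, and the final addl adds b. A hedged C equivalent of one such step (a sketch of the standard MD5 round structure, not OpenSSL's actual macros):

    #include <stdint.h>

    /* F(x,y,z) = (x & y) | (~x & z), computed as z ^ (x & (y ^ z)) with
     * one AND and two XORs; this is the xorl/andl/xorl trick used above. */
    static uint32_t md5_step_r0(uint32_t a, uint32_t b, uint32_t c,
                                uint32_t d, uint32_t x, uint32_t t, int s)
    {
        a += (d ^ (b & (c ^ d))) + x + t;        /* a += F(b,c,d) + X[k] + T[i] */
        return ((a << s) | (a >> (32 - s))) + b; /* rotate left by s, add b */
    }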
diff --git a/main/openssl/crypto/md5/asm/md5-x86_64.S b/main/openssl/crypto/md5/asm/md5-x86_64.S
new file mode 100644
index 00000000..235d5e4e
--- /dev/null
+++ b/main/openssl/crypto/md5/asm/md5-x86_64.S
@@ -0,0 +1,668 @@
+.text
+.align 16
+
+.globl md5_block_asm_data_order
+.type md5_block_asm_data_order,@function
+md5_block_asm_data_order:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r14
+ pushq %r15
+.Lprologue:
+
+
+
+
+ movq %rdi,%rbp
+ shlq $6,%rdx
+ leaq (%rsi,%rdx,1),%rdi
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+
+
+
+
+
+
+ cmpq %rdi,%rsi
+ je .Lend
+
+
+.Lloop:
+ movl %eax,%r8d
+ movl %ebx,%r9d
+ movl %ecx,%r14d
+ movl %edx,%r15d
+ movl 0(%rsi),%r10d
+ movl %edx,%r11d
+ xorl %ecx,%r11d
+ leal -680876936(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 4(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -389564586(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal 606105819(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1044525330(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 16(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal -176418897(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 20(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal 1200080426(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1473231341(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -45705983(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 32(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1770035416(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 36(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -1958414417(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -42063(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1990404162(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 48(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1804603682(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 52(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -40341101(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1502002290(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal 1236535329(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 0(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ movl 4(%rsi),%r10d
+ movl %edx,%r11d
+ movl %edx,%r12d
+ notl %r11d
+ leal -165796510(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1069501632(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 643717713(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -373897302(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 20(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -701558691(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal 38016083(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -660478335(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 16(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -405537848(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 36(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal 568446438(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1019803690(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -187363961(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 32(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal 1163531501(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 52(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -1444681467(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -51403784(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 1735328473(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 48(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -1926607734(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ movl 20(%rsi),%r10d
+ movl %ecx,%r11d
+ leal -378558(%rax,%r10,1),%eax
+ movl 32(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -2022574463(%rdx,%r10,1),%edx
+ movl 44(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 1839030562(%rcx,%r10,1),%ecx
+ movl 56(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -35309556(%rbx,%r10,1),%ebx
+ movl 4(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -1530992060(%rax,%r10,1),%eax
+ movl 16(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal 1272893353(%rdx,%r10,1),%edx
+ movl 28(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -155497632(%rcx,%r10,1),%ecx
+ movl 40(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -1094730640(%rbx,%r10,1),%ebx
+ movl 52(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal 681279174(%rax,%r10,1),%eax
+ movl 0(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -358537222(%rdx,%r10,1),%edx
+ movl 12(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -722521979(%rcx,%r10,1),%ecx
+ movl 24(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal 76029189(%rbx,%r10,1),%ebx
+ movl 36(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -640364487(%rax,%r10,1),%eax
+ movl 48(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -421815835(%rdx,%r10,1),%edx
+ movl 60(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 530742520(%rcx,%r10,1),%ecx
+ movl 8(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -995338651(%rbx,%r10,1),%ebx
+ movl 0(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ movl 0(%rsi),%r10d
+ movl $4294967295,%r11d
+ xorl %edx,%r11d
+ leal -198630844(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 28(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal 1126891415(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 56(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1416354905(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 20(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -57434055(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 48(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1700485571(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 12(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1894986606(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 40(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1051523(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 4(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -2054922799(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 32(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1873313359(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 60(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -30611744(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 24(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1560198380(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 52(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal 1309151649(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 16(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal -145523070(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 44(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1120210379(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 8(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal 718787259(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 36(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -343485551(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 0(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+
+ addl %r8d,%eax
+ addl %r9d,%ebx
+ addl %r14d,%ecx
+ addl %r15d,%edx
+
+
+ addq $64,%rsi
+ cmpq %rdi,%rsi
+ jb .Lloop
+
+
+.Lend:
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ movq (%rsp),%r15
+ movq 8(%rsp),%r14
+ movq 16(%rsp),%r12
+ movq 24(%rsp),%rbx
+ movq 32(%rsp),%rbp
+ addq $40,%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size md5_block_asm_data_order,.-md5_block_asm_data_order
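The later rounds of this x86_64 version show two more register tricks: round 2 builds G with notl/andl/andl/orl across two scratch registers, and round 4 materializes ~z for I by XORing z into a register preloaded with $4294967295 (all ones). A brief C sketch of the two round functions being computed (standard MD5 definitions, shown for orientation):

    #include <stdint.h>

    /* Round 2: G(x,y,z) = (x & z) | (y & ~z); notl gives ~z, then the
     * two andl results are combined with orl. */
    static uint32_t md5_G(uint32_t x, uint32_t y, uint32_t z)
    {
        return (x & z) | (y & ~z);
    }

    /* Round 4: I(x,y,z) = y ^ (x | ~z); ~z comes from XORing z into an
     * all-ones register. */
    static uint32_t md5_I(uint32_t x, uint32_t y, uint32_t z)
    {
        return y ^ (x | ~z);
    }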
diff --git a/main/openssl/crypto/md5/asm/md5-x86_64.pl b/main/openssl/crypto/md5/asm/md5-x86_64.pl
index 86788543..f11224d1 100755
--- a/main/openssl/crypto/md5/asm/md5-x86_64.pl
+++ b/main/openssl/crypto/md5/asm/md5-x86_64.pl
@@ -120,7 +120,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
die "can't locate x86_64-xlate.pl";
no warnings qw(uninitialized);
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$code .= <<EOF;
.text
diff --git a/main/openssl/crypto/md5/md5.h b/main/openssl/crypto/md5/md5.h
index 4cbf8438..541cc925 100644
--- a/main/openssl/crypto/md5/md5.h
+++ b/main/openssl/crypto/md5/md5.h
@@ -105,6 +105,9 @@ typedef struct MD5state_st
unsigned int num;
} MD5_CTX;
+#ifdef OPENSSL_FIPS
+int private_MD5_Init(MD5_CTX *c);
+#endif
int MD5_Init(MD5_CTX *c);
int MD5_Update(MD5_CTX *c, const void *data, size_t len);
int MD5_Final(unsigned char *md, MD5_CTX *c);
diff --git a/main/openssl/crypto/md5/md5_dgst.c b/main/openssl/crypto/md5/md5_dgst.c
index beace632..265890de 100644
--- a/main/openssl/crypto/md5/md5_dgst.c
+++ b/main/openssl/crypto/md5/md5_dgst.c
@@ -59,6 +59,7 @@
#include <stdio.h>
#include "md5_locl.h"
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT;
@@ -70,7 +71,7 @@ const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT;
#define INIT_DATA_C (unsigned long)0x98badcfeL
#define INIT_DATA_D (unsigned long)0x10325476L
-int MD5_Init(MD5_CTX *c)
+fips_md_init(MD5)
{
memset (c,0,sizeof(*c));
c->A=INIT_DATA_A;
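The fips_md_init(MD5) line above is a macro from <openssl/crypto.h> (hence the new include) that emits the function header. Under OPENSSL_FIPS it also renames the real body to private_MD5_Init and generates a public wrapper that refuses low-level use in FIPS mode, which is why md5.h gains the private_MD5_Init prototype. A simplified sketch of what the FIPS expansion looks like (paraphrased, not the exact macro text):

    int MD5_Init(MD5_CTX *c)
    {
        if (FIPS_mode())
            OpenSSLDie(__FILE__, __LINE__,
                       "low-level digest call forbidden in FIPS mode");
        return private_MD5_Init(c);
    }
    int private_MD5_Init(MD5_CTX *c)
    /* ...the original body follows here... */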
diff --git a/main/openssl/crypto/md5/md5_locl.h b/main/openssl/crypto/md5/md5_locl.h
index 968d5779..74d63d1f 100644
--- a/main/openssl/crypto/md5/md5_locl.h
+++ b/main/openssl/crypto/md5/md5_locl.h
@@ -86,10 +86,10 @@ void md5_block_data_order (MD5_CTX *c, const void *p,size_t num);
#define HASH_FINAL MD5_Final
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
- ll=(c)->A; HOST_l2c(ll,(s)); \
- ll=(c)->B; HOST_l2c(ll,(s)); \
- ll=(c)->C; HOST_l2c(ll,(s)); \
- ll=(c)->D; HOST_l2c(ll,(s)); \
+ ll=(c)->A; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->B; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->C; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->D; (void)HOST_l2c(ll,(s)); \
} while (0)
#define HASH_BLOCK_DATA_ORDER md5_block_data_order
diff --git a/main/openssl/crypto/mdc2/mdc2.h b/main/openssl/crypto/mdc2/mdc2.h
index 72778a52..f3e8e579 100644
--- a/main/openssl/crypto/mdc2/mdc2.h
+++ b/main/openssl/crypto/mdc2/mdc2.h
@@ -81,6 +81,9 @@ typedef struct mdc2_ctx_st
} MDC2_CTX;
+#ifdef OPENSSL_FIPS
+int private_MDC2_Init(MDC2_CTX *c);
+#endif
int MDC2_Init(MDC2_CTX *c);
int MDC2_Update(MDC2_CTX *c, const unsigned char *data, size_t len);
int MDC2_Final(unsigned char *md, MDC2_CTX *c);
diff --git a/main/openssl/crypto/mdc2/mdc2dgst.c b/main/openssl/crypto/mdc2/mdc2dgst.c
index 4aa406ed..d66ed6a1 100644
--- a/main/openssl/crypto/mdc2/mdc2dgst.c
+++ b/main/openssl/crypto/mdc2/mdc2dgst.c
@@ -59,6 +59,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <openssl/crypto.h>
#include <openssl/des.h>
#include <openssl/mdc2.h>
@@ -75,7 +76,7 @@
*((c)++)=(unsigned char)(((l)>>24L)&0xff))
static void mdc2_body(MDC2_CTX *c, const unsigned char *in, size_t len);
-int MDC2_Init(MDC2_CTX *c)
+fips_md_init(MDC2)
{
c->num=0;
c->pad_type=1;
diff --git a/main/openssl/crypto/mem.c b/main/openssl/crypto/mem.c
index 6f80dd33..1cc62eaf 100644
--- a/main/openssl/crypto/mem.c
+++ b/main/openssl/crypto/mem.c
@@ -121,10 +121,11 @@ static void (*set_debug_options_func)(long) = NULL;
static long (*get_debug_options_func)(void) = NULL;
#endif
-
int CRYPTO_set_mem_functions(void *(*m)(size_t), void *(*r)(void *, size_t),
void (*f)(void *))
{
+ /* Dummy call just to ensure OPENSSL_init() gets linked in */
+ OPENSSL_init();
if (!allow_customize)
return 0;
if ((m == 0) || (r == 0) || (f == 0))
@@ -186,6 +187,7 @@ int CRYPTO_set_mem_debug_functions(void (*m)(void *,int,const char *,int,int),
{
if (!allow_customize_debug)
return 0;
+ OPENSSL_init();
malloc_debug_func=m;
realloc_debug_func=r;
free_debug_func=f;
@@ -361,6 +363,10 @@ void *CRYPTO_realloc_clean(void *str, int old_len, int num, const char *file,
if (num <= 0) return NULL;
+ /* We don't support shrinking the buffer. Note the memcpy that copies
+ * |old_len| bytes to the new buffer, below. */
+ if (num < old_len) return NULL;
+
if (realloc_debug_func != NULL)
realloc_debug_func(str, NULL, num, file, line, 0);
ret=malloc_ex_func(num,file,line);
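The new num < old_len check above is load-bearing: CRYPTO_realloc_clean always copies |old_len| bytes into the fresh allocation before scrubbing and freeing the old one, so a shrinking request would make that memcpy overflow the new buffer. A minimal sketch of the shape of the function and the bug the guard prevents (simplified; plain malloc/free stand in for OpenSSL's malloc_ex_func and free_func):

    #include <stdlib.h>
    #include <string.h>

    void *realloc_clean_sketch(void *str, int old_len, int num)
    {
        void *ret;
        if (num <= 0)
            return NULL;
        if (num < old_len)             /* shrinking unsupported: the      */
            return NULL;               /* memcpy below copies old_len bytes */
        ret = malloc((size_t)num);
        if (ret != NULL) {
            memcpy(ret, str, (size_t)old_len);  /* safe: old_len <= num */
            memset(str, 0, (size_t)old_len);    /* scrub old contents */
            free(str);
        }
        return ret;
    }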
diff --git a/main/openssl/crypto/modes/asm/ghash-alpha.pl b/main/openssl/crypto/modes/asm/ghash-alpha.pl
new file mode 100644
index 00000000..aa360293
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-alpha.pl
@@ -0,0 +1,460 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% better than code generated
+# by the vendor compiler.
+
+$cnt="v0"; # $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3"; # $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7"; # $8
+#################
+$Xi="a0"; # $16, input argument block
+$Htbl="a1";
+$inp="a2";
+$len="a3";
+$nlo="a4"; # $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10"; # $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT"; # $28
+
+{ my $N;
+ sub loop() {
+
+ $N++;
+$code.=<<___;
+.align 4
+ extbl $Xlo,7,$nlo
+ and $nlo,0xf0,$nhi
+ sll $nlo,4,$nlo
+ and $nlo,0xf0,$nlo
+
+ addq $nlo,$Htbl,$nlo
+ ldq $Zlo,8($nlo)
+ addq $nhi,$Htbl,$nhi
+ ldq $Zhi,0($nlo)
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ lda $cnt,6(zero)
+ extbl $Xlo,6,$nlo
+
+ ldq $Tlo1,8($nhi)
+ s8addq $remp,$rem_4bit,$remp
+ ldq $Thi1,0($nhi)
+ srl $Zlo,4,$Zlo
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ and $nlo,0xf0,$nhi
+
+ xor $Tlo1,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ xor $Thi1,$Zhi,$Zhi
+ and $nlo,0xf0,$nlo
+
+ addq $nlo,$Htbl,$nlo
+ ldq $Tlo0,8($nlo)
+ addq $nhi,$Htbl,$nhi
+ ldq $Thi0,0($nlo)
+
+.Looplo$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subq $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xlo,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ bne $cnt,.Looplo$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ lda $cnt,7(zero)
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ unop
+
+
+.Loophi$N:
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ subq $cnt,1,$cnt
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+ extbl $Xhi,$cnt,$nlo
+
+ and $nlo,0xf0,$nhi
+ xor $Thi0,$Zhi,$Zhi
+ xor $Tlo0,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ and $nlo,0xf0,$nlo
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+ addq $nlo,$Htbl,$nlo
+ addq $nhi,$Htbl,$nhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ ldq $Tlo0,8($nlo)
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ ldq $Thi0,0($nlo)
+ bne $cnt,.Loophi$N
+
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ ldq $Tlo1,8($nhi)
+ xor $rem,$Zhi,$Zhi
+ ldq $Thi1,0($nhi)
+ s8addq $remp,$rem_4bit,$remp
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $t0,$Zlo,$Zlo
+
+ xor $Tlo0,$Zlo,$Zlo
+ xor $Thi0,$Zhi,$Zhi
+
+ and $Zlo,0x0f,$remp
+ sll $Zhi,60,$t0
+ srl $Zlo,4,$Zlo
+
+ s8addq $remp,$rem_4bit,$remp
+ xor $rem,$Zhi,$Zhi
+
+ ldq $rem,0($remp)
+ srl $Zhi,4,$Zhi
+ xor $Tlo1,$Zlo,$Zlo
+ xor $Thi1,$Zhi,$Zhi
+ xor $t0,$Zlo,$Zlo
+ xor $rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.globl gcm_gmult_4bit
+.align 4
+.ent gcm_gmult_4bit
+gcm_gmult_4bit:
+ .frame sp,0,ra
+ .prologue 0
+
+ ldq $Xlo,8($Xi)
+ ldq $Xhi,0($Xi)
+
+ bsr $t0,picmeup
+ nop
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ stq $Xlo,8($Xi)
+ stq $Xhi,0($Xi)
+
+ ret (ra)
+.end gcm_gmult_4bit
+___
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.align 4
+.ent gcm_ghash_4bit
+gcm_ghash_4bit:
+ lda sp,-32(sp)
+ stq ra,0(sp)
+ stq s0,8(sp)
+ stq s1,16(sp)
+ .mask 0x04000600,-32
+ .frame sp,32,ra
+ .prologue 0
+
+ ldq_u $inhi,0($inp)
+ ldq_u $Thi0,7($inp)
+ ldq_u $inlo,8($inp)
+ ldq_u $Tlo0,15($inp)
+ ldq $Xhi,0($Xi)
+ ldq $Xlo,8($Xi)
+
+ bsr $t0,picmeup
+ nop
+
+.Louter:
+ extql $inhi,$inp,$inhi
+ extqh $Thi0,$inp,$Thi0
+ or $inhi,$Thi0,$inhi
+ lda $inp,16($inp)
+
+ extql $inlo,$inp,$inlo
+ extqh $Tlo0,$inp,$Tlo0
+ or $inlo,$Tlo0,$inlo
+ subq $len,16,$len
+
+ xor $Xlo,$inlo,$Xlo
+ xor $Xhi,$inhi,$Xhi
+___
+
+ &loop();
+
+$code.=<<___;
+ srl $Zlo,24,$t0 # byte swap
+ srl $Zlo,8,$t1
+
+ sll $Zlo,8,$t2
+ sll $Zlo,24,$Zlo
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+
+ zapnot $Zlo,0x88,$Zlo
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zlo,$t0,$Zlo
+ srl $Zhi,24,$t0
+ srl $Zhi,8,$t1
+
+ or $Zlo,$t2,$Zlo
+ sll $Zhi,8,$t2
+ sll $Zhi,24,$Zhi
+
+ srl $Zlo,32,$Xlo
+ sll $Zlo,32,$Zlo
+ beq $len,.Ldone
+
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+ ldq_u $inhi,0($inp)
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+ ldq_u $Thi0,7($inp)
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+ ldq_u $inlo,8($inp)
+ ldq_u $Tlo0,15($inp)
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+ br zero,.Louter
+
+.Ldone:
+ zapnot $t0,0x11,$t0
+ zapnot $t1,0x22,$t1
+ or $Zlo,$Xlo,$Xlo
+
+ zapnot $Zhi,0x88,$Zhi
+ or $t0,$t1,$t0
+ zapnot $t2,0x44,$t2
+
+ or $Zhi,$t0,$Zhi
+ or $Zhi,$t2,$Zhi
+
+ srl $Zhi,32,$Xhi
+ sll $Zhi,32,$Zhi
+
+ or $Zhi,$Xhi,$Xhi
+
+ stq $Xlo,8($Xi)
+ stq $Xhi,0($Xi)
+
+ .set noreorder
+ /*ldq ra,0(sp)*/
+ ldq s0,8(sp)
+ ldq s1,16(sp)
+ lda sp,32(sp)
+ ret (ra)
+.end gcm_ghash_4bit
+
+.align 4
+.ent picmeup
+picmeup:
+ .frame sp,0,$t0
+ .prologue 0
+ br $rem_4bit,.Lpic
+.Lpic: lda $rem_4bit,12($rem_4bit)
+ ret ($t0)
+.end picmeup
+ nop
+rem_4bit:
+ .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
+ .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
+ .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
+ .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
+.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
+
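The Alpha module above, like the ARM versions that follow, implements the same "4-bit" GHASH scheme: Xi is consumed one nibble at a time against a 16-entry per-key table of multiples of H, and the four bits shifted out of Z at each step are reduced back in through the shared rem_4bit table. A compact C sketch of the gmult core under those assumptions (u128 is a hypothetical two-dword struct; this mirrors, but does not copy, OpenSSL's gcm128.c):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128;

    static const uint16_t rem_4bit[16] = {
        0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
        0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0 };

    /* Multiply Xi by H in GF(2^128), four bits at a time. Htable[n]
     * holds n*H; bytes of Xi are consumed from index 15 down to 0,
     * low nibble first, matching the assembly loops. */
    static void gcm_gmult_4bit_sketch(const uint8_t Xi[16],
                                      const u128 Htable[16], u128 *Z)
    {
        int first = 1;
        Z->hi = Z->lo = 0;
        for (int cnt = 15; cnt >= 0; cnt--) {
            for (int shift = 0; shift <= 4; shift += 4) {
                int nibble = (Xi[cnt] >> shift) & 0xf;
                if (!first) {
                    uint8_t rem = (uint8_t)(Z->lo & 0xf); /* bits shifted out */
                    Z->lo = (Z->hi << 60) | (Z->lo >> 4); /* Z >>= 4 */
                    Z->hi = (Z->hi >> 4) ^ ((uint64_t)rem_4bit[rem] << 48);
                }
                Z->hi ^= Htable[nibble].hi;               /* Z ^= nibble * H */
                Z->lo ^= Htable[nibble].lo;
                first = 0;
            }
        }
        /* The real code then byte-swaps Z back into Xi's layout. */
    }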
diff --git a/main/openssl/crypto/modes/asm/ghash-armv4.S b/main/openssl/crypto/modes/asm/ghash-armv4.S
new file mode 100644
index 00000000..d66c4cbf
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-armv4.S
@@ -0,0 +1,408 @@
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.type rem_4bit,%object
+.align 5
+rem_4bit:
+.short 0x0000,0x1C20,0x3840,0x2460
+.short 0x7080,0x6CA0,0x48C0,0x54E0
+.short 0xE100,0xFD20,0xD940,0xC560
+.short 0x9180,0x8DA0,0xA9C0,0xB5E0
+.size rem_4bit,.-rem_4bit
+
+.type rem_4bit_get,%function
+rem_4bit_get:
+ sub r2,pc,#8
+ sub r2,r2,#32 @ &rem_4bit
+ b .Lrem_4bit_got
+ nop
+.size rem_4bit_get,.-rem_4bit_get
+
+.global gcm_ghash_4bit
+.type gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+ sub r12,pc,#8
+ add r3,r2,r3 @ r3 to point at the end
+ stmdb sp!,{r3-r11,lr} @ save r3/end too
+ sub r12,r12,#48 @ &rem_4bit
+
+ ldmia r12,{r4-r11} @ copy rem_4bit ...
+ stmdb sp!,{r4-r11} @ ... to stack
+
+ ldrb r12,[r2,#15]
+ ldrb r14,[r0,#15]
+.Louter:
+ eor r12,r12,r14
+ and r14,r12,#0xf0
+ and r12,r12,#0x0f
+ mov r3,#14
+
+ add r7,r1,r12,lsl#4
+ ldmia r7,{r4-r7} @ load Htbl[nlo]
+ add r11,r1,r14
+ ldrb r12,[r2,#14]
+
+ and r14,r4,#0xf @ rem
+ ldmia r11,{r8-r11} @ load Htbl[nhi]
+ add r14,r14,r14
+ eor r4,r8,r4,lsr#4
+ ldrh r8,[sp,r14] @ rem_4bit[rem]
+ eor r4,r4,r5,lsl#28
+ ldrb r14,[r0,#14]
+ eor r5,r9,r5,lsr#4
+ eor r5,r5,r6,lsl#28
+ eor r6,r10,r6,lsr#4
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+ eor r12,r12,r14
+ and r14,r12,#0xf0
+ and r12,r12,#0x0f
+ eor r7,r7,r8,lsl#16
+
+.Linner:
+ add r11,r1,r12,lsl#4
+ and r12,r4,#0xf @ rem
+ subs r3,r3,#1
+ add r12,r12,r12
+ ldmia r11,{r8-r11} @ load Htbl[nlo]
+ eor r4,r8,r4,lsr#4
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ eor r5,r5,r6,lsl#28
+ ldrh r8,[sp,r12] @ rem_4bit[rem]
+ eor r6,r10,r6,lsr#4
+ ldrplb r12,[r2,r3]
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+
+ add r11,r1,r14
+ and r14,r4,#0xf @ rem
+ eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
+ add r14,r14,r14
+ ldmia r11,{r8-r11} @ load Htbl[nhi]
+ eor r4,r8,r4,lsr#4
+ ldrplb r8,[r0,r3]
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ ldrh r9,[sp,r14]
+ eor r5,r5,r6,lsl#28
+ eor r6,r10,r6,lsr#4
+ eor r6,r6,r7,lsl#28
+ eorpl r12,r12,r8
+ eor r7,r11,r7,lsr#4
+ andpl r14,r12,#0xf0
+ andpl r12,r12,#0x0f
+ eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Linner
+
+ ldr r3,[sp,#32] @ re-load r3/end
+ add r2,r2,#16
+ mov r14,r4
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r4,r4
+ str r4,[r0,#12]
+#elif defined(__ARMEB__)
+ str r4,[r0,#12]
+#else
+ mov r9,r4,lsr#8
+ strb r4,[r0,#12+3]
+ mov r10,r4,lsr#16
+ strb r9,[r0,#12+2]
+ mov r11,r4,lsr#24
+ strb r10,[r0,#12+1]
+ strb r11,[r0,#12]
+#endif
+ cmp r2,r3
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r5,r5
+ str r5,[r0,#8]
+#elif defined(__ARMEB__)
+ str r5,[r0,#8]
+#else
+ mov r9,r5,lsr#8
+ strb r5,[r0,#8+3]
+ mov r10,r5,lsr#16
+ strb r9,[r0,#8+2]
+ mov r11,r5,lsr#24
+ strb r10,[r0,#8+1]
+ strb r11,[r0,#8]
+#endif
+ ldrneb r12,[r2,#15]
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r6,r6
+ str r6,[r0,#4]
+#elif defined(__ARMEB__)
+ str r6,[r0,#4]
+#else
+ mov r9,r6,lsr#8
+ strb r6,[r0,#4+3]
+ mov r10,r6,lsr#16
+ strb r9,[r0,#4+2]
+ mov r11,r6,lsr#24
+ strb r10,[r0,#4+1]
+ strb r11,[r0,#4]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r7,r7
+ str r7,[r0,#0]
+#elif defined(__ARMEB__)
+ str r7,[r0,#0]
+#else
+ mov r9,r7,lsr#8
+ strb r7,[r0,#0+3]
+ mov r10,r7,lsr#16
+ strb r9,[r0,#0+2]
+ mov r11,r7,lsr#24
+ strb r10,[r0,#0+1]
+ strb r11,[r0,#0]
+#endif
+
+ bne .Louter
+
+ add sp,sp,#36
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global gcm_gmult_4bit
+.type gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+ stmdb sp!,{r4-r11,lr}
+ ldrb r12,[r0,#15]
+ b rem_4bit_get
+.Lrem_4bit_got:
+ and r14,r12,#0xf0
+ and r12,r12,#0x0f
+ mov r3,#14
+
+ add r7,r1,r12,lsl#4
+ ldmia r7,{r4-r7} @ load Htbl[nlo]
+ ldrb r12,[r0,#14]
+
+ add r11,r1,r14
+ and r14,r4,#0xf @ rem
+ ldmia r11,{r8-r11} @ load Htbl[nhi]
+ add r14,r14,r14
+ eor r4,r8,r4,lsr#4
+ ldrh r8,[r2,r14] @ rem_4bit[rem]
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ eor r5,r5,r6,lsl#28
+ eor r6,r10,r6,lsr#4
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+ and r14,r12,#0xf0
+ eor r7,r7,r8,lsl#16
+ and r12,r12,#0x0f
+
+.Loop:
+ add r11,r1,r12,lsl#4
+ and r12,r4,#0xf @ rem
+ subs r3,r3,#1
+ add r12,r12,r12
+ ldmia r11,{r8-r11} @ load Htbl[nlo]
+ eor r4,r8,r4,lsr#4
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ eor r5,r5,r6,lsl#28
+ ldrh r8,[r2,r12] @ rem_4bit[rem]
+ eor r6,r10,r6,lsr#4
+ ldrplb r12,[r0,r3]
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+
+ add r11,r1,r14
+ and r14,r4,#0xf @ rem
+ eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
+ add r14,r14,r14
+ ldmia r11,{r8-r11} @ load Htbl[nhi]
+ eor r4,r8,r4,lsr#4
+ eor r4,r4,r5,lsl#28
+ eor r5,r9,r5,lsr#4
+ ldrh r8,[r2,r14] @ rem_4bit[rem]
+ eor r5,r5,r6,lsl#28
+ eor r6,r10,r6,lsr#4
+ eor r6,r6,r7,lsl#28
+ eor r7,r11,r7,lsr#4
+ andpl r14,r12,#0xf0
+ andpl r12,r12,#0x0f
+ eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Loop
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r4,r4
+ str r4,[r0,#12]
+#elif defined(__ARMEB__)
+ str r4,[r0,#12]
+#else
+ mov r9,r4,lsr#8
+ strb r4,[r0,#12+3]
+ mov r10,r4,lsr#16
+ strb r9,[r0,#12+2]
+ mov r11,r4,lsr#24
+ strb r10,[r0,#12+1]
+ strb r11,[r0,#12]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r5,r5
+ str r5,[r0,#8]
+#elif defined(__ARMEB__)
+ str r5,[r0,#8]
+#else
+ mov r9,r5,lsr#8
+ strb r5,[r0,#8+3]
+ mov r10,r5,lsr#16
+ strb r9,[r0,#8+2]
+ mov r11,r5,lsr#24
+ strb r10,[r0,#8+1]
+ strb r11,[r0,#8]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r6,r6
+ str r6,[r0,#4]
+#elif defined(__ARMEB__)
+ str r6,[r0,#4]
+#else
+ mov r9,r6,lsr#8
+ strb r6,[r0,#4+3]
+ mov r10,r6,lsr#16
+ strb r9,[r0,#4+2]
+ mov r11,r6,lsr#24
+ strb r10,[r0,#4+1]
+ strb r11,[r0,#4]
+#endif
+
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r7,r7
+ str r7,[r0,#0]
+#elif defined(__ARMEB__)
+ str r7,[r0,#0]
+#else
+ mov r9,r7,lsr#8
+ strb r7,[r0,#0+3]
+ mov r10,r7,lsr#16
+ strb r9,[r0,#0+2]
+ mov r11,r7,lsr#24
+ strb r10,[r0,#0+1]
+ strb r11,[r0,#0]
+#endif
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.global gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ sub r1,#16 @ point at H in GCM128_CTX
+ vld1.64 d29,[r0,:64]!@ load Xi
+ vmov.i32 d5,#0xe1 @ our irreducible polynomial
+ vld1.64 d28,[r0,:64]!
+ vshr.u64 d5,#32
+ vldmia r1,{d0-d1} @ load H
+ veor q12,q12
+#ifdef __ARMEL__
+ vrev64.8 q14,q14
+#endif
+ veor q13,q13
+ veor q11,q11
+ mov r1,#16
+ veor q10,q10
+ mov r3,#16
+ veor d2,d2
+ vdup.8 d4,d28[0] @ broadcast lowest byte
+ b .Linner_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.global gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ vld1.64 d21,[r0,:64]! @ load Xi
+ vmov.i32 d5,#0xe1 @ our irreducible polynomial
+ vld1.64 d20,[r0,:64]!
+ vshr.u64 d5,#32
+ vldmia r0,{d0-d1} @ load H
+ veor q12,q12
+ nop
+#ifdef __ARMEL__
+ vrev64.8 q10,q10
+#endif
+.Louter_neon:
+ vld1.64 d29,[r2]! @ load inp
+ veor q13,q13
+ vld1.64 d28,[r2]!
+ veor q11,q11
+ mov r1,#16
+#ifdef __ARMEL__
+ vrev64.8 q14,q14
+#endif
+ veor d2,d2
+ veor q14,q10 @ inp^=Xi
+ veor q10,q10
+ vdup.8 d4,d28[0] @ broadcast lowest byte
+.Linner_neon:
+ subs r1,r1,#1
+ vmull.p8 q9,d1,d4 @ H.lo·Xi[i]
+ vmull.p8 q8,d0,d4 @ H.hi·Xi[i]
+ vext.8 q14,q12,#1 @ IN>>=8
+
+ veor q10,q13 @ modulo-scheduled part
+ vshl.i64 d22,#48
+ vdup.8 d4,d28[0] @ broadcast lowest byte
+ veor d3,d18,d20
+
+ veor d21,d22
+ vuzp.8 q9,q8
+ vsli.8 d2,d3,#1 @ compose the "carry" byte
+ vext.8 q10,q12,#1 @ Z>>=8
+
+ vmull.p8 q11,d2,d5 @ "carry"·0xe1
+ vshr.u8 d2,d3,#7 @ save Z's bottom bit
+ vext.8 q13,q9,q12,#1 @ Qlo>>=8
+ veor q10,q8
+ bne .Linner_neon
+
+ veor q10,q13 @ modulo-scheduled artefact
+ vshl.i64 d22,#48
+ veor d21,d22
+
+ @ finalization, normalize Z:Zo
+ vand d2,d5 @ suffices to mask the bit
+ vshr.u64 d3,d20,#63
+ vshl.i64 q10,#1
+ subs r3,#16
+ vorr q10,q1 @ Z=Z:Zo<<1
+ bne .Louter_neon
+
+#ifdef __ARMEL__
+ vrev64.8 q10,q10
+#endif
+ sub r0,#16
+ vst1.64 d21,[r0,:64]! @ write out Xi
+ vst1.64 d20,[r0,:64]
+
+ .word 0xe12fff1e
+.size gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
diff --git a/main/openssl/crypto/modes/asm/ghash-armv4.pl b/main/openssl/crypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 00000000..e46f8e34
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,429 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+
+# ====================================================================
+# Note about the "528B" variant. In the ARM case it makes less sense
+# to implement it, for the following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+# bit shift operation is neatly fused with 128-bit xor here, and
+# "538B" variant would eliminate only 4-5 instructions out of 32
+# in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+# consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords is, in turn, whatever the
+# *native* byte order of the current platform is. See gcm128.c for a
+# working example...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$Xi="r0"; # argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4"; # variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp; # used in gcm_gmult_4bit
+$cnt=$len;
+
+sub Zsmash() {
+ my $i=12;
+ my @args=@_;
+ for ($Zll,$Zlh,$Zhl,$Zhh) {
+ $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev $_,$_
+ str $_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+ str $_,[$Xi,#$i]
+#else
+ mov $Tlh,$_,lsr#8
+ strb $_,[$Xi,#$i+3]
+ mov $Thl,$_,lsr#16
+ strb $Tlh,[$Xi,#$i+2]
+ mov $Thh,$_,lsr#24
+ strb $Thl,[$Xi,#$i+1]
+ strb $Thh,[$Xi,#$i]
+#endif
+___
+ $code.="\t".shift(@args)."\n";
+ $i-=4;
+ }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.type rem_4bit,%object
+.align 5
+rem_4bit:
+.short 0x0000,0x1C20,0x3840,0x2460
+.short 0x7080,0x6CA0,0x48C0,0x54E0
+.short 0xE100,0xFD20,0xD940,0xC560
+.short 0x9180,0x8DA0,0xA9C0,0xB5E0
+.size rem_4bit,.-rem_4bit
+
+.type rem_4bit_get,%function
+rem_4bit_get:
+ sub $rem_4bit,pc,#8
+ sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
+ b .Lrem_4bit_got
+ nop
+.size rem_4bit_get,.-rem_4bit_get
+
+.global gcm_ghash_4bit
+.type gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+ sub r12,pc,#8
+ add $len,$inp,$len @ $len to point at the end
+ stmdb sp!,{r3-r11,lr} @ save $len/end too
+ sub r12,r12,#48 @ &rem_4bit
+
+ ldmia r12,{r4-r11} @ copy rem_4bit ...
+ stmdb sp!,{r4-r11} @ ... to stack
+
+ ldrb $nlo,[$inp,#15]
+ ldrb $nhi,[$Xi,#15]
+.Louter:
+ eor $nlo,$nlo,$nhi
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ mov $cnt,#14
+
+ add $Zhh,$Htbl,$nlo,lsl#4
+ ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ add $Thh,$Htbl,$nhi
+ ldrb $nlo,[$inp,#14]
+
+ and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ add $nhi,$nhi,$nhi
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ ldrb $nhi,[$Xi,#14]
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ eor $nlo,$nlo,$nhi
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+ add $Thh,$Htbl,$nlo,lsl#4
+ and $nlo,$Zll,#0xf @ rem
+ subs $cnt,$cnt,#1
+ add $nlo,$nlo,$nlo
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ ldrplb $nlo,[$inp,$cnt]
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ add $nhi,$nhi,$nhi
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrplb $Tll,[$Xi,$cnt]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrh $Tlh,[sp,$nhi]
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eorpl $nlo,$nlo,$Tll
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ andpl $nhi,$nlo,#0xf0
+ andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Linner
+
+ ldr $len,[sp,#32] @ re-load $len/end
+ add $inp,$inp,#16
+ mov $nhi,$Zll
+___
+ &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+$code.=<<___;
+ bne .Louter
+
+ add sp,sp,#36
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global gcm_gmult_4bit
+.type gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+ stmdb sp!,{r4-r11,lr}
+ ldrb $nlo,[$Xi,#15]
+ b rem_4bit_get
+.Lrem_4bit_got:
+ and $nhi,$nlo,#0xf0
+ and $nlo,$nlo,#0x0f
+ mov $cnt,#14
+
+ add $Zhh,$Htbl,$nlo,lsl#4
+ ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ ldrb $nlo,[$Xi,#14]
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ add $nhi,$nhi,$nhi
+ eor $Zll,$Tll,$Zll,lsr#4
+ ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ and $nhi,$nlo,#0xf0
+ eor $Zhh,$Zhh,$Tll,lsl#16
+ and $nlo,$nlo,#0x0f
+
+.Loop:
+ add $Thh,$Htbl,$nlo,lsl#4
+ and $nlo,$Zll,#0xf @ rem
+ subs $cnt,$cnt,#1
+ add $nlo,$nlo,$nlo
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ ldrplb $nlo,[$Xi,$cnt]
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+
+ add $Thh,$Htbl,$nhi
+ and $nhi,$Zll,#0xf @ rem
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ add $nhi,$nhi,$nhi
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
+ eor $Zll,$Tll,$Zll,lsr#4
+ eor $Zll,$Zll,$Zlh,lsl#28
+ eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
+ eor $Zlh,$Zlh,$Zhl,lsl#28
+ eor $Zhl,$Thl,$Zhl,lsr#4
+ eor $Zhl,$Zhl,$Zhh,lsl#28
+ eor $Zhh,$Thh,$Zhh,lsr#4
+ andpl $nhi,$nlo,#0xf0
+ andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ bpl .Loop
+___
+ &Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my $cnt=$Htbl; # $Htbl is used once in the very beginning
+
+my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
+my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
+
+# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
+# in Zo. Or should I say "top bit", because GHASH is specified in
+# reverse bit order? Otherwise straightforward 128-bit H by one input
+# byte multiplication and modulo-reduction, times 16.
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.global gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ sub $Htbl,#16 @ point at H in GCM128_CTX
+ vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+ vmov.i32 $mod,#0xe1 @ our irreducible polynomial
+ vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
+ vshr.u64 $mod,#32
+ vldmia $Htbl,{$Hhi-$Hlo} @ load H
+ veor $zero,$zero
+#ifdef __ARMEL__
+ vrev64.8 $IN,$IN
+#endif
+ veor $Qpost,$Qpost
+ veor $R,$R
+ mov $cnt,#16
+ veor $Z,$Z
+ mov $len,#16
+ veor $Zo,$Zo
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+ b .Linner_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.global gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
+ vmov.i32 $mod,#0xe1 @ our irreducible polynomial
+ vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
+ vshr.u64 $mod,#32
+ vldmia $Xi,{$Hhi-$Hlo} @ load H
+ veor $zero,$zero
+ nop
+#ifdef __ARMEL__
+ vrev64.8 $Z,$Z
+#endif
+.Louter_neon:
+ vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
+ veor $Qpost,$Qpost
+ vld1.64 `&Dlo($IN)`,[$inp]!
+ veor $R,$R
+ mov $cnt,#16
+#ifdef __ARMEL__
+ vrev64.8 $IN,$IN
+#endif
+ veor $Zo,$Zo
+ veor $IN,$Z @ inp^=Xi
+ veor $Z,$Z
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+.Linner_neon:
+ subs $cnt,$cnt,#1
+ vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
+ vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
+ vext.8 $IN,$zero,#1 @ IN>>=8
+
+ veor $Z,$Qpost @ modulo-scheduled part
+ vshl.i64 `&Dlo("$R")`,#48
+ vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
+ veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
+
+ veor `&Dhi("$Z")`,`&Dlo("$R")`
+ vuzp.8 $Qlo,$Qhi
+ vsli.8 $Zo,$T,#1 @ compose the "carry" byte
+ vext.8 $Z,$zero,#1 @ Z>>=8
+
+ vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
+ vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
+ vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
+ veor $Z,$Qhi
+ bne .Linner_neon
+
+ veor $Z,$Qpost @ modulo-scheduled artefact
+ vshl.i64 `&Dlo("$R")`,#48
+ veor `&Dhi("$Z")`,`&Dlo("$R")`
+
+ @ finalization, normalize Z:Zo
+ vand $Zo,$mod @ suffices to mask the bit
+ vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
+ vshl.i64 $Z,#1
+ subs $len,#16
+ vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
+ bne .Louter_neon
+
+#ifdef __ARMEL__
+ vrev64.8 $Z,$Z
+#endif
+ sub $Xi,#16
+ vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
+ vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+
+ bx lr
+.size gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/main/openssl/crypto/modes/asm/ghash-ia64.pl b/main/openssl/crypto/modes/asm/ghash-ia64.pl
new file mode 100755
index 00000000..0354c954
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-ia64.pl
@@ -0,0 +1,463 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128-byte shared table].
+# Streamed GHASH performance was measured to be 6.67 cycles per
+# processed byte on Itanium 2, which is >90% better than Microsoft
+# compiler-generated code. To anchor to something else, the
+# sha1-ia64.pl module processes one byte in 5.7 cycles. On Itanium
+# GHASH should run at ~8.5 cycles per byte.
+
+# September 2010
+#
+# It was originally thought that it made less sense to implement the
+# "528B" variant on Itanium 2, for the following reason. Because the
+# number of functional units is naturally limited, it appeared
+# impossible to schedule the "528B" loop in 4 cycles, only in 5. This
+# would mean that the theoretical performance improvement couldn't be
+# more than 20%. But occasionally you prove yourself wrong:-) I figured
+# out a way to fold a couple of instructions and, having freed yet
+# another instruction slot by unrolling the loop, ended up with 4.45
+# cycles per processed byte, 50% better than the "256B" version. On the
+# original Itanium performance should remain the same as for the "256B"
+# version, i.e. ~8.5 cycles.
+
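+# For reference, below is a minimal pure-Perl model of the "256B"
+# algorithm that this and the sibling modules implement. It is a
+# non-authoritative sketch: it is never called, it assumes a perl with
+# 64-bit integers, and it takes Htable as the 16 precomputed [hi,lo]
+# nibble multiples of H (in GHASH's reversed-bit convention, as built
+# by gcm_init_4bit in gcm128.c).
+sub gf128_gmult_4bit_ref {
+my ($xi,$Htable)=@_;			# $xi: ref to the 16 bytes of Xi
+my @rem_4bit=map { $_<<48 }
+	(0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
+	 0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
+my @nibs=map { ($_>>4,$_&0xf) } @$xi;	# nhi,nlo pairs for Xi[0]..Xi[15]
+my ($Zhi,$Zlo)=@{$Htable->[pop @nibs]};	# Z = Htable[nlo of Xi[15]]
+while (@nibs) {			# remaining 31 nibbles, Xi[15] down to Xi[0]
+	my $rem=$Zlo&0xf;			# bits about to fall off the bottom
+	$Zlo=(($Zhi&0xf)<<60)|($Zlo>>4);	# Z >>= 4 as a 128-bit quantity
+	$Zhi=($Zhi>>4)^$rem_4bit[$rem];		# fold them back in, pre-reduced
+	my $n=pop @nibs;
+	$Zhi^=$Htable->[$n][0];
+	$Zlo^=$Htable->[$n][1];
+}
+return ($Zhi,$Zlo);			# new Xi
+}
+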
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
+ $big_endian=0 if (/\-DL_ENDIAN/); }
+if (!defined($big_endian))
+ { $big_endian=(unpack('L',pack('N',1))==1); }
+
+sub loop() {
+my $label=shift;
+my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
+
+# The loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
+# in a scalable manner;-) Naturally assuming data in L1 cache...
+# Special note about the 'dep' instruction, which is used to construct
+# &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned on a 128-byte
+# boundary and the lower 7 bits of its address are guaranteed to be
+# zero.
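+# ('dep rem=Zlo,rem_4bitp,3,4' deposits Zlo's 4 low bits at bit
+# position 3 of rem_4bitp, i.e. rem = rem_4bitp | ((Zlo&0xf)<<3), which
+# is &rem_4bit[Zlo&0xf] for the 8-byte entries below.)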
+$code.=<<___;
+$label:
+{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
+ (p19) dep rem=Zlo,rem_4bitp,3,4 }
+{ .mfi; (p19) xor Zhi=Zhi,Hhi
+ ($p17) xor xi[1]=xi[1],in[1] };;
+{ .mfi; (p18) ld8 Hhi=[Hi[1]]
+ (p19) shrp Zlo=Zhi,Zlo,4 }
+{ .mfi; (p19) ld8 rem=[rem]
+ (p18) and Hi[1]=mask0xf0,xi[2] };;
+{ .mmi; ($p16) ld1 in[0]=[inp],-1
+ (p18) xor Zlo=Zlo,Hlo
+ (p19) shr.u Zhi=Zhi,4 }
+{ .mib; (p19) xor Hhi=Hhi,rem
+ (p18) add Hi[1]=Htbl,Hi[1] };;
+
+{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
+ (p18) dep rem=Zlo,rem_4bitp,3,4 }
+{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
+ (p18) xor Zhi=Zhi,Hhi };;
+{ .mfi; (p18) ld8 Hhi=[Hi[1]]
+ (p18) shrp Zlo=Zhi,Zlo,4 }
+{ .mfi; (p18) ld8 rem=[rem]
+ (p17) and Hi[0]=mask0xf0,Hi[0] };;
+{ .mmi; (p16) ld1 xi[0]=[Xi],-1
+ (p18) xor Zlo=Zlo,Hlo
+ (p18) shr.u Zhi=Zhi,4 }
+{ .mib; (p18) xor Hhi=Hhi,rem
+ (p17) add Hi[0]=Htbl,Hi[0]
+ br.ctop.sptk $label };;
+___
+}
+
+$code=<<___;
+.explicit
+.text
+
+prevfs=r2; prevlc=r3; prevpr=r8;
+mask0xf0=r21;
+rem=r22; rem_4bitp=r23;
+Xi=r24; Htbl=r25;
+inp=r26; end=r27;
+Hhi=r28; Hlo=r29;
+Zhi=r30; Zlo=r31;
+
+.align 128
+.skip 16 // aligns loop body
+.global gcm_gmult_4bit#
+.proc gcm_gmult_4bit#
+gcm_gmult_4bit:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,2,6,0,8
+ $ADDP Xi=15,in0 // &Xi[15]
+ mov rem_4bitp=ip }
+{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotr in[3],xi[3],Hi[2]
+
+{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
+ mov mask0xf0=0xf0
+ brp.loop.imp .Loop1,.Lend1-16};;
+{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
+ };;
+{ .mii; shladd Hi[1]=xi[2],4,r0
+ mov pr.rot=0x7<<16
+ mov ar.lc=13 };;
+{ .mii; and Hi[1]=mask0xf0,Hi[1]
+ mov ar.ec=3
+ xor Zlo=Zlo,Zlo };;
+{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
+ add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
+ xor Zhi=Zhi,Zhi };;
+___
+ &loop (".Loop1",1);
+$code.=<<___;
+.Lend1:
+{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
+{ .mib; mux1 Zlo=Zlo,\@rev };;
+{ .mib; mux1 Zhi=Zhi,\@rev };;
+{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
+ add Hhi=1,Xi };; // pipeline flush on Itanium
+{ .mib; st8 [Hlo]=Zlo
+ mov pr=prevpr,0x1ffff };;
+{ .mib; st8 [Hhi]=Zhi
+ mov ar.lc=prevlc
+ br.ret.sptk.many b0 };;
+.endp gcm_gmult_4bit#
+___
+
+######################################################################
+# "528B" (well, "512B" actually) streamed GHASH
+#
+$Xip="in0";
+$Htbl="in1";
+$inp="in2";
+$len="in3";
+$rem_8bit="loc0";
+$mask0xff="loc1";
+($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
+
+sub load_htable() {
+ for (my $i=0;$i<8;$i++) {
+ $code.=<<___;
+{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
+ ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
+{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
+ ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
+___
+ $code.=shift if (($i+$#_)==7);
+ $code.="\t};;\n"
+ }
+}
+
+$code.=<<___;
+prevsp=r3;
+
+.align 32
+.skip 16 // aligns loop body
+.global gcm_ghash_4bit#
+.proc gcm_ghash_4bit#
+gcm_ghash_4bit:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,4,2,0,0
+ .vframe prevsp
+ mov prevsp=sp
+ mov $rem_8bit=ip };;
+ .body
+{ .mfi; $ADDP r8=0+0,$Htbl
+ $ADDP r9=0+8,$Htbl }
+{ .mfi; $ADDP r10=128+0,$Htbl
+ $ADDP r11=128+8,$Htbl };;
+___
+ &load_htable(
+ " $ADDP $Xip=15,$Xip", # &Xi[15]
+ " $ADDP $len=$len,$inp", # &inp[len]
+ " $ADDP $inp=15,$inp", # &inp[15]
+ " mov $mask0xff=0xff",
+ " add sp=-512,sp",
+ " andcm sp=sp,$mask0xff", # align stack frame
+ " add r14=0,sp",
+ " add r15=8,sp");
+$code.=<<___;
+{ .mmi; $sum 1<<1 // go big-endian
+ add r8=256+0,sp
+ add r9=256+8,sp }
+{ .mmi; add r10=256+128+0,sp
+ add r11=256+128+8,sp
+ add $len=-17,$len };;
+___
+for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
+my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
+$code.=<<___;
+{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
+ st8 [r9]=$rhi,16 // Htable[$i].hi
+ shrp $rlo=$rhi,$rlo,4 }//;;
+{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
+ stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
+ shr.u $rhi=$rhi,4 };;
+{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
+ st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
+___
+}
+$code.=<<___;
+{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
+ ld8 r17=[r9],16 };; // Htable[8].hi
+{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
+ ld8 r19=[r9],16 } // Htable[9].hi
+{ .mmi; rum 1<<5 // clear um.mfh
+ shrp r16=r17,r16,4 };;
+___
+for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
+$code.=<<___;
+{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
+ ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
+ shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
+ shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
+___
+}
+$code.=<<___;
+{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
+ shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
+{ .mmi; add $Htbl=256,sp // &Htable[0]
+ add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
+ shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
+{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
+ st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
+___
+
+$in="r15";
+@xi=("r16","r17");
+@rem=("r18","r19");
+($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
+($Atbl,$Btbl)=("r26","r27");
+
+$code.=<<___; # (p16)
+{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ cmp.eq p0,p6=r0,r0 };; // clear p6
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p16),(p17)
+{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
+ dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+.align 32
+.LOOP:
+{ .mmi;
+(p6) st8 [$Xip]=$Zhi,13
+ xor $Zlo=$Zlo,$Zlo
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p16),(p17),(p18)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
+{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ ld1 $in=[$inp],-1 } //(p16) *inp--
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+for ($i=1;$i<14;$i++) {
+# Above and below fragments are derived from this one by removing
+# unsuitable (p??) instructions.
+$code.=<<___; # (p16),(p17),(p18),(p19)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
+{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ ld1 $in=[$inp],-1 //(p16) *inp--
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+}
+
+$code.=<<___; # (p17),(p18),(p19)
+{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
+{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
+{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
+ xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
+ xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
+ and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
+{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
+ shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+ add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p18),(p19)
+{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
+ shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
+{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
+{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
+ xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
+{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+ xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
+ shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
+{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
+ xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
+{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
+ shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
+{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+ xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
+
+$code.=<<___; # (p19)
+{ .mmi; cmp.ltu p6,p0=$inp,$len
+ add $inp=32,$inp
+ shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
+{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
+ xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
+ add $Xip=9,$Xip };; // &Xi.lo
+{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
+(p6) ld1 $in=[$inp],-1 //[p16] *inp--
+(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
+{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
+(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
+{ .mmi; st8 [$Xip]=$Zlo,-8
+(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
+ shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
+{ .mmi;
+(p6) ld1 $in=[$inp],-1 //[p16] *inp--
+ xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
+(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
+{ .mib;
+(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
+(p6) br.cond.dptk.many .LOOP };;
+
+{ .mib; st8 [$Xip]=$Zhi };;
+{ .mib; $rum 1<<1 // return to little-endian
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp gcm_ghash_4bit#
+___
+$code.=<<___;
+.align 128
+.type rem_4bit#,\@object
+rem_4bit:
+ data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+ data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+ data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+ data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.size rem_4bit#,128
+.type rem_8bit#,\@object
+rem_8bit:
+ data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
+ data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
+ data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
+ data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
+ data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
+ data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
+ data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
+ data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
+ data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
+ data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
+ data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
+ data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
+ data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
+ data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
+ data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
+ data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
+ data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
+ data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
+ data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
+ data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
+ data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
+ data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
+ data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
+ data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
+ data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
+ data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
+ data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
+ data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
+ data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
+ data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
+ data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
+ data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
+.size rem_8bit#,512
+stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/modes/asm/ghash-parisc.pl b/main/openssl/crypto/modes/asm/ghash-parisc.pl
new file mode 100644
index 00000000..d5ad96b4
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-parisc.pl
@@ -0,0 +1,731 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128-byte shared table].
+# On PA-7100LC it processes one byte in 19.6 cycles, which is more than
+# twice as fast as code generated by gcc 3.2. The PA-RISC 2.0 loop is
+# scheduled for 8 cycles, but measured performance on a PA-8600 system
+# is ~9 cycles per processed byte. This is ~2.2x faster than 64-bit
+# code generated by the vendor compiler (which used to be very hard to
+# beat:-).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $NREGS =6;
+} else {
+ $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $NREGS =11;
+}
+
+$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
+ # [+ argument transfer]
+
+################# volatile registers
+$Xi="%r26"; # argument block
+$Htbl="%r25";
+$inp="%r24";
+$len="%r23";
+$Hhh=$Htbl; # variables
+$Hll="%r22";
+$Zhh="%r21";
+$Zll="%r20";
+$cnt="%r19";
+$rem_4bit="%r28";
+$rem="%r29";
+$mask0xf0="%r31";
+
+################# preserved registers
+$Thh="%r1";
+$Tll="%r2";
+$nlo="%r3";
+$nhi="%r4";
+$byte="%r5";
+if ($SIZE_T==4) {
+ $Zhl="%r6";
+ $Zlh="%r7";
+ $Hhl="%r8";
+ $Hlh="%r9";
+ $Thl="%r10";
+ $Tlh="%r11";
+}
+$rem2="%r6"; # used in PA-RISC 2.0 code
+
+$code.=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 64
+gcm_gmult_4bit
+ .PROC
+ .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
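+	; materialize &rem_4bit PIC-style: blr leaves the link address
+	; (with the privilege level in its two low bits) in the pointer
+	; register, andcm clears those bits, ldo adds the table offset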
+ blr %r0,$rem_4bit
+ ldi 3,$rem
+L\$pic_gmult
+ andcm $rem_4bit,$rem,$rem_4bit
+ addl $inp,$len,$len
+ ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
+ ldi 0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldi 31,$rem
+ mtctl $rem,%cr11
+ extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
+ b L\$parisc1_gmult
+ nop
+___
+
+$code.=<<___;
+ ldb 15($Xi),$nlo
+ ldo 8($Htbl),$Hll
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ ldd $nlo($Hll),$Zll
+ ldd $nlo($Hhh),$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldb 14($Xi),$nlo
+
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+ b L\$oop_gmult_pa2
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_gmult_pa2
+ xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+ ldbx $cnt($Xi),$nlo
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+ ldd $rem($rem_4bit),$rem
+
+ xor $Tll,$Zll,$Zll
+ addib,uv -1,$cnt,L\$oop_gmult_pa2
+ xor $Thh,$Zhh,$Zhh
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ std $Zll,8($Xi)
+ std $Zhh,0($Xi)
+___
+
+$code.=<<___ if ($SIZE_T==4);
+ b L\$done_gmult
+ nop
+
+L\$parisc1_gmult
+ ldb 15($Xi),$nlo
+ ldo 12($Htbl),$Hll
+ ldo 8($Htbl),$Hlh
+ ldo 4($Htbl),$Hhl
+
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ ldwx $nlo($Hll),$Zll
+ ldwx $nlo($Hlh),$Zlh
+ ldwx $nlo($Hhl),$Zhl
+ ldwx $nlo($Hhh),$Zhh
+ zdep $Zll,28,4,$rem
+ ldb 14($Xi),$nlo
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ extru $Zhh,27,28,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ xor $rem,$Zhh,$Zhh
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $Thl,$Zhl,$Zhl
+ b L\$oop_gmult_pa1
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_gmult_pa1
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldbx $cnt($Xi),$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $rem,$Zhh,$Zhh
+ zdep $Zll,28,4,$rem
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ and $mask0xf0,$nlo,$nhi
+ extru $Zhh,27,28,$Zhh
+ zdep $nlo,27,4,$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $rem,$Zhh,$Zhh
+ addib,uv -1,$cnt,L\$oop_gmult_pa1
+ xor $Thl,$Zhl,$Zhl
+
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $rem,$Zhh,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ zdep $Zll,28,4,$rem
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ extru $Zhh,27,28,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Tlh,$Zlh,$Zlh
+ xor $rem,$Zhh,$Zhh
+ stw $Zll,12($Xi)
+ xor $Thl,$Zhl,$Zhl
+ stw $Zlh,8($Xi)
+ xor $Thh,$Zhh,$Zhh
+ stw $Zhl,4($Xi)
+ stw $Zhh,0($Xi)
+___
+$code.=<<___;
+L\$done_gmult
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+gcm_ghash_4bit
+ .PROC
+ .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+ blr %r0,$rem_4bit
+ ldi 3,$rem
+L\$pic_ghash
+ andcm $rem_4bit,$rem,$rem_4bit
+ addl $inp,$len,$len
+ ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
+ ldi 0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldi 31,$rem
+ mtctl $rem,%cr11
+ extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
+ b L\$parisc1_ghash
+ nop
+___
+
+$code.=<<___;
+ ldb 15($Xi),$nlo
+ ldo 8($Htbl),$Hll
+
+L\$outer_ghash_pa2
+ ldb 15($inp),$nhi
+ xor $nhi,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ ldd $nlo($Hll),$Zll
+ ldd $nlo($Hhh),$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldb 14($Xi),$nlo
+ ldb 14($inp),$byte
+
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+ xor $byte,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+ b L\$oop_ghash_pa2
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_ghash_pa2
+ xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
+ depd,z $Zll,60,4,$rem2
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldbx $cnt($Xi),$nlo
+ ldbx $cnt($inp),$byte
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ ldd $rem2($rem_4bit),$rem2
+
+ xor $rem2,$Zhh,$Zhh
+ xor $byte,$nlo,$nlo
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ and $mask0xf0,$nlo,$nhi
+ depd,z $nlo,59,4,$nlo
+
+ extrd,u $Zhh,59,60,$Zhh
+ xor $Tll,$Zll,$Zll
+
+ ldd $rem($rem_4bit),$rem
+ addib,uv -1,$cnt,L\$oop_ghash_pa2
+ xor $Thh,$Zhh,$Zhh
+
+ xor $rem,$Zhh,$Zhh
+ depd,z $Zll,60,4,$rem2
+
+ shrpd $Zhh,$Zll,4,$Zll
+ extrd,u $Zhh,59,60,$Zhh
+ ldd $nlo($Hll),$Tll
+ ldd $nlo($Hhh),$Thh
+
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+
+ depd,z $Zll,60,4,$rem
+ shrpd $Zhh,$Zll,4,$Zll
+ ldd $rem2($rem_4bit),$rem2
+
+ xor $rem2,$Zhh,$Zhh
+ ldd $nhi($Hll),$Tll
+ ldd $nhi($Hhh),$Thh
+
+ extrd,u $Zhh,59,60,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Thh,$Zhh,$Zhh
+ ldd $rem($rem_4bit),$rem
+
+ xor $rem,$Zhh,$Zhh
+ std $Zll,8($Xi)
+ ldo 16($inp),$inp
+ std $Zhh,0($Xi)
+ cmpb,*<> $inp,$len,L\$outer_ghash_pa2
+ copy $Zll,$nlo
+___
+
+$code.=<<___ if ($SIZE_T==4);
+ b L\$done_ghash
+ nop
+
+L\$parisc1_ghash
+ ldb 15($Xi),$nlo
+ ldo 12($Htbl),$Hll
+ ldo 8($Htbl),$Hlh
+ ldo 4($Htbl),$Hhl
+
+L\$outer_ghash_pa1
+ ldb 15($inp),$byte
+ xor $byte,$nlo,$nlo
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ ldwx $nlo($Hll),$Zll
+ ldwx $nlo($Hlh),$Zlh
+ ldwx $nlo($Hhl),$Zhl
+ ldwx $nlo($Hhh),$Zhh
+ zdep $Zll,28,4,$rem
+ ldb 14($Xi),$nlo
+ ldb 14($inp),$byte
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ extru $Zhh,27,28,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ xor $byte,$nlo,$nlo
+ xor $rem,$Zhh,$Zhh
+ and $mask0xf0,$nlo,$nhi
+ zdep $nlo,27,4,$nlo
+
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $Thl,$Zhl,$Zhl
+ b L\$oop_ghash_pa1
+ ldi 13,$cnt
+
+ .ALIGN 8
+L\$oop_ghash_pa1
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ ldbx $cnt($Xi),$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ ldbx $cnt($inp),$byte
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $rem,$Zhh,$Zhh
+ zdep $Zll,28,4,$rem
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $byte,$nlo,$nlo
+ shrpw $Zhh,$Zhl,4,$Zhl
+ and $mask0xf0,$nlo,$nhi
+ extru $Zhh,27,28,$Zhh
+ zdep $nlo,27,4,$nlo
+ xor $Tll,$Zll,$Zll
+ ldwx $nlo($Hll),$Tll
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nlo($Hlh),$Tlh
+ xor $rem,$Zhh,$Zhh
+ addib,uv -1,$cnt,L\$oop_ghash_pa1
+ xor $Thl,$Zhl,$Zhl
+
+ zdep $Zll,28,4,$rem
+ ldwx $nlo($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ ldwx $nlo($Hhh),$Thh
+ shrpw $Zhl,$Zlh,4,$Zlh
+ xor $Tll,$Zll,$Zll
+ ldwx $nhi($Hll),$Tll
+ shrpw $Zhh,$Zhl,4,$Zhl
+ xor $Tlh,$Zlh,$Zlh
+ ldwx $nhi($Hlh),$Tlh
+ extru $Zhh,27,28,$Zhh
+ xor $rem,$Zhh,$Zhh
+ xor $Thl,$Zhl,$Zhl
+ ldwx $nhi($Hhl),$Thl
+ xor $Thh,$Zhh,$Zhh
+ ldwx $nhi($Hhh),$Thh
+ zdep $Zll,28,4,$rem
+ ldwx $rem($rem_4bit),$rem
+ shrpw $Zlh,$Zll,4,$Zll
+ shrpw $Zhl,$Zlh,4,$Zlh
+ shrpw $Zhh,$Zhl,4,$Zhl
+ extru $Zhh,27,28,$Zhh
+ xor $Tll,$Zll,$Zll
+ xor $Tlh,$Zlh,$Zlh
+ xor $rem,$Zhh,$Zhh
+ stw $Zll,12($Xi)
+ xor $Thl,$Zhl,$Zhl
+ stw $Zlh,8($Xi)
+ xor $Thh,$Zhh,$Zhh
+ stw $Zhl,4($Xi)
+ ldo 16($inp),$inp
+ stw $Zhh,0($Xi)
+ comb,<> $inp,$len,L\$outer_ghash_pa1
+ copy $Zll,$nlo
+___
+$code.=<<___;
+L\$done_ghash
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+
+ .ALIGN 64
+L\$rem_4bit
+ .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+ .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+ .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+ .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+ .ALIGN 64
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this if GNU assembler understood the .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+ { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+ # I only have ",u" completer, it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
+ { sprintf "\t.WORD\t0x%08x\t; %s",
+ (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $depd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "depd$mod\t$args";
+
+	# I only have ",z" completer, it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
+ { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
+ my $cpos=63-$2;
+ my $len=32-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
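+# assemble() resolves a mnemonic to one of the encoder closures above
+# via string eval; mnemonics without an encoder pass through verbatim.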
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ if ($SIZE_T==4) {
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
+ s/cmpb,\*/comb,/;
+ s/,\*/,/;
+ }
+ s/\bbv\b/bve/ if ($SIZE_T==8);
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/main/openssl/crypto/modes/asm/ghash-s390x.pl b/main/openssl/crypto/modes/asm/ghash-s390x.pl
new file mode 100644
index 00000000..6a40d5d8
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-s390x.pl
@@ -0,0 +1,262 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# September 2010.
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128-byte shared table].
+# Performance was measured to be ~18 cycles per processed byte on z10,
+# which is almost 40% better than gcc-generated code. It should be
+# noted that 18 cycles is a worse result than expected: the loop is
+# scheduled for 12 and the result should be close to 12. In the absence
+# of instruction-level profiling data it's impossible to tell why...
+
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports what's called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
+# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
+# legacy application context. The feature is not specific to any
+# particular processor, as long as it's a "z-CPU". The latter implies
+# that the code remains z/Architecture specific. On z990 it was
+# measured to perform 2.8x better than 32-bit code generated by gcc
+# 4.3.
+
+# March 2011.
+#
+# Support for hardware KIMD-GHASH is verified to produce the correct
+# result and is therefore engaged. On z196 it was measured to process
+# an 8KB buffer ~7x faster than the software implementation. It's not
+# as impressive for smaller buffer sizes, and for the smallest 16-byte
+# buffer it's actually almost 2 times slower. That is the reason why
+# KIMD-GHASH is not used in gcm_gmult_4bit.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$softonly=0;
+
+$Zhi="%r0";
+$Zlo="%r1";
+
+$Xi="%r2"; # argument block
+$Htbl="%r3";
+$inp="%r4";
+$len="%r5";
+
+$rem0="%r6"; # variables
+$rem1="%r7";
+$nlo="%r8";
+$nhi="%r9";
+$xi="%r10";
+$cnt="%r11";
+$tmp="%r12";
+$x78="%r13";
+$rem_4bit="%r14";
+
+$sp="%r15";
+
+$code.=<<___;
+.text
+
+.globl gcm_gmult_4bit
+.align 32
+gcm_gmult_4bit:
+___
+$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
+ larl %r1,OPENSSL_s390xcap_P
+ lg %r0,0(%r1)
+ tmhl %r0,0x4000 # check for message-security-assist
+ jz .Lsoft_gmult
+ lghi %r0,0
+ la %r1,16($sp)
+ .long 0xb93e0004 # kimd %r0,%r4
+ lg %r1,24($sp)
+ tmhh %r1,0x4000 # check for function 65
+ jz .Lsoft_gmult
+ stg %r0,16($sp) # arrange 16 bytes of zero input
+ stg %r0,24($sp)
+ lghi %r0,65 # function 65
+ la %r1,0($Xi) # H lies right after Xi in gcm128_context
+ la $inp,16($sp)
+ lghi $len,16
+ .long 0xb93e0004 # kimd %r0,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+ br %r14
+.align 32
+.Lsoft_gmult:
+___
+$code.=<<___;
+ stm${g} %r6,%r14,6*$SIZE_T($sp)
+
+ aghi $Xi,-1
+ lghi $len,1
+ lghi $x78,`0xf<<3`
+ larl $rem_4bit,rem_4bit
+
+ lg $Zlo,8+1($Xi) # Xi
+ j .Lgmult_shortcut
+.type gcm_gmult_4bit,\@function
+.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
+
+.globl gcm_ghash_4bit
+.align 32
+gcm_ghash_4bit:
+___
+$code.=<<___ if(!$softonly);
+ larl %r1,OPENSSL_s390xcap_P
+ lg %r0,0(%r1)
+ tmhl %r0,0x4000 # check for message-security-assist
+ jz .Lsoft_ghash
+ lghi %r0,0
+ la %r1,16($sp)
+ .long 0xb93e0004 # kimd %r0,%r4
+ lg %r1,24($sp)
+ tmhh %r1,0x4000 # check for function 65
+ jz .Lsoft_ghash
+ lghi %r0,65 # function 65
+ la %r1,0($Xi) # H lies right after Xi in gcm128_context
+ .long 0xb93e0004 # kimd %r0,$inp
+ brc 1,.-4 # pay attention to "partial completion"
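+	# (branch mask 1 tests CC3, i.e. retry while kimd reports
+	# partial completion on long buffers)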
+ br %r14
+.align 32
+.Lsoft_ghash:
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ llgfr $len,$len
+___
+$code.=<<___;
+ stm${g} %r6,%r14,6*$SIZE_T($sp)
+
+ aghi $Xi,-1
+ srlg $len,$len,4
+ lghi $x78,`0xf<<3`
+ larl $rem_4bit,rem_4bit
+
+ lg $Zlo,8+1($Xi) # Xi
+ lg $Zhi,0+1($Xi)
+ lghi $tmp,0
+.Louter:
+ xg $Zhi,0($inp) # Xi ^= inp
+ xg $Zlo,8($inp)
+ xgr $Zhi,$tmp
+ stg $Zlo,8+1($Xi)
+ stg $Zhi,0+1($Xi)
+
+.Lgmult_shortcut:
+ lghi $tmp,0xf0
+ sllg $nlo,$Zlo,4
+ srlg $xi,$Zlo,8 # extract second byte
+ ngr $nlo,$tmp
+ lgr $nhi,$Zlo
+ lghi $cnt,14
+ ngr $nhi,$tmp
+
+ lg $Zlo,8($nlo,$Htbl)
+ lg $Zhi,0($nlo,$Htbl)
+
+ sllg $nlo,$xi,4
+ sllg $rem0,$Zlo,3
+ ngr $nlo,$tmp
+ ngr $rem0,$x78
+ ngr $xi,$tmp
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ lgr $nhi,$xi
+ sllg $rem1,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem1,$x78
+ j .Lghash_inner
+.align 16
+.Lghash_inner:
+ srlg $Zlo,$Zlo,4
+ sllg $tmp,$Zhi,60
+ xg $Zlo,8($nlo,$Htbl)
+ srlg $Zhi,$Zhi,4
+ llgc $xi,0($cnt,$Xi)
+ xg $Zhi,0($nlo,$Htbl)
+ sllg $nlo,$xi,4
+ xg $Zhi,0($rem0,$rem_4bit)
+ nill $nlo,0xf0
+ sllg $rem0,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem0,$x78
+ nill $xi,0xf0
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ lgr $nhi,$xi
+ xg $Zhi,0($rem1,$rem_4bit)
+ sllg $rem1,$Zlo,3
+ xgr $Zlo,$tmp
+ ngr $rem1,$x78
+ brct $cnt,.Lghash_inner
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
+ xg $Zhi,0($nlo,$Htbl)
+ sllg $xi,$Zlo,3
+ xg $Zhi,0($rem0,$rem_4bit)
+ xgr $Zlo,$tmp
+ ngr $xi,$x78
+
+ sllg $tmp,$Zhi,60
+ srlg $Zlo,$Zlo,4
+ srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nhi,$Htbl)
+ xg $Zhi,0($nhi,$Htbl)
+ xgr $Zlo,$tmp
+ xg $Zhi,0($rem1,$rem_4bit)
+
+ lg $tmp,0($xi,$rem_4bit)
+ la $inp,16($inp)
+ sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
+ brctg $len,.Louter
+
+ xgr $Zhi,$tmp
+ stg $Zlo,8+1($Xi)
+ stg $Zhi,0+1($Xi)
+ lm${g} %r6,%r14,6*$SIZE_T($sp)
+ br %r14
+.type gcm_ghash_4bit,\@function
+.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
+
+.align 64
+rem_4bit:
+ .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
+ .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
+ .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
+ .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
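+# note: entries are the 16-bit reduction constants <<44 (<<12 within
+# the high word), not the generic <<48, because the loop folds each
+# rem_4bit[rem] term in one 4-bit shift later than the generic C code;
+# the trailing 'sllg by 4' above restores the last term.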
+.type rem_4bit,\@object
+.size rem_4bit,(.-rem_4bit)
+.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/modes/asm/ghash-sparcv9.pl b/main/openssl/crypto/modes/asm/ghash-sparcv9.pl
new file mode 100644
index 00000000..70e7b044
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-sparcv9.pl
@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [+128-byte shared table].
+# Performance results are for the streamed GHASH subroutine on an
+# UltraSPARC pre-Tx CPU and are expressed in cycles per processed byte;
+# less is better:
+#
+# gcc 3.3.x cc 5.2 this assembler
+#
+# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
+# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
+#
+# Here is data collected on an UltraSPARC T1 system running Linux:
+#
+# gcc 4.4.1 this assembler
+#
+# 32-bit build 566 50 (+1000%)
+# 64-bit build 56 50 (+12%)
+#
+# I don't quite understand why the difference between 32-bit and 64-bit
+# compiler-generated code is so big. Compilers *were* instructed to
+# generate code for UltraSPARC and should have used 64-bit registers
+# for the Z vector (see the C code) even in the 32-bit build... Oh
+# well, it only means more impressive improvement coefficients for this
+# assembler module;-) Loops are aggressively modulo-scheduled with
+# respect to references to input data and Z.hi updates to achieve the
+# 12-cycle timing. To anchor to something else, sha1-sparcv9.pl spends
+# 11.6 cycles to process one byte on an UltraSPARC pre-Tx CPU and ~24
+# on T1.
+
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64) { $bias=2047; $frame=192; }
+else { $bias=0; $frame=112; }
+
+$output=shift;
+open STDOUT,">$output";
+
+$Zhi="%o0"; # 64-bit values
+$Zlo="%o1";
+$Thi="%o2";
+$Tlo="%o3";
+$rem="%o4";
+$tmp="%o5";
+
+$nhi="%l0"; # small values and pointers
+$nlo="%l1";
+$xi0="%l2";
+$xi1="%l3";
+$rem_4bit="%l4";
+$remi="%l5";
+$Htblo="%l6";
+$cnt="%l7";
+
+$Xi="%i0"; # input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";
+
+$code.=<<___;
+.section ".text",#alloc,#execinstr
+
+.align 64
+rem_4bit:
+ .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+ .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+ .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+ .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+.type rem_4bit,#object
+.size rem_4bit,(.-rem_4bit)
+
+.globl gcm_ghash_4bit
+.align 32
+gcm_ghash_4bit:
+ save %sp,-$frame,%sp
+ ldub [$inp+15],$nlo
+ ldub [$Xi+15],$xi0
+ ldub [$Xi+14],$xi1
+ add $len,$inp,$len
+ add $Htbl,8,$Htblo
+
+1: call .+8
+ add %o7,rem_4bit-1b,$rem_4bit
+
+.Louter:
+ xor $xi0,$nlo,$nlo
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ sll $nlo,4,$nlo
+ ldx [$Htblo+$nlo],$Zlo
+ ldx [$Htbl+$nlo],$Zhi
+
+ ldub [$inp+14],$nlo
+
+ ldx [$Htblo+$nhi],$Tlo
+ and $Zlo,0xf,$remi
+ ldx [$Htbl+$nhi],$Thi
+ sll $remi,3,$remi
+ ldx [$rem_4bit+$remi],$rem
+ srlx $Zlo,4,$Zlo
+ mov 13,$cnt
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+
+ xor $xi1,$nlo,$nlo
+ and $Zlo,0xf,$remi
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ ba .Lghash_inner
+ sll $nlo,4,$nlo
+.align 32
+.Lghash_inner:
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$inp+$cnt],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ ldub [$Xi+$cnt],$xi1
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $xi1,$nlo,$nlo
+ srlx $Zhi,4,$Zhi
+ and $nlo,0xf0,$nhi
+ addcc $cnt,-1,$cnt
+ xor $Zlo,$tmp,$Zlo
+ and $nlo,0x0f,$nlo
+ xor $Tlo,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ blu .Lghash_inner
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+
+ add $inp,16,$inp
+ cmp $inp,$len
+ be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$inp+15],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+ srl $Zlo,8,$xi1
+ and $Zlo,0xff,$xi0
+ ba .Louter
+ and $xi1,0xff,$xi1
+.align 32
+.Ldone:
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+
+ ret
+ restore
+.type gcm_ghash_4bit,#function
+.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
+___
+
+undef $inp;
+undef $len;
+
+$code.=<<___;
+.globl gcm_gmult_4bit
+.align 32
+gcm_gmult_4bit:
+ save %sp,-$frame,%sp
+ ldub [$Xi+15],$nlo
+ add $Htbl,8,$Htblo
+
+1: call .+8
+ add %o7,rem_4bit-1b,$rem_4bit
+
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ sll $nlo,4,$nlo
+ ldx [$Htblo+$nlo],$Zlo
+ ldx [$Htbl+$nlo],$Zhi
+
+ ldub [$Xi+14],$nlo
+
+ ldx [$Htblo+$nhi],$Tlo
+ and $Zlo,0xf,$remi
+ ldx [$Htbl+$nhi],$Thi
+ sll $remi,3,$remi
+ ldx [$rem_4bit+$remi],$rem
+ srlx $Zlo,4,$Zlo
+ mov 13,$cnt
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+
+ and $Zlo,0xf,$remi
+ and $nlo,0xf0,$nhi
+ and $nlo,0x0f,$nlo
+ ba .Lgmult_inner
+ sll $nlo,4,$nlo
+.align 32
+.Lgmult_inner:
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ ldub [$Xi+$cnt],$nlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ srlx $Zhi,4,$Zhi
+ and $nlo,0xf0,$nhi
+ addcc $cnt,-1,$cnt
+ xor $Zlo,$tmp,$Zlo
+ and $nlo,0x0f,$nlo
+ xor $Tlo,$Zlo,$Zlo
+ sll $nlo,4,$nlo
+ blu .Lgmult_inner
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nlo],$Tlo
+ sll $remi,3,$remi
+ xor $Thi,$Zhi,$Zhi
+ ldx [$Htbl+$nlo],$Thi
+ srlx $Zlo,4,$Zlo
+ xor $rem,$Zhi,$Zhi
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ and $Zlo,0xf,$remi
+
+ ldx [$Htblo+$nhi],$Tlo
+ sll $remi,3,$remi
+ xor $rem,$Zhi,$Zhi
+ ldx [$Htbl+$nhi],$Thi
+ srlx $Zlo,4,$Zlo
+ ldx [$rem_4bit+$remi],$rem
+ sllx $Zhi,60,$tmp
+ xor $Tlo,$Zlo,$Zlo
+ srlx $Zhi,4,$Zhi
+ xor $Zlo,$tmp,$Zlo
+ xor $Thi,$Zhi,$Zhi
+ stx $Zlo,[$Xi+8]
+ xor $rem,$Zhi,$Zhi
+ stx $Zhi,[$Xi]
+
+ ret
+ restore
+.type gcm_gmult_4bit,#function
+.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
+.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/modes/asm/ghash-x86.S b/main/openssl/crypto/modes/asm/ghash-x86.S
new file mode 100644
index 00000000..cb9ae20d
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-x86.S
@@ -0,0 +1,728 @@
+.file "ghash-x86.s"
+.text
+.globl gcm_gmult_4bit_x86
+.type gcm_gmult_4bit_x86,@function
+.align 16
+gcm_gmult_4bit_x86:
+.L_gcm_gmult_4bit_x86_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $84,%esp
+ movl 104(%esp),%edi
+ movl 108(%esp),%esi
+ movl (%edi),%ebp
+ movl 4(%edi),%edx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%ebx
+ movl $0,16(%esp)
+ movl $471859200,20(%esp)
+ movl $943718400,24(%esp)
+ movl $610271232,28(%esp)
+ movl $1887436800,32(%esp)
+ movl $1822425088,36(%esp)
+ movl $1220542464,40(%esp)
+ movl $1423966208,44(%esp)
+ movl $3774873600,48(%esp)
+ movl $4246732800,52(%esp)
+ movl $3644850176,56(%esp)
+ movl $3311403008,60(%esp)
+ movl $2441084928,64(%esp)
+ movl $2376073216,68(%esp)
+ movl $2847932416,72(%esp)
+ movl $3051356160,76(%esp)
+ movl %ebp,(%esp)
+ movl %edx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %ebx,12(%esp)
+ shrl $20,%ebx
+ andl $240,%ebx
+ movl 4(%esi,%ebx,1),%ebp
+ movl (%esi,%ebx,1),%edx
+ movl 12(%esi,%ebx,1),%ecx
+ movl 8(%esi,%ebx,1),%ebx
+ xorl %eax,%eax
+ movl $15,%edi
+ jmp .L000x86_loop
+.align 16
+.L000x86_loop:
+ movb %bl,%al
+ shrdl $4,%ecx,%ebx
+ andb $15,%al
+ shrdl $4,%edx,%ecx
+ shrdl $4,%ebp,%edx
+ shrl $4,%ebp
+ xorl 16(%esp,%eax,4),%ebp
+ movb (%esp,%edi,1),%al
+ andb $240,%al
+ xorl 8(%esi,%eax,1),%ebx
+ xorl 12(%esi,%eax,1),%ecx
+ xorl (%esi,%eax,1),%edx
+ xorl 4(%esi,%eax,1),%ebp
+ decl %edi
+ js .L001x86_break
+ movb %bl,%al
+ shrdl $4,%ecx,%ebx
+ andb $15,%al
+ shrdl $4,%edx,%ecx
+ shrdl $4,%ebp,%edx
+ shrl $4,%ebp
+ xorl 16(%esp,%eax,4),%ebp
+ movb (%esp,%edi,1),%al
+ shlb $4,%al
+ xorl 8(%esi,%eax,1),%ebx
+ xorl 12(%esi,%eax,1),%ecx
+ xorl (%esi,%eax,1),%edx
+ xorl 4(%esi,%eax,1),%ebp
+ jmp .L000x86_loop
+.align 16
+.L001x86_break:
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ bswap %ebp
+ movl 104(%esp),%edi
+ movl %ebx,12(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,4(%edi)
+ movl %ebp,(%edi)
+ addl $84,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_gmult_4bit_x86,.-.L_gcm_gmult_4bit_x86_begin
+.globl gcm_ghash_4bit_x86
+.type gcm_ghash_4bit_x86,@function
+.align 16
+gcm_ghash_4bit_x86:
+.L_gcm_ghash_4bit_x86_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $84,%esp
+ movl 104(%esp),%ebx
+ movl 108(%esp),%esi
+ movl 112(%esp),%edi
+ movl 116(%esp),%ecx
+ addl %edi,%ecx
+ movl %ecx,116(%esp)
+ movl (%ebx),%ebp
+ movl 4(%ebx),%edx
+ movl 8(%ebx),%ecx
+ movl 12(%ebx),%ebx
+ movl $0,16(%esp)
+ movl $471859200,20(%esp)
+ movl $943718400,24(%esp)
+ movl $610271232,28(%esp)
+ movl $1887436800,32(%esp)
+ movl $1822425088,36(%esp)
+ movl $1220542464,40(%esp)
+ movl $1423966208,44(%esp)
+ movl $3774873600,48(%esp)
+ movl $4246732800,52(%esp)
+ movl $3644850176,56(%esp)
+ movl $3311403008,60(%esp)
+ movl $2441084928,64(%esp)
+ movl $2376073216,68(%esp)
+ movl $2847932416,72(%esp)
+ movl $3051356160,76(%esp)
+.align 16
+.L002x86_outer_loop:
+ xorl 12(%edi),%ebx
+ xorl 8(%edi),%ecx
+ xorl 4(%edi),%edx
+ xorl (%edi),%ebp
+ movl %ebx,12(%esp)
+ movl %ecx,8(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,(%esp)
+ shrl $20,%ebx
+ andl $240,%ebx
+ movl 4(%esi,%ebx,1),%ebp
+ movl (%esi,%ebx,1),%edx
+ movl 12(%esi,%ebx,1),%ecx
+ movl 8(%esi,%ebx,1),%ebx
+ xorl %eax,%eax
+ movl $15,%edi
+ jmp .L003x86_loop
+.align 16
+.L003x86_loop:
+ movb %bl,%al
+ shrdl $4,%ecx,%ebx
+ andb $15,%al
+ shrdl $4,%edx,%ecx
+ shrdl $4,%ebp,%edx
+ shrl $4,%ebp
+ xorl 16(%esp,%eax,4),%ebp
+ movb (%esp,%edi,1),%al
+ andb $240,%al
+ xorl 8(%esi,%eax,1),%ebx
+ xorl 12(%esi,%eax,1),%ecx
+ xorl (%esi,%eax,1),%edx
+ xorl 4(%esi,%eax,1),%ebp
+ decl %edi
+ js .L004x86_break
+ movb %bl,%al
+ shrdl $4,%ecx,%ebx
+ andb $15,%al
+ shrdl $4,%edx,%ecx
+ shrdl $4,%ebp,%edx
+ shrl $4,%ebp
+ xorl 16(%esp,%eax,4),%ebp
+ movb (%esp,%edi,1),%al
+ shlb $4,%al
+ xorl 8(%esi,%eax,1),%ebx
+ xorl 12(%esi,%eax,1),%ecx
+ xorl (%esi,%eax,1),%edx
+ xorl 4(%esi,%eax,1),%ebp
+ jmp .L003x86_loop
+.align 16
+.L004x86_break:
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ bswap %ebp
+ movl 112(%esp),%edi
+ leal 16(%edi),%edi
+ cmpl 116(%esp),%edi
+ movl %edi,112(%esp)
+ jb .L002x86_outer_loop
+ movl 104(%esp),%edi
+ movl %ebx,12(%edi)
+ movl %ecx,8(%edi)
+ movl %edx,4(%edi)
+ movl %ebp,(%edi)
+ addl $84,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
+.type _mmx_gmult_4bit_inner,@function
+.align 16
+_mmx_gmult_4bit_inner:
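+# fully unrolled rendition of the MMX inner loop: one table-lookup,
+# shift and reduce step per nibble of Xi; as set up by its callers (not
+# shown here), %edi points at Xi, %esi at the Htable, %eax at the
+# 16-entry, 8-byte-per-entry reduction table, and %ebx holds Xi[15]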
+ xorl %ecx,%ecx
+ movl %ebx,%edx
+ movb %dl,%cl
+ shlb $4,%cl
+ andl $240,%edx
+ movq 8(%esi,%ecx,1),%mm0
+ movq (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 14(%edi),%cl
+ psllq $60,%mm2
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 13(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 12(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 11(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 10(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 9(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 8(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 7(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 6(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 5(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 4(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 3(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 2(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb 1(%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ movb (%edi),%cl
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movl %ecx,%edx
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ shlb $4,%cl
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%ecx,1),%mm0
+ psllq $60,%mm2
+ andl $240,%edx
+ pxor (%eax,%ebp,8),%mm1
+ andl $15,%ebx
+ pxor (%esi,%ecx,1),%mm1
+ movd %mm0,%ebp
+ pxor %mm2,%mm0
+ psrlq $4,%mm0
+ movq %mm1,%mm2
+ psrlq $4,%mm1
+ pxor 8(%esi,%edx,1),%mm0
+ psllq $60,%mm2
+ pxor (%eax,%ebx,8),%mm1
+ andl $15,%ebp
+ pxor (%esi,%edx,1),%mm1
+ movd %mm0,%ebx
+ pxor %mm2,%mm0
+ movl 4(%eax,%ebp,8),%edi
+ psrlq $32,%mm0
+ movd %mm1,%edx
+ psrlq $32,%mm1
+ movd %mm0,%ecx
+ movd %mm1,%ebp
+ shll $4,%edi
+ bswap %ebx
+ bswap %edx
+ bswap %ecx
+ xorl %edi,%ebp
+ bswap %ebp
+ ret
+.size _mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner
+.globl gcm_gmult_4bit_mmx
+.type gcm_gmult_4bit_mmx,@function
+.align 16
+gcm_gmult_4bit_mmx:
+.L_gcm_gmult_4bit_mmx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ call .L005pic_point
+.L005pic_point:
+ popl %eax
+ leal .Lrem_4bit-.L005pic_point(%eax),%eax
+ movzbl 15(%edi),%ebx
+ call _mmx_gmult_4bit_inner
+ movl 20(%esp),%edi
+ emms
+ movl %ebx,12(%edi)
+ movl %edx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %ebp,(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_gmult_4bit_mmx,.-.L_gcm_gmult_4bit_mmx_begin
+.globl gcm_ghash_4bit_mmx
+.type gcm_ghash_4bit_mmx,@function
+.align 16
+gcm_ghash_4bit_mmx:
+.L_gcm_ghash_4bit_mmx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ecx
+ call .L006pic_point
+.L006pic_point:
+ popl %eax
+ leal .Lrem_4bit-.L006pic_point(%eax),%eax
+ addl %edi,%ecx
+ movl %ecx,32(%esp)
+ subl $20,%esp
+ movl 12(%ebp),%ebx
+ movl 4(%ebp),%edx
+ movl 8(%ebp),%ecx
+ movl (%ebp),%ebp
+ jmp .L007mmx_outer_loop
+.align 16
+.L007mmx_outer_loop:
+ xorl 12(%edi),%ebx
+ xorl 4(%edi),%edx
+ xorl 8(%edi),%ecx
+ xorl (%edi),%ebp
+ movl %edi,48(%esp)
+ movl %ebx,12(%esp)
+ movl %edx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %ebp,(%esp)
+ movl %esp,%edi
+ shrl $24,%ebx
+ call _mmx_gmult_4bit_inner
+ movl 48(%esp),%edi
+ leal 16(%edi),%edi
+ cmpl 52(%esp),%edi
+ jb .L007mmx_outer_loop
+ movl 40(%esp),%edi
+ emms
+ movl %ebx,12(%edi)
+ movl %edx,4(%edi)
+ movl %ecx,8(%edi)
+ movl %ebp,(%edi)
+ addl $20,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
+.align 64
+.Lrem_4bit:
+.long 0,0,0,29491200,0,58982400,0,38141952
+.long 0,117964800,0,113901568,0,76283904,0,88997888
+.long 0,235929600,0,265420800,0,227803136,0,206962688
+.long 0,152567808,0,148504576,0,177995776,0,190709760
+.align 64
+.L008rem_8bit:
+.value 0,450,900,582,1800,1738,1164,1358
+.value 3600,4050,3476,3158,2328,2266,2716,2910
+.value 7200,7650,8100,7782,6952,6890,6316,6510
+.value 4656,5106,4532,4214,5432,5370,5820,6014
+.value 14400,14722,15300,14854,16200,16010,15564,15630
+.value 13904,14226,13780,13334,12632,12442,13020,13086
+.value 9312,9634,10212,9766,9064,8874,8428,8494
+.value 10864,11186,10740,10294,11640,11450,12028,12094
+.value 28800,28994,29444,29382,30600,30282,29708,30158
+.value 32400,32594,32020,31958,31128,30810,31260,31710
+.value 27808,28002,28452,28390,27560,27242,26668,27118
+.value 25264,25458,24884,24822,26040,25722,26172,26622
+.value 18624,18690,19268,19078,20424,19978,19532,19854
+.value 18128,18194,17748,17558,16856,16410,16988,17310
+.value 21728,21794,22372,22182,21480,21034,20588,20910
+.value 23280,23346,22900,22710,24056,23610,24188,24510
+.value 57600,57538,57988,58182,58888,59338,58764,58446
+.value 61200,61138,60564,60758,59416,59866,60316,59998
+.value 64800,64738,65188,65382,64040,64490,63916,63598
+.value 62256,62194,61620,61814,62520,62970,63420,63102
+.value 55616,55426,56004,56070,56904,57226,56780,56334
+.value 55120,54930,54484,54550,53336,53658,54236,53790
+.value 50528,50338,50916,50982,49768,50090,49644,49198
+.value 52080,51890,51444,51510,52344,52666,53244,52798
+.value 37248,36930,37380,37830,38536,38730,38156,38094
+.value 40848,40530,39956,40406,39064,39258,39708,39646
+.value 36256,35938,36388,36838,35496,35690,35116,35054
+.value 33712,33394,32820,33270,33976,34170,34620,34558
+.value 43456,43010,43588,43910,44744,44810,44364,44174
+.value 42960,42514,42068,42390,41176,41242,41820,41630
+.value 46560,46114,46692,47014,45800,45866,45420,45230
+.value 48112,47666,47220,47542,48376,48442,49020,48830
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte 0
diff --git a/main/openssl/crypto/modes/asm/ghash-x86.pl b/main/openssl/crypto/modes/asm/ghash-x86.pl
new file mode 100644
index 00000000..2426cd0c
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-x86.pl
@@ -0,0 +1,1342 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, May, June 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses a 256-byte per-key table [+64/128 bytes of fixed table]. It has two
+# code paths: vanilla x86 and vanilla MMX. The former is executed on
+# 486 and Pentium, the latter on all others. MMX GHASH features a so-called
+# "528B" variant of the "4-bit" method utilizing an additional 256+16 bytes
+# of per-key storage [+512 bytes of shared table]. Performance results
+# are for the streamed GHASH subroutine and are expressed in cycles per
+# processed byte; less is better:
+#
+# gcc 2.95.3(*) MMX assembler x86 assembler
+#
+# Pentium 105/111(**) - 50
+# PIII 68 /75 12.2 24
+# P4 125/125 17.8 84(***)
+# Opteron 66 /70 10.1 30
+# Core2 54 /67 8.4 18
+#
+# (*)	gcc 3.4.x was observed to generate a few percent slower code,
+#	which is one of the reasons why the 2.95.3 results were chosen;
+#	another reason is the lack of 3.4.x results for older CPUs;
+# comparison with MMX results is not completely fair, because C
+# results are for vanilla "256B" implementation, while
+# assembler results are for "528B";-)
+# (**)	second number is the result for code compiled with the -fPIC flag,
+# which is actually more relevant, because assembler code is
+# position-independent;
+# (***) see comment in non-MMX routine for further details;
+#
+# To summarize, it's >2-5 times faster than gcc-generated code. To
+# anchor it to something else: SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores. As for the choice of MMX in
+# particular, see comment at the end of the file...
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+# The question is how close it is to the theoretical limit. The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that a single
+# Karatsuba multiplication would take 28 cycles *plus* a few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while, we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is the time to perform reduction
+# and Naggr is the aggregation factor.
+#
+# Before we proceed to this implementation, let's have a closer look at
+# the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies Tmod is estimated as ~19
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite an optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in the same subroutine, the
+# former's performance is really limited to the above (Tmul + Tmod/Naggr)
+# equation. But if the GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So the
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr)/16,
+# where Tproc is the time required for Karatsuba pre- and post-processing,
+# is a more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications, the performance should be between
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better because the larger register
+# bank allows reduction and multiplication to be interleaved better.
+#
+# Does it make sense to increase Naggr? To start with, it's virtually
+# impossible in 32-bit mode, because of limited register bank
+# capacity. Otherwise the improvement has to be weighed against slower
+# setup, as well as code size and complexity increase. As even
+# optimistic estimate doesn't promise 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
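+# As an editorial aside (not part of the original module), the figures
+# quoted above all follow from the (28+Tmod/Naggr)/16 estimate; the
+# hypothetical helper ghash_cpb() below merely reproduces them:
+#
+sub ghash_cpb { my ($tmod,$naggr)=@_; return (28+$tmod/$naggr)/16; }
+# ghash_cpb(19,4) = 2.05 - Intel's estimate (Tmod~19, Naggr=4)
+# ghash_cpb(13,2) = 2.16 - this module's estimate (Tmod~13, Naggr=2)
+# ghash_cpb( 5,2) = 1.91 - the "realistic" bound (Tproc~5 in place of Tmod)
+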
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on the same XMM register, the PCLMULQDQ subroutine was measured to
+# process one byte in 2.07 cycles on Sandy Bridge, and in 2.12 on Westmere.
+# The minor regression on Westmere is outweighed by ~15% improvement
+# on Sandy Bridge. Strangely enough, an attempt to modify the 64-bit code
+# in a similar manner resulted in almost 20% degradation on Sandy Bridge,
+# where the original 64-bit code processes one byte in 1.95 cycles.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
+$inp = "edi";
+$Htbl = "esi";
+
+$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
+		# than unrolled, which has to be weighed against
+ # 2.5x x86-specific code size reduction.
+
+sub x86_loop {
+ my $off = shift;
+ my $rem = "eax";
+
+ &mov ($Zhh,&DWP(4,$Htbl,$Zll));
+ &mov ($Zhl,&DWP(0,$Htbl,$Zll));
+ &mov ($Zlh,&DWP(12,$Htbl,$Zll));
+ &mov ($Zll,&DWP(8,$Htbl,$Zll));
+ &xor ($rem,$rem); # avoid partial register stalls on PIII
+
+	# shrd practically kills P4 (2.5x deterioration), but P4 has an
+	# MMX code-path to execute. shrd runs a tad faster [than twice
+	# the shifts, mov's and or's] on pre-MMX Pentium (as well as
+	# PIII and Core2), *but* minimizes code size, spares a register
+	# and thus allows the loop to be folded...
+ if (!$unroll) {
+ my $cnt = $inp;
+ &mov ($cnt,15);
+ &jmp (&label("x86_loop"));
+ &set_label("x86_loop",16);
+ for($i=1;$i<=2;$i++) {
+ &mov (&LB($rem),&LB($Zll));
+ &shrd ($Zll,$Zlh,4);
+ &and (&LB($rem),0xf);
+ &shrd ($Zlh,$Zhl,4);
+ &shrd ($Zhl,$Zhh,4);
+ &shr ($Zhh,4);
+ &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+ &mov (&LB($rem),&BP($off,"esp",$cnt));
+ if ($i&1) {
+ &and (&LB($rem),0xf0);
+ } else {
+ &shl (&LB($rem),4);
+ }
+
+ &xor ($Zll,&DWP(8,$Htbl,$rem));
+ &xor ($Zlh,&DWP(12,$Htbl,$rem));
+ &xor ($Zhl,&DWP(0,$Htbl,$rem));
+ &xor ($Zhh,&DWP(4,$Htbl,$rem));
+
+ if ($i&1) {
+ &dec ($cnt);
+ &js (&label("x86_break"));
+ } else {
+ &jmp (&label("x86_loop"));
+ }
+ }
+ &set_label("x86_break",16);
+ } else {
+ for($i=1;$i<32;$i++) {
+ &comment($i);
+ &mov (&LB($rem),&LB($Zll));
+ &shrd ($Zll,$Zlh,4);
+ &and (&LB($rem),0xf);
+ &shrd ($Zlh,$Zhl,4);
+ &shrd ($Zhl,$Zhh,4);
+ &shr ($Zhh,4);
+ &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+ if ($i&1) {
+ &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
+ &and (&LB($rem),0xf0);
+ } else {
+ &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
+ &shl (&LB($rem),4);
+ }
+
+ &xor ($Zll,&DWP(8,$Htbl,$rem));
+ &xor ($Zlh,&DWP(12,$Htbl,$rem));
+ &xor ($Zhl,&DWP(0,$Htbl,$rem));
+ &xor ($Zhh,&DWP(4,$Htbl,$rem));
+ }
+ }
+ &bswap ($Zll);
+ &bswap ($Zlh);
+ &bswap ($Zhl);
+ if (!$x86only) {
+ &bswap ($Zhh);
+ } else {
+ &mov ("eax",$Zhh);
+ &bswap ("eax");
+ &mov ($Zhh,"eax");
+ }
+}
+
+if ($unroll) {
+ &function_begin_B("_x86_gmult_4bit_inner");
+ &x86_loop(4);
+ &ret ();
+ &function_end_B("_x86_gmult_4bit_inner");
+}
+
+sub deposit_rem_4bit {
+ my $bias = shift;
+
+ &mov (&DWP($bias+0, "esp"),0x0000<<16);
+ &mov (&DWP($bias+4, "esp"),0x1C20<<16);
+ &mov (&DWP($bias+8, "esp"),0x3840<<16);
+ &mov (&DWP($bias+12,"esp"),0x2460<<16);
+ &mov (&DWP($bias+16,"esp"),0x7080<<16);
+ &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
+ &mov (&DWP($bias+24,"esp"),0x48C0<<16);
+ &mov (&DWP($bias+28,"esp"),0x54E0<<16);
+ &mov (&DWP($bias+32,"esp"),0xE100<<16);
+ &mov (&DWP($bias+36,"esp"),0xFD20<<16);
+ &mov (&DWP($bias+40,"esp"),0xD940<<16);
+ &mov (&DWP($bias+44,"esp"),0xC560<<16);
+ &mov (&DWP($bias+48,"esp"),0x9180<<16);
+ &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
+ &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
+ &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
+}
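+
+# Editorial sketch (an inference from the values above, unused by the
+# module): each constant deposited by deposit_rem_4bit is the carry-less
+# GF(2) product of its 4-bit index and 0x1C2 - the reflected reduction
+# polynomial byte 0xE1 shifted left by one - itself shifted left by 4;
+# the <<16 in deposit_rem_4bit then moves each value into the top bits.
+# The hypothetical rem4() reproduces them:
+sub rem4 {
+	my $i = shift;			# dropped 4-bit nibble
+	my $r = 0;
+	for my $bit (0..3) {		# carry-less multiply by 0x1C2
+		$r ^= 0x1C2<<$bit if (($i>>$bit)&1);
+	}
+	return $r<<4;			# rem4(1)==0x1C20, rem4(8)==0xE100
+}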
+
+$suffix = $x86only ? "" : "_x86";
+
+&function_begin("gcm_gmult_4bit".$suffix);
+ &stack_push(16+4+1); # +1 for stack alignment
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$inp));
+ &mov ($Zlh,&DWP(8,$inp));
+ &mov ($Zll,&DWP(12,$inp));
+
+ &deposit_rem_4bit(16);
+
+ &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(12,"esp"),$Zll);
+ &shr ($Zll,20);
+ &and ($Zll,0xf0);
+
+ if ($unroll) {
+ &call ("_x86_gmult_4bit_inner");
+ } else {
+ &x86_loop(0);
+ &mov ($inp,&wparam(0));
+ }
+
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(0,$inp),$Zhh);
+ &stack_pop(16+4+1);
+&function_end("gcm_gmult_4bit".$suffix);
+
+&function_begin("gcm_ghash_4bit".$suffix);
+ &stack_push(16+4+1); # +1 for 64-bit alignment
+ &mov ($Zll,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ("ecx",&wparam(3)); # load len
+ &add ("ecx",$inp);
+ &mov (&wparam(3),"ecx");
+
+ &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$Zll));
+ &mov ($Zlh,&DWP(8,$Zll));
+ &mov ($Zll,&DWP(12,$Zll));
+
+ &deposit_rem_4bit(16);
+
+ &set_label("x86_outer_loop",16);
+ &xor ($Zll,&DWP(12,$inp)); # xor with input
+ &xor ($Zlh,&DWP(8,$inp));
+ &xor ($Zhl,&DWP(4,$inp));
+ &xor ($Zhh,&DWP(0,$inp));
+ &mov (&DWP(12,"esp"),$Zll); # dump it on stack
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(0,"esp"),$Zhh);
+
+ &shr ($Zll,20);
+ &and ($Zll,0xf0);
+
+ if ($unroll) {
+ &call ("_x86_gmult_4bit_inner");
+ } else {
+ &x86_loop(0);
+ &mov ($inp,&wparam(2));
+ }
+ &lea ($inp,&DWP(16,$inp));
+ &cmp ($inp,&wparam(3));
+ &mov (&wparam(2),$inp) if (!$unroll);
+ &jb (&label("x86_outer_loop"));
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(0,$inp),$Zhh);
+ &stack_pop(16+4+1);
+&function_end("gcm_ghash_4bit".$suffix);
+
+if (!$x86only) {{{
+
+&static_label("rem_4bit");
+
+if (!$sse2) {{ # pure-MMX "May" version...
+
+$S=12; # shift factor for rem_4bit
+
+&function_begin_B("_mmx_gmult_4bit_inner");
+# MMX version performs 3.5 times better on P4 (see comment in non-MMX
+# routine for further details), 100% better on Opteron, ~70% better
+# on Core2 and PIII... In other words, the effort is considered to be
+# well spent... Since the initial release the loop was unrolled in order
+# to "liberate" the register previously used as the loop counter. Instead
+# it's used to optimize the critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
+# The path involves a move of Z.lo from an MMX to an integer register,
+# effective address calculation and finally a merge of the value into
+# Z.hi. The reference to rem_4bit is scheduled so late that I had to >>4
+# the rem_4bit elements. This resulted in a 20-45% improvement on
+# contemporary µ-archs.
+{
+ my $cnt;
+ my $rem_4bit = "eax";
+ my @rem = ($Zhh,$Zll);
+ my $nhi = $Zhl;
+ my $nlo = $Zlh;
+
+ my ($Zlo,$Zhi) = ("mm0","mm1");
+ my $tmp = "mm2";
+
+ &xor ($nlo,$nlo); # avoid partial register stalls on PIII
+ &mov ($nhi,$Zll);
+ &mov (&LB($nlo),&LB($nhi));
+ &shl (&LB($nlo),4);
+ &and ($nhi,0xf0);
+ &movq ($Zlo,&QWP(8,$Htbl,$nlo));
+ &movq ($Zhi,&QWP(0,$Htbl,$nlo));
+ &movd ($rem[0],$Zlo);
+
+ for ($cnt=28;$cnt>=-2;$cnt--) {
+ my $odd = $cnt&1;
+ my $nix = $odd ? $nlo : $nhi;
+
+ &shl (&LB($nlo),4) if ($odd);
+ &psrlq ($Zlo,4);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nix));
+ &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
+ &psllq ($tmp,60);
+ &and ($nhi,0xf0) if ($odd);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
+ &and ($rem[0],0xf);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nix));
+ &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
+ &movd ($rem[1],$Zlo);
+ &pxor ($Zlo,$tmp);
+
+ push (@rem,shift(@rem)); # "rotate" registers
+ }
+
+ &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
+
+ &psrlq ($Zlo,32); # lower part of Zlo is already there
+ &movd ($Zhl,$Zhi);
+ &psrlq ($Zhi,32);
+ &movd ($Zlh,$Zlo);
+ &movd ($Zhh,$Zhi);
+ &shl ($inp,4); # compensate for rem_4bit[i] being >>4
+
+ &bswap ($Zll);
+ &bswap ($Zhl);
+ &bswap ($Zlh);
+ &xor ($Zhh,$inp);
+ &bswap ($Zhh);
+
+ &ret ();
+}
+&function_end_B("_mmx_gmult_4bit_inner");
+
+&function_begin("gcm_gmult_4bit_mmx");
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &movz ($Zll,&BP(15,$inp));
+
+ &call ("_mmx_gmult_4bit_inner");
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+# Streamed version performs 20% better on P4, 7% on Opteron,
+# 10% on Core2 and PIII...
+&function_begin("gcm_ghash_4bit_mmx");
+ &mov ($Zhh,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ($Zlh,&wparam(3)); # load len
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &add ($Zlh,$inp);
+ &mov (&wparam(3),$Zlh); # len to point at the end of input
+ &stack_push(4+1); # +1 for stack alignment
+
+ &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
+ &mov ($Zhl,&DWP(4,$Zhh));
+ &mov ($Zlh,&DWP(8,$Zhh));
+ &mov ($Zhh,&DWP(0,$Zhh));
+ &jmp (&label("mmx_outer_loop"));
+
+ &set_label("mmx_outer_loop",16);
+ &xor ($Zll,&DWP(12,$inp));
+ &xor ($Zhl,&DWP(4,$inp));
+ &xor ($Zlh,&DWP(8,$inp));
+ &xor ($Zhh,&DWP(0,$inp));
+ &mov (&wparam(2),$inp);
+ &mov (&DWP(12,"esp"),$Zll);
+ &mov (&DWP(4,"esp"),$Zhl);
+ &mov (&DWP(8,"esp"),$Zlh);
+ &mov (&DWP(0,"esp"),$Zhh);
+
+ &mov ($inp,"esp");
+ &shr ($Zll,24);
+
+ &call ("_mmx_gmult_4bit_inner");
+
+ &mov ($inp,&wparam(2));
+ &lea ($inp,&DWP(16,$inp));
+ &cmp ($inp,&wparam(3));
+ &jb (&label("mmx_outer_loop"));
+
+ &mov ($inp,&wparam(0)); # load Xi
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+
+ &stack_pop(4+1);
+&function_end("gcm_ghash_4bit_mmx");
+
+}} else {{ # "June" MMX version...
+ # ... has slower "April" gcm_gmult_4bit_mmx with folded
+ # loop. This is done to conserve code size...
+$S=16; # shift factor for rem_4bit
+
+sub mmx_loop() {
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron and Core2, 50%
+# better on PIII... In other words, the effort is considered to be well
+# spent...
+ my $inp = shift;
+ my $rem_4bit = shift;
+ my $cnt = $Zhh;
+ my $nhi = $Zhl;
+ my $nlo = $Zlh;
+ my $rem = $Zll;
+
+ my ($Zlo,$Zhi) = ("mm0","mm1");
+ my $tmp = "mm2";
+
+ &xor ($nlo,$nlo); # avoid partial register stalls on PIII
+ &mov ($nhi,$Zll);
+ &mov (&LB($nlo),&LB($nhi));
+ &mov ($cnt,14);
+ &shl (&LB($nlo),4);
+ &and ($nhi,0xf0);
+ &movq ($Zlo,&QWP(8,$Htbl,$nlo));
+ &movq ($Zhi,&QWP(0,$Htbl,$nlo));
+ &movd ($rem,$Zlo);
+ &jmp (&label("mmx_loop"));
+
+ &set_label("mmx_loop",16);
+ &psrlq ($Zlo,4);
+ &and ($rem,0xf);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
+ &mov (&LB($nlo),&BP(0,$inp,$cnt));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &dec ($cnt);
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
+ &mov ($nhi,$nlo);
+ &pxor ($Zlo,$tmp);
+ &js (&label("mmx_break"));
+
+ &shl (&LB($nlo),4);
+ &and ($rem,0xf);
+ &psrlq ($Zlo,4);
+ &and ($nhi,0xf0);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
+ &pxor ($Zlo,$tmp);
+ &jmp (&label("mmx_loop"));
+
+ &set_label("mmx_break",16);
+ &shl (&LB($nlo),4);
+ &and ($rem,0xf);
+ &psrlq ($Zlo,4);
+ &and ($nhi,0xf0);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
+ &pxor ($Zlo,$tmp);
+
+ &psrlq ($Zlo,4);
+ &and ($rem,0xf);
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
+ &psllq ($tmp,60);
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+ &movd ($rem,$Zlo);
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
+ &pxor ($Zlo,$tmp);
+
+ &psrlq ($Zlo,32); # lower part of Zlo is already there
+ &movd ($Zhl,$Zhi);
+ &psrlq ($Zhi,32);
+ &movd ($Zlh,$Zlo);
+ &movd ($Zhh,$Zhi);
+
+ &bswap ($Zll);
+ &bswap ($Zhl);
+ &bswap ($Zlh);
+ &bswap ($Zhh);
+}
+
+&function_begin("gcm_gmult_4bit_mmx");
+ &mov ($inp,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("eax");
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+ &movz ($Zll,&BP(15,$inp));
+
+ &mmx_loop($inp,"eax");
+
+ &emms ();
+ &mov (&DWP(12,$inp),$Zll);
+ &mov (&DWP(4,$inp),$Zhl);
+ &mov (&DWP(8,$inp),$Zlh);
+ &mov (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+######################################################################
+# The subroutine below is the "528B" variant of the "4-bit" GCM GHASH
+# function (see gcm128.c for details). It provides a further 20-40%
+# performance improvement over the above-mentioned "May" version.
+
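+# (Editorial note: in this implementation the 528 bytes live on the stack:
+# 512 bytes of decomposed per-key tables - the low/high halves of Htable[]
+# and of Htable[]>>4 - plus 16 bytes of (u8)(Htable[]<<4); see the frame
+# setup below.)
+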
+&static_label("rem_8bit");
+
+&function_begin("gcm_ghash_4bit_mmx");
+{ my ($Zlo,$Zhi) = ("mm7","mm6");
+ my $rem_8bit = "esi";
+ my $Htbl = "ebx";
+
+ # parameter block
+ &mov ("eax",&wparam(0)); # Xi
+ &mov ("ebx",&wparam(1)); # Htable
+ &mov ("ecx",&wparam(2)); # inp
+ &mov ("edx",&wparam(3)); # len
+ &mov ("ebp","esp"); # original %esp
+ &call (&label("pic_point"));
+ &set_label ("pic_point");
+ &blindpop ($rem_8bit);
+ &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
+
+ &sub ("esp",512+16+16); # allocate stack frame...
+ &and ("esp",-64); # ...and align it
+ &sub ("esp",16); # place for (u8)(H[]<<4)
+
+ &add ("edx","ecx"); # pointer to the end of input
+ &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
+ &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
+ &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
+
+ { my @lo = ("mm0","mm1","mm2");
+ my @hi = ("mm3","mm4","mm5");
+ my @tmp = ("mm6","mm7");
+ my ($off1,$off2,$i) = (0,0,);
+
+ &add ($Htbl,128); # optimize for size
+ &lea ("edi",&DWP(16+128,"esp"));
+ &lea ("ebp",&DWP(16+256+128,"esp"));
+
+ # decompose Htable (low and high parts are kept separately),
+ # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
+ for ($i=0;$i<18;$i++) {
+
+ &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
+ &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
+ &psllq ($tmp[1],60) if ($i>1);
+ &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
+ &por ($lo[2],$tmp[1]) if ($i>1);
+ &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
+ &psrlq ($lo[1],4) if ($i>0 && $i<17);
+ &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
+ &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
+ &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
+ &psrlq ($hi[1],4) if ($i>0 && $i<17);
+ &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
+ &shl ("edx",4) if ($i<16);
+ &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
+
+ unshift (@lo,pop(@lo)); # "rotate" registers
+ unshift (@hi,pop(@hi));
+ unshift (@tmp,pop(@tmp));
+ $off1 += 8 if ($i>0);
+ $off2 += 8 if ($i>1);
+ }
+ }
+
+ &movq ($Zhi,&QWP(0,"eax"));
+ &mov ("ebx",&DWP(8,"eax"));
+ &mov ("edx",&DWP(12,"eax")); # load Xi
+
+&set_label("outer",16);
+ { my $nlo = "eax";
+ my $dat = "edx";
+ my @nhi = ("edi","ebp");
+ my @rem = ("ebx","ecx");
+ my @red = ("mm0","mm1","mm2");
+ my $tmp = "mm3";
+
+ &xor ($dat,&DWP(12,"ecx")); # merge input data
+ &xor ("ebx",&DWP(8,"ecx"));
+ &pxor ($Zhi,&QWP(0,"ecx"));
+ &lea ("ecx",&DWP(16,"ecx")); # inp+=16
+ #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
+ &mov (&DWP(528+8,"esp"),"ebx");
+ &movq (&QWP(528+0,"esp"),$Zhi);
+ &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
+
+ &xor ($nlo,$nlo);
+ &rol ($dat,8);
+ &mov (&LB($nlo),&LB($dat));
+ &mov ($nhi[1],$nlo);
+ &and (&LB($nlo),0x0f);
+ &shr ($nhi[1],4);
+ &pxor ($red[0],$red[0]);
+ &rol ($dat,8); # next byte
+ &pxor ($red[1],$red[1]);
+ &pxor ($red[2],$red[2]);
+
+	# Just like in the "May" version, modulo-schedule for the critical
+	# path in 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. The final
+	# 'pxor' is scheduled so late that rem_8bit[] has to be shifted *right*
+	# by 16, which is why the last argument to pinsrw is 2, which
+ # corresponds to <<32=<<48>>16...
+ for ($j=11,$i=0;$i<15;$i++) {
+
+ if ($i>0) {
+ &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
+ &rol ($dat,8); # next byte
+ &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+ &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
+ } else {
+ &movq ($Zlo,&QWP(16,"esp",$nlo,8));
+ &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
+ }
+
+ &mov (&LB($nlo),&LB($dat));
+ &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
+
+ &movd ($rem[0],$Zlo);
+ &movz ($rem[1],&LB($rem[1])) if ($i>0);
+ &psrlq ($Zlo,8); # Z>>=8
+
+ &movq ($tmp,$Zhi);
+ &mov ($nhi[0],$nlo);
+ &psrlq ($Zhi,8);
+
+ &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
+ &and (&LB($nlo),0x0f);
+ &psllq ($tmp,56);
+
+ &pxor ($Zhi,$red[1]) if ($i>1);
+ &shr ($nhi[0],4);
+ &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
+
+ unshift (@red,pop(@red)); # "rotate" registers
+ unshift (@rem,pop(@rem));
+ unshift (@nhi,pop(@nhi));
+ }
+
+ &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
+ &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
+ &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+ &movz ($rem[1],&LB($rem[1]));
+
+ &pxor ($red[2],$red[2]); # clear 2nd word
+ &psllq ($red[1],4);
+
+ &movd ($rem[0],$Zlo);
+ &psrlq ($Zlo,4); # Z>>=4
+
+ &movq ($tmp,$Zhi);
+ &psrlq ($Zhi,4);
+ &shl ($rem[0],4); # rem<<4
+
+ &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
+ &psllq ($tmp,60);
+ &movz ($rem[0],&LB($rem[0]));
+
+ &pxor ($Zlo,$tmp);
+ &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
+
+ &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
+ &pxor ($Zhi,$red[1]);
+
+ &movd ($dat,$Zlo);
+ &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
+
+ &psllq ($red[0],12); # correct by <<16>>4
+ &pxor ($Zhi,$red[0]);
+ &psrlq ($Zlo,32);
+ &pxor ($Zhi,$red[2]);
+
+ &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
+ &movd ("ebx",$Zlo);
+ &movq ($tmp,$Zhi); # 01234567
+ &psllw ($Zhi,8); # 1.3.5.7.
+ &psrlw ($tmp,8); # .0.2.4.6
+ &por ($Zhi,$tmp); # 10325476
+ &bswap ($dat);
+ &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
+ &bswap ("ebx");
+
+ &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
+ &jne (&label("outer"));
+ }
+
+ &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
+ &mov (&DWP(12,"eax"),"edx");
+ &mov (&DWP(8,"eax"),"ebx");
+ &movq (&QWP(0,"eax"),$Zhi);
+
+ &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
+ &emms ();
+}
+&function_end("gcm_ghash_4bit_mmx");
+}}
+
+if ($sse2) {{
+######################################################################
+# PCLMULQDQ version.
+
+$Xip="eax";
+$Htbl="edx";
+$const="ecx";
+$inp="esi";
+$len="ebx";
+
+($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
+($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
+($Xn,$Xhn)=("xmm6","xmm7");
+
+&static_label("bswap");
+
+sub clmul64x64_T2 { # minimal "register" pressure
+my ($Xhi,$Xi,$Hkey)=@_;
+
+ &movdqa ($Xhi,$Xi); #
+ &pshufd ($T1,$Xi,0b01001110);
+ &pshufd ($T2,$Hkey,0b01001110);
+ &pxor ($T1,$Xi); #
+ &pxor ($T2,$Hkey);
+
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T1,$T2,0x00); #######
+ &xorps ($T1,$Xi); #
+ &xorps ($T1,$Xhi); #
+
+ &movdqa ($T2,$T1); #
+ &psrldq ($T1,8);
+ &pslldq ($T2,8); #
+ &pxor ($Xhi,$T1);
+ &pxor ($Xi,$T2); #
+}
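+
+# Editorial sketch (an illustration, not used by the module): the subroutine
+# above is one-level Karatsuba over GF(2) - three pclmulqdq instead of four,
+# relying on Xh*Hl ^ Xl*Hh == (Xh^Xl)*(Hh^Hl) ^ Xh*Hh ^ Xl*Hl. The
+# hypothetical kara_gf2() checks the identity on narrow $w-bit halves; the
+# xmm code above does the same with 64-bit halves.
+sub kara_gf2 {
+	my ($xh,$xl,$hh,$hl,$w) = @_;
+	my $clmul = sub {	my ($a,$b)=@_; my $r=0;
+				$r ^= $b<<$_ for grep { ($a>>$_)&1 } 0..$w-1;
+				$r };
+	my $lo  = $clmul->($xl,$hl);
+	my $hi  = $clmul->($xh,$hh);
+	my $mid = $clmul->($xh^$xl,$hh^$hl)^$hi^$lo;	# == Xh*Hl ^ Xl*Hh
+	return ($hi<<(2*$w))^($mid<<$w)^$lo;		# full 2w-bit product
+}
+# e.g. kara_gf2(0xA3,0x5C,0x7E,0xD1,8) equals the schoolbook 16x16 clmul.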
+
+sub clmul64x64_T3 {
+# Even though this subroutine offers visually better ILP, it
+# was empirically found to be a tad slower than above version.
+# At least in gcm_ghash_clmul context. But it's just as well,
+# because loop modulo-scheduling is possible only thanks to
+# minimized "register" pressure...
+my ($Xhi,$Xi,$Hkey)=@_;
+
+ &movdqa ($T1,$Xi); #
+ &movdqa ($Xhi,$Xi);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pshufd ($T2,$T1,0b01001110); #
+ &pshufd ($T3,$Hkey,0b01001110);
+ &pxor ($T2,$T1); #
+ &pxor ($T3,$Hkey);
+ &pclmulqdq ($T2,$T3,0x00); #######
+ &pxor ($T2,$Xi); #
+ &pxor ($T2,$Xhi); #
+
+ &movdqa ($T3,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T3,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T3); #
+}
+
+if (1) { # Algorithm 9 with <<1 twist.
+ # Reduction is shorter and uses only two
+			# temporary registers, which makes it a better
+ # candidate for interleaving with 64x64
+ # multiplication. Pre-modulo-scheduled loop
+ # was found to be ~20% faster than Algorithm 5
+ # below. Algorithm 9 was therefore chosen for
+ # further optimization...
+
+sub reduction_alg9 { # 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+ # 1st phase
+ &movdqa ($T1,$Xi); #
+ &psllq ($Xi,1);
+ &pxor ($Xi,$T1); #
+ &psllq ($Xi,5); #
+ &pxor ($Xi,$T1); #
+ &psllq ($Xi,57); #
+ &movdqa ($T2,$Xi); #
+ &pslldq ($Xi,8);
+ &psrldq ($T2,8); #
+ &pxor ($Xi,$T1);
+ &pxor ($Xhi,$T2); #
+
+ # 2nd phase
+ &movdqa ($T2,$Xi);
+ &psrlq ($Xi,5);
+ &pxor ($Xi,$T2); #
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$T2); #
+ &pxor ($T2,$Xhi);
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$T2); #
+}
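+
+# Editorial note (inferred by tracing the shifts above, per 64-bit lane):
+# the 1st phase computes T<<63 ^ T<<62 ^ T<<57 via the ((T<<1^T)<<5^T)<<57
+# chain, and the 2nd folds X ^ X>>1 ^ X>>2 ^ X>>7 into Xhi - together the
+# bit-reflected reduction modulo x^128+x^7+x^2+x+1.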
+
+&function_begin_B("gcm_init_clmul");
+ &mov ($Htbl,&wparam(0));
+ &mov ($Xip,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Hkey,&QWP(0,$Xip));
+ &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
+
+ # <<1 twist
+ &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
+ &movdqa ($T1,$Hkey);
+ &psllq ($Hkey,1);
+ &pxor ($T3,$T3); #
+ &psrlq ($T1,63);
+ &pcmpgtd ($T3,$T2); # broadcast carry bit
+ &pslldq ($T1,8);
+ &por ($Hkey,$T1); # H<<=1
+
+ # magic reduction
+ &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
+ &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
+
+ # calculate H^2
+ &movdqa ($Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+
+ &ret ();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movups ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+
+ &ret ();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+ &mov ($inp,&wparam(2));
+ &mov ($len,&wparam(3));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+
+ &sub ($len,0x10);
+ &jz (&label("odd_tail"));
+
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
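+	# (Editorial note: with Naggr==2 every other reduction is skipped -
+	# H*Ii+1 is computed up front and folded in only after the
+	# H^2*(Ii+Xi) product.)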
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+ &sub ($len,0x20);
+ &jbe (&label("even_tail"));
+
+&set_label("mod_loop");
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+
+ &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+ &movdqa ($Xhn,$Xn);
+ &pxor ($Xhi,$T1); # "Ii+Xi", consume early
+
+ &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &psllq ($Xi,1);
+ &pxor ($Xi,$T1); #
+ &psllq ($Xi,5); #
+ &pxor ($Xi,$T1); #
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &psllq ($Xi,57); #
+ &movdqa ($T2,$Xi); #
+ &pslldq ($Xi,8);
+ &psrldq ($T2,8); #
+ &pxor ($Xi,$T1);
+ &pshufd ($T1,$T3,0b01001110);
+ &pxor ($Xhi,$T2); #
+ &pxor ($T1,$T3);
+ &pshufd ($T3,$Hkey,0b01001110);
+ &pxor ($T3,$Hkey); #
+
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &movdqa ($T2,$Xi); # 2nd phase
+ &psrlq ($Xi,5);
+ &pxor ($Xi,$T2); #
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$T2); #
+ &pxor ($T2,$Xhi);
+ &psrlq ($Xi,1); #
+ &pxor ($Xi,$T2); #
+
+ &pclmulqdq ($T1,$T3,0x00); #######
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &xorps ($T1,$Xn); #
+ &xorps ($T1,$Xhn); #
+
+ &movdqa ($T3,$T1); #
+ &psrldq ($T1,8);
+ &pslldq ($T3,8); #
+ &pxor ($Xhn,$T1);
+ &pxor ($Xn,$T3); #
+ &movdqa ($T3,&QWP(0,$const));
+
+ &lea ($inp,&DWP(32,$inp));
+ &sub ($len,0x20);
+ &ja (&label("mod_loop"));
+
+&set_label("even_tail");
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &reduction_alg9 ($Xhi,$Xi);
+
+ &test ($len,$len);
+ &jnz (&label("done"));
+
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("odd_tail");
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &pshufb ($T1,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &reduction_alg9 ($Xhi,$Xi);
+
+&set_label("done");
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+} else {		# Algorithm 5. Kept for reference purposes.
+
+sub reduction_alg5 { # 19/16 times faster than Intel version
+my ($Xhi,$Xi)=@_;
+
+ # <<1
+ &movdqa ($T1,$Xi); #
+ &movdqa ($T2,$Xhi);
+ &pslld ($Xi,1);
+ &pslld ($Xhi,1); #
+ &psrld ($T1,31);
+ &psrld ($T2,31); #
+ &movdqa ($T3,$T1);
+ &pslldq ($T1,4);
+ &psrldq ($T3,12); #
+ &pslldq ($T2,4);
+ &por ($Xhi,$T3); #
+ &por ($Xi,$T1);
+ &por ($Xhi,$T2); #
+
+ # 1st phase
+ &movdqa ($T1,$Xi);
+ &movdqa ($T2,$Xi);
+ &movdqa ($T3,$Xi); #
+ &pslld ($T1,31);
+ &pslld ($T2,30);
+ &pslld ($Xi,25); #
+ &pxor ($T1,$T2);
+ &pxor ($T1,$Xi); #
+ &movdqa ($T2,$T1); #
+ &pslldq ($T1,12);
+ &psrldq ($T2,4); #
+ &pxor ($T3,$T1);
+
+ # 2nd phase
+ &pxor ($Xhi,$T3); #
+ &movdqa ($Xi,$T3);
+ &movdqa ($T1,$T3);
+ &psrld ($Xi,1); #
+ &psrld ($T1,2);
+ &psrld ($T3,7); #
+ &pxor ($Xi,$T1);
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T3); #
+ &pxor ($Xi,$Xhi); #
+}
+
+&function_begin_B("gcm_init_clmul");
+ &mov ($Htbl,&wparam(0));
+ &mov ($Xip,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Hkey,&QWP(0,$Xip));
+ &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
+
+ # calculate H^2
+ &movdqa ($Xi,$Hkey);
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+
+ &ret ();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($Xn,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$Xn);
+
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &pshufb ($Xi,$Xn);
+ &movdqu (&QWP(0,$Xip),$Xi);
+
+ &ret ();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+ &mov ($Xip,&wparam(0));
+ &mov ($Htbl,&wparam(1));
+ &mov ($inp,&wparam(2));
+ &mov ($len,&wparam(3));
+
+ &call (&label("pic"));
+&set_label("pic");
+ &blindpop ($const);
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+ &movdqu ($Xi,&QWP(0,$Xip));
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($Hkey,&QWP(0,$Htbl));
+ &pshufb ($Xi,$T3);
+
+ &sub ($len,0x10);
+ &jz (&label("odd_tail"));
+
+ #######
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+ &sub ($len,0x20);
+ &lea ($inp,&DWP(32,$inp)); # i+=2
+ &jbe (&label("even_tail"));
+
+&set_label("mod_loop");
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &reduction_alg5 ($Xhi,$Xi);
+
+ #######
+ &movdqa ($T3,&QWP(0,$const));
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pshufb ($T1,$T3);
+ &pshufb ($Xn,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+ &sub ($len,0x20);
+ &lea ($inp,&DWP(32,$inp));
+ &ja (&label("mod_loop"));
+
+&set_label("even_tail");
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &pxor ($Xhi,$Xhn);
+
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqa ($T3,&QWP(0,$const));
+ &test ($len,$len);
+ &jnz (&label("done"));
+
+ &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("odd_tail");
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
+ &pshufb ($T1,$T3);
+ &pxor ($Xi,$T1); # Ii+Xi
+
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &reduction_alg5 ($Xhi,$Xi);
+
+ &movdqa ($T3,&QWP(0,$const));
+&set_label("done");
+ &pshufb ($Xi,$T3);
+ &movdqu (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+}
+
+&set_label("bswap",64);
+ &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+ &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
+}} # $sse2
+
+&set_label("rem_4bit",64);
+ &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+ &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+ &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+ &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
+&set_label("rem_8bit",64);
+ &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
+ &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
+ &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
+ &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
+ &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
+ &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
+ &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
+ &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
+ &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
+ &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
+ &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
+ &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
+ &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
+ &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
+ &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
+ &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
+ &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
+ &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
+ &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
+ &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
+ &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
+ &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
+ &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
+ &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
+ &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
+ &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
+ &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
+ &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
+ &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
+ &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
+ &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
+ &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}} # !$x86only
+
+&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
+
+# A question was raised about the choice of vanilla MMX. Or rather, why
+# wasn't SSE2 chosen instead? In addition to the fact that MMX runs on
+# legacy CPUs such as PIII, the "4-bit" MMX version was observed to
+# provide better performance than the *corresponding* SSE2 one even on
+# contemporary CPUs. SSE2 results were provided by Peter-Michael Hager.
+# He maintains an SSE2 implementation featuring a full range of
+# lookup-table sizes, but with
+# per-invocation lookup table setup. The latter means that the table size
+# is chosen depending on how much data is to be hashed in every given
+# call: more data, larger table. The best reported result for Core2 is
+# ~4 cycles per processed byte out of a 64KB block, a number that even
+# accounts for the 64KB table setup overhead. As discussed in gcm128.c,
+# we choose to be more conservative with respect to lookup table sizes,
+# but how do the results compare? The minimalistic "256B" MMX version
+# delivers ~11 cycles on the same platform. As also discussed in
+# gcm128.c, the next-in-line "8-bit Shoup's" or "4KB" method should
+# deliver twice the performance of the "256B" one, in other words no
+# worse than ~6 cycles per byte. It should also be noted that in the
+# SSE2 case the improvement can be "super-linear," i.e. more than
+# twofold, mostly because >>8 maps to a single instruction on an SSE2
+# register. This is unlike the "4-bit" case, where >>4 maps to the same
+# number of instructions in both the MMX and SSE2 cases. The bottom line
+# is that a switch to SSE2 is considered justifiable only if we choose
+# to implement the "8-bit" method...
diff --git a/main/openssl/crypto/modes/asm/ghash-x86_64.S b/main/openssl/crypto/modes/asm/ghash-x86_64.S
new file mode 100644
index 00000000..62d39c65
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-x86_64.S
@@ -0,0 +1,1026 @@
+.text
+
+.globl gcm_gmult_4bit
+.type gcm_gmult_4bit,@function
+.align 16
+gcm_gmult_4bit:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+.Lgmult_prologue:
+
+ movzbq 15(%rdi),%r8
+ leaq .Lrem_4bit(%rip),%r11
+ xorq %rax,%rax
+ xorq %rbx,%rbx
+ movb %r8b,%al
+ movb %r8b,%bl
+ shlb $4,%al
+ movq $14,%rcx
+ movq 8(%rsi,%rax,1),%r8
+ movq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ movq %r8,%rdx
+ jmp .Loop1
+
+.align 16
+.Loop1:
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ movb (%rdi,%rcx,1),%al
+ shrq $4,%r9
+ xorq 8(%rsi,%rbx,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rbx,1),%r9
+ movb %al,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ shlb $4,%al
+ xorq %r10,%r8
+ decq %rcx
+ js .Lbreak1
+
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rax,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ xorq %r10,%r8
+ jmp .Loop1
+
+.align 16
+.Lbreak1:
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rax,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ xorq %r10,%r8
+
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rbx,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rbx,1),%r9
+ xorq %r10,%r8
+ xorq (%r11,%rdx,8),%r9
+
+ bswapq %r8
+ bswapq %r9
+ movq %r8,8(%rdi)
+ movq %r9,(%rdi)
+
+ movq 16(%rsp),%rbx
+ leaq 24(%rsp),%rsp
+.Lgmult_epilogue:
+ .byte 0xf3,0xc3
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+.globl gcm_ghash_4bit
+.type gcm_ghash_4bit,@function
+.align 16
+gcm_ghash_4bit:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $280,%rsp
+.Lghash_prologue:
+ movq %rdx,%r14
+ movq %rcx,%r15
+ subq $-128,%rsi
+ leaq 16+128(%rsp),%rbp
+ xorl %edx,%edx
+ movq 0+0-128(%rsi),%r8
+ movq 0+8-128(%rsi),%rax
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq 16+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq 16+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,0(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,0(%rbp)
+ movq 32+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,0-128(%rbp)
+ movq 32+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,1(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,8(%rbp)
+ movq 48+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,8-128(%rbp)
+ movq 48+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,2(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,16(%rbp)
+ movq 64+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,16-128(%rbp)
+ movq 64+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,3(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,24(%rbp)
+ movq 80+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,24-128(%rbp)
+ movq 80+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,4(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,32(%rbp)
+ movq 96+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,32-128(%rbp)
+ movq 96+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,5(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,40(%rbp)
+ movq 112+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,40-128(%rbp)
+ movq 112+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,6(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,48(%rbp)
+ movq 128+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,48-128(%rbp)
+ movq 128+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,7(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,56(%rbp)
+ movq 144+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,56-128(%rbp)
+ movq 144+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,8(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,64(%rbp)
+ movq 160+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,64-128(%rbp)
+ movq 160+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,9(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,72(%rbp)
+ movq 176+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,72-128(%rbp)
+ movq 176+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,10(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,80(%rbp)
+ movq 192+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,80-128(%rbp)
+ movq 192+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,11(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,88(%rbp)
+ movq 208+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,88-128(%rbp)
+ movq 208+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,12(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,96(%rbp)
+ movq 224+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,96-128(%rbp)
+ movq 224+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,13(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,104(%rbp)
+ movq 240+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,104-128(%rbp)
+ movq 240+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,14(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,112(%rbp)
+ shlb $4,%dl
+ movq %rax,112-128(%rbp)
+ shlq $60,%r10
+ movb %dl,15(%rsp)
+ orq %r10,%rbx
+ movq %r9,120(%rbp)
+ movq %rbx,120-128(%rbp)
+ addq $-128,%rsi
+ movq 8(%rdi),%r8
+ movq 0(%rdi),%r9
+ addq %r14,%r15
+ leaq .Lrem_8bit(%rip),%r11
+ jmp .Louter_loop
+.align 16
+.Louter_loop:
+ xorq (%r14),%r9
+ movq 8(%r14),%rdx
+ leaq 16(%r14),%r14
+ xorq %r8,%rdx
+ movq %r9,(%rdi)
+ movq %rdx,8(%rdi)
+ shrq $32,%rdx
+ xorq %rax,%rax
+ roll $8,%edx
+ movb %dl,%al
+ movzbl %dl,%ebx
+ shlb $4,%al
+ shrl $4,%ebx
+ roll $8,%edx
+ movq 8(%rsi,%rax,1),%r8
+ movq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ xorq %r8,%r12
+ movq %r9,%r10
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 8(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 4(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 0(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ andl $240,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl -4(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ movzwq (%r11,%r12,2),%r12
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ shlq $48,%r12
+ xorq %r10,%r8
+ xorq %r12,%r9
+ movzbq %r8b,%r13
+ shrq $4,%r8
+ movq %r9,%r10
+ shlb $4,%r13b
+ shrq $4,%r9
+ xorq 8(%rsi,%rcx,1),%r8
+ movzwq (%r11,%r13,2),%r13
+ shlq $60,%r10
+ xorq (%rsi,%rcx,1),%r9
+ xorq %r10,%r8
+ shlq $48,%r13
+ bswapq %r8
+ xorq %r13,%r9
+ bswapq %r9
+ cmpq %r15,%r14
+ jb .Louter_loop
+ movq %r8,8(%rdi)
+ movq %r9,(%rdi)
+
+ leaq 280(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lghash_epilogue:
+ .byte 0xf3,0xc3
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+.globl gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+ movdqu (%rsi),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+
+
+ pand .L0x1c2_polynomial(%rip),%xmm5
+ pxor %xmm5,%xmm2
+
+
+ movdqa %xmm2,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ movdqu %xmm2,(%rdi)
+ movdqu %xmm0,16(%rdi)
+ .byte 0xf3,0xc3
+.size gcm_init_clmul,.-gcm_init_clmul
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa .Lbswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+.globl gcm_ghash_clmul
+.type gcm_ghash_clmul,@function
+.align 16
+gcm_ghash_clmul:
+ movdqa .Lbswap_mask(%rip),%xmm5
+
+ movdqu (%rdi),%xmm0
+ movdqu (%rsi),%xmm2
+.byte 102,15,56,0,197
+
+ subq $16,%rcx
+ jz .Lodd_tail
+
+ movdqu 16(%rsi),%xmm8
+
+
+
+
+
+ movdqu (%rdx),%xmm3
+ movdqu 16(%rdx),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+ pxor %xmm3,%xmm0
+ movdqa %xmm6,%xmm7
+ pshufd $78,%xmm6,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm6,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,242,0
+.byte 102,15,58,68,250,17
+.byte 102,15,58,68,220,0
+ pxor %xmm6,%xmm3
+ pxor %xmm7,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm7
+ pxor %xmm4,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm8,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm8,%xmm4
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ jbe .Leven_tail
+
+.Lmod_loop:
+.byte 102,65,15,58,68,192,0
+.byte 102,65,15,58,68,200,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqu (%rdx),%xmm3
+ pxor %xmm6,%xmm0
+ pxor %xmm7,%xmm1
+
+ movdqu 16(%rdx),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+
+ movdqa %xmm6,%xmm7
+ pshufd $78,%xmm6,%xmm9
+ pshufd $78,%xmm2,%xmm10
+ pxor %xmm6,%xmm9
+ pxor %xmm2,%xmm10
+ pxor %xmm3,%xmm1
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+.byte 102,15,58,68,242,0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+.byte 102,15,58,68,250,17
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+
+.byte 102,69,15,58,68,202,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm8,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm8,%xmm4
+
+ pxor %xmm6,%xmm9
+ pxor %xmm7,%xmm9
+ movdqa %xmm9,%xmm10
+ psrldq $8,%xmm9
+ pslldq $8,%xmm10
+ pxor %xmm9,%xmm7
+ pxor %xmm10,%xmm6
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ ja .Lmod_loop
+
+.Leven_tail:
+.byte 102,65,15,58,68,192,0
+.byte 102,65,15,58,68,200,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm6,%xmm0
+ pxor %xmm7,%xmm1
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ testq %rcx,%rcx
+ jnz .Ldone
+
+.Lodd_tail:
+ movdqu (%rdx),%xmm3
+.byte 102,15,56,0,221
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+.Ldone:
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.LSEH_end_gcm_ghash_clmul:
+.size gcm_ghash_clmul,.-gcm_ghash_clmul
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align 64
+.type .Lrem_4bit,@object
+.Lrem_4bit:
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type .Lrem_8bit,@object
+.Lrem_8bit:
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/main/openssl/crypto/modes/asm/ghash-x86_64.pl b/main/openssl/crypto/modes/asm/ghash-x86_64.pl
new file mode 100644
index 00000000..38d779ed
--- /dev/null
+++ b/main/openssl/crypto/modes/asm/ghash-x86_64.pl
@@ -0,0 +1,806 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128).  "4-bit"
+# means that it uses a 256-byte per-key table [+128 bytes of shared
+# table].  The GHASH function features a so-called "528B" variant
+# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
+# of shared table].  Performance results are for this streamed GHASH
+# subroutine and are expressed in cycles per processed byte, less is
+# better:
+#
+# gcc 3.4.x(*) assembler
+#
+# P4 28.6 14.0 +100%
+# Opteron 19.3 7.7 +150%
+# Core2 17.8 8.1(**) +120%
+#
+# (*)	the comparison is not completely fair, because the C results
+#	are for the vanilla "256B" implementation, while the assembler
+#	results are for "528B";-)
+# (**)	it's a mystery [to me] why the Core2 result is not the same
+#	as for Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
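For orientation, every routine generated by this module computes the same primitive: a multiplication in GF(2^128) reduced by the GCM polynomial x^128+x^7+x^2+x+1, in GCM's reflected bit order (bit 7 of byte 0 carries the coefficient of x^0, so multiplying by x is a right shift). A minimal bit-serial C sketch of that operation, for illustration only; ghash_mul_sketch is a hypothetical name, not part of this patch:

    #include <stdint.h>
    #include <string.h>

    /* Z = X * H in GF(2^128), GCM bit order; a carry out of the x^127
     * coefficient folds back as 0xE1 in byte 0. */
    static void ghash_mul_sketch(uint8_t Z[16],
                                 const uint8_t X[16], const uint8_t H[16])
    {
        uint8_t V[16], R[16] = {0};
        int i, j, k, carry;

        memcpy(V, H, 16);
        for (i = 0; i < 16; i++) {
            for (j = 7; j >= 0; j--) {
                if ((X[i] >> j) & 1)          /* coefficient of X set: R ^= V */
                    for (k = 0; k < 16; k++)
                        R[k] ^= V[k];
                carry = V[15] & 1;            /* V *= x: right shift across bytes */
                for (k = 15; k > 0; k--)
                    V[k] = (uint8_t)((V[k] >> 1) | (V[k-1] << 7));
                V[0] >>= 1;
                if (carry)
                    V[0] ^= 0xE1;             /* reduce mod x^128+x^7+x^2+x+1 */
            }
        }
        memcpy(Z, R, 16);
    }

The "4-bit" code below processes one nibble per step instead of one bit, paying for the speedup with the per-key table described above.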
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+# common register layout
+$nlo="%rax";
+$nhi="%rbx";
+$Zlo="%r8";
+$Zhi="%r9";
+$tmp="%r10";
+$rem_4bit = "%r11";
+
+$Xi="%rdi";
+$Htbl="%rsi";
+
+# per-function register layout
+$cnt="%rcx";
+$rem="%rdx";
+
+sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
+ $r =~ s/%[er]([sd]i)/%\1l/ or
+ $r =~ s/%[er](bp)/%\1l/ or
+ $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+{ my $N;
+ sub loop() {
+ my $inp = shift;
+
+ $N++;
+$code.=<<___;
+ xor $nlo,$nlo
+ xor $nhi,$nhi
+ mov `&LB("$Zlo")`,`&LB("$nlo")`
+ mov `&LB("$Zlo")`,`&LB("$nhi")`
+ shl \$4,`&LB("$nlo")`
+ mov \$14,$cnt
+ mov 8($Htbl,$nlo),$Zlo
+ mov ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ mov $Zlo,$rem
+ jmp .Loop$N
+
+.align 16
+.Loop$N:
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ mov ($inp,$cnt),`&LB("$nlo")`
+ shr \$4,$Zhi
+ xor 8($Htbl,$nhi),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nhi),$Zhi
+ mov `&LB("$nlo")`,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ shl \$4,`&LB("$nlo")`
+ xor $tmp,$Zlo
+ dec $cnt
+ js .Lbreak$N
+
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nlo),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ xor $tmp,$Zlo
+ jmp .Loop$N
+
+.align 16
+.Lbreak$N:
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nlo),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ xor $tmp,$Zlo
+
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nhi),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nhi),$Zhi
+ xor $tmp,$Zlo
+ xor ($rem_4bit,$rem,8),$Zhi
+
+ bswap $Zlo
+ bswap $Zhi
+___
+}}
+
+$code=<<___;
+.text
+
+.globl gcm_gmult_4bit
+.type gcm_gmult_4bit,\@function,2
+.align 16
+gcm_gmult_4bit:
+ push %rbx
+ push %rbp # %rbp and %r12 are pushed exclusively in
+ push %r12 # order to reuse Win64 exception handler...
+.Lgmult_prologue:
+
+ movzb 15($Xi),$Zlo
+ lea .Lrem_4bit(%rip),$rem_4bit
+___
+ &loop ($Xi);
+$code.=<<___;
+ mov $Zlo,8($Xi)
+ mov $Zhi,($Xi)
+
+ mov 16(%rsp),%rbx
+ lea 24(%rsp),%rsp
+.Lgmult_epilogue:
+ ret
+.size gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+
+# per-function register layout
+$inp="%rdx";
+$len="%rcx";
+$rem_8bit=$rem_4bit;
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.type gcm_ghash_4bit,\@function,4
+.align 16
+gcm_ghash_4bit:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$280,%rsp
+.Lghash_prologue:
+ mov $inp,%r14 # reassign couple of args
+ mov $len,%r15
+___
+{ my $inp="%r14";
+ my $dat="%edx";
+ my $len="%r15";
+ my @nhi=("%ebx","%ecx");
+ my @rem=("%r12","%r13");
+ my $Hshr4="%rbp";
+
+ &sub ($Htbl,-128); # size optimization
+ &lea ($Hshr4,"16+128(%rsp)");
+ { my @lo =($nlo,$nhi);
+ my @hi =($Zlo,$Zhi);
+
+ &xor ($dat,$dat);
+ for ($i=0,$j=-2;$i<18;$i++,$j++) {
+ &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
+ &or ($lo[0],$tmp) if ($i>1);
+ &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
+ &shr ($lo[1],4) if ($i>0 && $i<17);
+ &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
+ &shr ($hi[1],4) if ($i>0 && $i<17);
+ &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
+ &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
+ &shl (&LB($dat),4) if ($i>0 && $i<17);
+ &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
+ &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
+ &shl ($tmp,60) if ($i>0 && $i<17);
+
+ push (@lo,shift(@lo));
+ push (@hi,shift(@hi));
+ }
+ }
+ &add ($Htbl,-128);
+ &mov ($Zlo,"8($Xi)");
+ &mov ($Zhi,"0($Xi)");
+ &add ($len,$inp); # pointer to the end of data
+ &lea ($rem_8bit,".Lrem_8bit(%rip)");
+ &jmp (".Louter_loop");
+
+$code.=".align 16\n.Louter_loop:\n";
+ &xor ($Zhi,"($inp)");
+ &mov ("%rdx","8($inp)");
+ &lea ($inp,"16($inp)");
+ &xor ("%rdx",$Zlo);
+ &mov ("($Xi)",$Zhi);
+ &mov ("8($Xi)","%rdx");
+ &shr ("%rdx",32);
+
+ &xor ($nlo,$nlo);
+ &rol ($dat,8);
+ &mov (&LB($nlo),&LB($dat));
+ &movz ($nhi[0],&LB($dat));
+ &shl (&LB($nlo),4);
+ &shr ($nhi[0],4);
+
+ for ($j=11,$i=0;$i<15;$i++) {
+ &rol ($dat,8);
+ &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
+ &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
+ &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
+ &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
+
+ &mov (&LB($nlo),&LB($dat));
+ &xor ($Zlo,$tmp) if ($i>0);
+ &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
+
+ &movz ($nhi[1],&LB($dat));
+ &shl (&LB($nlo),4);
+ &movzb ($rem[0],"(%rsp,$nhi[0])");
+
+ &shr ($nhi[1],4) if ($i<14);
+ &and ($nhi[1],0xf0) if ($i==14);
+ &shl ($rem[1],48) if ($i>0);
+ &xor ($rem[0],$Zlo);
+
+ &mov ($tmp,$Zhi);
+ &xor ($Zhi,$rem[1]) if ($i>0);
+ &shr ($Zlo,8);
+
+ &movz ($rem[0],&LB($rem[0]));
+ &mov ($dat,"$j($Xi)") if (--$j%4==0);
+ &shr ($Zhi,8);
+
+ &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
+ &shl ($tmp,56);
+ &xor ($Zhi,"($Hshr4,$nhi[0],8)");
+
+ unshift (@nhi,pop(@nhi)); # "rotate" registers
+ unshift (@rem,pop(@rem));
+ }
+ &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
+ &xor ($Zlo,"8($Htbl,$nlo)");
+ &xor ($Zhi,"($Htbl,$nlo)");
+
+ &shl ($rem[1],48);
+ &xor ($Zlo,$tmp);
+
+ &xor ($Zhi,$rem[1]);
+ &movz ($rem[0],&LB($Zlo));
+ &shr ($Zlo,4);
+
+ &mov ($tmp,$Zhi);
+ &shl (&LB($rem[0]),4);
+ &shr ($Zhi,4);
+
+ &xor ($Zlo,"8($Htbl,$nhi[0])");
+ &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
+ &shl ($tmp,60);
+
+ &xor ($Zhi,"($Htbl,$nhi[0])");
+ &xor ($Zlo,$tmp);
+ &shl ($rem[0],48);
+
+ &bswap ($Zlo);
+ &xor ($Zhi,$rem[0]);
+
+ &bswap ($Zhi);
+ &cmp ($inp,$len);
+ &jb (".Louter_loop");
+}
+$code.=<<___;
+ mov $Zlo,8($Xi)
+ mov $Zhi,($Xi)
+
+ lea 280(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lghash_epilogue:
+ ret
+.size gcm_ghash_4bit,.-gcm_ghash_4bit
+___
+
+######################################################################
+# PCLMULQDQ version.
+
+@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
+($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
+sub clmul64x64_T2 { # minimal register pressure
+my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+
+$code.=<<___ if (!defined($modulo));
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pshufd \$0b01001110,$Hkey,$T2
+ pxor $Xi,$T1 #
+ pxor $Hkey,$T2
+___
+$code.=<<___;
+ pclmulqdq \$0x00,$Hkey,$Xi #######
+ pclmulqdq \$0x11,$Hkey,$Xhi #######
+ pclmulqdq \$0x00,$T2,$T1 #######
+ pxor $Xi,$T1 #
+ pxor $Xhi,$T1 #
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+}
+
+sub reduction_alg9 { # 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ # 1st phase
+ movdqa $Xi,$T1 #
+ psllq \$1,$Xi
+ pxor $T1,$Xi #
+ psllq \$5,$Xi #
+ pxor $T1,$Xi #
+ psllq \$57,$Xi #
+ movdqa $Xi,$T2 #
+ pslldq \$8,$Xi
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ pxor $T2,$Xhi #
+
+ # 2nd phase
+ movdqa $Xi,$T2
+ psrlq \$5,$Xi
+ pxor $T2,$Xi #
+ psrlq \$1,$Xi #
+ pxor $T2,$Xi #
+ pxor $Xhi,$T2
+ psrlq \$1,$Xi #
+ pxor $T2,$Xi #
+___
+}
+
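Taken together, clmul64x64_T2 and reduction_alg9 implement a Karatsuba-style carry-less multiply followed by modular reduction. Writing X = X_h*x^64 + X_l and H = H_h*x^64 + H_l (addition being XOR), the three PCLMULQDQ products above combine as, in my notation:

    X \cdot H \;=\; X_h H_h\,x^{128} \;\oplus\; \bigl[(X_h \oplus X_l)(H_h \oplus H_l) \oplus X_h H_h \oplus X_l H_l\bigr]\,x^{64} \;\oplus\; X_l H_l

so the middle term costs one multiplication plus XORs instead of two multiplications; reduction_alg9 then folds the 256-bit product back to 128 bits modulo the GCM polynomial in its two shift/XOR phases.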
+{ my ($Htbl,$Xip)=@_4args;
+
+$code.=<<___;
+.globl gcm_init_clmul
+.type gcm_init_clmul,\@abi-omnipotent
+.align 16
+gcm_init_clmul:
+ movdqu ($Xip),$Hkey
+ pshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ movdqa $Hkey,$T1
+ psllq \$1,$Hkey
+ pxor $T3,$T3 #
+ psrlq \$63,$T1
+ pcmpgtd $T2,$T3 # broadcast carry bit
+ pslldq \$8,$T1
+ por $T1,$Hkey # H<<=1
+
+ # magic reduction
+ pand .L0x1c2_polynomial(%rip),$T3
+ pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ # calculate H^2
+ movdqa $Hkey,$Xi
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ movdqu $Hkey,($Htbl) # save H
+ movdqu $Xi,16($Htbl) # save H^2
+ ret
+.size gcm_init_clmul,.-gcm_init_clmul
+___
+}
+
+{ my ($Xip,$Htbl)=@_4args;
+
+$code.=<<___;
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,\@abi-omnipotent
+.align 16
+gcm_gmult_clmul:
+ movdqu ($Xip),$Xi
+ movdqa .Lbswap_mask(%rip),$T3
+ movdqu ($Htbl),$Hkey
+ pshufb $T3,$Xi
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufb $T3,$Xi
+ movdqu $Xi,($Xip)
+ ret
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+___
+}
+
+{ my ($Xip,$Htbl,$inp,$len)=@_4args;
+ my $Xn="%xmm6";
+ my $Xhn="%xmm7";
+ my $Hkey2="%xmm8";
+ my $T1n="%xmm9";
+ my $T2n="%xmm10";
+
+$code.=<<___;
+.globl gcm_ghash_clmul
+.type gcm_ghash_clmul,\@abi-omnipotent
+.align 16
+gcm_ghash_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_ghash_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+ .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
+ .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
+ .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
+ .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
+___
+$code.=<<___;
+ movdqa .Lbswap_mask(%rip),$T3
+
+ movdqu ($Xip),$Xi
+ movdqu ($Htbl),$Hkey
+ pshufb $T3,$Xi
+
+ sub \$0x10,$len
+ jz .Lodd_tail
+
+ movdqu 16($Htbl),$Hkey2
+ #######
+	# Xi+2 = [H*(Ii+1 + Xi+1)] mod P
+	#      = [(H*Ii+1) + (H*Xi+1)] mod P
+	#      = [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+ #
+ movdqu ($inp),$T1 # Ii
+ movdqu 16($inp),$Xn # Ii+1
+ pshufb $T3,$T1
+ pshufb $T3,$Xn
+ pxor $T1,$Xi # Ii+Xi
+___
+ &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pshufd \$0b01001110,$Hkey2,$T2
+ pxor $Xi,$T1 #
+ pxor $Hkey2,$T2
+
+ lea 32($inp),$inp # i+=2
+ sub \$0x20,$len
+ jbe .Leven_tail
+
+.Lmod_loop:
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
+$code.=<<___;
+ movdqu ($inp),$T1 # Ii
+ pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+
+ movdqu 16($inp),$Xn # Ii+1
+ pshufb $T3,$T1
+ pshufb $T3,$Xn
+
+ movdqa $Xn,$Xhn #
+ pshufd \$0b01001110,$Xn,$T1n
+ pshufd \$0b01001110,$Hkey,$T2n
+ pxor $Xn,$T1n #
+ pxor $Hkey,$T2n
+ pxor $T1,$Xhi # "Ii+Xi", consume early
+
+ movdqa $Xi,$T1 # 1st phase
+ psllq \$1,$Xi
+ pxor $T1,$Xi #
+ psllq \$5,$Xi #
+ pxor $T1,$Xi #
+ pclmulqdq \$0x00,$Hkey,$Xn #######
+ psllq \$57,$Xi #
+ movdqa $Xi,$T2 #
+ pslldq \$8,$Xi
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ pxor $T2,$Xhi #
+
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$5,$Xi
+ pxor $T2,$Xi #
+ psrlq \$1,$Xi #
+ pxor $T2,$Xi #
+ pxor $Xhi,$T2
+ psrlq \$1,$Xi #
+ pxor $T2,$Xi #
+
+ pclmulqdq \$0x00,$T2n,$T1n #######
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pshufd \$0b01001110,$Hkey2,$T2
+ pxor $Xi,$T1 #
+ pxor $Hkey2,$T2
+
+ pxor $Xn,$T1n #
+ pxor $Xhn,$T1n #
+ movdqa $T1n,$T2n #
+ psrldq \$8,$T1n
+ pslldq \$8,$T2n #
+ pxor $T1n,$Xhn
+ pxor $T2n,$Xn #
+
+ lea 32($inp),$inp
+ sub \$0x20,$len
+ ja .Lmod_loop
+
+.Leven_tail:
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
+$code.=<<___;
+ pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+___
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ test $len,$len
+ jnz .Ldone
+
+.Lodd_tail:
+ movdqu ($inp),$T1 # Ii
+ pshufb $T3,$T1
+ pxor $T1,$Xi # Ii+Xi
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+.Ldone:
+ pshufb $T3,$Xi
+ movdqu $Xi,($Xip)
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ add \$0x58,%rsp
+___
+$code.=<<___;
+ ret
+.LSEH_end_gcm_ghash_clmul:
+.size gcm_ghash_clmul,.-gcm_ghash_clmul
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align 64
+.type .Lrem_4bit,\@object
+.Lrem_4bit:
+ .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+ .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+ .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+ .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+.type .Lrem_8bit,\@object
+.Lrem_8bit:
+ .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+ .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+ .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+ .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+ .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+ .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+ .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+ .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+ .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+ .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+ .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+ .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+ .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+ .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+ .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+ .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+ .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+ .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+ .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+ .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+ .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+ .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+ .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+ .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+ .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+ .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+ .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+ .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+ .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+ .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+ .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+ .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 24(%rax),%rax # adjust "rsp"
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_gcm_gmult_4bit
+ .rva .LSEH_end_gcm_gmult_4bit
+ .rva .LSEH_info_gcm_gmult_4bit
+
+ .rva .LSEH_begin_gcm_ghash_4bit
+ .rva .LSEH_end_gcm_ghash_4bit
+ .rva .LSEH_info_gcm_ghash_4bit
+
+ .rva .LSEH_begin_gcm_ghash_clmul
+ .rva .LSEH_end_gcm_ghash_clmul
+ .rva .LSEH_info_gcm_ghash_clmul
+
+.section .xdata
+.align 8
+.LSEH_info_gcm_gmult_4bit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
+.LSEH_info_gcm_ghash_4bit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+.LSEH_info_gcm_ghash_clmul:
+ .byte 0x01,0x1f,0x0b,0x00
+ .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
+ .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/modes/cbc128.c b/main/openssl/crypto/modes/cbc128.c
index 8f8bd563..0e54f754 100644
--- a/main/openssl/crypto/modes/cbc128.c
+++ b/main/openssl/crypto/modes/cbc128.c
@@ -48,7 +48,8 @@
*
*/
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
#include <string.h>
#ifndef MODES_DEBUG
@@ -58,12 +59,7 @@
#endif
#include <assert.h>
-#define STRICT_ALIGNMENT 1
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
+#ifndef STRICT_ALIGNMENT
# define STRICT_ALIGNMENT 0
#endif
@@ -121,7 +117,7 @@ void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
unsigned char ivec[16], block128_f block)
{
size_t n;
- union { size_t align; unsigned char c[16]; } tmp;
+ union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp;
assert(in && out && key && ivec);
@@ -141,11 +137,13 @@ void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
out += 16;
}
}
- else {
+ else if (16%sizeof(size_t) == 0) { /* always true */
while (len>=16) {
+ size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv;
+
(*block)(in, out, key);
- for(n=0; n<16; n+=sizeof(size_t))
- *(size_t *)(out+n) ^= *(size_t *)(iv+n);
+ for(n=0; n<16/sizeof(size_t); n++)
+ out_t[n] ^= iv_t[n];
iv = in;
len -= 16;
in += 16;
@@ -169,15 +167,16 @@ void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
out += 16;
}
}
- else {
- size_t c;
+ else if (16%sizeof(size_t) == 0) { /* always true */
while (len>=16) {
+ size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
+ const size_t *in_t=(const size_t *)in;
+
(*block)(in, tmp.c, key);
- for(n=0; n<16; n+=sizeof(size_t)) {
- c = *(size_t *)(in+n);
- *(size_t *)(out+n) =
- *(size_t *)(tmp.c+n) ^ *(size_t *)(ivec+n);
- *(size_t *)(ivec+n) = c;
+ for(n=0; n<16/sizeof(size_t); n++) {
+ c = in_t[n];
+ out_t[n] = tmp.t[n] ^ ivec_t[n];
+ ivec_t[n] = c;
}
len -= 16;
in += 16;
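The hunk above replaces per-byte pointer casts with a union whose size_t lanes alias the 16-byte block, so the XOR runs word-at-a-time through properly typed, aligned lanes. A standalone sketch of the same trick, assuming nothing beyond the union declared in the patch:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } a, b;
        size_t n;

        memset(a.c, 0xAA, 16);
        memset(b.c, 0x0F, 16);
        for (n = 0; n < 16/sizeof(size_t); n++)   /* word-at-a-time XOR */
            a.t[n] ^= b.t[n];
        printf("%02x\n", a.c[0]);                 /* prints a5 (0xAA ^ 0x0F) */
        return 0;
    }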
diff --git a/main/openssl/crypto/modes/ccm128.c b/main/openssl/crypto/modes/ccm128.c
new file mode 100644
index 00000000..3ce11d0d
--- /dev/null
+++ b/main/openssl/crypto/modes/ccm128.c
@@ -0,0 +1,441 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+# define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+/* First you set up the M and L parameters and pass the key schedule.
+ * This is called once per session setup... */
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+ unsigned int M,unsigned int L,void *key,block128_f block)
+{
+ memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
+ ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
+ ctx->blocks = 0;
+ ctx->block = block;
+ ctx->key = key;
+}
+
+/* !!! The following interfaces are to be called *once* per packet !!! */
+
+/* Then you set up the per-message nonce and pass the length of the message */
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+ const unsigned char *nonce,size_t nlen,size_t mlen)
+{
+ unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */
+
+ if (nlen<(14-L)) return -1; /* nonce is too short */
+
+ if (sizeof(mlen)==8 && L>=3) {
+ ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8)));
+ ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8)));
+ ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
+ ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
+ }
+ else
+ ctx->nonce.u[1] = 0;
+
+ ctx->nonce.c[12] = (u8)(mlen>>24);
+ ctx->nonce.c[13] = (u8)(mlen>>16);
+ ctx->nonce.c[14] = (u8)(mlen>>8);
+ ctx->nonce.c[15] = (u8)mlen;
+
+ ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
+ memcpy(&ctx->nonce.c[1],nonce,14-L);
+
+ return 0;
+}
+
+/* Then you pass the additional authentication data; this step is optional */
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+ const unsigned char *aad,size_t alen)
+{ unsigned int i;
+ block128_f block = ctx->block;
+
+ if (alen==0) return;
+
+ ctx->nonce.c[0] |= 0x40; /* set Adata flag */
+ (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
+ ctx->blocks++;
+
+ if (alen<(0x10000-0x100)) {
+ ctx->cmac.c[0] ^= (u8)(alen>>8);
+ ctx->cmac.c[1] ^= (u8)alen;
+ i=2;
+ }
+ else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
+ ctx->cmac.c[0] ^= 0xFF;
+ ctx->cmac.c[1] ^= 0xFF;
+ ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
+ ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
+ ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
+ ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
+ ctx->cmac.c[6] ^= (u8)(alen>>24);
+ ctx->cmac.c[7] ^= (u8)(alen>>16);
+ ctx->cmac.c[8] ^= (u8)(alen>>8);
+ ctx->cmac.c[9] ^= (u8)alen;
+ i=10;
+ }
+ else {
+ ctx->cmac.c[0] ^= 0xFF;
+ ctx->cmac.c[1] ^= 0xFE;
+ ctx->cmac.c[2] ^= (u8)(alen>>24);
+ ctx->cmac.c[3] ^= (u8)(alen>>16);
+ ctx->cmac.c[4] ^= (u8)(alen>>8);
+ ctx->cmac.c[5] ^= (u8)alen;
+ i=6;
+ }
+
+ do {
+ for(;i<16 && alen;++i,++aad,--alen)
+ ctx->cmac.c[i] ^= *aad;
+ (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
+ ctx->blocks++;
+ i=0;
+ } while (alen);
+}
+
+/* Finally you encrypt or decrypt the message */
+
+/* The counter part of the nonce may not be larger than L*8 bits;
+ * L is not larger than 8, therefore a 64-bit counter suffices... */
+static void ctr64_inc(unsigned char *counter) {
+ unsigned int n=8;
+ u8 c;
+
+ counter += 8;
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len)
+{
+ size_t n;
+ unsigned int i,L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void * key = ctx->key;
+ union { u64 u[2]; u8 c[16]; } scratch;
+
+ if (!(flags0&0x40))
+ (*block)(ctx->nonce.c,ctx->cmac.c,key),
+ ctx->blocks++;
+
+ ctx->nonce.c[0] = L = flags0&7;
+ for (n=0,i=15-L;i<15;++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i]=0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15]=1;
+
+ if (n!=len) return -1; /* length mismatch */
+
+ ctx->blocks += ((len+15)>>3)|1;
+ if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
+
+ while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+ union { u64 u[2]; u8 c[16]; } temp;
+
+ memcpy (temp.c,inp,16);
+ ctx->cmac.u[0] ^= temp.u[0];
+ ctx->cmac.u[1] ^= temp.u[1];
+#else
+ ctx->cmac.u[0] ^= ((u64*)inp)[0];
+ ctx->cmac.u[1] ^= ((u64*)inp)[1];
+#endif
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+ temp.u[0] ^= scratch.u[0];
+ temp.u[1] ^= scratch.u[1];
+ memcpy(out,temp.c,16);
+#else
+ ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
+ ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
+#endif
+ inp += 16;
+ out += 16;
+ len -= 16;
+ }
+
+ if (len) {
+ for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ (*block)(ctx->nonce.c,scratch.c,key);
+ for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+ }
+
+ for (i=15-L;i<16;++i)
+ ctx->nonce.c[i]=0;
+
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len)
+{
+ size_t n;
+ unsigned int i,L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void * key = ctx->key;
+ union { u64 u[2]; u8 c[16]; } scratch;
+
+ if (!(flags0&0x40))
+ (*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+ ctx->nonce.c[0] = L = flags0&7;
+ for (n=0,i=15-L;i<15;++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i]=0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15]=1;
+
+ if (n!=len) return -1;
+
+ while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+ union { u64 u[2]; u8 c[16]; } temp;
+#endif
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+ memcpy (temp.c,inp,16);
+ ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
+ ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
+ memcpy (out,scratch.c,16);
+#else
+ ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
+ ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
+#endif
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+
+ inp += 16;
+ out += 16;
+ len -= 16;
+ }
+
+ if (len) {
+ (*block)(ctx->nonce.c,scratch.c,key);
+ for (i=0; i<len; ++i)
+ ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ }
+
+ for (i=15-L;i<16;++i)
+ ctx->nonce.c[i]=0;
+
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+static void ctr64_add (unsigned char *counter,size_t inc)
+{ size_t n=8, val=0;
+
+ counter += 8;
+ do {
+ --n;
+ val += counter[n] + (inc&0xff);
+ counter[n] = (unsigned char)val;
+ val >>= 8; /* carry bit */
+ inc >>= 8;
+ } while(n && (inc || val));
+}
+
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len,ccm128_f stream)
+{
+ size_t n;
+ unsigned int i,L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void * key = ctx->key;
+ union { u64 u[2]; u8 c[16]; } scratch;
+
+ if (!(flags0&0x40))
+ (*block)(ctx->nonce.c,ctx->cmac.c,key),
+ ctx->blocks++;
+
+ ctx->nonce.c[0] = L = flags0&7;
+ for (n=0,i=15-L;i<15;++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i]=0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15]=1;
+
+ if (n!=len) return -1; /* length mismatch */
+
+ ctx->blocks += ((len+15)>>3)|1;
+ if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
+
+ if ((n=len/16)) {
+ (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
+ n *= 16;
+ inp += n;
+ out += n;
+ len -= n;
+ if (len) ctr64_add(ctx->nonce.c,n/16);
+ }
+
+ if (len) {
+ for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ (*block)(ctx->nonce.c,scratch.c,key);
+ for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+ }
+
+ for (i=15-L;i<16;++i)
+ ctx->nonce.c[i]=0;
+
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+ const unsigned char *inp, unsigned char *out,
+ size_t len,ccm128_f stream)
+{
+ size_t n;
+ unsigned int i,L;
+ unsigned char flags0 = ctx->nonce.c[0];
+ block128_f block = ctx->block;
+ void * key = ctx->key;
+ union { u64 u[2]; u8 c[16]; } scratch;
+
+ if (!(flags0&0x40))
+ (*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+ ctx->nonce.c[0] = L = flags0&7;
+ for (n=0,i=15-L;i<15;++i) {
+ n |= ctx->nonce.c[i];
+ ctx->nonce.c[i]=0;
+ n <<= 8;
+ }
+ n |= ctx->nonce.c[15]; /* reconstructed length */
+ ctx->nonce.c[15]=1;
+
+ if (n!=len) return -1;
+
+ if ((n=len/16)) {
+ (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
+ n *= 16;
+ inp += n;
+ out += n;
+ len -= n;
+ if (len) ctr64_add(ctx->nonce.c,n/16);
+ }
+
+ if (len) {
+ (*block)(ctx->nonce.c,scratch.c,key);
+ for (i=0; i<len; ++i)
+ ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+ (*block)(ctx->cmac.c,ctx->cmac.c,key);
+ }
+
+ for (i=15-L;i<16;++i)
+ ctx->nonce.c[i]=0;
+
+ (*block)(ctx->nonce.c,scratch.c,key);
+ ctx->cmac.u[0] ^= scratch.u[0];
+ ctx->cmac.u[1] ^= scratch.u[1];
+
+ ctx->nonce.c[0] = flags0;
+
+ return 0;
+}
+
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
+{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */
+
+ M *= 2; M += 2;
+ if (len<M) return 0;
+ memcpy(tag,ctx->cmac.c,M);
+ return M;
+}
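The call discipline spelled out in the comments above (init once per key; then, per packet, setiv, optional aad, encrypt or decrypt, tag) composes with any block128_f. A hedged usage sketch with AES as the block cipher; ccm_seal_sketch and its parameters are hypothetical, and the nonce length must match the chosen L as enforced by CRYPTO_ccm128_setiv:

    #include <openssl/aes.h>
    #include <openssl/modes.h>

    static int ccm_seal_sketch(const unsigned char key[16],
                               const unsigned char *nonce, size_t nlen,
                               const unsigned char *aad, size_t alen,
                               const unsigned char *in, unsigned char *out,
                               size_t len, unsigned char tag[16])
    {
        AES_KEY ks;
        CCM128_CONTEXT ccm;

        AES_set_encrypt_key(key, 128, &ks);
        /* M = 16-byte tag, L = 8-byte length field; once per key */
        CRYPTO_ccm128_init(&ccm, 16, 8, &ks, (block128_f)AES_encrypt);

        if (CRYPTO_ccm128_setiv(&ccm, nonce, nlen, len))
            return -1;                    /* nonce too short */
        if (alen)
            CRYPTO_ccm128_aad(&ccm, aad, alen);
        if (CRYPTO_ccm128_encrypt(&ccm, in, out, len))
            return -1;                    /* length mismatch or too much data */
        return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16 ? 0 : -1;
    }

The (block128_f)AES_encrypt cast mirrors how OpenSSL's EVP glue feeds raw block functions to these mode helpers.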
diff --git a/main/openssl/crypto/modes/cfb128.c b/main/openssl/crypto/modes/cfb128.c
index e5938c61..4e6f5d35 100644
--- a/main/openssl/crypto/modes/cfb128.c
+++ b/main/openssl/crypto/modes/cfb128.c
@@ -48,7 +48,8 @@
*
*/
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
#include <string.h>
#ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
#endif
#include <assert.h>
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
-#endif
-
 /* The input and output are encrypted as though 128-bit CFB mode is
  * being used.  The extra state information to record how much of the
  * 128-bit block we have used is contained in *num;
diff --git a/main/openssl/crypto/modes/ctr128.c b/main/openssl/crypto/modes/ctr128.c
index 932037f5..ee642c58 100644
--- a/main/openssl/crypto/modes/ctr128.c
+++ b/main/openssl/crypto/modes/ctr128.c
@@ -48,7 +48,8 @@
*
*/
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
#include <string.h>
#ifndef MODES_DEBUG
@@ -58,17 +59,6 @@
#endif
#include <assert.h>
-typedef unsigned int u32;
-typedef unsigned char u8;
-
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
-#endif
-
/* NOTE: the IV/counter CTR mode is big-endian. The code itself
* is endian-neutral. */
@@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
*num=n;
}
+
+/* increment upper 96 bits of 128-bit counter by 1 */
+static void ctr96_inc(unsigned char *counter) {
+ u32 n=12;
+ u8 c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ unsigned char ivec[16], unsigned char ecount_buf[16],
+ unsigned int *num, ctr128_f func)
+{
+ unsigned int n,ctr32;
+
+ assert(in && out && key && ecount_buf && num);
+ assert(*num < 16);
+
+ n = *num;
+
+ while (n && len) {
+ *(out++) = *(in++) ^ ecount_buf[n];
+ --len;
+ n = (n+1) % 16;
+ }
+
+ ctr32 = GETU32(ivec+12);
+ while (len>=16) {
+ size_t blocks = len/16;
+		/*
+		 * 1<<28 is just a not-so-small yet not-so-large number...
+		 * The condition below is practically never met, but it
+		 * has to be checked for code correctness.
+		 */
+ if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
+ blocks = (1U<<28);
+		/*
+		 * As (*func) operates on a 32-bit counter, the caller
+		 * has to handle overflow.  The 'if' below detects the
+		 * overflow, which is then handled by limiting the
+		 * number of blocks to the exact overflow point...
+		 */
+ ctr32 += (u32)blocks;
+ if (ctr32 < blocks) {
+ blocks -= ctr32;
+ ctr32 = 0;
+ }
+ (*func)(in,out,blocks,key,ivec);
+		/* (*func) does not update ivec, the caller does: */
+ PUTU32(ivec+12,ctr32);
+		/* ... overflow was detected, propagate carry. */
+ if (ctr32 == 0) ctr96_inc(ivec);
+ blocks *= 16;
+ len -= blocks;
+ out += blocks;
+ in += blocks;
+ }
+ if (len) {
+ memset(ecount_buf,0,16);
+ (*func)(ecount_buf,ecount_buf,1,key,ivec);
+ ++ctr32;
+ PUTU32(ivec+12,ctr32);
+ if (ctr32 == 0) ctr96_inc(ivec);
+ while (len--) {
+ out[n] = in[n] ^ ecount_buf[n];
+ ++n;
+ }
+ }
+
+ *num=n;
+}
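The overflow clamp in CRYPTO_ctr128_encrypt_ctr32 above is easiest to verify with concrete numbers; a standalone sketch of just that arithmetic, with no OpenSSL calls:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t ctr32  = 0xfffffffeU;   /* low 32 bits of the counter */
        size_t   blocks = 5;             /* blocks the caller asked for */

        ctr32 += (uint32_t)blocks;       /* wraps: 0xfffffffe + 5 -> 3 */
        if (ctr32 < blocks) {            /* wrap detected */
            blocks -= ctr32;             /* only 5 - 3 = 2 blocks this pass */
            ctr32 = 0;                   /* upper 96 bits need a carry */
        }
        printf("%u %lu\n", (unsigned)ctr32, (unsigned long)blocks);  /* 0 2 */
        return 0;
    }

After those two blocks the low 32 bits have wrapped to exactly zero, which is why ctr32 == 0 triggers ctr96_inc in the loop above.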
diff --git a/main/openssl/crypto/modes/gcm128.c b/main/openssl/crypto/modes/gcm128.c
new file mode 100644
index 00000000..250063de
--- /dev/null
+++ b/main/openssl/crypto/modes/gcm128.c
@@ -0,0 +1,1817 @@
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#define OPENSSL_FIPSAPI
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+# define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
+/* redefine, because alignment is ensured */
+#undef GETU32
+#define GETU32(p) BSWAP4(*(const u32 *)(p))
+#undef PUTU32
+#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+#endif
+
+#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
+#define REDUCE1BIT(V) do { \
+ if (sizeof(size_t)==8) { \
+ u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
+ V.lo = (V.hi<<63)|(V.lo>>1); \
+ V.hi = (V.hi>>1 )^T; \
+ } \
+ else { \
+ u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
+ V.lo = (V.hi<<63)|(V.lo>>1); \
+ V.hi = (V.hi>>1 )^((u64)T<<32); \
+ } \
+} while(0)
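For reference, REDUCE1BIT is the single-coefficient step of the reflected-bit GHASH arithmetic: shift the 128-bit V right by one and fold the bit shifted out back in through the top byte. As a sketch of the math (my notation):

    V \;\leftarrow\; (V \gg 1) \;\oplus\; (V \wedge 1)\cdot(\mathtt{0xe1} \ll 120)

which computes V*x modulo x^128 + x^7 + x^2 + x + 1 in GCM's bit order; the 0xe100000000000000 and 0xe1000000 constants above are that same fold on the 64-bit and 32-bit code paths respectively.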
+
+/*
+ * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
+ * should never be set to 8: 8 is effectively reserved for testing
+ * purposes.  TABLE_BITS>1 selects the lookup-table-driven
+ * implementations referred to as "Shoup's" in the GCM specification;
+ * in other words, OpenSSL does not cover the whole spectrum of possible
+ * table-driven implementations.  Why?  In the non-"Shoup's" case the
+ * memory access pattern is segmented in such a manner that it is
+ * trivial to see that cache-timing information can reveal a fair
+ * portion of the intermediate hash value.  Given that the ciphertext is
+ * always available to an attacker, it is possible to attempt to deduce
+ * the secret parameter H and, if successful, to tamper with messages
+ * [which is nothing but trivial in CTR mode].  In the "Shoup's" case it
+ * is not as trivial, but there is no reason to believe that it is
+ * resistant to cache-timing attacks either.  As for the "8-bit"
+ * implementation: it consumes 16 (sixteen) times more memory, 4KB per
+ * individual key + 1KB shared.  On the pro side, it should be twice as
+ * fast as the "4-bit" version, and for gcc-generated x86[_64] code the
+ * "8-bit" version was observed to run ~75% faster, closer to 100% for
+ * commercial compilers...  Yet the "4-bit" procedure is preferred,
+ * because it is believed to provide a better security-performance
+ * balance and adequate all-round performance.  "All-round" refers to
+ * things like:
+ *
+ * - shorter setup time effectively improves overall timing for
+ *   handling short messages;
+ * - larger table allocation can become unbearable because of VM
+ *   subsystem penalties (for example, on Windows a large enough free
+ *   results in VM working-set trimming, meaning that a subsequent
+ *   malloc would immediately incur working-set expansion);
+ * - a larger table has a larger cache footprint, which can affect the
+ *   performance of other code paths (not necessarily even from the
+ *   same thread in a Hyper-Threading world);
+ *
+ * A value of 1 is not appropriate for performance reasons.
+ */
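As a quick arithmetic check on the sizes quoted above, with u128 laid out as two 64-bit halves (as it appears to be defined in modes_lcl.h), the per-key tables come out as claimed; a throwaway sketch with a stand-in type:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t hi, lo; } u128_sketch;  /* stand-in for u128 */

    int main(void)
    {
        /* TABLE_BITS==4: 16 entries;  TABLE_BITS==8: 256 entries */
        printf("%u bytes/key\n", (unsigned)(16  * sizeof(u128_sketch)));  /* 256  */
        printf("%u bytes/key\n", (unsigned)(256 * sizeof(u128_sketch)));  /* 4096 */
        return 0;
    }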
+#if TABLE_BITS==8
+
+static void gcm_init_8bit(u128 Htable[256], u64 H[2])
+{
+ int i, j;
+ u128 V;
+
+ Htable[0].hi = 0;
+ Htable[0].lo = 0;
+ V.hi = H[0];
+ V.lo = H[1];
+
+ for (Htable[128]=V, i=64; i>0; i>>=1) {
+ REDUCE1BIT(V);
+ Htable[i] = V;
+ }
+
+ for (i=2; i<256; i<<=1) {
+ u128 *Hi = Htable+i, H0 = *Hi;
+ for (j=1; j<i; ++j) {
+ Hi[j].hi = H0.hi^Htable[j].hi;
+ Hi[j].lo = H0.lo^Htable[j].lo;
+ }
+ }
+}
+
+static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
+{
+ u128 Z = { 0, 0};
+ const u8 *xi = (const u8 *)Xi+15;
+ size_t rem, n = *xi;
+ const union { long one; char little; } is_endian = {1};
+ static const size_t rem_8bit[256] = {
+ PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
+ PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
+ PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
+ PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
+ PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
+ PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
+ PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
+ PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
+ PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
+ PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
+ PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
+ PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
+ PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
+ PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
+ PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
+ PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
+ PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
+ PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
+ PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
+ PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
+ PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
+ PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
+ PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
+ PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
+ PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
+ PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
+ PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
+ PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
+ PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
+ PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
+ PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
+ PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
+ PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
+ PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
+ PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
+ PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
+ PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
+ PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
+ PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
+ PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
+ PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
+ PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
+ PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
+ PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
+ PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
+ PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
+ PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
+ PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
+ PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
+ PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
+ PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
+ PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
+ PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
+ PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
+ PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
+ PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
+ PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
+ PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
+ PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
+ PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
+ PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
+ PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
+ PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
+ PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
+
+ while (1) {
+ Z.hi ^= Htable[n].hi;
+ Z.lo ^= Htable[n].lo;
+
+ if ((u8 *)Xi==xi) break;
+
+ n = *(--xi);
+
+ rem = (size_t)Z.lo&0xff;
+ Z.lo = (Z.hi<<56)|(Z.lo>>8);
+ Z.hi = (Z.hi>>8);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_8bit[rem];
+ else
+ Z.hi ^= (u64)rem_8bit[rem]<<32;
+ }
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+#else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi>>32); PUTU32(p,v);
+ v = (u32)(Z.hi); PUTU32(p+4,v);
+ v = (u32)(Z.lo>>32); PUTU32(p+8,v);
+ v = (u32)(Z.lo); PUTU32(p+12,v);
+#endif
+ }
+ else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
+
+#elif TABLE_BITS==4
+
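+/*
+ * Populate the 16-entry table analogously to the 8-bit case:
+ * Htable[i] = i*H (with i read as a bit-reversed 4-bit polynomial),
+ * powers of two via REDUCE1BIT and the rest by XOR-combining.
+ */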
+static void gcm_init_4bit(u128 Htable[16], u64 H[2])
+{
+ u128 V;
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+ int i;
+#endif
+
+ Htable[0].hi = 0;
+ Htable[0].lo = 0;
+ V.hi = H[0];
+ V.lo = H[1];
+
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+ for (Htable[8]=V, i=4; i>0; i>>=1) {
+ REDUCE1BIT(V);
+ Htable[i] = V;
+ }
+
+ for (i=2; i<16; i<<=1) {
+ u128 *Hi = Htable+i;
+ int j;
+ for (V=*Hi, j=1; j<i; ++j) {
+ Hi[j].hi = V.hi^Htable[j].hi;
+ Hi[j].lo = V.lo^Htable[j].lo;
+ }
+ }
+#else
+ Htable[8] = V;
+ REDUCE1BIT(V);
+ Htable[4] = V;
+ REDUCE1BIT(V);
+ Htable[2] = V;
+ REDUCE1BIT(V);
+ Htable[1] = V;
+ Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
+ V=Htable[4];
+ Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
+ Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
+ Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
+ V=Htable[8];
+ Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
+ Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
+ Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
+ Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
+ Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
+ Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
+ Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
+#endif
+#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
+ /*
+ * ARM assembler expects specific dword order in Htable.
+ */
+ {
+ int j;
+ const union { long one; char little; } is_endian = {1};
+
+ if (is_endian.little)
+ for (j=0;j<16;++j) {
+ V = Htable[j];
+ Htable[j].hi = V.lo;
+ Htable[j].lo = V.hi;
+ }
+ else
+ for (j=0;j<16;++j) {
+ V = Htable[j];
+ Htable[j].hi = V.lo<<32|V.lo>>32;
+ Htable[j].lo = V.hi<<32|V.hi>>32;
+ }
+ }
+#endif
+}
+
+#ifndef GHASH_ASM
+static const size_t rem_4bit[16] = {
+ PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
+ PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
+ PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
+ PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
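+
+/*
+ * rem_4bit[r] is the pre-reduced contribution of the 4-bit value r
+ * shifted out of Z during a nibble step; PACK() pre-positions the
+ * 16-bit constant in the top bits of a size_t so that the XOR below
+ * lands in the top of Z.hi.
+ */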
+
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
+{
+ u128 Z;
+ int cnt = 15;
+ size_t rem, nlo, nhi;
+ const union { long one; char little; } is_endian = {1};
+
+ nlo = ((const u8 *)Xi)[15];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ Z.hi = Htable[nlo].hi;
+ Z.lo = Htable[nlo].lo;
+
+ while (1) {
+ rem = (size_t)Z.lo&0xf;
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+
+ if (--cnt<0) break;
+
+ nlo = ((const u8 *)Xi)[cnt];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ rem = (size_t)Z.lo&0xf;
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+ }
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+#else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi>>32); PUTU32(p,v);
+ v = (u32)(Z.hi); PUTU32(p+4,v);
+ v = (u32)(Z.lo>>32); PUTU32(p+8,v);
+ v = (u32)(Z.lo); PUTU32(p+12,v);
+#endif
+ }
+ else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+/*
+ * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
+ * for details... Compiler-generated code doesn't seem to give any
+ * performance improvement, at least not on x86[_64]. It's here mostly
+ * as a reference and a placeholder for possible future non-trivial
+ * optimization[s]...
+ */
+static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len)
+{
+ u128 Z;
+ int cnt;
+ size_t rem, nlo, nhi;
+ const union { long one; char little; } is_endian = {1};
+
+#if 1
+ do {
+ cnt = 15;
+ nlo = ((const u8 *)Xi)[15];
+ nlo ^= inp[15];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ Z.hi = Htable[nlo].hi;
+ Z.lo = Htable[nlo].lo;
+
+ while (1) {
+ rem = (size_t)Z.lo&0xf;
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+
+ if (--cnt<0) break;
+
+ nlo = ((const u8 *)Xi)[cnt];
+ nlo ^= inp[cnt];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ rem = (size_t)Z.lo&0xf;
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+ if (sizeof(size_t)==8)
+ Z.hi ^= rem_4bit[rem];
+ else
+ Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+ }
+#else
+	/*
+	 * An extra 256+16 bytes per key plus 512 bytes of shared tables
+	 * [should] give ~50% improvement... One could have PACK()-ed
+	 * rem_8bit even here, but the priority is to minimize the
+	 * cache footprint...
+	 */
+ u128 Hshr4[16]; /* Htable shifted right by 4 bits */
+ u8 Hshl4[16]; /* Htable shifted left by 4 bits */
+ static const unsigned short rem_8bit[256] = {
+ 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
+ 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
+ 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
+ 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
+ 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
+ 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
+ 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
+ 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
+ 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
+ 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
+ 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
+ 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
+ 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
+ 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
+ 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
+ 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
+ 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
+ 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
+ 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
+ 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
+ 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
+ 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
+ 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
+ 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
+ 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
+ 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
+ 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
+ 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
+ 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
+ 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
+ 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
+ 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
+	/*
+	 * This pre-processing phase slows the procedure down by
+	 * approximately as much time as it saves on each loop iteration.
+	 * In other words, single-block performance is approximately the
+	 * same as for the straightforward "4-bit" implementation, and
+	 * beyond one block it only gets faster...
+	 */
+ for (cnt=0; cnt<16; ++cnt) {
+ Z.hi = Htable[cnt].hi;
+ Z.lo = Htable[cnt].lo;
+ Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
+ Hshr4[cnt].hi = (Z.hi>>4);
+ Hshl4[cnt] = (u8)(Z.lo<<4);
+ }
+
+ do {
+ for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
+ nlo = ((const u8 *)Xi)[cnt];
+ nlo ^= inp[cnt];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+
+ rem = (size_t)Z.lo&0xff;
+
+ Z.lo = (Z.hi<<56)|(Z.lo>>8);
+ Z.hi = (Z.hi>>8);
+
+ Z.hi ^= Hshr4[nhi].hi;
+ Z.lo ^= Hshr4[nhi].lo;
+ Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
+ }
+
+ nlo = ((const u8 *)Xi)[0];
+ nlo ^= inp[0];
+ nhi = nlo>>4;
+ nlo &= 0xf;
+
+ Z.hi ^= Htable[nlo].hi;
+ Z.lo ^= Htable[nlo].lo;
+
+ rem = (size_t)Z.lo&0xf;
+
+ Z.lo = (Z.hi<<60)|(Z.lo>>4);
+ Z.hi = (Z.hi>>4);
+
+ Z.hi ^= Htable[nhi].hi;
+ Z.lo ^= Htable[nhi].lo;
+ Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
+#endif
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+#else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi>>32); PUTU32(p,v);
+ v = (u32)(Z.hi); PUTU32(p+4,v);
+ v = (u32)(Z.lo>>32); PUTU32(p+8,v);
+ v = (u32)(Z.lo); PUTU32(p+12,v);
+#endif
+ }
+ else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+ } while (inp+=16, len-=16);
+}
+#endif
+#else
+void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
+
+#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
+#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
+#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+/* GHASH_CHUNK is a "stride" parameter intended to mitigate cache-
+ * thrashing effects. In other words, the idea is to hash data while
+ * it's still in the L1 cache after the encryption pass... */
+#define GHASH_CHUNK (3*1024)
+#endif
+
+#else /* TABLE_BITS */
+
+static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
+{
+ u128 V,Z = { 0,0 };
+ long X;
+ int i,j;
+ const long *xi = (const long *)Xi;
+ const union { long one; char little; } is_endian = {1};
+
+ V.hi = H[0]; /* H is in host byte order, no byte swapping */
+ V.lo = H[1];
+
+ for (j=0; j<16/sizeof(long); ++j) {
+ if (is_endian.little) {
+ if (sizeof(long)==8) {
+#ifdef BSWAP8
+ X = (long)(BSWAP8(xi[j]));
+#else
+ const u8 *p = (const u8 *)(xi+j);
+ X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
+#endif
+ }
+ else {
+ const u8 *p = (const u8 *)(xi+j);
+ X = (long)GETU32(p);
+ }
+ }
+ else
+ X = xi[j];
+
+ for (i=0; i<8*sizeof(long); ++i, X<<=1) {
+ u64 M = (u64)(X>>(8*sizeof(long)-1));
+ Z.hi ^= V.hi&M;
+ Z.lo ^= V.lo&M;
+
+ REDUCE1BIT(V);
+ }
+ }
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ Xi[0] = BSWAP8(Z.hi);
+ Xi[1] = BSWAP8(Z.lo);
+#else
+ u8 *p = (u8 *)Xi;
+ u32 v;
+ v = (u32)(Z.hi>>32); PUTU32(p,v);
+ v = (u32)(Z.hi); PUTU32(p+4,v);
+ v = (u32)(Z.lo>>32); PUTU32(p+8,v);
+ v = (u32)(Z.lo); PUTU32(p+12,v);
+#endif
+ }
+ else {
+ Xi[0] = Z.hi;
+ Xi[1] = Z.lo;
+ }
+}
+#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
+
+#endif
+
+#if TABLE_BITS==4 && defined(GHASH_ASM)
+# if !defined(I386_ONLY) && \
+ (defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+# define GHASH_ASM_X86_OR_64
+# define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# define GHASH_ASM_X86
+void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+
+void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+# endif
+# elif defined(__arm__) || defined(__arm)
+# include "arm_arch.h"
+# if __ARM_ARCH__>=7
+# define GHASH_ASM_ARM
+# define GCM_FUNCREF_4BIT
+void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+# endif
+# endif
+#endif
+
+#ifdef GCM_FUNCREF_4BIT
+# undef GCM_MUL
+# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
+# ifdef GHASH
+# undef GHASH
+# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
+# endif
+#endif
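+
+/* With GCM_FUNCREF_4BIT the gmult/ghash implementations are selected
+ * at run time in CRYPTO_gcm128_init and reached through local function
+ * pointers that each caller loads from the context. */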
+
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
+{
+ const union { long one; char little; } is_endian = {1};
+
+ memset(ctx,0,sizeof(*ctx));
+ ctx->block = block;
+ ctx->key = key;
+
+ (*block)(ctx->H.c,ctx->H.c,key);
+
+ if (is_endian.little) {
+ /* H is stored in host byte order */
+#ifdef BSWAP8
+ ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
+ ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
+#else
+ u8 *p = ctx->H.c;
+ u64 hi,lo;
+ hi = (u64)GETU32(p) <<32|GETU32(p+4);
+ lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
+ ctx->H.u[0] = hi;
+ ctx->H.u[1] = lo;
+#endif
+ }
+
+#if TABLE_BITS==8
+ gcm_init_8bit(ctx->Htable,ctx->H.u);
+#elif TABLE_BITS==4
+# if defined(GHASH_ASM_X86_OR_64)
+# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
+ if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
+ OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
+ gcm_init_clmul(ctx->Htable,ctx->H.u);
+ ctx->gmult = gcm_gmult_clmul;
+ ctx->ghash = gcm_ghash_clmul;
+ return;
+ }
+# endif
+ gcm_init_4bit(ctx->Htable,ctx->H.u);
+# if defined(GHASH_ASM_X86) /* x86 only */
+# if defined(OPENSSL_IA32_SSE2)
+ if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
+# else
+ if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
+# endif
+ ctx->gmult = gcm_gmult_4bit_mmx;
+ ctx->ghash = gcm_ghash_4bit_mmx;
+ } else {
+ ctx->gmult = gcm_gmult_4bit_x86;
+ ctx->ghash = gcm_ghash_4bit_x86;
+ }
+# else
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+# endif
+# elif defined(GHASH_ASM_ARM)
+ if (OPENSSL_armcap_P & ARMV7_NEON) {
+ ctx->gmult = gcm_gmult_neon;
+ ctx->ghash = gcm_ghash_neon;
+ } else {
+ gcm_init_4bit(ctx->Htable,ctx->H.u);
+ ctx->gmult = gcm_gmult_4bit;
+ ctx->ghash = gcm_ghash_4bit;
+ }
+# else
+ gcm_init_4bit(ctx->Htable,ctx->H.u);
+# endif
+#endif
+}
+
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int ctr;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+#endif
+
+ ctx->Yi.u[0] = 0;
+ ctx->Yi.u[1] = 0;
+ ctx->Xi.u[0] = 0;
+ ctx->Xi.u[1] = 0;
+ ctx->len.u[0] = 0; /* AAD length */
+ ctx->len.u[1] = 0; /* message length */
+ ctx->ares = 0;
+ ctx->mres = 0;
+
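+	/* Per the GCM specification, a 96-bit IV is used directly as Yi
+	 * with the 32-bit counter set to 1; any other IV length is
+	 * GHASH-ed into Yi together with its bit length. */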
+ if (len==12) {
+ memcpy(ctx->Yi.c,iv,12);
+ ctx->Yi.c[15]=1;
+ ctr=1;
+ }
+ else {
+ size_t i;
+ u64 len0 = len;
+
+ while (len>=16) {
+ for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
+ GCM_MUL(ctx,Yi);
+ iv += 16;
+ len -= 16;
+ }
+ if (len) {
+ for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
+ GCM_MUL(ctx,Yi);
+ }
+ len0 <<= 3;
+ if (is_endian.little) {
+#ifdef BSWAP8
+ ctx->Yi.u[1] ^= BSWAP8(len0);
+#else
+ ctx->Yi.c[8] ^= (u8)(len0>>56);
+ ctx->Yi.c[9] ^= (u8)(len0>>48);
+ ctx->Yi.c[10] ^= (u8)(len0>>40);
+ ctx->Yi.c[11] ^= (u8)(len0>>32);
+ ctx->Yi.c[12] ^= (u8)(len0>>24);
+ ctx->Yi.c[13] ^= (u8)(len0>>16);
+ ctx->Yi.c[14] ^= (u8)(len0>>8);
+ ctx->Yi.c[15] ^= (u8)(len0);
+#endif
+ }
+ else
+ ctx->Yi.u[1] ^= len0;
+
+ GCM_MUL(ctx,Yi);
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+ }
+
+ (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+}
+
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
+{
+ size_t i;
+ unsigned int n;
+ u64 alen = ctx->len.u[0];
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+ if (ctx->len.u[1]) return -2;
+
+ alen += len;
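+	/* GCM limits AAD to 2^64 bits, i.e. 2^61 bytes; the second test
+	 * below catches wrap-around of the 64-bit byte counter */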
+ if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
+ return -1;
+ ctx->len.u[0] = alen;
+
+ n = ctx->ares;
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(aad++);
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL(ctx,Xi);
+ else {
+ ctx->ares = n;
+ return 0;
+ }
+ }
+
+#ifdef GHASH
+ if ((i = (len&(size_t)-16))) {
+ GHASH(ctx,aad,i);
+ aad += i;
+ len -= i;
+ }
+#else
+ while (len>=16) {
+ for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
+ GCM_MUL(ctx,Xi);
+ aad += 16;
+ len -= 16;
+ }
+#endif
+ if (len) {
+ n = (unsigned int)len;
+ for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
+ }
+
+ ctx->ares = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+#if 0
+ n = (unsigned int)mlen%16; /* alternative to ctx->mres */
+#endif
+ mlen += len;
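+	/* GCM's per-invocation plaintext limit is 2^39-256 bits, i.e.
+	 * 2^36-32 bytes; the second test catches 64-bit wrap-around */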
+ if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to encrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx,Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16%sizeof(size_t) == 0) do { /* always true actually */
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL(ctx,Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+#if defined(STRICT_ALIGNMENT)
+ if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
+ break;
+#endif
+#if defined(GHASH) && defined(GHASH_CHUNK)
+ while (len>=GHASH_CHUNK) {
+ size_t j=GHASH_CHUNK;
+
+ while (j) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ j -= 16;
+ }
+ GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
+ len -= GHASH_CHUNK;
+ }
+ if ((i = (len&(size_t)-16))) {
+ size_t j=i;
+
+ while (len>=16) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+ GHASH(ctx,out-j,j);
+ }
+#else
+ while (len>=16) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ ctx->Xi.t[i] ^=
+ out_t[i] = in_t[i]^ctx->EKi.t[i];
+ GCM_MUL(ctx,Xi);
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+#endif
+ if (len) {
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+ } while(0);
+#endif
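+	/* bytewise fallback: used by OPENSSL_SMALL_FOOTPRINT builds and
+	 * when in/out are not word-aligned on strict-alignment targets */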
+ for (i=0;i<len;++i) {
+ if (n==0) {
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ }
+ ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
+ n = (n+1)%16;
+ if (n==0)
+ GCM_MUL(ctx,Xi);
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ block128_f block = ctx->block;
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+ mlen += len;
+ if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to decrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx,Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+ if (16%sizeof(size_t) == 0) do { /* always true actually */
+ if (n) {
+ while (n && len) {
+ u8 c = *(in++);
+ *(out++) = c^ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL (ctx,Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+#if defined(STRICT_ALIGNMENT)
+ if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
+ break;
+#endif
+#if defined(GHASH) && defined(GHASH_CHUNK)
+ while (len>=GHASH_CHUNK) {
+ size_t j=GHASH_CHUNK;
+
+ GHASH(ctx,in,GHASH_CHUNK);
+ while (j) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ out_t[i] = in_t[i]^ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ j -= 16;
+ }
+ len -= GHASH_CHUNK;
+ }
+ if ((i = (len&(size_t)-16))) {
+ GHASH(ctx,in,i);
+ while (len>=16) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ for (i=0; i<16/sizeof(size_t); ++i)
+ out_t[i] = in_t[i]^ctx->EKi.t[i];
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+ }
+#else
+ while (len>=16) {
+ size_t *out_t=(size_t *)out;
+ const size_t *in_t=(const size_t *)in;
+
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+			for (i=0; i<16/sizeof(size_t); ++i) {
+				size_t c = in_t[i];
+				out_t[i] = c^ctx->EKi.t[i];
+				ctx->Xi.t[i] ^= c;
+			}
+ GCM_MUL(ctx,Xi);
+ out += 16;
+ in += 16;
+ len -= 16;
+ }
+#endif
+ if (len) {
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ u8 c = in[n];
+ ctx->Xi.c[n] ^= c;
+ out[n] = c^ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+ } while(0);
+#endif
+ for (i=0;i<len;++i) {
+ u8 c;
+ if (n==0) {
+ (*block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ }
+ c = in[i];
+ out[i] = c^ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ n = (n+1)%16;
+ if (n==0)
+ GCM_MUL(ctx,Xi);
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len, ctr128_f stream)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+ mlen += len;
+ if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to encrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx,Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+ if (n) {
+ while (n && len) {
+ ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL(ctx,Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+ while (len>=GHASH_CHUNK) {
+ (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
+ ctr += GHASH_CHUNK/16;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ GHASH(ctx,out,GHASH_CHUNK);
+ out += GHASH_CHUNK;
+ in += GHASH_CHUNK;
+ len -= GHASH_CHUNK;
+ }
+#endif
+ if ((i = (len&(size_t)-16))) {
+ size_t j=i/16;
+
+ (*stream)(in,out,j,key,ctx->Yi.c);
+ ctr += (unsigned int)j;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ in += i;
+ len -= i;
+#if defined(GHASH)
+ GHASH(ctx,out,i);
+ out += i;
+#else
+ while (j--) {
+ for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
+ GCM_MUL(ctx,Xi);
+ out += 16;
+ }
+#endif
+ }
+ if (len) {
+ (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
+ const unsigned char *in, unsigned char *out,
+ size_t len,ctr128_f stream)
+{
+ const union { long one; char little; } is_endian = {1};
+ unsigned int n, ctr;
+ size_t i;
+ u64 mlen = ctx->len.u[1];
+ void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+# ifdef GHASH
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx->ghash;
+# endif
+#endif
+
+ mlen += len;
+ if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+ return -1;
+ ctx->len.u[1] = mlen;
+
+ if (ctx->ares) {
+ /* First call to decrypt finalizes GHASH(AAD) */
+ GCM_MUL(ctx,Xi);
+ ctx->ares = 0;
+ }
+
+ if (is_endian.little)
+ ctr = GETU32(ctx->Yi.c+12);
+ else
+ ctr = ctx->Yi.d[3];
+
+ n = ctx->mres;
+ if (n) {
+ while (n && len) {
+ u8 c = *(in++);
+ *(out++) = c^ctx->EKi.c[n];
+ ctx->Xi.c[n] ^= c;
+ --len;
+ n = (n+1)%16;
+ }
+ if (n==0) GCM_MUL (ctx,Xi);
+ else {
+ ctx->mres = n;
+ return 0;
+ }
+ }
+#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+ while (len>=GHASH_CHUNK) {
+ GHASH(ctx,in,GHASH_CHUNK);
+ (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
+ ctr += GHASH_CHUNK/16;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ out += GHASH_CHUNK;
+ in += GHASH_CHUNK;
+ len -= GHASH_CHUNK;
+ }
+#endif
+ if ((i = (len&(size_t)-16))) {
+ size_t j=i/16;
+
+#if defined(GHASH)
+ GHASH(ctx,in,i);
+#else
+ while (j--) {
+ size_t k;
+ for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
+ GCM_MUL(ctx,Xi);
+ in += 16;
+ }
+ j = i/16;
+ in -= i;
+#endif
+ (*stream)(in,out,j,key,ctx->Yi.c);
+ ctr += (unsigned int)j;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ out += i;
+ in += i;
+ len -= i;
+ }
+ if (len) {
+ (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
+ ++ctr;
+ if (is_endian.little)
+ PUTU32(ctx->Yi.c+12,ctr);
+ else
+ ctx->Yi.d[3] = ctr;
+ while (len--) {
+ u8 c = in[n];
+ ctx->Xi.c[n] ^= c;
+ out[n] = c^ctx->EKi.c[n];
+ ++n;
+ }
+ }
+
+ ctx->mres = n;
+ return 0;
+}
+
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
+ size_t len)
+{
+ const union { long one; char little; } is_endian = {1};
+ u64 alen = ctx->len.u[0]<<3;
+ u64 clen = ctx->len.u[1]<<3;
+#ifdef GCM_FUNCREF_4BIT
+ void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
+#endif
+
+ if (ctx->mres || ctx->ares)
+ GCM_MUL(ctx,Xi);
+
+ if (is_endian.little) {
+#ifdef BSWAP8
+ alen = BSWAP8(alen);
+ clen = BSWAP8(clen);
+#else
+ u8 *p = ctx->len.c;
+
+ ctx->len.u[0] = alen;
+ ctx->len.u[1] = clen;
+
+ alen = (u64)GETU32(p) <<32|GETU32(p+4);
+ clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
+#endif
+ }
+
+ ctx->Xi.u[0] ^= alen;
+ ctx->Xi.u[1] ^= clen;
+ GCM_MUL(ctx,Xi);
+
+ ctx->Xi.u[0] ^= ctx->EK0.u[0];
+ ctx->Xi.u[1] ^= ctx->EK0.u[1];
+
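+	/* note: memcmp below is not a constant-time comparison; callers
+	 * comparing secret tags may prefer a constant-time alternative */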
+ if (tag && len<=sizeof(ctx->Xi))
+ return memcmp(ctx->Xi.c,tag,len);
+ else
+ return -1;
+}
+
+void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+ CRYPTO_gcm128_finish(ctx, NULL, 0);
+ memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
+}
+
+GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
+{
+ GCM128_CONTEXT *ret;
+
+ if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
+ CRYPTO_gcm128_init(ret,key,block);
+
+ return ret;
+}
+
+void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
+{
+ if (ctx) {
+ OPENSSL_cleanse(ctx,sizeof(*ctx));
+ OPENSSL_free(ctx);
+ }
+}
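+
+/*
+ * Minimal usage sketch of the one-shot API above (not part of the
+ * library; key/iv/aad/pt/ct/tag and their lengths are placeholder
+ * variables, and AES from <openssl/aes.h> stands in for any
+ * block128_f-compatible cipher, as in the SELFTEST code below):
+ *
+ *	AES_KEY aes;
+ *	GCM128_CONTEXT ctx;
+ *
+ *	AES_set_encrypt_key(key, 128, &aes);
+ *	CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);
+ *	CRYPTO_gcm128_setiv(&ctx, iv, 12);
+ *	CRYPTO_gcm128_aad(&ctx, aad, aad_len);
+ *	CRYPTO_gcm128_encrypt(&ctx, pt, ct, pt_len);
+ *	CRYPTO_gcm128_tag(&ctx, tag, 16);	/* or _finish() to verify */
+ */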
+
+#if defined(SELFTEST)
+#include <stdio.h>
+#include <openssl/aes.h>
+
+/* Test Case 1 */
+static const u8 K1[16],
+ *P1=NULL,
+ *A1=NULL,
+ IV1[12],
+ *C1=NULL,
+ T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
+
+/* Test Case 2 */
+#define K2 K1
+#define A2 A1
+#define IV2 IV1
+static const u8 P2[16],
+ C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
+ T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
+
+/* Test Case 3 */
+#define A3 A2
+static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
+ P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+ IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+ C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+ 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+ 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+ 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
+ T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
+
+/* Test Case 4 */
+#define K4 K3
+#define IV4 IV3
+static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+ A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+ 0xab,0xad,0xda,0xd2},
+ C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+ 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+ 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+ 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
+ T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
+
+/* Test Case 5 */
+#define K5 K4
+#define P5 P4
+#define A5 A4
+static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+ C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
+ 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
+ 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
+ 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
+ T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
+
+/* Test Case 6 */
+#define K6 K5
+#define P6 P5
+#define A6 A5
+static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+ 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+ 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+ 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+ C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
+ 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
+ 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
+ 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
+ T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
+
+/* Test Case 7 */
+static const u8 K7[24],
+ *P7=NULL,
+ *A7=NULL,
+ IV7[12],
+ *C7=NULL,
+ T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
+
+/* Test Case 8 */
+#define K8 K7
+#define IV8 IV7
+#define A8 A7
+static const u8 P8[16],
+ C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
+ T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
+
+/* Test Case 9 */
+#define A9 A8
+static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+ 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
+ P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+ IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+ C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+ 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+ 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+ 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
+ T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
+
+/* Test Case 10 */
+#define K10 K9
+#define IV10 IV9
+static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+ A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+ 0xab,0xad,0xda,0xd2},
+ C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+ 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+ 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+ 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
+ T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
+
+/* Test Case 11 */
+#define K11 K10
+#define P11 P10
+#define A11 A10
+static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+ C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
+ 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
+ 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
+ 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
+ T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
+
+/* Test Case 12 */
+#define K12 K11
+#define P12 P11
+#define A12 A11
+static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+ 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+ 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+ 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+ C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
+ 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
+ 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
+ 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
+ T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
+
+/* Test Case 13 */
+static const u8 K13[32],
+ *P13=NULL,
+ *A13=NULL,
+ IV13[12],
+ *C13=NULL,
+ T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
+
+/* Test Case 14 */
+#define K14 K13
+#define A14 A13
+static const u8 P14[16],
+ IV14[12],
+ C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
+ T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
+
+/* Test Case 15 */
+#define A15 A14
+static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+ 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
+ P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+ IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+ C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+ 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+ 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+ 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
+ T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
+
+/* Test Case 16 */
+#define K16 K15
+#define IV16 IV15
+static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+ A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+ 0xab,0xad,0xda,0xd2},
+ C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+ 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+ 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+ 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
+ T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
+
+/* Test Case 17 */
+#define K17 K16
+#define P17 P16
+#define A17 A16
+static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+ C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
+ 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
+ 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
+ 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
+ T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
+
+/* Test Case 18 */
+#define K18 K17
+#define P18 P17
+#define A18 A17
+static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+ 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+ 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+ 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+ C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
+ 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
+ 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
+ 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
+ T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
+
+/* Test Case 19 */
+#define K19 K1
+#define P19 P1
+#define IV19 IV1
+#define C19 C1
+static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+ 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+ 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+ 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
+ 0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+ 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+ 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+ 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
+ T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
+
+/* Test Case 20 */
+#define K20 K1
+#define A20 A1
+static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
+ P20[288],
+ C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
+ 0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
+ 0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
+ 0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
+ 0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
+ 0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
+ 0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
+ 0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
+ 0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
+ 0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
+ 0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
+ 0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
+ 0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
+ 0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
+ 0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
+ 0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
+ 0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
+ 0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
+ T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
+
+#define TEST_CASE(n) do { \
+ u8 out[sizeof(P##n)]; \
+ AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
+ CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
+ CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
+ memset(out,0,sizeof(out)); \
+ if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
+ if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
+ if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
+ (C##n && memcmp(out,C##n,sizeof(out)))) \
+ ret++, printf ("encrypt test#%d failed.\n",n); \
+ CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
+ memset(out,0,sizeof(out)); \
+ if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
+ if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
+ if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
+ (P##n && memcmp(out,P##n,sizeof(out)))) \
+ ret++, printf ("decrypt test#%d failed.\n",n); \
+ } while(0)
+
+int main()
+{
+ GCM128_CONTEXT ctx;
+ AES_KEY key;
+ int ret=0;
+
+ TEST_CASE(1);
+ TEST_CASE(2);
+ TEST_CASE(3);
+ TEST_CASE(4);
+ TEST_CASE(5);
+ TEST_CASE(6);
+ TEST_CASE(7);
+ TEST_CASE(8);
+ TEST_CASE(9);
+ TEST_CASE(10);
+ TEST_CASE(11);
+ TEST_CASE(12);
+ TEST_CASE(13);
+ TEST_CASE(14);
+ TEST_CASE(15);
+ TEST_CASE(16);
+ TEST_CASE(17);
+ TEST_CASE(18);
+ TEST_CASE(19);
+ TEST_CASE(20);
+
+#ifdef OPENSSL_CPUID_OBJ
+ {
+ size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
+ union { u64 u; u8 c[1024]; } buf;
+ int i;
+
+ AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
+ CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
+ CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
+
+ CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
+ start = OPENSSL_rdtsc();
+ CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
+ gcm_t = OPENSSL_rdtsc() - start;
+
+ CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
+ &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
+ (block128_f)AES_encrypt);
+ start = OPENSSL_rdtsc();
+ CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
+ &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
+ (block128_f)AES_encrypt);
+ ctr_t = OPENSSL_rdtsc() - start;
+
+ printf("%.2f-%.2f=%.2f\n",
+ gcm_t/(double)sizeof(buf),
+ ctr_t/(double)sizeof(buf),
+ (gcm_t-ctr_t)/(double)sizeof(buf));
+#ifdef GHASH
+ {
+ void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len) = ctx.ghash;
+
+ GHASH((&ctx),buf.c,sizeof(buf));
+ start = OPENSSL_rdtsc();
+ for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
+ gcm_t = OPENSSL_rdtsc() - start;
+ printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
+ }
+#endif
+ }
+#endif
+
+ return ret;
+}
+#endif
diff --git a/main/openssl/crypto/modes/modes_lcl.h b/main/openssl/crypto/modes/modes_lcl.h
new file mode 100644
index 00000000..9d83e128
--- /dev/null
+++ b/main/openssl/crypto/modes/modes_lcl.h
@@ -0,0 +1,134 @@
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use is governed by OpenSSL license.
+ * ====================================================================
+ */
+
+#include <openssl/modes.h>
+
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+typedef __int64 i64;
+typedef unsigned __int64 u64;
+#define U64(C) C##UI64
+#elif defined(__arch64__)
+typedef long i64;
+typedef unsigned long u64;
+#define U64(C) C##UL
+#else
+typedef long long i64;
+typedef unsigned long long u64;
+#define U64(C) C##ULL
+#endif
+
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#define STRICT_ALIGNMENT 1
+#if defined(__i386) || defined(__i386__) || \
+ defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
+ defined(__s390__) || defined(__s390x__)
+# undef STRICT_ALIGNMENT
+#endif
+
+#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+#if defined(__GNUC__) && __GNUC__>=2
+# if defined(__x86_64) || defined(__x86_64__)
+# define BSWAP8(x) ({ u64 ret=(x); \
+ asm ("bswapq %0" \
+ : "+r"(ret)); ret; })
+# define BSWAP4(x) ({ u32 ret=(x); \
+ asm ("bswapl %0" \
+ : "+r"(ret)); ret; })
+# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
+# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
+ asm ("bswapl %0; bswapl %1" \
+ : "+r"(hi),"+r"(lo)); \
+ (u64)hi<<32|lo; })
+# define BSWAP4(x) ({ u32 ret=(x); \
+ asm ("bswapl %0" \
+ : "+r"(ret)); ret; })
+# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
+# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
+ asm ("rev %0,%0; rev %1,%1" \
+ : "+r"(hi),"+r"(lo)); \
+ (u64)hi<<32|lo; })
+# define BSWAP4(x) ({ u32 ret; \
+ asm ("rev %0,%1" \
+ : "=r"(ret) : "r"((u32)(x))); \
+ ret; })
+# endif
+#elif defined(_MSC_VER)
+# if _MSC_VER>=1300
+# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
+# define BSWAP8(x) _byteswap_uint64((u64)(x))
+# define BSWAP4(x) _byteswap_ulong((u32)(x))
+# elif defined(_M_IX86)
+ __inline u32 _bswap4(u32 val) {
+ _asm mov eax,val
+ _asm bswap eax
+ }
+# define BSWAP4(x) _bswap4(x)
+# endif
+#endif
+#endif
+
+#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
+#define GETU32(p) BSWAP4(*(const u32 *)(p))
+#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+#else
+#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+#endif
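+/*
+ * GETU32/PUTU32 load/store a 32-bit value in big-endian order; when
+ * unaligned word access is known to be safe, a single byte-swapped
+ * word access replaces the four byte operations.
+ */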
+
+/* GCM definitions */
+
+typedef struct { u64 hi,lo; } u128;
+
+#ifdef TABLE_BITS
+#undef TABLE_BITS
+#endif
+/*
+ * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
+ * never be set to 8 [or 1]. For further information see gcm128.c.
+ */
+#define TABLE_BITS 4
+
+struct gcm128_context {
+	/* The following 6 names follow the names in the GCM specification */
+ union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; }
+ Yi,EKi,EK0,len,Xi,H;
+	/* The relative positions of Xi, H and the pre-computed Htable are
+	 * relied upon by some assembler modules, i.e. don't change the order! */
+#if TABLE_BITS==8
+ u128 Htable[256];
+#else
+ u128 Htable[16];
+ void (*gmult)(u64 Xi[2],const u128 Htable[16]);
+ void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
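+	/* ares/mres: bytes of a partial AAD/message block processed so far */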
+ unsigned int mres, ares;
+ block128_f block;
+ void *key;
+};
+
+struct xts128_context {
+ void *key1, *key2;
+ block128_f block1,block2;
+};
+
+struct ccm128_context {
+ union { u64 u[2]; u8 c[16]; } nonce, cmac;
+ u64 blocks;
+ block128_f block;
+ void *key;
+};
+
diff --git a/main/openssl/crypto/modes/ofb128.c b/main/openssl/crypto/modes/ofb128.c
index c732e2ec..01c01702 100644
--- a/main/openssl/crypto/modes/ofb128.c
+++ b/main/openssl/crypto/modes/ofb128.c
@@ -48,7 +48,8 @@
*
*/
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
#include <string.h>
#ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
#endif
#include <assert.h>
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
- defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
- defined(__s390__) || defined(__s390x__)
-# undef STRICT_ALIGNMENT
-#endif
-
/* The input and output encrypted as though 128bit ofb mode is being
* used. The extra state information to record how much of the
* 128bit block we have used is contained in *num;
diff --git a/main/openssl/crypto/modes/xts128.c b/main/openssl/crypto/modes/xts128.c
new file mode 100644
index 00000000..9cf27a25
--- /dev/null
+++ b/main/openssl/crypto/modes/xts128.c
@@ -0,0 +1,191 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+# define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
+ const unsigned char *inp, unsigned char *out,
+ size_t len, int enc)
+{
+ const union { long one; char little; } is_endian = {1};
+ union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
+ unsigned int i;
+
+ if (len<16) return -1;
+
+ memcpy(tweak.c, iv, 16);
+
+ (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
+
+ if (!enc && (len%16)) len-=16;
+
+ while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+ memcpy(scratch.c,inp,16);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+#else
+ scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
+ scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
+#endif
+ (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out,scratch.c,16);
+#else
+ ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
+ ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
+#endif
+ inp += 16;
+ out += 16;
+ len -= 16;
+
+ if (len==0) return 0;
+
+ if (is_endian.little) {
+ unsigned int carry,res;
+
+ res = 0x87&(((int)tweak.d[3])>>31);
+ carry = (unsigned int)(tweak.u[0]>>63);
+ tweak.u[0] = (tweak.u[0]<<1)^res;
+ tweak.u[1] = (tweak.u[1]<<1)|carry;
+ }
+ else {
+ size_t c;
+
+ for (c=0,i=0;i<16;++i) {
+			/* '+' substitutes for '|', because c is 1 bit */
+ c += ((size_t)tweak.c[i])<<1;
+ tweak.c[i] = (u8)c;
+ c = c>>8;
+ }
+ tweak.c[0] ^= (u8)(0x87&(0-c));
+ }
+ }
+ if (enc) {
+ for (i=0;i<len;++i) {
+ u8 c = inp[i];
+ out[i] = scratch.c[i];
+ scratch.c[i] = c;
+ }
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy(out-16,scratch.c,16);
+ }
+ else {
+ union { u64 u[2]; u8 c[16]; } tweak1;
+
+ if (is_endian.little) {
+ unsigned int carry,res;
+
+ res = 0x87&(((int)tweak.d[3])>>31);
+ carry = (unsigned int)(tweak.u[0]>>63);
+ tweak1.u[0] = (tweak.u[0]<<1)^res;
+ tweak1.u[1] = (tweak.u[1]<<1)|carry;
+ }
+ else {
+ size_t c;
+
+ for (c=0,i=0;i<16;++i) {
+			/* '+' substitutes for '|', because c is 1 bit */
+ c += ((size_t)tweak.c[i])<<1;
+ tweak1.c[i] = (u8)c;
+ c = c>>8;
+ }
+ tweak1.c[0] ^= (u8)(0x87&(0-c));
+ }
+#if defined(STRICT_ALIGNMENT)
+ memcpy(scratch.c,inp,16);
+ scratch.u[0] ^= tweak1.u[0];
+ scratch.u[1] ^= tweak1.u[1];
+#else
+ scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
+ scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
+#endif
+ (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+ scratch.u[0] ^= tweak1.u[0];
+ scratch.u[1] ^= tweak1.u[1];
+
+ for (i=0;i<len;++i) {
+ u8 c = inp[16+i];
+ out[16+i] = scratch.c[i];
+ scratch.c[i] = c;
+ }
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+ scratch.u[0] ^= tweak.u[0];
+ scratch.u[1] ^= tweak.u[1];
+ memcpy (out,scratch.c,16);
+#else
+ ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
+ ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
+#endif
+ }
+
+ return 0;
+}
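The two tweak-update loops above multiply the tweak by x in GF(2^128): the left shift doubles it, and the 0x87 constant folds the carry back in via the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1. As a minimal, hypothetical sketch (not part of this patch), the new routine can be driven with plain AES block functions; the XTS128_CONTEXT layout (key1/key2 plus two block128_f callbacks) comes from modes.h, and the function name below is illustrative only:

#include <openssl/aes.h>
#include <openssl/modes.h>

/* Sketch: AES-128-XTS encryption of one data unit. key is 32 bytes,
 * split into the data key (key1) and the tweak key (key2). */
static int xts_aes128_encrypt_sketch(const unsigned char key[32],
                                     const unsigned char iv[16],
                                     const unsigned char *in,
                                     unsigned char *out, size_t len)
{
	AES_KEY k1, k2;
	XTS128_CONTEXT ctx;

	if (AES_set_encrypt_key(key, 128, &k1) != 0 ||
	    AES_set_encrypt_key(key + 16, 128, &k2) != 0)
		return -1;
	ctx.key1   = &k1;
	ctx.key2   = &k2;
	ctx.block1 = (block128_f)AES_encrypt;	/* encrypts data blocks */
	ctx.block2 = (block128_f)AES_encrypt;	/* encrypts the tweak */
	return CRYPTO_xts128_encrypt(&ctx, iv, in, out, len, 1);
}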
diff --git a/main/openssl/crypto/o_init.c b/main/openssl/crypto/o_init.c
new file mode 100644
index 00000000..db4cdc44
--- /dev/null
+++ b/main/openssl/crypto/o_init.c
@@ -0,0 +1,82 @@
+/* o_init.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <e_os.h>
+#include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#include <openssl/rand.h>
+#endif
+
+/* Perform any essential OpenSSL initialization operations.
+ * Currently this only sets the FIPS callbacks.
+ */
+
+void OPENSSL_init(void)
+ {
+ static int done = 0;
+ if (done)
+ return;
+ done = 1;
+#ifdef OPENSSL_FIPS
+ FIPS_set_locking_callbacks(CRYPTO_lock, CRYPTO_add_lock);
+ FIPS_set_error_callbacks(ERR_put_error, ERR_add_error_vdata);
+ FIPS_set_malloc_callbacks(CRYPTO_malloc, CRYPTO_free);
+ RAND_init_fips();
+#endif
+#if 0
+ fprintf(stderr, "Called OPENSSL_init\n");
+#endif
+ }
+
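Since the function guards itself with the static done flag, a hypothetical caller sketch is simply:

#include <openssl/crypto.h>

void app_startup(void)
{
	OPENSSL_init();	/* performs the FIPS callback setup once */
	OPENSSL_init();	/* subsequent calls return immediately */
}

Note the flag is not protected by a lock, so as written the first call should happen before any threads are spawned.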
diff --git a/main/openssl/crypto/objects/o_names.c b/main/openssl/crypto/objects/o_names.c
index 84380a96..4a548c2e 100644
--- a/main/openssl/crypto/objects/o_names.c
+++ b/main/openssl/crypto/objects/o_names.c
@@ -73,7 +73,7 @@ int OBJ_NAME_new_index(unsigned long (*hash_func)(const char *),
name_funcs_stack=sk_NAME_FUNCS_new_null();
MemCheck_on();
}
- if ((name_funcs_stack == NULL))
+ if (name_funcs_stack == NULL)
{
/* ERROR */
return(0);
diff --git a/main/openssl/crypto/objects/obj_dat.h b/main/openssl/crypto/objects/obj_dat.h
index 6449be60..d404ad07 100644
--- a/main/openssl/crypto/objects/obj_dat.h
+++ b/main/openssl/crypto/objects/obj_dat.h
@@ -62,12 +62,12 @@
* [including the GNU Public Licence.]
*/
-#define NUM_NID 893
-#define NUM_SN 886
-#define NUM_LN 886
-#define NUM_OBJ 840
+#define NUM_NID 920
+#define NUM_SN 913
+#define NUM_LN 913
+#define NUM_OBJ 857
-static const unsigned char lvalues[5824]={
+static const unsigned char lvalues[5980]={
0x00, /* [ 0] OBJ_undef */
0x2A,0x86,0x48,0x86,0xF7,0x0D, /* [ 1] OBJ_rsadsi */
0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01, /* [ 7] OBJ_pkcs */
@@ -908,6 +908,23 @@ static const unsigned char lvalues[5824]={
0x55,0x04,0x34, /* [5814] OBJ_supportedAlgorithms */
0x55,0x04,0x35, /* [5817] OBJ_deltaRevocationList */
0x55,0x04,0x36, /* [5820] OBJ_dmdName */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x09,0x10,0x03,0x09,/* [5823] OBJ_id_alg_PWRI_KEK */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x06,/* [5834] OBJ_aes_128_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x07,/* [5843] OBJ_aes_128_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x08,/* [5852] OBJ_id_aes128_wrap_pad */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1A,/* [5861] OBJ_aes_192_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1B,/* [5870] OBJ_aes_192_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x1C,/* [5879] OBJ_id_aes192_wrap_pad */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x2E,/* [5888] OBJ_aes_256_gcm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x2F,/* [5897] OBJ_aes_256_ccm */
+0x60,0x86,0x48,0x01,0x65,0x03,0x04,0x01,0x30,/* [5906] OBJ_id_aes256_wrap_pad */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x02,/* [5915] OBJ_id_camellia128_wrap */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x03,/* [5926] OBJ_id_camellia192_wrap */
+0x2A,0x83,0x08,0x8C,0x9A,0x4B,0x3D,0x01,0x01,0x03,0x04,/* [5937] OBJ_id_camellia256_wrap */
+0x55,0x1D,0x25,0x00, /* [5948] OBJ_anyExtendedKeyUsage */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x08,/* [5952] OBJ_mgf1 */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x0A,/* [5961] OBJ_rsassaPss */
+0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01,0x01,0x07,/* [5970] OBJ_rsaesOaep */
};
static const ASN1_OBJECT nid_objs[NUM_NID]={
@@ -2351,28 +2368,74 @@ static const ASN1_OBJECT nid_objs[NUM_NID]={
{"deltaRevocationList","deltaRevocationList",NID_deltaRevocationList,
3,&(lvalues[5817]),0},
{"dmdName","dmdName",NID_dmdName,3,&(lvalues[5820]),0},
+{"id-alg-PWRI-KEK","id-alg-PWRI-KEK",NID_id_alg_PWRI_KEK,11,
+ &(lvalues[5823]),0},
+{"CMAC","cmac",NID_cmac,0,NULL,0},
+{"id-aes128-GCM","aes-128-gcm",NID_aes_128_gcm,9,&(lvalues[5834]),0},
+{"id-aes128-CCM","aes-128-ccm",NID_aes_128_ccm,9,&(lvalues[5843]),0},
+{"id-aes128-wrap-pad","id-aes128-wrap-pad",NID_id_aes128_wrap_pad,9,
+ &(lvalues[5852]),0},
+{"id-aes192-GCM","aes-192-gcm",NID_aes_192_gcm,9,&(lvalues[5861]),0},
+{"id-aes192-CCM","aes-192-ccm",NID_aes_192_ccm,9,&(lvalues[5870]),0},
+{"id-aes192-wrap-pad","id-aes192-wrap-pad",NID_id_aes192_wrap_pad,9,
+ &(lvalues[5879]),0},
+{"id-aes256-GCM","aes-256-gcm",NID_aes_256_gcm,9,&(lvalues[5888]),0},
+{"id-aes256-CCM","aes-256-ccm",NID_aes_256_ccm,9,&(lvalues[5897]),0},
+{"id-aes256-wrap-pad","id-aes256-wrap-pad",NID_id_aes256_wrap_pad,9,
+ &(lvalues[5906]),0},
+{"AES-128-CTR","aes-128-ctr",NID_aes_128_ctr,0,NULL,0},
+{"AES-192-CTR","aes-192-ctr",NID_aes_192_ctr,0,NULL,0},
+{"AES-256-CTR","aes-256-ctr",NID_aes_256_ctr,0,NULL,0},
+{"id-camellia128-wrap","id-camellia128-wrap",NID_id_camellia128_wrap,
+ 11,&(lvalues[5915]),0},
+{"id-camellia192-wrap","id-camellia192-wrap",NID_id_camellia192_wrap,
+ 11,&(lvalues[5926]),0},
+{"id-camellia256-wrap","id-camellia256-wrap",NID_id_camellia256_wrap,
+ 11,&(lvalues[5937]),0},
+{"anyExtendedKeyUsage","Any Extended Key Usage",
+ NID_anyExtendedKeyUsage,4,&(lvalues[5948]),0},
+{"MGF1","mgf1",NID_mgf1,9,&(lvalues[5952]),0},
+{"RSASSA-PSS","rsassaPss",NID_rsassaPss,9,&(lvalues[5961]),0},
+{"AES-128-XTS","aes-128-xts",NID_aes_128_xts,0,NULL,0},
+{"AES-256-XTS","aes-256-xts",NID_aes_256_xts,0,NULL,0},
+{"RC4-HMAC-MD5","rc4-hmac-md5",NID_rc4_hmac_md5,0,NULL,0},
+{"AES-128-CBC-HMAC-SHA1","aes-128-cbc-hmac-sha1",
+ NID_aes_128_cbc_hmac_sha1,0,NULL,0},
+{"AES-192-CBC-HMAC-SHA1","aes-192-cbc-hmac-sha1",
+ NID_aes_192_cbc_hmac_sha1,0,NULL,0},
+{"AES-256-CBC-HMAC-SHA1","aes-256-cbc-hmac-sha1",
+ NID_aes_256_cbc_hmac_sha1,0,NULL,0},
+{"RSAES-OAEP","rsaesOaep",NID_rsaesOaep,9,&(lvalues[5970]),0},
};
static const unsigned int sn_objs[NUM_SN]={
364, /* "AD_DVCS" */
419, /* "AES-128-CBC" */
+916, /* "AES-128-CBC-HMAC-SHA1" */
421, /* "AES-128-CFB" */
650, /* "AES-128-CFB1" */
653, /* "AES-128-CFB8" */
+904, /* "AES-128-CTR" */
418, /* "AES-128-ECB" */
420, /* "AES-128-OFB" */
+913, /* "AES-128-XTS" */
423, /* "AES-192-CBC" */
+917, /* "AES-192-CBC-HMAC-SHA1" */
425, /* "AES-192-CFB" */
651, /* "AES-192-CFB1" */
654, /* "AES-192-CFB8" */
+905, /* "AES-192-CTR" */
422, /* "AES-192-ECB" */
424, /* "AES-192-OFB" */
427, /* "AES-256-CBC" */
+918, /* "AES-256-CBC-HMAC-SHA1" */
429, /* "AES-256-CFB" */
652, /* "AES-256-CFB1" */
655, /* "AES-256-CFB8" */
+906, /* "AES-256-CTR" */
426, /* "AES-256-ECB" */
428, /* "AES-256-OFB" */
+914, /* "AES-256-XTS" */
91, /* "BF-CBC" */
93, /* "BF-CFB" */
92, /* "BF-ECB" */
@@ -2400,6 +2463,7 @@ static const unsigned int sn_objs[NUM_SN]={
110, /* "CAST5-CFB" */
109, /* "CAST5-ECB" */
111, /* "CAST5-OFB" */
+894, /* "CMAC" */
13, /* "CN" */
141, /* "CRLReason" */
417, /* "CSPName" */
@@ -2451,6 +2515,7 @@ static const unsigned int sn_objs[NUM_SN]={
4, /* "MD5" */
114, /* "MD5-SHA1" */
95, /* "MDC2" */
+911, /* "MGF1" */
388, /* "Mail" */
393, /* "NULL" */
404, /* "NULL" */
@@ -2487,6 +2552,7 @@ static const unsigned int sn_objs[NUM_SN]={
40, /* "RC2-OFB" */
5, /* "RC4" */
97, /* "RC4-40" */
+915, /* "RC4-HMAC-MD5" */
120, /* "RC5-CBC" */
122, /* "RC5-CFB" */
121, /* "RC5-ECB" */
@@ -2507,6 +2573,8 @@ static const unsigned int sn_objs[NUM_SN]={
668, /* "RSA-SHA256" */
669, /* "RSA-SHA384" */
670, /* "RSA-SHA512" */
+919, /* "RSAES-OAEP" */
+912, /* "RSASSA-PSS" */
777, /* "SEED-CBC" */
779, /* "SEED-CFB" */
776, /* "SEED-ECB" */
@@ -2540,6 +2608,7 @@ static const unsigned int sn_objs[NUM_SN]={
363, /* "ad_timestamping" */
376, /* "algorithm" */
405, /* "ansi-X9-62" */
+910, /* "anyExtendedKeyUsage" */
746, /* "anyPolicy" */
370, /* "archiveCutoff" */
484, /* "associatedDomain" */
@@ -2716,14 +2785,27 @@ static const unsigned int sn_objs[NUM_SN]={
357, /* "id-aca-group" */
358, /* "id-aca-role" */
176, /* "id-ad" */
+896, /* "id-aes128-CCM" */
+895, /* "id-aes128-GCM" */
788, /* "id-aes128-wrap" */
+897, /* "id-aes128-wrap-pad" */
+899, /* "id-aes192-CCM" */
+898, /* "id-aes192-GCM" */
789, /* "id-aes192-wrap" */
+900, /* "id-aes192-wrap-pad" */
+902, /* "id-aes256-CCM" */
+901, /* "id-aes256-GCM" */
790, /* "id-aes256-wrap" */
+903, /* "id-aes256-wrap-pad" */
262, /* "id-alg" */
+893, /* "id-alg-PWRI-KEK" */
323, /* "id-alg-des40" */
326, /* "id-alg-dh-pop" */
325, /* "id-alg-dh-sig-hmac-sha1" */
324, /* "id-alg-noSignature" */
+907, /* "id-camellia128-wrap" */
+908, /* "id-camellia192-wrap" */
+909, /* "id-camellia256-wrap" */
268, /* "id-cct" */
361, /* "id-cct-PKIData" */
362, /* "id-cct-PKIResponse" */
@@ -3246,6 +3328,7 @@ static const unsigned int ln_objs[NUM_LN]={
363, /* "AD Time Stamping" */
405, /* "ANSI X9.62" */
368, /* "Acceptable OCSP Responses" */
+910, /* "Any Extended Key Usage" */
664, /* "Any language" */
177, /* "Authority Information Access" */
365, /* "Basic OCSP Response" */
@@ -3386,23 +3469,37 @@ static const unsigned int ln_objs[NUM_LN]={
364, /* "ad dvcs" */
606, /* "additional verification" */
419, /* "aes-128-cbc" */
+916, /* "aes-128-cbc-hmac-sha1" */
+896, /* "aes-128-ccm" */
421, /* "aes-128-cfb" */
650, /* "aes-128-cfb1" */
653, /* "aes-128-cfb8" */
+904, /* "aes-128-ctr" */
418, /* "aes-128-ecb" */
+895, /* "aes-128-gcm" */
420, /* "aes-128-ofb" */
+913, /* "aes-128-xts" */
423, /* "aes-192-cbc" */
+917, /* "aes-192-cbc-hmac-sha1" */
+899, /* "aes-192-ccm" */
425, /* "aes-192-cfb" */
651, /* "aes-192-cfb1" */
654, /* "aes-192-cfb8" */
+905, /* "aes-192-ctr" */
422, /* "aes-192-ecb" */
+898, /* "aes-192-gcm" */
424, /* "aes-192-ofb" */
427, /* "aes-256-cbc" */
+918, /* "aes-256-cbc-hmac-sha1" */
+902, /* "aes-256-ccm" */
429, /* "aes-256-cfb" */
652, /* "aes-256-cfb1" */
655, /* "aes-256-cfb8" */
+906, /* "aes-256-ctr" */
426, /* "aes-256-ecb" */
+901, /* "aes-256-gcm" */
428, /* "aes-256-ofb" */
+914, /* "aes-256-xts" */
376, /* "algorithm" */
484, /* "associatedDomain" */
485, /* "associatedName" */
@@ -3467,6 +3564,7 @@ static const unsigned int ln_objs[NUM_LN]={
407, /* "characteristic-two-field" */
395, /* "clearance" */
633, /* "cleartext track 2" */
+894, /* "cmac" */
13, /* "commonName" */
513, /* "content types" */
50, /* "contentType" */
@@ -3602,13 +3700,20 @@ static const unsigned int ln_objs[NUM_LN]={
358, /* "id-aca-role" */
176, /* "id-ad" */
788, /* "id-aes128-wrap" */
+897, /* "id-aes128-wrap-pad" */
789, /* "id-aes192-wrap" */
+900, /* "id-aes192-wrap-pad" */
790, /* "id-aes256-wrap" */
+903, /* "id-aes256-wrap-pad" */
262, /* "id-alg" */
+893, /* "id-alg-PWRI-KEK" */
323, /* "id-alg-des40" */
326, /* "id-alg-dh-pop" */
325, /* "id-alg-dh-sig-hmac-sha1" */
324, /* "id-alg-noSignature" */
+907, /* "id-camellia128-wrap" */
+908, /* "id-camellia192-wrap" */
+909, /* "id-camellia256-wrap" */
268, /* "id-cct" */
361, /* "id-cct-PKIData" */
362, /* "id-cct-PKIResponse" */
@@ -3806,6 +3911,7 @@ static const unsigned int ln_objs[NUM_LN]={
602, /* "merchant initiated auth" */
514, /* "message extensions" */
51, /* "messageDigest" */
+911, /* "mgf1" */
506, /* "mime-mhs-bodies" */
505, /* "mime-mhs-headings" */
488, /* "mobileTelephoneNumber" */
@@ -3889,6 +3995,7 @@ static const unsigned int ln_objs[NUM_LN]={
40, /* "rc2-ofb" */
5, /* "rc4" */
97, /* "rc4-40" */
+915, /* "rc4-hmac-md5" */
120, /* "rc5-cbc" */
122, /* "rc5-cfb" */
121, /* "rc5-ecb" */
@@ -3905,6 +4012,8 @@ static const unsigned int ln_objs[NUM_LN]={
6, /* "rsaEncryption" */
644, /* "rsaOAEPEncryptionSET" */
377, /* "rsaSignature" */
+919, /* "rsaesOaep" */
+912, /* "rsassaPss" */
124, /* "run length compression" */
482, /* "sOARecord" */
155, /* "safeContentsBag" */
@@ -4254,6 +4363,7 @@ static const unsigned int obj_objs[NUM_OBJ]={
96, /* OBJ_mdc2WithRSA 2 5 8 3 100 */
95, /* OBJ_mdc2 2 5 8 3 101 */
746, /* OBJ_any_policy 2 5 29 32 0 */
+910, /* OBJ_anyExtendedKeyUsage 2 5 29 37 0 */
519, /* OBJ_setct_PANData 2 23 42 0 0 */
520, /* OBJ_setct_PANToken 2 23 42 0 1 */
521, /* OBJ_setct_PANOnly 2 23 42 0 2 */
@@ -4720,6 +4830,9 @@ static const unsigned int obj_objs[NUM_OBJ]={
8, /* OBJ_md5WithRSAEncryption 1 2 840 113549 1 1 4 */
65, /* OBJ_sha1WithRSAEncryption 1 2 840 113549 1 1 5 */
644, /* OBJ_rsaOAEPEncryptionSET 1 2 840 113549 1 1 6 */
+919, /* OBJ_rsaesOaep 1 2 840 113549 1 1 7 */
+911, /* OBJ_mgf1 1 2 840 113549 1 1 8 */
+912, /* OBJ_rsassaPss 1 2 840 113549 1 1 10 */
668, /* OBJ_sha256WithRSAEncryption 1 2 840 113549 1 1 11 */
669, /* OBJ_sha384WithRSAEncryption 1 2 840 113549 1 1 12 */
670, /* OBJ_sha512WithRSAEncryption 1 2 840 113549 1 1 13 */
@@ -4785,16 +4898,25 @@ static const unsigned int obj_objs[NUM_OBJ]={
420, /* OBJ_aes_128_ofb128 2 16 840 1 101 3 4 1 3 */
421, /* OBJ_aes_128_cfb128 2 16 840 1 101 3 4 1 4 */
788, /* OBJ_id_aes128_wrap 2 16 840 1 101 3 4 1 5 */
+895, /* OBJ_aes_128_gcm 2 16 840 1 101 3 4 1 6 */
+896, /* OBJ_aes_128_ccm 2 16 840 1 101 3 4 1 7 */
+897, /* OBJ_id_aes128_wrap_pad 2 16 840 1 101 3 4 1 8 */
422, /* OBJ_aes_192_ecb 2 16 840 1 101 3 4 1 21 */
423, /* OBJ_aes_192_cbc 2 16 840 1 101 3 4 1 22 */
424, /* OBJ_aes_192_ofb128 2 16 840 1 101 3 4 1 23 */
425, /* OBJ_aes_192_cfb128 2 16 840 1 101 3 4 1 24 */
789, /* OBJ_id_aes192_wrap 2 16 840 1 101 3 4 1 25 */
+898, /* OBJ_aes_192_gcm 2 16 840 1 101 3 4 1 26 */
+899, /* OBJ_aes_192_ccm 2 16 840 1 101 3 4 1 27 */
+900, /* OBJ_id_aes192_wrap_pad 2 16 840 1 101 3 4 1 28 */
426, /* OBJ_aes_256_ecb 2 16 840 1 101 3 4 1 41 */
427, /* OBJ_aes_256_cbc 2 16 840 1 101 3 4 1 42 */
428, /* OBJ_aes_256_ofb128 2 16 840 1 101 3 4 1 43 */
429, /* OBJ_aes_256_cfb128 2 16 840 1 101 3 4 1 44 */
790, /* OBJ_id_aes256_wrap 2 16 840 1 101 3 4 1 45 */
+901, /* OBJ_aes_256_gcm 2 16 840 1 101 3 4 1 46 */
+902, /* OBJ_aes_256_ccm 2 16 840 1 101 3 4 1 47 */
+903, /* OBJ_id_aes256_wrap_pad 2 16 840 1 101 3 4 1 48 */
672, /* OBJ_sha256 2 16 840 1 101 3 4 2 1 */
673, /* OBJ_sha384 2 16 840 1 101 3 4 2 2 */
674, /* OBJ_sha512 2 16 840 1 101 3 4 2 3 */
@@ -4901,6 +5023,9 @@ static const unsigned int obj_objs[NUM_OBJ]={
751, /* OBJ_camellia_128_cbc 1 2 392 200011 61 1 1 1 2 */
752, /* OBJ_camellia_192_cbc 1 2 392 200011 61 1 1 1 3 */
753, /* OBJ_camellia_256_cbc 1 2 392 200011 61 1 1 1 4 */
+907, /* OBJ_id_camellia128_wrap 1 2 392 200011 61 1 1 3 2 */
+908, /* OBJ_id_camellia192_wrap 1 2 392 200011 61 1 1 3 3 */
+909, /* OBJ_id_camellia256_wrap 1 2 392 200011 61 1 1 3 4 */
196, /* OBJ_id_smime_mod_cms 1 2 840 113549 1 9 16 0 1 */
197, /* OBJ_id_smime_mod_ess 1 2 840 113549 1 9 16 0 2 */
198, /* OBJ_id_smime_mod_oid 1 2 840 113549 1 9 16 0 3 */
@@ -4956,6 +5081,7 @@ static const unsigned int obj_objs[NUM_OBJ]={
246, /* OBJ_id_smime_alg_CMS3DESwrap 1 2 840 113549 1 9 16 3 6 */
247, /* OBJ_id_smime_alg_CMSRC2wrap 1 2 840 113549 1 9 16 3 7 */
125, /* OBJ_zlib_compression 1 2 840 113549 1 9 16 3 8 */
+893, /* OBJ_id_alg_PWRI_KEK 1 2 840 113549 1 9 16 3 9 */
248, /* OBJ_id_smime_cd_ldap 1 2 840 113549 1 9 16 4 1 */
249, /* OBJ_id_smime_spq_ets_sqt_uri 1 2 840 113549 1 9 16 5 1 */
250, /* OBJ_id_smime_spq_ets_sqt_unotice 1 2 840 113549 1 9 16 5 2 */
diff --git a/main/openssl/crypto/objects/obj_mac.h b/main/openssl/crypto/objects/obj_mac.h
index 282f11a8..b5ea7cda 100644
--- a/main/openssl/crypto/objects/obj_mac.h
+++ b/main/openssl/crypto/objects/obj_mac.h
@@ -580,6 +580,21 @@
#define NID_sha1WithRSAEncryption 65
#define OBJ_sha1WithRSAEncryption OBJ_pkcs1,5L
+#define SN_rsaesOaep "RSAES-OAEP"
+#define LN_rsaesOaep "rsaesOaep"
+#define NID_rsaesOaep 919
+#define OBJ_rsaesOaep OBJ_pkcs1,7L
+
+#define SN_mgf1 "MGF1"
+#define LN_mgf1 "mgf1"
+#define NID_mgf1 911
+#define OBJ_mgf1 OBJ_pkcs1,8L
+
+#define SN_rsassaPss "RSASSA-PSS"
+#define LN_rsassaPss "rsassaPss"
+#define NID_rsassaPss 912
+#define OBJ_rsassaPss OBJ_pkcs1,10L
+
#define SN_sha256WithRSAEncryption "RSA-SHA256"
#define LN_sha256WithRSAEncryption "sha256WithRSAEncryption"
#define NID_sha256WithRSAEncryption 668
@@ -981,6 +996,10 @@
#define NID_id_smime_alg_CMSRC2wrap 247
#define OBJ_id_smime_alg_CMSRC2wrap OBJ_id_smime_alg,7L
+#define SN_id_alg_PWRI_KEK "id-alg-PWRI-KEK"
+#define NID_id_alg_PWRI_KEK 893
+#define OBJ_id_alg_PWRI_KEK OBJ_id_smime_alg,9L
+
#define SN_id_smime_cd_ldap "id-smime-cd-ldap"
#define NID_id_smime_cd_ldap 248
#define OBJ_id_smime_cd_ldap OBJ_id_smime_cd,1L
@@ -2399,6 +2418,11 @@
#define NID_no_rev_avail 403
#define OBJ_no_rev_avail OBJ_id_ce,56L
+#define SN_anyExtendedKeyUsage "anyExtendedKeyUsage"
+#define LN_anyExtendedKeyUsage "Any Extended Key Usage"
+#define NID_anyExtendedKeyUsage 910
+#define OBJ_anyExtendedKeyUsage OBJ_ext_key_usage,0L
+
#define SN_netscape "Netscape"
#define LN_netscape "Netscape Communications Corp."
#define NID_netscape 57
@@ -2586,6 +2610,24 @@
#define NID_aes_128_cfb128 421
#define OBJ_aes_128_cfb128 OBJ_aes,4L
+#define SN_id_aes128_wrap "id-aes128-wrap"
+#define NID_id_aes128_wrap 788
+#define OBJ_id_aes128_wrap OBJ_aes,5L
+
+#define SN_aes_128_gcm "id-aes128-GCM"
+#define LN_aes_128_gcm "aes-128-gcm"
+#define NID_aes_128_gcm 895
+#define OBJ_aes_128_gcm OBJ_aes,6L
+
+#define SN_aes_128_ccm "id-aes128-CCM"
+#define LN_aes_128_ccm "aes-128-ccm"
+#define NID_aes_128_ccm 896
+#define OBJ_aes_128_ccm OBJ_aes,7L
+
+#define SN_id_aes128_wrap_pad "id-aes128-wrap-pad"
+#define NID_id_aes128_wrap_pad 897
+#define OBJ_id_aes128_wrap_pad OBJ_aes,8L
+
#define SN_aes_192_ecb "AES-192-ECB"
#define LN_aes_192_ecb "aes-192-ecb"
#define NID_aes_192_ecb 422
@@ -2606,6 +2648,24 @@
#define NID_aes_192_cfb128 425
#define OBJ_aes_192_cfb128 OBJ_aes,24L
+#define SN_id_aes192_wrap "id-aes192-wrap"
+#define NID_id_aes192_wrap 789
+#define OBJ_id_aes192_wrap OBJ_aes,25L
+
+#define SN_aes_192_gcm "id-aes192-GCM"
+#define LN_aes_192_gcm "aes-192-gcm"
+#define NID_aes_192_gcm 898
+#define OBJ_aes_192_gcm OBJ_aes,26L
+
+#define SN_aes_192_ccm "id-aes192-CCM"
+#define LN_aes_192_ccm "aes-192-ccm"
+#define NID_aes_192_ccm 899
+#define OBJ_aes_192_ccm OBJ_aes,27L
+
+#define SN_id_aes192_wrap_pad "id-aes192-wrap-pad"
+#define NID_id_aes192_wrap_pad 900
+#define OBJ_id_aes192_wrap_pad OBJ_aes,28L
+
#define SN_aes_256_ecb "AES-256-ECB"
#define LN_aes_256_ecb "aes-256-ecb"
#define NID_aes_256_ecb 426
@@ -2626,6 +2686,24 @@
#define NID_aes_256_cfb128 429
#define OBJ_aes_256_cfb128 OBJ_aes,44L
+#define SN_id_aes256_wrap "id-aes256-wrap"
+#define NID_id_aes256_wrap 790
+#define OBJ_id_aes256_wrap OBJ_aes,45L
+
+#define SN_aes_256_gcm "id-aes256-GCM"
+#define LN_aes_256_gcm "aes-256-gcm"
+#define NID_aes_256_gcm 901
+#define OBJ_aes_256_gcm OBJ_aes,46L
+
+#define SN_aes_256_ccm "id-aes256-CCM"
+#define LN_aes_256_ccm "aes-256-ccm"
+#define NID_aes_256_ccm 902
+#define OBJ_aes_256_ccm OBJ_aes,47L
+
+#define SN_id_aes256_wrap_pad "id-aes256-wrap-pad"
+#define NID_id_aes256_wrap_pad 903
+#define OBJ_id_aes256_wrap_pad OBJ_aes,48L
+
#define SN_aes_128_cfb1 "AES-128-CFB1"
#define LN_aes_128_cfb1 "aes-128-cfb1"
#define NID_aes_128_cfb1 650
@@ -2650,6 +2728,26 @@
#define LN_aes_256_cfb8 "aes-256-cfb8"
#define NID_aes_256_cfb8 655
+#define SN_aes_128_ctr "AES-128-CTR"
+#define LN_aes_128_ctr "aes-128-ctr"
+#define NID_aes_128_ctr 904
+
+#define SN_aes_192_ctr "AES-192-CTR"
+#define LN_aes_192_ctr "aes-192-ctr"
+#define NID_aes_192_ctr 905
+
+#define SN_aes_256_ctr "AES-256-CTR"
+#define LN_aes_256_ctr "aes-256-ctr"
+#define NID_aes_256_ctr 906
+
+#define SN_aes_128_xts "AES-128-XTS"
+#define LN_aes_128_xts "aes-128-xts"
+#define NID_aes_128_xts 913
+
+#define SN_aes_256_xts "AES-256-XTS"
+#define LN_aes_256_xts "aes-256-xts"
+#define NID_aes_256_xts 914
+
#define SN_des_cfb1 "DES-CFB1"
#define LN_des_cfb1 "des-cfb1"
#define NID_des_cfb1 656
@@ -2666,18 +2764,6 @@
#define LN_des_ede3_cfb8 "des-ede3-cfb8"
#define NID_des_ede3_cfb8 659
-#define SN_id_aes128_wrap "id-aes128-wrap"
-#define NID_id_aes128_wrap 788
-#define OBJ_id_aes128_wrap OBJ_aes,5L
-
-#define SN_id_aes192_wrap "id-aes192-wrap"
-#define NID_id_aes192_wrap 789
-#define OBJ_id_aes192_wrap OBJ_aes,25L
-
-#define SN_id_aes256_wrap "id-aes256-wrap"
-#define NID_id_aes256_wrap 790
-#define OBJ_id_aes256_wrap OBJ_aes,45L
-
#define OBJ_nist_hashalgs OBJ_nistAlgorithms,2L
#define SN_sha256 "SHA256"
@@ -3810,6 +3896,18 @@
#define NID_camellia_256_cbc 753
#define OBJ_camellia_256_cbc 1L,2L,392L,200011L,61L,1L,1L,1L,4L
+#define SN_id_camellia128_wrap "id-camellia128-wrap"
+#define NID_id_camellia128_wrap 907
+#define OBJ_id_camellia128_wrap 1L,2L,392L,200011L,61L,1L,1L,3L,2L
+
+#define SN_id_camellia192_wrap "id-camellia192-wrap"
+#define NID_id_camellia192_wrap 908
+#define OBJ_id_camellia192_wrap 1L,2L,392L,200011L,61L,1L,1L,3L,3L
+
+#define SN_id_camellia256_wrap "id-camellia256-wrap"
+#define NID_id_camellia256_wrap 909
+#define OBJ_id_camellia256_wrap 1L,2L,392L,200011L,61L,1L,1L,3L,4L
+
#define OBJ_ntt_ds 0L,3L,4401L,5L
#define OBJ_camellia OBJ_ntt_ds,3L,1L,9L
@@ -3912,3 +4010,23 @@
#define LN_hmac "hmac"
#define NID_hmac 855
+#define SN_cmac "CMAC"
+#define LN_cmac "cmac"
+#define NID_cmac 894
+
+#define SN_rc4_hmac_md5 "RC4-HMAC-MD5"
+#define LN_rc4_hmac_md5 "rc4-hmac-md5"
+#define NID_rc4_hmac_md5 915
+
+#define SN_aes_128_cbc_hmac_sha1 "AES-128-CBC-HMAC-SHA1"
+#define LN_aes_128_cbc_hmac_sha1 "aes-128-cbc-hmac-sha1"
+#define NID_aes_128_cbc_hmac_sha1 916
+
+#define SN_aes_192_cbc_hmac_sha1 "AES-192-CBC-HMAC-SHA1"
+#define LN_aes_192_cbc_hmac_sha1 "aes-192-cbc-hmac-sha1"
+#define NID_aes_192_cbc_hmac_sha1 917
+
+#define SN_aes_256_cbc_hmac_sha1 "AES-256-CBC-HMAC-SHA1"
+#define LN_aes_256_cbc_hmac_sha1 "aes-256-cbc-hmac-sha1"
+#define NID_aes_256_cbc_hmac_sha1 918
+
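obj_mac.h, obj_mac.num and obj_dat.h are generated together from objects.txt, which is why the same new NIDs (893-919) recur across these hunks. A minimal, hypothetical lookup sketch using the standard OBJ accessors:

#include <stdio.h>
#include <openssl/objects.h>

int main(void)
{
	/* short name / long name for a newly added NID */
	printf("%s / %s\n", OBJ_nid2sn(NID_aes_128_gcm),
	       OBJ_nid2ln(NID_aes_128_gcm));	/* id-aes128-GCM / aes-128-gcm */
	/* dotted OID text back to the NID (895) */
	printf("%d\n", OBJ_txt2nid("2.16.840.1.101.3.4.1.6"));
	return 0;
}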
diff --git a/main/openssl/crypto/objects/obj_mac.num b/main/openssl/crypto/objects/obj_mac.num
index 8c50aac2..1d0a7c80 100644
--- a/main/openssl/crypto/objects/obj_mac.num
+++ b/main/openssl/crypto/objects/obj_mac.num
@@ -890,3 +890,30 @@ houseIdentifier 889
supportedAlgorithms 890
deltaRevocationList 891
dmdName 892
+id_alg_PWRI_KEK 893
+cmac 894
+aes_128_gcm 895
+aes_128_ccm 896
+id_aes128_wrap_pad 897
+aes_192_gcm 898
+aes_192_ccm 899
+id_aes192_wrap_pad 900
+aes_256_gcm 901
+aes_256_ccm 902
+id_aes256_wrap_pad 903
+aes_128_ctr 904
+aes_192_ctr 905
+aes_256_ctr 906
+id_camellia128_wrap 907
+id_camellia192_wrap 908
+id_camellia256_wrap 909
+anyExtendedKeyUsage 910
+mgf1 911
+rsassaPss 912
+aes_128_xts 913
+aes_256_xts 914
+rc4_hmac_md5 915
+aes_128_cbc_hmac_sha1 916
+aes_192_cbc_hmac_sha1 917
+aes_256_cbc_hmac_sha1 918
+rsaesOaep 919
diff --git a/main/openssl/crypto/objects/obj_xref.c b/main/openssl/crypto/objects/obj_xref.c
index 152eca5c..9f744bce 100644
--- a/main/openssl/crypto/objects/obj_xref.c
+++ b/main/openssl/crypto/objects/obj_xref.c
@@ -110,8 +110,10 @@ int OBJ_find_sigid_algs(int signid, int *pdig_nid, int *ppkey_nid)
#endif
if (rv == NULL)
return 0;
- *pdig_nid = rv->hash_id;
- *ppkey_nid = rv->pkey_id;
+ if (pdig_nid)
+ *pdig_nid = rv->hash_id;
+ if (ppkey_nid)
+ *ppkey_nid = rv->pkey_id;
return 1;
}
@@ -144,7 +146,8 @@ int OBJ_find_sigid_by_algs(int *psignid, int dig_nid, int pkey_nid)
#endif
if (rv == NULL)
return 0;
- *psignid = (*rv)->sign_id;
+ if (psignid)
+ *psignid = (*rv)->sign_id;
return 1;
}
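With the NULL checks above, either output pointer may now be omitted; a hypothetical caller that only needs the digest side:

#include <openssl/objects.h>

static int digest_nid_of_sig(int sig_nid)
{
	int dig = NID_undef;

	if (!OBJ_find_sigid_algs(sig_nid, &dig, NULL))	/* pkey NID unwanted */
		return NID_undef;
	return dig;	/* NID_undef for RSASSA-PSS, per the new xref entry */
}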
diff --git a/main/openssl/crypto/objects/obj_xref.h b/main/openssl/crypto/objects/obj_xref.h
index d5b9b8e1..e23938c2 100644
--- a/main/openssl/crypto/objects/obj_xref.h
+++ b/main/openssl/crypto/objects/obj_xref.h
@@ -38,10 +38,12 @@ static const nid_triple sigoid_srt[] =
{NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94},
{NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc},
{NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc},
+ {NID_rsassaPss, NID_undef, NID_rsaEncryption},
};
static const nid_triple * const sigoid_srt_xref[] =
{
+ &sigoid_srt[29],
&sigoid_srt[17],
&sigoid_srt[18],
&sigoid_srt[0],
diff --git a/main/openssl/crypto/objects/obj_xref.txt b/main/openssl/crypto/objects/obj_xref.txt
index e45b3d34..cb917182 100644
--- a/main/openssl/crypto/objects/obj_xref.txt
+++ b/main/openssl/crypto/objects/obj_xref.txt
@@ -13,6 +13,10 @@ sha512WithRSAEncryption sha512 rsaEncryption
sha224WithRSAEncryption sha224 rsaEncryption
mdc2WithRSA mdc2 rsaEncryption
ripemd160WithRSA ripemd160 rsaEncryption
+# For PSS the digest algorithm can vary and depends on the included
+# AlgorithmIdentifier. The digest "undef" indicates the public key
+# method should handle this explicitly.
+rsassaPss undef rsaEncryption
# Alternative deprecated OIDs. By using the older "rsa" OID this
# type will be recognized but not normally used.
diff --git a/main/openssl/crypto/objects/objects.txt b/main/openssl/crypto/objects/objects.txt
index e61fe60c..d3bfad72 100644
--- a/main/openssl/crypto/objects/objects.txt
+++ b/main/openssl/crypto/objects/objects.txt
@@ -166,6 +166,10 @@ pkcs1 3 : RSA-MD4 : md4WithRSAEncryption
pkcs1 4 : RSA-MD5 : md5WithRSAEncryption
pkcs1 5 : RSA-SHA1 : sha1WithRSAEncryption
# According to PKCS #1 version 2.1
+pkcs1 7 : RSAES-OAEP : rsaesOaep
+pkcs1 8 : MGF1 : mgf1
+pkcs1 10 : RSASSA-PSS : rsassaPss
+
pkcs1 11 : RSA-SHA256 : sha256WithRSAEncryption
pkcs1 12 : RSA-SHA384 : sha384WithRSAEncryption
pkcs1 13 : RSA-SHA512 : sha512WithRSAEncryption
@@ -299,6 +303,7 @@ id-smime-alg 4 : id-smime-alg-RC2wrap
id-smime-alg 5 : id-smime-alg-ESDH
id-smime-alg 6 : id-smime-alg-CMS3DESwrap
id-smime-alg 7 : id-smime-alg-CMSRC2wrap
+id-smime-alg 9 : id-alg-PWRI-KEK
# S/MIME Certificate Distribution
id-smime-cd 1 : id-smime-cd-ldap
@@ -770,6 +775,10 @@ id-ce 55 : targetInformation : X509v3 AC Targeting
!Cname no-rev-avail
id-ce 56 : noRevAvail : X509v3 No Revocation Available
+# From RFC5280
+ext-key-usage 0 : anyExtendedKeyUsage : Any Extended Key Usage
+
+
!Cname netscape
2 16 840 1 113730 : Netscape : Netscape Communications Corp.
!Cname netscape-cert-extension
@@ -846,6 +855,10 @@ aes 2 : AES-128-CBC : aes-128-cbc
aes 3 : AES-128-OFB : aes-128-ofb
!Cname aes-128-cfb128
aes 4 : AES-128-CFB : aes-128-cfb
+aes 5 : id-aes128-wrap
+aes 6 : id-aes128-GCM : aes-128-gcm
+aes 7 : id-aes128-CCM : aes-128-ccm
+aes 8 : id-aes128-wrap-pad
aes 21 : AES-192-ECB : aes-192-ecb
aes 22 : AES-192-CBC : aes-192-cbc
@@ -853,6 +866,10 @@ aes 22 : AES-192-CBC : aes-192-cbc
aes 23 : AES-192-OFB : aes-192-ofb
!Cname aes-192-cfb128
aes 24 : AES-192-CFB : aes-192-cfb
+aes 25 : id-aes192-wrap
+aes 26 : id-aes192-GCM : aes-192-gcm
+aes 27 : id-aes192-CCM : aes-192-ccm
+aes 28 : id-aes192-wrap-pad
aes 41 : AES-256-ECB : aes-256-ecb
aes 42 : AES-256-CBC : aes-256-cbc
@@ -860,6 +877,10 @@ aes 42 : AES-256-CBC : aes-256-cbc
aes 43 : AES-256-OFB : aes-256-ofb
!Cname aes-256-cfb128
aes 44 : AES-256-CFB : aes-256-cfb
+aes 45 : id-aes256-wrap
+aes 46 : id-aes256-GCM : aes-256-gcm
+aes 47 : id-aes256-CCM : aes-256-ccm
+aes 48 : id-aes256-wrap-pad
# There are no OIDs for these modes...
@@ -869,15 +890,16 @@ aes 44 : AES-256-CFB : aes-256-cfb
: AES-128-CFB8 : aes-128-cfb8
: AES-192-CFB8 : aes-192-cfb8
: AES-256-CFB8 : aes-256-cfb8
+ : AES-128-CTR : aes-128-ctr
+ : AES-192-CTR : aes-192-ctr
+ : AES-256-CTR : aes-256-ctr
+ : AES-128-XTS : aes-128-xts
+ : AES-256-XTS : aes-256-xts
: DES-CFB1 : des-cfb1
: DES-CFB8 : des-cfb8
: DES-EDE3-CFB1 : des-ede3-cfb1
: DES-EDE3-CFB8 : des-ede3-cfb8
-aes 5 : id-aes128-wrap
-aes 25 : id-aes192-wrap
-aes 45 : id-aes256-wrap
-
# OIDs for SHA224, SHA256, SHA385 and SHA512, according to x9.84.
!Alias nist_hashalgs nistAlgorithms 2
nist_hashalgs 1 : SHA256 : sha256
@@ -1211,6 +1233,9 @@ cryptocom 1 8 1 : id-GostR3410-2001-ParamSet-cc : GOST R 3410-2001 Parameter Se
1 2 392 200011 61 1 1 1 2 : CAMELLIA-128-CBC : camellia-128-cbc
1 2 392 200011 61 1 1 1 3 : CAMELLIA-192-CBC : camellia-192-cbc
1 2 392 200011 61 1 1 1 4 : CAMELLIA-256-CBC : camellia-256-cbc
+1 2 392 200011 61 1 1 3 2 : id-camellia128-wrap
+1 2 392 200011 61 1 1 3 3 : id-camellia192-wrap
+1 2 392 200011 61 1 1 3 4 : id-camellia256-wrap
# Definitions for Camellia cipher - ECB, CFB, OFB MODE
@@ -1257,3 +1282,11 @@ kisa 1 6 : SEED-OFB : seed-ofb
# There is no OID that just denotes "HMAC" oddly enough...
: HMAC : hmac
+# Nor CMAC either
+ : CMAC : cmac
+
+# Synthetic composite ciphersuites
+ : RC4-HMAC-MD5 : rc4-hmac-md5
+ : AES-128-CBC-HMAC-SHA1 : aes-128-cbc-hmac-sha1
+ : AES-192-CBC-HMAC-SHA1 : aes-192-cbc-hmac-sha1
+ : AES-256-CBC-HMAC-SHA1 : aes-256-cbc-hmac-sha1
diff --git a/main/openssl/crypto/ocsp/ocsp_lib.c b/main/openssl/crypto/ocsp/ocsp_lib.c
index e92b86c0..a94dc838 100644
--- a/main/openssl/crypto/ocsp/ocsp_lib.c
+++ b/main/openssl/crypto/ocsp/ocsp_lib.c
@@ -124,7 +124,8 @@ OCSP_CERTID *OCSP_cert_id_new(const EVP_MD *dgst,
if (!(ASN1_OCTET_STRING_set(cid->issuerNameHash, md, i))) goto err;
/* Calculate the issuerKey hash, excluding tag and length */
- EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL);
+ if (!EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL))
+ goto err;
if (!(ASN1_OCTET_STRING_set(cid->issuerKeyHash, md, i))) goto err;
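The pattern being adopted above — testing EVP_Digest()'s return value instead of ignoring it — in a hypothetical standalone form:

#include <openssl/evp.h>

static int hash_buf(const unsigned char *buf, size_t len,
                    unsigned char md[EVP_MAX_MD_SIZE], unsigned int *mdlen)
{
	/* EVP_Digest() can fail (e.g. engine or digest errors), so the
	 * result must not be used unless it returns 1 */
	return EVP_Digest(buf, len, md, mdlen, EVP_sha1(), NULL);
}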
diff --git a/main/openssl/crypto/ocsp/ocsp_vfy.c b/main/openssl/crypto/ocsp/ocsp_vfy.c
index 415d67e6..27671830 100644
--- a/main/openssl/crypto/ocsp/ocsp_vfy.c
+++ b/main/openssl/crypto/ocsp/ocsp_vfy.c
@@ -91,9 +91,12 @@ int OCSP_basic_verify(OCSP_BASICRESP *bs, STACK_OF(X509) *certs,
{
EVP_PKEY *skey;
skey = X509_get_pubkey(signer);
- ret = OCSP_BASICRESP_verify(bs, skey, 0);
- EVP_PKEY_free(skey);
- if(ret <= 0)
+ if (skey)
+ {
+ ret = OCSP_BASICRESP_verify(bs, skey, 0);
+ EVP_PKEY_free(skey);
+ }
+ if(!skey || ret <= 0)
{
OCSPerr(OCSP_F_OCSP_BASIC_VERIFY, OCSP_R_SIGNATURE_FAILURE);
goto end;
@@ -108,6 +111,7 @@ int OCSP_basic_verify(OCSP_BASICRESP *bs, STACK_OF(X509) *certs,
init_res = X509_STORE_CTX_init(&ctx, st, signer, bs->certs);
if(!init_res)
{
+ ret = -1;
OCSPerr(OCSP_F_OCSP_BASIC_VERIFY,ERR_R_X509_LIB);
goto end;
}
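Because the init-failure path now sets ret = -1 (and a NULL public key is caught before use), callers should continue to treat anything <= 0 as failure; a hypothetical call site, with bs, certs and store set up elsewhere:

	if (OCSP_basic_verify(bs, certs, store, 0) <= 0) {
		/* signature invalid, or verification could not be performed */
	}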
diff --git a/main/openssl/crypto/opensslconf-32.h b/main/openssl/crypto/opensslconf-32.h
new file mode 100644
index 00000000..d6625489
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-32.h
@@ -0,0 +1,316 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_THREADS
+# define OPENSSL_THREADS
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+ asks for it. This is a transient feature that is provided for those
+ who haven't had the time to do the appropriate changes in their
+ applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <unistd.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alphas, because they lack byte load/store instructions;
+ */
+#define RC4_INT unsigned char
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#define RC4_CHUNK unsigned long
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned int
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#define BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#undef SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#define THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a
+ * 20% speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#define BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script, that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense
+ * of 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#define DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutmann <pgut001@cs.auckland.ac.nz>.
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+ CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find
+ even newer MIPS CPU's, but at the moment one size fits all for
+ optimization options. Older Sparc's work better with only UNROLL, but
+ there's no way to tell at compile time what it is you're running on */
+
+#if defined( sun ) /* Newer Sparc's */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* Systems-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslconf-64.h b/main/openssl/crypto/opensslconf-64.h
new file mode 100644
index 00000000..70c5a2cb
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-64.h
@@ -0,0 +1,316 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_THREADS
+# define OPENSSL_THREADS
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+ asks for it. This is a transient feature that is provided for those
+ who haven't had the time to do the appropriate changes in their
+ applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <unistd.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alphas, because they lack byte load/store instructions;
+ */
+#define RC4_INT unsigned char
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#define RC4_CHUNK unsigned long
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned int
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#undef BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#define SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#undef THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a
+ * 20% speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#define BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script, that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense
+ * of 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#define DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutmann <pgut001@cs.auckland.ac.nz>.
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+ CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find
+ even newer MIPS CPU's, but at the moment one size fits all for
+ optimization options. Older Sparc's work better with only UNROLL, but
+ there's no way to tell at compile time what it is you're running on */
+
+#if defined( sun ) /* Newer Sparc's */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* Systems-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
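The only substantive difference between the 32- and 64-bit variants of this generated header is the BN word-size block (BN_LLONG plus THIRTY_TWO_BIT versus SIXTY_FOUR_BIT_LONG). A simplified, hypothetical sketch of what bn.h derives from those macros (the real logic in bn.h has more cases and uses #define rather than typedef):

/* sketch only */
#if defined(SIXTY_FOUR_BIT_LONG)
typedef unsigned long BN_ULONG;	/* 64-bit limbs on LP64 targets */
#elif defined(THIRTY_TWO_BIT)
typedef unsigned int BN_ULONG;	/* 32-bit limbs; BN_LLONG enables a
				 * 64-bit intermediate product */
#endif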
diff --git a/main/openssl/crypto/opensslconf-static-32.h b/main/openssl/crypto/opensslconf-static-32.h
new file mode 100644
index 00000000..d6625489
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-static-32.h
@@ -0,0 +1,316 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_THREADS
+# define OPENSSL_THREADS
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+ asks for it. This is a transient feature that is provided for those
+ who haven't had the time to do the appropriate changes in their
+ applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <unistd.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means here, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alpha, because it lacks byte load/store instructions;
+ */
+#define RC4_INT unsigned char
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#define RC4_CHUNK unsigned long
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned int
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#define BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#undef SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#define THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a 20%
+ * speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#define BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script; that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense of
+ * 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#define DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutman <pgut001@cs.auckland.ac.nz>
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+   CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to detect
+   even newer MIPS CPUs, but at the moment one size fits all for
+   optimization options. Older Sparcs work better with only UNROLL, but
+   there's no way to tell at compile time what you're running on */
+
+#if defined( sun )			/* Newer Sparcs */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* System-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslconf-static-64.h b/main/openssl/crypto/opensslconf-static-64.h
new file mode 100644
index 00000000..70c5a2cb
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-static-64.h
@@ -0,0 +1,316 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_THREADS
+# define OPENSSL_THREADS
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+   asks for it. This is a transient feature provided for those who
+   haven't had time to make the appropriate changes in their
+   applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <unistd.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means here, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alpha, because it lacks byte load/store instructions;
+ */
+#define RC4_INT unsigned char
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#define RC4_CHUNK unsigned long
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned int
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#undef BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#define SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#undef THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a 20%
+ * speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#define BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script; that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense of
+ * 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#define DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutman <pgut001@cs.auckland.ac.nz>
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+   CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to detect
+   even newer MIPS CPUs, but at the moment one size fits all for
+   optimization options. Older Sparcs work better with only UNROLL, but
+   there's no way to tell at compile time what you're running on */
+
+#if defined( sun )			/* Newer Sparcs */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* System-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
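Annotation: these generated headers are consumed through the OPENSSL_NO_<ALG> feature-test macros; each one compiles an algorithm out, and portable callers are expected to guard optional features on them. A minimal sketch of the caller side (hypothetical application code, not part of this tree):

#include <openssl/opensslconf.h>
#include <openssl/evp.h>

/* Pick a digest that survives this build configuration: MD2 is
 * compiled out above, so fall back to SHA-1 when it is absent. */
static const EVP_MD *pick_digest(void)
{
#ifndef OPENSSL_NO_MD2
    return EVP_md2();
#else
    return EVP_sha1();        /* always present in this configuration */
#endif
}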
diff --git a/main/openssl/crypto/opensslconf-static-trusty.h b/main/openssl/crypto/opensslconf-static-trusty.h
new file mode 100644
index 00000000..06f9f982
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-static-trusty.h
@@ -0,0 +1,448 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_CMS
+# define OPENSSL_NO_CMS
+#endif
+#ifndef OPENSSL_NO_COMP
+# define OPENSSL_NO_COMP
+#endif
+#ifndef OPENSSL_NO_CONF
+# define OPENSSL_NO_CONF
+#endif
+#ifndef OPENSSL_NO_DES
+# define OPENSSL_NO_DES
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_LOCKING
+# define OPENSSL_NO_LOCKING
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MD4
+# define OPENSSL_NO_MD4
+#endif
+#ifndef OPENSSL_NO_MD5
+# define OPENSSL_NO_MD5
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_OCSP
+# define OPENSSL_NO_OCSP
+#endif
+#ifndef OPENSSL_NO_PEM
+# define OPENSSL_NO_PEM
+#endif
+#ifndef OPENSSL_NO_PKCS12
+# define OPENSSL_NO_PKCS12
+#endif
+#ifndef OPENSSL_NO_PQUEUE
+# define OPENSSL_NO_PQUEUE
+#endif
+#ifndef OPENSSL_NO_RC2
+# define OPENSSL_NO_RC2
+#endif
+#ifndef OPENSSL_NO_RC4
+# define OPENSSL_NO_RC4
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RIPEMD
+# define OPENSSL_NO_RIPEMD
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_SRP
+# define OPENSSL_NO_SRP
+#endif
+#ifndef OPENSSL_NO_SSL2
+# define OPENSSL_NO_SSL2
+#endif
+#ifndef OPENSSL_NO_SSL3
+# define OPENSSL_NO_SSL3
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_TLS1
+# define OPENSSL_NO_TLS1
+#endif
+#ifndef OPENSSL_NO_TLSEXT
+# define OPENSSL_NO_TLSEXT
+#endif
+#ifndef OPENSSL_NO_TS
+# define OPENSSL_NO_TS
+#endif
+#ifndef OPENSSL_NO_TXT_DB
+# define OPENSSL_NO_TXT_DB
+#endif
+#ifndef OPENSSL_NO_UI
+# define OPENSSL_NO_UI
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_NO_ERR
+# define OPENSSL_NO_ERR
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+   asks for it. This is a transient feature provided for those who
+   haven't had time to make the appropriate changes in their
+   applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_CMS) && !defined(NO_CMS)
+# define NO_CMS
+# endif
+# if defined(OPENSSL_NO_COMP) && !defined(NO_COMP)
+# define NO_COMP
+# endif
+# if defined(OPENSSL_NO_CONF) && !defined(NO_CONF)
+# define NO_CONF
+# endif
+# if defined(OPENSSL_NO_DES) && !defined(NO_DES)
+# define NO_DES
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_LOCKING) && !defined(NO_LOCKING)
+# define NO_LOCKING
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MD4) && !defined(NO_MD4)
+# define NO_MD4
+# endif
+# if defined(OPENSSL_NO_MD5) && !defined(NO_MD5)
+# define NO_MD5
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_OCSP) && !defined(NO_OCSP)
+# define NO_OCSP
+# endif
+# if defined(OPENSSL_NO_PEM) && !defined(NO_PEM)
+# define NO_PEM
+# endif
+# if defined(OPENSSL_NO_PKCS12) && !defined(NO_PKCS12)
+# define NO_PKCS12
+# endif
+# if defined(OPENSSL_NO_PQUEUE) && !defined(NO_PQUEUE)
+# define NO_PQUEUE
+# endif
+# if defined(OPENSSL_NO_RC2) && !defined(NO_RC2)
+# define NO_RC2
+# endif
+# if defined(OPENSSL_NO_RC4) && !defined(NO_RC4)
+# define NO_RC4
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD)
+# define NO_RIPEMD
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_SRP) && !defined(NO_SRP)
+# define NO_SRP
+# endif
+# if defined(OPENSSL_NO_SSL2) && !defined(NO_SSL2)
+# define NO_SSL2
+# endif
+# if defined(OPENSSL_NO_SSL3) && !defined(NO_SSL3)
+# define NO_SSL3
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_TLS1) && !defined(NO_TLS1)
+# define NO_TLS1
+# endif
+# if defined(OPENSSL_NO_TLSEXT) && !defined(NO_TLSEXT)
+# define NO_TLSEXT
+# endif
+# if defined(OPENSSL_NO_TS) && !defined(NO_TS)
+# define NO_TS
+# endif
+# if defined(OPENSSL_NO_TXT_DB) && !defined(NO_TXT_DB)
+# define NO_TXT_DB
+# endif
+# if defined(OPENSSL_NO_UI) && !defined(NO_UI)
+# define NO_UI
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <trusty_std.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means here, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alpha, because it lacks byte load/store instructions;
+ */
+#define RC4_INT unsigned int
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#undef RC4_CHUNK
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned long
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#undef BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#undef SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#define THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a 20%
+ * speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#undef BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script; that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense of
+ * 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#undef DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutman <pgut001@cs.auckland.ac.nz>
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+   CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to detect
+   even newer MIPS CPUs, but at the moment one size fits all for
+   optimization options. Older Sparcs work better with only UNROLL, but
+   there's no way to tell at compile time what you're running on */
+
+#if defined( sun )			/* Newer Sparcs */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* System-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslconf-static.h b/main/openssl/crypto/opensslconf-static.h
new file mode 100644
index 00000000..f63a6e0a
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-static.h
@@ -0,0 +1,6 @@
+// Auto-generated - DO NOT EDIT!
+#if defined(__LP64__)
+#include "opensslconf-static-64.h"
+#else
+#include "opensslconf-static-32.h"
+#endif
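Annotation: this dispatch header is what lets one source tree serve both ABIs; __LP64__ selects the 64-bit configuration (SIXTY_FOUR_BIT_LONG, BN_LLONG undefined) and everything else gets the 32-bit one (THIRTY_TWO_BIT, BN_LLONG defined). A toy translation-unit check, assuming the header is on the include path:

#include <stdio.h>
#include "opensslconf-static.h"

int main(void)
{
#if defined(__LP64__)
    puts("64-bit config: BN limbs are 64-bit longs (SIXTY_FOUR_BIT_LONG)");
#else
    puts("32-bit config: BN limbs are 32-bit (THIRTY_TWO_BIT, BN_LLONG)");
#endif
    return 0;
}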
diff --git a/main/openssl/crypto/opensslconf-trusty.h b/main/openssl/crypto/opensslconf-trusty.h
new file mode 100644
index 00000000..06f9f982
--- /dev/null
+++ b/main/openssl/crypto/opensslconf-trusty.h
@@ -0,0 +1,448 @@
+/* opensslconf.h */
+/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
+
+/* OpenSSL was configured with the following options: */
+#ifndef OPENSSL_DOING_MAKEDEPEND
+
+
+#ifndef OPENSSL_NO_CAMELLIA
+# define OPENSSL_NO_CAMELLIA
+#endif
+#ifndef OPENSSL_NO_CAPIENG
+# define OPENSSL_NO_CAPIENG
+#endif
+#ifndef OPENSSL_NO_CAST
+# define OPENSSL_NO_CAST
+#endif
+#ifndef OPENSSL_NO_CMS
+# define OPENSSL_NO_CMS
+#endif
+#ifndef OPENSSL_NO_COMP
+# define OPENSSL_NO_COMP
+#endif
+#ifndef OPENSSL_NO_CONF
+# define OPENSSL_NO_CONF
+#endif
+#ifndef OPENSSL_NO_DES
+# define OPENSSL_NO_DES
+#endif
+#ifndef OPENSSL_NO_DTLS1
+# define OPENSSL_NO_DTLS1
+#endif
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# define OPENSSL_NO_EC_NISTP_64_GCC_128
+#endif
+#ifndef OPENSSL_NO_GMP
+# define OPENSSL_NO_GMP
+#endif
+#ifndef OPENSSL_NO_GOST
+# define OPENSSL_NO_GOST
+#endif
+#ifndef OPENSSL_NO_HEARTBEATS
+# define OPENSSL_NO_HEARTBEATS
+#endif
+#ifndef OPENSSL_NO_IDEA
+# define OPENSSL_NO_IDEA
+#endif
+#ifndef OPENSSL_NO_JPAKE
+# define OPENSSL_NO_JPAKE
+#endif
+#ifndef OPENSSL_NO_KRB5
+# define OPENSSL_NO_KRB5
+#endif
+#ifndef OPENSSL_NO_LOCKING
+# define OPENSSL_NO_LOCKING
+#endif
+#ifndef OPENSSL_NO_MD2
+# define OPENSSL_NO_MD2
+#endif
+#ifndef OPENSSL_NO_MD4
+# define OPENSSL_NO_MD4
+#endif
+#ifndef OPENSSL_NO_MD5
+# define OPENSSL_NO_MD5
+#endif
+#ifndef OPENSSL_NO_MDC2
+# define OPENSSL_NO_MDC2
+#endif
+#ifndef OPENSSL_NO_OCSP
+# define OPENSSL_NO_OCSP
+#endif
+#ifndef OPENSSL_NO_PEM
+# define OPENSSL_NO_PEM
+#endif
+#ifndef OPENSSL_NO_PKCS12
+# define OPENSSL_NO_PKCS12
+#endif
+#ifndef OPENSSL_NO_PQUEUE
+# define OPENSSL_NO_PQUEUE
+#endif
+#ifndef OPENSSL_NO_RC2
+# define OPENSSL_NO_RC2
+#endif
+#ifndef OPENSSL_NO_RC4
+# define OPENSSL_NO_RC4
+#endif
+#ifndef OPENSSL_NO_RC5
+# define OPENSSL_NO_RC5
+#endif
+#ifndef OPENSSL_NO_RDRAND
+# define OPENSSL_NO_RDRAND
+#endif
+#ifndef OPENSSL_NO_RFC3779
+# define OPENSSL_NO_RFC3779
+#endif
+#ifndef OPENSSL_NO_RIPEMD
+# define OPENSSL_NO_RIPEMD
+#endif
+#ifndef OPENSSL_NO_RSAX
+# define OPENSSL_NO_RSAX
+#endif
+#ifndef OPENSSL_NO_SCTP
+# define OPENSSL_NO_SCTP
+#endif
+#ifndef OPENSSL_NO_SEED
+# define OPENSSL_NO_SEED
+#endif
+#ifndef OPENSSL_NO_SHA0
+# define OPENSSL_NO_SHA0
+#endif
+#ifndef OPENSSL_NO_SRP
+# define OPENSSL_NO_SRP
+#endif
+#ifndef OPENSSL_NO_SSL2
+# define OPENSSL_NO_SSL2
+#endif
+#ifndef OPENSSL_NO_SSL3
+# define OPENSSL_NO_SSL3
+#endif
+#ifndef OPENSSL_NO_STATIC_ENGINE
+# define OPENSSL_NO_STATIC_ENGINE
+#endif
+#ifndef OPENSSL_NO_STORE
+# define OPENSSL_NO_STORE
+#endif
+#ifndef OPENSSL_NO_TLS1
+# define OPENSSL_NO_TLS1
+#endif
+#ifndef OPENSSL_NO_TLSEXT
+# define OPENSSL_NO_TLSEXT
+#endif
+#ifndef OPENSSL_NO_TS
+# define OPENSSL_NO_TS
+#endif
+#ifndef OPENSSL_NO_TXT_DB
+# define OPENSSL_NO_TXT_DB
+#endif
+#ifndef OPENSSL_NO_UI
+# define OPENSSL_NO_UI
+#endif
+#ifndef OPENSSL_NO_WHIRLPOOL
+# define OPENSSL_NO_WHIRLPOOL
+#endif
+
+#endif /* OPENSSL_DOING_MAKEDEPEND */
+
+#ifndef OPENSSL_NO_ERR
+# define OPENSSL_NO_ERR
+#endif
+#ifndef OPENSSL_NO_DYNAMIC_ENGINE
+# define OPENSSL_NO_DYNAMIC_ENGINE
+#endif
+
+/* The OPENSSL_NO_* macros are also defined as NO_* if the application
+   asks for it. This is a transient feature provided for those who
+   haven't had time to make the appropriate changes in their
+   applications. */
+#ifdef OPENSSL_ALGORITHM_DEFINES
+# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA)
+# define NO_CAMELLIA
+# endif
+# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG)
+# define NO_CAPIENG
+# endif
+# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
+# define NO_CAST
+# endif
+# if defined(OPENSSL_NO_CMS) && !defined(NO_CMS)
+# define NO_CMS
+# endif
+# if defined(OPENSSL_NO_COMP) && !defined(NO_COMP)
+# define NO_COMP
+# endif
+# if defined(OPENSSL_NO_CONF) && !defined(NO_CONF)
+# define NO_CONF
+# endif
+# if defined(OPENSSL_NO_DES) && !defined(NO_DES)
+# define NO_DES
+# endif
+# if defined(OPENSSL_NO_DTLS1) && !defined(NO_DTLS1)
+# define NO_DTLS1
+# endif
+# if defined(OPENSSL_NO_EC_NISTP_64_GCC_128) && !defined(NO_EC_NISTP_64_GCC_128)
+# define NO_EC_NISTP_64_GCC_128
+# endif
+# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
+# define NO_GMP
+# endif
+# if defined(OPENSSL_NO_GOST) && !defined(NO_GOST)
+# define NO_GOST
+# endif
+# if defined(OPENSSL_NO_HEARTBEATS) && !defined(NO_HEARTBEATS)
+# define NO_HEARTBEATS
+# endif
+# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
+# define NO_IDEA
+# endif
+# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
+# define NO_JPAKE
+# endif
+# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
+# define NO_KRB5
+# endif
+# if defined(OPENSSL_NO_LOCKING) && !defined(NO_LOCKING)
+# define NO_LOCKING
+# endif
+# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
+# define NO_MD2
+# endif
+# if defined(OPENSSL_NO_MD4) && !defined(NO_MD4)
+# define NO_MD4
+# endif
+# if defined(OPENSSL_NO_MD5) && !defined(NO_MD5)
+# define NO_MD5
+# endif
+# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2)
+# define NO_MDC2
+# endif
+# if defined(OPENSSL_NO_OCSP) && !defined(NO_OCSP)
+# define NO_OCSP
+# endif
+# if defined(OPENSSL_NO_PEM) && !defined(NO_PEM)
+# define NO_PEM
+# endif
+# if defined(OPENSSL_NO_PKCS12) && !defined(NO_PKCS12)
+# define NO_PKCS12
+# endif
+# if defined(OPENSSL_NO_PQUEUE) && !defined(NO_PQUEUE)
+# define NO_PQUEUE
+# endif
+# if defined(OPENSSL_NO_RC2) && !defined(NO_RC2)
+# define NO_RC2
+# endif
+# if defined(OPENSSL_NO_RC4) && !defined(NO_RC4)
+# define NO_RC4
+# endif
+# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
+# define NO_RC5
+# endif
+# if defined(OPENSSL_NO_RDRAND) && !defined(NO_RDRAND)
+# define NO_RDRAND
+# endif
+# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
+# define NO_RFC3779
+# endif
+# if defined(OPENSSL_NO_RIPEMD) && !defined(NO_RIPEMD)
+# define NO_RIPEMD
+# endif
+# if defined(OPENSSL_NO_RSAX) && !defined(NO_RSAX)
+# define NO_RSAX
+# endif
+# if defined(OPENSSL_NO_SCTP) && !defined(NO_SCTP)
+# define NO_SCTP
+# endif
+# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
+# define NO_SEED
+# endif
+# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
+# define NO_SHA0
+# endif
+# if defined(OPENSSL_NO_SRP) && !defined(NO_SRP)
+# define NO_SRP
+# endif
+# if defined(OPENSSL_NO_SSL2) && !defined(NO_SSL2)
+# define NO_SSL2
+# endif
+# if defined(OPENSSL_NO_SSL3) && !defined(NO_SSL3)
+# define NO_SSL3
+# endif
+# if defined(OPENSSL_NO_STATIC_ENGINE) && !defined(NO_STATIC_ENGINE)
+# define NO_STATIC_ENGINE
+# endif
+# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
+# define NO_STORE
+# endif
+# if defined(OPENSSL_NO_TLS1) && !defined(NO_TLS1)
+# define NO_TLS1
+# endif
+# if defined(OPENSSL_NO_TLSEXT) && !defined(NO_TLSEXT)
+# define NO_TLSEXT
+# endif
+# if defined(OPENSSL_NO_TS) && !defined(NO_TS)
+# define NO_TS
+# endif
+# if defined(OPENSSL_NO_TXT_DB) && !defined(NO_TXT_DB)
+# define NO_TXT_DB
+# endif
+# if defined(OPENSSL_NO_UI) && !defined(NO_UI)
+# define NO_UI
+# endif
+# if defined(OPENSSL_NO_WHIRLPOOL) && !defined(NO_WHIRLPOOL)
+# define NO_WHIRLPOOL
+# endif
+#endif
+
+/* crypto/opensslconf.h.in */
+
+/* Generate 80386 code? */
+#undef I386_ONLY
+
+#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
+#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
+#define ENGINESDIR "/usr/local/ssl/lib/engines"
+#define OPENSSLDIR "/usr/local/ssl"
+#endif
+#endif
+
+#undef OPENSSL_UNISTD
+#define OPENSSL_UNISTD <trusty_std.h>
+
+#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
+
+#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
+#define IDEA_INT unsigned int
+#endif
+
+#if defined(HEADER_MD2_H) && !defined(MD2_INT)
+#define MD2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC2_H) && !defined(RC2_INT)
+/* I need to put in a mod for the alpha - eay */
+#define RC2_INT unsigned int
+#endif
+
+#if defined(HEADER_RC4_H)
+#if !defined(RC4_INT)
+/* using int types makes the structure larger but makes the code faster
+ * on most boxes I have tested - up to 20% faster. */
+/*
+ * I don't know what "most" means here, but declaring "int" is a must on:
+ * - Intel P6, because partial register stalls are very expensive;
+ * - older Alpha, because it lacks byte load/store instructions;
+ */
+#define RC4_INT unsigned int
+#endif
+#if !defined(RC4_CHUNK)
+/*
+ * This enables code handling data aligned at natural CPU word
+ * boundary. See crypto/rc4/rc4_enc.c for further details.
+ */
+#undef RC4_CHUNK
+#endif
+#endif
+
+#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
+/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
+ * 20% speed up (longs are 8 bytes, ints are 4). */
+#ifndef DES_LONG
+#define DES_LONG unsigned long
+#endif
+#endif
+
+#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
+#define CONFIG_HEADER_BN_H
+#undef BN_LLONG
+
+/* Should we define BN_DIV2W here? */
+
+/* Only one of the following should be defined */
+#undef SIXTY_FOUR_BIT_LONG
+#undef SIXTY_FOUR_BIT
+#define THIRTY_TWO_BIT
+#endif
+
+#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
+#define CONFIG_HEADER_RC4_LOCL_H
+/* if this is defined, data[i] is used instead of *data; this is a 20%
+ * speedup on x86 */
+#undef RC4_INDEX
+#endif
+
+#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
+#define CONFIG_HEADER_BF_LOCL_H
+#undef BF_PTR
+#endif /* HEADER_BF_LOCL_H */
+
+#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
+#define CONFIG_HEADER_DES_LOCL_H
+#ifndef DES_DEFAULT_OPTIONS
+/* the following is tweaked from a config script; that is why it is a
+ * protected undef/define */
+#ifndef DES_PTR
+#undef DES_PTR
+#endif
+
+/* This helps the C compiler generate the correct code for multiple
+ * functional units. It reduces register dependencies at the expense of
+ * 2 more registers */
+#ifndef DES_RISC1
+#undef DES_RISC1
+#endif
+
+#ifndef DES_RISC2
+#undef DES_RISC2
+#endif
+
+#if defined(DES_RISC1) && defined(DES_RISC2)
+YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
+#endif
+
+/* Unroll the inner loop; this sometimes helps, sometimes hinders.
+ * Very much CPU dependent */
+#ifndef DES_UNROLL
+#undef DES_UNROLL
+#endif
+
+/* These default values were supplied by
+ * Peter Gutman <pgut001@cs.auckland.ac.nz>
+ * They are only used if nothing else has been defined */
+#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
+/* Special defines which change the way the code is built depending on the
+   CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to detect
+   even newer MIPS CPUs, but at the moment one size fits all for
+   optimization options. Older Sparcs work better with only UNROLL, but
+   there's no way to tell at compile time what you're running on */
+
+#if defined( sun )			/* Newer Sparcs */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#elif defined( __ultrix ) /* Older MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined( __osf1__ ) /* Alpha */
+# define DES_PTR
+# define DES_RISC2
+#elif defined ( _AIX ) /* RS6000 */
+ /* Unknown */
+#elif defined( __hpux ) /* HP-PA */
+ /* Unknown */
+#elif defined( __aux ) /* 68K */
+ /* Unknown */
+#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
+# define DES_UNROLL
+#elif defined( __sgi ) /* Newer MIPS */
+# define DES_PTR
+# define DES_RISC2
+# define DES_UNROLL
+#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
+# define DES_PTR
+# define DES_RISC1
+# define DES_UNROLL
+#endif /* System-specific speed defines */
+#endif
+
+#endif /* DES_DEFAULT_OPTIONS */
+#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslconf.h b/main/openssl/crypto/opensslconf.h
index 26ac6ba3..94212a08 100644
--- a/main/openssl/crypto/opensslconf.h
+++ b/main/openssl/crypto/opensslconf.h
@@ -1,250 +1,10 @@
-/* opensslconf.h */
-/* WARNING: Generated automatically from opensslconf.h.in by Configure. */
-
-/* OpenSSL was configured with the following options: */
-#ifndef OPENSSL_DOING_MAKEDEPEND
-
-
-#ifndef OPENSSL_NO_CAST
-# define OPENSSL_NO_CAST
+// Auto-generated - DO NOT EDIT!
+#ifndef OPENSSL_SYS_TRUSTY
+#if defined(__LP64__)
+#include "opensslconf-64.h"
+#else
+#include "opensslconf-32.h"
#endif
-#ifndef OPENSSL_NO_GMP
-# define OPENSSL_NO_GMP
+#else
+#include "opensslconf-trusty.h"
#endif
-#ifndef OPENSSL_NO_IDEA
-# define OPENSSL_NO_IDEA
-#endif
-#ifndef OPENSSL_NO_JPAKE
-# define OPENSSL_NO_JPAKE
-#endif
-#ifndef OPENSSL_NO_KRB5
-# define OPENSSL_NO_KRB5
-#endif
-#ifndef OPENSSL_NO_MD2
-# define OPENSSL_NO_MD2
-#endif
-#ifndef OPENSSL_NO_RC5
-# define OPENSSL_NO_RC5
-#endif
-#ifndef OPENSSL_NO_RFC3779
-# define OPENSSL_NO_RFC3779
-#endif
-#ifndef OPENSSL_NO_SEED
-# define OPENSSL_NO_SEED
-#endif
-#ifndef OPENSSL_NO_SHA0
-# define OPENSSL_NO_SHA0
-#endif
-#ifndef OPENSSL_NO_STORE
-# define OPENSSL_NO_STORE
-#endif
-#ifndef OPENSSL_NO_WHRLPOOL
-# define OPENSSL_NO_WHRLPOOL
-#endif
-
-#endif /* OPENSSL_DOING_MAKEDEPEND */
-
-#ifndef OPENSSL_THREADS
-# define OPENSSL_THREADS
-#endif
-#ifndef OPENSSL_NO_DYNAMIC_ENGINE
-# define OPENSSL_NO_DYNAMIC_ENGINE
-#endif
-
-/* The OPENSSL_NO_* macros are also defined as NO_* if the application
- asks for it. This is a transient feature that is provided for those
- who haven't had the time to do the appropriate changes in their
- applications. */
-#ifdef OPENSSL_ALGORITHM_DEFINES
-# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST)
-# define NO_CAST
-# endif
-# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP)
-# define NO_GMP
-# endif
-# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA)
-# define NO_IDEA
-# endif
-# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE)
-# define NO_JPAKE
-# endif
-# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5)
-# define NO_KRB5
-# endif
-# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2)
-# define NO_MD2
-# endif
-# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5)
-# define NO_RC5
-# endif
-# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779)
-# define NO_RFC3779
-# endif
-# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED)
-# define NO_SEED
-# endif
-# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0)
-# define NO_SHA0
-# endif
-# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE)
-# define NO_STORE
-# endif
-# if defined(OPENSSL_NO_WHRLPOOL) && !defined(NO_WHRLPOOL)
-# define NO_WHRLPOOL
-# endif
-#endif
-
-/* crypto/opensslconf.h.in */
-
-/* Generate 80386 code? */
-#undef I386_ONLY
-
-#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */
-#if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR)
-#define ENGINESDIR "/usr/local/ssl/lib/engines"
-#define OPENSSLDIR "/usr/local/ssl"
-#endif
-#endif
-
-#undef OPENSSL_UNISTD
-#define OPENSSL_UNISTD <unistd.h>
-
-#undef OPENSSL_EXPORT_VAR_AS_FUNCTION
-
-#if defined(HEADER_IDEA_H) && !defined(IDEA_INT)
-#define IDEA_INT unsigned int
-#endif
-
-#if defined(HEADER_MD2_H) && !defined(MD2_INT)
-#define MD2_INT unsigned int
-#endif
-
-#if defined(HEADER_RC2_H) && !defined(RC2_INT)
-/* I need to put in a mod for the alpha - eay */
-#define RC2_INT unsigned int
-#endif
-
-#if defined(HEADER_RC4_H)
-#if !defined(RC4_INT)
-/* using int types make the structure larger but make the code faster
- * on most boxes I have tested - up to %20 faster. */
-/*
- * I don't know what does "most" mean, but declaring "int" is a must on:
- * - Intel P6 because partial register stalls are very expensive;
- * - elder Alpha because it lacks byte load/store instructions;
- */
-#define RC4_INT unsigned char
-#endif
-#if !defined(RC4_CHUNK)
-/*
- * This enables code handling data aligned at natural CPU word
- * boundary. See crypto/rc4/rc4_enc.c for further details.
- */
-#define RC4_CHUNK unsigned long
-#endif
-#endif
-
-#if (defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H)) && !defined(DES_LONG)
-/* If this is set to 'unsigned int' on a DEC Alpha, this gives about a
- * %20 speed up (longs are 8 bytes, int's are 4). */
-#ifndef DES_LONG
-#define DES_LONG unsigned int
-#endif
-#endif
-
-#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H)
-#define CONFIG_HEADER_BN_H
-#define BN_LLONG
-
-/* Should we define BN_DIV2W here? */
-
-/* Only one for the following should be defined */
-#undef SIXTY_FOUR_BIT_LONG
-#undef SIXTY_FOUR_BIT
-#define THIRTY_TWO_BIT
-#endif
-
-#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H)
-#define CONFIG_HEADER_RC4_LOCL_H
-/* if this is defined data[i] is used instead of *data, this is a %20
- * speedup on x86 */
-#undef RC4_INDEX
-#endif
-
-#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
-#define CONFIG_HEADER_BF_LOCL_H
-#define BF_PTR
-#endif /* HEADER_BF_LOCL_H */
-
-#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
-#define CONFIG_HEADER_DES_LOCL_H
-#ifndef DES_DEFAULT_OPTIONS
-/* the following is tweaked from a config script, that is why it is a
- * protected undef/define */
-#ifndef DES_PTR
-#undef DES_PTR
-#endif
-
-/* This helps C compiler generate the correct code for multiple functional
- * units. It reduces register dependancies at the expense of 2 more
- * registers */
-#ifndef DES_RISC1
-#undef DES_RISC1
-#endif
-
-#ifndef DES_RISC2
-#undef DES_RISC2
-#endif
-
-#if defined(DES_RISC1) && defined(DES_RISC2)
-YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!!
-#endif
-
-/* Unroll the inner loop, this sometimes helps, sometimes hinders.
- * Very mucy CPU dependant */
-#ifndef DES_UNROLL
-#define DES_UNROLL
-#endif
-
-/* These default values were supplied by
- * Peter Gutman <pgut001@cs.auckland.ac.nz>
- * They are only used if nothing else has been defined */
-#if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL)
-/* Special defines which change the way the code is built depending on the
- CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find
- even newer MIPS CPU's, but at the moment one size fits all for
- optimization options. Older Sparc's work better with only UNROLL, but
- there's no way to tell at compile time what it is you're running on */
-
-#if defined( sun ) /* Newer Sparc's */
-# define DES_PTR
-# define DES_RISC1
-# define DES_UNROLL
-#elif defined( __ultrix ) /* Older MIPS */
-# define DES_PTR
-# define DES_RISC2
-# define DES_UNROLL
-#elif defined( __osf1__ ) /* Alpha */
-# define DES_PTR
-# define DES_RISC2
-#elif defined ( _AIX ) /* RS6000 */
- /* Unknown */
-#elif defined( __hpux ) /* HP-PA */
- /* Unknown */
-#elif defined( __aux ) /* 68K */
- /* Unknown */
-#elif defined( __dgux ) /* 88K (but P6 in latest boxes) */
-# define DES_UNROLL
-#elif defined( __sgi ) /* Newer MIPS */
-# define DES_PTR
-# define DES_RISC2
-# define DES_UNROLL
-#elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */
-# define DES_PTR
-# define DES_RISC1
-# define DES_UNROLL
-#endif /* Systems-specific speed defines */
-#endif
-
-#endif /* DES_DEFAULT_OPTIONS */
-#endif /* HEADER_DES_LOCL_H */
diff --git a/main/openssl/crypto/opensslv.h b/main/openssl/crypto/opensslv.h
index 310a3387..b27a5bb8 100644
--- a/main/openssl/crypto/opensslv.h
+++ b/main/openssl/crypto/opensslv.h
@@ -25,11 +25,11 @@
* (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
* major minor fix final patch/beta)
*/
-#define OPENSSL_VERSION_NUMBER 0x1000005fL
+#define OPENSSL_VERSION_NUMBER 0x1000106fL
#ifdef OPENSSL_FIPS
-#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.0e-fips 6 Sep 2011"
+#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1f-fips 6 Jan 2014"
#else
-#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.0e 6 Sep 2011"
+#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1f 6 Jan 2014"
#endif
#define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT
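Annotation: OPENSSL_VERSION_NUMBER packs its fields as MNNFFPPS (major, minor, fix, patch, status), so 0x1000106fL reads as 1.0.1f with status 0xf (release). A small decoder, written here only to document the layout:

#include <stdio.h>

/* MNNFFPPS layout: 0x1000106fL -> major 1, minor 0, fix 1,
 * patch 0x06 ('f', since 0x01 is 'a'), status 0xf (release). */
static void print_version(long v)
{
    long patch = (v >> 4) & 0xff;
    printf("%ld.%ld.%ld%c (status 0x%lx)\n",
           (v >> 28) & 0xf, (v >> 20) & 0xff, (v >> 12) & 0xff,
           patch ? (int)('a' + patch - 1) : ' ',
           v & 0xf);
}

int main(void) { print_version(0x1000106fL); return 0; }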
diff --git a/main/openssl/crypto/ossl_typ.h b/main/openssl/crypto/ossl_typ.h
index 12bd7014..ea9227f6 100644
--- a/main/openssl/crypto/ossl_typ.h
+++ b/main/openssl/crypto/ossl_typ.h
@@ -91,10 +91,12 @@ typedef struct asn1_string_st ASN1_TIME;
typedef struct asn1_string_st ASN1_GENERALIZEDTIME;
typedef struct asn1_string_st ASN1_VISIBLESTRING;
typedef struct asn1_string_st ASN1_UTF8STRING;
+typedef struct asn1_string_st ASN1_STRING;
typedef int ASN1_BOOLEAN;
typedef int ASN1_NULL;
#endif
+typedef struct ASN1_ITEM_st ASN1_ITEM;
typedef struct asn1_pctx_st ASN1_PCTX;
#ifdef OPENSSL_SYS_WIN32
diff --git a/main/openssl/crypto/pariscid.pl b/main/openssl/crypto/pariscid.pl
new file mode 100644
index 00000000..bfc56fdc
--- /dev/null
+++ b/main/openssl/crypto/pariscid.pl
@@ -0,0 +1,225 @@
+#!/usr/bin/env perl
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $ST ="std";
+} else {
+ $LEVEL ="1.1";
+ $SIZE_T =4;
+ $ST ="stw";
+}
+
+$rp="%r2";
+$sp="%r30";
+$rv="%r28";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT OPENSSL_cpuid_setup,ENTRY
+ .ALIGN 8
+OPENSSL_cpuid_setup
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ bv ($rp)
+ .EXIT
+ nop
+ .PROCEND
+
+ .EXPORT OPENSSL_rdtsc,ENTRY
+ .ALIGN 8
+OPENSSL_rdtsc
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ mfctl %cr16,$rv
+ bv ($rp)
+ .EXIT
+ nop
+ .PROCEND
+
+ .EXPORT OPENSSL_wipe_cpu,ENTRY
+ .ALIGN 8
+OPENSSL_wipe_cpu
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ xor %r0,%r0,%r1
+ fcpy,dbl %fr0,%fr4
+ xor %r0,%r0,%r19
+ fcpy,dbl %fr0,%fr5
+ xor %r0,%r0,%r20
+ fcpy,dbl %fr0,%fr6
+ xor %r0,%r0,%r21
+ fcpy,dbl %fr0,%fr7
+ xor %r0,%r0,%r22
+ fcpy,dbl %fr0,%fr8
+ xor %r0,%r0,%r23
+ fcpy,dbl %fr0,%fr9
+ xor %r0,%r0,%r24
+ fcpy,dbl %fr0,%fr10
+ xor %r0,%r0,%r25
+ fcpy,dbl %fr0,%fr11
+ xor %r0,%r0,%r26
+ fcpy,dbl %fr0,%fr22
+ xor %r0,%r0,%r29
+ fcpy,dbl %fr0,%fr23
+ xor %r0,%r0,%r31
+ fcpy,dbl %fr0,%fr24
+ fcpy,dbl %fr0,%fr25
+ fcpy,dbl %fr0,%fr26
+ fcpy,dbl %fr0,%fr27
+ fcpy,dbl %fr0,%fr28
+ fcpy,dbl %fr0,%fr29
+ fcpy,dbl %fr0,%fr30
+ fcpy,dbl %fr0,%fr31
+ bv ($rp)
+ .EXIT
+ ldo 0($sp),$rv
+ .PROCEND
+___
+{
+my $inp="%r26";
+my $len="%r25";
+
+$code.=<<___;
+ .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 8
+OPENSSL_cleanse
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ cmpib,*= 0,$len,L\$done
+ nop
+ cmpib,*>>= 15,$len,L\$ittle
+ ldi $SIZE_T-1,%r1
+
+L\$align
+ and,*<> $inp,%r1,%r28
+ b,n L\$aligned
+ stb %r0,0($inp)
+ ldo -1($len),$len
+ b L\$align
+ ldo 1($inp),$inp
+
+L\$aligned
+ andcm $len,%r1,%r28
+L\$ot
+ $ST %r0,0($inp)
+ addib,*<> -$SIZE_T,%r28,L\$ot
+ ldo $SIZE_T($inp),$inp
+
+ and,*<> $len,%r1,$len
+ b,n L\$done
+L\$ittle
+ stb %r0,0($inp)
+ addib,*<> -1,$len,L\$ittle
+ ldo 1($inp),$inp
+L\$done
+ bv ($rp)
+ .EXIT
+ nop
+ .PROCEND
+___
+}
+{
+my ($out,$cnt,$max)=("%r26","%r25","%r24");
+my ($tick,$lasttick)=("%r23","%r22");
+my ($diff,$lastdiff)=("%r21","%r20");
+
+$code.=<<___;
+ .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 8
+OPENSSL_instrument_bus
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ copy $cnt,$rv
+ mfctl %cr16,$tick
+ copy $tick,$lasttick
+ ldi 0,$diff
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+L\$oop
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ addib,<> -1,$cnt,L\$oop
+ addi 4,$out,$out
+
+ bv ($rp)
+ .EXIT
+ sub $rv,$cnt,$rv
+ .PROCEND
+
+ .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 8
+OPENSSL_instrument_bus2
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ copy $cnt,$rv
+ sub %r0,$cnt,$cnt
+
+ mfctl %cr16,$tick
+ copy $tick,$lasttick
+ ldi 0,$diff
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+L\$oop2
+ copy $diff,$lastdiff
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ addib,= -1,$max,L\$done2
+ nop
+
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+ cmpclr,<> $lastdiff,$diff,$tick
+ ldi 1,$tick
+
+ ldi 1,%r1
+ xor %r1,$tick,$tick
+ addb,<> $tick,$cnt,L\$oop2
+ shladd,l $tick,2,$out,$out
+L\$done2
+ bv ($rp)
+ .EXIT
+ add $rv,$cnt,$rv
+ .PROCEND
+___
+}
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
+print $code;
+close STDOUT;
+
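Annotation: the OPENSSL_cleanse routine above zeroes byte-by-byte until the pointer is word-aligned, then stores a word at a time; writing it in assembly keeps the compiler from optimizing the stores away. Roughly equivalent C, as a sketch only (the portable version in crypto/mem_clr.c uses a different anti-elision trick):

#include <stddef.h>

/* Zero 'len' bytes of 'ptr' in a way the optimizer must not remove.
 * The volatile-qualified walk is illustrative, not OpenSSL's code. */
static void cleanse_sketch(void *ptr, size_t len)
{
    volatile unsigned char *p = ptr;
    while (len--)
        *p++ = 0;
}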
diff --git a/main/openssl/crypto/pem/pem_all.c b/main/openssl/crypto/pem/pem_all.c
index 3e7a6093..eac0460e 100644
--- a/main/openssl/crypto/pem/pem_all.c
+++ b/main/openssl/crypto/pem/pem_all.c
@@ -193,7 +193,61 @@ RSA *PEM_read_RSAPrivateKey(FILE *fp, RSA **rsa, pem_password_cb *cb,
#endif
+#ifdef OPENSSL_FIPS
+
+int PEM_write_bio_RSAPrivateKey(BIO *bp, RSA *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_RSA(k, x);
+
+ ret = PEM_write_bio_PrivateKey(bp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write_bio((i2d_of_void *)i2d_RSAPrivateKey,
+ PEM_STRING_RSA,bp,x,enc,kstr,klen,cb,u);
+}
+
+#ifndef OPENSSL_NO_FP_API
+int PEM_write_RSAPrivateKey(FILE *fp, RSA *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+
+ EVP_PKEY_set1_RSA(k, x);
+
+ ret = PEM_write_PrivateKey(fp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write((i2d_of_void *)i2d_RSAPrivateKey,
+ PEM_STRING_RSA,fp,x,enc,kstr,klen,cb,u);
+}
+#endif
+
+#else
+
IMPLEMENT_PEM_write_cb_const(RSAPrivateKey, RSA, PEM_STRING_RSA, RSAPrivateKey)
+
+#endif
+
IMPLEMENT_PEM_rw_const(RSAPublicKey, RSA, PEM_STRING_RSA_PUBLIC, RSAPublicKey)
IMPLEMENT_PEM_rw(RSA_PUBKEY, RSA, PEM_STRING_PUBLIC, RSA_PUBKEY)
@@ -223,7 +277,59 @@ DSA *PEM_read_bio_DSAPrivateKey(BIO *bp, DSA **dsa, pem_password_cb *cb,
return pkey_get_dsa(pktmp, dsa); /* will free pktmp */
}
+#ifdef OPENSSL_FIPS
+
+int PEM_write_bio_DSAPrivateKey(BIO *bp, DSA *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_DSA(k, x);
+
+ ret = PEM_write_bio_PrivateKey(bp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write_bio((i2d_of_void *)i2d_DSAPrivateKey,
+ PEM_STRING_DSA,bp,x,enc,kstr,klen,cb,u);
+}
+
+#ifndef OPENSSL_NO_FP_API
+int PEM_write_DSAPrivateKey(FILE *fp, DSA *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_DSA(k, x);
+ ret = PEM_write_PrivateKey(fp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write((i2d_of_void *)i2d_DSAPrivateKey,
+ PEM_STRING_DSA,fp,x,enc,kstr,klen,cb,u);
+}
+#endif
+
+#else
+
IMPLEMENT_PEM_write_cb_const(DSAPrivateKey, DSA, PEM_STRING_DSA, DSAPrivateKey)
+
+#endif
+
IMPLEMENT_PEM_rw(DSA_PUBKEY, DSA, PEM_STRING_PUBLIC, DSA_PUBKEY)
#ifndef OPENSSL_NO_FP_API
@@ -269,8 +375,63 @@ EC_KEY *PEM_read_bio_ECPrivateKey(BIO *bp, EC_KEY **key, pem_password_cb *cb,
IMPLEMENT_PEM_rw_const(ECPKParameters, EC_GROUP, PEM_STRING_ECPARAMETERS, ECPKParameters)
+
+
+#ifdef OPENSSL_FIPS
+
+int PEM_write_bio_ECPrivateKey(BIO *bp, EC_KEY *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_EC_KEY(k, x);
+
+ ret = PEM_write_bio_PrivateKey(bp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write_bio((i2d_of_void *)i2d_ECPrivateKey,
+ PEM_STRING_ECPRIVATEKEY,
+ bp,x,enc,kstr,klen,cb,u);
+}
+
+#ifndef OPENSSL_NO_FP_API
+int PEM_write_ECPrivateKey(FILE *fp, EC_KEY *x, const EVP_CIPHER *enc,
+ unsigned char *kstr, int klen,
+ pem_password_cb *cb, void *u)
+{
+ if (FIPS_mode())
+ {
+ EVP_PKEY *k;
+ int ret;
+ k = EVP_PKEY_new();
+ if (!k)
+ return 0;
+ EVP_PKEY_set1_EC_KEY(k, x);
+ ret = PEM_write_PrivateKey(fp, k, enc, kstr, klen, cb, u);
+ EVP_PKEY_free(k);
+ return ret;
+ }
+ else
+ return PEM_ASN1_write((i2d_of_void *)i2d_ECPrivateKey,
+ PEM_STRING_ECPRIVATEKEY,
+ fp,x,enc,kstr,klen,cb,u);
+}
+#endif
+
+#else
+
IMPLEMENT_PEM_write_cb(ECPrivateKey, EC_KEY, PEM_STRING_ECPRIVATEKEY, ECPrivateKey)
+#endif
+
IMPLEMENT_PEM_rw(EC_PUBKEY, EC_KEY, PEM_STRING_PUBLIC, EC_PUBKEY)
#ifndef OPENSSL_NO_FP_API
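Annotation: caller-visible behavior of the FIPS wrappers added above: the legacy per-key-type writers keep their signatures, but under FIPS_mode() they detour through an EVP_PKEY so serialization goes through the FIPS module. Existing call sites compile unchanged; a sketch of one (save_rsa_key is a hypothetical caller):

#include <string.h>
#include <openssl/pem.h>

/* Hypothetical caller: identical whether FIPS mode is on or off. */
static int save_rsa_key(BIO *bp, RSA *rsa, const char *pass)
{
    return PEM_write_bio_RSAPrivateKey(bp, rsa, EVP_aes_256_cbc(),
                                       (unsigned char *)pass,
                                       (int)strlen(pass), NULL, NULL);
}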
diff --git a/main/openssl/crypto/pem/pem_info.c b/main/openssl/crypto/pem/pem_info.c
index 1b2be527..cc7f24a9 100644
--- a/main/openssl/crypto/pem/pem_info.c
+++ b/main/openssl/crypto/pem/pem_info.c
@@ -167,6 +167,7 @@ start:
#ifndef OPENSSL_NO_RSA
if (strcmp(name,PEM_STRING_RSA) == 0)
{
+ d2i=(D2I_OF(void))d2i_RSAPrivateKey;
if (xi->x_pkey != NULL)
{
if (!sk_X509_INFO_push(ret,xi)) goto err;
diff --git a/main/openssl/crypto/pem/pem_lib.c b/main/openssl/crypto/pem/pem_lib.c
index cfc89a99..5a421fc4 100644
--- a/main/openssl/crypto/pem/pem_lib.c
+++ b/main/openssl/crypto/pem/pem_lib.c
@@ -394,7 +394,8 @@ int PEM_ASN1_write_bio(i2d_of_void *i2d, const char *name, BIO *bp,
goto err;
/* The 'iv' is used as the iv and as a salt. It is
* NOT taken from the BytesToKey function */
- EVP_BytesToKey(enc,EVP_md5(),iv,kstr,klen,1,key,NULL);
+ if (!EVP_BytesToKey(enc,EVP_md5(),iv,kstr,klen,1,key,NULL))
+ goto err;
if (kstr == (unsigned char *)buf) OPENSSL_cleanse(buf,PEM_BUFSIZE);
@@ -406,12 +407,15 @@ int PEM_ASN1_write_bio(i2d_of_void *i2d, const char *name, BIO *bp,
/* k=strlen(buf); */
EVP_CIPHER_CTX_init(&ctx);
- EVP_EncryptInit_ex(&ctx,enc,NULL,key,iv);
- EVP_EncryptUpdate(&ctx,data,&j,data,i);
- EVP_EncryptFinal_ex(&ctx,&(data[j]),&i);
+ ret = 1;
+ if (!EVP_EncryptInit_ex(&ctx,enc,NULL,key,iv)
+ || !EVP_EncryptUpdate(&ctx,data,&j,data,i)
+ || !EVP_EncryptFinal_ex(&ctx,&(data[j]),&i))
+ ret = 0;
EVP_CIPHER_CTX_cleanup(&ctx);
+ if (ret == 0)
+ goto err;
i+=j;
- ret=1;
}
else
{
@@ -459,14 +463,17 @@ int PEM_do_header(EVP_CIPHER_INFO *cipher, unsigned char *data, long *plen,
ebcdic2ascii(buf, buf, klen);
#endif
- EVP_BytesToKey(cipher->cipher,EVP_md5(),&(cipher->iv[0]),
- (unsigned char *)buf,klen,1,key,NULL);
+ if (!EVP_BytesToKey(cipher->cipher,EVP_md5(),&(cipher->iv[0]),
+ (unsigned char *)buf,klen,1,key,NULL))
+ return 0;
j=(int)len;
EVP_CIPHER_CTX_init(&ctx);
- EVP_DecryptInit_ex(&ctx,cipher->cipher,NULL, key,&(cipher->iv[0]));
- EVP_DecryptUpdate(&ctx,data,&i,data,j);
- o=EVP_DecryptFinal_ex(&ctx,&(data[i]),&j);
+ o = EVP_DecryptInit_ex(&ctx,cipher->cipher,NULL, key,&(cipher->iv[0]));
+ if (o)
+ o = EVP_DecryptUpdate(&ctx,data,&i,data,j);
+ if (o)
+ o = EVP_DecryptFinal_ex(&ctx,&(data[i]),&j);
EVP_CIPHER_CTX_cleanup(&ctx);
OPENSSL_cleanse((char *)buf,sizeof(buf));
OPENSSL_cleanse((char *)key,sizeof(key));
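Annotation: the common thread in these pem_lib.c fixes is that every EVP return code now feeds the result, while EVP_CIPHER_CTX_cleanup still runs on both success and failure. The shape, condensed into a hypothetical helper (encrypt_checked is not a real API):

#include <openssl/evp.h>

/* Illustrative only: chain the EVP calls, clean up unconditionally,
 * and report failure without leaking the cipher context. */
static int encrypt_checked(const EVP_CIPHER *enc, unsigned char *key,
                           unsigned char *iv, unsigned char *data,
                           int inlen, int *outlen)
{
    EVP_CIPHER_CTX ctx;
    int ok, len1 = 0, len2 = 0;
    EVP_CIPHER_CTX_init(&ctx);
    ok = EVP_EncryptInit_ex(&ctx, enc, NULL, key, iv)
         && EVP_EncryptUpdate(&ctx, data, &len1, data, inlen)
         && EVP_EncryptFinal_ex(&ctx, data + len1, &len2);
    EVP_CIPHER_CTX_cleanup(&ctx);      /* runs on every path */
    if (ok)
        *outlen = len1 + len2;
    return ok;
}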
diff --git a/main/openssl/crypto/pem/pem_seal.c b/main/openssl/crypto/pem/pem_seal.c
index 59690b56..b6b4e134 100644
--- a/main/openssl/crypto/pem/pem_seal.c
+++ b/main/openssl/crypto/pem/pem_seal.c
@@ -96,7 +96,8 @@ int PEM_SealInit(PEM_ENCODE_SEAL_CTX *ctx, EVP_CIPHER *type, EVP_MD *md_type,
EVP_EncodeInit(&ctx->encode);
EVP_MD_CTX_init(&ctx->md);
- EVP_SignInit(&ctx->md,md_type);
+ if (!EVP_SignInit(&ctx->md,md_type))
+ goto err;
EVP_CIPHER_CTX_init(&ctx->cipher);
ret=EVP_SealInit(&ctx->cipher,type,ek,ekl,iv,pubk,npubk);
@@ -163,7 +164,8 @@ int PEM_SealFinal(PEM_ENCODE_SEAL_CTX *ctx, unsigned char *sig, int *sigl,
goto err;
}
- EVP_EncryptFinal_ex(&ctx->cipher,s,(int *)&i);
+ if (!EVP_EncryptFinal_ex(&ctx->cipher,s,(int *)&i))
+ goto err;
EVP_EncodeUpdate(&ctx->encode,out,&j,s,i);
*outl=j;
out+=j;
diff --git a/main/openssl/crypto/pem/pvkfmt.c b/main/openssl/crypto/pem/pvkfmt.c
index 5f130c45..b1bf71a5 100644
--- a/main/openssl/crypto/pem/pvkfmt.c
+++ b/main/openssl/crypto/pem/pvkfmt.c
@@ -709,13 +709,16 @@ static int derive_pvk_key(unsigned char *key,
const unsigned char *pass, int passlen)
{
EVP_MD_CTX mctx;
+ int rv = 1;
EVP_MD_CTX_init(&mctx);
- EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL);
- EVP_DigestUpdate(&mctx, salt, saltlen);
- EVP_DigestUpdate(&mctx, pass, passlen);
- EVP_DigestFinal_ex(&mctx, key, NULL);
+ if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL)
+ || !EVP_DigestUpdate(&mctx, salt, saltlen)
+ || !EVP_DigestUpdate(&mctx, pass, passlen)
+ || !EVP_DigestFinal_ex(&mctx, key, NULL))
+ rv = 0;
+
EVP_MD_CTX_cleanup(&mctx);
- return 1;
+ return rv;
}
@@ -727,11 +730,12 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in,
const unsigned char *p = *in;
unsigned int magic;
unsigned char *enctmp = NULL, *q;
+ EVP_CIPHER_CTX cctx;
+ EVP_CIPHER_CTX_init(&cctx);
if (saltlen)
{
char psbuf[PEM_BUFSIZE];
unsigned char keybuf[20];
- EVP_CIPHER_CTX cctx;
int enctmplen, inlen;
if (cb)
inlen=cb(psbuf,PEM_BUFSIZE,0,u);
@@ -757,37 +761,41 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in,
p += 8;
inlen = keylen - 8;
q = enctmp + 8;
- EVP_CIPHER_CTX_init(&cctx);
- EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL);
- EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen);
- EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen);
+ if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL))
+ goto err;
+ if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen))
+ goto err;
+ if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen))
+ goto err;
magic = read_ledword((const unsigned char **)&q);
if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC)
{
q = enctmp + 8;
memset(keybuf + 5, 0, 11);
- EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf,
- NULL);
+ if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf,
+ NULL))
+ goto err;
OPENSSL_cleanse(keybuf, 20);
- EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen);
- EVP_DecryptFinal_ex(&cctx, q + enctmplen,
- &enctmplen);
+ if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen))
+ goto err;
+ if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen,
+ &enctmplen))
+ goto err;
magic = read_ledword((const unsigned char **)&q);
if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC)
{
- EVP_CIPHER_CTX_cleanup(&cctx);
PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT);
goto err;
}
}
else
OPENSSL_cleanse(keybuf, 20);
- EVP_CIPHER_CTX_cleanup(&cctx);
p = enctmp;
}
ret = b2i_PrivateKey(&p, keylen);
err:
+ EVP_CIPHER_CTX_cleanup(&cctx);
if (enctmp && saltlen)
OPENSSL_free(enctmp);
return ret;
@@ -841,6 +849,8 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
{
int outlen = 24, pklen;
unsigned char *p, *salt = NULL;
+ EVP_CIPHER_CTX cctx;
+ EVP_CIPHER_CTX_init(&cctx);
if (enclevel)
outlen += PVK_SALTLEN;
pklen = do_i2b(NULL, pk, 0);
@@ -885,7 +895,6 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
{
char psbuf[PEM_BUFSIZE];
unsigned char keybuf[20];
- EVP_CIPHER_CTX cctx;
int enctmplen, inlen;
if (cb)
inlen=cb(psbuf,PEM_BUFSIZE,1,u);
@@ -902,16 +911,19 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
if (enclevel == 1)
memset(keybuf + 5, 0, 11);
p = salt + PVK_SALTLEN + 8;
- EVP_CIPHER_CTX_init(&cctx);
- EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL);
+ if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL))
+ goto error;
OPENSSL_cleanse(keybuf, 20);
- EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8);
- EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen);
- EVP_CIPHER_CTX_cleanup(&cctx);
+ if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8))
+ goto error;
+ if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen))
+ goto error;
}
+ EVP_CIPHER_CTX_cleanup(&cctx);
return outlen;
error:
+ EVP_CIPHER_CTX_cleanup(&cctx);
return -1;
}
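The pvkfmt.c changes hoist EVP_CIPHER_CTX_init() above the first failure path so a single cleanup at function exit is always valid. The idiom, sketched with hypothetical names (RC4 as in the patch, so a 16-byte default key and no IV are assumed):

    #include <openssl/evp.h>

    static int decrypt_checked(const unsigned char *key, const unsigned char *in,
                               int inlen, unsigned char *out, int *outlen)
    {
        EVP_CIPHER_CTX cctx;
        int ret = 0, len = 0, fin = 0;

        EVP_CIPHER_CTX_init(&cctx);            /* before any goto err */
        if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, key, NULL))
            goto err;
        if (!EVP_DecryptUpdate(&cctx, out, &len, in, inlen))
            goto err;
        if (!EVP_DecryptFinal_ex(&cctx, out + len, &fin))
            goto err;
        *outlen = len + fin;
        ret = 1;
    err:
        EVP_CIPHER_CTX_cleanup(&cctx);         /* single, unconditional cleanup */
        return ret;
    }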
diff --git a/main/openssl/crypto/perlasm/cbc.pl b/main/openssl/crypto/perlasm/cbc.pl
index 6fc25109..24561e75 100644
--- a/main/openssl/crypto/perlasm/cbc.pl
+++ b/main/openssl/crypto/perlasm/cbc.pl
@@ -150,7 +150,7 @@ sub cbc
&set_label("PIC_point");
&blindpop("edx");
&lea("ecx",&DWP(&label("cbc_enc_jmp_table")."-".&label("PIC_point"),"edx"));
- &mov($count,&DWP(0,"ecx",$count,4))
+ &mov($count,&DWP(0,"ecx",$count,4));
&add($count,"edx");
&xor("ecx","ecx");
&xor("edx","edx");
diff --git a/main/openssl/crypto/perlasm/ppc-xlate.pl b/main/openssl/crypto/perlasm/ppc-xlate.pl
index 4579671c..a3edd982 100755
--- a/main/openssl/crypto/perlasm/ppc-xlate.pl
+++ b/main/openssl/crypto/perlasm/ppc-xlate.pl
@@ -31,10 +31,9 @@ my $globl = sub {
$ret .= ".type $name,\@function";
last;
};
- /linux.*64/ && do { $ret .= ".globl .$name\n";
- $ret .= ".type .$name,\@function\n";
+ /linux.*64/ && do { $ret .= ".globl $name\n";
+ $ret .= ".type $name,\@function\n";
$ret .= ".section \".opd\",\"aw\"\n";
- $ret .= ".globl $name\n";
$ret .= ".align 3\n";
$ret .= "$name:\n";
$ret .= ".quad .$name,.TOC.\@tocbase,0\n";
@@ -62,6 +61,14 @@ my $machine = sub {
}
".machine $arch";
};
+my $size = sub {
+ if ($flavour =~ /linux.*32/)
+ { shift;
+ ".size " . join(",",@_);
+ }
+ else
+ { ""; }
+};
my $asciz = sub {
shift;
my $line = join(",",@_);
diff --git a/main/openssl/crypto/perlasm/x86_64-xlate.pl b/main/openssl/crypto/perlasm/x86_64-xlate.pl
index e47116b7..56d9b64b 100755
--- a/main/openssl/crypto/perlasm/x86_64-xlate.pl
+++ b/main/openssl/crypto/perlasm/x86_64-xlate.pl
@@ -62,12 +62,8 @@ my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-{ my ($stddev,$stdino,@junk)=stat(STDOUT);
- my ($outdev,$outino,@junk)=stat($output);
-
- open STDOUT,">$output" || die "can't open $output: $!"
- if ($stddev!=$outdev || $stdino!=$outino);
-}
+open STDOUT,">$output" || die "can't open $output: $!"
+ if (defined($output));
my $gas=1; $gas=0 if ($output =~ /\.asm$/);
my $elf=1; $elf=0 if (!$gas);
@@ -116,12 +112,16 @@ my %globals;
$line = substr($line,@+[0]); $line =~ s/^\s+//;
undef $self->{sz};
- if ($self->{op} =~ /^(movz)b.*/) { # movz is pain...
+ if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain...
$self->{op} = $1;
- $self->{sz} = "b";
+ $self->{sz} = $2;
} elsif ($self->{op} =~ /call|jmp/) {
$self->{sz} = "";
- } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op)/) { # SSEn
+ } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
+ $self->{sz} = "";
+ } elsif ($self->{op} =~ /^v/) { # VEX
+ $self->{sz} = "";
+ } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) {
$self->{sz} = "";
} elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
$self->{op} = $1;
@@ -246,35 +246,39 @@ my %globals;
$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
$self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
+ # Solaris /usr/ccs/bin/as can't handle multiplications
+ # in $self->{label}, new gas requires sign extension...
+ use integer;
+ $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+ $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+ $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
+
if ($gas) {
- # Solaris /usr/ccs/bin/as can't handle multiplications
- # in $self->{label}, new gas requires sign extension...
- use integer;
- $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
- $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
- $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
$self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64");
if (defined($self->{index})) {
- sprintf "%s%s(%%%s,%%%s,%d)",$self->{asterisk},
- $self->{label},$self->{base},
+ sprintf "%s%s(%s,%%%s,%d)",$self->{asterisk},
+ $self->{label},
+ $self->{base}?"%$self->{base}":"",
$self->{index},$self->{scale};
} else {
sprintf "%s%s(%%%s)", $self->{asterisk},$self->{label},$self->{base};
}
} else {
- %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR", q=>"QWORD$PTR" );
+ %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
+ q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" );
$self->{label} =~ s/\./\$/g;
$self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
$self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
- $sz="q" if ($self->{asterisk});
+ $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq");
+ $sz="l" if (opcode->mnemonic() eq "movd");
if (defined($self->{index})) {
- sprintf "%s[%s%s*%d+%s]",$szmap{$sz},
+ sprintf "%s[%s%s*%d%s]",$szmap{$sz},
$self->{label}?"$self->{label}+":"",
$self->{index},$self->{scale},
- $self->{base};
+ $self->{base}?"+$self->{base}":"";
} elsif ($self->{base} eq "rip") {
sprintf "%s[%s]",$szmap{$sz},$self->{label};
} else {
@@ -506,6 +510,12 @@ my %globals;
}
} elsif ($dir =~ /\.(text|data)/) {
$current_segment=".$1";
+ } elsif ($dir =~ /\.hidden/) {
+ if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$line"; }
+ elsif ($flavour eq "mingw64") { $self->{value} = ""; }
+ } elsif ($dir =~ /\.comm/) {
+ $self->{value} = "$dir\t$prefix$line";
+ $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx");
}
$line = "";
return $self;
@@ -555,7 +565,8 @@ my %globals;
$v.=" READONLY";
$v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref);
} elsif ($line=~/\.CRT\$/i) {
- $v.=" READONLY DWORD";
+ $v.=" READONLY ";
+ $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD";
}
}
$current_segment = $line;
@@ -577,7 +588,7 @@ my %globals;
$self->{value}="${decor}SEH_end_$current_function->{name}:";
$self->{value}.=":\n" if($masm);
}
- $self->{value}.="$current_function->{name}\tENDP" if($masm);
+ $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name});
undef $current_function;
}
last;
@@ -613,6 +624,19 @@ my %globals;
.join(",",@str) if (@str);
last;
};
+ /\.comm/ && do { my @str=split(/,\s*/,$line);
+ my $v=undef;
+ if ($nasm) {
+ $v.="common $prefix@str[0] @str[1]";
+ } else {
+ $v="$current_segment\tENDS\n" if ($current_segment);
+ $current_segment = "_DATA";
+ $v.="$current_segment\tSEGMENT\n";
+ $v.="COMM @str[0]:DWORD:".@str[1]/4;
+ }
+ $self->{value} = $v;
+ last;
+ };
}
$line = "";
}
@@ -625,9 +649,133 @@ my %globals;
}
}
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,($rex|0x40) if ($rex);
+}
+
+# older gas and ml64 don't handle SSE>2 instructions
+my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
+ "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 );
+
+my $movq = sub { # elderly gas can't handle inter-register movq
+ my $arg = shift;
+ my @opcode=(0x66);
+ if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
+ my ($src,$dst)=($1,$2);
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,$src,$dst,0x8);
+ push @opcode,0x0f,0x7e;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ @opcode;
+ } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
+ my ($src,$dst)=($2,$1);
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,$src,$dst,0x8);
+ push @opcode,0x0f,0x6e;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pextrd = sub {
+ if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
+ my @opcode=(0x66);
+ $imm=$1;
+ $src=$2;
+ $dst=$3;
+ if ($dst =~ /%r([0-9]+)d/) { $dst = $1; }
+ elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; }
+ rex(\@opcode,$src,$dst);
+ push @opcode,0x0f,0x3a,0x16;
+ push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M
+ push @opcode,$imm;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pinsrd = sub {
+ if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ $imm=$1;
+ $src=$2;
+ $dst=$3;
+ if ($src =~ /%r([0-9]+)/) { $src = $1; }
+ elsif ($src =~ /%e/) { $src = $regrm{$src}; }
+ rex(\@opcode,$dst,$src);
+ push @opcode,0x0f,0x3a,0x22;
+ push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M
+ push @opcode,$imm;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pshufb = sub {
+ if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$2,$1);
+ push @opcode,0x0f,0x38,0x00;
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $palignr = sub {
+ if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x3a,0x0f;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ push @opcode,$1;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $pclmulqdq = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x3a,0x44;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $rdrand = sub {
+ if (shift =~ /%[er](\w+)/) {
+ my @opcode=();
+ my $dst=$1;
+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+ rex(\@opcode,0,$1,8);
+ push @opcode,0x0f,0xc7,0xf0|($dst&7);
+ @opcode;
+ } else {
+ ();
+ }
+};
+
if ($nasm) {
print <<___;
default rel
+%define XMMWORD
___
} elsif ($masm) {
print <<___;
@@ -644,14 +792,22 @@ while($line=<>) {
undef $label;
undef $opcode;
- undef $sz;
undef @args;
if ($label=label->re(\$line)) { print $label->out(); }
if (directive->re(\$line)) {
printf "%s",directive->out();
- } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) {
+ } elsif ($opcode=opcode->re(\$line)) {
+ my $asm = eval("\$".$opcode->mnemonic());
+ undef @bytes;
+
+ if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
+ print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
+ next;
+ }
+
+ ARGUMENT: while (1) {
my $arg;
if ($arg=register->re(\$line)) { opcode->size($arg->size()); }
@@ -667,19 +823,26 @@ while($line=<>) {
$line =~ s/^,\s*//;
} # ARGUMENT:
- $sz=opcode->size();
-
if ($#args>=0) {
my $insn;
+ my $sz=opcode->size();
+
if ($gas) {
$insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
+ @args = map($_->out($sz),@args);
+ printf "\t%s\t%s",$insn,join(",",@args);
} else {
$insn = $opcode->out();
- $insn .= $sz if (map($_->out() =~ /x?mm/,@args));
+ foreach (@args) {
+ my $arg = $_->out();
+ # $insn.=$sz compensates for movq, pinsrw, ...
+ if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+ if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; }
+ }
@args = reverse(@args);
undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
+ printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
}
- printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
} else {
printf "\t%s",$opcode->out();
}
diff --git a/main/openssl/crypto/perlasm/x86asm.pl b/main/openssl/crypto/perlasm/x86asm.pl
index 28080caa..eb543db2 100644
--- a/main/openssl/crypto/perlasm/x86asm.pl
+++ b/main/openssl/crypto/perlasm/x86asm.pl
@@ -80,6 +80,57 @@ sub ::movq
{ &::generic("movq",@_); }
}
+# SSE>2 instructions
+my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
+ "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 );
+sub ::pextrd
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); }
+ else
+ { &::generic("pextrd",@_); }
+}
+
+sub ::pinsrd
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); }
+ else
+ { &::generic("pinsrd",@_); }
+}
+
+sub ::pshufb
+{ my($dst,$src)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); }
+ else
+ { &::generic("pshufb",@_); }
+}
+
+sub ::palignr
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); }
+ else
+ { &::generic("palignr",@_); }
+}
+
+sub ::pclmulqdq
+{ my($dst,$src,$imm)=@_;
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+ { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); }
+ else
+ { &::generic("pclmulqdq",@_); }
+}
+
+sub ::rdrand
+{ my ($dst)=@_;
+ if ($dst =~ /(e[a-dsd][ixp])/)
+ { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); }
+ else
+ { &::generic("rdrand",@_); }
+}
+
# label management
$lbdecor="L"; # local label decoration, set by package
$label="000";
@@ -167,7 +218,7 @@ sub ::asm_init
$filename=$fn;
$i386=$cpu;
- $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=0;
+ $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=$android=0;
if (($type eq "elf"))
{ $elf=1; require "x86gas.pl"; }
elsif (($type eq "a\.out"))
@@ -184,6 +235,8 @@ sub ::asm_init
{ $win32=1; require "x86masm.pl"; }
elsif (($type eq "macosx"))
{ $aout=1; $macosx=1; require "x86gas.pl"; }
+ elsif (($type eq "android"))
+ { $elf=1; $android=1; require "x86gas.pl"; }
else
{ print STDERR <<"EOF";
Pick one target type from
diff --git a/main/openssl/crypto/perlasm/x86gas.pl b/main/openssl/crypto/perlasm/x86gas.pl
index 6eab727f..682a3a31 100644
--- a/main/openssl/crypto/perlasm/x86gas.pl
+++ b/main/openssl/crypto/perlasm/x86gas.pl
@@ -45,9 +45,8 @@ sub ::generic
undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
if ($#_==0) { &::emit($opcode); }
- elsif ($opcode =~ m/^j/o && $#_==1) { &::emit($opcode,@arg); }
- elsif ($opcode eq "call" && $#_==1) { &::emit($opcode,@arg); }
- elsif ($opcode =~ m/^set/&& $#_==1) { &::emit($opcode,@arg); }
+ elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o)
+ { &::emit($opcode,@arg); }
else { &::emit($opcode.$suffix,@arg);}
1;
@@ -91,6 +90,7 @@ sub ::DWP
}
sub ::QWP { &::DWP(@_); }
sub ::BP { &::DWP(@_); }
+sub ::WP { &::DWP(@_); }
sub ::BC { @_; }
sub ::DWC { @_; }
@@ -149,22 +149,24 @@ sub ::public_label
{ push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); }
sub ::file_end
-{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
- my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,4";
- if ($::elf) { push (@out,"$tmp,4\n"); }
- else { push (@out,"$tmp\n"); }
- }
- if ($::macosx)
+{ if ($::macosx)
{ if (%non_lazy_ptr)
{ push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n");
foreach $i (keys %non_lazy_ptr)
{ push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); }
}
}
+ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
+ my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8";
+ if ($::macosx) { push (@out,"$tmp,2\n"); }
+ elsif ($::elf) { push (@out,"$tmp,4\n"); }
+ else { push (@out,"$tmp\n"); }
+ }
push(@out,$initseg) if ($initseg);
}
sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); }
+sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); }
sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); }
sub ::align
@@ -180,7 +182,7 @@ sub ::align
sub ::picmeup
{ my($dst,$sym,$base,$reflabel)=@_;
- if ($::pic && ($::elf || $::aout))
+ if (($::pic && ($::elf || $::aout)) || $::macosx)
{ if (!defined($base))
{ &::call(&::label("PIC_me_up"));
&::set_label("PIC_me_up");
@@ -206,13 +208,17 @@ sub ::picmeup
sub ::initseg
{ my $f=$nmdecor.shift;
- if ($::elf)
+ if ($::android)
+ { $initseg.=<<___;
+.section .init_array
+.align 4
+.long $f
+___
+ }
+ elsif ($::elf)
{ $initseg.=<<___;
.section .init
call $f
- jmp .Linitalign
-.align $align
-.Linitalign:
___
}
elsif ($::coff)
diff --git a/main/openssl/crypto/perlasm/x86masm.pl b/main/openssl/crypto/perlasm/x86masm.pl
index 3d50e4a7..f937d07c 100644
--- a/main/openssl/crypto/perlasm/x86masm.pl
+++ b/main/openssl/crypto/perlasm/x86masm.pl
@@ -14,9 +14,11 @@ sub ::generic
{ my ($opcode,@arg)=@_;
# fix hexadecimal constants
- for (@arg) { s/0x([0-9a-f]+)/0$1h/oi; }
+ for (@arg) { s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/oi; }
- if ($opcode !~ /movq/)
+ if ($opcode =~ /lea/ && @arg[1] =~ s/.*PTR\s+(\(.*\))$/OFFSET $1/) # no []
+ { $opcode="mov"; }
+ elsif ($opcode !~ /movq/)
{ # fix xmm references
$arg[0] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[1]=~/\bxmm[0-7]\b/i);
$arg[1] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[0]=~/\bxmm[0-7]\b/i);
@@ -31,6 +33,7 @@ sub ::generic
sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); }
sub ::call_ptr { &::emit("call",@_); }
sub ::jmp_ptr { &::emit("jmp",@_); }
+sub ::lock { &::data_byte(0xf0); }
sub get_mem
{ my($size,$addr,$reg1,$reg2,$idx)=@_;
@@ -65,6 +68,7 @@ sub get_mem
$ret;
}
sub ::BP { &get_mem("BYTE",@_); }
+sub ::WP { &get_mem("WORD",@_); }
sub ::DWP { &get_mem("DWORD",@_); }
sub ::QWP { &get_mem("QWORD",@_); }
sub ::BC { "@_"; }
@@ -129,7 +133,7 @@ ___
if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
.bss SEGMENT 'BSS'
-COMM ${nmdecor}OPENSSL_ia32cap_P:DWORD
+COMM ${nmdecor}OPENSSL_ia32cap_P:QWORD
.bss ENDS
___
# comment out OPENSSL_ia32cap_P declarations
@@ -156,6 +160,9 @@ sub ::public_label
sub ::data_byte
{ push(@out,("DB\t").join(',',@_)."\n"); }
+sub ::data_short
+{ push(@out,("DW\t").join(',',@_)."\n"); }
+
sub ::data_word
{ push(@out,("DD\t").join(',',@_)."\n"); }
@@ -181,4 +188,11 @@ ___
sub ::dataseg
{ push(@out,"$segment\tENDS\n_DATA\tSEGMENT\n"); $segment="_DATA"; }
+sub ::safeseh
+{ my $nm=shift;
+ push(@out,"IF \@Version GE 710\n");
+ push(@out,".SAFESEH ".&::LABEL($nm,$nmdecor.$nm)."\n");
+ push(@out,"ENDIF\n");
+}
+
1;
diff --git a/main/openssl/crypto/perlasm/x86nasm.pl b/main/openssl/crypto/perlasm/x86nasm.pl
index ce2bed9b..ca2511c9 100644
--- a/main/openssl/crypto/perlasm/x86nasm.pl
+++ b/main/openssl/crypto/perlasm/x86nasm.pl
@@ -19,6 +19,8 @@ sub ::generic
{ $_[0] = "NEAR $_[0]"; }
elsif ($opcode eq "lea" && $#_==1) # wipe storage qualifier from lea
{ $_[1] =~ s/^[^\[]*\[/\[/o; }
+ elsif ($opcode eq "clflush" && $#_==0)
+ { $_[0] =~ s/^[^\[]*\[/\[/o; }
}
&::emit($opcode,@_);
1;
@@ -67,6 +69,7 @@ sub get_mem
}
sub ::BP { &get_mem("BYTE",@_); }
sub ::DWP { &get_mem("DWORD",@_); }
+sub ::WP { &get_mem("WORD",@_); }
sub ::QWP { &get_mem("",@_); }
sub ::BC { (($::mwerks)?"":"BYTE ")."@_"; }
sub ::DWC { (($::mwerks)?"":"DWORD ")."@_"; }
@@ -114,7 +117,7 @@ sub ::file_end
{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
${drdecor}segment .bss
-${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 4
+${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 8
___
# comment out OPENSSL_ia32cap_P declarations
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
@@ -135,7 +138,8 @@ sub ::public_label
sub ::data_byte
{ push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n"); }
-
+sub ::data_short
+{ push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n"); }
sub ::data_word
{ push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n"); }
@@ -163,4 +167,11 @@ sub ::dataseg
else { push(@out,"section\t.data align=4\n"); }
}
+sub ::safeseh
+{ my $nm=shift;
+ push(@out,"%if __NASM_VERSION_ID__ >= 0x02030000\n");
+ push(@out,"safeseh ".&::LABEL($nm,$nmdecor.$nm)."\n");
+ push(@out,"%endif\n");
+}
+
1;
diff --git a/main/openssl/crypto/pkcs12/p12_crt.c b/main/openssl/crypto/pkcs12/p12_crt.c
index 96b131de..a34915d0 100644
--- a/main/openssl/crypto/pkcs12/p12_crt.c
+++ b/main/openssl/crypto/pkcs12/p12_crt.c
@@ -90,7 +90,14 @@ PKCS12 *PKCS12_create(char *pass, char *name, EVP_PKEY *pkey, X509 *cert,
/* Set defaults */
if (!nid_cert)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ nid_cert = NID_pbe_WithSHA1And3_Key_TripleDES_CBC;
+ else
+#endif
nid_cert = NID_pbe_WithSHA1And40BitRC2_CBC;
+ }
if (!nid_key)
nid_key = NID_pbe_WithSHA1And3_Key_TripleDES_CBC;
if (!iter)
diff --git a/main/openssl/crypto/pkcs12/p12_decr.c b/main/openssl/crypto/pkcs12/p12_decr.c
index ba77dbbe..9d3557e8 100644
--- a/main/openssl/crypto/pkcs12/p12_decr.c
+++ b/main/openssl/crypto/pkcs12/p12_decr.c
@@ -89,7 +89,14 @@ unsigned char * PKCS12_pbe_crypt(X509_ALGOR *algor, const char *pass,
goto err;
}
- EVP_CipherUpdate(&ctx, out, &i, in, inlen);
+ if (!EVP_CipherUpdate(&ctx, out, &i, in, inlen))
+ {
+ OPENSSL_free(out);
+ out = NULL;
+ PKCS12err(PKCS12_F_PKCS12_PBE_CRYPT,ERR_R_EVP_LIB);
+ goto err;
+ }
+
outlen = i;
if(!EVP_CipherFinal_ex(&ctx, out + i, &i)) {
OPENSSL_free(out);
diff --git a/main/openssl/crypto/pkcs12/p12_key.c b/main/openssl/crypto/pkcs12/p12_key.c
index 424203f6..61d58502 100644
--- a/main/openssl/crypto/pkcs12/p12_key.c
+++ b/main/openssl/crypto/pkcs12/p12_key.c
@@ -152,14 +152,16 @@ int PKCS12_key_gen_uni(unsigned char *pass, int passlen, unsigned char *salt,
for (i = 0; i < Slen; i++) *p++ = salt[i % saltlen];
for (i = 0; i < Plen; i++) *p++ = pass[i % passlen];
for (;;) {
- EVP_DigestInit_ex(&ctx, md_type, NULL);
- EVP_DigestUpdate(&ctx, D, v);
- EVP_DigestUpdate(&ctx, I, Ilen);
- EVP_DigestFinal_ex(&ctx, Ai, NULL);
+ if (!EVP_DigestInit_ex(&ctx, md_type, NULL)
+ || !EVP_DigestUpdate(&ctx, D, v)
+ || !EVP_DigestUpdate(&ctx, I, Ilen)
+ || !EVP_DigestFinal_ex(&ctx, Ai, NULL))
+ goto err;
for (j = 1; j < iter; j++) {
- EVP_DigestInit_ex(&ctx, md_type, NULL);
- EVP_DigestUpdate(&ctx, Ai, u);
- EVP_DigestFinal_ex(&ctx, Ai, NULL);
+ if (!EVP_DigestInit_ex(&ctx, md_type, NULL)
+ || !EVP_DigestUpdate(&ctx, Ai, u)
+ || !EVP_DigestFinal_ex(&ctx, Ai, NULL))
+ goto err;
}
memcpy (out, Ai, min (n, u));
if (u >= n) {
@@ -174,24 +176,32 @@ int PKCS12_key_gen_uni(unsigned char *pass, int passlen, unsigned char *salt,
out += u;
for (j = 0; j < v; j++) B[j] = Ai[j % u];
/* Work out B + 1 first then can use B as tmp space */
- if (!BN_bin2bn (B, v, Bpl1)) goto err;
- if (!BN_add_word (Bpl1, 1)) goto err;
+ if (!BN_bin2bn (B, v, Bpl1))
+ goto err;
+ if (!BN_add_word (Bpl1, 1))
+ goto err;
for (j = 0; j < Ilen ; j+=v) {
- if (!BN_bin2bn (I + j, v, Ij)) goto err;
- if (!BN_add (Ij, Ij, Bpl1)) goto err;
- BN_bn2bin (Ij, B);
+ if (!BN_bin2bn(I + j, v, Ij))
+ goto err;
+ if (!BN_add(Ij, Ij, Bpl1))
+ goto err;
+ if (!BN_bn2bin(Ij, B))
+ goto err;
Ijlen = BN_num_bytes (Ij);
/* If more than 2^(v*8) - 1 cut off MSB */
if (Ijlen > v) {
- BN_bn2bin (Ij, B);
+ if (!BN_bn2bin (Ij, B))
+ goto err;
memcpy (I + j, B + 1, v);
#ifndef PKCS12_BROKEN_KEYGEN
/* If less than v bytes pad with zeroes */
} else if (Ijlen < v) {
memset(I + j, 0, v - Ijlen);
- BN_bn2bin(Ij, I + j + v - Ijlen);
+ if (!BN_bn2bin(Ij, I + j + v - Ijlen))
+ goto err;
#endif
- } else BN_bn2bin (Ij, I + j);
+ } else if (!BN_bn2bin (Ij, I + j))
+ goto err;
}
}
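The p12_key.c hunk adds return-value checks around the PKCS#12 KDF step I_j = (I_j + B + 1) mod 2^(v*8). That step in isolation, with illustrative names (tmp must hold at least v+1 bytes, since the sum can carry one byte past v; this mirrors the non-PKCS12_BROKEN_KEYGEN branch):

    #include <string.h>
    #include <openssl/bn.h>

    static int add_b_plus_1(unsigned char *Ij_bytes, int v, const BIGNUM *Bpl1,
                            BIGNUM *Ij, unsigned char *tmp)
    {
        int len;

        if (!BN_bin2bn(Ij_bytes, v, Ij) || !BN_add(Ij, Ij, Bpl1))
            return 0;
        len = BN_num_bytes(Ij);
        if (len > v) {                         /* carry: cut off the MSB */
            if (!BN_bn2bin(Ij, tmp))
                return 0;
            memcpy(Ij_bytes, tmp + 1, v);
        } else {                               /* short: zero-pad on the left */
            memset(Ij_bytes, 0, v - len);
            if (!BN_bn2bin(Ij, Ij_bytes + v - len))
                return 0;
        }
        return 1;
    }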
diff --git a/main/openssl/crypto/pkcs12/p12_kiss.c b/main/openssl/crypto/pkcs12/p12_kiss.c
index 292cc3ed..206b1b0b 100644
--- a/main/openssl/crypto/pkcs12/p12_kiss.c
+++ b/main/openssl/crypto/pkcs12/p12_kiss.c
@@ -167,7 +167,7 @@ int PKCS12_parse(PKCS12 *p12, const char *pass, EVP_PKEY **pkey, X509 **cert,
if (cert && *cert)
X509_free(*cert);
if (x)
- X509_free(*cert);
+ X509_free(x);
if (ocerts)
sk_X509_pop_free(ocerts, X509_free);
return 0;
diff --git a/main/openssl/crypto/pkcs12/p12_mutl.c b/main/openssl/crypto/pkcs12/p12_mutl.c
index 9ab740d5..96de1bd1 100644
--- a/main/openssl/crypto/pkcs12/p12_mutl.c
+++ b/main/openssl/crypto/pkcs12/p12_mutl.c
@@ -97,10 +97,14 @@ int PKCS12_gen_mac(PKCS12 *p12, const char *pass, int passlen,
return 0;
}
HMAC_CTX_init(&hmac);
- HMAC_Init_ex(&hmac, key, md_size, md_type, NULL);
- HMAC_Update(&hmac, p12->authsafes->d.data->data,
- p12->authsafes->d.data->length);
- HMAC_Final(&hmac, mac, maclen);
+ if (!HMAC_Init_ex(&hmac, key, md_size, md_type, NULL)
+ || !HMAC_Update(&hmac, p12->authsafes->d.data->data,
+ p12->authsafes->d.data->length)
+ || !HMAC_Final(&hmac, mac, maclen))
+ {
+ HMAC_CTX_cleanup(&hmac);
+ return 0;
+ }
HMAC_CTX_cleanup(&hmac);
return 1;
}
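The same checked-call pattern, applied to HMAC in the patched PKCS12_gen_mac, as a stand-alone sketch (OpenSSL 1.0.x stack-allocated HMAC_CTX):

    #include <openssl/hmac.h>

    static int mac_checked(const EVP_MD *md, const void *key, int keylen,
                           const unsigned char *data, size_t datalen,
                           unsigned char *mac, unsigned int *maclen)
    {
        HMAC_CTX hmac;
        int ok;

        HMAC_CTX_init(&hmac);
        ok = HMAC_Init_ex(&hmac, key, keylen, md, NULL)
             && HMAC_Update(&hmac, data, datalen)
             && HMAC_Final(&hmac, mac, maclen);
        HMAC_CTX_cleanup(&hmac);               /* both paths, exactly once */
        return ok;
    }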
diff --git a/main/openssl/crypto/pkcs7/pk7_doit.c b/main/openssl/crypto/pkcs7/pk7_doit.c
index 3bf1a367..77fda3b8 100644
--- a/main/openssl/crypto/pkcs7/pk7_doit.c
+++ b/main/openssl/crypto/pkcs7/pk7_doit.c
@@ -204,11 +204,11 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen,
unsigned char *ek = NULL;
size_t eklen;
- int ret = 0;
+ int ret = -1;
pctx = EVP_PKEY_CTX_new(pkey, NULL);
if (!pctx)
- return 0;
+ return -1;
if (EVP_PKEY_decrypt_init(pctx) <= 0)
goto err;
@@ -235,12 +235,19 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen,
if (EVP_PKEY_decrypt(pctx, ek, &eklen,
ri->enc_key->data, ri->enc_key->length) <= 0)
{
+ ret = 0;
PKCS7err(PKCS7_F_PKCS7_DECRYPT_RINFO, ERR_R_EVP_LIB);
goto err;
}
ret = 1;
+ if (*pek)
+ {
+ OPENSSL_cleanse(*pek, *peklen);
+ OPENSSL_free(*pek);
+ }
+
*pek = ek;
*peklen = eklen;
@@ -423,6 +430,8 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
STACK_OF(X509_ALGOR) *md_sk=NULL;
STACK_OF(PKCS7_RECIP_INFO) *rsk=NULL;
PKCS7_RECIP_INFO *ri=NULL;
+ unsigned char *ek = NULL, *tkey = NULL;
+ int eklen = 0, tkeylen = 0;
i=OBJ_obj2nid(p7->type);
p7->state=PKCS7_S_HEADER;
@@ -500,8 +509,6 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
int max;
X509_OBJECT ret;
#endif
- unsigned char *ek = NULL;
- int eklen;
if ((etmp=BIO_new(BIO_f_cipher())) == NULL)
{
@@ -534,29 +541,28 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
}
/* If we haven't got a certificate try each ri in turn */
-
if (pcert == NULL)
{
+ /* Always attempt to decrypt all rinfo even
+	 * after success as a defence against MMA timing
+ * attacks.
+ */
for (i=0; i<sk_PKCS7_RECIP_INFO_num(rsk); i++)
{
ri=sk_PKCS7_RECIP_INFO_value(rsk,i);
+
if (pkcs7_decrypt_rinfo(&ek, &eklen,
- ri, pkey) > 0)
- break;
+ ri, pkey) < 0)
+ goto err;
ERR_clear_error();
- ri = NULL;
- }
- if (ri == NULL)
- {
- PKCS7err(PKCS7_F_PKCS7_DATADECODE,
- PKCS7_R_NO_RECIPIENT_MATCHES_KEY);
- goto err;
}
}
else
{
- if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) <= 0)
+ /* Only exit on fatal errors, not decrypt failure */
+ if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) < 0)
goto err;
+ ERR_clear_error();
}
evp_ctx=NULL;
@@ -565,6 +571,19 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
goto err;
if (EVP_CIPHER_asn1_to_param(evp_ctx,enc_alg->parameter) < 0)
goto err;
+ /* Generate random key as MMA defence */
+ tkeylen = EVP_CIPHER_CTX_key_length(evp_ctx);
+ tkey = OPENSSL_malloc(tkeylen);
+ if (!tkey)
+ goto err;
+ if (EVP_CIPHER_CTX_rand_key(evp_ctx, tkey) <= 0)
+ goto err;
+ if (ek == NULL)
+ {
+ ek = tkey;
+ eklen = tkeylen;
+ tkey = NULL;
+ }
if (eklen != EVP_CIPHER_CTX_key_length(evp_ctx)) {
/* Some S/MIME clients don't use the same key
@@ -573,11 +592,16 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
*/
if(!EVP_CIPHER_CTX_set_key_length(evp_ctx, eklen))
{
- PKCS7err(PKCS7_F_PKCS7_DATADECODE,
- PKCS7_R_DECRYPTED_KEY_IS_WRONG_LENGTH);
- goto err;
+ /* Use random key as MMA defence */
+ OPENSSL_cleanse(ek, eklen);
+ OPENSSL_free(ek);
+ ek = tkey;
+ eklen = tkeylen;
+ tkey = NULL;
}
}
+ /* Clear errors so we don't leak information useful in MMA */
+ ERR_clear_error();
if (EVP_CipherInit_ex(evp_ctx,NULL,NULL,ek,NULL,0) <= 0)
goto err;
@@ -585,6 +609,13 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
{
OPENSSL_cleanse(ek,eklen);
OPENSSL_free(ek);
+ ek = NULL;
+ }
+ if (tkey)
+ {
+ OPENSSL_cleanse(tkey,tkeylen);
+ OPENSSL_free(tkey);
+ tkey = NULL;
}
if (out == NULL)
@@ -627,6 +658,16 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
if (0)
{
err:
+ if (ek)
+ {
+ OPENSSL_cleanse(ek,eklen);
+ OPENSSL_free(ek);
+ }
+ if (tkey)
+ {
+ OPENSSL_cleanse(tkey,tkeylen);
+ OPENSSL_free(tkey);
+ }
if (out != NULL) BIO_free_all(out);
if (btmp != NULL) BIO_free_all(btmp);
if (etmp != NULL) BIO_free_all(etmp);
@@ -676,7 +717,11 @@ static int do_pkcs7_signed_attrib(PKCS7_SIGNER_INFO *si, EVP_MD_CTX *mctx)
}
/* Add digest */
- EVP_DigestFinal_ex(mctx, md_data,&md_len);
+ if (!EVP_DigestFinal_ex(mctx, md_data,&md_len))
+ {
+ PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_EVP_LIB);
+ return 0;
+ }
if (!PKCS7_add1_attrib_digest(si, md_data, md_len))
{
PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_MALLOC_FAILURE);
@@ -784,7 +829,8 @@ int PKCS7_dataFinal(PKCS7 *p7, BIO *bio)
/* We now have the EVP_MD_CTX, lets do the
* signing. */
- EVP_MD_CTX_copy_ex(&ctx_tmp,mdc);
+ if (!EVP_MD_CTX_copy_ex(&ctx_tmp,mdc))
+ goto err;
sk=si->auth_attr;
@@ -822,7 +868,8 @@ int PKCS7_dataFinal(PKCS7 *p7, BIO *bio)
if (!PKCS7_find_digest(&mdc, bio,
OBJ_obj2nid(p7->d.digest->md->algorithm)))
goto err;
- EVP_DigestFinal_ex(mdc,md_data,&md_len);
+ if (!EVP_DigestFinal_ex(mdc,md_data,&md_len))
+ goto err;
M_ASN1_OCTET_STRING_set(p7->d.digest->digest, md_data, md_len);
}
@@ -1015,7 +1062,8 @@ int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si,
/* mdc is the digest ctx that we want, unless there are attributes,
* in which case the digest is the signed attributes */
- EVP_MD_CTX_copy_ex(&mdc_tmp,mdc);
+ if (!EVP_MD_CTX_copy_ex(&mdc_tmp,mdc))
+ goto err;
sk=si->auth_attr;
if ((sk != NULL) && (sk_X509_ATTRIBUTE_num(sk) != 0))
@@ -1025,7 +1073,8 @@ int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si,
int alen;
ASN1_OCTET_STRING *message_digest;
- EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len);
+ if (!EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len))
+ goto err;
message_digest=PKCS7_digest_from_attributes(sk);
if (!message_digest)
{
@@ -1050,7 +1099,8 @@ for (ii=0; ii<md_len; ii++) printf("%02X",md_dat[ii]); printf(" calc\n");
goto err;
}
- EVP_VerifyInit_ex(&mdc_tmp,EVP_get_digestbynid(md_type), NULL);
+ if (!EVP_VerifyInit_ex(&mdc_tmp,EVP_get_digestbynid(md_type), NULL))
+ goto err;
alen = ASN1_item_i2d((ASN1_VALUE *)sk, &abuf,
ASN1_ITEM_rptr(PKCS7_ATTR_VERIFY));
@@ -1060,7 +1110,8 @@ for (ii=0; ii<md_len; ii++) printf("%02X",md_dat[ii]); printf(" calc\n");
ret = -1;
goto err;
}
- EVP_VerifyUpdate(&mdc_tmp, abuf, alen);
+ if (!EVP_VerifyUpdate(&mdc_tmp, abuf, alen))
+ goto err;
OPENSSL_free(abuf);
}
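The pk7_doit.c changes implement the standard MMA (Bleichenbacher-style) countermeasure: attempt every recipient info regardless of success, then fall back to a freshly generated random content-encryption key so a padding failure is indistinguishable from success at this stage. The fallback in isolation, with hypothetical names:

    #include <openssl/crypto.h>
    #include <openssl/evp.h>

    static int pick_content_key(EVP_CIPHER_CTX *ctx, unsigned char **ek,
                                int *eklen)
    {
        int klen = EVP_CIPHER_CTX_key_length(ctx);
        unsigned char *tkey = OPENSSL_malloc(klen);

        if (tkey == NULL)
            return 0;
        if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0) {
            OPENSSL_free(tkey);
            return 0;
        }
        if (*ek == NULL) {         /* every recipient failed: random key */
            *ek = tkey;
            *eklen = klen;
        } else {                   /* real key recovered: discard fallback */
            OPENSSL_cleanse(tkey, klen);
            OPENSSL_free(tkey);
        }
        return 1;
    }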
diff --git a/main/openssl/crypto/pkcs7/pk7_smime.c b/main/openssl/crypto/pkcs7/pk7_smime.c
index 86742d0d..a5104f8d 100644
--- a/main/openssl/crypto/pkcs7/pk7_smime.c
+++ b/main/openssl/crypto/pkcs7/pk7_smime.c
@@ -573,15 +573,34 @@ int PKCS7_decrypt(PKCS7 *p7, EVP_PKEY *pkey, X509 *cert, BIO *data, int flags)
return 0;
}
ret = SMIME_text(bread, data);
+ if (ret > 0 && BIO_method_type(tmpmem) == BIO_TYPE_CIPHER)
+ {
+ if (!BIO_get_cipher_status(tmpmem))
+ ret = 0;
+ }
BIO_free_all(bread);
return ret;
} else {
for(;;) {
i = BIO_read(tmpmem, buf, sizeof(buf));
- if(i <= 0) break;
- BIO_write(data, buf, i);
+ if(i <= 0)
+ {
+ ret = 1;
+ if (BIO_method_type(tmpmem) == BIO_TYPE_CIPHER)
+ {
+ if (!BIO_get_cipher_status(tmpmem))
+ ret = 0;
+ }
+
+ break;
+ }
+ if (BIO_write(data, buf, i) != i)
+ {
+ ret = 0;
+ break;
+ }
}
BIO_free_all(tmpmem);
- return 1;
+ return ret;
}
}
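The PKCS7_decrypt change above closes a silent-truncation hole: a cipher BIO read to EOF may still have failed its final-block check, which only BIO_get_cipher_status() reveals. The drain-and-verify loop as a sketch:

    #include <openssl/bio.h>
    #include <openssl/evp.h>       /* BIO_get_cipher_status() */

    static int drain_checked(BIO *in, BIO *out)
    {
        unsigned char buf[4096];
        int i;

        for (;;) {
            i = BIO_read(in, buf, sizeof(buf));
            if (i <= 0)
                break;
            if (BIO_write(out, buf, i) != i)
                return 0;
        }
        if (BIO_method_type(in) == BIO_TYPE_CIPHER
            && !BIO_get_cipher_status(in))
            return 0;              /* bad key or padding */
        return 1;
    }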
diff --git a/main/openssl/crypto/ppccpuid.pl b/main/openssl/crypto/ppccpuid.pl
index 369e1d0d..4ba736a1 100755
--- a/main/openssl/crypto/ppccpuid.pl
+++ b/main/openssl/crypto/ppccpuid.pl
@@ -23,36 +23,67 @@ $code=<<___;
.machine "any"
.text
-.globl .OPENSSL_cpuid_setup
+.globl .OPENSSL_ppc64_probe
.align 4
-.OPENSSL_cpuid_setup:
+.OPENSSL_ppc64_probe:
+ fcfid f1,f1
+ extrdi r0,r0,32,0
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+
+.globl .OPENSSL_altivec_probe
+.align 4
+.OPENSSL_altivec_probe:
+ .long 0x10000484 # vor v0,v0,v0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_wipe_cpu
.align 4
.OPENSSL_wipe_cpu:
xor r0,r0,r0
+ fmr f0,f31
+ fmr f1,f31
+ fmr f2,f31
mr r3,r1
+ fmr f3,f31
xor r4,r4,r4
+ fmr f4,f31
xor r5,r5,r5
+ fmr f5,f31
xor r6,r6,r6
+ fmr f6,f31
xor r7,r7,r7
+ fmr f7,f31
xor r8,r8,r8
+ fmr f8,f31
xor r9,r9,r9
+ fmr f9,f31
xor r10,r10,r10
+ fmr f10,f31
xor r11,r11,r11
+ fmr f11,f31
xor r12,r12,r12
+ fmr f12,f31
+ fmr f13,f31
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_atomic_add
.align 4
.OPENSSL_atomic_add:
-Loop: lwarx r5,0,r3
+Ladd: lwarx r5,0,r3
add r0,r4,r5
stwcx. r0,0,r3
- bne- Loop
+ bne- Ladd
$SIGNX r3,r0
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
.globl .OPENSSL_rdtsc
.align 4
@@ -60,6 +91,8 @@ Loop: lwarx r5,0,r3
mftb r3
mftbu r4
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_cleanse
.align 4
@@ -72,7 +105,7 @@ Loop: lwarx r5,0,r3
Little: mtctr r4
stb r0,0(r3)
addi r3,r3,1
- bdnz- \$-8
+ bdnz \$-8
blr
Lot: andi. r5,r3,3
beq Laligned
@@ -85,10 +118,13 @@ Laligned:
mtctr r5
stw r0,0(r3)
addi r3,r3,4
- bdnz- \$-8
+ bdnz \$-8
andi. r4,r4,3
bne Little
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,2,0
+ .long 0
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

diff --git a/main/openssl/crypto/rand/md_rand.c b/main/openssl/crypto/rand/md_rand.c
index b2f04ff1..dd291637 100644
--- a/main/openssl/crypto/rand/md_rand.c
+++ b/main/openssl/crypto/rand/md_rand.c
@@ -109,6 +109,8 @@
*
*/
+#define OPENSSL_FIPSEVP
+
#ifdef MD_RAND_DEBUG
# ifndef NDEBUG
# define NDEBUG
@@ -121,10 +123,10 @@
#include "e_os.h"
+#include <openssl/crypto.h>
#include <openssl/rand.h>
#include "rand_lcl.h"
-#include <openssl/crypto.h>
#include <openssl/err.h>
#ifdef BN_DEBUG
@@ -157,13 +159,14 @@ const char RAND_version[]="RAND" OPENSSL_VERSION_PTEXT;
static void ssleay_rand_cleanup(void);
static void ssleay_rand_seed(const void *buf, int num);
static void ssleay_rand_add(const void *buf, int num, double add_entropy);
-static int ssleay_rand_bytes(unsigned char *buf, int num);
+static int ssleay_rand_bytes(unsigned char *buf, int num, int pseudo);
+static int ssleay_rand_nopseudo_bytes(unsigned char *buf, int num);
static int ssleay_rand_pseudo_bytes(unsigned char *buf, int num);
static int ssleay_rand_status(void);
RAND_METHOD rand_ssleay_meth={
ssleay_rand_seed,
- ssleay_rand_bytes,
+ ssleay_rand_nopseudo_bytes,
ssleay_rand_cleanup,
ssleay_rand_add,
ssleay_rand_pseudo_bytes,
@@ -328,7 +331,7 @@ static void ssleay_rand_seed(const void *buf, int num)
ssleay_rand_add(buf, num, (double)num);
}
-static int ssleay_rand_bytes(unsigned char *buf, int num)
+static int ssleay_rand_bytes(unsigned char *buf, int num, int pseudo)
{
static volatile int stirred_pool = 0;
int i,j,k,st_num,st_idx;
@@ -377,8 +380,11 @@ static int ssleay_rand_bytes(unsigned char *buf, int num)
* are fed into the hash function and the results are kept in the
* global 'md'.
*/
-
- CRYPTO_w_lock(CRYPTO_LOCK_RAND);
+#ifdef OPENSSL_FIPS
+ /* NB: in FIPS mode we are already under a lock */
+ if (!FIPS_mode())
+#endif
+ CRYPTO_w_lock(CRYPTO_LOCK_RAND);
/* prevent ssleay_rand_bytes() from trying to obtain the lock again */
CRYPTO_w_lock(CRYPTO_LOCK_RAND2);
@@ -457,7 +463,10 @@ static int ssleay_rand_bytes(unsigned char *buf, int num)
/* before unlocking, we must clear 'crypto_lock_rand' */
crypto_lock_rand = 0;
- CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode())
+#endif
+ CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
while (num > 0)
{
@@ -509,15 +518,23 @@ static int ssleay_rand_bytes(unsigned char *buf, int num)
MD_Init(&m);
MD_Update(&m,(unsigned char *)&(md_c[0]),sizeof(md_c));
MD_Update(&m,local_md,MD_DIGEST_LENGTH);
- CRYPTO_w_lock(CRYPTO_LOCK_RAND);
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode())
+#endif
+ CRYPTO_w_lock(CRYPTO_LOCK_RAND);
MD_Update(&m,md,MD_DIGEST_LENGTH);
MD_Final(&m,md);
- CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
+#ifdef OPENSSL_FIPS
+ if (!FIPS_mode())
+#endif
+ CRYPTO_w_unlock(CRYPTO_LOCK_RAND);
EVP_MD_CTX_cleanup(&m);
if (ok)
return(1);
- else
+ else if (pseudo)
+ return 0;
+ else
{
RANDerr(RAND_F_SSLEAY_RAND_BYTES,RAND_R_PRNG_NOT_SEEDED);
ERR_add_error_data(1, "You need to read the OpenSSL FAQ, "
@@ -526,22 +543,16 @@ static int ssleay_rand_bytes(unsigned char *buf, int num)
}
}
+static int ssleay_rand_nopseudo_bytes(unsigned char *buf, int num)
+ {
+ return ssleay_rand_bytes(buf, num, 0);
+ }
+
/* pseudo-random bytes that are guaranteed to be unique but not
unpredictable */
static int ssleay_rand_pseudo_bytes(unsigned char *buf, int num)
{
- int ret;
- unsigned long err;
-
- ret = RAND_bytes(buf, num);
- if (ret == 0)
- {
- err = ERR_peek_error();
- if (ERR_GET_LIB(err) == ERR_LIB_RAND &&
- ERR_GET_REASON(err) == RAND_R_PRNG_NOT_SEEDED)
- ERR_clear_error();
- }
- return (ret);
+ return ssleay_rand_bytes(buf, num, 1);
}
static int ssleay_rand_status(void)
diff --git a/main/openssl/crypto/rand/rand.h b/main/openssl/crypto/rand/rand.h
index ac6c0217..bb5520e8 100644
--- a/main/openssl/crypto/rand/rand.h
+++ b/main/openssl/crypto/rand/rand.h
@@ -119,6 +119,11 @@ int RAND_event(UINT, WPARAM, LPARAM);
#endif
+#ifdef OPENSSL_FIPS
+void RAND_set_fips_drbg_type(int type, int flags);
+int RAND_init_fips(void);
+#endif
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -129,9 +134,14 @@ void ERR_load_RAND_strings(void);
/* Function codes. */
#define RAND_F_RAND_GET_RAND_METHOD 101
+#define RAND_F_RAND_INIT_FIPS 102
#define RAND_F_SSLEAY_RAND_BYTES 100
/* Reason codes. */
+#define RAND_R_DUAL_EC_DRBG_DISABLED 104
+#define RAND_R_ERROR_INITIALISING_DRBG 102
+#define RAND_R_ERROR_INSTANTIATING_DRBG 103
+#define RAND_R_NO_FIPS_RANDOM_METHOD_SET 101
#define RAND_R_PRNG_NOT_SEEDED 100
#ifdef __cplusplus
diff --git a/main/openssl/crypto/rand/rand_err.c b/main/openssl/crypto/rand/rand_err.c
index 03cda4dd..c4c80fc8 100644
--- a/main/openssl/crypto/rand/rand_err.c
+++ b/main/openssl/crypto/rand/rand_err.c
@@ -1,6 +1,6 @@
/* crypto/rand/rand_err.c */
/* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -71,12 +71,17 @@
static ERR_STRING_DATA RAND_str_functs[]=
{
{ERR_FUNC(RAND_F_RAND_GET_RAND_METHOD), "RAND_get_rand_method"},
+{ERR_FUNC(RAND_F_RAND_INIT_FIPS), "RAND_init_fips"},
{ERR_FUNC(RAND_F_SSLEAY_RAND_BYTES), "SSLEAY_RAND_BYTES"},
{0,NULL}
};
static ERR_STRING_DATA RAND_str_reasons[]=
{
+{ERR_REASON(RAND_R_DUAL_EC_DRBG_DISABLED),"dual ec drbg disabled"},
+{ERR_REASON(RAND_R_ERROR_INITIALISING_DRBG),"error initialising drbg"},
+{ERR_REASON(RAND_R_ERROR_INSTANTIATING_DRBG),"error instantiating drbg"},
+{ERR_REASON(RAND_R_NO_FIPS_RANDOM_METHOD_SET),"no fips random method set"},
{ERR_REASON(RAND_R_PRNG_NOT_SEEDED) ,"PRNG not seeded"},
{0,NULL}
};
diff --git a/main/openssl/crypto/rand/rand_lib.c b/main/openssl/crypto/rand/rand_lib.c
index 513e3389..5ac0e14c 100644
--- a/main/openssl/crypto/rand/rand_lib.c
+++ b/main/openssl/crypto/rand/rand_lib.c
@@ -60,10 +60,16 @@
#include <time.h>
#include "cryptlib.h"
#include <openssl/rand.h>
+
#ifndef OPENSSL_NO_ENGINE
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#include <openssl/fips_rand.h>
+#endif
+
#ifndef OPENSSL_NO_ENGINE
/* non-NULL if default_RAND_meth is ENGINE-provided */
static ENGINE *funct_ref =NULL;
@@ -174,3 +180,127 @@ int RAND_status(void)
return meth->status();
return 0;
}
+
+#ifdef OPENSSL_FIPS
+
+/* FIPS DRBG initialisation code. This sets up the DRBG for use by the
+ * rest of OpenSSL.
+ */
+
+/* Entropy gatherer: use standard OpenSSL PRNG to seed (this will gather
+ * entropy internally through RAND_poll().
+ */
+
+static size_t drbg_get_entropy(DRBG_CTX *ctx, unsigned char **pout,
+ int entropy, size_t min_len, size_t max_len)
+ {
+ /* Round up request to multiple of block size */
+ min_len = ((min_len + 19) / 20) * 20;
+ *pout = OPENSSL_malloc(min_len);
+ if (!*pout)
+ return 0;
+ if (RAND_SSLeay()->bytes(*pout, min_len) <= 0)
+ {
+ OPENSSL_free(*pout);
+ *pout = NULL;
+ return 0;
+ }
+ return min_len;
+ }
+
+static void drbg_free_entropy(DRBG_CTX *ctx, unsigned char *out, size_t olen)
+ {
+ if (out)
+ {
+ OPENSSL_cleanse(out, olen);
+ OPENSSL_free(out);
+ }
+ }
+
+/* Set "additional input" when generating random data. This uses the
+ * current PID, a time value and a counter.
+ */
+
+static size_t drbg_get_adin(DRBG_CTX *ctx, unsigned char **pout)
+ {
+ /* Use of static variables is OK as this happens under a lock */
+ static unsigned char buf[16];
+ static unsigned long counter;
+ FIPS_get_timevec(buf, &counter);
+ *pout = buf;
+ return sizeof(buf);
+ }
+
+/* RAND_add() and RAND_seed() pass through to OpenSSL PRNG so it is
+ * correctly seeded by RAND_poll().
+ */
+
+static int drbg_rand_add(DRBG_CTX *ctx, const void *in, int inlen,
+ double entropy)
+ {
+ RAND_SSLeay()->add(in, inlen, entropy);
+ return 1;
+ }
+
+static int drbg_rand_seed(DRBG_CTX *ctx, const void *in, int inlen)
+ {
+ RAND_SSLeay()->seed(in, inlen);
+ return 1;
+ }
+
+#ifndef OPENSSL_DRBG_DEFAULT_TYPE
+#define OPENSSL_DRBG_DEFAULT_TYPE NID_aes_256_ctr
+#endif
+#ifndef OPENSSL_DRBG_DEFAULT_FLAGS
+#define OPENSSL_DRBG_DEFAULT_FLAGS DRBG_FLAG_CTR_USE_DF
+#endif
+
+static int fips_drbg_type = OPENSSL_DRBG_DEFAULT_TYPE;
+static int fips_drbg_flags = OPENSSL_DRBG_DEFAULT_FLAGS;
+
+void RAND_set_fips_drbg_type(int type, int flags)
+ {
+ fips_drbg_type = type;
+ fips_drbg_flags = flags;
+ }
+
+int RAND_init_fips(void)
+ {
+ DRBG_CTX *dctx;
+ size_t plen;
+ unsigned char pers[32], *p;
+#ifndef OPENSSL_ALLOW_DUAL_EC_DRBG
+ if (fips_drbg_type >> 16)
+ {
+ RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_DUAL_EC_DRBG_DISABLED);
+ return 0;
+ }
+#endif
+
+ dctx = FIPS_get_default_drbg();
+ if (FIPS_drbg_init(dctx, fips_drbg_type, fips_drbg_flags) <= 0)
+ {
+ RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INITIALISING_DRBG);
+ return 0;
+ }
+
+ FIPS_drbg_set_callbacks(dctx,
+ drbg_get_entropy, drbg_free_entropy, 20,
+ drbg_get_entropy, drbg_free_entropy);
+ FIPS_drbg_set_rand_callbacks(dctx, drbg_get_adin, 0,
+ drbg_rand_seed, drbg_rand_add);
+ /* Personalisation string: a string followed by date time vector */
+ strcpy((char *)pers, "OpenSSL DRBG2.0");
+ plen = drbg_get_adin(dctx, &p);
+ memcpy(pers + 16, p, plen);
+
+ if (FIPS_drbg_instantiate(dctx, pers, sizeof(pers)) <= 0)
+ {
+ RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INSTANTIATING_DRBG);
+ return 0;
+ }
+ FIPS_rand_set_method(FIPS_drbg_method());
+ return 1;
+ }
+
+#endif
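A hypothetical call sequence for the FIPS DRBG interface declared in rand.h above. The type and flags shown are the built-in defaults from rand_lib.c; DRBG_FLAG_CTR_USE_DF is assumed to come from fips_rand.h:

    #include <openssl/crypto.h>        /* FIPS_mode() */
    #include <openssl/rand.h>
    #ifdef OPENSSL_FIPS
    # include <openssl/objects.h>      /* NID_aes_256_ctr */
    # include <openssl/fips_rand.h>    /* DRBG_FLAG_CTR_USE_DF */
    #endif

    static int setup_rng(void)
    {
    #ifdef OPENSSL_FIPS
        if (FIPS_mode()) {
            /* must be called before the DRBG is instantiated */
            RAND_set_fips_drbg_type(NID_aes_256_ctr, DRBG_FLAG_CTR_USE_DF);
            if (!RAND_init_fips())
                return 0;
        }
    #endif
        return RAND_poll();
    }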
diff --git a/main/openssl/crypto/rand/rand_unix.c b/main/openssl/crypto/rand/rand_unix.c
index e9ead3a5..e3a65571 100644
--- a/main/openssl/crypto/rand/rand_unix.c
+++ b/main/openssl/crypto/rand/rand_unix.c
@@ -133,47 +133,87 @@
# define FD_SETSIZE (8*sizeof(fd_set))
#endif
-#ifdef __VOS__
+#if defined(OPENSSL_SYS_VOS)
+
+/* The following algorithm repeatedly samples the real-time clock
+ (RTC) to generate a sequence of unpredictable data. The algorithm
+ relies upon the uneven execution speed of the code (due to factors
+ such as cache misses, interrupts, bus activity, and scheduling) and
+ upon the rather large relative difference between the speed of the
+ clock and the rate at which it can be read.
+
+ If this code is ported to an environment where execution speed is
+ more constant or where the RTC ticks at a much slower rate, or the
+ clock can be read with fewer instructions, it is likely that the
+ results would be far more predictable.
+
+ As a precaution, we generate 4 times the minimum required amount of
+ seed data. */
+
int RAND_poll(void)
{
- unsigned char buf[ENTROPY_NEEDED];
+ short int code;
+ gid_t curr_gid;
pid_t curr_pid;
uid_t curr_uid;
- static int first=1;
- int i;
- long rnd = 0;
+ int i, k;
struct timespec ts;
- unsigned seed;
-
-/* The VOS random() function starts from a static seed so its
- initial value is predictable. If random() returns the
- initial value, reseed it with dynamic data. The VOS
- real-time clock has a granularity of 1 nsec so it should be
- reasonably difficult to predict its exact value. Do not
- gratuitously reseed the PRNG because other code in this
- process or thread may be using it. */
-
- if (first) {
- first = 0;
- rnd = random ();
- if (rnd == 1804289383) {
- clock_gettime (CLOCK_REALTIME, &ts);
- curr_pid = getpid();
- curr_uid = getuid();
- seed = ts.tv_sec ^ ts.tv_nsec ^ curr_pid ^ curr_uid;
- srandom (seed);
- }
- }
+ unsigned char v;
- for (i = 0; i < sizeof(buf); i++) {
- if (i % 4 == 0)
- rnd = random();
- buf[i] = rnd;
- rnd >>= 8;
- }
- RAND_add(buf, sizeof(buf), ENTROPY_NEEDED);
- memset(buf, 0, sizeof(buf));
+#ifdef OPENSSL_SYS_VOS_HPPA
+ long duration;
+ extern void s$sleep (long *_duration, short int *_code);
+#else
+#ifdef OPENSSL_SYS_VOS_IA32
+ long long duration;
+ extern void s$sleep2 (long long *_duration, short int *_code);
+#else
+#error "Unsupported Platform."
+#endif /* OPENSSL_SYS_VOS_IA32 */
+#endif /* OPENSSL_SYS_VOS_HPPA */
+
+ /* Seed with the gid, pid, and uid, to ensure *some*
+ variation between different processes. */
+
+ curr_gid = getgid();
+ RAND_add (&curr_gid, sizeof curr_gid, 1);
+ curr_gid = 0;
+
+ curr_pid = getpid();
+ RAND_add (&curr_pid, sizeof curr_pid, 1);
+ curr_pid = 0;
+
+ curr_uid = getuid();
+ RAND_add (&curr_uid, sizeof curr_uid, 1);
+ curr_uid = 0;
+ for (i=0; i<(ENTROPY_NEEDED*4); i++)
+ {
+ /* burn some cpu; hope for interrupts, cache
+ collisions, bus interference, etc. */
+ for (k=0; k<99; k++)
+ ts.tv_nsec = random ();
+
+#ifdef OPENSSL_SYS_VOS_HPPA
+ /* sleep for 1/1024 of a second (976 us). */
+ duration = 1;
+ s$sleep (&duration, &code);
+#else
+#ifdef OPENSSL_SYS_VOS_IA32
+ /* sleep for 1/65536 of a second (15 us). */
+ duration = 1;
+ s$sleep2 (&duration, &code);
+#endif /* OPENSSL_SYS_VOS_IA32 */
+#endif /* OPENSSL_SYS_VOS_HPPA */
+
+ /* get wall clock time. */
+ clock_gettime (CLOCK_REALTIME, &ts);
+
+ /* take 8 bits */
+ v = (unsigned char) (ts.tv_nsec % 256);
+ RAND_add (&v, sizeof v, 1);
+ v = 0;
+ }
return 1;
}
#elif defined __OpenBSD__
diff --git a/main/openssl/crypto/rand/randfile.c b/main/openssl/crypto/rand/randfile.c
index bc7d9c58..7f142807 100644
--- a/main/openssl/crypto/rand/randfile.c
+++ b/main/openssl/crypto/rand/randfile.c
@@ -57,7 +57,9 @@
*/
/* We need to define this to get macros like S_IFBLK and S_IFCHR */
+#if !defined(OPENSSL_SYS_VXWORKS)
#define _XOPEN_SOURCE 500
+#endif
#include <errno.h>
#include <stdio.h>
@@ -137,7 +139,7 @@ int RAND_load_file(const char *file, long bytes)
in=fopen(file,"rb");
#endif
if (in == NULL) goto err;
-#if defined(S_IFBLK) && defined(S_IFCHR) && !defined(OPNESSL_NO_POSIX_IO)
+#if defined(S_IFBLK) && defined(S_IFCHR) && !defined(OPENSSL_NO_POSIX_IO)
if (sb.st_mode & (S_IFBLK | S_IFCHR)) {
/* this file is a device. we don't want read an infinite number
* of bytes from a random device, nor do we want to use buffered
diff --git a/main/openssl/crypto/rc2/rc2.h b/main/openssl/crypto/rc2/rc2.h
index 34c83623..e542ec94 100644
--- a/main/openssl/crypto/rc2/rc2.h
+++ b/main/openssl/crypto/rc2/rc2.h
@@ -79,7 +79,9 @@ typedef struct rc2_key_st
RC2_INT data[64];
} RC2_KEY;
-
+#ifdef OPENSSL_FIPS
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
+#endif
void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
void RC2_ecb_encrypt(const unsigned char *in,unsigned char *out,RC2_KEY *key,
int enc);
diff --git a/main/openssl/crypto/rc2/rc2_skey.c b/main/openssl/crypto/rc2/rc2_skey.c
index 0150b0e0..6668ac01 100644
--- a/main/openssl/crypto/rc2/rc2_skey.c
+++ b/main/openssl/crypto/rc2/rc2_skey.c
@@ -56,6 +56,7 @@
* [including the GNU Public Licence.]
*/
+#include <openssl/crypto.h>
#include <openssl/rc2.h>
#include "rc2_locl.h"
@@ -95,6 +96,13 @@ static const unsigned char key_table[256]={
* the same as specifying 1024 for the 'bits' parameter. Bsafe uses
* a version where the bits parameter is the same as len*8 */
void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits)
+#ifdef OPENSSL_FIPS
+ {
+ fips_cipher_abort(RC2);
+ private_RC2_set_key(key, len, data, bits);
+ }
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits)
+#endif
{
int i,j;
unsigned char *k;
diff --git a/main/openssl/crypto/rc4/asm/rc4-586.pl b/main/openssl/crypto/rc4/asm/rc4-586.pl
index 38a44a70..5c9ac6ad 100644
--- a/main/openssl/crypto/rc4/asm/rc4-586.pl
+++ b/main/openssl/crypto/rc4/asm/rc4-586.pl
@@ -28,6 +28,34 @@
#
# <appro@fy.chalmers.se>
+# May 2011
+#
+# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
+# performance in cycles per processed byte (less is better) and
+# improvement relative to previous version of this module is:
+#
+# Pentium 10.2 # original numbers
+# Pentium III 7.8(*)
+# Intel P4 7.5
+#
+# Opteron 6.1/+20% # new MMX numbers
+# Core2 5.3/+67%(**)
+# Westmere 5.1/+94%(**)
+# Sandy Bridge 5.0/+8%
+# Atom 12.6/+6%
+#
+# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
+# but this specific code performs poorly on Core2. And vice
+# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
+# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
+# [anymore], I chose to discard PIII-specific code path and opt
+# for original IALU-only code, which is why MMX/SSE code path
+# is guarded by SSE2 bit (see below), not MMX/SSE.
+# (**) Performance vs. block size on Core2 and Westmere had a maximum
+# at ... 64 bytes block size. And it was quite a maximum, 40-60%
+# in comparison to largest 8KB block size. Above improvement
+# coefficients are for the largest block size.
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
@@ -62,6 +90,68 @@ sub RC4_loop {
&$func ($out,&DWP(0,$dat,$ty,4));
}
+if ($alt=0) {
+ # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
+ # but ~40% slower on Core2 and Westmere... Attempt to add movz
+ # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
+ # on Core2 with movz it's almost 20% slower than below alternative
+ # code... Yes, it's a total mess...
+ my @XX=($xx,$out);
+ $RC4_loop_mmx = sub { # SSE actually...
+ my $i=shift;
+ my $j=$i<=0?0:$i>>1;
+ my $mm=$i<=0?"mm0":"mm".($i&1);
+
+ &add (&LB($yy),&LB($tx));
+ &lea (@XX[1],&DWP(1,@XX[0]));
+ &pxor ("mm2","mm0") if ($i==0);
+ &psllq ("mm1",8) if ($i==0);
+ &and (@XX[1],0xff);
+ &pxor ("mm0","mm0") if ($i<=0);
+ &mov ($ty,&DWP(0,$dat,$yy,4));
+ &mov (&DWP(0,$dat,$yy,4),$tx);
+ &pxor ("mm1","mm2") if ($i==0);
+ &mov (&DWP(0,$dat,$XX[0],4),$ty);
+ &add (&LB($ty),&LB($tx));
+ &movd (@XX[0],"mm7") if ($i==0);
+ &mov ($tx,&DWP(0,$dat,@XX[1],4));
+ &pxor ("mm1","mm1") if ($i==1);
+ &movq ("mm2",&QWP(0,$inp)) if ($i==1);
+ &movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0);
+ &pinsrw ($mm,&DWP(0,$dat,$ty,4),$j);
+
+ push (@XX,shift(@XX)) if ($i>=0);
+ }
+} else {
+    # Using pinsrw here improves performance on Intel CPUs by 2-3%, but
+ # brings down AMD by 7%...
+ $RC4_loop_mmx = sub {
+ my $i=shift;
+
+ &add (&LB($yy),&LB($tx));
+ &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1);
+ &mov ($ty,&DWP(0,$dat,$yy,4));
+ &mov (&DWP(0,$dat,$yy,4),$tx);
+ &mov (&DWP(0,$dat,$xx,4),$ty);
+ &inc ($xx);
+ &add ($ty,$tx);
+ &movz ($xx,&LB($xx)); # (*)
+ &movz ($ty,&LB($ty)); # (*)
+ &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0);
+ &movq ("mm0",&QWP(0,$inp)) if ($i<=0);
+ &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0);
+ &mov ($tx,&DWP(0,$dat,$xx,4));
+ &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
+
+ # (*) This is the key to Core2 and Westmere performance.
+	#	Without movz, out-of-order execution logic confuses
+ # itself and fails to reorder loads and stores. Problem
+ # appears to be fixed in Sandy Bridge...
+ }
+}
+
+&external_label("OPENSSL_ia32cap_P");
+
# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
&function_begin("RC4");
&mov ($dat,&wparam(0)); # load key schedule pointer
@@ -94,11 +184,56 @@ sub RC4_loop {
&and ($ty,-4); # how many 4-byte chunks?
&jz (&label("loop1"));
+ &test ($ty,-8);
+ &mov (&wparam(3),$out); # $out as accumulator in these loops
+ &jz (&label("go4loop4"));
+
+ &picmeup($out,"OPENSSL_ia32cap_P");
+ &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX]
+ &jnc (&label("go4loop4"));
+
+ &mov ($out,&wparam(3)) if (!$alt);
+ &movd ("mm7",&wparam(3)) if ($alt);
+ &and ($ty,-8);
+ &lea ($ty,&DWP(-8,$inp,$ty));
+ &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8
+
+ &$RC4_loop_mmx(-1);
+ &jmp(&label("loop_mmx_enter"));
+
+ &set_label("loop_mmx",16);
+ &$RC4_loop_mmx(0);
+ &set_label("loop_mmx_enter");
+ for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
+ &mov ($ty,$yy);
+ &xor ($yy,$yy); # this is second key to Core2
+ &mov (&LB($yy),&LB($ty)); # and Westmere performance...
+ &cmp ($inp,&DWP(-4,$dat));
+ &lea ($inp,&DWP(8,$inp));
+ &jb (&label("loop_mmx"));
+
+ if ($alt) {
+ &movd ($out,"mm7");
+ &pxor ("mm2","mm0");
+ &psllq ("mm1",8);
+ &pxor ("mm1","mm2");
+ &movq (&QWP(-8,$out,$inp),"mm1");
+ } else {
+ &psllq ("mm1",56);
+ &pxor ("mm2","mm1");
+ &movq (&QWP(-8,$out,$inp),"mm2");
+ }
+ &emms ();
+
+ &cmp ($inp,&wparam(1)); # compare to input+len
+ &je (&label("done"));
+ &jmp (&label("loop1"));
+
+&set_label("go4loop4",16);
&lea ($ty,&DWP(-4,$inp,$ty));
&mov (&wparam(2),$ty); # save input+(len/4)*4-4
- &mov (&wparam(3),$out); # $out as accumulator in this loop
- &set_label("loop4",16);
+ &set_label("loop4");
for ($i=0;$i<4;$i++) { RC4_loop($i); }
&ror ($out,8);
&xor ($out,&DWP(0,$inp));
@@ -151,7 +286,7 @@ sub RC4_loop {
&set_label("done");
&dec (&LB($xx));
- &mov (&BP(-4,$dat),&LB($yy)); # save key->y
+ &mov (&DWP(-4,$dat),$yy); # save key->y
&mov (&BP(-8,$dat),&LB($xx)); # save key->x
&set_label("abort");
&function_end("RC4");
@@ -164,10 +299,8 @@ $idi="ebp";
$ido="ecx";
$idx="edx";
-&external_label("OPENSSL_ia32cap_P");
-
# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
-&function_begin("RC4_set_key");
+&function_begin("private_RC4_set_key");
&mov ($out,&wparam(0)); # load key
&mov ($idi,&wparam(1)); # load len
&mov ($inp,&wparam(2)); # load data
@@ -245,7 +378,7 @@ $idx="edx";
&xor ("eax","eax");
&mov (&DWP(-8,$out),"eax"); # key->x=0;
&mov (&DWP(-4,$out),"eax"); # key->y=0;
-&function_end("RC4_set_key");
+&function_end("private_RC4_set_key");
# const char *RC4_options(void);
&function_begin_B("RC4_options");
@@ -254,14 +387,21 @@ $idx="edx";
&blindpop("eax");
&lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
&picmeup("edx","OPENSSL_ia32cap_P");
- &bt (&DWP(0,"edx"),20);
- &jnc (&label("skip"));
- &add ("eax",12);
- &set_label("skip");
+ &mov ("edx",&DWP(0,"edx"));
+ &bt ("edx",20);
+ &jc (&label("1xchar"));
+ &bt ("edx",26);
+ &jnc (&label("ret"));
+ &add ("eax",25);
+ &ret ();
+&set_label("1xchar");
+ &add ("eax",12);
+&set_label("ret");
&ret ();
&set_label("opts",64);
&asciz ("rc4(4x,int)");
&asciz ("rc4(1x,char)");
+&asciz ("rc4(8x,mmx)");
&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align (64);
&function_end_B("RC4_options");
diff --git a/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.S b/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.S
new file mode 100644
index 00000000..aab3c6db
--- /dev/null
+++ b/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.S
@@ -0,0 +1,1259 @@
+.text
+.align 16
+
+.globl rc4_md5_enc
+.type rc4_md5_enc,@function
+rc4_md5_enc:
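+# (Note, ours, not upstream's: per the generator script
+# rc4-md5-x86_64.pl further below, the arguments are %rdi=RC4_KEY,
+# %rsi=RC4 input, %rdx=RC4 output, %rcx=MD5_CTX, %r8=MD5 input,
+# %r9=length in 64-byte blocks.)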
+ cmpq $0,%r9
+ je .Labort
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $40,%rsp
+.Lbody:
+ movq %rcx,%r11
+ movq %r9,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %r8,%r15
+ xorq %rbp,%rbp
+ xorq %rcx,%rcx
+
+ leaq 8(%rdi),%rdi
+ movb -8(%rdi),%bpl
+ movb -4(%rdi),%cl
+
+ incb %bpl
+ subq %r13,%r14
+ movl (%rdi,%rbp,4),%eax
+ addb %al,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ shlq $6,%r12
+ addq %r15,%r12
+ movq %r12,16(%rsp)
+
+ movq %r11,24(%rsp)
+ movl 0(%r11),%r8d
+ movl 4(%r11),%r9d
+ movl 8(%r11),%r10d
+ movl 12(%r11),%r11d
+ jmp .Loop
+
+.align 16
+.Loop:
+ movl %r8d,0(%rsp)
+ movl %r9d,4(%rsp)
+ movl %r10d,8(%rsp)
+ movl %r11d,%r12d
+ movl %r11d,12(%rsp)
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 0(%r15),%r8d
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ addl $3614090360,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 4(%r15),%r11d
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ addl $3905402710,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,4(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 8(%r15),%r10d
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ addl $606105819,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,8(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 12(%r15),%r9d
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ addl $3250441966,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,12(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 16(%r15),%r8d
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ addl $4118548399,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,16(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 20(%r15),%r11d
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ addl $1200080426,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,20(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 24(%r15),%r10d
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ addl $2821735955,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,24(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 28(%r15),%r9d
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ addl $4249261313,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,28(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 32(%r15),%r8d
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ addl $1770035416,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,32(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 36(%r15),%r11d
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ addl $2336552879,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,36(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 40(%r15),%r10d
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ addl $4294925233,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,40(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 44(%r15),%r9d
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ addl $2304563134,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,44(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 48(%r15),%r8d
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ addl $1804603682,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,48(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 52(%r15),%r11d
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ addl $4254626195,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,52(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 56(%r15),%r10d
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ addl $2792965006,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,56(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu (%r13),%xmm2
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 60(%r15),%r9d
+ addb %dl,%bl
+ movl 64(%rsi),%eax
+ addl $1236535329,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,60(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r10d,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 4(%r15),%r8d
+ addb %dl,%al
+ movl 68(%rsi),%ebx
+ addl $4129170786,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,64(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 24(%r15),%r11d
+ addb %dl,%bl
+ movl 72(%rsi),%eax
+ addl $3225465664,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,68(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 44(%r15),%r10d
+ addb %dl,%al
+ movl 76(%rsi),%ebx
+ addl $643717713,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,72(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 0(%r15),%r9d
+ addb %dl,%bl
+ movl 80(%rsi),%eax
+ addl $3921069994,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,76(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 20(%r15),%r8d
+ addb %dl,%al
+ movl 84(%rsi),%ebx
+ addl $3593408605,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,80(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 40(%r15),%r11d
+ addb %dl,%bl
+ movl 88(%rsi),%eax
+ addl $38016083,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,84(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 60(%r15),%r10d
+ addb %dl,%al
+ movl 92(%rsi),%ebx
+ addl $3634488961,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,88(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 16(%r15),%r9d
+ addb %dl,%bl
+ movl 96(%rsi),%eax
+ addl $3889429448,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,92(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 36(%r15),%r8d
+ addb %dl,%al
+ movl 100(%rsi),%ebx
+ addl $568446438,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,96(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 56(%r15),%r11d
+ addb %dl,%bl
+ movl 104(%rsi),%eax
+ addl $3275163606,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,100(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 12(%r15),%r10d
+ addb %dl,%al
+ movl 108(%rsi),%ebx
+ addl $4107603335,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,104(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 32(%r15),%r9d
+ addb %dl,%bl
+ movl 112(%rsi),%eax
+ addl $1163531501,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,108(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 52(%r15),%r8d
+ addb %dl,%al
+ movl 116(%rsi),%ebx
+ addl $2850285829,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,112(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 8(%r15),%r11d
+ addb %dl,%bl
+ movl 120(%rsi),%eax
+ addl $4243563512,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,116(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 28(%r15),%r10d
+ addb %dl,%al
+ movl 124(%rsi),%ebx
+ addl $1735328473,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,120(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 16(%r13),%xmm3
+ addb $32,%bpl
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 48(%r15),%r9d
+ addb %dl,%bl
+ movl 0(%rdi,%rbp,4),%eax
+ addl $2368359562,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,124(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r11d,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movq %rcx,%rsi
+ xorq %rcx,%rcx
+ movb %sil,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 20(%r15),%r8d
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ addl $4294588738,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,0(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 32(%r15),%r11d
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ addl $2272392833,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,4(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 44(%r15),%r10d
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ addl $1839030562,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,8(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 56(%r15),%r9d
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ addl $4259657740,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,12(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 4(%r15),%r8d
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ addl $2763975236,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,16(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 16(%r15),%r11d
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ addl $1272893353,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,20(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 28(%r15),%r10d
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ addl $4139469664,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,24(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 40(%r15),%r9d
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ addl $3200236656,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,28(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 52(%r15),%r8d
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ addl $681279174,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,32(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 0(%r15),%r11d
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ addl $3936430074,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,36(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 12(%r15),%r10d
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ addl $3572445317,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,40(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 24(%r15),%r9d
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ addl $76029189,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,44(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 36(%r15),%r8d
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ addl $3654602809,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,48(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 48(%r15),%r11d
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ addl $3873151461,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,52(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 60(%r15),%r10d
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ addl $530742520,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,56(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 32(%r13),%xmm4
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 8(%r15),%r9d
+ addb %dl,%bl
+ movl 64(%rsi),%eax
+ addl $3299628645,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,60(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm4
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 0(%r15),%r8d
+ addb %dl,%al
+ movl 68(%rsi),%ebx
+ addl $4096336452,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,64(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 28(%r15),%r11d
+ addb %dl,%bl
+ movl 72(%rsi),%eax
+ addl $1126891415,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,68(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 56(%r15),%r10d
+ addb %dl,%al
+ movl 76(%rsi),%ebx
+ addl $2878612391,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,72(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 20(%r15),%r9d
+ addb %dl,%bl
+ movl 80(%rsi),%eax
+ addl $4237533241,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,76(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 48(%r15),%r8d
+ addb %dl,%al
+ movl 84(%rsi),%ebx
+ addl $1700485571,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,80(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 12(%r15),%r11d
+ addb %dl,%bl
+ movl 88(%rsi),%eax
+ addl $2399980690,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,84(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 40(%r15),%r10d
+ addb %dl,%al
+ movl 92(%rsi),%ebx
+ addl $4293915773,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,88(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 4(%r15),%r9d
+ addb %dl,%bl
+ movl 96(%rsi),%eax
+ addl $2240044497,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,92(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 32(%r15),%r8d
+ addb %dl,%al
+ movl 100(%rsi),%ebx
+ addl $1873313359,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,96(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 60(%r15),%r11d
+ addb %dl,%bl
+ movl 104(%rsi),%eax
+ addl $4264355552,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,100(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 24(%r15),%r10d
+ addb %dl,%al
+ movl 108(%rsi),%ebx
+ addl $2734768916,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,104(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 52(%r15),%r9d
+ addb %dl,%bl
+ movl 112(%rsi),%eax
+ addl $1309151649,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,108(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 16(%r15),%r8d
+ addb %dl,%al
+ movl 116(%rsi),%ebx
+ addl $4149444226,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,112(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 44(%r15),%r11d
+ addb %dl,%bl
+ movl 120(%rsi),%eax
+ addl $3174756917,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,116(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 8(%r15),%r10d
+ addb %dl,%al
+ movl 124(%rsi),%ebx
+ addl $718787259,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,120(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 48(%r13),%xmm5
+ addb $32,%bpl
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 36(%r15),%r9d
+ addb %dl,%bl
+ movl 0(%rdi,%rbp,4),%eax
+ addl $3951481745,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,124(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movq %rbp,%rsi
+ xorq %rbp,%rbp
+ movb %sil,%bpl
+ movq %rcx,%rsi
+ xorq %rcx,%rcx
+ movb %sil,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm5
+ pxor %xmm1,%xmm5
+ addl 0(%rsp),%r8d
+ addl 4(%rsp),%r9d
+ addl 8(%rsp),%r10d
+ addl 12(%rsp),%r11d
+
+ movdqu %xmm2,(%r14,%r13,1)
+ movdqu %xmm3,16(%r14,%r13,1)
+ movdqu %xmm4,32(%r14,%r13,1)
+ movdqu %xmm5,48(%r14,%r13,1)
+ leaq 64(%r15),%r15
+ leaq 64(%r13),%r13
+ cmpq 16(%rsp),%r15
+ jb .Loop
+
+ movq 24(%rsp),%r12
+ subb %al,%cl
+ movl %r8d,0(%r12)
+ movl %r9d,4(%r12)
+ movl %r10d,8(%r12)
+ movl %r11d,12(%r12)
+ subb $1,%bpl
+ movl %ebp,-8(%rdi)
+ movl %ecx,-4(%rdi)
+
+ movq 40(%rsp),%r15
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r13
+ movq 64(%rsp),%r12
+ movq 72(%rsp),%rbp
+ movq 80(%rsp),%rbx
+ leaq 88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+ .byte 0xf3,0xc3
+.size rc4_md5_enc,.-rc4_md5_enc
diff --git a/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.pl b/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.pl
new file mode 100644
index 00000000..272fa91e
--- /dev/null
+++ b/main/openssl/crypto/rc4/asm/rc4-md5-x86_64.pl
@@ -0,0 +1,632 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# June 2011
+#
+# This is an RC4+MD5 "stitch" implementation. The idea, as spelled out in
+# http://download.intel.com/design/intarch/papers/323686.pdf, is that
+# since both algorithms exhibit instruction-level parallelism, ILP,
+# below the theoretical maximum, interleaving them allows better
+# utilization of processor resources and thus better performance. The RC4
+# instruction sequence is virtually identical to rc4-x86_64.pl, which
+# is heavily based on a submission by Maxim Perminov, Maxim Locktyukhin
+# and Jim Guilford of Intel. MD5 is a fresh implementation aiming to
+# minimize register usage; it is used as the "main thread", with RC4
+# woven into it, one RC4 round per MD5 round. In addition to the
+# stitched subroutine the script can generate standalone replacements
+# for md5_block_asm_data_order and RC4. Below are performance numbers in
+# cycles per processed byte, less is better, for the standalone
+# subroutines, their sum, and the stitched one:
+#
+#               RC4     MD5     RC4+MD5 stitch  gain
+# Opteron       6.5(*)  5.4     11.9    7.0     +70%(*)
+# Core2         6.5     5.8     12.3    7.7     +60%
+# Westmere      4.3     5.2     9.5     7.0     +36%
+# Sandy Bridge  4.2     5.5     9.7     6.8     +43%
+# Atom          9.3     6.5     15.8    11.1    +42%
+#
+# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so the real improvement
+#     is +53%...
+
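+# The RC4 half that gets woven into the MD5 rounds is, in plain-Perl
+# reference form, the classic PRGA step below (a sketch of ours for
+# illustration only, never called by this script; names are not
+# OpenSSL's). The stitched code performs one such step per MD5 step, so
+# one 64-round MD5 block transform also yields 64 bytes of key stream.
+sub rc4_ref_byte {
+    my ($S,$x,$y) = @_;              # $S: 256-entry array ref, $x/$y: refs to indices
+    $$x = ($$x+1)&0xff;
+    $$y = ($$y+$S->[$$x])&0xff;
+    @{$S}[$$x,$$y] = @{$S}[$$y,$$x]; # swap S[x] and S[y]
+    return $S->[($S->[$$x]+$S->[$$y])&0xff];
+}
+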
+my ($rc4,$md5)=(1,1); # what to generate?
+my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(),
+ # but its result is discarded. The idea here is
+ # to be able to use 'openssl speed rc4' for
+ # benchmarking the stitched subroutine...
+
+my $flavour = shift;
+my $output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
+
+if ($rc4 && !$md5) {
+ ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
+ $func="RC4"; $nargs=4;
+} elsif ($md5 && !$rc4) {
+ ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
+ $func="md5_block_asm_data_order"; $nargs=3;
+} else {
+ ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+ $func="rc4_md5_enc"; $nargs=6;
+ # void rc4_md5_enc(
+ # RC4_KEY *key, #
+ # const void *in0, # RC4 input
+ # void *out, # RC4 output
+ # MD5_CTX *ctx, #
+ # const void *inp, # MD5 input
+ # size_t len); # number of 64-byte blocks
+}
+
+my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
+ 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
+ 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
+ 0x6b901122,0xfd987193,0xa679438e,0x49b40821,
+
+ 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
+ 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
+ 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
+ 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
+
+ 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
+ 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
+ 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
+ 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
+
+ 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
+ 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
+ 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
+ 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 );
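+
+# (Note, ours: @K above is the standard MD5 T[] table from RFC 1321,
+# i.e. the integer part of abs(sin(i))*2^32 for i=1..64, and can be
+# regenerated for a sanity check as below; $K_check[0]==0xd76aa478.)
+my @K_check = map { int(abs(sin($_))*2**32) } (1..64);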
+
+my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers
+my $tmp="%r12d";
+
+my @XX=("%rbp","%rsi"); # RC4 registers
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+my $MOD=32; # 16, 32 or 64
+
+$code.=<<___;
+.text
+.align 16
+
+.globl $func
+.type $func,\@function,$nargs
+$func:
+ cmp \$0,$len
+ je .Labort
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$40,%rsp
+.Lbody:
+___
+if ($rc4) {
+$code.=<<___;
+$D#md5# mov $ctx,%r11 # reassign arguments
+ mov $len,%r12
+ mov $in0,%r13
+ mov $out,%r14
+$D#md5# mov $inp,%r15
+___
+ $ctx="%r11" if ($md5); # reassign arguments
+ $len="%r12";
+ $in0="%r13";
+ $out="%r14";
+ $inp="%r15" if ($md5);
+ $inp=$in0 if (!$md5);
+$code.=<<___;
+ xor $XX[0],$XX[0]
+ xor $YY,$YY
+
+ lea 8($dat),$dat
+ mov -8($dat),$XX[0]#b
+ mov -4($dat),$YY#b
+
+ inc $XX[0]#b
+ sub $in0,$out
+ movl ($dat,$XX[0],4),$TX[0]#d
+___
+$code.=<<___ if (!$md5);
+ xor $TX[1],$TX[1]
+ test \$-128,$len
+ jz .Loop1
+ sub $XX[0],$TX[1]
+ and \$`$MOD-1`,$TX[1]
+ jz .Loop${MOD}_is_hot
+ sub $TX[1],$len
+.Loop${MOD}_warmup:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($in0),$TY#b
+ movb $TY#b,($out,$in0)
+ lea 1($in0),$in0
+ dec $TX[1]
+ jnz .Loop${MOD}_warmup
+
+ mov $YY,$TX[1]
+ xor $YY,$YY
+ mov $TX[1]#b,$YY#b
+
+.Loop${MOD}_is_hot:
+ mov $len,32(%rsp) # save original $len
+ shr \$6,$len # number of 64-byte blocks
+___
+ if ($D && !$md5) { # stitch in dummy MD5
+ $md5=1;
+ $ctx="%r11";
+ $inp="%r15";
+ $code.=<<___;
+ mov %rsp,$ctx
+ mov $in0,$inp
+___
+ }
+}
+$code.=<<___;
+#rc4# add $TX[0]#b,$YY#b
+#rc4# lea ($dat,$XX[0],4),$XX[1]
+ shl \$6,$len
+ add $inp,$len # pointer to the end of input
+ mov $len,16(%rsp)
+
+#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX
+#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX
+#md5# mov 1*4($ctx),$V[1]
+#md5# mov 2*4($ctx),$V[2]
+#md5# mov 3*4($ctx),$V[3]
+ jmp .Loop
+
+.align 16
+.Loop:
+#md5# mov $V[0],0*4(%rsp) # put aside current hash value
+#md5# mov $V[1],1*4(%rsp)
+#md5# mov $V[2],2*4(%rsp)
+#md5# mov $V[3],$tmp # forward reference
+#md5# mov $V[3],3*4(%rsp)
+___
+
+sub R0 {
+ my ($i,$a,$b,$c,$d)=@_;
+ my @rot0=(7,12,17,22);
+ my $j=$i%16;
+ my $k=$i%$MOD;
+ my $xmm="%xmm".($j&1);
+ $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15);
+ $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
+ $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
+ $code.=<<___;
+#rc4# movl ($dat,$YY,4),$TY#d
+#md5# xor $c,$tmp
+#rc4# movl $TX[0]#d,($dat,$YY,4)
+#md5# and $b,$tmp
+#md5# add 4*`$j`($inp),$a
+#rc4# add $TY#b,$TX[0]#b
+#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5# add \$$K[$i],$a
+#md5# xor $d,$tmp
+#rc4# movz $TX[0]#b,$TX[0]#d
+#rc4# movl $TY#d,4*$k($XX[1])
+#md5# add $tmp,$a
+#rc4# add $TX[1]#b,$YY#b
+#md5# rol \$$rot0[$j%4],$a
+#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5# add $b,$a
+___
+ $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+ mov $YY,$XX[1]
+ xor $YY,$YY # keyword to partial register
+ mov $XX[1]#b,$YY#b
+ lea ($dat,$XX[0],4),$XX[1]
+___
+ $code.=<<___ if ($rc4 && $j==15);
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+___
+}
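+# (Note, ours: the xor/and/xor sequence on $tmp in R0 computes the MD5
+# round-1 function F(b,c,d) = (b&c)|(~b&d) in the register-saving
+# equivalent form ((c^d)&b)^d; $tmp enters preloaded with d.)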
+sub R1 {
+ my ($i,$a,$b,$c,$d)=@_;
+ my @rot1=(5,9,14,20);
+ my $j=$i%16;
+ my $k=$i%$MOD;
+ my $xmm="%xmm".($j&1);
+ $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15);
+ $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
+ $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
+ $code.=<<___;
+#rc4# movl ($dat,$YY,4),$TY#d
+#md5# xor $b,$tmp
+#rc4# movl $TX[0]#d,($dat,$YY,4)
+#md5# and $d,$tmp
+#md5# add 4*`((1+5*$j)%16)`($inp),$a
+#rc4# add $TY#b,$TX[0]#b
+#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5# add \$$K[$i],$a
+#md5# xor $c,$tmp
+#rc4# movz $TX[0]#b,$TX[0]#d
+#rc4# movl $TY#d,4*$k($XX[1])
+#md5# add $tmp,$a
+#rc4# add $TX[1]#b,$YY#b
+#md5# rol \$$rot1[$j%4],$a
+#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5# add $b,$a
+___
+ $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+ mov $YY,$XX[1]
+ xor $YY,$YY # keyword to partial register
+ mov $XX[1]#b,$YY#b
+ lea ($dat,$XX[0],4),$XX[1]
+___
+ $code.=<<___ if ($rc4 && $j==15);
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+___
+}
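+# (Note, ours: R1 computes the round-2 function G(b,c,d) = (b&d)|(c&~d)
+# as the equivalent ((b^c)&d)^c; $tmp enters preloaded with c.)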
+sub R2 {
+ my ($i,$a,$b,$c,$d)=@_;
+ my @rot2=(4,11,16,23);
+ my $j=$i%16;
+ my $k=$i%$MOD;
+ my $xmm="%xmm".($j&1);
+ $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15);
+ $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
+ $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
+ $code.=<<___;
+#rc4# movl ($dat,$YY,4),$TY#d
+#md5# xor $c,$tmp
+#rc4# movl $TX[0]#d,($dat,$YY,4)
+#md5# xor $b,$tmp
+#md5# add 4*`((5+3*$j)%16)`($inp),$a
+#rc4# add $TY#b,$TX[0]#b
+#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5# add \$$K[$i],$a
+#rc4# movz $TX[0]#b,$TX[0]#d
+#md5# add $tmp,$a
+#rc4# movl $TY#d,4*$k($XX[1])
+#rc4# add $TX[1]#b,$YY#b
+#md5# rol \$$rot2[$j%4],$a
+#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5# add $b,$a
+___
+ $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+ mov $YY,$XX[1]
+ xor $YY,$YY # keyword to partial register
+ mov $XX[1]#b,$YY#b
+ lea ($dat,$XX[0],4),$XX[1]
+___
+ $code.=<<___ if ($rc4 && $j==15);
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm4
+___
+}
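+# (Note, ours: R2 computes the round-3 function H(b,c,d) = b^c^d
+# directly; $tmp enters preloaded with d by the previous round.)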
+sub R3 {
+ my ($i,$a,$b,$c,$d)=@_;
+ my @rot3=(6,10,15,21);
+ my $j=$i%16;
+ my $k=$i%$MOD;
+ my $xmm="%xmm".($j&1);
+ $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15);
+ $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
+ $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
+ $code.=<<___;
+#rc4# movl ($dat,$YY,4),$TY#d
+#md5# xor $d,$tmp
+#rc4# movl $TX[0]#d,($dat,$YY,4)
+#md5# or $b,$tmp
+#md5# add 4*`((7*$j)%16)`($inp),$a
+#rc4# add $TY#b,$TX[0]#b
+#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5# add \$$K[$i],$a
+#rc4# movz $TX[0]#b,$TX[0]#d
+#md5# xor $c,$tmp
+#rc4# movl $TY#d,4*$k($XX[1])
+#md5# add $tmp,$a
+#rc4# add $TX[1]#b,$YY#b
+#md5# rol \$$rot3[$j%4],$a
+#md5# mov \$-1,$tmp # forward reference
+#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5# add $b,$a
+___
+ $code.=<<___ if ($rc4 && $j==15);
+ mov $XX[0],$XX[1]
+ xor $XX[0],$XX[0] # keyword to partial register
+ mov $XX[1]#b,$XX[0]#b
+ mov $YY,$XX[1]
+ xor $YY,$YY # keyword to partial register
+ mov $XX[1]#b,$YY#b
+ lea ($dat,$XX[0],4),$XX[1]
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm5
+ pxor %xmm1,%xmm5
+___
+}
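+# (Note, ours: R3 computes the round-4 function I(b,c,d) = c^(b|~d);
+# $tmp enters as -1, so the first xor yields ~d.)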
+
+my $i=0;
+for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+
+$code.=<<___;
+#md5# add 0*4(%rsp),$V[0] # accumulate hash value
+#md5# add 1*4(%rsp),$V[1]
+#md5# add 2*4(%rsp),$V[2]
+#md5# add 3*4(%rsp),$V[3]
+
+#rc4# movdqu %xmm2,($out,$in0) # write RC4 output
+#rc4# movdqu %xmm3,16($out,$in0)
+#rc4# movdqu %xmm4,32($out,$in0)
+#rc4# movdqu %xmm5,48($out,$in0)
+#md5# lea 64($inp),$inp
+#rc4# lea 64($in0),$in0
+ cmp 16(%rsp),$inp # are we done?
+ jb .Loop
+
+#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX
+#rc4# sub $TX[0]#b,$YY#b # correct $YY
+#md5# mov $V[0],0*4($len) # write MD5_CTX
+#md5# mov $V[1],1*4($len)
+#md5# mov $V[2],2*4($len)
+#md5# mov $V[3],3*4($len)
+___
+$code.=<<___ if ($rc4 && (!$md5 || $D));
+ mov 32(%rsp),$len # restore original $len
+ and \$63,$len # remaining bytes
+ jnz .Loop1
+ jmp .Ldone
+
+.align 16
+.Loop1:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($in0),$TY#b
+ movb $TY#b,($out,$in0)
+ lea 1($in0),$in0
+ dec $len
+ jnz .Loop1
+
+.Ldone:
+___
+$code.=<<___;
+#rc4# sub \$1,$XX[0]#b
+#rc4# movl $XX[0]#d,-8($dat)
+#rc4# movl $YY#d,-4($dat)
+
+ mov 40(%rsp),%r15
+ mov 48(%rsp),%r14
+ mov 56(%rsp),%r13
+ mov 64(%rsp),%r12
+ mov 72(%rsp),%rbp
+ mov 80(%rsp),%rbx
+ lea 88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+ ret
+.size $func,.-$func
+___
+
+if ($rc4 && $D) { # sole purpose of this section is to provide
+ # option to use the generated module as drop-in
+ # replacement for rc4-x86_64.pl for debugging
+ # and testing purposes...
+my ($idx,$ido)=("%r8","%r9");
+my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+.globl RC4_set_key
+.type RC4_set_key,\@function,3
+.align 16
+RC4_set_key:
+ lea 8($dat),$dat
+ lea ($inp,$len),$inp
+ neg $len
+ mov $len,%rcx
+ xor %eax,%eax
+ xor $ido,$ido
+ xor %r10,%r10
+ xor %r11,%r11
+ jmp .Lw1stloop
+
+.align 16
+.Lw1stloop:
+ mov %eax,($dat,%rax,4)
+ add \$1,%al
+ jnc .Lw1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lw2ndloop:
+ mov ($dat,$ido,4),%r10d
+ add ($inp,$len,1),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx,4),%r11d
+ cmovz %rcx,$len
+ mov %r10d,($dat,$idx,4)
+ mov %r11d,($dat,$ido,4)
+ add \$1,$ido#b
+ jnc .Lw2ndloop
+
+ xor %eax,%eax
+ mov %eax,-8($dat)
+ mov %eax,-4($dat)
+ ret
+.size RC4_set_key,.-RC4_set_key
+
+.globl RC4_options
+.type RC4_options,\@abi-omnipotent
+.align 16
+RC4_options:
+ lea .Lopts(%rip),%rax
+ ret
+.align 64
+.Lopts:
+.asciz "rc4(64x,int)"
+.align 64
+.size RC4_options,.-RC4_options
+___
+}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lbody(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lbody
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov 40(%rax),%r15
+ mov 48(%rax),%r14
+ mov 56(%rax),%r13
+ mov 64(%rax),%r12
+ mov 72(%rax),%rbp
+ mov 80(%rax),%rbx
+ lea 88(%rax),%rax
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_$func
+ .rva .LSEH_end_$func
+ .rva .LSEH_info_$func
+
+.section .xdata
+.align 8
+.LSEH_info_$func:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
+ return $reg;
+}
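+# (Examples, ours: reg_part("%r12","d") -> "%r12d",
+# reg_part("%eax","b") -> "%al", reg_part("%rbp","b") -> "%bpl";
+# this is what expands the "#b"/"#w"/"#d" suffixes used above.)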
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/pinsrw\s+\$0,/movd /gm;
+
+$code =~ s/#md5#//gm if ($md5);
+$code =~ s/#rc4#//gm if ($rc4);
+
+print $code;
+
+close STDOUT;
diff --git a/main/openssl/crypto/rc4/asm/rc4-parisc.pl b/main/openssl/crypto/rc4/asm/rc4-parisc.pl
new file mode 100644
index 00000000..ad7e6565
--- /dev/null
+++ b/main/openssl/crypto/rc4/asm/rc4-parisc.pl
@@ -0,0 +1,314 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# RC4 for PA-RISC.
+
+# June 2009.
+#
+# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
+# For reference, the [4x] unrolled loop is >40% faster than the folded
+# one. It's possible to unroll the loop 8 times on PA-RISC 2.0, but the
+# improvement is not believed to be sufficient to justify the effort...
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker
+ # [+ argument transfer]
+$SZ=1; # defaults to RC4_CHAR
+if (open CONF,"<${dir}../../opensslconf.h") {
+ while(<CONF>) {
+ if (m/#\s*define\s+RC4_INT\s+(.*)/) {
+ $SZ = ($1=~/char$/) ? 1 : 4;
+ last;
+ }
+ }
+ close CONF;
+}
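+# (e.g. a "#define RC4_INT unsigned char" line in opensslconf.h selects
+# $SZ=1, the RC4_CHAR flavour; any other RC4_INT definition selects
+# $SZ=4)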
+
+if ($SZ==1) { # RC4_CHAR
+ $LD="ldb";
+ $LDX="ldbx";
+ $MKX="addl";
+ $ST="stb";
+} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
+ $LD="ldw";
+ $LDX="ldwx,s";
+ $MKX="sh2addl";
+ $ST="stw";
+}
+
+$key="%r26";
+$len="%r25";
+$inp="%r24";
+$out="%r23";
+
+@XX=("%r19","%r20");
+@TX=("%r21","%r22");
+$YY="%r28";
+$TY="%r29";
+
+$acc="%r1";
+$ix="%r2";
+$iy="%r3";
+$dat0="%r4";
+$dat1="%r5";
+$rem="%r6";
+$mask="%r31";
+
+sub unrolledloopbody {
+for ($i=0;$i<4;$i++) {
+$code.=<<___;
+ ldo 1($XX[0]),$XX[1]
+ `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
+ and $mask,$XX[1],$XX[1]
+ $LDX $YY($key),$TY
+ $MKX $YY,$key,$ix
+ $LDX $XX[1]($key),$TX[1]
+ $MKX $XX[0],$key,$iy
+ $ST $TX[0],0($ix)
+ comclr,<> $XX[1],$YY,%r0 ; conditional
+ copy $TX[0],$TX[1] ; move
+ `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
+ $ST $TY,0($iy)
+ addl $TX[0],$TY,$TY
+ addl $TX[1],$YY,$YY
+ and $mask,$TY,$TY
+ and $mask,$YY,$YY
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
+} }
+
+sub foldedloop {
+my ($label,$count)=@_;
+$code.=<<___;
+$label
+ $MKX $YY,$key,$iy
+ $LDX $YY($key),$TY
+ $MKX $XX[0],$key,$ix
+ $ST $TX[0],0($iy)
+ ldo 1($XX[0]),$XX[0]
+ $ST $TY,0($ix)
+ addl $TX[0],$TY,$TY
+ ldbx $inp($out),$dat1
+ and $mask,$TY,$TY
+ and $mask,$XX[0],$XX[0]
+ $LDX $TY($key),$acc
+ $LDX $XX[0]($key),$TX[0]
+ ldo 1($out),$out
+ xor $dat1,$acc,$acc
+ addl $TX[0],$YY,$YY
+ stb $acc,-1($out)
+ addib,<> -1,$count,$label ; $count is always small
+ and $mask,$YY,$YY
+___
+}
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+RC4
+ .PROC
+ .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+
+ cmpib,*= 0,$len,L\$abort
+ sub $inp,$out,$inp ; distance between $inp and $out
+
+ $LD `0*$SZ`($key),$XX[0]
+ $LD `1*$SZ`($key),$YY
+ ldo `2*$SZ`($key),$key
+
+ ldi 0xff,$mask
+ ldi 3,$dat0
+
+ ldo 1($XX[0]),$XX[0] ; warm up loop
+ and $mask,$XX[0],$XX[0]
+ $LDX $XX[0]($key),$TX[0]
+ addl $TX[0],$YY,$YY
+ cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
+ and $mask,$YY,$YY
+
+ and,<> $out,$dat0,$rem ; is $out aligned?
+ b L\$alignedout
+ subi 4,$rem,$rem
+ sub $len,$rem,$len
+___
+&foldedloop("L\$alignout",$rem); # process till $out is aligned
+
+$code.=<<___;
+L\$alignedout ; $len is at least 4 here
+ and,<> $inp,$dat0,$acc ; is $inp aligned?
+ b L\$oop4
+ sub $inp,$acc,$rem ; align $inp
+
+ sh3addl $acc,%r0,$acc
+ subi 32,$acc,$acc
+ mtctl $acc,%cr11 ; load %sar with vshd align factor
+ ldwx $rem($out),$dat0
+ ldo 4($rem),$rem
+L\$oop4misalignedinp
+___
+&unrolledloopbody();
+$code.=<<___;
+ $LDX $TY($key),$ix
+ ldwx $rem($out),$dat1
+ ldo -4($len),$len
+ or $ix,$acc,$acc ; last piece, no need to dep
+ vshd $dat0,$dat1,$iy ; align data
+ copy $dat1,$dat0
+ xor $iy,$acc,$acc
+ stw $acc,0($out)
+ cmpib,*<< 3,$len,L\$oop4misalignedinp
+ ldo 4($out),$out
+ cmpib,*= 0,$len,L\$done
+ nop
+ b L\$oop1
+ nop
+
+ .ALIGN 8
+L\$oop4
+___
+&unrolledloopbody();
+$code.=<<___;
+ $LDX $TY($key),$ix
+ ldwx $inp($out),$dat0
+ ldo -4($len),$len
+ or $ix,$acc,$acc ; last piece, no need to dep
+ xor $dat0,$acc,$acc
+ stw $acc,0($out)
+ cmpib,*<< 3,$len,L\$oop4
+ ldo 4($out),$out
+ cmpib,*= 0,$len,L\$done
+ nop
+___
+&foldedloop("L\$oop1",$len);
+$code.=<<___;
+L\$done
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2
+ ldo -1($XX[0]),$XX[0] ; chill out loop
+ sub $YY,$TX[0],$YY
+ and $mask,$XX[0],$XX[0]
+ and $mask,$YY,$YY
+ $ST $XX[0],`-2*$SZ`($key)
+ $ST $YY,`-1*$SZ`($key)
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+L\$abort
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+___
+
+$code.=<<___;
+
+ .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 8
+private_RC4_set_key
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ $ST %r0,`0*$SZ`($key)
+ $ST %r0,`1*$SZ`($key)
+ ldo `2*$SZ`($key),$key
+ copy %r0,@XX[0]
+L\$1st
+ $ST @XX[0],0($key)
+ ldo 1(@XX[0]),@XX[0]
+ bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
+ ldo $SZ($key),$key
+
+ ldo `-256*$SZ`($key),$key ; rewind $key
+ addl $len,$inp,$inp ; $inp to point at the end
+ sub %r0,$len,%r23 ; inverse index
+ copy %r0,@XX[0]
+ copy %r0,@XX[1]
+ ldi 0xff,$mask
+
+L\$2nd
+ $LDX @XX[0]($key),@TX[0]
+ ldbx %r23($inp),@TX[1]
+ addi,nuv 1,%r23,%r23 ; increment and conditional
+ sub %r0,$len,%r23 ; inverse index
+ addl @TX[0],@XX[1],@XX[1]
+ addl @TX[1],@XX[1],@XX[1]
+ and $mask,@XX[1],@XX[1]
+ $MKX @XX[0],$key,$TY
+ $LDX @XX[1]($key),@TX[1]
+ $MKX @XX[1],$key,$YY
+ ldo 1(@XX[0]),@XX[0]
+ $ST @TX[0],0($YY)
+ bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
+ $ST @TX[1],0($TY)
+
+ bv,n (%r2)
+ .EXIT
+ nop
+ .PROCEND
+
+ .EXPORT RC4_options,ENTRY
+ .ALIGN 8
+RC4_options
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ blr %r0,%r28
+ ldi 3,%r1
+L\$pic
+ andcm %r28,%r1,%r28
+ bv (%r2)
+ .EXIT
+ ldo L\$opts-L\$pic(%r28),%r28
+ .PROCEND
+ .ALIGN 8
+L\$opts
+ .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
+ .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
+
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/rc4/asm/rc4-s390x.pl b/main/openssl/crypto/rc4/asm/rc4-s390x.pl
index 96681fa0..7528ece1 100644
--- a/main/openssl/crypto/rc4/asm/rc4-s390x.pl
+++ b/main/openssl/crypto/rc4/asm/rc4-s390x.pl
@@ -13,6 +13,29 @@
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports the "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in a 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 50% better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
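+# (e.g. flavour "linux31" or "linux32" selects the 31-bit ABI with
+# 4-byte stack slots and stm/lm, anything else the 64-bit ABI with
+# 8-byte slots and stmg/lmg)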
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$rp="%r14";
$sp="%r15";
$code=<<___;
@@ -39,7 +62,12 @@ $code.=<<___;
.type RC4,\@function
.align 64
RC4:
- stmg %r6,%r11,48($sp)
+ stm${g} %r6,%r11,6*$SIZE_T($sp)
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+ llgfr $len,$len
+___
+$code.=<<___;
llgc $XX[0],0($key)
llgc $YY,1($key)
la $XX[0],1($XX[0])
@@ -90,7 +118,7 @@ $code.=<<___;
xgr $acc,$TX[1]
stg $acc,0($out)
la $out,8($out)
- brct $cnt,.Loop8
+ brctg $cnt,.Loop8
.Lshort:
lghi $acc,7
@@ -122,7 +150,7 @@ $code.=<<___;
ahi $XX[0],-1
stc $XX[0],0($key)
stc $YY,1($key)
- lmg %r6,%r11,48($sp)
+ lm${g} %r6,%r11,6*$SIZE_T($sp)
br $rp
.size RC4,.-RC4
.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
@@ -143,11 +171,11 @@ $ikey="%r7";
$iinp="%r8";
$code.=<<___;
-.globl RC4_set_key
-.type RC4_set_key,\@function
+.globl private_RC4_set_key
+.type private_RC4_set_key,\@function
.align 64
-RC4_set_key:
- stmg %r6,%r8,48($sp)
+private_RC4_set_key:
+ stm${g} %r6,%r8,6*$SIZE_T($sp)
lhi $cnt,256
la $idx,0(%r0)
sth $idx,0($key)
@@ -180,9 +208,9 @@ RC4_set_key:
la $iinp,0(%r0)
j .L2ndloop
.Ldone:
- lmg %r6,%r8,48($sp)
+ lm${g} %r6,%r8,6*$SIZE_T($sp)
br $rp
-.size RC4_set_key,.-RC4_set_key
+.size private_RC4_set_key,.-private_RC4_set_key
___
}
@@ -203,3 +231,4 @@ RC4_options:
___
print $code;
+close STDOUT; # force flush
diff --git a/main/openssl/crypto/rc4/asm/rc4-x86_64.S b/main/openssl/crypto/rc4/asm/rc4-x86_64.S
new file mode 100644
index 00000000..af161582
--- /dev/null
+++ b/main/openssl/crypto/rc4/asm/rc4-x86_64.S
@@ -0,0 +1,615 @@
+.text
+
+
+.globl RC4
+.type RC4,@function
+.align 16
+RC4: orq %rsi,%rsi
+ jne .Lentry
+ .byte 0xf3,0xc3
+.Lentry:
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+.Lprologue:
+ movq %rsi,%r11
+ movq %rdx,%r12
+ movq %rcx,%r13
+ xorq %r10,%r10
+ xorq %rcx,%rcx
+
+ leaq 8(%rdi),%rdi
+ movb -8(%rdi),%r10b
+ movb -4(%rdi),%cl
+ cmpl $-1,256(%rdi)
+ je .LRC4_CHAR
+ movl OPENSSL_ia32cap_P(%rip),%r8d
+ xorq %rbx,%rbx
+ incb %r10b
+ subq %r10,%rbx
+ subq %r12,%r13
+ movl (%rdi,%r10,4),%eax
+ testq $-16,%r11
+ jz .Lloop1
+ btl $30,%r8d
+ jc .Lintel
+ andq $7,%rbx
+ leaq 1(%r10),%rsi
+ jz .Loop8
+ subq %rbx,%r11
+.Loop8_warmup:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r13,%r12,1)
+ leaq 1(%r12),%r12
+ decq %rbx
+ jnz .Loop8_warmup
+
+ leaq 1(%r10),%rsi
+ jmp .Loop8
+.align 16
+.Loop8:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 0(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,0(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 4(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,4(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 8(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,8(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 12(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,12(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 16(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,16(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 20(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,20(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 24(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,24(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb $8,%sil
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl -4(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,28(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb $8,%r10b
+ rorq $8,%r8
+ subq $8,%r11
+
+ xorq (%r12),%r8
+ movq %r8,(%r13,%r12,1)
+ leaq 8(%r12),%r12
+
+ testq $-8,%r11
+ jnz .Loop8
+ cmpq $0,%r11
+ jne .Lloop1
+ jmp .Lexit
+
+.align 16
+.Lintel:
+ testq $-32,%r11
+ jz .Lloop1
+ andq $15,%rbx
+ jz .Loop16_is_hot
+ subq %rbx,%r11
+.Loop16_warmup:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r13,%r12,1)
+ leaq 1(%r12),%r12
+ decq %rbx
+ jnz .Loop16_warmup
+
+ movq %rcx,%rbx
+ xorq %rcx,%rcx
+ movb %bl,%cl
+
+.Loop16_is_hot:
+ leaq (%rdi,%r10,4),%rsi
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm0,%xmm0
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ addb %bl,%cl
+ pinsrw $0,(%rdi,%rax,4),%xmm0
+ jmp .Loop16_enter
+.align 16
+.Loop16:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm0,%xmm2
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm0
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ pxor %xmm1,%xmm2
+ addb %bl,%cl
+ pinsrw $0,(%rdi,%rax,4),%xmm0
+ movdqu %xmm2,(%r13,%r12,1)
+ leaq 16(%r12),%r12
+.Loop16_enter:
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm1,%xmm1
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,4(%rsi)
+ addb %al,%cl
+ pinsrw $0,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,8(%rsi)
+ addb %bl,%cl
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,12(%rsi)
+ addb %al,%cl
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,16(%rsi)
+ addb %bl,%cl
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,20(%rsi)
+ addb %al,%cl
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,24(%rsi)
+ addb %bl,%cl
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,28(%rsi)
+ addb %al,%cl
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,32(%rsi)
+ addb %bl,%cl
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,36(%rsi)
+ addb %al,%cl
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,40(%rsi)
+ addb %bl,%cl
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,44(%rsi)
+ addb %al,%cl
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,48(%rsi)
+ addb %bl,%cl
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,52(%rsi)
+ addb %al,%cl
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,56(%rsi)
+ addb %bl,%cl
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+ addb $16,%r10b
+ movdqu (%r12),%xmm2
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movzbl %bl,%ebx
+ movl %edx,60(%rsi)
+ leaq (%rdi,%r10,4),%rsi
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+ movl (%rsi),%eax
+ movq %rcx,%rbx
+ xorq %rcx,%rcx
+ subq $16,%r11
+ movb %bl,%cl
+ testq $-16,%r11
+ jnz .Loop16
+
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,(%r13,%r12,1)
+ leaq 16(%r12),%r12
+
+ cmpq $0,%r11
+ jne .Lloop1
+ jmp .Lexit
+
+.align 16
+.Lloop1:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r13,%r12,1)
+ leaq 1(%r12),%r12
+ decq %r11
+ jnz .Lloop1
+ jmp .Lexit
+
+.align 16
+.LRC4_CHAR:
+ addb $1,%r10b
+ movzbl (%rdi,%r10,1),%eax
+ testq $-8,%r11
+ jz .Lcloop1
+ jmp .Lcloop8
+.align 16
+.Lcloop8:
+ movl (%r12),%r8d
+ movl 4(%r12),%r9d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne .Lcmov0
+ movq %rax,%rbx
+.Lcmov0:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne .Lcmov1
+ movq %rbx,%rax
+.Lcmov1:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne .Lcmov2
+ movq %rax,%rbx
+.Lcmov2:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne .Lcmov3
+ movq %rbx,%rax
+.Lcmov3:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne .Lcmov4
+ movq %rax,%rbx
+.Lcmov4:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne .Lcmov5
+ movq %rbx,%rax
+.Lcmov5:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne .Lcmov6
+ movq %rax,%rbx
+.Lcmov6:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne .Lcmov7
+ movq %rbx,%rax
+.Lcmov7:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ leaq -8(%r11),%r11
+ movl %r8d,(%r13)
+ leaq 8(%r12),%r12
+ movl %r9d,4(%r13)
+ leaq 8(%r13),%r13
+
+ testq $-8,%r11
+ jnz .Lcloop8
+ cmpq $0,%r11
+ jne .Lcloop1
+ jmp .Lexit
+.align 16
+.Lcloop1:
+ addb %al,%cl
+ movzbl %cl,%ecx
+ movzbl (%rdi,%rcx,1),%edx
+ movb %al,(%rdi,%rcx,1)
+ movb %dl,(%rdi,%r10,1)
+ addb %al,%dl
+ addb $1,%r10b
+ movzbl %dl,%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%rdx,1),%edx
+ movzbl (%rdi,%r10,1),%eax
+ xorb (%r12),%dl
+ leaq 1(%r12),%r12
+ movb %dl,(%r13)
+ leaq 1(%r13),%r13
+ subq $1,%r11
+ jnz .Lcloop1
+ jmp .Lexit
+
+.align 16
+.Lexit:
+ subb $1,%r10b
+ movl %r10d,-8(%rdi)
+ movl %ecx,-4(%rdi)
+
+ movq (%rsp),%r13
+ movq 8(%rsp),%r12
+ movq 16(%rsp),%rbx
+ addq $24,%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size RC4,.-RC4
+.globl private_RC4_set_key
+.type private_RC4_set_key,@function
+.align 16
+private_RC4_set_key:
+ leaq 8(%rdi),%rdi
+ leaq (%rdx,%rsi,1),%rdx
+ negq %rsi
+ movq %rsi,%rcx
+ xorl %eax,%eax
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+
+ movl OPENSSL_ia32cap_P(%rip),%r8d
+ btl $20,%r8d
+ jc .Lc1stloop
+ jmp .Lw1stloop
+
+.align 16
+.Lw1stloop:
+ movl %eax,(%rdi,%rax,4)
+ addb $1,%al
+ jnc .Lw1stloop
+
+ xorq %r9,%r9
+ xorq %r8,%r8
+.align 16
+.Lw2ndloop:
+ movl (%rdi,%r9,4),%r10d
+ addb (%rdx,%rsi,1),%r8b
+ addb %r10b,%r8b
+ addq $1,%rsi
+ movl (%rdi,%r8,4),%r11d
+ cmovzq %rcx,%rsi
+ movl %r10d,(%rdi,%r8,4)
+ movl %r11d,(%rdi,%r9,4)
+ addb $1,%r9b
+ jnc .Lw2ndloop
+ jmp .Lexit_key
+
+.align 16
+.Lc1stloop:
+ movb %al,(%rdi,%rax,1)
+ addb $1,%al
+ jnc .Lc1stloop
+
+ xorq %r9,%r9
+ xorq %r8,%r8
+.align 16
+.Lc2ndloop:
+ movb (%rdi,%r9,1),%r10b
+ addb (%rdx,%rsi,1),%r8b
+ addb %r10b,%r8b
+ addq $1,%rsi
+ movb (%rdi,%r8,1),%r11b
+ jnz .Lcnowrap
+ movq %rcx,%rsi
+.Lcnowrap:
+ movb %r10b,(%rdi,%r8,1)
+ movb %r11b,(%rdi,%r9,1)
+ addb $1,%r9b
+ jnc .Lc2ndloop
+ movl $-1,256(%rdi)
+
+.align 16
+.Lexit_key:
+ xorl %eax,%eax
+ movl %eax,-8(%rdi)
+ movl %eax,-4(%rdi)
+ .byte 0xf3,0xc3
+.size private_RC4_set_key,.-private_RC4_set_key
+
+.globl RC4_options
+.type RC4_options,@function
+.align 16
+RC4_options:
+ leaq .Lopts(%rip),%rax
+ movl OPENSSL_ia32cap_P(%rip),%edx
+ btl $20,%edx
+ jc .L8xchar
+ btl $30,%edx
+ jnc .Ldone
+ addq $25,%rax
+ .byte 0xf3,0xc3
+.L8xchar:
+ addq $12,%rax
+.Ldone:
+ .byte 0xf3,0xc3
+.align 64
+.Lopts:
+.byte 114,99,52,40,56,120,44,105,110,116,41,0
+.byte 114,99,52,40,56,120,44,99,104,97,114,41,0
+.byte 114,99,52,40,49,54,120,44,105,110,116,41,0
+.byte 82,67,52,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+.size RC4_options,.-RC4_options
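
For orientation, the unrolled .Loop8/.Loop16 paths above compute the standard RC4 PRGA step eight or sixteen times per iteration. A minimal C sketch of that single step (illustrative only, not the OpenSSL code; S is the 256-entry state, x and y the indices the assembly keeps in %r10 and %rcx):

    /* One RC4 keystream step: advance x, mix y, swap, emit. */
    static unsigned char rc4_step(unsigned char S[256],
                                  unsigned char *x, unsigned char *y)
    {
            unsigned char tx, ty;

            *x = (unsigned char)(*x + 1);
            tx = S[*x];
            *y = (unsigned char)(*y + tx);
            ty = S[*y];
            S[*y] = tx;                             /* swap S[x] and S[y] */
            S[*x] = ty;
            return S[(unsigned char)(tx + ty)];     /* keystream byte */
    }

Each keystream byte is XORed into the input; the 8x path collects eight of them in %r8 before a single 64-bit store, which is why %r8 is rotated by 8 between steps.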
diff --git a/main/openssl/crypto/rc4/asm/rc4-x86_64.pl b/main/openssl/crypto/rc4/asm/rc4-x86_64.pl
index 677be5fe..20722d3e 100755..100644
--- a/main/openssl/crypto/rc4/asm/rc4-x86_64.pl
+++ b/main/openssl/crypto/rc4/asm/rc4-x86_64.pl
@@ -7,6 +7,8 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
+# July 2004
+#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from config
@@ -19,6 +21,8 @@
# to operate on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...
+# November 2004
+#
# As was shown by Marc Bevand reordering of couple of load operations
# results in even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
@@ -26,6 +30,8 @@
# Latter means that if you want to *estimate* what to expect from
# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
+# November 2004
+#
# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 was to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
@@ -33,10 +39,14 @@
# on either AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...
+# April 2005
+#
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...
+# May 2005
+#
# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
# performance by >30% [unlike P4 32-bit case that is]. But this is
# provided that loads are reordered even more aggressively! Both code
@@ -46,10 +56,12 @@
# achieves respectful 432MBps on 2.8GHz processor now. For reference.
# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
# RC4_INT code-path. While if executed on Opteron, it's only 25%
-# slower than the RC4_INT one [meaning that if CPU µ-arch detection
+# slower than the RC4_INT one [meaning that if CPU µ-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
+# March 2007
+#
# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
@@ -58,6 +70,37 @@
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.
+# May 2010
+#
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
+# May 2011
+#
+# The only code path that was not modified is P4-specific one. Non-P4
+# Intel code path optimization is heavily based on submission by Maxim
+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
+# some of the ideas even in an attempt to optimize the original RC4_INT
+# code path... Current performance in cycles per processed byte (less
+# is better) and improvement coefficients relative to previous
+# version of this module are:
+#
+# Opteron 5.3/+0%(*)
+# P4 6.5
+# Core2 6.2/+15%(**)
+# Westmere 4.2/+60%
+# Sandy Bridge 4.2/+120%
+# Atom 9.3/+80%
+#
+# (*)  But corresponding loop has fewer instructions, which should have
+# positive effect on upcoming Bulldozer, which has one less ALU.
+# For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**) Note that Core2 result is ~15% lower than corresponding result
+# for 32-bit code, meaning that it's possible to improve it,
+# but more than likely at the cost of the others (see rc4-586.pl
+# to get the idea)...
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -69,20 +112,18 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
$inp="%rdx"; # arg3
$out="%rcx"; # arg4
-@XX=("%r8","%r10");
-@TX=("%r9","%r11");
-$YY="%r12";
-$TY="%r13";
-
+{
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl RC4
.type RC4,\@function,4
@@ -95,48 +136,173 @@ RC4: or $len,$len
push %r12
push %r13
.Lprologue:
+ mov $len,%r11
+ mov $inp,%r12
+ mov $out,%r13
+___
+my $len="%r11"; # reassign input arguments
+my $inp="%r12";
+my $out="%r13";
- add \$8,$dat
- movl -8($dat),$XX[0]#d
- movl -4($dat),$YY#d
+my @XX=("%r10","%rsi");
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+$code.=<<___;
+ xor $XX[0],$XX[0]
+ xor $YY,$YY
+
+ lea 8($dat),$dat
+ mov -8($dat),$XX[0]#b
+ mov -4($dat),$YY#b
cmpl \$-1,256($dat)
je .LRC4_CHAR
+ mov OPENSSL_ia32cap_P(%rip),%r8d
+ xor $TX[1],$TX[1]
inc $XX[0]#b
+ sub $XX[0],$TX[1]
+ sub $inp,$out
movl ($dat,$XX[0],4),$TX[0]#d
- test \$-8,$len
+ test \$-16,$len
jz .Lloop1
- jmp .Lloop8
+ bt \$30,%r8d # Intel CPU?
+ jc .Lintel
+ and \$7,$TX[1]
+ lea 1($XX[0]),$XX[1]
+ jz .Loop8
+ sub $TX[1],$len
+.Loop8_warmup:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($inp),$TY#b
+ movb $TY#b,($out,$inp)
+ lea 1($inp),$inp
+ dec $TX[1]
+ jnz .Loop8_warmup
+
+ lea 1($XX[0]),$XX[1]
+ jmp .Loop8
.align 16
-.Lloop8:
+.Loop8:
___
for ($i=0;$i<8;$i++) {
+$code.=<<___ if ($i==7);
+ add \$8,$XX[1]#b
+___
$code.=<<___;
add $TX[0]#b,$YY#b
- mov $XX[0],$XX[1]
movl ($dat,$YY,4),$TY#d
- ror \$8,%rax # ror is redundant when $i=0
- inc $XX[1]#b
- movl ($dat,$XX[1],4),$TX[1]#d
- cmp $XX[1],$YY
movl $TX[0]#d,($dat,$YY,4)
- cmove $TX[0],$TX[1]
- movl $TY#d,($dat,$XX[0],4)
+ movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
+ ror \$8,%r8 # ror is redundant when $i=0
+ movl $TY#d,4*$i($dat,$XX[0],4)
add $TX[0]#b,$TY#b
- movb ($dat,$TY,4),%al
+ movb ($dat,$TY,4),%r8b
___
-push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
+push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
}
$code.=<<___;
- ror \$8,%rax
+ add \$8,$XX[0]#b
+ ror \$8,%r8
sub \$8,$len
- xor ($inp),%rax
- add \$8,$inp
- mov %rax,($out)
- add \$8,$out
+ xor ($inp),%r8
+ mov %r8,($out,$inp)
+ lea 8($inp),$inp
test \$-8,$len
- jnz .Lloop8
+ jnz .Loop8
+ cmp \$0,$len
+ jne .Lloop1
+ jmp .Lexit
+
+.align 16
+.Lintel:
+ test \$-32,$len
+ jz .Lloop1
+ and \$15,$TX[1]
+ jz .Loop16_is_hot
+ sub $TX[1],$len
+.Loop16_warmup:
+ add $TX[0]#b,$YY#b
+ movl ($dat,$YY,4),$TY#d
+ movl $TX[0]#d,($dat,$YY,4)
+ movl $TY#d,($dat,$XX[0],4)
+ add $TY#b,$TX[0]#b
+ inc $XX[0]#b
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($inp),$TY#b
+ movb $TY#b,($out,$inp)
+ lea 1($inp),$inp
+ dec $TX[1]
+ jnz .Loop16_warmup
+
+ mov $YY,$TX[1]
+ xor $YY,$YY
+ mov $TX[1]#b,$YY#b
+
+.Loop16_is_hot:
+ lea ($dat,$XX[0],4),$XX[1]
+___
+sub RC4_loop {
+ my $i=shift;
+ my $j=$i<0?0:$i;
+ my $xmm="%xmm".($j&1);
+
+ $code.=" add \$16,$XX[0]#b\n" if ($i==15);
+ $code.=" movdqu ($inp),%xmm2\n" if ($i==15);
+ $code.=" add $TX[0]#b,$YY#b\n" if ($i<=0);
+ $code.=" movl ($dat,$YY,4),$TY#d\n";
+ $code.=" pxor %xmm0,%xmm2\n" if ($i==0);
+ $code.=" psllq \$8,%xmm1\n" if ($i==0);
+ $code.=" pxor $xmm,$xmm\n" if ($i<=1);
+ $code.=" movl $TX[0]#d,($dat,$YY,4)\n";
+ $code.=" add $TY#b,$TX[0]#b\n";
+ $code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15);
+ $code.=" movz $TX[0]#b,$TX[0]#d\n";
+ $code.=" movl $TY#d,4*$j($XX[1])\n";
+ $code.=" pxor %xmm1,%xmm2\n" if ($i==0);
+ $code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15);
+ $code.=" add $TX[1]#b,$YY#b\n" if ($i<15);
+ $code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
+ $code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0);
+ $code.=" lea 16($inp),$inp\n" if ($i==0);
+ $code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15);
+}
+ RC4_loop(-1);
+$code.=<<___;
+ jmp .Loop16_enter
+.align 16
+.Loop16:
+___
+
+for ($i=0;$i<16;$i++) {
+ $code.=".Loop16_enter:\n" if ($i==1);
+ RC4_loop($i);
+ push(@TX,shift(@TX)); # "rotate" registers
+}
+$code.=<<___;
+ mov $YY,$TX[1]
+ xor $YY,$YY # keyword to partial register
+ sub \$16,$len
+ mov $TX[1]#b,$YY#b
+ test \$-16,$len
+ jnz .Loop16
+
+ psllq \$8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,($out,$inp)
+ lea 16($inp),$inp
+
cmp \$0,$len
jne .Lloop1
jmp .Lexit
@@ -152,9 +318,8 @@ $code.=<<___;
movl ($dat,$TX[0],4),$TY#d
movl ($dat,$XX[0],4),$TX[0]#d
xorb ($inp),$TY#b
- inc $inp
- movb $TY#b,($out)
- inc $out
+ movb $TY#b,($out,$inp)
+ lea 1($inp),$inp
dec $len
jnz .Lloop1
jmp .Lexit
@@ -165,13 +330,11 @@ $code.=<<___;
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
- cmpl \$0,260($dat)
- jnz .Lcloop1
jmp .Lcloop8
.align 16
.Lcloop8:
- mov ($inp),%eax
- mov 4($inp),%ebx
+ mov ($inp),%r8d
+ mov 4($inp),%r9d
___
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
for ($i=0;$i<4;$i++) {
@@ -188,8 +351,8 @@ $code.=<<___;
mov $TX[0],$TX[1]
.Lcmov$i:
add $TX[0]#b,$TY#b
- xor ($dat,$TY),%al
- ror \$8,%eax
+ xor ($dat,$TY),%r8b
+ ror \$8,%r8d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
@@ -207,16 +370,16 @@ $code.=<<___;
mov $TX[0],$TX[1]
.Lcmov$i:
add $TX[0]#b,$TY#b
- xor ($dat,$TY),%bl
- ror \$8,%ebx
+ xor ($dat,$TY),%r9b
+ ror \$8,%r9d
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
$code.=<<___;
lea -8($len),$len
- mov %eax,($out)
+ mov %r8d,($out)
lea 8($inp),$inp
- mov %ebx,4($out)
+ mov %r9d,4($out)
lea 8($out),$out
test \$-8,$len
@@ -229,6 +392,7 @@ $code.=<<___;
.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
+ movzb $YY#b,$YY#d
movzb ($dat,$YY),$TY#d
movb $TX[0]#b,($dat,$YY)
movb $TY#b,($dat,$XX[0])
@@ -260,16 +424,16 @@ $code.=<<___;
ret
.size RC4,.-RC4
___
+}
$idx="%r8";
$ido="%r9";
$code.=<<___;
-.extern OPENSSL_ia32cap_P
-.globl RC4_set_key
-.type RC4_set_key,\@function,3
+.globl private_RC4_set_key
+.type private_RC4_set_key,\@function,3
.align 16
-RC4_set_key:
+private_RC4_set_key:
lea 8($dat),$dat
lea ($inp,$len),$inp
neg $len
@@ -280,12 +444,9 @@ RC4_set_key:
xor %r11,%r11
mov OPENSSL_ia32cap_P(%rip),$idx#d
- bt \$20,$idx#d
- jnc .Lw1stloop
- bt \$30,$idx#d
- setc $ido#b
- mov $ido#d,260($dat)
- jmp .Lc1stloop
+ bt \$20,$idx#d # RC4_CHAR?
+ jc .Lc1stloop
+ jmp .Lw1stloop
.align 16
.Lw1stloop:
@@ -339,7 +500,7 @@ RC4_set_key:
mov %eax,-8($dat)
mov %eax,-4($dat)
ret
-.size RC4_set_key,.-RC4_set_key
+.size private_RC4_set_key,.-private_RC4_set_key
.globl RC4_options
.type RC4_options,\@abi-omnipotent
@@ -348,18 +509,20 @@ RC4_options:
lea .Lopts(%rip),%rax
mov OPENSSL_ia32cap_P(%rip),%edx
bt \$20,%edx
- jnc .Ldone
- add \$12,%rax
+ jc .L8xchar
bt \$30,%edx
jnc .Ldone
- add \$13,%rax
+ add \$25,%rax
+ ret
+.L8xchar:
+ add \$12,%rax
.Ldone:
ret
.align 64
.Lopts:
.asciz "rc4(8x,int)"
.asciz "rc4(8x,char)"
-.asciz "rc4(1x,char)"
+.asciz "rc4(16x,int)"
.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
.size RC4_options,.-RC4_options
@@ -482,22 +645,32 @@ key_se_handler:
.rva .LSEH_end_RC4
.rva .LSEH_info_RC4
- .rva .LSEH_begin_RC4_set_key
- .rva .LSEH_end_RC4_set_key
- .rva .LSEH_info_RC4_set_key
+ .rva .LSEH_begin_private_RC4_set_key
+ .rva .LSEH_end_private_RC4_set_key
+ .rva .LSEH_info_private_RC4_set_key
.section .xdata
.align 8
.LSEH_info_RC4:
.byte 9,0,0,0
.rva stream_se_handler
-.LSEH_info_RC4_set_key:
+.LSEH_info_private_RC4_set_key:
.byte 9,0,0,0
.rva key_se_handler
___
}
-$code =~ s/#([bwd])/$1/gm;
+sub reg_part {
+my ($reg,$conv)=@_;
+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
+ return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
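
The reg_part() postprocessor above is what lets the perlasm source write %rax#b or %r8#d and still emit valid AT&T syntax: it maps a 64-bit register name plus suffix to the matching subregister. An illustrative mapping, expressed as a C comment:

    /* Substitutions performed by reg_part() (illustrative):
     *   %rax#b -> %al      %rax#w -> %ax      %rax#d -> %eax
     *   %r8#b  -> %r8b     %r8#w  -> %r8w     %r8#d  -> %r8d
     */

The numbered registers simply gain the suffix letter, while the legacy registers need the %al/%ax/%eax renames; those are exactly the two regex branches in the sub.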
diff --git a/main/openssl/crypto/rc4/rc4.h b/main/openssl/crypto/rc4/rc4.h
index 29d1accc..88ceb46b 100644
--- a/main/openssl/crypto/rc4/rc4.h
+++ b/main/openssl/crypto/rc4/rc4.h
@@ -79,6 +79,7 @@ typedef struct rc4_key_st
const char *RC4_options(void);
void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
unsigned char *outdata);
diff --git a/main/openssl/crypto/rc4/rc4_skey.c b/main/openssl/crypto/rc4/rc4_skey.c
index b22c40b0..fda27636 100644
--- a/main/openssl/crypto/rc4/rc4_skey.c
+++ b/main/openssl/crypto/rc4/rc4_skey.c
@@ -85,7 +85,7 @@ const char *RC4_options(void)
* Date: Wed, 14 Sep 1994 06:35:31 GMT
*/
-void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
{
register RC4_INT tmp;
register int id1,id2;
@@ -104,40 +104,6 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
d[(n)]=d[id2]; \
d[id2]=tmp; }
-#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
-# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
- defined(__INTEL__) || \
- defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
- if (sizeof(RC4_INT) > 1) {
- /*
- * Unlike all other x86 [and x86_64] implementations,
- * Intel P4 core [including EM64T] was found to perform
- * poorly with wider RC4_INT. Performance improvement
- * for IA-32 hand-coded assembler turned out to be 2.8x
- * if re-coded for RC4_CHAR! It's however inappropriate
- * to just switch to RC4_CHAR for x86[_64], as non-P4
- * implementations suffer from significant performance
- * losses then, e.g. PIII exhibits >2x deterioration,
- * and so does Opteron. In order to assure optimal
- * all-round performance, let us [try to] detect P4 at
- * run-time by checking upon HTT bit in CPU capability
- * vector and set up compressed key schedule, which is
- * recognized by correspondingly updated assembler
- * module...
- * <appro@fy.chalmers.se>
- */
- if (OPENSSL_ia32cap_P & (1<<28)) {
- unsigned char *cp=(unsigned char *)d;
-
- for (i=0;i<256;i++) cp[i]=i;
- for (i=0;i<256;i++) SK_LOOP(cp,i);
- /* mark schedule as compressed! */
- d[256/sizeof(RC4_INT)]=-1;
- return;
- }
- }
-# endif
-#endif
for (i=0; i < 256; i++) d[i]=i;
for (i=0; i < 256; i+=4)
{
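
With the C-level P4 detection removed, the choice between the int-sized and the compressed char-sized schedule now lives entirely in the assembly: private_RC4_set_key tests OPENSSL_ia32cap_P and, on the char path, still plants the -1 marker at offset 256 that the RC4 routine's cmpl $-1,256($dat) dispatch checks. A hedged sketch of the equivalent decision in C, for reference only:

    /* Illustrative only -- mirrors the bt $20 / movl $-1,256(%rdi)
     * sequence in the generated assembly above; treating bit 20 as
     * the P4-class indicator is an assumption about the ia32cap
     * vector layout. */
    extern unsigned int OPENSSL_ia32cap_P[2];

    static int use_char_schedule(void)
    {
            return (OPENSSL_ia32cap_P[0] >> 20) & 1;
    }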
diff --git a/main/openssl/crypto/rc4/rc4_utl.c b/main/openssl/crypto/rc4/rc4_utl.c
new file mode 100644
index 00000000..ab3f02fe
--- /dev/null
+++ b/main/openssl/crypto/rc4/rc4_utl.c
@@ -0,0 +1,62 @@
+/* crypto/rc4/rc4_utl.c -*- mode:C; c-file-style: "eay" -*- */
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
+#include <openssl/rc4.h>
+
+void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
+ {
+#ifdef OPENSSL_FIPS
+ fips_cipher_abort(RC4);
+#endif
+ private_RC4_set_key(key, len, data);
+ }
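
The new rc4_utl.c keeps the public entry point stable while routing the real work to private_RC4_set_key (the FIPS abort hook is compiled in only under OPENSSL_FIPS). Callers are unaffected; a minimal, hedged usage sketch with placeholder key bytes:

    #include <stddef.h>
    #include <openssl/rc4.h>

    /* Illustrative use of the unchanged public API; the key is a
     * throwaway demo value, not a recommendation. */
    void rc4_xor(unsigned char *out, const unsigned char *in, size_t len)
    {
            static const unsigned char k[16] = { 0x01, 0x02, 0x03, 0x04 };
            RC4_KEY key;

            RC4_set_key(&key, sizeof(k), k);   /* -> private_RC4_set_key */
            RC4(&key, len, in, out);           /* encrypt == decrypt */
    }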
diff --git a/main/openssl/crypto/rc4/rc4test.c b/main/openssl/crypto/rc4/rc4test.c
index 633a79e7..4312605c 100644
--- a/main/openssl/crypto/rc4/rc4test.c
+++ b/main/openssl/crypto/rc4/rc4test.c
@@ -120,6 +120,12 @@ int main(int argc, char *argv[])
RC4_KEY key;
unsigned char obuf[512];
+#if !defined(OPENSSL_PIC)
+ void OPENSSL_cpuid_setup(void);
+
+ OPENSSL_cpuid_setup();
+#endif
+
for (i=0; i<6; i++)
{
RC4_set_key(&key,keys[i][0],&(keys[i][1]));
diff --git a/main/openssl/crypto/ripemd/ripemd.h b/main/openssl/crypto/ripemd/ripemd.h
index 5942eb61..189bd8c9 100644
--- a/main/openssl/crypto/ripemd/ripemd.h
+++ b/main/openssl/crypto/ripemd/ripemd.h
@@ -91,6 +91,9 @@ typedef struct RIPEMD160state_st
unsigned int num;
} RIPEMD160_CTX;
+#ifdef OPENSSL_FIPS
+int private_RIPEMD160_Init(RIPEMD160_CTX *c);
+#endif
int RIPEMD160_Init(RIPEMD160_CTX *c);
int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len);
int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c);
diff --git a/main/openssl/crypto/ripemd/rmd_dgst.c b/main/openssl/crypto/ripemd/rmd_dgst.c
index 59b017f8..d8e72da5 100644
--- a/main/openssl/crypto/ripemd/rmd_dgst.c
+++ b/main/openssl/crypto/ripemd/rmd_dgst.c
@@ -59,6 +59,7 @@
#include <stdio.h>
#include "rmd_locl.h"
#include <openssl/opensslv.h>
+#include <openssl/crypto.h>
const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT;
@@ -69,7 +70,7 @@ const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT;
void ripemd160_block(RIPEMD160_CTX *c, unsigned long *p,size_t num);
# endif
-int RIPEMD160_Init(RIPEMD160_CTX *c)
+fips_md_init(RIPEMD160)
{
memset (c,0,sizeof(*c));
c->A=RIPEMD160_A;
@@ -104,21 +105,21 @@ void ripemd160_block_data_order (RIPEMD160_CTX *ctx, const void *p, size_t num)
A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E;
- HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
- RIP1(A,B,C,D,E,WL00,SL00); HOST_c2l(data,l); X( 2)=l;
- RIP1(E,A,B,C,D,WL01,SL01); HOST_c2l(data,l); X( 3)=l;
- RIP1(D,E,A,B,C,WL02,SL02); HOST_c2l(data,l); X( 4)=l;
- RIP1(C,D,E,A,B,WL03,SL03); HOST_c2l(data,l); X( 5)=l;
- RIP1(B,C,D,E,A,WL04,SL04); HOST_c2l(data,l); X( 6)=l;
- RIP1(A,B,C,D,E,WL05,SL05); HOST_c2l(data,l); X( 7)=l;
- RIP1(E,A,B,C,D,WL06,SL06); HOST_c2l(data,l); X( 8)=l;
- RIP1(D,E,A,B,C,WL07,SL07); HOST_c2l(data,l); X( 9)=l;
- RIP1(C,D,E,A,B,WL08,SL08); HOST_c2l(data,l); X(10)=l;
- RIP1(B,C,D,E,A,WL09,SL09); HOST_c2l(data,l); X(11)=l;
- RIP1(A,B,C,D,E,WL10,SL10); HOST_c2l(data,l); X(12)=l;
- RIP1(E,A,B,C,D,WL11,SL11); HOST_c2l(data,l); X(13)=l;
- RIP1(D,E,A,B,C,WL12,SL12); HOST_c2l(data,l); X(14)=l;
- RIP1(C,D,E,A,B,WL13,SL13); HOST_c2l(data,l); X(15)=l;
+ (void)HOST_c2l(data,l); X( 0)=l;(void)HOST_c2l(data,l); X( 1)=l;
+ RIP1(A,B,C,D,E,WL00,SL00); (void)HOST_c2l(data,l); X( 2)=l;
+ RIP1(E,A,B,C,D,WL01,SL01); (void)HOST_c2l(data,l); X( 3)=l;
+ RIP1(D,E,A,B,C,WL02,SL02); (void)HOST_c2l(data,l); X( 4)=l;
+ RIP1(C,D,E,A,B,WL03,SL03); (void)HOST_c2l(data,l); X( 5)=l;
+ RIP1(B,C,D,E,A,WL04,SL04); (void)HOST_c2l(data,l); X( 6)=l;
+ RIP1(A,B,C,D,E,WL05,SL05); (void)HOST_c2l(data,l); X( 7)=l;
+ RIP1(E,A,B,C,D,WL06,SL06); (void)HOST_c2l(data,l); X( 8)=l;
+ RIP1(D,E,A,B,C,WL07,SL07); (void)HOST_c2l(data,l); X( 9)=l;
+ RIP1(C,D,E,A,B,WL08,SL08); (void)HOST_c2l(data,l); X(10)=l;
+ RIP1(B,C,D,E,A,WL09,SL09); (void)HOST_c2l(data,l); X(11)=l;
+ RIP1(A,B,C,D,E,WL10,SL10); (void)HOST_c2l(data,l); X(12)=l;
+ RIP1(E,A,B,C,D,WL11,SL11); (void)HOST_c2l(data,l); X(13)=l;
+ RIP1(D,E,A,B,C,WL12,SL12); (void)HOST_c2l(data,l); X(14)=l;
+ RIP1(C,D,E,A,B,WL13,SL13); (void)HOST_c2l(data,l); X(15)=l;
RIP1(B,C,D,E,A,WL14,SL14);
RIP1(A,B,C,D,E,WL15,SL15);
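
The (void) casts added throughout the block loader look cosmetic but are presumably there to silence "value computed is not used" warnings: HOST_c2l/HOST_l2c are expression macros that both move the pointer and yield the decoded word. A hypothetical macro of the same shape shows why the cast is wanted when only the side effects matter:

    /* Hypothetical little-endian load in the HOST_c2l style: assigns
     * l, advances c, and evaluates to l -- hence the (void) when the
     * result is deliberately ignored. */
    #define GET_LE32(c, l)  ((l)  = ((unsigned long)(c)[0])       | \
                                    ((unsigned long)(c)[1] <<  8) | \
                                    ((unsigned long)(c)[2] << 16) | \
                                    ((unsigned long)(c)[3] << 24),  \
                             (c) += 4, (l))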
diff --git a/main/openssl/crypto/ripemd/rmd_locl.h b/main/openssl/crypto/ripemd/rmd_locl.h
index f14b346e..2bd8957d 100644
--- a/main/openssl/crypto/ripemd/rmd_locl.h
+++ b/main/openssl/crypto/ripemd/rmd_locl.h
@@ -88,11 +88,11 @@ void ripemd160_block_data_order (RIPEMD160_CTX *c, const void *p,size_t num);
#define HASH_FINAL RIPEMD160_Final
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
- ll=(c)->A; HOST_l2c(ll,(s)); \
- ll=(c)->B; HOST_l2c(ll,(s)); \
- ll=(c)->C; HOST_l2c(ll,(s)); \
- ll=(c)->D; HOST_l2c(ll,(s)); \
- ll=(c)->E; HOST_l2c(ll,(s)); \
+ ll=(c)->A; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->B; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->C; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->D; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->E; (void)HOST_l2c(ll,(s)); \
} while (0)
#define HASH_BLOCK_DATA_ORDER ripemd160_block_data_order
diff --git a/main/openssl/crypto/rsa/rsa.h b/main/openssl/crypto/rsa/rsa.h
index cf743436..5f269e57 100644
--- a/main/openssl/crypto/rsa/rsa.h
+++ b/main/openssl/crypto/rsa/rsa.h
@@ -222,12 +222,22 @@ struct rsa_st
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, EVP_PKEY_CTRL_RSA_PADDING, \
pad, NULL)
+#define EVP_PKEY_CTX_get_rsa_padding(ctx, ppad) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, \
+ EVP_PKEY_CTRL_GET_RSA_PADDING, 0, ppad)
+
#define EVP_PKEY_CTX_set_rsa_pss_saltlen(ctx, len) \
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
(EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
EVP_PKEY_CTRL_RSA_PSS_SALTLEN, \
len, NULL)
+#define EVP_PKEY_CTX_get_rsa_pss_saltlen(ctx, plen) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
+ (EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
+ EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN, \
+ 0, plen)
+
#define EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, bits) \
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
EVP_PKEY_CTRL_RSA_KEYGEN_BITS, bits, NULL)
@@ -236,11 +246,24 @@ struct rsa_st
EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, pubexp)
+#define EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, md) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG, \
+ EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void *)md)
+
+#define EVP_PKEY_CTX_get_rsa_mgf1_md(ctx, pmd) \
+ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG, \
+ EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void *)pmd)
+
#define EVP_PKEY_CTRL_RSA_PADDING (EVP_PKEY_ALG_CTRL + 1)
#define EVP_PKEY_CTRL_RSA_PSS_SALTLEN (EVP_PKEY_ALG_CTRL + 2)
#define EVP_PKEY_CTRL_RSA_KEYGEN_BITS (EVP_PKEY_ALG_CTRL + 3)
#define EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP (EVP_PKEY_ALG_CTRL + 4)
+#define EVP_PKEY_CTRL_RSA_MGF1_MD (EVP_PKEY_ALG_CTRL + 5)
+
+#define EVP_PKEY_CTRL_GET_RSA_PADDING (EVP_PKEY_ALG_CTRL + 6)
+#define EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN (EVP_PKEY_ALG_CTRL + 7)
+#define EVP_PKEY_CTRL_GET_RSA_MGF1_MD (EVP_PKEY_ALG_CTRL + 8)
#define RSA_PKCS1_PADDING 1
#define RSA_SSLV23_PADDING 2
@@ -257,7 +280,7 @@ struct rsa_st
RSA * RSA_new(void);
RSA * RSA_new_method(ENGINE *engine);
-int RSA_size(const RSA *);
+int RSA_size(const RSA *rsa);
/* Deprecated version */
#ifndef OPENSSL_NO_DEPRECATED
@@ -300,6 +323,16 @@ const RSA_METHOD *RSA_null_method(void);
DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPublicKey)
DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPrivateKey)
+typedef struct rsa_pss_params_st
+ {
+ X509_ALGOR *hashAlgorithm;
+ X509_ALGOR *maskGenAlgorithm;
+ ASN1_INTEGER *saltLength;
+ ASN1_INTEGER *trailerField;
+ } RSA_PSS_PARAMS;
+
+DECLARE_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
#ifndef OPENSSL_NO_FP_API
int RSA_print_fp(FILE *fp, const RSA *r,int offset);
#endif
@@ -380,6 +413,14 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
const unsigned char *mHash,
const EVP_MD *Hash, int sLen);
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+ const EVP_MD *Hash, const EVP_MD *mgf1Hash,
+ const unsigned char *EM, int sLen);
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+ const unsigned char *mHash,
+ const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen);
+
int RSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_new *new_func,
CRYPTO_EX_dup *dup_func, CRYPTO_EX_free *free_func);
int RSA_set_ex_data(RSA *r,int idx,void *arg);
@@ -388,6 +429,25 @@ void *RSA_get_ex_data(const RSA *r, int idx);
RSA *RSAPublicKey_dup(RSA *rsa);
RSA *RSAPrivateKey_dup(RSA *rsa);
+/* If this flag is set the RSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define RSA_FLAG_FIPS_METHOD 0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted; it is then the application's responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define RSA_FLAG_NON_FIPS_ALLOW 0x0400
+/* Application has decided PRNG is good enough to generate a key: don't
+ * check.
+ */
+#define RSA_FLAG_CHECKED 0x0800
+
/* BEGIN ERROR CODES */
/* The following lines are auto generated by the script mkerr.pl. Any changes
* made after this point may be overwritten when the script is next run.
@@ -405,6 +465,7 @@ void ERR_load_RSA_strings(void);
#define RSA_F_PKEY_RSA_CTRL 143
#define RSA_F_PKEY_RSA_CTRL_STR 144
#define RSA_F_PKEY_RSA_SIGN 142
+#define RSA_F_PKEY_RSA_VERIFY 154
#define RSA_F_PKEY_RSA_VERIFYRECOVER 141
#define RSA_F_RSA_BUILTIN_KEYGEN 129
#define RSA_F_RSA_CHECK_KEY 123
@@ -413,6 +474,8 @@ void ERR_load_RSA_strings(void);
#define RSA_F_RSA_EAY_PUBLIC_DECRYPT 103
#define RSA_F_RSA_EAY_PUBLIC_ENCRYPT 104
#define RSA_F_RSA_GENERATE_KEY 105
+#define RSA_F_RSA_GENERATE_KEY_EX 155
+#define RSA_F_RSA_ITEM_VERIFY 156
#define RSA_F_RSA_MEMORY_LOCK 130
#define RSA_F_RSA_NEW_METHOD 106
#define RSA_F_RSA_NULL 124
@@ -424,6 +487,7 @@ void ERR_load_RSA_strings(void);
#define RSA_F_RSA_PADDING_ADD_NONE 107
#define RSA_F_RSA_PADDING_ADD_PKCS1_OAEP 121
#define RSA_F_RSA_PADDING_ADD_PKCS1_PSS 125
+#define RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1 148
#define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1 108
#define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2 109
#define RSA_F_RSA_PADDING_ADD_SSLV23 110
@@ -436,8 +500,12 @@ void ERR_load_RSA_strings(void);
#define RSA_F_RSA_PADDING_CHECK_X931 128
#define RSA_F_RSA_PRINT 115
#define RSA_F_RSA_PRINT_FP 116
+#define RSA_F_RSA_PRIVATE_DECRYPT 150
+#define RSA_F_RSA_PRIVATE_ENCRYPT 151
#define RSA_F_RSA_PRIV_DECODE 137
#define RSA_F_RSA_PRIV_ENCODE 138
+#define RSA_F_RSA_PUBLIC_DECRYPT 152
+#define RSA_F_RSA_PUBLIC_ENCRYPT 153
#define RSA_F_RSA_PUB_DECODE 139
#define RSA_F_RSA_SETUP_BLINDING 136
#define RSA_F_RSA_SIGN 117
@@ -445,6 +513,7 @@ void ERR_load_RSA_strings(void);
#define RSA_F_RSA_VERIFY 119
#define RSA_F_RSA_VERIFY_ASN1_OCTET_STRING 120
#define RSA_F_RSA_VERIFY_PKCS1_PSS 126
+#define RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1 149
/* Reason codes. */
#define RSA_R_ALGORITHM_MISMATCH 100
@@ -470,19 +539,24 @@ void ERR_load_RSA_strings(void);
#define RSA_R_INVALID_HEADER 137
#define RSA_R_INVALID_KEYBITS 145
#define RSA_R_INVALID_MESSAGE_LENGTH 131
+#define RSA_R_INVALID_MGF1_MD 156
#define RSA_R_INVALID_PADDING 138
#define RSA_R_INVALID_PADDING_MODE 141
+#define RSA_R_INVALID_PSS_PARAMETERS 149
#define RSA_R_INVALID_PSS_SALTLEN 146
+#define RSA_R_INVALID_SALT_LENGTH 150
#define RSA_R_INVALID_TRAILER 139
#define RSA_R_INVALID_X931_DIGEST 142
#define RSA_R_IQMP_NOT_INVERSE_OF_Q 126
#define RSA_R_KEY_SIZE_TOO_SMALL 120
#define RSA_R_LAST_OCTET_INVALID 134
#define RSA_R_MODULUS_TOO_LARGE 105
+#define RSA_R_NON_FIPS_RSA_METHOD 157
#define RSA_R_NO_PUBLIC_EXPONENT 140
#define RSA_R_NULL_BEFORE_BLOCK_MISSING 113
#define RSA_R_N_DOES_NOT_EQUAL_P_Q 127
#define RSA_R_OAEP_DECODING_ERROR 121
+#define RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE 158
#define RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE 148
#define RSA_R_PADDING_CHECK_FAILED 114
#define RSA_R_P_NOT_PRIME 128
@@ -493,7 +567,12 @@ void ERR_load_RSA_strings(void);
#define RSA_R_SSLV3_ROLLBACK_ATTACK 115
#define RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD 116
#define RSA_R_UNKNOWN_ALGORITHM_TYPE 117
+#define RSA_R_UNKNOWN_MASK_DIGEST 151
#define RSA_R_UNKNOWN_PADDING_TYPE 118
+#define RSA_R_UNKNOWN_PSS_DIGEST 152
+#define RSA_R_UNSUPPORTED_MASK_ALGORITHM 153
+#define RSA_R_UNSUPPORTED_MASK_PARAMETER 154
+#define RSA_R_UNSUPPORTED_SIGNATURE_TYPE 155
#define RSA_R_VALUE_MISSING 147
#define RSA_R_WRONG_SIGNATURE_LENGTH 119
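
The new set/get ctrl macros expose RSA-PSS configuration at the EVP_PKEY_CTX level; the paired ..._get_... variants are what rsa_item_sign() below uses to read the context back when encoding parameters. A hedged sketch of configuring a PSS signing context with the macros added above (error handling trimmed; pkey is assumed to hold an RSA key):

    #include <openssl/evp.h>
    #include <openssl/rsa.h>

    int pss_digest_sign_init(EVP_MD_CTX *mctx, EVP_PKEY *pkey)
    {
            EVP_PKEY_CTX *pctx = NULL;

            if (!EVP_DigestSignInit(mctx, &pctx, EVP_sha256(), NULL, pkey))
                    return 0;
            if (EVP_PKEY_CTX_set_rsa_padding(pctx, RSA_PKCS1_PSS_PADDING) <= 0)
                    return 0;
            /* -1 selects a salt as long as the digest, cf. rsa_item_sign() */
            if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pctx, -1) <= 0)
                    return 0;
            if (EVP_PKEY_CTX_set_rsa_mgf1_md(pctx, EVP_sha256()) <= 0)
                    return 0;
            return 1;
    }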
diff --git a/main/openssl/crypto/rsa/rsa_ameth.c b/main/openssl/crypto/rsa/rsa_ameth.c
index 8c320988..5a2062f9 100644
--- a/main/openssl/crypto/rsa/rsa_ameth.c
+++ b/main/openssl/crypto/rsa/rsa_ameth.c
@@ -265,6 +265,147 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent,
return do_rsa_print(bp, pkey->pkey.rsa, indent, 1);
}
+static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg,
+ X509_ALGOR **pmaskHash)
+ {
+ const unsigned char *p;
+ int plen;
+ RSA_PSS_PARAMS *pss;
+
+ *pmaskHash = NULL;
+
+ if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE)
+ return NULL;
+ p = alg->parameter->value.sequence->data;
+ plen = alg->parameter->value.sequence->length;
+ pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen);
+
+ if (!pss)
+ return NULL;
+
+ if (pss->maskGenAlgorithm)
+ {
+ ASN1_TYPE *param = pss->maskGenAlgorithm->parameter;
+ if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1
+ && param->type == V_ASN1_SEQUENCE)
+ {
+ p = param->value.sequence->data;
+ plen = param->value.sequence->length;
+ *pmaskHash = d2i_X509_ALGOR(NULL, &p, plen);
+ }
+ }
+
+ return pss;
+ }
+
+static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss,
+ X509_ALGOR *maskHash, int indent)
+ {
+ int rv = 0;
+ if (!pss)
+ {
+ if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0)
+ return 0;
+ return 1;
+ }
+ if (BIO_puts(bp, "\n") <= 0)
+ goto err;
+ if (!BIO_indent(bp, indent, 128))
+ goto err;
+ if (BIO_puts(bp, "Hash Algorithm: ") <= 0)
+ goto err;
+
+ if (pss->hashAlgorithm)
+ {
+ if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "sha1 (default)") <= 0)
+ goto err;
+
+ if (BIO_puts(bp, "\n") <= 0)
+ goto err;
+
+ if (!BIO_indent(bp, indent, 128))
+ goto err;
+
+ if (BIO_puts(bp, "Mask Algorithm: ") <= 0)
+ goto err;
+ if (pss->maskGenAlgorithm)
+ {
+ if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0)
+ goto err;
+ if (BIO_puts(bp, " with ") <= 0)
+ goto err;
+ if (maskHash)
+ {
+ if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "INVALID") <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0)
+ goto err;
+ BIO_puts(bp, "\n");
+
+ if (!BIO_indent(bp, indent, 128))
+ goto err;
+ if (BIO_puts(bp, "Salt Length: 0x") <= 0)
+ goto err;
+ if (pss->saltLength)
+ {
+ if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "0x14 (default)") <= 0)
+ goto err;
+ BIO_puts(bp, "\n");
+
+ if (!BIO_indent(bp, indent, 128))
+ goto err;
+ if (BIO_puts(bp, "Trailer Field: 0x") <= 0)
+ goto err;
+ if (pss->trailerField)
+ {
+ if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0)
+ goto err;
+ }
+ else if (BIO_puts(bp, "BC (default)") <= 0)
+ goto err;
+ BIO_puts(bp, "\n");
+
+ rv = 1;
+
+ err:
+ return rv;
+
+ }
+
+static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+ const ASN1_STRING *sig,
+ int indent, ASN1_PCTX *pctx)
+ {
+ if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss)
+ {
+ int rv;
+ RSA_PSS_PARAMS *pss;
+ X509_ALGOR *maskHash;
+ pss = rsa_pss_decode(sigalg, &maskHash);
+ rv = rsa_pss_param_print(bp, pss, maskHash, indent);
+ if (pss)
+ RSA_PSS_PARAMS_free(pss);
+ if (maskHash)
+ X509_ALGOR_free(maskHash);
+ if (!rv)
+ return 0;
+ }
+ else if (!sig && BIO_puts(bp, "\n") <= 0)
+ return 0;
+ if (sig)
+ return X509_signature_dump(bp, sig, indent);
+ return 1;
+ }
static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
{
@@ -310,6 +451,211 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
}
+/* Customised RSA item verification routine. This is called
+ * when a signature is encountered requiring special handling. We
+ * currently only handle PSS.
+ */
+
+
+static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *sigalg, ASN1_BIT_STRING *sig,
+ EVP_PKEY *pkey)
+ {
+ int rv = -1;
+ int saltlen;
+ const EVP_MD *mgf1md = NULL, *md = NULL;
+ RSA_PSS_PARAMS *pss;
+ X509_ALGOR *maskHash;
+ EVP_PKEY_CTX *pkctx;
+ /* Sanity check: make sure it is PSS */
+ if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE);
+ return -1;
+ }
+ /* Decode PSS parameters */
+ pss = rsa_pss_decode(sigalg, &maskHash);
+
+ if (pss == NULL)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS);
+ goto err;
+ }
+ /* Check mask and lookup mask hash algorithm */
+ if (pss->maskGenAlgorithm)
+ {
+ if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM);
+ goto err;
+ }
+ if (!maskHash)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER);
+ goto err;
+ }
+ mgf1md = EVP_get_digestbyobj(maskHash->algorithm);
+ if (mgf1md == NULL)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST);
+ goto err;
+ }
+ }
+ else
+ mgf1md = EVP_sha1();
+
+ if (pss->hashAlgorithm)
+ {
+ md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm);
+ if (md == NULL)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST);
+ goto err;
+ }
+ }
+ else
+ md = EVP_sha1();
+
+ if (pss->saltLength)
+ {
+ saltlen = ASN1_INTEGER_get(pss->saltLength);
+
+ /* Could perform more salt length sanity checks but the main
+ * RSA routines will trap other invalid values anyway.
+ */
+ if (saltlen < 0)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH);
+ goto err;
+ }
+ }
+ else
+ saltlen = 20;
+
+ /* low-level routines support only trailer field 0xbc (value 1)
+ * and PKCS#1 says we should reject any other value anyway.
+ */
+ if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1)
+ {
+ RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER);
+ goto err;
+ }
+
+ /* We have all parameters now set up context */
+
+ if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey))
+ goto err;
+
+ if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0)
+ goto err;
+
+ if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0)
+ goto err;
+ /* Carry on */
+ rv = 2;
+
+ err:
+ RSA_PSS_PARAMS_free(pss);
+ if (maskHash)
+ X509_ALGOR_free(maskHash);
+ return rv;
+ }
+
+static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+ X509_ALGOR *alg1, X509_ALGOR *alg2,
+ ASN1_BIT_STRING *sig)
+ {
+ int pad_mode;
+ EVP_PKEY_CTX *pkctx = ctx->pctx;
+ if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0)
+ return 0;
+ if (pad_mode == RSA_PKCS1_PADDING)
+ return 2;
+ if (pad_mode == RSA_PKCS1_PSS_PADDING)
+ {
+ const EVP_MD *sigmd, *mgf1md;
+ RSA_PSS_PARAMS *pss = NULL;
+ X509_ALGOR *mgf1alg = NULL;
+ ASN1_STRING *os1 = NULL, *os2 = NULL;
+ EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx);
+ int saltlen, rv = 0;
+ sigmd = EVP_MD_CTX_md(ctx);
+ if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0)
+ goto err;
+ if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen))
+ goto err;
+ if (saltlen == -1)
+ saltlen = EVP_MD_size(sigmd);
+ else if (saltlen == -2)
+ {
+ saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2;
+ if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0)
+ saltlen--;
+ }
+ pss = RSA_PSS_PARAMS_new();
+ if (!pss)
+ goto err;
+ if (saltlen != 20)
+ {
+ pss->saltLength = ASN1_INTEGER_new();
+ if (!pss->saltLength)
+ goto err;
+ if (!ASN1_INTEGER_set(pss->saltLength, saltlen))
+ goto err;
+ }
+ if (EVP_MD_type(sigmd) != NID_sha1)
+ {
+ pss->hashAlgorithm = X509_ALGOR_new();
+ if (!pss->hashAlgorithm)
+ goto err;
+ X509_ALGOR_set_md(pss->hashAlgorithm, sigmd);
+ }
+ if (EVP_MD_type(mgf1md) != NID_sha1)
+ {
+ ASN1_STRING *stmp = NULL;
+ /* need to embed algorithm ID inside another */
+ mgf1alg = X509_ALGOR_new();
+ X509_ALGOR_set_md(mgf1alg, mgf1md);
+ if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR),
+ &stmp))
+ goto err;
+ pss->maskGenAlgorithm = X509_ALGOR_new();
+ if (!pss->maskGenAlgorithm)
+ goto err;
+ X509_ALGOR_set0(pss->maskGenAlgorithm,
+ OBJ_nid2obj(NID_mgf1),
+ V_ASN1_SEQUENCE, stmp);
+ }
+ /* Finally create string with pss parameter encoding. */
+ if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1))
+ goto err;
+ if (alg2)
+ {
+ os2 = ASN1_STRING_dup(os1);
+ if (!os2)
+ goto err;
+ X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss),
+ V_ASN1_SEQUENCE, os2);
+ }
+ X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss),
+ V_ASN1_SEQUENCE, os1);
+ os1 = os2 = NULL;
+ rv = 3;
+ err:
+ if (mgf1alg)
+ X509_ALGOR_free(mgf1alg);
+ if (pss)
+ RSA_PSS_PARAMS_free(pss);
+ if (os1)
+ ASN1_STRING_free(os1);
+ return rv;
+
+ }
+ return 2;
+ }
const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] =
{
@@ -335,10 +681,13 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] =
0,0,0,0,0,0,
+ rsa_sig_print,
int_rsa_free,
rsa_pkey_ctrl,
old_rsa_priv_decode,
- old_rsa_priv_encode
+ old_rsa_priv_encode,
+ rsa_item_verify,
+ rsa_item_sign
},
{
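
The division of labour between these handlers and the generic ASN.1 signing code is driven by their return values, spelled out by the comment in crypto/asn1/a_sign.c:

    /* Return-code protocol with ASN1_item_sign_ctx()/item_verify:
     *   <= 0  error
     *      1  method did everything itself
     *      2  carry on with the default signing/verification flow
     *      3  (sign only) algorithm identifiers already set, just sign
     * Hence rsa_item_verify() returns 2 after loading the decoded PSS
     * parameters into the EVP_PKEY_CTX, and rsa_item_sign() returns 3
     * once it has serialised them into alg1/alg2 (or 2 for plain
     * PKCS#1 padding, where the default encoding suffices). */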
diff --git a/main/openssl/crypto/rsa/rsa_asn1.c b/main/openssl/crypto/rsa/rsa_asn1.c
index 4efca8cd..6ed5de3d 100644
--- a/main/openssl/crypto/rsa/rsa_asn1.c
+++ b/main/openssl/crypto/rsa/rsa_asn1.c
@@ -60,6 +60,7 @@
#include "cryptlib.h"
#include <openssl/bn.h>
#include <openssl/rsa.h>
+#include <openssl/x509.h>
#include <openssl/asn1t.h>
/* Override the default free and new methods */
@@ -96,6 +97,15 @@ ASN1_SEQUENCE_cb(RSAPublicKey, rsa_cb) = {
ASN1_SIMPLE(RSA, e, BIGNUM),
} ASN1_SEQUENCE_END_cb(RSA, RSAPublicKey)
+ASN1_SEQUENCE(RSA_PSS_PARAMS) = {
+ ASN1_EXP_OPT(RSA_PSS_PARAMS, hashAlgorithm, X509_ALGOR,0),
+ ASN1_EXP_OPT(RSA_PSS_PARAMS, maskGenAlgorithm, X509_ALGOR,1),
+ ASN1_EXP_OPT(RSA_PSS_PARAMS, saltLength, ASN1_INTEGER,2),
+ ASN1_EXP_OPT(RSA_PSS_PARAMS, trailerField, ASN1_INTEGER,3)
+} ASN1_SEQUENCE_END(RSA_PSS_PARAMS)
+
+IMPLEMENT_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPrivateKey, RSAPrivateKey)
IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPublicKey, RSAPublicKey)
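
The ASN1_SEQUENCE template above gives RSA_PSS_PARAMS the four optional EXPLICIT fields of RSASSA-PSS-params, and IMPLEMENT_ASN1_FUNCTIONS generates the usual new/free/d2i/i2d set. A hedged sketch of decoding the DER parameters carried in an AlgorithmIdentifier, mirroring rsa_pss_decode():

    #include <openssl/rsa.h>
    #include <openssl/x509.h>

    /* Illustrative only: absent fields mean the RFC defaults
     * (sha1, mgf1-with-sha1, saltLength 20, trailerField 1). */
    RSA_PSS_PARAMS *decode_pss(const unsigned char *der, long derlen)
    {
            const unsigned char *p = der;   /* d2i advances the pointer */
            return d2i_RSA_PSS_PARAMS(NULL, &p, derlen);
    }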
diff --git a/main/openssl/crypto/rsa/rsa_chk.c b/main/openssl/crypto/rsa/rsa_chk.c
index 9d848db8..cc30e771 100644
--- a/main/openssl/crypto/rsa/rsa_chk.c
+++ b/main/openssl/crypto/rsa/rsa_chk.c
@@ -59,6 +59,12 @@ int RSA_check_key(const RSA *key)
BN_CTX *ctx;
int r;
int ret=1;
+
+ if (!key->p || !key->q || !key->n || !key->e || !key->d)
+ {
+ RSAerr(RSA_F_RSA_CHECK_KEY, RSA_R_VALUE_MISSING);
+ return 0;
+ }
i = BN_new();
j = BN_new();
diff --git a/main/openssl/crypto/rsa/rsa_crpt.c b/main/openssl/crypto/rsa/rsa_crpt.c
new file mode 100644
index 00000000..d3e44785
--- /dev/null
+++ b/main/openssl/crypto/rsa/rsa_crpt.c
@@ -0,0 +1,257 @@
+/* crypto/rsa/rsa_lib.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include <openssl/crypto.h>
+#include "cryptlib.h"
+#include <openssl/lhash.h>
+#include <openssl/bn.h>
+#include <openssl/rsa.h>
+#include <openssl/rand.h>
+#ifndef OPENSSL_NO_ENGINE
+#include <openssl/engine.h>
+#endif
+
+int RSA_size(const RSA *r)
+ {
+ return(BN_num_bytes(r->n));
+ }
+
+int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to,
+ RSA *rsa, int padding)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+ return -1;
+ }
+#endif
+ return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding));
+ }
+
+int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to,
+ RSA *rsa, int padding)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+ return -1;
+ }
+#endif
+ return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
+ }
+
+int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to,
+ RSA *rsa, int padding)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+ return -1;
+ }
+#endif
+ return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding));
+ }
+
+int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
+ RSA *rsa, int padding)
+ {
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+ return -1;
+ }
+#endif
+ return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
+ }
+
+int RSA_flags(const RSA *r)
+ {
+ return((r == NULL)?0:r->meth->flags);
+ }
+
+void RSA_blinding_off(RSA *rsa)
+ {
+ if (rsa->blinding != NULL)
+ {
+ BN_BLINDING_free(rsa->blinding);
+ rsa->blinding=NULL;
+ }
+ rsa->flags &= ~RSA_FLAG_BLINDING;
+ rsa->flags |= RSA_FLAG_NO_BLINDING;
+ }
+
+int RSA_blinding_on(RSA *rsa, BN_CTX *ctx)
+ {
+ int ret=0;
+
+ if (rsa->blinding != NULL)
+ RSA_blinding_off(rsa);
+
+ rsa->blinding = RSA_setup_blinding(rsa, ctx);
+ if (rsa->blinding == NULL)
+ goto err;
+
+ rsa->flags |= RSA_FLAG_BLINDING;
+ rsa->flags &= ~RSA_FLAG_NO_BLINDING;
+ ret=1;
+err:
+ return(ret);
+ }
+
+static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p,
+ const BIGNUM *q, BN_CTX *ctx)
+{
+ BIGNUM *ret = NULL, *r0, *r1, *r2;
+
+ if (d == NULL || p == NULL || q == NULL)
+ return NULL;
+
+ BN_CTX_start(ctx);
+ r0 = BN_CTX_get(ctx);
+ r1 = BN_CTX_get(ctx);
+ r2 = BN_CTX_get(ctx);
+ if (r2 == NULL)
+ goto err;
+
+ if (!BN_sub(r1, p, BN_value_one())) goto err;
+ if (!BN_sub(r2, q, BN_value_one())) goto err;
+ if (!BN_mul(r0, r1, r2, ctx)) goto err;
+
+ ret = BN_mod_inverse(NULL, d, r0, ctx);
+err:
+ BN_CTX_end(ctx);
+ return ret;
+}
+
+BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx)
+{
+ BIGNUM local_n;
+ BIGNUM *e,*n;
+ BN_CTX *ctx;
+ BN_BLINDING *ret = NULL;
+
+ if (in_ctx == NULL)
+ {
+ if ((ctx = BN_CTX_new()) == NULL) return 0;
+ }
+ else
+ ctx = in_ctx;
+
+ BN_CTX_start(ctx);
+ e = BN_CTX_get(ctx);
+ if (e == NULL)
+ {
+ RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+
+ if (rsa->e == NULL)
+ {
+ e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx);
+ if (e == NULL)
+ {
+ RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT);
+ goto err;
+ }
+ }
+ else
+ e = rsa->e;
+
+
+ if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL)
+ {
+ /* if PRNG is not properly seeded, resort to secret
+ * exponent as unpredictable seed */
+ RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0);
+ }
+
+ if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME))
+ {
+ /* Set BN_FLG_CONSTTIME flag */
+ n = &local_n;
+ BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME);
+ }
+ else
+ n = rsa->n;
+
+ ret = BN_BLINDING_create_param(NULL, e, n, ctx,
+ rsa->meth->bn_mod_exp, rsa->_method_mod_n);
+ if (ret == NULL)
+ {
+ RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB);
+ goto err;
+ }
+ CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret));
+err:
+ BN_CTX_end(ctx);
+ if (in_ctx == NULL)
+ BN_CTX_free(ctx);
+ if(rsa->e == NULL)
+ BN_free(e);
+
+ return ret;
+}
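
rsa_get_public_exp() in the new rsa_crpt.c recovers a usable public exponent for blinding when rsa->e is absent. For keys generated the way OpenSSL generates them, with d the inverse of e modulo (p-1)(q-1), the relation can simply be inverted; in LaTeX, a sketch of the identity the code computes:

    e \cdot d \equiv 1 \pmod{(p-1)(q-1)}
    \quad\Longrightarrow\quad
    e = d^{-1} \bmod (p-1)(q-1)

which is exactly BN_mod_inverse(NULL, d, r0, ctx) with r0 = (p-1)(q-1) built from the two BN_sub and one BN_mul calls above.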
diff --git a/main/openssl/crypto/rsa/rsa_eay.c b/main/openssl/crypto/rsa/rsa_eay.c
index 7c941885..88ee2cb5 100644
--- a/main/openssl/crypto/rsa/rsa_eay.c
+++ b/main/openssl/crypto/rsa/rsa_eay.c
@@ -314,51 +314,56 @@ static BN_BLINDING *rsa_get_blinding(RSA *rsa, int *local, BN_CTX *ctx)
return ret;
}
-static int rsa_blinding_convert(BN_BLINDING *b, int local, BIGNUM *f,
- BIGNUM *r, BN_CTX *ctx)
-{
- if (local)
+static int rsa_blinding_convert(BN_BLINDING *b, BIGNUM *f, BIGNUM *unblind,
+ BN_CTX *ctx)
+ {
+ if (unblind == NULL)
+ /* Local blinding: store the unblinding factor
+ * in BN_BLINDING. */
return BN_BLINDING_convert_ex(f, NULL, b, ctx);
else
{
- int ret;
- CRYPTO_r_lock(CRYPTO_LOCK_RSA_BLINDING);
- ret = BN_BLINDING_convert_ex(f, r, b, ctx);
- CRYPTO_r_unlock(CRYPTO_LOCK_RSA_BLINDING);
- return ret;
- }
-}
-
-static int rsa_blinding_invert(BN_BLINDING *b, int local, BIGNUM *f,
- BIGNUM *r, BN_CTX *ctx)
-{
- if (local)
- return BN_BLINDING_invert_ex(f, NULL, b, ctx);
- else
- {
+ /* Shared blinding: store the unblinding factor
+ * outside BN_BLINDING. */
int ret;
CRYPTO_w_lock(CRYPTO_LOCK_RSA_BLINDING);
- ret = BN_BLINDING_invert_ex(f, r, b, ctx);
+ ret = BN_BLINDING_convert_ex(f, unblind, b, ctx);
CRYPTO_w_unlock(CRYPTO_LOCK_RSA_BLINDING);
return ret;
}
-}
+ }
+
+static int rsa_blinding_invert(BN_BLINDING *b, BIGNUM *f, BIGNUM *unblind,
+ BN_CTX *ctx)
+ {
+ /* For local blinding, unblind is set to NULL, and BN_BLINDING_invert_ex
+ * will use the unblinding factor stored in BN_BLINDING.
+ * If BN_BLINDING is shared between threads, unblind must be non-null:
+ * BN_BLINDING_invert_ex will then use the local unblinding factor,
+ * and will only read the modulus from BN_BLINDING.
+ * In both cases it's safe to access the blinding without a lock.
+ */
+ return BN_BLINDING_invert_ex(f, unblind, b, ctx);
+ }
/* signing */
static int RSA_eay_private_encrypt(int flen, const unsigned char *from,
unsigned char *to, RSA *rsa, int padding)
{
- BIGNUM *f, *ret, *br, *res;
+ BIGNUM *f, *ret, *res;
int i,j,k,num=0,r= -1;
unsigned char *buf=NULL;
BN_CTX *ctx=NULL;
int local_blinding = 0;
+ /* Used only if the blinding structure is shared. A non-NULL unblind
+ * instructs rsa_blinding_convert() and rsa_blinding_invert() to store
+ * the unblinding factor outside the blinding structure. */
+ BIGNUM *unblind = NULL;
BN_BLINDING *blinding = NULL;
if ((ctx=BN_CTX_new()) == NULL) goto err;
BN_CTX_start(ctx);
f = BN_CTX_get(ctx);
- br = BN_CTX_get(ctx);
ret = BN_CTX_get(ctx);
num = BN_num_bytes(rsa->n);
buf = OPENSSL_malloc(num);
@@ -406,8 +411,15 @@ static int RSA_eay_private_encrypt(int flen, const unsigned char *from,
}
if (blinding != NULL)
- if (!rsa_blinding_convert(blinding, local_blinding, f, br, ctx))
+ {
+ if (!local_blinding && ((unblind = BN_CTX_get(ctx)) == NULL))
+ {
+ RSAerr(RSA_F_RSA_EAY_PRIVATE_ENCRYPT,ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ if (!rsa_blinding_convert(blinding, f, unblind, ctx))
goto err;
+ }
if ( (rsa->flags & RSA_FLAG_EXT_PKEY) ||
((rsa->p != NULL) &&
@@ -441,7 +453,7 @@ static int RSA_eay_private_encrypt(int flen, const unsigned char *from,
}
if (blinding)
- if (!rsa_blinding_invert(blinding, local_blinding, ret, br, ctx))
+ if (!rsa_blinding_invert(blinding, ret, unblind, ctx))
goto err;
if (padding == RSA_X931_PADDING)
@@ -480,18 +492,21 @@ err:
static int RSA_eay_private_decrypt(int flen, const unsigned char *from,
unsigned char *to, RSA *rsa, int padding)
{
- BIGNUM *f, *ret, *br;
+ BIGNUM *f, *ret;
int j,num=0,r= -1;
unsigned char *p;
unsigned char *buf=NULL;
BN_CTX *ctx=NULL;
int local_blinding = 0;
+ /* Used only if the blinding structure is shared. A non-NULL unblind
+ * instructs rsa_blinding_convert() and rsa_blinding_invert() to store
+ * the unblinding factor outside the blinding structure. */
+ BIGNUM *unblind = NULL;
BN_BLINDING *blinding = NULL;
if((ctx = BN_CTX_new()) == NULL) goto err;
BN_CTX_start(ctx);
f = BN_CTX_get(ctx);
- br = BN_CTX_get(ctx);
ret = BN_CTX_get(ctx);
num = BN_num_bytes(rsa->n);
buf = OPENSSL_malloc(num);
@@ -529,8 +544,15 @@ static int RSA_eay_private_decrypt(int flen, const unsigned char *from,
}
if (blinding != NULL)
- if (!rsa_blinding_convert(blinding, local_blinding, f, br, ctx))
+ {
+ if (!local_blinding && ((unblind = BN_CTX_get(ctx)) == NULL))
+ {
+ RSAerr(RSA_F_RSA_EAY_PRIVATE_DECRYPT,ERR_R_MALLOC_FAILURE);
goto err;
+ }
+ if (!rsa_blinding_convert(blinding, f, unblind, ctx))
+ goto err;
+ }
/* do the decrypt */
if ( (rsa->flags & RSA_FLAG_EXT_PKEY) ||
@@ -564,7 +586,7 @@ static int RSA_eay_private_decrypt(int flen, const unsigned char *from,
}
if (blinding)
- if (!rsa_blinding_invert(blinding, local_blinding, ret, br, ctx))
+ if (!rsa_blinding_invert(blinding, ret, unblind, ctx))
goto err;
p=buf;
@@ -825,12 +847,12 @@ static int RSA_eay_mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx)
if (!BN_mod(r0,pr1,rsa->p,ctx)) goto err;
/* If p < q it is occasionally possible for the correction of
- * adding 'p' if r0 is negative above to leave the result still
+ * adding 'p' if r0 is negative above to leave the result still
* negative. This can break the private key operations: the following
* second correction should *always* correct this rare occurrence.
* This will *never* happen with OpenSSL generated keys because
- * they ensure p > q [steve]
- */
+ * they ensure p > q [steve]
+ */
if (BN_is_negative(r0))
if (!BN_add(r0,r0,rsa->p)) goto err;
if (!BN_mul(r1,r0,rsa->q,ctx)) goto err;
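Net effect of the rsa_eay.c hunks: the CRYPTO_LOCK_RSA_BLINDING lock is now taken only around the convert step, and a shared blinding keeps its per-call unblinding factor in the caller's BN_CTX rather than inside BN_BLINDING, so the invert step needs no lock at all. The call pattern, condensed from the hunks above with error paths trimmed:

    if (blinding != NULL)
        {
        /* shared blinding: allocate a per-call unblinding factor */
        if (!local_blinding && ((unblind = BN_CTX_get(ctx)) == NULL))
            goto err;
        /* the write lock is held only inside rsa_blinding_convert() */
        if (!rsa_blinding_convert(blinding, f, unblind, ctx))
            goto err;
        }
    /* ... private-key operation on the blinded input f, result in ret ... */
    if (blinding)
        /* lock-free: only the modulus is read from the shared BN_BLINDING */
        if (!rsa_blinding_invert(blinding, ret, unblind, ctx))
            goto err;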
diff --git a/main/openssl/crypto/rsa/rsa_err.c b/main/openssl/crypto/rsa/rsa_err.c
index cf9f1106..46e0bf99 100644
--- a/main/openssl/crypto/rsa/rsa_err.c
+++ b/main/openssl/crypto/rsa/rsa_err.c
@@ -1,6 +1,6 @@
/* crypto/rsa/rsa_err.c */
/* ====================================================================
- * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -78,6 +78,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_PKEY_RSA_CTRL), "PKEY_RSA_CTRL"},
{ERR_FUNC(RSA_F_PKEY_RSA_CTRL_STR), "PKEY_RSA_CTRL_STR"},
{ERR_FUNC(RSA_F_PKEY_RSA_SIGN), "PKEY_RSA_SIGN"},
+{ERR_FUNC(RSA_F_PKEY_RSA_VERIFY), "PKEY_RSA_VERIFY"},
{ERR_FUNC(RSA_F_PKEY_RSA_VERIFYRECOVER), "PKEY_RSA_VERIFYRECOVER"},
{ERR_FUNC(RSA_F_RSA_BUILTIN_KEYGEN), "RSA_BUILTIN_KEYGEN"},
{ERR_FUNC(RSA_F_RSA_CHECK_KEY), "RSA_check_key"},
@@ -86,6 +87,8 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_DECRYPT), "RSA_EAY_PUBLIC_DECRYPT"},
{ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_ENCRYPT), "RSA_EAY_PUBLIC_ENCRYPT"},
{ERR_FUNC(RSA_F_RSA_GENERATE_KEY), "RSA_generate_key"},
+{ERR_FUNC(RSA_F_RSA_GENERATE_KEY_EX), "RSA_generate_key_ex"},
+{ERR_FUNC(RSA_F_RSA_ITEM_VERIFY), "RSA_ITEM_VERIFY"},
{ERR_FUNC(RSA_F_RSA_MEMORY_LOCK), "RSA_memory_lock"},
{ERR_FUNC(RSA_F_RSA_NEW_METHOD), "RSA_new_method"},
{ERR_FUNC(RSA_F_RSA_NULL), "RSA_NULL"},
@@ -97,6 +100,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_NONE), "RSA_padding_add_none"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP), "RSA_padding_add_PKCS1_OAEP"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS), "RSA_padding_add_PKCS1_PSS"},
+{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1), "RSA_padding_add_PKCS1_PSS_mgf1"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1), "RSA_padding_add_PKCS1_type_1"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2), "RSA_padding_add_PKCS1_type_2"},
{ERR_FUNC(RSA_F_RSA_PADDING_ADD_SSLV23), "RSA_padding_add_SSLv23"},
@@ -109,8 +113,12 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_PADDING_CHECK_X931), "RSA_padding_check_X931"},
{ERR_FUNC(RSA_F_RSA_PRINT), "RSA_print"},
{ERR_FUNC(RSA_F_RSA_PRINT_FP), "RSA_print_fp"},
+{ERR_FUNC(RSA_F_RSA_PRIVATE_DECRYPT), "RSA_private_decrypt"},
+{ERR_FUNC(RSA_F_RSA_PRIVATE_ENCRYPT), "RSA_private_encrypt"},
{ERR_FUNC(RSA_F_RSA_PRIV_DECODE), "RSA_PRIV_DECODE"},
{ERR_FUNC(RSA_F_RSA_PRIV_ENCODE), "RSA_PRIV_ENCODE"},
+{ERR_FUNC(RSA_F_RSA_PUBLIC_DECRYPT), "RSA_public_decrypt"},
+{ERR_FUNC(RSA_F_RSA_PUBLIC_ENCRYPT), "RSA_public_encrypt"},
{ERR_FUNC(RSA_F_RSA_PUB_DECODE), "RSA_PUB_DECODE"},
{ERR_FUNC(RSA_F_RSA_SETUP_BLINDING), "RSA_setup_blinding"},
{ERR_FUNC(RSA_F_RSA_SIGN), "RSA_sign"},
@@ -118,6 +126,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_VERIFY), "RSA_verify"},
{ERR_FUNC(RSA_F_RSA_VERIFY_ASN1_OCTET_STRING), "RSA_verify_ASN1_OCTET_STRING"},
{ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS), "RSA_verify_PKCS1_PSS"},
+{ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1), "RSA_verify_PKCS1_PSS_mgf1"},
{0,NULL}
};
@@ -146,19 +155,24 @@ static ERR_STRING_DATA RSA_str_reasons[]=
{ERR_REASON(RSA_R_INVALID_HEADER) ,"invalid header"},
{ERR_REASON(RSA_R_INVALID_KEYBITS) ,"invalid keybits"},
{ERR_REASON(RSA_R_INVALID_MESSAGE_LENGTH),"invalid message length"},
+{ERR_REASON(RSA_R_INVALID_MGF1_MD) ,"invalid mgf1 md"},
{ERR_REASON(RSA_R_INVALID_PADDING) ,"invalid padding"},
{ERR_REASON(RSA_R_INVALID_PADDING_MODE) ,"invalid padding mode"},
+{ERR_REASON(RSA_R_INVALID_PSS_PARAMETERS),"invalid pss parameters"},
{ERR_REASON(RSA_R_INVALID_PSS_SALTLEN) ,"invalid pss saltlen"},
+{ERR_REASON(RSA_R_INVALID_SALT_LENGTH) ,"invalid salt length"},
{ERR_REASON(RSA_R_INVALID_TRAILER) ,"invalid trailer"},
{ERR_REASON(RSA_R_INVALID_X931_DIGEST) ,"invalid x931 digest"},
{ERR_REASON(RSA_R_IQMP_NOT_INVERSE_OF_Q) ,"iqmp not inverse of q"},
{ERR_REASON(RSA_R_KEY_SIZE_TOO_SMALL) ,"key size too small"},
{ERR_REASON(RSA_R_LAST_OCTET_INVALID) ,"last octet invalid"},
{ERR_REASON(RSA_R_MODULUS_TOO_LARGE) ,"modulus too large"},
+{ERR_REASON(RSA_R_NON_FIPS_RSA_METHOD) ,"non fips rsa method"},
{ERR_REASON(RSA_R_NO_PUBLIC_EXPONENT) ,"no public exponent"},
{ERR_REASON(RSA_R_NULL_BEFORE_BLOCK_MISSING),"null before block missing"},
{ERR_REASON(RSA_R_N_DOES_NOT_EQUAL_P_Q) ,"n does not equal p q"},
{ERR_REASON(RSA_R_OAEP_DECODING_ERROR) ,"oaep decoding error"},
+{ERR_REASON(RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE),"operation not allowed in fips mode"},
{ERR_REASON(RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE),"operation not supported for this keytype"},
{ERR_REASON(RSA_R_PADDING_CHECK_FAILED) ,"padding check failed"},
{ERR_REASON(RSA_R_P_NOT_PRIME) ,"p not prime"},
@@ -169,7 +183,12 @@ static ERR_STRING_DATA RSA_str_reasons[]=
{ERR_REASON(RSA_R_SSLV3_ROLLBACK_ATTACK) ,"sslv3 rollback attack"},
{ERR_REASON(RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD),"the asn1 object identifier is not known for this md"},
{ERR_REASON(RSA_R_UNKNOWN_ALGORITHM_TYPE),"unknown algorithm type"},
+{ERR_REASON(RSA_R_UNKNOWN_MASK_DIGEST) ,"unknown mask digest"},
{ERR_REASON(RSA_R_UNKNOWN_PADDING_TYPE) ,"unknown padding type"},
+{ERR_REASON(RSA_R_UNKNOWN_PSS_DIGEST) ,"unknown pss digest"},
+{ERR_REASON(RSA_R_UNSUPPORTED_MASK_ALGORITHM),"unsupported mask algorithm"},
+{ERR_REASON(RSA_R_UNSUPPORTED_MASK_PARAMETER),"unsupported mask parameter"},
+{ERR_REASON(RSA_R_UNSUPPORTED_SIGNATURE_TYPE),"unsupported signature type"},
{ERR_REASON(RSA_R_VALUE_MISSING) ,"value missing"},
{ERR_REASON(RSA_R_WRONG_SIGNATURE_LENGTH),"wrong signature length"},
{0,NULL}
diff --git a/main/openssl/crypto/rsa/rsa_gen.c b/main/openssl/crypto/rsa/rsa_gen.c
index 767f7ab6..42290cce 100644
--- a/main/openssl/crypto/rsa/rsa_gen.c
+++ b/main/openssl/crypto/rsa/rsa_gen.c
@@ -67,6 +67,9 @@
#include "cryptlib.h"
#include <openssl/bn.h>
#include <openssl/rsa.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb);
@@ -77,8 +80,20 @@ static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb)
* now just because key-generation is part of RSA_METHOD. */
int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_GENERATE_KEY_EX, RSA_R_NON_FIPS_RSA_METHOD);
+ return 0;
+ }
+#endif
if(rsa->meth->rsa_keygen)
return rsa->meth->rsa_keygen(rsa, bits, e_value, cb);
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_rsa_generate_key_ex(rsa, bits, e_value, cb);
+#endif
return rsa_builtin_keygen(rsa, bits, e_value, cb);
}
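For context, the entry point being gated here is unchanged in shape; a hypothetical caller looks like this (name and error handling are illustrative only; in FIPS mode with a non-FIPS method the call now fails and queues RSA_R_NON_FIPS_RSA_METHOD):

    #include <openssl/rsa.h>
    #include <openssl/bn.h>

    RSA *generate_rsa_2048(void)
        {
        RSA *rsa = RSA_new();
        BIGNUM *e = BN_new();

        BN_set_word(e, RSA_F4);               /* e = 65537 */
        if (!RSA_generate_key_ex(rsa, 2048, e, NULL))
            {
            RSA_free(rsa);                    /* 0 return: see error queue */
            rsa = NULL;
            }
        BN_free(e);
        return rsa;
        }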
diff --git a/main/openssl/crypto/rsa/rsa_lib.c b/main/openssl/crypto/rsa/rsa_lib.c
index de45088d..c95ceafc 100644
--- a/main/openssl/crypto/rsa/rsa_lib.c
+++ b/main/openssl/crypto/rsa/rsa_lib.c
@@ -67,6 +67,10 @@
#include <openssl/engine.h>
#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
const char RSA_version[]="RSA" OPENSSL_VERSION_PTEXT;
static const RSA_METHOD *default_RSA_meth=NULL;
@@ -87,12 +91,15 @@ const RSA_METHOD *RSA_get_default_method(void)
{
if (default_RSA_meth == NULL)
{
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode())
+ return FIPS_rsa_pkcs1_ssleay();
+ else
+ return RSA_PKCS1_SSLeay();
+#else
#ifdef RSA_NULL
default_RSA_meth=RSA_null_method();
#else
-#if 0 /* was: #ifdef RSAref */
- default_RSA_meth=RSA_PKCS1_RSAref();
-#else
default_RSA_meth=RSA_PKCS1_SSLeay();
#endif
#endif
@@ -181,7 +188,7 @@ RSA *RSA_new_method(ENGINE *engine)
ret->blinding=NULL;
ret->mt_blinding=NULL;
ret->bignum_data=NULL;
- ret->flags=ret->meth->flags;
+ ret->flags=ret->meth->flags & ~RSA_FLAG_NON_FIPS_ALLOW;
if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_RSA, ret, &ret->ex_data))
{
#ifndef OPENSSL_NO_ENGINE
@@ -280,163 +287,6 @@ void *RSA_get_ex_data(const RSA *r, int idx)
return(CRYPTO_get_ex_data(&r->ex_data,idx));
}
-int RSA_size(const RSA *r)
- {
- return(BN_num_bytes(r->n));
- }
-
-int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to,
- RSA *rsa, int padding)
- {
- return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding));
- }
-
-int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to,
- RSA *rsa, int padding)
- {
- return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
- }
-
-int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to,
- RSA *rsa, int padding)
- {
- return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding));
- }
-
-int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
- RSA *rsa, int padding)
- {
- return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
- }
-
-int RSA_flags(const RSA *r)
- {
- return((r == NULL)?0:r->meth->flags);
- }
-
-void RSA_blinding_off(RSA *rsa)
- {
- if (rsa->blinding != NULL)
- {
- BN_BLINDING_free(rsa->blinding);
- rsa->blinding=NULL;
- }
- rsa->flags &= ~RSA_FLAG_BLINDING;
- rsa->flags |= RSA_FLAG_NO_BLINDING;
- }
-
-int RSA_blinding_on(RSA *rsa, BN_CTX *ctx)
- {
- int ret=0;
-
- if (rsa->blinding != NULL)
- RSA_blinding_off(rsa);
-
- rsa->blinding = RSA_setup_blinding(rsa, ctx);
- if (rsa->blinding == NULL)
- goto err;
-
- rsa->flags |= RSA_FLAG_BLINDING;
- rsa->flags &= ~RSA_FLAG_NO_BLINDING;
- ret=1;
-err:
- return(ret);
- }
-
-static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p,
- const BIGNUM *q, BN_CTX *ctx)
-{
- BIGNUM *ret = NULL, *r0, *r1, *r2;
-
- if (d == NULL || p == NULL || q == NULL)
- return NULL;
-
- BN_CTX_start(ctx);
- r0 = BN_CTX_get(ctx);
- r1 = BN_CTX_get(ctx);
- r2 = BN_CTX_get(ctx);
- if (r2 == NULL)
- goto err;
-
- if (!BN_sub(r1, p, BN_value_one())) goto err;
- if (!BN_sub(r2, q, BN_value_one())) goto err;
- if (!BN_mul(r0, r1, r2, ctx)) goto err;
-
- ret = BN_mod_inverse(NULL, d, r0, ctx);
-err:
- BN_CTX_end(ctx);
- return ret;
-}
-
-BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx)
-{
- BIGNUM local_n;
- BIGNUM *e,*n;
- BN_CTX *ctx;
- BN_BLINDING *ret = NULL;
-
- if (in_ctx == NULL)
- {
- if ((ctx = BN_CTX_new()) == NULL) return 0;
- }
- else
- ctx = in_ctx;
-
- BN_CTX_start(ctx);
- e = BN_CTX_get(ctx);
- if (e == NULL)
- {
- RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE);
- goto err;
- }
-
- if (rsa->e == NULL)
- {
- e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx);
- if (e == NULL)
- {
- RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT);
- goto err;
- }
- }
- else
- e = rsa->e;
-
-
- if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL)
- {
- /* if PRNG is not properly seeded, resort to secret
- * exponent as unpredictable seed */
- RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0);
- }
-
- if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME))
- {
- /* Set BN_FLG_CONSTTIME flag */
- n = &local_n;
- BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME);
- }
- else
- n = rsa->n;
-
- ret = BN_BLINDING_create_param(NULL, e, n, ctx,
- rsa->meth->bn_mod_exp, rsa->_method_mod_n);
- if (ret == NULL)
- {
- RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB);
- goto err;
- }
- CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret));
-err:
- BN_CTX_end(ctx);
- if (in_ctx == NULL)
- BN_CTX_free(ctx);
- if(rsa->e == NULL)
- BN_free(e);
-
- return ret;
-}
-
int RSA_memory_lock(RSA *r)
{
int i,j,k,off;
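One subtle consequence of the RSA_new_method() change above: RSA_FLAG_NON_FIPS_ALLOW is masked out of the flags inherited from the method, so opting a key out of the FIPS restriction must now be done explicitly per RSA object. Illustrative snippet (the flag is real; the surrounding code is a hypothetical caller):

    RSA *rsa = RSA_new();                  /* inherits meth flags minus
                                              RSA_FLAG_NON_FIPS_ALLOW    */
    rsa->flags |= RSA_FLAG_NON_FIPS_ALLOW; /* per-key opt-out, no longer
                                              inherited from the method  */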
diff --git a/main/openssl/crypto/rsa/rsa_oaep.c b/main/openssl/crypto/rsa/rsa_oaep.c
index 18d307ea..af4d24a5 100644
--- a/main/openssl/crypto/rsa/rsa_oaep.c
+++ b/main/openssl/crypto/rsa/rsa_oaep.c
@@ -56,7 +56,8 @@ int RSA_padding_add_PKCS1_OAEP(unsigned char *to, int tlen,
seed = to + 1;
db = to + SHA_DIGEST_LENGTH + 1;
- EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL))
+ return 0;
memset(db + SHA_DIGEST_LENGTH, 0,
emlen - flen - 2 * SHA_DIGEST_LENGTH - 1);
db[emlen - flen - SHA_DIGEST_LENGTH - 1] = 0x01;
@@ -145,9 +146,10 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
for (i = 0; i < dblen; i++)
db[i] ^= maskeddb[i];
- EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL))
+ return -1;
- if (memcmp(db, phash, SHA_DIGEST_LENGTH) != 0 || bad)
+ if (CRYPTO_memcmp(db, phash, SHA_DIGEST_LENGTH) != 0 || bad)
goto decoding_err;
else
{
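The memcmp -> CRYPTO_memcmp swap matters because the OAEP hash comparison runs on attacker-influenced data: a compare that returns at the first mismatching byte leaks the match length through timing. CRYPTO_memcmp instead accumulates differences across the full length. A sketch of the idea (not the library's exact code):

    #include <stddef.h>

    /* Constant-time compare: every byte is touched, so running time
     * does not depend on where the first mismatch occurs. */
    static int ct_memcmp(const void *a, const void *b, size_t len)
        {
        const unsigned char *pa = a, *pb = b;
        unsigned char diff = 0;
        size_t i;

        for (i = 0; i < len; i++)
            diff |= pa[i] ^ pb[i];
        return diff;                /* 0 iff all len bytes are equal */
        }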
diff --git a/main/openssl/crypto/rsa/rsa_pmeth.c b/main/openssl/crypto/rsa/rsa_pmeth.c
index c6892ecd..157aa5c4 100644
--- a/main/openssl/crypto/rsa/rsa_pmeth.c
+++ b/main/openssl/crypto/rsa/rsa_pmeth.c
@@ -63,6 +63,12 @@
#include <openssl/rsa.h>
#include <openssl/bn.h>
#include <openssl/evp.h>
+#ifndef OPENSSL_NO_CMS
+#include <openssl/cms.h>
+#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
#include "evp_locl.h"
#include "rsa_locl.h"
@@ -79,6 +85,8 @@ typedef struct
int pad_mode;
/* message digest */
const EVP_MD *md;
+ /* message digest for MGF1 */
+ const EVP_MD *mgf1md;
/* PSS/OAEP salt length */
int saltlen;
/* Temp buffer */
@@ -95,6 +103,7 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx)
rctx->pub_exp = NULL;
rctx->pad_mode = RSA_PKCS1_PADDING;
rctx->md = NULL;
+ rctx->mgf1md = NULL;
rctx->tbuf = NULL;
rctx->saltlen = -2;
@@ -147,6 +156,31 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx)
OPENSSL_free(rctx);
}
}
+#ifdef OPENSSL_FIPS
+/* FIPS checker. Return value indicates status of context parameters:
+ * 1 : redirect to FIPS.
+ * 0 : don't redirect to FIPS.
+ * -1 : illegal operation in FIPS mode.
+ */
+
+static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx)
+ {
+ RSA_PKEY_CTX *rctx = ctx->data;
+ RSA *rsa = ctx->pkey->pkey.rsa;
+ int rv = -1;
+ if (!FIPS_mode())
+ return 0;
+ if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)
+ rv = 0;
+ if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv)
+ return -1;
+ if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS))
+ return rv;
+ if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS))
+ return rv;
+ return 1;
+ }
+#endif
static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
const unsigned char *tbs, size_t tbslen)
@@ -155,6 +189,15 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
RSA_PKEY_CTX *rctx = ctx->data;
RSA *rsa = ctx->pkey->pkey.rsa;
+#ifdef OPENSSL_FIPS
+ ret = pkey_fips_check_ctx(ctx);
+ if (ret < 0)
+ {
+ RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE);
+ return -1;
+ }
+#endif
+
if (rctx->md)
{
if (tbslen != (size_t)EVP_MD_size(rctx->md))
@@ -163,7 +206,36 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
RSA_R_INVALID_DIGEST_LENGTH);
return -1;
}
- if (rctx->pad_mode == RSA_X931_PADDING)
+#ifdef OPENSSL_FIPS
+ if (ret > 0)
+ {
+ unsigned int slen;
+ ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md,
+ rctx->pad_mode,
+ rctx->saltlen,
+ rctx->mgf1md,
+ sig, &slen);
+ if (ret > 0)
+ *siglen = slen;
+ else
+ *siglen = 0;
+ return ret;
+ }
+#endif
+
+ if (EVP_MD_type(rctx->md) == NID_mdc2)
+ {
+ unsigned int sltmp;
+ if (rctx->pad_mode != RSA_PKCS1_PADDING)
+ return -1;
+ ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2,
+ tbs, tbslen, sig, &sltmp, rsa);
+
+ if (ret <= 0)
+ return ret;
+ ret = sltmp;
+ }
+ else if (rctx->pad_mode == RSA_X931_PADDING)
{
if (!setup_tbuf(rctx, ctx))
return -1;
@@ -186,8 +258,10 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
{
if (!setup_tbuf(rctx, ctx))
return -1;
- if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs,
- rctx->md, rctx->saltlen))
+ if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa,
+ rctx->tbuf, tbs,
+ rctx->md, rctx->mgf1md,
+ rctx->saltlen))
return -1;
ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf,
sig, rsa, RSA_NO_PADDING);
@@ -269,8 +343,30 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx,
RSA_PKEY_CTX *rctx = ctx->data;
RSA *rsa = ctx->pkey->pkey.rsa;
size_t rslen;
+#ifdef OPENSSL_FIPS
+ int rv;
+ rv = pkey_fips_check_ctx(ctx);
+ if (rv < 0)
+ {
+ RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE);
+ return -1;
+ }
+#endif
if (rctx->md)
{
+#ifdef OPENSSL_FIPS
+ if (rv > 0)
+ {
+ return FIPS_rsa_verify_digest(rsa,
+ tbs, tbslen,
+ rctx->md,
+ rctx->pad_mode,
+ rctx->saltlen,
+ rctx->mgf1md,
+ sig, siglen);
+
+ }
+#endif
if (rctx->pad_mode == RSA_PKCS1_PADDING)
return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen,
sig, siglen, rsa);
@@ -289,7 +385,8 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx,
rsa, RSA_NO_PADDING);
if (ret <= 0)
return 0;
- ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md,
+ ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs,
+ rctx->md, rctx->mgf1md,
rctx->tbuf, rctx->saltlen);
if (ret <= 0)
return 0;
@@ -403,15 +500,25 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE);
return -2;
+ case EVP_PKEY_CTRL_GET_RSA_PADDING:
+ *(int *)p2 = rctx->pad_mode;
+ return 1;
+
case EVP_PKEY_CTRL_RSA_PSS_SALTLEN:
- if (p1 < -2)
- return -2;
+ case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN:
if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING)
{
RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN);
return -2;
}
- rctx->saltlen = p1;
+ if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN)
+ *(int *)p2 = rctx->saltlen;
+ else
+ {
+ if (p1 < -2)
+ return -2;
+ rctx->saltlen = p1;
+ }
return 1;
case EVP_PKEY_CTRL_RSA_KEYGEN_BITS:
@@ -435,16 +542,45 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
rctx->md = p2;
return 1;
+ case EVP_PKEY_CTRL_RSA_MGF1_MD:
+ case EVP_PKEY_CTRL_GET_RSA_MGF1_MD:
+ if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING)
+ {
+ RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD);
+ return -2;
+ }
+ if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD)
+ {
+ if (rctx->mgf1md)
+ *(const EVP_MD **)p2 = rctx->mgf1md;
+ else
+ *(const EVP_MD **)p2 = rctx->md;
+ }
+ else
+ rctx->mgf1md = p2;
+ return 1;
+
case EVP_PKEY_CTRL_DIGESTINIT:
case EVP_PKEY_CTRL_PKCS7_ENCRYPT:
case EVP_PKEY_CTRL_PKCS7_DECRYPT:
case EVP_PKEY_CTRL_PKCS7_SIGN:
+ return 1;
#ifndef OPENSSL_NO_CMS
- case EVP_PKEY_CTRL_CMS_ENCRYPT:
case EVP_PKEY_CTRL_CMS_DECRYPT:
+ {
+ X509_ALGOR *alg = NULL;
+ ASN1_OBJECT *encalg = NULL;
+ if (p2)
+ CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg);
+ if (alg)
+ X509_ALGOR_get0(&encalg, NULL, NULL, alg);
+ if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep)
+ rctx->pad_mode = RSA_PKCS1_OAEP_PADDING;
+ }
+ case EVP_PKEY_CTRL_CMS_ENCRYPT:
case EVP_PKEY_CTRL_CMS_SIGN:
-#endif
return 1;
+#endif
case EVP_PKEY_CTRL_PEER_KEY:
RSAerr(RSA_F_PKEY_RSA_CTRL,
RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE);
@@ -475,6 +611,8 @@ static int pkey_rsa_ctrl_str(EVP_PKEY_CTX *ctx,
pm = RSA_NO_PADDING;
else if (!strcmp(value, "oeap"))
pm = RSA_PKCS1_OAEP_PADDING;
+ else if (!strcmp(value, "oaep"))
+ pm = RSA_PKCS1_OAEP_PADDING;
else if (!strcmp(value, "x931"))
pm = RSA_X931_PADDING;
else if (!strcmp(value, "pss"))
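Together with EVP_PKEY_CTRL_RSA_MGF1_MD and the new GET variants, applications can now drive RSA-PSS with a distinct MGF1 digest through the generic EVP layer. A minimal sketch (hypothetical helper name, error checks trimmed; note the padding mode must already be PSS before the MGF1 ctrl is accepted):

    #include <openssl/evp.h>
    #include <openssl/rsa.h>

    /* PSS-sign a precomputed SHA-256 digest with an explicit SHA-1 MGF1. */
    int sign_pss_sha256(EVP_PKEY *pkey, const unsigned char md[32],
        unsigned char *sig, size_t *siglen)
        {
        EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(pkey, NULL);
        int rv = -1;

        if (ctx == NULL || EVP_PKEY_sign_init(ctx) <= 0)
            goto end;
        EVP_PKEY_CTX_set_rsa_padding(ctx, RSA_PKCS1_PSS_PADDING);
        EVP_PKEY_CTX_set_signature_md(ctx, EVP_sha256());
        EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, EVP_sha1());  /* new ctrl above   */
        EVP_PKEY_CTX_set_rsa_pss_saltlen(ctx, -1);      /* salt = digest len */
        rv = EVP_PKEY_sign(ctx, sig, siglen, md, 32);
    end:
        EVP_PKEY_CTX_free(ctx);
        return rv;
        }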
diff --git a/main/openssl/crypto/rsa/rsa_pss.c b/main/openssl/crypto/rsa/rsa_pss.c
index ac211e2f..5f9f533d 100644
--- a/main/openssl/crypto/rsa/rsa_pss.c
+++ b/main/openssl/crypto/rsa/rsa_pss.c
@@ -73,6 +73,13 @@ static const unsigned char zeroes[] = {0,0,0,0,0,0,0,0};
int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
const EVP_MD *Hash, const unsigned char *EM, int sLen)
{
+ return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen);
+ }
+
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+ const EVP_MD *Hash, const EVP_MD *mgf1Hash,
+ const unsigned char *EM, int sLen)
+ {
int i;
int ret = 0;
int hLen, maskedDBLen, MSBits, emLen;
@@ -80,6 +87,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
unsigned char *DB = NULL;
EVP_MD_CTX ctx;
unsigned char H_[EVP_MAX_MD_SIZE];
+ EVP_MD_CTX_init(&ctx);
+
+ if (mgf1Hash == NULL)
+ mgf1Hash = Hash;
hLen = EVP_MD_size(Hash);
if (hLen < 0)
@@ -94,7 +105,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
else if (sLen == -2) sLen = -2;
else if (sLen < -2)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
goto err;
}
@@ -102,7 +113,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
emLen = RSA_size(rsa);
if (EM[0] & (0xFF << MSBits))
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID);
goto err;
}
if (MSBits == 0)
@@ -112,12 +123,12 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
}
if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE);
goto err;
}
if (EM[emLen - 1] != 0xbc)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID);
goto err;
}
maskedDBLen = emLen - hLen - 1;
@@ -125,10 +136,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
DB = OPENSSL_malloc(maskedDBLen);
if (!DB)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE);
goto err;
}
- if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0)
+ if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0)
goto err;
for (i = 0; i < maskedDBLen; i++)
DB[i] ^= EM[i];
@@ -137,25 +148,28 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ;
if (DB[i++] != 0x1)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED);
goto err;
}
if (sLen >= 0 && (maskedDBLen - i) != sLen)
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
goto err;
}
- EVP_MD_CTX_init(&ctx);
- EVP_DigestInit_ex(&ctx, Hash, NULL);
- EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes);
- EVP_DigestUpdate(&ctx, mHash, hLen);
+ if (!EVP_DigestInit_ex(&ctx, Hash, NULL)
+ || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes)
+ || !EVP_DigestUpdate(&ctx, mHash, hLen))
+ goto err;
if (maskedDBLen - i)
- EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i);
- EVP_DigestFinal(&ctx, H_, NULL);
- EVP_MD_CTX_cleanup(&ctx);
+ {
+ if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i))
+ goto err;
+ }
+ if (!EVP_DigestFinal_ex(&ctx, H_, NULL))
+ goto err;
if (memcmp(H_, H, hLen))
{
- RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE);
+ RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE);
ret = 0;
}
else
@@ -164,6 +178,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
err:
if (DB)
OPENSSL_free(DB);
+ EVP_MD_CTX_cleanup(&ctx);
return ret;
@@ -173,12 +188,22 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
const unsigned char *mHash,
const EVP_MD *Hash, int sLen)
{
+ return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen);
+ }
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+ const unsigned char *mHash,
+ const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen)
+ {
int i;
int ret = 0;
int hLen, maskedDBLen, MSBits, emLen;
unsigned char *H, *salt = NULL, *p;
EVP_MD_CTX ctx;
+ if (mgf1Hash == NULL)
+ mgf1Hash = Hash;
+
hLen = EVP_MD_size(Hash);
if (hLen < 0)
goto err;
@@ -192,7 +217,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
else if (sLen == -2) sLen = -2;
else if (sLen < -2)
{
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
goto err;
}
@@ -209,8 +234,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
}
else if (emLen < (hLen + sLen + 2))
{
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS,
- RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
goto err;
}
if (sLen > 0)
@@ -218,8 +242,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
salt = OPENSSL_malloc(sLen);
if (!salt)
{
- RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS,
- ERR_R_MALLOC_FAILURE);
+ RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE);
goto err;
}
if (RAND_bytes(salt, sLen) <= 0)
@@ -228,16 +251,18 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
maskedDBLen = emLen - hLen - 1;
H = EM + maskedDBLen;
EVP_MD_CTX_init(&ctx);
- EVP_DigestInit_ex(&ctx, Hash, NULL);
- EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes);
- EVP_DigestUpdate(&ctx, mHash, hLen);
- if (sLen)
- EVP_DigestUpdate(&ctx, salt, sLen);
- EVP_DigestFinal(&ctx, H, NULL);
+ if (!EVP_DigestInit_ex(&ctx, Hash, NULL)
+ || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes)
+ || !EVP_DigestUpdate(&ctx, mHash, hLen))
+ goto err;
+ if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen))
+ goto err;
+ if (!EVP_DigestFinal_ex(&ctx, H, NULL))
+ goto err;
EVP_MD_CTX_cleanup(&ctx);
/* Generate dbMask in place then perform XOR on it */
- if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash))
+ if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash))
goto err;
p = EM;
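The *_mgf1 entry points keep the old functions as thin wrappers (mgf1Hash == NULL falls back to the message digest), so existing callers are unaffected. Used directly, they pair with raw RSA exactly as pkey_rsa_sign()/pkey_rsa_verify() do above. Condensed sketch, assuming a hypothetical 2048-bit key rsa and SHA-256 digest mhash, error checks trimmed:

    unsigned char em[256], sig[256];    /* RSA_size(rsa) == 256 assumed */

    /* sign side: PSS-pad, then raw private-key operation */
    RSA_padding_add_PKCS1_PSS_mgf1(rsa, em, mhash, EVP_sha256(), NULL, -1);
    RSA_private_encrypt(RSA_size(rsa), em, sig, rsa, RSA_NO_PADDING);

    /* verify side: raw public-key operation, then padding check */
    RSA_public_decrypt(RSA_size(rsa), sig, em, rsa, RSA_NO_PADDING);
    RSA_verify_PKCS1_PSS_mgf1(rsa, mhash, EVP_sha256(), NULL, em, -1);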
diff --git a/main/openssl/crypto/rsa/rsa_sign.c b/main/openssl/crypto/rsa/rsa_sign.c
index 0be4ec7f..b6f6037a 100644
--- a/main/openssl/crypto/rsa/rsa_sign.c
+++ b/main/openssl/crypto/rsa/rsa_sign.c
@@ -77,6 +77,14 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len,
const unsigned char *s = NULL;
X509_ALGOR algor;
ASN1_OCTET_STRING digest;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_RSA_SIGN, RSA_R_NON_FIPS_RSA_METHOD);
+ return 0;
+ }
+#endif
if((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign)
{
return rsa->meth->rsa_sign(type, m, m_len,
@@ -153,6 +161,15 @@ int int_rsa_verify(int dtype, const unsigned char *m,
unsigned char *s;
X509_SIG *sig=NULL;
+#ifdef OPENSSL_FIPS
+ if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+ && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+ {
+ RSAerr(RSA_F_INT_RSA_VERIFY, RSA_R_NON_FIPS_RSA_METHOD);
+ return 0;
+ }
+#endif
+
if (siglen != (unsigned int)RSA_size(rsa))
{
RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_WRONG_SIGNATURE_LENGTH);
@@ -182,6 +199,22 @@ int int_rsa_verify(int dtype, const unsigned char *m,
i=RSA_public_decrypt((int)siglen,sigbuf,s,rsa,RSA_PKCS1_PADDING);
if (i <= 0) goto err;
+ /* Oddball MDC2 case: signature can be OCTET STRING.
+	 * Check for correct tag and length octets.
+ */
+ if (dtype == NID_mdc2 && i == 18 && s[0] == 0x04 && s[1] == 0x10)
+ {
+ if (rm)
+ {
+ memcpy(rm, s + 2, 16);
+ *prm_len = 16;
+ ret = 1;
+ }
+ else if(memcmp(m, s + 2, 16))
+ RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_BAD_SIGNATURE);
+ else
+ ret = 1;
+ }
/* Special case: SSL signature */
if(dtype == NID_md5_sha1) {
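The MDC2 special case added above accepts a bare DER OCTET STRING in place of the usual DigestInfo structure, which is why the recovered plaintext is matched against exactly 18 bytes:

    /* 18 recovered bytes, checked as s[0] == 0x04 && s[1] == 0x10:
     *
     *   0x04   0x10    <16-byte MDC-2 digest>
     *   tag    length  value
     */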
diff --git a/main/openssl/crypto/s390xcap.c b/main/openssl/crypto/s390xcap.c
index ffbe0235..f2e94ef4 100644
--- a/main/openssl/crypto/s390xcap.c
+++ b/main/openssl/crypto/s390xcap.c
@@ -4,7 +4,7 @@
#include <setjmp.h>
#include <signal.h>
-extern unsigned long OPENSSL_s390xcap_P;
+extern unsigned long OPENSSL_s390xcap_P[];
static sigjmp_buf ill_jmp;
static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
@@ -16,7 +16,9 @@ void OPENSSL_cpuid_setup(void)
sigset_t oset;
struct sigaction ill_act,oact;
- if (OPENSSL_s390xcap_P) return;
+ if (OPENSSL_s390xcap_P[0]) return;
+
+ OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1);
memset(&ill_act,0,sizeof(ill_act));
ill_act.sa_handler = ill_handler;
@@ -27,10 +29,8 @@ void OPENSSL_cpuid_setup(void)
sigaction (SIGILL,&ill_act,&oact);
/* protection against missing store-facility-list-extended */
- if (sigsetjmp(ill_jmp,0) == 0)
- OPENSSL_s390xcap_P = OPENSSL_s390x_facilities();
- else
- OPENSSL_s390xcap_P = 1UL<<63;
+ if (sigsetjmp(ill_jmp,1) == 0)
+ OPENSSL_s390x_facilities();
sigaction (SIGILL,&oact,NULL);
sigprocmask(SIG_SETMASK,&oset,NULL);
diff --git a/main/openssl/crypto/s390xcpuid.S b/main/openssl/crypto/s390xcpuid.S
index b053c6a2..06815347 100644
--- a/main/openssl/crypto/s390xcpuid.S
+++ b/main/openssl/crypto/s390xcpuid.S
@@ -5,10 +5,14 @@
.align 16
OPENSSL_s390x_facilities:
lghi %r0,0
- .long 0xb2b0f010 # stfle 16(%r15)
- lg %r2,16(%r15)
- larl %r1,OPENSSL_s390xcap_P
- stg %r2,0(%r1)
+ larl %r2,OPENSSL_s390xcap_P
+ stg %r0,8(%r2)
+ .long 0xb2b02000 # stfle 0(%r2)
+ brc 8,.Ldone
+ lghi %r0,1
+ .long 0xb2b02000 # stfle 0(%r2)
+.Ldone:
+ lg %r2,0(%r2)
br %r14
.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
@@ -58,6 +62,9 @@ OPENSSL_wipe_cpu:
.type OPENSSL_cleanse,@function
.align 16
OPENSSL_cleanse:
+#if !defined(__s390x__) && !defined(__s390x)
+ llgfr %r3,%r3
+#endif
lghi %r4,15
lghi %r0,0
clgr %r3,%r4
@@ -89,4 +96,4 @@ OPENSSL_cleanse:
.section .init
brasl %r14,OPENSSL_cpuid_setup
-.comm OPENSSL_s390xcap_P,8,8
+.comm OPENSSL_s390xcap_P,16,8
diff --git a/main/openssl/crypto/sha/asm/sha1-586.S b/main/openssl/crypto/sha/asm/sha1-586.S
new file mode 100644
index 00000000..e77f6541
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-586.S
@@ -0,0 +1,1380 @@
+.file "sha1-586.s"
+.text
+.globl sha1_block_data_order
+.type sha1_block_data_order,@function
+.align 16
+sha1_block_data_order:
+.L_sha1_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ movl 24(%esp),%esi
+ movl 28(%esp),%eax
+ subl $76,%esp
+ shll $6,%eax
+ addl %esi,%eax
+ movl %eax,104(%esp)
+ movl 16(%ebp),%edi
+ jmp .L000loop
+.align 16
+.L000loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %edx,12(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %edx,28(%esp)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edx,44(%esp)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,48(%esp)
+ movl %ebx,52(%esp)
+ movl %ecx,56(%esp)
+ movl %edx,60(%esp)
+ movl %esi,100(%esp)
+ movl (%ebp),%eax
+ movl 4(%ebp),%ebx
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+
+ movl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl (%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 4(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 8(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 12(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 16(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 20(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 24(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 28(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 32(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 36(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 40(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 44(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 48(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 52(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 56(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 60(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 8(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 32(%esp),%ebx
+ andl %edx,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ xorl %esi,%ebp
+ addl %ebp,%eax
+ movl %ecx,%ebp
+ rorl $2,%edx
+ movl %ebx,(%esp)
+ roll $5,%ebp
+ leal 1518500249(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edi,%ebp
+ xorl 36(%esp),%eax
+ andl %ecx,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ xorl %edi,%ebp
+ addl %ebp,%esi
+ movl %ebx,%ebp
+ rorl $2,%ecx
+ movl %eax,4(%esp)
+ roll $5,%ebp
+ leal 1518500249(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 40(%esp),%esi
+ andl %ebx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ xorl %edx,%ebp
+ addl %ebp,%edi
+ movl %eax,%ebp
+ rorl $2,%ebx
+ movl %esi,8(%esp)
+ roll $5,%ebp
+ leal 1518500249(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 44(%esp),%edi
+ andl %eax,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ xorl %ecx,%ebp
+ addl %ebp,%edx
+ movl %esi,%ebp
+ rorl $2,%eax
+ movl %edi,12(%esp)
+ roll $5,%ebp
+ leal 1518500249(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,52(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,56(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,60(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl (%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 8(%esp),%edx
+ xorl %eax,%ebp
+ xorl 32(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 4(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 56(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,4(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 8(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 16(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,8(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 12(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl (%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,12(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 16(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 24(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 4(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,16(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 20(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 28(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 8(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,20(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 24(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 32(%esp),%edx
+ xorl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 12(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,24(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 28(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 16(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,28(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 32(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl (%esp),%ebx
+ andl %edx,%ebp
+ xorl 20(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,32(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 36(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl 4(%esp),%eax
+ andl %ecx,%ebp
+ xorl 24(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,36(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 40(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 8(%esp),%esi
+ andl %ebx,%ebp
+ xorl 28(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,40(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 44(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 12(%esp),%edi
+ andl %eax,%ebp
+ xorl 32(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,44(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 48(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 16(%esp),%edx
+ andl %esi,%ebp
+ xorl 36(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,48(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 52(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 20(%esp),%ecx
+ andl %edi,%ebp
+ xorl 40(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,52(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 56(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl (%esp),%ebx
+ xorl %esi,%ebp
+ xorl 24(%esp),%ebx
+ andl %edx,%ebp
+ xorl 44(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,56(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 60(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 4(%esp),%eax
+ xorl %edi,%ebp
+ xorl 28(%esp),%eax
+ andl %ecx,%ebp
+ xorl 48(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,60(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl (%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 8(%esp),%esi
+ xorl %edx,%ebp
+ xorl 32(%esp),%esi
+ andl %ebx,%ebp
+ xorl 52(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 4(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 12(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 36(%esp),%edi
+ andl %eax,%ebp
+ xorl 56(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,4(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 8(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 16(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 40(%esp),%edx
+ andl %esi,%ebp
+ xorl 60(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,8(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 12(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 20(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 44(%esp),%ecx
+ andl %edi,%ebp
+ xorl (%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,12(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 16(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 24(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 48(%esp),%ebx
+ andl %edx,%ebp
+ xorl 4(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,16(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 20(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 28(%esp),%eax
+ xorl %edi,%ebp
+ xorl 52(%esp),%eax
+ andl %ecx,%ebp
+ xorl 8(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,20(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 24(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 32(%esp),%esi
+ xorl %edx,%ebp
+ xorl 56(%esp),%esi
+ andl %ebx,%ebp
+ xorl 12(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,24(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 28(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 36(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 60(%esp),%edi
+ andl %eax,%ebp
+ xorl 16(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,28(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 32(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 40(%esp),%edx
+ xorl %ebx,%ebp
+ xorl (%esp),%edx
+ andl %esi,%ebp
+ xorl 20(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,32(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 36(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 44(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 4(%esp),%ecx
+ andl %edi,%ebp
+ xorl 24(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,36(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 40(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 48(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 8(%esp),%ebx
+ andl %edx,%ebp
+ xorl 28(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,40(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 44(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 52(%esp),%eax
+ xorl %edi,%ebp
+ xorl 12(%esp),%eax
+ andl %ecx,%ebp
+ xorl 32(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,44(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 48(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 56(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 36(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,48(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 52(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 60(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,52(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 56(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl (%esp),%edx
+ xorl %eax,%ebp
+ xorl 24(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,56(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 60(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 4(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 48(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,60(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 8(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 32(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edi,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,4(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 16(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%esi
+ xorl %edx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,8(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 20(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edi
+ xorl %ecx,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,12(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ leal 3395469782(%edi,%edx,1),%edi
+ addl %ebp,%edi
+ movl 96(%esp),%ebp
+ movl 100(%esp),%edx
+ addl (%ebp),%edi
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%eax
+ addl 12(%ebp),%ebx
+ addl 16(%ebp),%ecx
+ movl %edi,(%ebp)
+ addl $64,%edx
+ movl %esi,4(%ebp)
+ cmpl 104(%esp),%edx
+ movl %eax,8(%ebp)
+ movl %ecx,%edi
+ movl %ebx,12(%ebp)
+ movl %edx,%esi
+ movl %ecx,16(%ebp)
+ jb .L000loop
+ addl $76,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/main/openssl/crypto/sha/asm/sha1-586.pl b/main/openssl/crypto/sha/asm/sha1-586.pl
index a1f87628..2b119ffa 100644
--- a/main/openssl/crypto/sha/asm/sha1-586.pl
+++ b/main/openssl/crypto/sha/asm/sha1-586.pl
@@ -12,6 +12,8 @@
# commentary below], and in 2006 the rest was rewritten in order to
# gain freedom to liberate licensing terms.
+# January, September 2004.
+#
# It was noted that Intel IA-32 C compiler generates code which
# performs ~30% *faster* on P4 CPU than original *hand-coded*
# SHA1 assembler implementation. To address this problem (and
@@ -31,12 +33,92 @@
# ----------------------------------------------------------------
# <appro@fy.chalmers.se>
+# August 2009.
+#
+# George Spelvin has pointed out that F_40_59(b,c,d) can be rewritten
+# as '(c&d) + (b&(c^d))', which makes it possible to accumulate partial
+# results and lighten the "pressure" on scratch registers. This
+# resulted in a >12% performance improvement on contemporary AMD cores
+# (with no degradation on other CPUs:-). The code was also revised to
+# maximize the "distance" between instructions producing input to the
+# 'lea' instruction and the 'lea' instruction itself, which is
+# essential for the Intel Atom core and resulted in a ~15% improvement.
+
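+# (Editor's aside - a minimal sanity check of that identity in plain
+# Perl, illustrative only and not part of the original patch. The two
+# addends can never carry into the same bit position, because c&d is
+# set only where c and d agree while b&(c^d) is set only where they
+# differ, so the addition degenerates to a bitwise OR, i.e. the
+# majority function F_40_59:
+#
+#	for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
+#	    my $maj = ($b & $c) | ($b & $d) | ($c & $d);	# F_40_59
+#	    die "mismatch" if $maj != ($c & $d) + ($b & ($c ^ $d));
+#	}}}
+#
+# Exhausting all single-bit inputs suffices; the identity then holds
+# bitwise for 32-bit words.)
+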
+# October 2010.
+#
+# Add an SSSE3, Supplemental[!] SSE3, implementation. The idea behind
+# it is to offload the message schedule, denoted by Wt in the NIST
+# specification (Xupdate in the OpenSSL source), to the SIMD unit. The
+# idea is not novel, and in an SSE2 context it was first explored by
+# Dean Gaudet in 2004, see http://arctic.org/~dean/crypto/sha1.html.
+# Since then several things have changed that made it interesting
+# again:
+#
+# a) XMM units became faster and wider;
+# b) instruction set became more versatile;
+# c) an important observation was made by Max Locktyukhin, which made
+# it possible to reduce the number of instructions required to
+# perform the operation in question; for further details see
+# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
+
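+# (Editor's aside: the schedule being offloaded is just the standard
+# SHA-1 recurrence W[t] = ROL1(W[t-3]^W[t-8]^W[t-14]^W[t-16]) - shown
+# here as a hedged scalar Perl sketch, not code from this patch, with
+# @W holding W[t-16..t-1] in $W[0..15]:
+#
+#	sub Xupdate {
+#	    my @W = @_;				# sixteen 32-bit words
+#	    my $x = $W[13] ^ $W[8] ^ $W[2] ^ $W[0];
+#	    return (($x << 1) | ($x >> 31)) & 0xffffffff;   # ROL(x,1)
+#	}
+#
+# The SIMD paths below evaluate four consecutive W[t] per step.)
+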
+# April 2011.
+#
+# Add an AVX code path, probably the most controversial one... The
+# thing is that the switch to AVX alone improves performance by as
+# little as 4% in comparison to the SSSE3 code path. But the result
+# below doesn't look like a 4% improvement... The trouble is that
+# Sandy Bridge decodes 'ro[rl]' as a pair of µ-ops, and it's the
+# additional µ-ops, two per round, that make it run slower than Core2
+# and Westmere. 'sh[rl]d', however, is decoded as a single µ-op by
+# Sandy Bridge, and it's replacing 'ro[rl]' with the equivalent
+# 'sh[rl]d' that is responsible for the impressive 5.1 cycles per
+# processed byte. But 'sh[rl]d' has not traditionally been fast, nor
+# does it appear to be fast in the upcoming Bulldozer [according to its
+# optimization manual]. Which is why the AVX code path is guarded by
+# *both* the AVX bit and a synthetic bit denoting Intel CPUs. One can
+# argue that this is unfair to AMD, but without 'sh[rl]d' it makes no
+# sense to keep the AVX code path. If somebody feels strongly about it,
+# it's probably more appropriate to discuss the possibility of using
+# the vector rotate XOP on AMD...
+
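+# (Editor's aside, illustrating the substitution in question: for a
+# 32-bit register the two instructions below are architecturally
+# equivalent, which is what lets the AVX path trade one for the other -
+#
+#	roll	$5,%eax			# eax = ROL(eax,5)
+#	shldl	$5,%eax,%eax		# same result, single µ-op on SNB
+#
+# the per-µ-op decoding claims are restated from the paragraph above,
+# not independently verified.)
+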
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+# x86 SSSE3 AVX
+# Pentium 15.7 -
+# PIII 11.5 -
+# P4 10.6 -
+# AMD K8 7.1 -
+# Core2 7.3 6.1/+20% -
+# Atom 12.5 9.5(*)/+32% -
+# Westmere 7.3 5.6/+30% -
+# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
+#
+# (*)	Loop is 1056 instructions long and the expected result is ~8.25.
+#	It remains a mystery [to me] why ILP is limited to 1.7.
+#
+# (**)	As per the comment above, the result is for AVX *plus* sh[rl]d.
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
+$xmm=$ymm=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+$ymm=1 if ($xmm &&
+ `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19); # first version supporting AVX
+
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.03); # first version supporting AVX
+
+&external_label("OPENSSL_ia32cap_P") if ($xmm);
+
+
$A="eax";
$B="ebx";
$C="ecx";
@@ -47,6 +129,10 @@ $tmp1="ebp";
@V=($A,$B,$C,$D,$E,$T);
+$alt=0; # 1 denotes alternative IALU implementation, which performs
+ # 8% *worse* on P4, same on Westmere and Atom, 2% better on
+ # Sandy Bridge...
+
sub BODY_00_15
{
local($n,$a,$b,$c,$d,$e,$f)=@_;
@@ -59,16 +145,18 @@ sub BODY_00_15
&rotl($tmp1,5); # tmp1=ROTATE(a,5)
&xor($f,$d);
&add($tmp1,$e); # tmp1+=e;
- &and($f,$b);
- &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
+ &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
# with xi, also note that e becomes
# f in next round...
- &xor($f,$d); # f holds F_00_19(b,c,d)
+ &and($f,$b);
&rotr($b,2); # b=ROTATE(b,30)
- &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
+ &xor($f,$d); # f holds F_00_19(b,c,d)
+ &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
- if ($n==15) { &add($f,$tmp1); } # f+=tmp1
+ if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
+ &add($f,$tmp1); } # f+=tmp1
else { &add($tmp1,$f); } # f becomes a in next round
+ &mov($tmp1,$a) if ($alt && $n==15);
}
sub BODY_16_19
@@ -77,22 +165,41 @@ sub BODY_16_19
&comment("16_19 $n");
- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
- &xor($f,&swtmp(($n+2)%16));
- &xor($tmp1,$d);
- &xor($f,&swtmp(($n+8)%16));
- &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d)
- &rotr($b,2); # b=ROTATE(b,30)
+if ($alt) {
+ &xor($c,$d);
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d
+ &xor($f,&swtmp(($n+8)%16));
+ &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+ &add($e,$tmp1); # e+=F_00_19(b,c,d)
+ &xor($c,$d); # restore $c
+ &mov($tmp1,$a); # b in next round
+ &rotr($b,$n==16?2:7); # b=ROTATE(b,30)
+ &mov(&swtmp($n%16),$f); # xi=f
+ &rotl($a,5); # ROTATE(a,5)
+ &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
+ &add($f,$a); # f+=ROTATE(a,5)
+} else {
+ &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &xor($tmp1,$d);
+ &xor($f,&swtmp(($n+8)%16));
+ &and($tmp1,$b);
&xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
&rotl($f,1); # f=ROTATE(f,1)
&xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
- &mov(&swtmp($n%16),$f); # xi=f
- &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
- &mov($e,$a); # e becomes volatile
- &rotl($e,5); # e=ROTATE(a,5)
- &add($f,$tmp1); # f+=F_00_19(b,c,d)
- &add($f,$e); # f+=ROTATE(a,5)
+ &add($e,$tmp1); # e+=F_00_19(b,c,d)
+ &mov($tmp1,$a);
+ &rotr($b,2); # b=ROTATE(b,30)
+ &mov(&swtmp($n%16),$f); # xi=f
+ &rotl($tmp1,5); # ROTATE(a,5)
+ &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
+ &add($f,$tmp1); # f+=ROTATE(a,5)
+}
}
sub BODY_20_39
@@ -102,21 +209,41 @@ sub BODY_20_39
&comment("20_39 $n");
+if ($alt) {
+ &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
+ &xor($f,&swtmp(($n+8)%16));
+ &add($e,$tmp1); # e+=F_20_39(b,c,d)
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+ &mov($tmp1,$a); # b in next round
+ &rotr($b,7); # b=ROTATE(b,30)
+ &mov(&swtmp($n%16),$f) if($n<77);# xi=f
+ &rotl($a,5); # ROTATE(a,5)
+ &xor($b,$c) if($n==39);# warm up for BODY_40_59
+ &and($tmp1,$b) if($n==39);
+ &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
+ &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
+ &add($f,$a); # f+=ROTATE(a,5)
+ &rotr($a,5) if ($n==79);
+} else {
&mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &rotr($b,2); # b=ROTATE(b,30)
- &xor($f,&swtmp(($n+2)%16));
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
&xor($tmp1,$c);
&xor($f,&swtmp(($n+8)%16));
&xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
&xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
&rotl($f,1); # f=ROTATE(f,1)
- &add($tmp1,$e);
- &mov(&swtmp($n%16),$f); # xi=f
- &mov($e,$a); # e becomes volatile
- &rotl($e,5); # e=ROTATE(a,5)
- &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e
- &add($f,$e); # f+=ROTATE(a,5)
+ &add($e,$tmp1); # e+=F_20_39(b,c,d)
+ &rotr($b,2); # b=ROTATE(b,30)
+ &mov($tmp1,$a);
+ &rotl($tmp1,5); # ROTATE(a,5)
+ &mov(&swtmp($n%16),$f) if($n<77);# xi=f
+ &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
+ &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
+ &add($f,$tmp1); # f+=ROTATE(a,5)
+}
}
sub BODY_40_59
@@ -125,41 +252,86 @@ sub BODY_40_59
&comment("40_59 $n");
- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &mov($tmp1,&swtmp(($n+2)%16));
- &xor($f,$tmp1);
- &mov($tmp1,&swtmp(($n+8)%16));
- &xor($f,$tmp1);
- &mov($tmp1,&swtmp(($n+13)%16));
- &xor($f,$tmp1); # f holds xa^xb^xc^xd
- &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d)
+if ($alt) {
+ &add($e,$tmp1); # e+=b&(c^d)
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &mov($tmp1,$d);
+ &xor($f,&swtmp(($n+8)%16));
+ &xor($c,$d); # restore $c
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
&rotl($f,1); # f=ROTATE(f,1)
- &or($tmp1,$c);
- &mov(&swtmp($n%16),$f); # xi=f
- &and($tmp1,$d);
- &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
- &mov($e,$b); # e becomes volatile and is used
- # to calculate F_40_59(b,c,d)
+ &and($tmp1,$c);
+ &rotr($b,7); # b=ROTATE(b,30)
+ &add($e,$tmp1); # e+=c&d
+ &mov($tmp1,$a); # b in next round
+ &mov(&swtmp($n%16),$f); # xi=f
+ &rotl($a,5); # ROTATE(a,5)
+ &xor($b,$c) if ($n<59);
+ &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d)
+ &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
+ &add($f,$a); # f+=ROTATE(a,5)
+} else {
+ &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d)
+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &xor($tmp1,$d);
+ &xor($f,&swtmp(($n+8)%16));
+ &and($tmp1,$b);
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+ &add($tmp1,$e); # b&(c^d)+=e
&rotr($b,2); # b=ROTATE(b,30)
- &and($e,$c);
- &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d)
- &mov($e,$a);
- &rotl($e,5); # e=ROTATE(a,5)
- &add($f,$tmp1); # f+=tmp1;
+ &mov($e,$a); # e becomes volatile
+ &rotl($e,5); # ROTATE(a,5)
+ &mov(&swtmp($n%16),$f); # xi=f
+ &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
+ &mov($tmp1,$c);
&add($f,$e); # f+=ROTATE(a,5)
+ &and($tmp1,$d);
+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
+ &add($f,$tmp1); # f+=c&d
+}
}
&function_begin("sha1_block_data_order");
+if ($xmm) {
+ &static_label("ssse3_shortcut");
+ &static_label("avx_shortcut") if ($ymm);
+ &static_label("K_XX_XX");
+
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tmp1);
+ &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+
+ &mov ($A,&DWP(0,$T));
+ &mov ($D,&DWP(4,$T));
+ &test ($D,1<<9); # check SSSE3 bit
+ &jz (&label("x86"));
+ &test ($A,1<<24); # check FXSR bit
+ &jz (&label("x86"));
+ if ($ymm) {
+ &and ($D,1<<28); # mask AVX bit
+ &and ($A,1<<30); # mask "Intel CPU" bit
+ &or ($A,$D);
+ &cmp ($A,1<<28|1<<30);
+ &je (&label("avx_shortcut"));
+ }
+ &jmp (&label("ssse3_shortcut"));
+ &set_label("x86",16);
+}
&mov($tmp1,&wparam(0)); # SHA_CTX *c
&mov($T,&wparam(1)); # const void *input
&mov($A,&wparam(2)); # size_t num
- &stack_push(16); # allocate X[16]
+ &stack_push(16+3); # allocate X[16]
&shl($A,6);
&add($A,$T);
&mov(&wparam(2),$A); # pointer beyond the end of input
&mov($E,&DWP(16,$tmp1));# pre-load E
+ &jmp(&label("loop"));
- &set_label("loop",16);
+&set_label("loop",16);
# copy input chunk to X, but reversing byte order!
for ($i=0; $i<16; $i+=4)
@@ -213,8 +385,845 @@ sub BODY_40_59
&mov(&DWP(16,$tmp1),$C);
&jb(&label("loop"));
- &stack_pop(16);
+ &stack_pop(16+3);
&function_end("sha1_block_data_order");
+
+if ($xmm) {
+######################################################################
+# The SSSE3 implementation.
+#
+# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of the
+# last 32 elements of the message schedule or Xupdate outputs. The
+# first 4 quadruples are simply byte-swapped input; the next 4 are
+# calculated according to the method originally suggested by Dean
+# Gaudet (modulo being implemented in SSSE3). Once 8 quadruples or 32
+# elements are collected, it switches to the routine proposed by Max
+# Locktyukhin.
+#
+# Calculations inevitably require temporary registers, and there are
+# no %xmm registers left to spare. For this reason part of the ring
+# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
+# buffer on the stack. Keep in mind that X[2] aliases X[-6], X[3]
+# aliases X[-5], and X[4] aliases X[-4]...
+#
+# Another notable optimization is aggressive stack frame compression,
+# which aims to minimize the number of 9-byte instructions...
+#
+# Yet another notable optimization is the "jumping" $B variable: no
+# register is permanently allocated for the $B value. This made it
+# possible to eliminate one instruction from body_20_39...
+#
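+# (Editor's aside on the aliasing above: @X is indexed modulo 8 via a
+# bitwise AND, so negative "logical" indices wrap onto the same slots.
+# A one-line illustrative check, not part of the patch:
+#
+#	print join(",", map { $_ & 7 } (-6, -5, -4));	# prints 2,3,4
+#
+# hence X[-6], X[-5] and X[-4] land on ring slots 2, 3 and 4, the ones
+# offloaded to the stack.)
+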
+my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0; # hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+&function_begin("_sha1_block_data_order_ssse3");
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tmp1);
+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("ssse3_shortcut");
+
+ &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19
+ &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39
+ &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59
+ &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79
+ &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask
+
+ &mov ($E,&wparam(0)); # load argument block
+ &mov ($inp=@T[1],&wparam(1));
+ &mov ($D,&wparam(2));
+ &mov (@T[0],"esp");
+
+ # stack frame layout
+ #
+ # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
+ # X[4]+K X[5]+K X[6]+K X[7]+K
+ # X[8]+K X[9]+K X[10]+K X[11]+K
+ # X[12]+K X[13]+K X[14]+K X[15]+K
+ #
+ # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
+ # X[4] X[5] X[6] X[7]
+ # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
+ #
+ # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
+ # K_40_59 K_40_59 K_40_59 K_40_59
+ # K_60_79 K_60_79 K_60_79 K_60_79
+ # K_00_19 K_00_19 K_00_19 K_00_19
+ # pbswap mask
+ #
+ # +192 ctx # argument block
+ # +196 inp
+ # +200 end
+ # +204 esp
+ &sub ("esp",208);
+ &and ("esp",-64);
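+	# (editor's note, inferred from the layout map above rather than
+	# stated by the patch: 208 = the 192 bytes of xfer/backtrace/
+	# constant area plus the 16-byte ctx/inp/end/esp save block at
+	# +192..+204, and and(-64) rounds %esp down to a 64-byte boundary
+	# so the aligned movdqa accesses below are safe.)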
+
+ &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants
+ &movdqa (&QWP(112+16,"esp"),@X[5]);
+ &movdqa (&QWP(112+32,"esp"),@X[6]);
+ &shl ($D,6); # len*64
+ &movdqa (&QWP(112+48,"esp"),@X[3]);
+ &add ($D,$inp); # end of input
+ &movdqa (&QWP(112+64,"esp"),@X[2]);
+ &add ($inp,64);
+ &mov (&DWP(192+0,"esp"),$E); # save argument block
+ &mov (&DWP(192+4,"esp"),$inp);
+ &mov (&DWP(192+8,"esp"),$D);
+ &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
+
+ &mov ($A,&DWP(0,$E)); # load context
+ &mov ($B,&DWP(4,$E));
+ &mov ($C,&DWP(8,$E));
+ &mov ($D,&DWP(12,$E));
+ &mov ($E,&DWP(16,$E));
+ &mov (@T[0],$B); # magic seed
+
+ &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
+ &movdqu (@X[-3&7],&QWP(-48,$inp));
+ &movdqu (@X[-2&7],&QWP(-32,$inp));
+ &movdqu (@X[-1&7],&QWP(-16,$inp));
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &pshufb (@X[-3&7],@X[2]);
+ &pshufb (@X[-2&7],@X[2]);
+ &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
+ &pshufb (@X[-1&7],@X[2]);
+ &paddd (@X[-4&7],@X[3]); # add K_00_19
+ &paddd (@X[-3&7],@X[3]);
+ &paddd (@X[-2&7],@X[3]);
+ &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU
+ &psubd (@X[-4&7],@X[3]); # restore X[]
+ &movdqa (&QWP(0+16,"esp"),@X[-3&7]);
+ &psubd (@X[-3&7],@X[3]);
+ &movdqa (&QWP(0+32,"esp"),@X[-2&7]);
+ &psubd (@X[-2&7],@X[3]);
+ &movdqa (@X[0],@X[-3&7]);
+ &jmp (&label("loop"));
+
+######################################################################
+# The SSE instruction sequence is first broken into groups of
+# independent instructions, independent with respect to their inputs
+# and the shifter (not all architectures have more than one). Then
+# IALU instructions are "knitted in" between the SSE groups. Distance
+# is maintained for an SSE latency of 2, in the hope that it suits the
+# upcoming AMD Bulldozer [which allegedly also implements SSSE3]
+# better...
+#
+# Temporary register usage. X[2] is volatile at the entry and at the
+# end is restored from the backtrace ring buffer. X[3] is expected to
+# contain the current K_XX_XX constant and is used to calculate X[-1]+K
+# from the previous round; it becomes volatile the moment the value is
+# saved to the stack for transfer to the IALU. X[4] becomes volatile
+# whenever X[-4] is accumulated and offloaded to the backtrace ring
+# buffer; at the end it is loaded with the next K_XX_XX [which becomes
+# X[3] in the next round]...
+#
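+# (Editor's aside on the knitting mechanism, inferred from the code
+# below: each body_xx_yy call returns a list of instruction *strings*,
+# four rounds' worth are gathered into @insns, and every
+# eval(shift(@insns)) emits exactly one IALU instruction between SIMD
+# ones, e.g.
+#
+#	my @insns = (&$body,&$body,&$body,&$body);  # collect, don't emit
+#	&pxor	(@X[0],@X[2]);			    # SIMD instruction
+#	eval(shift(@insns));			    # one knitted IALU op
+#
+# which is how the "distance" described above is maintained.)
+#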
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &movdqa (@X[2],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@X[3],@X[-1&7]);
+ &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@X[2],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@X[4],@X[0]);
+ &movdqa (@X[2],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@X[4],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@X[2],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@X[3],@X[4]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@X[4],30);
+ &por (@X[0],@X[2]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@X[3],2);
+ &pxor (@X[0],@X[4]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
+ &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ if ($Xi%5) {
+ &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+ }
+ &paddd (@X[3],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@X[2],@X[0]);
+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@X[2],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@X[2]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@X[3],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@X[3],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &mov ($inp=@T[1],&DWP(192+4,"esp"));
+ &cmp ($inp,&DWP(192+8,"esp"));
+ &je (&label("done"));
+
+ &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
+ &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
+ &movdqu (@X[-4&7],&QWP(0,$inp)); # load input
+ &movdqu (@X[-3&7],&QWP(16,$inp));
+ &movdqu (@X[-2&7],&QWP(32,$inp));
+ &movdqu (@X[-1&7],&QWP(48,$inp));
+ &add ($inp,64);
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &mov (&DWP(192+4,"esp"),$inp);
+ &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@X[3]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
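+# (Editor's aside on two idioms above, inferred from the comments
+# rather than stated outright: the round function uses the standard
+# identity Ch(b,c,d) = (b&c)|(~b&d) = ((c^d)&b)^d, which needs only one
+# temporary and explains the xor/and/xor dance on $c and @T[0]; and the
+# $j?7:2 rotate apparently folds two steps into one - the register that
+# plays $b in the next round still carries the <<<5 applied when it was
+# $a, so >>>7 = >>>5 + >>>2 both undoes it and applies the usual
+# b>>>2.)
+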
+sub body_20_39 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub body_40_59 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+&set_label("loop",16);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+ &mov (@T[1],&DWP(192,"esp")); # update context
+ &add ($A,&DWP(0,@T[1]));
+ &add (@T[0],&DWP(4,@T[1])); # $b
+ &add ($C,&DWP(8,@T[1]));
+ &mov (&DWP(0,@T[1]),$A);
+ &add ($D,&DWP(12,@T[1]));
+ &mov (&DWP(4,@T[1]),@T[0]);
+ &add ($E,&DWP(16,@T[1]));
+ &mov (&DWP(8,@T[1]),$C);
+ &mov ($B,@T[0]);
+ &mov (&DWP(12,@T[1]),$D);
+ &mov (&DWP(16,@T[1]),$E);
+ &movdqa (@X[0],@X[-3&7]);
+
+ &jmp (&label("loop"));
+
+&set_label("done",16); $j=$saved_j; @V=@saved_V;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+ &mov (@T[1],&DWP(192,"esp")); # update context
+ &add ($A,&DWP(0,@T[1]));
+ &mov ("esp",&DWP(192+12,"esp")); # restore %esp
+ &add (@T[0],&DWP(4,@T[1])); # $b
+ &add ($C,&DWP(8,@T[1]));
+ &mov (&DWP(0,@T[1]),$A);
+ &add ($D,&DWP(12,@T[1]));
+ &mov (&DWP(4,@T[1]),@T[0]);
+ &add ($E,&DWP(16,@T[1]));
+ &mov (&DWP(8,@T[1]),$C);
+ &mov (&DWP(12,@T[1]),$D);
+ &mov (&DWP(16,@T[1]),$E);
+
+&function_end("_sha1_block_data_order_ssse3");
+
+if ($ymm) {
+my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0; # hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+&function_begin("_sha1_block_data_order_avx");
+ &call (&label("pic_point")); # make it PIC!
+ &set_label("pic_point");
+ &blindpop($tmp1);
+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("avx_shortcut");
+ &vzeroall();
+
+ &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19
+ &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39
+ &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59
+ &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79
+ &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask
+
+ &mov ($E,&wparam(0)); # load argument block
+ &mov ($inp=@T[1],&wparam(1));
+ &mov ($D,&wparam(2));
+ &mov (@T[0],"esp");
+
+ # stack frame layout
+ #
+ # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
+ # X[4]+K X[5]+K X[6]+K X[7]+K
+ # X[8]+K X[9]+K X[10]+K X[11]+K
+ # X[12]+K X[13]+K X[14]+K X[15]+K
+ #
+ # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
+ # X[4] X[5] X[6] X[7]
+ # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
+ #
+ # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
+ # K_40_59 K_40_59 K_40_59 K_40_59
+ # K_60_79 K_60_79 K_60_79 K_60_79
+ # K_00_19 K_00_19 K_00_19 K_00_19
+ # pbswap mask
+ #
+ # +192 ctx # argument block
+ # +196 inp
+ # +200 end
+ # +204 esp
+ &sub ("esp",208);
+ &and ("esp",-64);
+
+ &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants
+ &vmovdqa(&QWP(112+16,"esp"),@X[5]);
+ &vmovdqa(&QWP(112+32,"esp"),@X[6]);
+ &shl ($D,6); # len*64
+ &vmovdqa(&QWP(112+48,"esp"),@X[3]);
+ &add ($D,$inp); # end of input
+ &vmovdqa(&QWP(112+64,"esp"),@X[2]);
+ &add ($inp,64);
+ &mov (&DWP(192+0,"esp"),$E); # save argument block
+ &mov (&DWP(192+4,"esp"),$inp);
+ &mov (&DWP(192+8,"esp"),$D);
+ &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
+
+ &mov ($A,&DWP(0,$E)); # load context
+ &mov ($B,&DWP(4,$E));
+ &mov ($C,&DWP(8,$E));
+ &mov ($D,&DWP(12,$E));
+ &mov ($E,&DWP(16,$E));
+ &mov (@T[0],$B); # magic seed
+
+ &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
+ &vmovdqu(@X[-3&7],&QWP(-48,$inp));
+ &vmovdqu(@X[-2&7],&QWP(-32,$inp));
+ &vmovdqu(@X[-1&7],&QWP(-16,$inp));
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &vpshufb(@X[-3&7],@X[-3&7],@X[2]);
+ &vpshufb(@X[-2&7],@X[-2&7],@X[2]);
+ &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
+ &vpshufb(@X[-1&7],@X[-1&7],@X[2]);
+ &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19
+ &vpaddd (@X[1],@X[-3&7],@X[3]);
+ &vpaddd (@X[2],@X[-2&7],@X[3]);
+ &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU
+ &vmovdqa(&QWP(0+16,"esp"),@X[1]);
+ &vmovdqa(&QWP(0+32,"esp"),@X[2]);
+ &jmp (&label("loop"));
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@X[3],@X[3],@X[-1&7]);
+ &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@X[2],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@X[3],@X[4],30);
+ &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@X[4],@X[4],2);
+ &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+ if ($Xi%5) {
+ &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+ }
+ &vpaddd (@X[3],@X[3],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@X[2],@X[0],30);
+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@X[3],@X[3],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &mov ($inp=@T[1],&DWP(192+4,"esp"));
+ &cmp ($inp,&DWP(192+8,"esp"));
+ &je (&label("done"));
+
+ &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
+ &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
+ &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
+ &vmovdqu(@X[-3&7],&QWP(16,$inp));
+ &vmovdqu(@X[-2&7],&QWP(32,$inp));
+ &vmovdqu(@X[-1&7],&QWP(48,$inp));
+ &add ($inp,64);
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &mov (&DWP(192+4,"esp"),$inp);
+ &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+&set_label("loop",16);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+ &mov (@T[1],&DWP(192,"esp")); # update context
+ &add ($A,&DWP(0,@T[1]));
+ &add (@T[0],&DWP(4,@T[1])); # $b
+ &add ($C,&DWP(8,@T[1]));
+ &mov (&DWP(0,@T[1]),$A);
+ &add ($D,&DWP(12,@T[1]));
+ &mov (&DWP(4,@T[1]),@T[0]);
+ &add ($E,&DWP(16,@T[1]));
+ &mov (&DWP(8,@T[1]),$C);
+ &mov ($B,@T[0]);
+ &mov (&DWP(12,@T[1]),$D);
+ &mov (&DWP(16,@T[1]),$E);
+
+ &jmp (&label("loop"));
+
+&set_label("done",16); $j=$saved_j; @V=@saved_V;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+ &vzeroall();
+
+ &mov (@T[1],&DWP(192,"esp")); # update context
+ &add ($A,&DWP(0,@T[1]));
+ &mov ("esp",&DWP(192+12,"esp")); # restore %esp
+ &add (@T[0],&DWP(4,@T[1])); # $b
+ &add ($C,&DWP(8,@T[1]));
+ &mov (&DWP(0,@T[1]),$A);
+ &add ($D,&DWP(12,@T[1]));
+ &mov (&DWP(4,@T[1]),@T[0]);
+ &add ($E,&DWP(16,@T[1]));
+ &mov (&DWP(8,@T[1]),$C);
+ &mov (&DWP(12,@T[1]),$D);
+ &mov (&DWP(16,@T[1]),$E);
+&function_end("_sha1_block_data_order_avx");
+}
+&set_label("K_XX_XX",64);
+&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
+&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
+&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
+&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
+}
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
diff --git a/main/openssl/crypto/sha/asm/sha1-alpha.pl b/main/openssl/crypto/sha/asm/sha1-alpha.pl
new file mode 100644
index 00000000..6c4b9251
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-alpha.pl
@@ -0,0 +1,322 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for Alpha.
+
+# On 21264 performance is 33% better than code generated by the vendor
+# compiler, 75% better than GCC [3.4], and in absolute terms 8.7 cycles
+# per processed byte. The implementation features a vectorized byte
+# swap, but not a vectorized Xupdate.
+
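+# (Editor's sketch of the scalar operation being vectorized, in Perl
+# and illustrative only: two big-endian words are fetched into one
+# 64-bit register and byte-swapped together; per 32-bit word the
+# effect is
+#
+#	sub bswap32 {
+#	    my $x = shift;
+#	    return (($x & 0xff) << 24) | (($x & 0xff00) << 8)
+#	         | (($x >> 8) & 0xff00) | (($x >> 24) & 0xff);
+#	}
+#
+# which the sll/srl/zapnot sequences below perform on both halves of a
+# quadword at once.)
+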
+@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
+ "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
+$ctx="a0"; # $16
+$inp="a1";
+$num="a2";
+$A="a3";
+$B="a4"; # 20
+$C="a5";
+$D="t8";
+$E="t9"; @V=($A,$B,$C,$D,$E);
+$t0="t10"; # 24
+$t1="t11";
+$t2="ra";
+$t3="t12";
+$K="AT"; # 28
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+ ldq_u @X[0],0+0($inp)
+ ldq_u @X[1],0+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<14);
+ ldq_u @X[$i+2],($i+2)*4+0($inp)
+ ldq_u @X[$i+3],($i+2)*4+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<15);
+ extql @X[$i],$inp,@X[$i]
+ extqh @X[$i+1],$inp,@X[$i+1]
+
+ or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
+
+ srl @X[$i],24,$t0 # vectorized byte swap
+ srl @X[$i],8,$t2
+
+ sll @X[$i],8,$t3
+ sll @X[$i],24,@X[$i]
+ zapnot $t0,0x11,$t0
+ zapnot $t2,0x22,$t2
+
+ zapnot @X[$i],0x88,@X[$i]
+ or $t0,$t2,$t0
+ zapnot $t3,0x44,$t3
+ sll $a,5,$t1
+
+ or @X[$i],$t0,@X[$i]
+ addl $K,$e,$e
+ and $b,$c,$t2
+ zapnot $a,0xf,$a
+
+ or @X[$i],$t3,@X[$i]
+ srl $a,27,$t0
+ bic $d,$b,$t3
+ sll $b,30,$b
+
+ extll @X[$i],4,@X[$i+1] # extract upper half
+ or $t2,$t3,$t2
+ addl @X[$i],$e,$e
+
+ addl $t1,$e,$e
+ srl $b,32,$t3
+ zapnot @X[$i],0xf,@X[$i]
+
+ addl $t0,$e,$e
+ addl $t2,$e,$e
+ or $t3,$b,$b
+___
+$code.=<<___ if (($i&1) && $i<15);
+ sll $a,5,$t1
+ addl $K,$e,$e
+ and $b,$c,$t2
+ zapnot $a,0xf,$a
+
+ srl $a,27,$t0
+ addl @X[$i%16],$e,$e
+ bic $d,$b,$t3
+ sll $b,30,$b
+
+ or $t2,$t3,$t2
+ addl $t1,$e,$e
+ srl $b,32,$t3
+ zapnot @X[$i],0xf,@X[$i]
+
+ addl $t0,$e,$e
+ addl $t2,$e,$e
+ or $t3,$b,$b
+___
+$code.=<<___ if ($i>=15); # with forward Xupdate
+ sll $a,5,$t1
+ addl $K,$e,$e
+ and $b,$c,$t2
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ zapnot $a,0xf,$a
+ addl @X[$i%16],$e,$e
+ bic $d,$b,$t3
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ srl $a,27,$t0
+ addl $t1,$e,$e
+ or $t2,$t3,$t2
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$b
+ addl $t0,$e,$e
+ srl @X[$j%16],31,$t1
+
+ addl $t2,$e,$e
+ srl $b,32,$t3
+ addl @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ zapnot @X[$i%16],0xf,@X[$i%16]
+ or $t1,@X[$j%16],@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79); # with forward Xupdate
+ sll $a,5,$t1
+ addl $K,$e,$e
+ zapnot $a,0xf,$a
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$t3
+ addl $t1,$e,$e
+ xor $b,$c,$t2
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ srl $b,2,$b
+ addl @X[$i%16],$e,$e
+ xor $d,$t2,$t2
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ srl @X[$j%16],31,$t1
+ addl $t2,$e,$e
+ srl $a,27,$t0
+ addl @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ addl $t0,$e,$e
+ or $t1,@X[$j%16],@X[$j%16]
+___
+$code.=<<___ if ($i<77);
+ zapnot @X[$i%16],0xf,@X[$i%16]
+___
+$code.=<<___ if ($i==79); # with context fetch
+ sll $a,5,$t1
+ addl $K,$e,$e
+ zapnot $a,0xf,$a
+ ldl @X[0],0($ctx)
+
+ sll $b,30,$t3
+ addl $t1,$e,$e
+ xor $b,$c,$t2
+ ldl @X[1],4($ctx)
+
+ srl $b,2,$b
+ addl @X[$i%16],$e,$e
+ xor $d,$t2,$t2
+ ldl @X[2],8($ctx)
+
+ srl $a,27,$t0
+ addl $t2,$e,$e
+ ldl @X[3],12($ctx)
+
+ or $t3,$b,$b
+ addl $t0,$e,$e
+ ldl @X[4],16($ctx)
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___; # with forward Xupdate
+ sll $a,5,$t1
+ addl $K,$e,$e
+ zapnot $a,0xf,$a
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+ srl $a,27,$t0
+ and $b,$c,$t2
+ and $b,$d,$t3
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+ sll $b,30,$b
+ addl $t1,$e,$e
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+ srl @X[$j%16],31,$t1
+ addl $t0,$e,$e
+ or $t2,$t3,$t2
+ and $c,$d,$t3
+
+ or $t2,$t3,$t2
+ srl $b,32,$t3
+ addl @X[$i%16],$e,$e
+ addl @X[$j%16],@X[$j%16],@X[$j%16]
+
+ or $t3,$b,$b
+ addl $t2,$e,$e
+ or $t1,@X[$j%16],@X[$j%16]
+ zapnot @X[$i%16],0xf,@X[$i%16]
+___
+}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.globl sha1_block_data_order
+.align 5
+.ent sha1_block_data_order
+sha1_block_data_order:
+ lda sp,-64(sp)
+ stq ra,0(sp)
+ stq s0,8(sp)
+ stq s1,16(sp)
+ stq s2,24(sp)
+ stq s3,32(sp)
+ stq s4,40(sp)
+ stq s5,48(sp)
+ stq fp,56(sp)
+ .mask 0x0400fe00,-64
+ .frame sp,64,ra
+ .prologue 0
+
+ ldl $A,0($ctx)
+ ldl $B,4($ctx)
+ sll $num,6,$num
+ ldl $C,8($ctx)
+ ldl $D,12($ctx)
+ ldl $E,16($ctx)
+ addq $inp,$num,$num
+
+.Lloop:
+ .set noreorder
+ ldah $K,23170(zero)
+ zapnot $B,0xf,$B
+ lda $K,31129($K) # K_00_19
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldah $K,28378(zero)
+ lda $K,-5215($K) # K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldah $K,-28900(zero)
+ lda $K,-17188($K) # K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ ldah $K,-13725(zero)
+ lda $K,-15914($K) # K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ addl @X[0],$A,$A
+ addl @X[1],$B,$B
+ addl @X[2],$C,$C
+ addl @X[3],$D,$D
+ addl @X[4],$E,$E
+ stl $A,0($ctx)
+ stl $B,4($ctx)
+ addq $inp,64,$inp
+ stl $C,8($ctx)
+ stl $D,12($ctx)
+ stl $E,16($ctx)
+ cmpult $inp,$num,$t1
+ bne $t1,.Lloop
+
+ .set noreorder
+ ldq ra,0(sp)
+ ldq s0,8(sp)
+ ldq s1,16(sp)
+ ldq s2,24(sp)
+ ldq s3,32(sp)
+ ldq s4,40(sp)
+ ldq s5,48(sp)
+ ldq fp,56(sp)
+ lda sp,64(sp)
+ ret (ra)
+.end sha1_block_data_order
+.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/main/openssl/crypto/sha/asm/sha1-armv4-large.pl b/main/openssl/crypto/sha/asm/sha1-armv4-large.pl
index 79e3f613..33da3e0e 100644
--- a/main/openssl/crypto/sha/asm/sha1-armv4-large.pl
+++ b/main/openssl/crypto/sha/asm/sha1-armv4-large.pl
@@ -47,6 +47,10 @@
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in a
+# 10% improvement on the Cortex A8 core and 12.2 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -76,31 +80,41 @@ $code.=<<___;
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
- eor $t2,$t2,$t3
+ eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
+ str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
- str $t0,[$Xi,#-4]!
___
}
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
- ldrb $t0,[$inp],#4
- ldrb $t1,[$inp,#-1]
- ldrb $t2,[$inp,#-2]
+#if __ARM_ARCH__<7
+ ldrb $t1,[$inp,#2]
+ ldrb $t0,[$inp,#3]
+ ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
- ldrb $t3,[$inp,#-3]
+ ldrb $t3,[$inp],#4
+ orr $t0,$t0,$t1,lsl#8
+ eor $t1,$c,$d @ F_xx_xx
+ orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
- orr $t0,$t1,$t0,lsl#24
+ orr $t0,$t0,$t3,lsl#24
+#else
+ ldr $t0,[$inp],#4 @ handles unaligned
+ add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
- orr $t0,$t0,$t2,lsl#8
- orr $t0,$t0,$t3,lsl#16
+ add $e,$e,$a,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev $t0,$t0 @ byte swap
+#endif
+#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
@@ -136,6 +150,8 @@ ___
}
$code=<<___;
+#include "arm_arch.h"
+
.text
.global sha1_block_data_order
@@ -161,7 +177,7 @@ for($i=0;$i<5;$i++) {
$code.=<<___;
teq $Xi,sp
bne .L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#5*4
+ sub sp,sp,#25*4
___
&BODY_00_15(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
@@ -171,7 +187,6 @@ ___
$code.=<<___;
ldr $K,.LK_20_39 @ [+15+16*4]
- sub sp,sp,#20*4
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
@@ -210,10 +225,14 @@ $code.=<<___;
teq $inp,$len
bne .Lloop @ [+18], total 1307
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
diff --git a/main/openssl/crypto/sha/asm/sha1-armv4-large.s b/main/openssl/crypto/sha/asm/sha1-armv4-large.s
index 7f687d9f..639ae78a 100644
--- a/main/openssl/crypto/sha/asm/sha1-armv4-large.s
+++ b/main/openssl/crypto/sha/asm/sha1-armv4-large.s
@@ -1,3 +1,5 @@
+#include "arm_arch.h"
+
.text
.global sha1_block_data_order
@@ -16,76 +18,126 @@ sha1_block_data_order:
mov r6,r6,ror#30
mov r7,r7,ror#30 @ [6]
.L_00_15:
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r4,r10,ror#2
add r7,r7,r9 @ E+=X[i]
eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r7,r7,r10 @ E+=F_00_19(B,C,D)
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r6,r8,r6,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r4,r5 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r6,r8,r6,ror#2 @ E+=K_00_19
eor r10,r4,r5 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r3,r10,ror#2
add r6,r6,r9 @ E+=X[i]
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r6,r6,r10 @ E+=F_00_19(B,C,D)
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r5,r8,r5,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r3,r4 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r5,r8,r5,ror#2 @ E+=K_00_19
eor r10,r3,r4 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r7,r10,ror#2
add r5,r5,r9 @ E+=X[i]
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r5,r5,r10 @ E+=F_00_19(B,C,D)
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r4,r8,r4,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r7,r3 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r4,r8,r4,ror#2 @ E+=K_00_19
eor r10,r7,r3 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r6,r10,ror#2
add r4,r4,r9 @ E+=X[i]
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
str r9,[r14,#-4]!
add r4,r4,r10 @ E+=F_00_19(B,C,D)
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r3,r8,r3,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r6,r7 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r3,r8,r3,ror#2 @ E+=K_00_19
eor r10,r6,r7 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r5,r10,ror#2
add r3,r3,r9 @ E+=X[i]
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
@@ -93,17 +145,27 @@ sha1_block_data_order:
add r3,r3,r10 @ E+=F_00_19(B,C,D)
teq r14,sp
bne .L_00_15 @ [((11+4)*5+2)*3]
- sub sp,sp,#5*4
- ldrb r9,[r1],#4
- ldrb r10,[r1,#-1]
- ldrb r11,[r1,#-2]
+ sub sp,sp,#25*4
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
add r7,r8,r7,ror#2 @ E+=K_00_19
- ldrb r12,[r1,#-3]
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
- orr r9,r10,r9,lsl#24
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
eor r10,r5,r6 @ F_xx_xx
- orr r9,r9,r11,lsl#8
- orr r9,r9,r12,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
and r10,r4,r10,ror#2
add r7,r7,r9 @ E+=X[i]
eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
@@ -115,15 +177,15 @@ sha1_block_data_order:
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r3,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r6,r6,r9 @ E+=X[i]
- str r9,[r14,#-4]!
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
add r6,r6,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
@@ -132,15 +194,15 @@ sha1_block_data_order:
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r7,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r5,r5,r9 @ E+=X[i]
- str r9,[r14,#-4]!
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
add r5,r5,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
@@ -149,15 +211,15 @@ sha1_block_data_order:
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r6,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r4,r4,r9 @ E+=X[i]
- str r9,[r14,#-4]!
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
add r4,r4,r10 @ E+=F_00_19(B,C,D)
ldr r9,[r14,#15*4]
@@ -166,20 +228,19 @@ sha1_block_data_order:
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r5,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r3,r3,r9 @ E+=X[i]
- str r9,[r14,#-4]!
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
add r3,r3,r10 @ E+=F_00_19(B,C,D)
ldr r8,.LK_20_39 @ [+15+16*4]
- sub sp,sp,#20*4
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
ldr r9,[r14,#15*4]
@@ -188,15 +249,15 @@ sha1_block_data_order:
add r7,r8,r7,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r5,r6 @ F_xx_xx
mov r9,r9,ror#31
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r4,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r7,r7,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r7,r7,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
@@ -204,15 +265,15 @@ sha1_block_data_order:
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r3,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r6,r6,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r6,r6,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
@@ -220,15 +281,15 @@ sha1_block_data_order:
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r7,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r5,r5,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r5,r5,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
@@ -236,15 +297,15 @@ sha1_block_data_order:
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r6,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r4,r4,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r4,r4,r10 @ E+=F_20_39(B,C,D)
ldr r9,[r14,#15*4]
ldr r10,[r14,#13*4]
@@ -252,15 +313,15 @@ sha1_block_data_order:
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
eor r10,r5,r10,ror#2 @ F_xx_xx
@ F_xx_xx
add r3,r3,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r3,r3,r10 @ E+=F_20_39(B,C,D)
teq r14,sp @ preserve carry
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
@@ -275,15 +336,15 @@ sha1_block_data_order:
add r7,r8,r7,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r5,r6 @ F_xx_xx
mov r9,r9,ror#31
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r4,r10,ror#2 @ F_xx_xx
and r11,r5,r6 @ F_xx_xx
add r7,r7,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r7,r7,r10 @ E+=F_40_59(B,C,D)
add r7,r7,r11,ror#2
ldr r9,[r14,#15*4]
@@ -292,15 +353,15 @@ sha1_block_data_order:
add r6,r8,r6,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r4,r5 @ F_xx_xx
mov r9,r9,ror#31
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r3,r10,ror#2 @ F_xx_xx
and r11,r4,r5 @ F_xx_xx
add r6,r6,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r6,r6,r10 @ E+=F_40_59(B,C,D)
add r6,r6,r11,ror#2
ldr r9,[r14,#15*4]
@@ -309,15 +370,15 @@ sha1_block_data_order:
add r5,r8,r5,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r3,r4 @ F_xx_xx
mov r9,r9,ror#31
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r7,r10,ror#2 @ F_xx_xx
and r11,r3,r4 @ F_xx_xx
add r5,r5,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r5,r5,r10 @ E+=F_40_59(B,C,D)
add r5,r5,r11,ror#2
ldr r9,[r14,#15*4]
@@ -326,15 +387,15 @@ sha1_block_data_order:
add r4,r8,r4,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r7,r3 @ F_xx_xx
mov r9,r9,ror#31
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r6,r10,ror#2 @ F_xx_xx
and r11,r7,r3 @ F_xx_xx
add r4,r4,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r4,r4,r10 @ E+=F_40_59(B,C,D)
add r4,r4,r11,ror#2
ldr r9,[r14,#15*4]
@@ -343,15 +404,15 @@ sha1_block_data_order:
add r3,r8,r3,ror#2 @ E+=K_xx_xx
ldr r12,[r14,#2*4]
eor r9,r9,r10
- eor r11,r11,r12
+ eor r11,r11,r12 @ 1 cycle stall
eor r10,r6,r7 @ F_xx_xx
mov r9,r9,ror#31
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
and r10,r5,r10,ror#2 @ F_xx_xx
and r11,r6,r7 @ F_xx_xx
add r3,r3,r9 @ E+=X[i]
- str r9,[r14,#-4]!
add r3,r3,r10 @ E+=F_40_59(B,C,D)
add r3,r3,r11,ror#2
teq r14,sp
@@ -373,10 +434,14 @@ sha1_block_data_order:
teq r1,r2
bne .Lloop @ [+18], total 1307
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
diff --git a/main/openssl/crypto/sha/asm/sha1-ia64.pl b/main/openssl/crypto/sha/asm/sha1-ia64.pl
index 51c4f47e..02d35d16 100644
--- a/main/openssl/crypto/sha/asm/sha1-ia64.pl
+++ b/main/openssl/crypto/sha/asm/sha1-ia64.pl
@@ -15,7 +15,7 @@
# is >50% better than HP C and >2x better than gcc.
$code=<<___;
-.ident \"sha1-ia64.s, version 1.2\"
+.ident \"sha1-ia64.s, version 1.3\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit
@@ -26,14 +26,10 @@ if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
-for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
- $big_endian=0 if (/\-DL_ENDIAN/); }
-if (!defined($big_endian))
- { $big_endian=(unpack('L',pack('N',1))==1); }
#$human=1;
if ($human) { # useful for visual code auditing...
- ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T");
+ ($A,$B,$C,$D,$E) = ("A","B","C","D","E");
($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
( "K_00_19","K_20_39","K_40_59","K_60_79" );
@@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing...
"X8", "X9","X10","X11","X12","X13","X14","X15" );
}
else {
- ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5");
- ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
+ ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
+ ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
- ( "r14", "r15", "loc11", "loc12" );
+ ( "r14", "r15", "loc10", "loc11" );
@X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
}
sub BODY_00_15 {
local *code=shift;
-local ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $Xn=@X[$j%16];
$code.=<<___ if ($i==0);
-{ .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB
+{ .mmi; ld1 $X[$i]=[inp],2 // MSB
ld1 tmp2=[tmp3],2 };;
{ .mmi; ld1 tmp0=[inp],2
ld1 tmp4=[tmp3],2 // LSB
- dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };;
+ dep $X[$i]=$X[$i],tmp2,8,8 };;
___
if ($i<15) {
$code.=<<___;
-{ .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1
+{ .mmi; ld1 $Xn=[inp],2 // forward Xload
+ nop.m 0x0
dep tmp1=tmp0,tmp4,8,8 };;
-{ .mmi; ld1 tmp2=[tmp3],2 // +1
+{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
and tmp4=$c,$b
- dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;;
-{ .mmi; andcm tmp1=$d,$b
- add tmp0=$e,$K_00_19
+ dep $X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
+ andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
-{ .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19
+{ .mmi; add $e=$e,$X[$i] // e+=Xload
+ or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 };; // a>>27
-{ .mmi; ld1 tmp0=[inp],2 // +1
- add $f=$f,tmp4 // f+=F_00_19(b,c,d)
+{ .mmi; ld1 tmp0=[inp],2 // forward Xload
+ add $e=$e,tmp4 // e+=F_00_19(b,c,d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mmi; ld1 tmp4=[tmp3],2 // +1
+{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
or tmp5=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp5 // f+=ROTATE(a,5)
- dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1
- mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;;
+{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
+ dep $Xn=$Xn,tmp2,8,8 // forward Xload
+ mux2 $X[$i]=$X[$i],0x44 } //;;
___
}
@@ -89,24 +88,24 @@ else {
$code.=<<___;
{ .mii; and tmp3=$c,$b
dep tmp1=tmp0,tmp4,8,8;;
- dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;;
-{ .mmi; andcm tmp1=$d,$b
- add tmp0=$e,$K_00_19
+ dep $X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
+ andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
-{ .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19
+{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
+ or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
+{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
+ xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
-{ .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
+{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
+ xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
- mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };;
+{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
+ shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+ mux2 $X[$i]=$X[$i],0x44 };;
___
}
@@ -114,27 +113,28 @@ ___
sub BODY_16_19 {
local *code=shift;
-local ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $Xn=@X[$j%16];
$code.=<<___;
-{ .mmi; mov $X[$i&0xf]=$f // Xupdate
- and tmp0=$c,$b
+{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
dep.z tmp5=$a,5,27 } // a<<5
-{ .mmi; andcm tmp1=$d,$b
- add tmp4=$e,$K_00_19 };;
-{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
- add $f=$f,tmp4 // f+=e+K_00_19
+{ .mib; andcm tmp1=$d,$b
+ and tmp0=$c,$b };;
+{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
+ or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
+{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
+ xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
-{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
+{ .mmi; add $e=$e,tmp0 // e+=F_00_19(b,c,d)
+ xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
+ shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
@@ -142,49 +142,47 @@ ___
sub BODY_20_39 {
local *code=shift;
-local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
+my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
$Konst = $K_20_39 if (!defined($Konst));
+my $j=$i+1;
+my $Xn=@X[$j%16];
if ($i<79) {
$code.=<<___;
-{ .mib; mov $X[$i&0xf]=$f // Xupdate
+{ .mib; add $e=$e,$Konst // e+=K_XX_XX
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
- add tmp4=$e,$Konst };;
-{ .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
- add $f=$f,tmp4 // f+=e+K_20_39
+ xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
+{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
- nop.i 0 };;
-{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
- xor tmp2=tmp2,tmp3 // +1
+{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
+ xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
+{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
+ xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5)
- shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
+ shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
}
else {
$code.=<<___;
-{ .mib; mov $X[$i&0xf]=$f // Xupdate
+{ .mib; add $e=$e,$Konst // e+=K_60_79
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
- add tmp4=$e,$Konst };;
-{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
- extr.u tmp1=$a,27,5 } // a>>27
-{ .mib; add $f=$f,tmp4 // f+=e+K_20_39
add $h1=$h1,$a };; // wrap up
-{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d)
- shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;?
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
+{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
+ extr.u tmp1=$a,27,5 } // a>>27
+{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
add $h3=$h3,$c };; // wrap up
-{ .mib; add tmp3=1,inp // used in unaligned codepath
- add $f=$f,tmp1 } // f+=ROTATE(a,5)
-{ .mib; add $h2=$h2,$b // wrap up
+{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
+ or tmp1=tmp1,tmp5 // ROTATE(a,5)
+ shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
+{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
+ add tmp3=1,inp // used in unaligned codepath
add $h4=$h4,$d };; // wrap up
___
@@ -193,29 +191,29 @@ ___
sub BODY_40_59 {
local *code=shift;
-local ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+my $Xn=@X[$j%16];
$code.=<<___;
-{ .mmi; mov $X[$i&0xf]=$f // Xupdate
- and tmp0=$c,$b
+{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
dep.z tmp5=$a,5,27 } // a<<5
-{ .mmi; and tmp1=$d,$b
- add tmp4=$e,$K_40_59 };;
-{ .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d)
- add $f=$f,tmp4 // f+=e+K_40_59
+{ .mib; and tmp1=$c,$d
+ xor tmp0=$c,$d };;
+{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
+ add tmp5=tmp5,tmp1 // a<<5+(c&d)
extr.u tmp1=$a,27,5 } // a>>27
-{ .mmi; and tmp4=$c,$d
- xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1
- xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
- };;
-{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
- xor tmp2=tmp2,tmp3 // +1
+{ .mmi; and tmp0=tmp0,$b
+ xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
+ xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
+{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
+ add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
-{ .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
+{ .mmi; xor $Xn=$Xn,tmp3
mux2 tmp6=$a,0x44 };; // see b in next iteration
-{ .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d)
- shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
- add $f=$f,tmp1 };; // f+=ROTATE(a,5)
+{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
+ shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+ nop.i 0x0 };;
___
}
@@ -237,7 +235,7 @@ inp=r33; // in1
.align 32
sha1_block_data_order:
.prologue
-{ .mmi; alloc tmp1=ar.pfs,3,15,0,0
+{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
$ADDP tmp0=4,ctx
.save ar.lc,r3
mov r3=ar.lc }
@@ -245,8 +243,8 @@ sha1_block_data_order:
$ADDP inp=0,inp
mov r2=pr };;
tmp4=in2;
-tmp5=loc13;
-tmp6=loc14;
+tmp5=loc12;
+tmp6=loc13;
.body
{ .mlx; ld4 $h0=[ctx],8
movl $K_00_19=0x5a827999 }
@@ -273,7 +271,8 @@ tmp6=loc14;
___
-{ my $i,@V=($A,$B,$C,$D,$E,$T);
+{ my $i;
+ my @V=($A,$B,$C,$D,$E);
for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
@@ -281,12 +280,12 @@ ___
for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
- (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check
+ (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
}
$code.=<<___;
-{ .mmb; add $h0=$h0,$E
- nop.m 0
+{ .mmb; add $h0=$h0,$A
+ add $h2=$h2,$C
br.ctop.dptk.many .Ldtop };;
.Ldend:
{ .mmi; add tmp0=4,ctx
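
The driver loop just above emits each round body with `&BODY_xx(\$code,$i,@V)` and then rotates the register roles with `unshift(@V,pop(@V))` instead of moving data between registers. Since 80 rounds are exactly 16 full rotations of a 5-element list, the array ends in its starting order, which is what the updated assertion `(($V[0] eq $A) and ($V[4] eq $E)) or die` verifies now that the sixth variable $T is gone. A minimal sketch of the mechanism:

use strict; use warnings;
my @V = qw(A B C D E);          # the five working variables a..e
for my $i (0 .. 79) {
    # a round body would be emitted here with roles (a,b,c,d,e) = @V;
    # rotating the list renames the registers instead of moving data:
    unshift(@V, pop(@V));       # (a,b,c,d,e) <- (e,a,b,c,d)
}
# 80 rounds == 16 full rotations of a 5-element list
die "rotation broken" unless $V[0] eq 'A' and $V[4] eq 'E';
print "@V\n";                   # prints: A B C D E
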
diff --git a/main/openssl/crypto/sha/asm/sha1-mips.S b/main/openssl/crypto/sha/asm/sha1-mips.S
new file mode 100644
index 00000000..865da255
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-mips.S
@@ -0,0 +1,1664 @@
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.align 5
+.globl sha1_block_data_order
+.ent sha1_block_data_order
+sha1_block_data_order:
+ .frame $29,16*4,$31
+ .mask 3237937152,-4
+ .set noreorder
+ sub $29,16*4
+ sw $31,(16-1)*4($29)
+ sw $30,(16-2)*4($29)
+ sw $23,(16-3)*4($29)
+ sw $22,(16-4)*4($29)
+ sw $21,(16-5)*4($29)
+ sw $20,(16-6)*4($29)
+ sw $19,(16-7)*4($29)
+ sw $18,(16-8)*4($29)
+ sw $17,(16-9)*4($29)
+ sw $16,(16-10)*4($29)
+ sll $6,6
+ add $6,$5
+ sw $6,0($29)
+ lw $1,0($4)
+ lw $2,4($4)
+ lw $3,8($4)
+ lw $7,12($4)
+ b .Loop
+ lw $24,16($4)
+.align 4
+.Loop:
+ .set reorder
+ lwl $8,3($5)
+ lui $31,0x5a82
+ lwr $8,0($5)
+ ori $31,0x7999 # K_00_19
+ srl $25,$8,24 # byte swap(0)
+ srl $6,$8,8
+ andi $30,$8,0xFF00
+ sll $8,$8,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $8,$25
+ or $6,$30
+ or $8,$6
+ lwl $9,1*4+3($5)
+ sll $25,$1,5 # 0
+ addu $24,$31
+ lwr $9,1*4+0($5)
+ srl $6,$1,27
+ addu $24,$25
+ xor $25,$3,$7
+ addu $24,$6
+ sll $30,$2,30
+ and $25,$2
+ srl $2,$2,2
+ xor $25,$7
+ addu $24,$8
+ or $2,$30
+ addu $24,$25
+ srl $25,$9,24 # byte swap(1)
+ srl $6,$9,8
+ andi $30,$9,0xFF00
+ sll $9,$9,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $9,$25
+ or $6,$30
+ or $9,$6
+ lwl $10,2*4+3($5)
+ sll $25,$24,5 # 1
+ addu $7,$31
+ lwr $10,2*4+0($5)
+ srl $6,$24,27
+ addu $7,$25
+ xor $25,$2,$3
+ addu $7,$6
+ sll $30,$1,30
+ and $25,$1
+ srl $1,$1,2
+ xor $25,$3
+ addu $7,$9
+ or $1,$30
+ addu $7,$25
+ srl $25,$10,24 # byte swap(2)
+ srl $6,$10,8
+ andi $30,$10,0xFF00
+ sll $10,$10,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $10,$25
+ or $6,$30
+ or $10,$6
+ lwl $11,3*4+3($5)
+ sll $25,$7,5 # 2
+ addu $3,$31
+ lwr $11,3*4+0($5)
+ srl $6,$7,27
+ addu $3,$25
+ xor $25,$1,$2
+ addu $3,$6
+ sll $30,$24,30
+ and $25,$24
+ srl $24,$24,2
+ xor $25,$2
+ addu $3,$10
+ or $24,$30
+ addu $3,$25
+ srl $25,$11,24 # byte swap(3)
+ srl $6,$11,8
+ andi $30,$11,0xFF00
+ sll $11,$11,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $11,$25
+ or $6,$30
+ or $11,$6
+ lwl $12,4*4+3($5)
+ sll $25,$3,5 # 3
+ addu $2,$31
+ lwr $12,4*4+0($5)
+ srl $6,$3,27
+ addu $2,$25
+ xor $25,$24,$1
+ addu $2,$6
+ sll $30,$7,30
+ and $25,$7
+ srl $7,$7,2
+ xor $25,$1
+ addu $2,$11
+ or $7,$30
+ addu $2,$25
+ srl $25,$12,24 # byte swap(4)
+ srl $6,$12,8
+ andi $30,$12,0xFF00
+ sll $12,$12,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $12,$25
+ or $6,$30
+ or $12,$6
+ lwl $13,5*4+3($5)
+ sll $25,$2,5 # 4
+ addu $1,$31
+ lwr $13,5*4+0($5)
+ srl $6,$2,27
+ addu $1,$25
+ xor $25,$7,$24
+ addu $1,$6
+ sll $30,$3,30
+ and $25,$3
+ srl $3,$3,2
+ xor $25,$24
+ addu $1,$12
+ or $3,$30
+ addu $1,$25
+ srl $25,$13,24 # byte swap(5)
+ srl $6,$13,8
+ andi $30,$13,0xFF00
+ sll $13,$13,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $13,$25
+ or $6,$30
+ or $13,$6
+ lwl $14,6*4+3($5)
+ sll $25,$1,5 # 5
+ addu $24,$31
+ lwr $14,6*4+0($5)
+ srl $6,$1,27
+ addu $24,$25
+ xor $25,$3,$7
+ addu $24,$6
+ sll $30,$2,30
+ and $25,$2
+ srl $2,$2,2
+ xor $25,$7
+ addu $24,$13
+ or $2,$30
+ addu $24,$25
+ srl $25,$14,24 # byte swap(6)
+ srl $6,$14,8
+ andi $30,$14,0xFF00
+ sll $14,$14,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $14,$25
+ or $6,$30
+ or $14,$6
+ lwl $15,7*4+3($5)
+ sll $25,$24,5 # 6
+ addu $7,$31
+ lwr $15,7*4+0($5)
+ srl $6,$24,27
+ addu $7,$25
+ xor $25,$2,$3
+ addu $7,$6
+ sll $30,$1,30
+ and $25,$1
+ srl $1,$1,2
+ xor $25,$3
+ addu $7,$14
+ or $1,$30
+ addu $7,$25
+ srl $25,$15,24 # byte swap(7)
+ srl $6,$15,8
+ andi $30,$15,0xFF00
+ sll $15,$15,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $15,$25
+ or $6,$30
+ or $15,$6
+ lwl $16,8*4+3($5)
+ sll $25,$7,5 # 7
+ addu $3,$31
+ lwr $16,8*4+0($5)
+ srl $6,$7,27
+ addu $3,$25
+ xor $25,$1,$2
+ addu $3,$6
+ sll $30,$24,30
+ and $25,$24
+ srl $24,$24,2
+ xor $25,$2
+ addu $3,$15
+ or $24,$30
+ addu $3,$25
+ srl $25,$16,24 # byte swap(8)
+ srl $6,$16,8
+ andi $30,$16,0xFF00
+ sll $16,$16,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $16,$25
+ or $6,$30
+ or $16,$6
+ lwl $17,9*4+3($5)
+ sll $25,$3,5 # 8
+ addu $2,$31
+ lwr $17,9*4+0($5)
+ srl $6,$3,27
+ addu $2,$25
+ xor $25,$24,$1
+ addu $2,$6
+ sll $30,$7,30
+ and $25,$7
+ srl $7,$7,2
+ xor $25,$1
+ addu $2,$16
+ or $7,$30
+ addu $2,$25
+ srl $25,$17,24 # byte swap(9)
+ srl $6,$17,8
+ andi $30,$17,0xFF00
+ sll $17,$17,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $17,$25
+ or $6,$30
+ or $17,$6
+ lwl $18,10*4+3($5)
+ sll $25,$2,5 # 9
+ addu $1,$31
+ lwr $18,10*4+0($5)
+ srl $6,$2,27
+ addu $1,$25
+ xor $25,$7,$24
+ addu $1,$6
+ sll $30,$3,30
+ and $25,$3
+ srl $3,$3,2
+ xor $25,$24
+ addu $1,$17
+ or $3,$30
+ addu $1,$25
+ srl $25,$18,24 # byte swap(10)
+ srl $6,$18,8
+ andi $30,$18,0xFF00
+ sll $18,$18,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $18,$25
+ or $6,$30
+ or $18,$6
+ lwl $19,11*4+3($5)
+ sll $25,$1,5 # 10
+ addu $24,$31
+ lwr $19,11*4+0($5)
+ srl $6,$1,27
+ addu $24,$25
+ xor $25,$3,$7
+ addu $24,$6
+ sll $30,$2,30
+ and $25,$2
+ srl $2,$2,2
+ xor $25,$7
+ addu $24,$18
+ or $2,$30
+ addu $24,$25
+ srl $25,$19,24 # byte swap(11)
+ srl $6,$19,8
+ andi $30,$19,0xFF00
+ sll $19,$19,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $19,$25
+ or $6,$30
+ or $19,$6
+ lwl $20,12*4+3($5)
+ sll $25,$24,5 # 11
+ addu $7,$31
+ lwr $20,12*4+0($5)
+ srl $6,$24,27
+ addu $7,$25
+ xor $25,$2,$3
+ addu $7,$6
+ sll $30,$1,30
+ and $25,$1
+ srl $1,$1,2
+ xor $25,$3
+ addu $7,$19
+ or $1,$30
+ addu $7,$25
+ srl $25,$20,24 # byte swap(12)
+ srl $6,$20,8
+ andi $30,$20,0xFF00
+ sll $20,$20,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $20,$25
+ or $6,$30
+ or $20,$6
+ lwl $21,13*4+3($5)
+ sll $25,$7,5 # 12
+ addu $3,$31
+ lwr $21,13*4+0($5)
+ srl $6,$7,27
+ addu $3,$25
+ xor $25,$1,$2
+ addu $3,$6
+ sll $30,$24,30
+ and $25,$24
+ srl $24,$24,2
+ xor $25,$2
+ addu $3,$20
+ or $24,$30
+ addu $3,$25
+ srl $25,$21,24 # byte swap(13)
+ srl $6,$21,8
+ andi $30,$21,0xFF00
+ sll $21,$21,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $21,$25
+ or $6,$30
+ or $21,$6
+ lwl $22,14*4+3($5)
+ sll $25,$3,5 # 13
+ addu $2,$31
+ lwr $22,14*4+0($5)
+ srl $6,$3,27
+ addu $2,$25
+ xor $25,$24,$1
+ addu $2,$6
+ sll $30,$7,30
+ and $25,$7
+ srl $7,$7,2
+ xor $25,$1
+ addu $2,$21
+ or $7,$30
+ addu $2,$25
+ srl $25,$22,24 # byte swap(14)
+ srl $6,$22,8
+ andi $30,$22,0xFF00
+ sll $22,$22,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $22,$25
+ or $6,$30
+ or $22,$6
+ lwl $23,15*4+3($5)
+ sll $25,$2,5 # 14
+ addu $1,$31
+ lwr $23,15*4+0($5)
+ srl $6,$2,27
+ addu $1,$25
+ xor $25,$7,$24
+ addu $1,$6
+ sll $30,$3,30
+ and $25,$3
+ srl $3,$3,2
+ xor $25,$24
+ addu $1,$22
+ or $3,$30
+ addu $1,$25
+ srl $25,$23,24 # byte swap(15)
+ srl $6,$23,8
+ andi $30,$23,0xFF00
+ sll $23,$23,24
+ andi $6,0xFF00
+ sll $30,$30,8
+ or $23,$25
+ or $23,$6
+ or $23,$30
+ xor $8,$10
+ sll $25,$1,5 # 15
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $8,$16
+ xor $25,$3,$7
+ addu $24,$6
+ xor $8,$21
+ sll $30,$2,30
+ and $25,$2
+ srl $6,$8,31
+ addu $8,$8
+ srl $2,$2,2
+ xor $25,$7
+ or $8,$6
+ addu $24,$23
+ or $2,$30
+ addu $24,$25
+ xor $9,$11
+ sll $25,$24,5 # 16
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $9,$17
+ xor $25,$2,$3
+ addu $7,$6
+ xor $9,$22
+ sll $30,$1,30
+ and $25,$1
+ srl $6,$9,31
+ addu $9,$9
+ srl $1,$1,2
+ xor $25,$3
+ or $9,$6
+ addu $7,$8
+ or $1,$30
+ addu $7,$25
+ xor $10,$12
+ sll $25,$7,5 # 17
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $10,$18
+ xor $25,$1,$2
+ addu $3,$6
+ xor $10,$23
+ sll $30,$24,30
+ and $25,$24
+ srl $6,$10,31
+ addu $10,$10
+ srl $24,$24,2
+ xor $25,$2
+ or $10,$6
+ addu $3,$9
+ or $24,$30
+ addu $3,$25
+ xor $11,$13
+ sll $25,$3,5 # 18
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $11,$19
+ xor $25,$24,$1
+ addu $2,$6
+ xor $11,$8
+ sll $30,$7,30
+ and $25,$7
+ srl $6,$11,31
+ addu $11,$11
+ srl $7,$7,2
+ xor $25,$1
+ or $11,$6
+ addu $2,$10
+ or $7,$30
+ addu $2,$25
+ xor $12,$14
+ sll $25,$2,5 # 19
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $12,$20
+ xor $25,$7,$24
+ addu $1,$6
+ xor $12,$9
+ sll $30,$3,30
+ and $25,$3
+ srl $6,$12,31
+ addu $12,$12
+ srl $3,$3,2
+ xor $25,$24
+ or $12,$6
+ addu $1,$11
+ or $3,$30
+ addu $1,$25
+ lui $31,0x6ed9
+ ori $31,0xeba1 # K_20_39
+ xor $13,$15
+ sll $25,$1,5 # 20
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $13,$21
+ xor $25,$3,$7
+ addu $24,$6
+ xor $13,$10
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$13,31
+ addu $13,$13
+ srl $2,$2,2
+ addu $24,$12
+ or $13,$6
+ or $2,$30
+ addu $24,$25
+ xor $14,$16
+ sll $25,$24,5 # 21
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $14,$22
+ xor $25,$2,$3
+ addu $7,$6
+ xor $14,$11
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$14,31
+ addu $14,$14
+ srl $1,$1,2
+ addu $7,$13
+ or $14,$6
+ or $1,$30
+ addu $7,$25
+ xor $15,$17
+ sll $25,$7,5 # 22
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $15,$23
+ xor $25,$1,$2
+ addu $3,$6
+ xor $15,$12
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$15,31
+ addu $15,$15
+ srl $24,$24,2
+ addu $3,$14
+ or $15,$6
+ or $24,$30
+ addu $3,$25
+ xor $16,$18
+ sll $25,$3,5 # 23
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $16,$8
+ xor $25,$24,$1
+ addu $2,$6
+ xor $16,$13
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$16,31
+ addu $16,$16
+ srl $7,$7,2
+ addu $2,$15
+ or $16,$6
+ or $7,$30
+ addu $2,$25
+ xor $17,$19
+ sll $25,$2,5 # 24
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $17,$9
+ xor $25,$7,$24
+ addu $1,$6
+ xor $17,$14
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$17,31
+ addu $17,$17
+ srl $3,$3,2
+ addu $1,$16
+ or $17,$6
+ or $3,$30
+ addu $1,$25
+ xor $18,$20
+ sll $25,$1,5 # 25
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $18,$10
+ xor $25,$3,$7
+ addu $24,$6
+ xor $18,$15
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$18,31
+ addu $18,$18
+ srl $2,$2,2
+ addu $24,$17
+ or $18,$6
+ or $2,$30
+ addu $24,$25
+ xor $19,$21
+ sll $25,$24,5 # 26
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $19,$11
+ xor $25,$2,$3
+ addu $7,$6
+ xor $19,$16
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$19,31
+ addu $19,$19
+ srl $1,$1,2
+ addu $7,$18
+ or $19,$6
+ or $1,$30
+ addu $7,$25
+ xor $20,$22
+ sll $25,$7,5 # 27
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $20,$12
+ xor $25,$1,$2
+ addu $3,$6
+ xor $20,$17
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$20,31
+ addu $20,$20
+ srl $24,$24,2
+ addu $3,$19
+ or $20,$6
+ or $24,$30
+ addu $3,$25
+ xor $21,$23
+ sll $25,$3,5 # 28
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $21,$13
+ xor $25,$24,$1
+ addu $2,$6
+ xor $21,$18
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$21,31
+ addu $21,$21
+ srl $7,$7,2
+ addu $2,$20
+ or $21,$6
+ or $7,$30
+ addu $2,$25
+ xor $22,$8
+ sll $25,$2,5 # 29
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $22,$14
+ xor $25,$7,$24
+ addu $1,$6
+ xor $22,$19
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$22,31
+ addu $22,$22
+ srl $3,$3,2
+ addu $1,$21
+ or $22,$6
+ or $3,$30
+ addu $1,$25
+ xor $23,$9
+ sll $25,$1,5 # 30
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $23,$15
+ xor $25,$3,$7
+ addu $24,$6
+ xor $23,$20
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$23,31
+ addu $23,$23
+ srl $2,$2,2
+ addu $24,$22
+ or $23,$6
+ or $2,$30
+ addu $24,$25
+ xor $8,$10
+ sll $25,$24,5 # 31
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $8,$16
+ xor $25,$2,$3
+ addu $7,$6
+ xor $8,$21
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$8,31
+ addu $8,$8
+ srl $1,$1,2
+ addu $7,$23
+ or $8,$6
+ or $1,$30
+ addu $7,$25
+ xor $9,$11
+ sll $25,$7,5 # 32
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $9,$17
+ xor $25,$1,$2
+ addu $3,$6
+ xor $9,$22
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$9,31
+ addu $9,$9
+ srl $24,$24,2
+ addu $3,$8
+ or $9,$6
+ or $24,$30
+ addu $3,$25
+ xor $10,$12
+ sll $25,$3,5 # 33
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $10,$18
+ xor $25,$24,$1
+ addu $2,$6
+ xor $10,$23
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$10,31
+ addu $10,$10
+ srl $7,$7,2
+ addu $2,$9
+ or $10,$6
+ or $7,$30
+ addu $2,$25
+ xor $11,$13
+ sll $25,$2,5 # 34
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $11,$19
+ xor $25,$7,$24
+ addu $1,$6
+ xor $11,$8
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$11,31
+ addu $11,$11
+ srl $3,$3,2
+ addu $1,$10
+ or $11,$6
+ or $3,$30
+ addu $1,$25
+ xor $12,$14
+ sll $25,$1,5 # 35
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $12,$20
+ xor $25,$3,$7
+ addu $24,$6
+ xor $12,$9
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$12,31
+ addu $12,$12
+ srl $2,$2,2
+ addu $24,$11
+ or $12,$6
+ or $2,$30
+ addu $24,$25
+ xor $13,$15
+ sll $25,$24,5 # 36
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $13,$21
+ xor $25,$2,$3
+ addu $7,$6
+ xor $13,$10
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$13,31
+ addu $13,$13
+ srl $1,$1,2
+ addu $7,$12
+ or $13,$6
+ or $1,$30
+ addu $7,$25
+ xor $14,$16
+ sll $25,$7,5 # 37
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $14,$22
+ xor $25,$1,$2
+ addu $3,$6
+ xor $14,$11
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$14,31
+ addu $14,$14
+ srl $24,$24,2
+ addu $3,$13
+ or $14,$6
+ or $24,$30
+ addu $3,$25
+ xor $15,$17
+ sll $25,$3,5 # 38
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $15,$23
+ xor $25,$24,$1
+ addu $2,$6
+ xor $15,$12
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$15,31
+ addu $15,$15
+ srl $7,$7,2
+ addu $2,$14
+ or $15,$6
+ or $7,$30
+ addu $2,$25
+ xor $16,$18
+ sll $25,$2,5 # 39
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $16,$8
+ xor $25,$7,$24
+ addu $1,$6
+ xor $16,$13
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$16,31
+ addu $16,$16
+ srl $3,$3,2
+ addu $1,$15
+ or $16,$6
+ or $3,$30
+ addu $1,$25
+ lui $31,0x8f1b
+ ori $31,0xbcdc # K_40_59
+ xor $17,$19
+ sll $25,$1,5 # 40
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $17,$9
+ and $25,$3,$7
+ addu $24,$6
+ xor $17,$14
+ sll $30,$2,30
+ addu $24,$25
+ srl $6,$17,31
+ xor $25,$3,$7
+ addu $17,$17
+ and $25,$2
+ srl $2,$2,2
+ or $17,$6
+ addu $24,$16
+ or $2,$30
+ addu $24,$25
+ xor $18,$20
+ sll $25,$24,5 # 41
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $18,$10
+ and $25,$2,$3
+ addu $7,$6
+ xor $18,$15
+ sll $30,$1,30
+ addu $7,$25
+ srl $6,$18,31
+ xor $25,$2,$3
+ addu $18,$18
+ and $25,$1
+ srl $1,$1,2
+ or $18,$6
+ addu $7,$17
+ or $1,$30
+ addu $7,$25
+ xor $19,$21
+ sll $25,$7,5 # 42
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $19,$11
+ and $25,$1,$2
+ addu $3,$6
+ xor $19,$16
+ sll $30,$24,30
+ addu $3,$25
+ srl $6,$19,31
+ xor $25,$1,$2
+ addu $19,$19
+ and $25,$24
+ srl $24,$24,2
+ or $19,$6
+ addu $3,$18
+ or $24,$30
+ addu $3,$25
+ xor $20,$22
+ sll $25,$3,5 # 43
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $20,$12
+ and $25,$24,$1
+ addu $2,$6
+ xor $20,$17
+ sll $30,$7,30
+ addu $2,$25
+ srl $6,$20,31
+ xor $25,$24,$1
+ addu $20,$20
+ and $25,$7
+ srl $7,$7,2
+ or $20,$6
+ addu $2,$19
+ or $7,$30
+ addu $2,$25
+ xor $21,$23
+ sll $25,$2,5 # 44
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $21,$13
+ and $25,$7,$24
+ addu $1,$6
+ xor $21,$18
+ sll $30,$3,30
+ addu $1,$25
+ srl $6,$21,31
+ xor $25,$7,$24
+ addu $21,$21
+ and $25,$3
+ srl $3,$3,2
+ or $21,$6
+ addu $1,$20
+ or $3,$30
+ addu $1,$25
+ xor $22,$8
+ sll $25,$1,5 # 45
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $22,$14
+ and $25,$3,$7
+ addu $24,$6
+ xor $22,$19
+ sll $30,$2,30
+ addu $24,$25
+ srl $6,$22,31
+ xor $25,$3,$7
+ addu $22,$22
+ and $25,$2
+ srl $2,$2,2
+ or $22,$6
+ addu $24,$21
+ or $2,$30
+ addu $24,$25
+ xor $23,$9
+ sll $25,$24,5 # 46
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $23,$15
+ and $25,$2,$3
+ addu $7,$6
+ xor $23,$20
+ sll $30,$1,30
+ addu $7,$25
+ srl $6,$23,31
+ xor $25,$2,$3
+ addu $23,$23
+ and $25,$1
+ srl $1,$1,2
+ or $23,$6
+ addu $7,$22
+ or $1,$30
+ addu $7,$25
+ xor $8,$10
+ sll $25,$7,5 # 47
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $8,$16
+ and $25,$1,$2
+ addu $3,$6
+ xor $8,$21
+ sll $30,$24,30
+ addu $3,$25
+ srl $6,$8,31
+ xor $25,$1,$2
+ addu $8,$8
+ and $25,$24
+ srl $24,$24,2
+ or $8,$6
+ addu $3,$23
+ or $24,$30
+ addu $3,$25
+ xor $9,$11
+ sll $25,$3,5 # 48
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $9,$17
+ and $25,$24,$1
+ addu $2,$6
+ xor $9,$22
+ sll $30,$7,30
+ addu $2,$25
+ srl $6,$9,31
+ xor $25,$24,$1
+ addu $9,$9
+ and $25,$7
+ srl $7,$7,2
+ or $9,$6
+ addu $2,$8
+ or $7,$30
+ addu $2,$25
+ xor $10,$12
+ sll $25,$2,5 # 49
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $10,$18
+ and $25,$7,$24
+ addu $1,$6
+ xor $10,$23
+ sll $30,$3,30
+ addu $1,$25
+ srl $6,$10,31
+ xor $25,$7,$24
+ addu $10,$10
+ and $25,$3
+ srl $3,$3,2
+ or $10,$6
+ addu $1,$9
+ or $3,$30
+ addu $1,$25
+ xor $11,$13
+ sll $25,$1,5 # 50
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $11,$19
+ and $25,$3,$7
+ addu $24,$6
+ xor $11,$8
+ sll $30,$2,30
+ addu $24,$25
+ srl $6,$11,31
+ xor $25,$3,$7
+ addu $11,$11
+ and $25,$2
+ srl $2,$2,2
+ or $11,$6
+ addu $24,$10
+ or $2,$30
+ addu $24,$25
+ xor $12,$14
+ sll $25,$24,5 # 51
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $12,$20
+ and $25,$2,$3
+ addu $7,$6
+ xor $12,$9
+ sll $30,$1,30
+ addu $7,$25
+ srl $6,$12,31
+ xor $25,$2,$3
+ addu $12,$12
+ and $25,$1
+ srl $1,$1,2
+ or $12,$6
+ addu $7,$11
+ or $1,$30
+ addu $7,$25
+ xor $13,$15
+ sll $25,$7,5 # 52
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $13,$21
+ and $25,$1,$2
+ addu $3,$6
+ xor $13,$10
+ sll $30,$24,30
+ addu $3,$25
+ srl $6,$13,31
+ xor $25,$1,$2
+ addu $13,$13
+ and $25,$24
+ srl $24,$24,2
+ or $13,$6
+ addu $3,$12
+ or $24,$30
+ addu $3,$25
+ xor $14,$16
+ sll $25,$3,5 # 53
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $14,$22
+ and $25,$24,$1
+ addu $2,$6
+ xor $14,$11
+ sll $30,$7,30
+ addu $2,$25
+ srl $6,$14,31
+ xor $25,$24,$1
+ addu $14,$14
+ and $25,$7
+ srl $7,$7,2
+ or $14,$6
+ addu $2,$13
+ or $7,$30
+ addu $2,$25
+ xor $15,$17
+ sll $25,$2,5 # 54
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $15,$23
+ and $25,$7,$24
+ addu $1,$6
+ xor $15,$12
+ sll $30,$3,30
+ addu $1,$25
+ srl $6,$15,31
+ xor $25,$7,$24
+ addu $15,$15
+ and $25,$3
+ srl $3,$3,2
+ or $15,$6
+ addu $1,$14
+ or $3,$30
+ addu $1,$25
+ xor $16,$18
+ sll $25,$1,5 # 55
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $16,$8
+ and $25,$3,$7
+ addu $24,$6
+ xor $16,$13
+ sll $30,$2,30
+ addu $24,$25
+ srl $6,$16,31
+ xor $25,$3,$7
+ addu $16,$16
+ and $25,$2
+ srl $2,$2,2
+ or $16,$6
+ addu $24,$15
+ or $2,$30
+ addu $24,$25
+ xor $17,$19
+ sll $25,$24,5 # 56
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $17,$9
+ and $25,$2,$3
+ addu $7,$6
+ xor $17,$14
+ sll $30,$1,30
+ addu $7,$25
+ srl $6,$17,31
+ xor $25,$2,$3
+ addu $17,$17
+ and $25,$1
+ srl $1,$1,2
+ or $17,$6
+ addu $7,$16
+ or $1,$30
+ addu $7,$25
+ xor $18,$20
+ sll $25,$7,5 # 57
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $18,$10
+ and $25,$1,$2
+ addu $3,$6
+ xor $18,$15
+ sll $30,$24,30
+ addu $3,$25
+ srl $6,$18,31
+ xor $25,$1,$2
+ addu $18,$18
+ and $25,$24
+ srl $24,$24,2
+ or $18,$6
+ addu $3,$17
+ or $24,$30
+ addu $3,$25
+ xor $19,$21
+ sll $25,$3,5 # 58
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $19,$11
+ and $25,$24,$1
+ addu $2,$6
+ xor $19,$16
+ sll $30,$7,30
+ addu $2,$25
+ srl $6,$19,31
+ xor $25,$24,$1
+ addu $19,$19
+ and $25,$7
+ srl $7,$7,2
+ or $19,$6
+ addu $2,$18
+ or $7,$30
+ addu $2,$25
+ xor $20,$22
+ sll $25,$2,5 # 59
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $20,$12
+ and $25,$7,$24
+ addu $1,$6
+ xor $20,$17
+ sll $30,$3,30
+ addu $1,$25
+ srl $6,$20,31
+ xor $25,$7,$24
+ addu $20,$20
+ and $25,$3
+ srl $3,$3,2
+ or $20,$6
+ addu $1,$19
+ or $3,$30
+ addu $1,$25
+ lui $31,0xca62
+ ori $31,0xc1d6 # K_60_79
+ xor $21,$23
+ sll $25,$1,5 # 60
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $21,$13
+ xor $25,$3,$7
+ addu $24,$6
+ xor $21,$18
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$21,31
+ addu $21,$21
+ srl $2,$2,2
+ addu $24,$20
+ or $21,$6
+ or $2,$30
+ addu $24,$25
+ xor $22,$8
+ sll $25,$24,5 # 61
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $22,$14
+ xor $25,$2,$3
+ addu $7,$6
+ xor $22,$19
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$22,31
+ addu $22,$22
+ srl $1,$1,2
+ addu $7,$21
+ or $22,$6
+ or $1,$30
+ addu $7,$25
+ xor $23,$9
+ sll $25,$7,5 # 62
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $23,$15
+ xor $25,$1,$2
+ addu $3,$6
+ xor $23,$20
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$23,31
+ addu $23,$23
+ srl $24,$24,2
+ addu $3,$22
+ or $23,$6
+ or $24,$30
+ addu $3,$25
+ xor $8,$10
+ sll $25,$3,5 # 63
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $8,$16
+ xor $25,$24,$1
+ addu $2,$6
+ xor $8,$21
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$8,31
+ addu $8,$8
+ srl $7,$7,2
+ addu $2,$23
+ or $8,$6
+ or $7,$30
+ addu $2,$25
+ xor $9,$11
+ sll $25,$2,5 # 64
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $9,$17
+ xor $25,$7,$24
+ addu $1,$6
+ xor $9,$22
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$9,31
+ addu $9,$9
+ srl $3,$3,2
+ addu $1,$8
+ or $9,$6
+ or $3,$30
+ addu $1,$25
+ xor $10,$12
+ sll $25,$1,5 # 65
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $10,$18
+ xor $25,$3,$7
+ addu $24,$6
+ xor $10,$23
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$10,31
+ addu $10,$10
+ srl $2,$2,2
+ addu $24,$9
+ or $10,$6
+ or $2,$30
+ addu $24,$25
+ xor $11,$13
+ sll $25,$24,5 # 66
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $11,$19
+ xor $25,$2,$3
+ addu $7,$6
+ xor $11,$8
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$11,31
+ addu $11,$11
+ srl $1,$1,2
+ addu $7,$10
+ or $11,$6
+ or $1,$30
+ addu $7,$25
+ xor $12,$14
+ sll $25,$7,5 # 67
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $12,$20
+ xor $25,$1,$2
+ addu $3,$6
+ xor $12,$9
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$12,31
+ addu $12,$12
+ srl $24,$24,2
+ addu $3,$11
+ or $12,$6
+ or $24,$30
+ addu $3,$25
+ xor $13,$15
+ sll $25,$3,5 # 68
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $13,$21
+ xor $25,$24,$1
+ addu $2,$6
+ xor $13,$10
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$13,31
+ addu $13,$13
+ srl $7,$7,2
+ addu $2,$12
+ or $13,$6
+ or $7,$30
+ addu $2,$25
+ xor $14,$16
+ sll $25,$2,5 # 69
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $14,$22
+ xor $25,$7,$24
+ addu $1,$6
+ xor $14,$11
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$14,31
+ addu $14,$14
+ srl $3,$3,2
+ addu $1,$13
+ or $14,$6
+ or $3,$30
+ addu $1,$25
+ xor $15,$17
+ sll $25,$1,5 # 70
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $15,$23
+ xor $25,$3,$7
+ addu $24,$6
+ xor $15,$12
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$15,31
+ addu $15,$15
+ srl $2,$2,2
+ addu $24,$14
+ or $15,$6
+ or $2,$30
+ addu $24,$25
+ xor $16,$18
+ sll $25,$24,5 # 71
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $16,$8
+ xor $25,$2,$3
+ addu $7,$6
+ xor $16,$13
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$16,31
+ addu $16,$16
+ srl $1,$1,2
+ addu $7,$15
+ or $16,$6
+ or $1,$30
+ addu $7,$25
+ xor $17,$19
+ sll $25,$7,5 # 72
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $17,$9
+ xor $25,$1,$2
+ addu $3,$6
+ xor $17,$14
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$17,31
+ addu $17,$17
+ srl $24,$24,2
+ addu $3,$16
+ or $17,$6
+ or $24,$30
+ addu $3,$25
+ xor $18,$20
+ sll $25,$3,5 # 73
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $18,$10
+ xor $25,$24,$1
+ addu $2,$6
+ xor $18,$15
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$18,31
+ addu $18,$18
+ srl $7,$7,2
+ addu $2,$17
+ or $18,$6
+ or $7,$30
+ addu $2,$25
+ xor $19,$21
+ sll $25,$2,5 # 74
+ addu $1,$31
+ srl $6,$2,27
+ addu $1,$25
+ xor $19,$11
+ xor $25,$7,$24
+ addu $1,$6
+ xor $19,$16
+ sll $30,$3,30
+ xor $25,$3
+ srl $6,$19,31
+ addu $19,$19
+ srl $3,$3,2
+ addu $1,$18
+ or $19,$6
+ or $3,$30
+ addu $1,$25
+ xor $20,$22
+ sll $25,$1,5 # 75
+ addu $24,$31
+ srl $6,$1,27
+ addu $24,$25
+ xor $20,$12
+ xor $25,$3,$7
+ addu $24,$6
+ xor $20,$17
+ sll $30,$2,30
+ xor $25,$2
+ srl $6,$20,31
+ addu $20,$20
+ srl $2,$2,2
+ addu $24,$19
+ or $20,$6
+ or $2,$30
+ addu $24,$25
+ xor $21,$23
+ sll $25,$24,5 # 76
+ addu $7,$31
+ srl $6,$24,27
+ addu $7,$25
+ xor $21,$13
+ xor $25,$2,$3
+ addu $7,$6
+ xor $21,$18
+ sll $30,$1,30
+ xor $25,$1
+ srl $6,$21,31
+ addu $21,$21
+ srl $1,$1,2
+ addu $7,$20
+ or $21,$6
+ or $1,$30
+ addu $7,$25
+ xor $22,$8
+ sll $25,$7,5 # 77
+ addu $3,$31
+ srl $6,$7,27
+ addu $3,$25
+ xor $22,$14
+ xor $25,$1,$2
+ addu $3,$6
+ xor $22,$19
+ sll $30,$24,30
+ xor $25,$24
+ srl $6,$22,31
+ addu $22,$22
+ srl $24,$24,2
+ addu $3,$21
+ or $22,$6
+ or $24,$30
+ addu $3,$25
+ xor $23,$9
+ sll $25,$3,5 # 78
+ addu $2,$31
+ srl $6,$3,27
+ addu $2,$25
+ xor $23,$15
+ xor $25,$24,$1
+ addu $2,$6
+ xor $23,$20
+ sll $30,$7,30
+ xor $25,$7
+ srl $6,$23,31
+ addu $23,$23
+ srl $7,$7,2
+ addu $2,$22
+ or $23,$6
+ or $7,$30
+ addu $2,$25
+ lw $8,0($4)
+ sll $25,$2,5 # 79
+ addu $1,$31
+ lw $9,4($4)
+ srl $6,$2,27
+ addu $1,$25
+ lw $10,8($4)
+ xor $25,$7,$24
+ addu $1,$6
+ lw $11,12($4)
+ sll $30,$3,30
+ xor $25,$3
+ lw $12,16($4)
+ srl $3,$3,2
+ addu $1,$23
+ or $3,$30
+ addu $1,$25
+ add $5,64
+ lw $6,0($29)
+
+ addu $1,$8
+ addu $2,$9
+ sw $1,0($4)
+ addu $3,$10
+ addu $7,$11
+ sw $2,4($4)
+ addu $24,$12
+ sw $3,8($4)
+ sw $7,12($4)
+ sw $24,16($4)
+ .set noreorder
+ bne $5,$6,.Loop
+ nop
+
+ .set noreorder
+ lw $31,(16-1)*4($29)
+ lw $30,(16-2)*4($29)
+ lw $23,(16-3)*4($29)
+ lw $22,(16-4)*4($29)
+ lw $21,(16-5)*4($29)
+ lw $20,(16-6)*4($29)
+ lw $19,(16-7)*4($29)
+ lw $18,(16-8)*4($29)
+ lw $17,(16-9)*4($29)
+ lw $16,(16-10)*4($29)
+ jr $31
+ add $29,16*4
+.end sha1_block_data_order
+.rdata
+.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro@openssl.org>"
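
On little-endian targets the MIPS code above byte-swaps every input word with a fixed srl/andi/sll/or sequence (the `# byte swap(i)` blocks), since the generator deliberately stays MIPS32-compatible and uses no MIPSIII/IV-only instructions. A hedged Perl model of the same shift-and-mask sequence, checked against pack/unpack (the function name bswap32 is ours):

use strict; use warnings;
sub bswap32 {                   # mirrors the srl/andi/sll/or sequence above
    my $x  = shift;
    my $t0 =  $x >> 24;                    # srl $t0,$x,24
    my $t1 = ($x >> 8) & 0xFF00;           # srl $t1,$x,8 ; andi $t1,0xFF00
    my $t2 = ($x & 0xFF00) << 8;           # andi $t2,$x,0xFF00 ; sll $t2,$t2,8
    return (($x << 24) & 0xffffffff) | $t0 | $t1 | $t2;  # sll $x,$x,24 ; or ...
}
my $w = 0x12345678;
die "swap broken" unless bswap32($w) == unpack("V", pack("N", $w));
printf "%08x -> %08x\n", $w, bswap32($w);    # 12345678 -> 78563412

Each of the four bytes is moved to its mirror position independently and the pieces are OR-ed together, exactly as the assembly pattern does.
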
diff --git a/main/openssl/crypto/sha/asm/sha1-mips.pl b/main/openssl/crypto/sha/asm/sha1-mips.pl
new file mode 100644
index 00000000..f1a702f3
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-mips.pl
@@ -0,0 +1,354 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for MIPS.
+
+# Performance improvement is 30% on unaligned input. The "secret" is
+# to deploy the lwl/lwr pair to load unaligned input. One could have
+# vectorized Xupdate on MIPSIII/IV, but the goal was to code a MIPS32-
+# compatible subroutine. There is room for minor optimization on
+# little-endian platforms...
+
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. The following coding rules
+# facilitate interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $PTR_SLL="dsll"; # incidentally works even on n32
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $PTR_SLL="sll";
+ $SZREG=4;
+}
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+if (!defined($big_endian))
+ { $big_endian=(unpack('L',pack('N',1))==1); }
+
+# offsets of the Most and Least Significant Bytes
+$MSB=$big_endian?0:3;
+$LSB=3&~$MSB;
+
+@X=map("\$$_",(8..23)); # a4-a7,s0-s11
+
+$ctx=$a0;
+$inp=$a1;
+$num=$a2;
+$A="\$1";
+$B="\$2";
+$C="\$3";
+$D="\$7";
+$E="\$24"; @V=($A,$B,$C,$D,$E);
+$t0="\$25";
+$t1=$num; # $num is offloaded to stack
+$t2="\$30"; # fp
+$K="\$31"; # ra
+
+sub BODY_00_14 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if (!$big_endian);
+ srl $t0,@X[$i],24 # byte swap($i)
+ srl $t1,@X[$i],8
+ andi $t2,@X[$i],0xFF00
+ sll @X[$i],@X[$i],24
+ andi $t1,0xFF00
+ sll $t2,$t2,8
+ or @X[$i],$t0
+ or $t1,$t2
+ or @X[$i],$t1
+___
+$code.=<<___;
+ lwl @X[$j],$j*4+$MSB($inp)
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ lwr @X[$j],$j*4+$LSB($inp)
+ srl $t1,$a,27
+ addu $e,$t0
+ xor $t0,$c,$d
+ addu $e,$t1
+ sll $t2,$b,30
+ and $t0,$b
+ srl $b,$b,2
+ xor $t0,$d
+ addu $e,@X[$i]
+ or $b,$t2
+ addu $e,$t0
+___
+}
+
+sub BODY_15_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+
+$code.=<<___ if (!$big_endian && $i==15);
+ srl $t0,@X[$i],24 # byte swap($i)
+ srl $t1,@X[$i],8
+ andi $t2,@X[$i],0xFF00
+ sll @X[$i],@X[$i],24
+ andi $t1,0xFF00
+ sll $t2,$t2,8
+ or @X[$i],$t0
+ or @X[$i],$t1
+ or @X[$i],$t2
+___
+$code.=<<___;
+ xor @X[$j%16],@X[($j+2)%16]
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ srl $t1,$a,27
+ addu $e,$t0
+ xor @X[$j%16],@X[($j+8)%16]
+ xor $t0,$c,$d
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ sll $t2,$b,30
+ and $t0,$b
+ srl $t1,@X[$j%16],31
+ addu @X[$j%16],@X[$j%16]
+ srl $b,$b,2
+ xor $t0,$d
+ or @X[$j%16],$t1
+ addu $e,@X[$i%16]
+ or $b,$t2
+ addu $e,$t0
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+ xor @X[$j%16],@X[($j+2)%16]
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ srl $t1,$a,27
+ addu $e,$t0
+ xor @X[$j%16],@X[($j+8)%16]
+ xor $t0,$c,$d
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ sll $t2,$b,30
+ xor $t0,$b
+ srl $t1,@X[$j%16],31
+ addu @X[$j%16],@X[$j%16]
+ srl $b,$b,2
+ addu $e,@X[$i%16]
+ or @X[$j%16],$t1
+ or $b,$t2
+ addu $e,$t0
+___
+$code.=<<___ if ($i==79);
+ lw @X[0],0($ctx)
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ lw @X[1],4($ctx)
+ srl $t1,$a,27
+ addu $e,$t0
+ lw @X[2],8($ctx)
+ xor $t0,$c,$d
+ addu $e,$t1
+ lw @X[3],12($ctx)
+ sll $t2,$b,30
+ xor $t0,$b
+ lw @X[4],16($ctx)
+ srl $b,$b,2
+ addu $e,@X[$i%16]
+ or $b,$t2
+ addu $e,$t0
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+ xor @X[$j%16],@X[($j+2)%16]
+ sll $t0,$a,5 # $i
+ addu $e,$K
+ srl $t1,$a,27
+ addu $e,$t0
+ xor @X[$j%16],@X[($j+8)%16]
+ and $t0,$c,$d
+ addu $e,$t1
+ xor @X[$j%16],@X[($j+13)%16]
+ sll $t2,$b,30
+ addu $e,$t0
+ srl $t1,@X[$j%16],31
+ xor $t0,$c,$d
+ addu @X[$j%16],@X[$j%16]
+ and $t0,$b
+ srl $b,$b,2
+ or @X[$j%16],$t1
+ addu $e,@X[$i%16]
+ or $b,$t2
+ addu $e,$t0
+___
+}
+
+$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+$code=<<___;
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+.align 5
+.globl sha1_block_data_order
+.ent sha1_block_data_order
+sha1_block_data_order:
+ .frame $sp,$FRAMESIZE*$SZREG,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+ $PTR_SUB $sp,$FRAMESIZE*$SZREG
+ $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
+ $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
+ $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
+ $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
+___
+$code.=<<___;
+ $PTR_SLL $num,6
+ $PTR_ADD $num,$inp
+ $REG_S $num,0($sp)
+ lw $A,0($ctx)
+ lw $B,4($ctx)
+ lw $C,8($ctx)
+ lw $D,12($ctx)
+ b .Loop
+ lw $E,16($ctx)
+.align 4
+.Loop:
+ .set reorder
+ lwl @X[0],$MSB($inp)
+ lui $K,0x5a82
+ lwr @X[0],$LSB($inp)
+ ori $K,0x7999 # K_00_19
+___
+for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
+for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ lui $K,0x6ed9
+ ori $K,0xeba1 # K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ lui $K,0x8f1b
+ ori $K,0xbcdc # K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ lui $K,0xca62
+ ori $K,0xc1d6 # K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ $PTR_ADD $inp,64
+ $REG_L $num,0($sp)
+
+ addu $A,$X[0]
+ addu $B,$X[1]
+ sw $A,0($ctx)
+ addu $C,$X[2]
+ addu $D,$X[3]
+ sw $B,4($ctx)
+ addu $E,$X[4]
+ sw $C,8($ctx)
+ sw $D,12($ctx)
+ sw $E,16($ctx)
+ .set noreorder
+ bne $inp,$num,.Loop
+ nop
+
+ .set noreorder
+ $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
+ $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
+ $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
+ $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
+ $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
+ $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
+ $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
+ $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
+ $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
+ $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
+ $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
+ $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
+ $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
+ $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end sha1_block_data_order
+.rdata
+.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+print $code;
+close STDOUT;
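
sha1-mips.pl decides byte order at generation time: it pushes the token MIPSEL through the target compiler's preprocessor (little-endian MIPS toolchains define and thus expand it away), and keeps `(unpack('L',pack('N',1))==1)` as a fallback for when `$big_endian` is still undefined. Note that the fallback probes the build host, not the cross-compile target. A tiny standalone demo of the pack/unpack probe:

use strict; use warnings;
# pack('N',1) stores the value 1 in big-endian byte order; unpacking it
# with the native-order template 'L' reads 1 back only on a big-endian host.
my $big_endian = (unpack('L', pack('N', 1)) == 1) ? 1 : 0;
printf "this host is %s-endian\n", $big_endian ? "big" : "little";
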
diff --git a/main/openssl/crypto/sha/asm/sha1-parisc.pl b/main/openssl/crypto/sha/asm/sha1-parisc.pl
new file mode 100644
index 00000000..6e5a328a
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-parisc.pl
@@ -0,0 +1,260 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for PA-RISC.
+
+# June 2009.
+#
+# On PA-7100LC performance is >30% better than gcc 3.2 generated code
+# for aligned input and >50% better for unaligned. Compared to the
+# vendor compiler on PA-8600 it's almost 60% faster in the 64-bit build
+# and just a few percent faster in the 32-bit one (this is for aligned
+# input; data for unaligned input is not available).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
+ # [+ argument transfer]
+$ctx="%r26"; # arg0
+$inp="%r25"; # arg1
+$num="%r24"; # arg2
+
+$t0="%r28";
+$t1="%r29";
+$K="%r31";
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+ "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
+
+@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<15);
+ addl $K,$e,$e ; $i
+ shd $a,$a,27,$t1
+ addl @X[$i],$e,$e
+ and $c,$b,$t0
+ addl $t1,$e,$e
+ andcm $d,$b,$t1
+ shd $b,$b,2,$b
+ or $t1,$t0,$t0
+ addl $t0,$e,$e
+___
+$code.=<<___ if ($i>=15); # with forward Xupdate
+ addl $K,$e,$e ; $i
+ shd $a,$a,27,$t1
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+ addl @X[$i%16],$e,$e
+ and $c,$b,$t0
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+ addl $t1,$e,$e
+ andcm $d,$b,$t1
+ shd $b,$b,2,$b
+ or $t1,$t0,$t0
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+ add $t0,$e,$e
+ shd @X[$j%16],@X[$j%16],31,@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
+ addl $K,$e,$e
+ shd $a,$a,27,$t1
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+ addl @X[$i%16],$e,$e
+ xor $b,$c,$t0
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+ addl $t1,$e,$e
+ shd $b,$b,2,$b
+ xor $d,$t0,$t0
+ shd @X[$j%16],@X[$j%16],31,@X[$j%16]
+ addl $t0,$e,$e
+___
+$code.=<<___ if ($i==79); # with context load
+ ldw 0($ctx),@X[0] ; $i
+ addl $K,$e,$e
+ shd $a,$a,27,$t1
+ ldw 4($ctx),@X[1]
+ addl @X[$i%16],$e,$e
+ xor $b,$c,$t0
+ ldw 8($ctx),@X[2]
+ addl $t1,$e,$e
+ shd $b,$b,2,$b
+ xor $d,$t0,$t0
+ ldw 12($ctx),@X[3]
+ addl $t0,$e,$e
+ ldw 16($ctx),@X[4]
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;
+ shd $a,$a,27,$t1 ; $i
+ addl $K,$e,$e
+ xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
+ xor $d,$c,$t0
+ addl @X[$i%16],$e,$e
+ xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
+ and $b,$t0,$t0
+ addl $t1,$e,$e
+ shd $b,$b,2,$b
+ xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
+ addl $t0,$e,$e
+ and $d,$c,$t1
+ shd @X[$j%16],@X[$j%16],31,@X[$j%16]
+ addl $t1,$e,$e
+___
+}
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+sha1_block_data_order
+ .PROC
+ .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+
+ ldw 0($ctx),$A
+ ldw 4($ctx),$B
+ ldw 8($ctx),$C
+ ldw 12($ctx),$D
+ ldw 16($ctx),$E
+
+ extru $inp,31,2,$t0 ; t0=inp&3;
+ sh3addl $t0,%r0,$t0 ; t0*=8;
+ subi 32,$t0,$t0 ; t0=32-t0;
+ mtctl $t0,%cr11 ; %sar=t0;
+
+L\$oop
+ ldi 3,$t0
+ andcm $inp,$t0,$t0 ; 64-bit neutral
+___
+ for ($i=0;$i<15;$i++) { # load input block
+ $code.="\tldw `4*$i`($t0),@X[$i]\n"; }
+$code.=<<___;
+ cmpb,*= $inp,$t0,L\$aligned
+ ldw 60($t0),@X[15]
+ ldw 64($t0),@X[16]
+___
+ for ($i=0;$i<16;$i++) { # align input
+ $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
+$code.=<<___;
+L\$aligned
+ ldil L'0x5a827000,$K ; K_00_19
+ ldo 0x999($K),$K
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ ldil L'0x6ed9e000,$K ; K_20_39
+ ldo 0xba1($K),$K
+___
+
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ ldil L'0x8f1bb000,$K ; K_40_59
+ ldo 0xcdc($K),$K
+___
+
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ ldil L'0xca62c000,$K ; K_60_79
+ ldo 0x1d6($K),$K
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ addl @X[0],$A,$A
+ addl @X[1],$B,$B
+ addl @X[2],$C,$C
+ addl @X[3],$D,$D
+ addl @X[4],$E,$E
+ stw $A,0($ctx)
+ stw $B,4($ctx)
+ stw $C,8($ctx)
+ stw $D,12($ctx)
+ stw $E,16($ctx)
+ addib,*<> -1,$num,L\$oop
+ ldo 64($inp),$inp
+
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
+print $code;
+close STDOUT;
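
Like the other perlasm modules in this patch, sha1-parisc.pl writes frame offsets symbolically inside its here-docs and resolves them in a single pass at the end: `$code =~ s/\`([^\`]*)\`/eval $1/gem` replaces every backtick-quoted expression with its evaluated value. Backticks are ordinary characters inside a double-quotish here-doc, so they survive interpolation and can serve as eval markers. A toy reproduction of the idiom (register choice is ours; the sample values match the 64-bit flavour, 14*8+80):

use strict; use warnings;
my ($FRAME, $SIZE_T) = (192, 8);     # sample 64-bit values (14*8+80)
my $code = <<___;
	std	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	std	%r5,`-$FRAME+2*$SIZE_T`(%sp)
___
$code =~ s/\`([^\`]*)\`/eval $1/gem; # fold the arithmetic into constants
print $code;                         # offsets come out as -184 and -176
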
diff --git a/main/openssl/crypto/sha/asm/sha1-ppc.pl b/main/openssl/crypto/sha/asm/sha1-ppc.pl
index dcd0fcdf..2140dd2f 100755
--- a/main/openssl/crypto/sha/asm/sha1-ppc.pl
+++ b/main/openssl/crypto/sha/asm/sha1-ppc.pl
@@ -24,12 +24,14 @@ $flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
+ $LRSAVE =$SIZE_T;
$UCMP ="cmplw";
$STU ="stwu";
$POP ="lwz";
@@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
$K ="r0";
$sp ="r1";
@@ -162,9 +165,8 @@ $code=<<___;
.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
+ $STU $sp,-$FRAME($sp)
mflr r0
- $STU $sp,`-($FRAME+64)`($sp)
- $PUSH r0,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +184,7 @@ $code=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
@@ -192,37 +195,14 @@ $code=<<___;
Laligned:
mtctr $num
bl Lsha1_block_private
-Ldone:
- $POP r0,`$FRAME-$SIZE_T*18`($sp)
- $POP r15,`$FRAME-$SIZE_T*17`($sp)
- $POP r16,`$FRAME-$SIZE_T*16`($sp)
- $POP r17,`$FRAME-$SIZE_T*15`($sp)
- $POP r18,`$FRAME-$SIZE_T*14`($sp)
- $POP r19,`$FRAME-$SIZE_T*13`($sp)
- $POP r20,`$FRAME-$SIZE_T*12`($sp)
- $POP r21,`$FRAME-$SIZE_T*11`($sp)
- $POP r22,`$FRAME-$SIZE_T*10`($sp)
- $POP r23,`$FRAME-$SIZE_T*9`($sp)
- $POP r24,`$FRAME-$SIZE_T*8`($sp)
- $POP r25,`$FRAME-$SIZE_T*7`($sp)
- $POP r26,`$FRAME-$SIZE_T*6`($sp)
- $POP r27,`$FRAME-$SIZE_T*5`($sp)
- $POP r28,`$FRAME-$SIZE_T*4`($sp)
- $POP r29,`$FRAME-$SIZE_T*3`($sp)
- $POP r30,`$FRAME-$SIZE_T*2`($sp)
- $POP r31,`$FRAME-$SIZE_T*1`($sp)
- mtlr r0
- addi $sp,$sp,`$FRAME+64`
- blr
-___
+ b Ldone
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; The PowerPC specification allows an implementation to be ill-behaved
+; upon an unaligned access which crosses a page boundary. The "better
+; safe than sorry" principle makes me treat it specially. But I don't
+; look for the particular offending word, but rather for a 64-byte input
+; block which crosses the boundary. Once found, that block is aligned
+; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
@@ -237,7 +217,7 @@ Lunaligned:
Lcross_page:
li $t1,16
mtctr $t1
- addi r20,$sp,$FRAME ; spot below the frame
+ addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
@@ -251,15 +231,40 @@ Lmemcpy:
addi r20,r20,4
bdnz Lmemcpy
- $PUSH $inp,`$FRAME-$SIZE_T*19`($sp)
+ $PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
- addi $inp,$sp,$FRAME
+ addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
- $POP $inp,`$FRAME-$SIZE_T*19`($sp)
+ $POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne- Lunaligned
- b Ldone
+
+Ldone:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
___
# This is private block function, which uses tailored calling
@@ -309,6 +314,8 @@ $code.=<<___;
addi $inp,$inp,`16*4`
bdnz- Lsha1_block_private
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
___
$code.=<<___;
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
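The Lcross_page path above implements what the relocated comment describes: a 64-byte block that would be read with unaligned word loads across a 4096-byte page boundary is first copied (the Lmemcpy loop) into scratch space inside the frame, now at $LOCALS rather than below the frame, and hashed from there. A rough Perl restatement of the trigger condition follows; the names are illustrative only, not code from the patch:

    use constant { PAGE => 4096, BLOCK => 64 };

    # True when a word-misaligned 64-byte input block straddles a page
    # boundary, i.e. when the Lunaligned/Lcross_page path must copy it.
    sub crosses_page {
        my ($addr) = @_;
        return (($addr & 3) != 0)                      # input not word-aligned
            && (($addr & (PAGE - 1)) > PAGE - BLOCK);  # block spans two pages
    }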
diff --git a/main/openssl/crypto/sha/asm/sha1-s390x.pl b/main/openssl/crypto/sha/asm/sha1-s390x.pl
index 4b178482..9193dda4 100644
--- a/main/openssl/crypto/sha/asm/sha1-s390x.pl
+++ b/main/openssl/crypto/sha/asm/sha1-s390x.pl
@@ -21,9 +21,28 @@
# instructions to favour dual-issue z10 pipeline. On z10 hardware is
# "only" ~2.3x faster than software.
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific.
+
$kimdfunc=1; # magic function code for kimd instruction
-$output=shift;
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$K_00_39="%r0"; $K=$K_00_39;
@@ -42,13 +61,14 @@ $t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";
-$frame=160+16*4;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*4;
sub Xupdate {
my $i=shift;
$code.=<<___ if ($i==15);
- lg $prefetch,160($sp) ### Xupdate(16) warm-up
+ lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
lr $X[0],$X[2]
___
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
@@ -58,8 +78,8 @@ $code.=<<___ if ($i<16);
___
$code.=<<___ if ($i>=16);
xgr $X[0],$prefetch ### Xupdate($i)
- lg $prefetch,`160+4*(($i+2)%16)`($sp)
- xg $X[0],`160+4*(($i+8)%16)`($sp)
+ lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
+ xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
xgr $X[0],$prefetch
rll $X[0],$X[0],1
rllg $X[1],$X[0],32
@@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16);
lr $X[2],$X[1] # feedback
___
$code.=<<___ if ($i<=70);
- stg $X[0],`160+4*($i%16)`($sp)
+ stg $X[0],`$stdframe+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}
@@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc);
tmhl %r0,0x4000 # check for message-security assist
jz .Lsoftware
lghi %r0,0
- la %r1,16($sp)
+ la %r1,`2*$SIZE_T`($sp)
.long 0xb93e0002 # kimd %r0,%r2
- lg %r0,16($sp)
+ lg %r0,`2*$SIZE_T`($sp)
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
@@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc);
___
$code.=<<___;
lghi %r1,-$frame
- stg $ctx,16($sp)
- stmg %r6,%r15,48($sp)
+ st${g} $ctx,`2*$SIZE_T`($sp)
+ stm${g} %r6,%r15,`6*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
- stg %r0,0($sp)
+ st${g} %r0,0($sp)
larl $t0,Ktable
llgf $A,0($ctx)
@@ -199,7 +219,7 @@ ___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
- lg $ctx,`$frame+16`($sp)
+ l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,64($inp)
al $A,0($ctx)
al $B,4($ctx)
@@ -211,13 +231,13 @@ $code.=<<___;
st $C,8($ctx)
st $D,12($ctx)
st $E,16($ctx)
- brct $len,.Lloop
+ brct${g} $len,.Lloop
- lmg %r6,%r15,`$frame+48`($sp)
+ lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size sha1_block_data_order,.-sha1_block_data_order
.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm OPENSSL_s390xcap_P,8,8
+.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
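The -m31 adaptation above keys all stack arithmetic off $SIZE_T and appends the "g" suffix so that one source serves both 64-bit and 31-bit builds; per the November 2010 note, 64-bit instructions are usable from 31-bit code only when the kernel advertises "highgprs" in /proc/cpuinfo. A hedged sketch of one way a harness could test for that flag; this is illustrative, not OpenSSL's own detection code:

    # Returns 1 when the features line of /proc/cpuinfo lists "highgprs".
    sub have_highgprs {
        open my $fh, '<', '/proc/cpuinfo' or return 0;
        my $found = 0;
        while (my $line = <$fh>) {
            if ($line =~ /^features\b.*\bhighgprs\b/) { $found = 1; last; }
        }
        close $fh;
        return $found;
    }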
diff --git a/main/openssl/crypto/sha/asm/sha1-sparcv9a.pl b/main/openssl/crypto/sha/asm/sha1-sparcv9a.pl
index 85e8d680..e65291bb 100644
--- a/main/openssl/crypto/sha/asm/sha1-sparcv9a.pl
+++ b/main/openssl/crypto/sha/asm/sha1-sparcv9a.pl
@@ -549,7 +549,7 @@ ___
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
-my $ref,$opf;
+my ($ref,$opf);
my %visopf = ( "fmul8ulx16" => 0x037,
"faligndata" => 0x048,
"fpadd32" => 0x052,
diff --git a/main/openssl/crypto/sha/asm/sha1-x86_64.S b/main/openssl/crypto/sha/asm/sha1-x86_64.S
new file mode 100644
index 00000000..3922e203
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha1-x86_64.S
@@ -0,0 +1,2486 @@
+.text
+
+
+.globl sha1_block_data_order
+.type sha1_block_data_order,@function
+.align 16
+sha1_block_data_order:
+ movl OPENSSL_ia32cap_P+0(%rip),%r9d
+ movl OPENSSL_ia32cap_P+4(%rip),%r8d
+ testl $512,%r8d
+ jz .Lialu
+ jmp _ssse3_shortcut
+
+.align 16
+.Lialu:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ movq %rsp,%r11
+ movq %rdi,%r8
+ subq $72,%rsp
+ movq %rsi,%r9
+ andq $-64,%rsp
+ movq %rdx,%r10
+ movq %r11,64(%rsp)
+.Lprologue:
+
+ movl 0(%r8),%esi
+ movl 4(%r8),%edi
+ movl 8(%r8),%r11d
+ movl 12(%r8),%r12d
+ movl 16(%r8),%r13d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movl 0(%r9),%edx
+ bswapl %edx
+ movl %edx,0(%rsp)
+ movl %r11d,%eax
+ movl 4(%r9),%ebp
+ movl %esi,%ecx
+ xorl %r12d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r13,1),%r13d
+ andl %edi,%eax
+ movl %ebp,4(%rsp)
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl %edi,%eax
+ movl 8(%r9),%edx
+ movl %r13d,%ecx
+ xorl %r11d,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%r12,1),%r12d
+ andl %esi,%eax
+ movl %edx,8(%rsp)
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl %esi,%eax
+ movl 12(%r9),%ebp
+ movl %r12d,%ecx
+ xorl %edi,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r11,1),%r11d
+ andl %r13d,%eax
+ movl %ebp,12(%rsp)
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl %r13d,%eax
+ movl 16(%r9),%edx
+ movl %r11d,%ecx
+ xorl %esi,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%rdi,1),%edi
+ andl %r12d,%eax
+ movl %edx,16(%rsp)
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl %r12d,%eax
+ movl 20(%r9),%ebp
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%rsi,1),%esi
+ andl %r11d,%eax
+ movl %ebp,20(%rsp)
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl %r11d,%eax
+ movl 24(%r9),%edx
+ movl %esi,%ecx
+ xorl %r12d,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%r13,1),%r13d
+ andl %edi,%eax
+ movl %edx,24(%rsp)
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl %edi,%eax
+ movl 28(%r9),%ebp
+ movl %r13d,%ecx
+ xorl %r11d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r12,1),%r12d
+ andl %esi,%eax
+ movl %ebp,28(%rsp)
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl %esi,%eax
+ movl 32(%r9),%edx
+ movl %r12d,%ecx
+ xorl %edi,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%r11,1),%r11d
+ andl %r13d,%eax
+ movl %edx,32(%rsp)
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl %r13d,%eax
+ movl 36(%r9),%ebp
+ movl %r11d,%ecx
+ xorl %esi,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%rdi,1),%edi
+ andl %r12d,%eax
+ movl %ebp,36(%rsp)
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl %r12d,%eax
+ movl 40(%r9),%edx
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%rsi,1),%esi
+ andl %r11d,%eax
+ movl %edx,40(%rsp)
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl %r11d,%eax
+ movl 44(%r9),%ebp
+ movl %esi,%ecx
+ xorl %r12d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r13,1),%r13d
+ andl %edi,%eax
+ movl %ebp,44(%rsp)
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl %edi,%eax
+ movl 48(%r9),%edx
+ movl %r13d,%ecx
+ xorl %r11d,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%r12,1),%r12d
+ andl %esi,%eax
+ movl %edx,48(%rsp)
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl %esi,%eax
+ movl 52(%r9),%ebp
+ movl %r12d,%ecx
+ xorl %edi,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%r11,1),%r11d
+ andl %r13d,%eax
+ movl %ebp,52(%rsp)
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl %r13d,%eax
+ movl 56(%r9),%edx
+ movl %r11d,%ecx
+ xorl %esi,%eax
+ bswapl %edx
+ roll $5,%ecx
+ leal 1518500249(%rbp,%rdi,1),%edi
+ andl %r12d,%eax
+ movl %edx,56(%rsp)
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl %r12d,%eax
+ movl 60(%r9),%ebp
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ bswapl %ebp
+ roll $5,%ecx
+ leal 1518500249(%rdx,%rsi,1),%esi
+ andl %r11d,%eax
+ movl %ebp,60(%rsp)
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 0(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 8(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%edx
+ andl %edi,%eax
+ leal 1518500249(%rbp,%r13,1),%r13d
+ xorl 52(%rsp),%edx
+ xorl %r12d,%eax
+ roll $1,%edx
+ addl %ecx,%r13d
+ roll $30,%edi
+ movl %edx,0(%rsp)
+ addl %eax,%r13d
+ movl 4(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%ebp
+ andl %esi,%eax
+ leal 1518500249(%rdx,%r12,1),%r12d
+ xorl 56(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $1,%ebp
+ addl %ecx,%r12d
+ roll $30,%esi
+ movl %ebp,4(%rsp)
+ addl %eax,%r12d
+ movl 8(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ andl %r13d,%eax
+ leal 1518500249(%rbp,%r11,1),%r11d
+ xorl 60(%rsp),%edx
+ xorl %edi,%eax
+ roll $1,%edx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ movl %edx,8(%rsp)
+ addl %eax,%r11d
+ movl 12(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ xorl 0(%rsp),%ebp
+ xorl %esi,%eax
+ roll $1,%ebp
+ addl %ecx,%edi
+ roll $30,%r12d
+ movl %ebp,12(%rsp)
+ addl %eax,%edi
+ movl 16(%rsp),%edx
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%edx
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ xorl 4(%rsp),%edx
+ xorl %r13d,%eax
+ roll $1,%edx
+ addl %ecx,%esi
+ roll $30,%r11d
+ movl %edx,16(%rsp)
+ addl %eax,%esi
+ movl 20(%rsp),%ebp
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl 52(%rsp),%ebp
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 8(%rsp),%ebp
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ movl %ebp,20(%rsp)
+ movl 24(%rsp),%edx
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 32(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl 56(%rsp),%edx
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 12(%rsp),%edx
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ movl %edx,24(%rsp)
+ movl 28(%rsp),%ebp
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 36(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl 60(%rsp),%ebp
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 16(%rsp),%ebp
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ movl %ebp,28(%rsp)
+ movl 32(%rsp),%edx
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl 0(%rsp),%edx
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 20(%rsp),%edx
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ movl %edx,32(%rsp)
+ movl 36(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl 4(%rsp),%ebp
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 24(%rsp),%ebp
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ movl %ebp,36(%rsp)
+ movl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 48(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl 8(%rsp),%edx
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 28(%rsp),%edx
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ movl %edx,40(%rsp)
+ movl 44(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 52(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl 12(%rsp),%ebp
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 32(%rsp),%ebp
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ movl %ebp,44(%rsp)
+ movl 48(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 56(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl 16(%rsp),%edx
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 36(%rsp),%edx
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ movl %edx,48(%rsp)
+ movl 52(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl 20(%rsp),%ebp
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 40(%rsp),%ebp
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %ebp,52(%rsp)
+ movl 56(%rsp),%edx
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl 24(%rsp),%edx
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 44(%rsp),%edx
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ movl %edx,56(%rsp)
+ movl 60(%rsp),%ebp
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl 28(%rsp),%ebp
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 48(%rsp),%ebp
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ movl %ebp,60(%rsp)
+ movl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl 32(%rsp),%edx
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 52(%rsp),%edx
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ movl %edx,0(%rsp)
+ movl 4(%rsp),%ebp
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl 36(%rsp),%ebp
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 56(%rsp),%ebp
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ movl %ebp,4(%rsp)
+ movl 8(%rsp),%edx
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 60(%rsp),%edx
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ movl %edx,8(%rsp)
+ movl 12(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 0(%rsp),%ebp
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ movl %ebp,12(%rsp)
+ movl 16(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl 48(%rsp),%edx
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 4(%rsp),%edx
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ movl %edx,16(%rsp)
+ movl 20(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl 52(%rsp),%ebp
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 8(%rsp),%ebp
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ movl %ebp,20(%rsp)
+ movl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl 56(%rsp),%edx
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 12(%rsp),%edx
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ movl %edx,24(%rsp)
+ movl 28(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl 60(%rsp),%ebp
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 16(%rsp),%ebp
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %ebp,28(%rsp)
+ movl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 20(%rsp),%edx
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ movl %edx,32(%rsp)
+ movl 36(%rsp),%ebp
+ movl %r11d,%eax
+ movl %r11d,%ebx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%ebx
+ leal -1894007588(%rdx,%r13,1),%r13d
+ roll $5,%ecx
+ xorl 24(%rsp),%ebp
+ addl %eax,%r13d
+ andl %edi,%ebx
+ roll $1,%ebp
+ addl %ebx,%r13d
+ roll $30,%edi
+ movl %ebp,36(%rsp)
+ addl %ecx,%r13d
+ movl 40(%rsp),%edx
+ movl %edi,%eax
+ movl %edi,%ebx
+ xorl 48(%rsp),%edx
+ andl %r11d,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ xorl %r11d,%ebx
+ leal -1894007588(%rbp,%r12,1),%r12d
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ addl %eax,%r12d
+ andl %esi,%ebx
+ roll $1,%edx
+ addl %ebx,%r12d
+ roll $30,%esi
+ movl %edx,40(%rsp)
+ addl %ecx,%r12d
+ movl 44(%rsp),%ebp
+ movl %esi,%eax
+ movl %esi,%ebx
+ xorl 52(%rsp),%ebp
+ andl %edi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %edi,%ebx
+ leal -1894007588(%rdx,%r11,1),%r11d
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ addl %eax,%r11d
+ andl %r13d,%ebx
+ roll $1,%ebp
+ addl %ebx,%r11d
+ roll $30,%r13d
+ movl %ebp,44(%rsp)
+ addl %ecx,%r11d
+ movl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 56(%rsp),%edx
+ andl %esi,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %esi,%ebx
+ leal -1894007588(%rbp,%rdi,1),%edi
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ addl %eax,%edi
+ andl %r12d,%ebx
+ roll $1,%edx
+ addl %ebx,%edi
+ roll $30,%r12d
+ movl %edx,48(%rsp)
+ addl %ecx,%edi
+ movl 52(%rsp),%ebp
+ movl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 60(%rsp),%ebp
+ andl %r13d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r13d,%ebx
+ leal -1894007588(%rdx,%rsi,1),%esi
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ addl %eax,%esi
+ andl %r11d,%ebx
+ roll $1,%ebp
+ addl %ebx,%esi
+ roll $30,%r11d
+ movl %ebp,52(%rsp)
+ addl %ecx,%esi
+ movl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r11d,%ebx
+ xorl 0(%rsp),%edx
+ andl %r12d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r12d,%ebx
+ leal -1894007588(%rbp,%r13,1),%r13d
+ roll $5,%ecx
+ xorl 44(%rsp),%edx
+ addl %eax,%r13d
+ andl %edi,%ebx
+ roll $1,%edx
+ addl %ebx,%r13d
+ roll $30,%edi
+ movl %edx,56(%rsp)
+ addl %ecx,%r13d
+ movl 60(%rsp),%ebp
+ movl %edi,%eax
+ movl %edi,%ebx
+ xorl 4(%rsp),%ebp
+ andl %r11d,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r11d,%ebx
+ leal -1894007588(%rdx,%r12,1),%r12d
+ roll $5,%ecx
+ xorl 48(%rsp),%ebp
+ addl %eax,%r12d
+ andl %esi,%ebx
+ roll $1,%ebp
+ addl %ebx,%r12d
+ roll $30,%esi
+ movl %ebp,60(%rsp)
+ addl %ecx,%r12d
+ movl 0(%rsp),%edx
+ movl %esi,%eax
+ movl %esi,%ebx
+ xorl 8(%rsp),%edx
+ andl %edi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ xorl %edi,%ebx
+ leal -1894007588(%rbp,%r11,1),%r11d
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ addl %eax,%r11d
+ andl %r13d,%ebx
+ roll $1,%edx
+ addl %ebx,%r11d
+ roll $30,%r13d
+ movl %edx,0(%rsp)
+ addl %ecx,%r11d
+ movl 4(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 12(%rsp),%ebp
+ andl %esi,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ xorl %esi,%ebx
+ leal -1894007588(%rdx,%rdi,1),%edi
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ addl %eax,%edi
+ andl %r12d,%ebx
+ roll $1,%ebp
+ addl %ebx,%edi
+ roll $30,%r12d
+ movl %ebp,4(%rsp)
+ addl %ecx,%edi
+ movl 8(%rsp),%edx
+ movl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 16(%rsp),%edx
+ andl %r13d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r13d,%ebx
+ leal -1894007588(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ addl %eax,%esi
+ andl %r11d,%ebx
+ roll $1,%edx
+ addl %ebx,%esi
+ roll $30,%r11d
+ movl %edx,8(%rsp)
+ addl %ecx,%esi
+ movl 12(%rsp),%ebp
+ movl %r11d,%eax
+ movl %r11d,%ebx
+ xorl 20(%rsp),%ebp
+ andl %r12d,%eax
+ movl %esi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r12d,%ebx
+ leal -1894007588(%rdx,%r13,1),%r13d
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ addl %eax,%r13d
+ andl %edi,%ebx
+ roll $1,%ebp
+ addl %ebx,%r13d
+ roll $30,%edi
+ movl %ebp,12(%rsp)
+ addl %ecx,%r13d
+ movl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %edi,%ebx
+ xorl 24(%rsp),%edx
+ andl %r11d,%eax
+ movl %r13d,%ecx
+ xorl 48(%rsp),%edx
+ xorl %r11d,%ebx
+ leal -1894007588(%rbp,%r12,1),%r12d
+ roll $5,%ecx
+ xorl 4(%rsp),%edx
+ addl %eax,%r12d
+ andl %esi,%ebx
+ roll $1,%edx
+ addl %ebx,%r12d
+ roll $30,%esi
+ movl %edx,16(%rsp)
+ addl %ecx,%r12d
+ movl 20(%rsp),%ebp
+ movl %esi,%eax
+ movl %esi,%ebx
+ xorl 28(%rsp),%ebp
+ andl %edi,%eax
+ movl %r12d,%ecx
+ xorl 52(%rsp),%ebp
+ xorl %edi,%ebx
+ leal -1894007588(%rdx,%r11,1),%r11d
+ roll $5,%ecx
+ xorl 8(%rsp),%ebp
+ addl %eax,%r11d
+ andl %r13d,%ebx
+ roll $1,%ebp
+ addl %ebx,%r11d
+ roll $30,%r13d
+ movl %ebp,20(%rsp)
+ addl %ecx,%r11d
+ movl 24(%rsp),%edx
+ movl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 32(%rsp),%edx
+ andl %esi,%eax
+ movl %r11d,%ecx
+ xorl 56(%rsp),%edx
+ xorl %esi,%ebx
+ leal -1894007588(%rbp,%rdi,1),%edi
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ addl %eax,%edi
+ andl %r12d,%ebx
+ roll $1,%edx
+ addl %ebx,%edi
+ roll $30,%r12d
+ movl %edx,24(%rsp)
+ addl %ecx,%edi
+ movl 28(%rsp),%ebp
+ movl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 36(%rsp),%ebp
+ andl %r13d,%eax
+ movl %edi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r13d,%ebx
+ leal -1894007588(%rdx,%rsi,1),%esi
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ addl %eax,%esi
+ andl %r11d,%ebx
+ roll $1,%ebp
+ addl %ebx,%esi
+ roll $30,%r11d
+ movl %ebp,28(%rsp)
+ addl %ecx,%esi
+ movl 32(%rsp),%edx
+ movl %r11d,%eax
+ movl %r11d,%ebx
+ xorl 40(%rsp),%edx
+ andl %r12d,%eax
+ movl %esi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r12d,%ebx
+ leal -1894007588(%rbp,%r13,1),%r13d
+ roll $5,%ecx
+ xorl 20(%rsp),%edx
+ addl %eax,%r13d
+ andl %edi,%ebx
+ roll $1,%edx
+ addl %ebx,%r13d
+ roll $30,%edi
+ movl %edx,32(%rsp)
+ addl %ecx,%r13d
+ movl 36(%rsp),%ebp
+ movl %edi,%eax
+ movl %edi,%ebx
+ xorl 44(%rsp),%ebp
+ andl %r11d,%eax
+ movl %r13d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r11d,%ebx
+ leal -1894007588(%rdx,%r12,1),%r12d
+ roll $5,%ecx
+ xorl 24(%rsp),%ebp
+ addl %eax,%r12d
+ andl %esi,%ebx
+ roll $1,%ebp
+ addl %ebx,%r12d
+ roll $30,%esi
+ movl %ebp,36(%rsp)
+ addl %ecx,%r12d
+ movl 40(%rsp),%edx
+ movl %esi,%eax
+ movl %esi,%ebx
+ xorl 48(%rsp),%edx
+ andl %edi,%eax
+ movl %r12d,%ecx
+ xorl 8(%rsp),%edx
+ xorl %edi,%ebx
+ leal -1894007588(%rbp,%r11,1),%r11d
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ addl %eax,%r11d
+ andl %r13d,%ebx
+ roll $1,%edx
+ addl %ebx,%r11d
+ roll $30,%r13d
+ movl %edx,40(%rsp)
+ addl %ecx,%r11d
+ movl 44(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r13d,%ebx
+ xorl 52(%rsp),%ebp
+ andl %esi,%eax
+ movl %r11d,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %esi,%ebx
+ leal -1894007588(%rdx,%rdi,1),%edi
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ addl %eax,%edi
+ andl %r12d,%ebx
+ roll $1,%ebp
+ addl %ebx,%edi
+ roll $30,%r12d
+ movl %ebp,44(%rsp)
+ addl %ecx,%edi
+ movl 48(%rsp),%edx
+ movl %r12d,%eax
+ movl %r12d,%ebx
+ xorl 56(%rsp),%edx
+ andl %r13d,%eax
+ movl %edi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r13d,%ebx
+ leal -1894007588(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ addl %eax,%esi
+ andl %r11d,%ebx
+ roll $1,%edx
+ addl %ebx,%esi
+ roll $30,%r11d
+ movl %edx,48(%rsp)
+ addl %ecx,%esi
+ movl 52(%rsp),%ebp
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl 20(%rsp),%ebp
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 40(%rsp),%ebp
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ movl %ebp,52(%rsp)
+ movl 56(%rsp),%edx
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl 24(%rsp),%edx
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 44(%rsp),%edx
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ movl %edx,56(%rsp)
+ movl 60(%rsp),%ebp
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl 28(%rsp),%ebp
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 48(%rsp),%ebp
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ movl %ebp,60(%rsp)
+ movl 0(%rsp),%edx
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 8(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl 32(%rsp),%edx
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 52(%rsp),%edx
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ movl %edx,0(%rsp)
+ movl 4(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 12(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl 36(%rsp),%ebp
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 56(%rsp),%ebp
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ movl %ebp,4(%rsp)
+ movl 8(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl 40(%rsp),%edx
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 60(%rsp),%edx
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ movl %edx,8(%rsp)
+ movl 12(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl 44(%rsp),%ebp
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 0(%rsp),%ebp
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ movl %ebp,12(%rsp)
+ movl 16(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl 48(%rsp),%edx
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 4(%rsp),%edx
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ movl %edx,16(%rsp)
+ movl 20(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl 52(%rsp),%ebp
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 8(%rsp),%ebp
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %ebp,20(%rsp)
+ movl 24(%rsp),%edx
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 32(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl 56(%rsp),%edx
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 12(%rsp),%edx
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ movl %edx,24(%rsp)
+ movl 28(%rsp),%ebp
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 36(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 16(%rsp),%ebp
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ movl %ebp,28(%rsp)
+ movl 32(%rsp),%edx
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl 0(%rsp),%edx
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 20(%rsp),%edx
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ movl %edx,32(%rsp)
+ movl 36(%rsp),%ebp
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl 4(%rsp),%ebp
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 24(%rsp),%ebp
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ movl %ebp,36(%rsp)
+ movl 40(%rsp),%edx
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 48(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl 8(%rsp),%edx
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 28(%rsp),%edx
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ movl %edx,40(%rsp)
+ movl 44(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl 52(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl 12(%rsp),%ebp
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ xorl 32(%rsp),%ebp
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ movl %ebp,44(%rsp)
+ movl 48(%rsp),%edx
+ movl %r11d,%eax
+ movl %esi,%ecx
+ xorl 56(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ xorl 36(%rsp),%edx
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ movl %edx,48(%rsp)
+ movl 52(%rsp),%ebp
+ movl %edi,%eax
+ movl %r13d,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ xorl 40(%rsp),%ebp
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ movl 56(%rsp),%edx
+ movl %esi,%eax
+ movl %r12d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl 24(%rsp),%edx
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ xorl 44(%rsp),%edx
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ movl 60(%rsp),%ebp
+ movl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl 28(%rsp),%ebp
+ xorl %esi,%eax
+ addl %ecx,%edi
+ xorl 48(%rsp),%ebp
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %r12d,%eax
+ movl %edi,%ecx
+ xorl %r11d,%eax
+ leal -899497514(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ addl 0(%r8),%esi
+ addl 4(%r8),%edi
+ addl 8(%r8),%r11d
+ addl 12(%r8),%r12d
+ addl 16(%r8),%r13d
+ movl %esi,0(%r8)
+ movl %edi,4(%r8)
+ movl %r11d,8(%r8)
+ movl %r12d,12(%r8)
+ movl %r13d,16(%r8)
+
+ subq $1,%r10
+ leaq 64(%r9),%r9
+ jnz .Lloop
+
+ movq 64(%rsp),%rsi
+ movq (%rsi),%r13
+ movq 8(%rsi),%r12
+ movq 16(%rsi),%rbp
+ movq 24(%rsi),%rbx
+ leaq 32(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size sha1_block_data_order,.-sha1_block_data_order
+.type sha1_block_data_order_ssse3,@function
+.align 16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ leaq -64(%rsp),%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX(%rip),%r11
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm0
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ movdqa %xmm1,%xmm4
+ addl 0(%rsp),%ebp
+ xorl %edx,%ecx
+ movdqa %xmm3,%xmm8
+.byte 102,15,58,15,224,8
+ movl %eax,%edi
+ roll $5,%eax
+ paddd %xmm3,%xmm9
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrldq $4,%xmm8
+ xorl %edx,%esi
+ addl %eax,%ebp
+ pxor %xmm0,%xmm4
+ rorl $2,%ebx
+ addl %esi,%ebp
+ pxor %xmm2,%xmm8
+ addl 4(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pxor %xmm8,%xmm4
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm9,48(%rsp)
+ xorl %ecx,%edi
+ addl %ebp,%edx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm4,%xmm8
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 8(%rsp),%ecx
+ xorl %ebx,%eax
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ roll $5,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrld $31,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ movdqa %xmm10,%xmm9
+ rorl $7,%ebp
+ addl %esi,%ecx
+ psrld $30,%xmm10
+ por %xmm8,%xmm4
+ addl 12(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa 0(%r11),%xmm10
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm4
+ rorl $7,%edx
+ addl %edi,%ebx
+ movdqa %xmm2,%xmm5
+ addl 16(%rsp),%eax
+ xorl %ebp,%edx
+ movdqa %xmm4,%xmm9
+.byte 102,15,58,15,233,8
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm4,%xmm10
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrldq $4,%xmm9
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ pxor %xmm1,%xmm5
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm3,%xmm9
+ addl 20(%rsp),%ebp
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pxor %xmm9,%xmm5
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa %xmm10,0(%rsp)
+ xorl %edx,%edi
+ addl %eax,%ebp
+ movdqa %xmm5,%xmm8
+ movdqa %xmm5,%xmm9
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 24(%rsp),%edx
+ xorl %ecx,%ebx
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ roll $5,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ psrld $31,%xmm9
+ xorl %ecx,%esi
+ addl %ebp,%edx
+ movdqa %xmm8,%xmm10
+ rorl $7,%eax
+ addl %esi,%edx
+ psrld $30,%xmm8
+ por %xmm9,%xmm5
+ addl 28(%rsp),%ecx
+ xorl %ebx,%eax
+ movl %edx,%esi
+ roll $5,%edx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ andl %eax,%edi
+ xorl %ebx,%eax
+ movdqa 16(%r11),%xmm8
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ pxor %xmm10,%xmm5
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movdqa %xmm3,%xmm6
+ addl 32(%rsp),%ebx
+ xorl %eax,%ebp
+ movdqa %xmm5,%xmm10
+.byte 102,15,58,15,242,8
+ movl %ecx,%edi
+ roll $5,%ecx
+ paddd %xmm5,%xmm8
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ psrldq $4,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ pxor %xmm2,%xmm6
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm4,%xmm10
+ addl 36(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ pxor %xmm10,%xmm6
+ andl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm8,16(%rsp)
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ movdqa %xmm6,%xmm9
+ movdqa %xmm6,%xmm10
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 40(%rsp),%ebp
+ xorl %edx,%ecx
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ roll $5,%eax
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ psrld $31,%xmm10
+ xorl %edx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ psrld $30,%xmm9
+ por %xmm10,%xmm6
+ addl 44(%rsp),%edx
+ xorl %ecx,%ebx
+ movl %ebp,%esi
+ roll $5,%ebp
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa 16(%r11),%xmm9
+ xorl %ecx,%edi
+ addl %ebp,%edx
+ pxor %xmm8,%xmm6
+ rorl $7,%eax
+ addl %edi,%edx
+ movdqa %xmm4,%xmm7
+ addl 48(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm8
+.byte 102,15,58,15,251,8
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm6,%xmm9
+ andl %eax,%esi
+ xorl %ebx,%eax
+ psrldq $4,%xmm8
+ xorl %ebx,%esi
+ addl %edx,%ecx
+ pxor %xmm3,%xmm7
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm5,%xmm8
+ addl 52(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ pxor %xmm8,%xmm7
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm9,32(%rsp)
+ xorl %eax,%edi
+ addl %ecx,%ebx
+ movdqa %xmm7,%xmm10
+ movdqa %xmm7,%xmm8
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 56(%rsp),%eax
+ xorl %ebp,%edx
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ roll $5,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ psrld $31,%xmm8
+ xorl %ebp,%esi
+ addl %ebx,%eax
+ movdqa %xmm10,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ psrld $30,%xmm10
+ por %xmm8,%xmm7
+ addl 60(%rsp),%ebp
+ xorl %edx,%ecx
+ movl %eax,%esi
+ roll $5,%eax
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa 16(%r11),%xmm10
+ xorl %edx,%edi
+ addl %eax,%ebp
+ pxor %xmm9,%xmm7
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movdqa %xmm7,%xmm9
+ addl 0(%rsp),%edx
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,206,8
+ xorl %ecx,%ebx
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm7,%xmm10
+ xorl %ecx,%esi
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 4(%rsp),%ecx
+ xorl %ebx,%eax
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,48(%rsp)
+ movl %edx,%esi
+ roll $5,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ pslld $2,%xmm0
+ xorl %ebx,%edi
+ addl %edx,%ecx
+ psrld $30,%xmm9
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 8(%rsp),%ebx
+ xorl %eax,%ebp
+ movl %ecx,%edi
+ roll $5,%ecx
+ por %xmm9,%xmm0
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ movdqa %xmm0,%xmm10
+ xorl %eax,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edx
+ movl %ebx,%esi
+ roll $5,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ xorl %ebp,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 16(%rsp),%ebp
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,215,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm8,%xmm9
+ paddd %xmm0,%xmm8
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ movdqa %xmm8,0(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm10,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %eax,%edi
+ movdqa %xmm1,%xmm8
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 32(%rsp),%eax
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,192,8
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ xorl %edx,%esi
+ addl %ebx,%eax
+ movdqa 32(%r11),%xmm10
+ paddd %xmm1,%xmm9
+ rorl $7,%ecx
+ addl %esi,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ movdqa %xmm9,16(%rsp)
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ pslld $2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ecx,%esi
+ psrld $30,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ por %xmm8,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %ebx,%edi
+ movdqa %xmm2,%xmm9
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 48(%rsp),%ebx
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,201,8
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm2,%xmm10
+ rorl $7,%edx
+ addl %esi,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ movdqa %xmm10,32(%rsp)
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ pslld $2,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %edx,%esi
+ psrld $30,%xmm9
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ por %xmm9,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ecx,%edi
+ movdqa %xmm3,%xmm10
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 0(%rsp),%ecx
+ pxor %xmm0,%xmm4
+.byte 102,68,15,58,15,210,8
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ xorl %eax,%esi
+ addl %edx,%ecx
+ movdqa %xmm8,%xmm9
+ paddd %xmm3,%xmm8
+ rorl $7,%ebp
+ addl %esi,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ movdqa %xmm8,48(%rsp)
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ pslld $2,%xmm4
+ addl 8(%rsp),%eax
+ xorl %ebp,%esi
+ psrld $30,%xmm10
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ por %xmm10,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %edx,%edi
+ movdqa %xmm4,%xmm8
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 16(%rsp),%edx
+ pxor %xmm1,%xmm5
+.byte 102,68,15,58,15,195,8
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm4,%xmm9
+ rorl $7,%eax
+ addl %esi,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ movdqa %xmm9,0(%rsp)
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ pslld $2,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %eax,%esi
+ psrld $30,%xmm8
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ por %xmm8,%xmm5
+ addl 28(%rsp),%eax
+ xorl %ebp,%edi
+ movdqa %xmm5,%xmm9
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ecx,%edi
+ pxor %xmm2,%xmm6
+.byte 102,68,15,58,15,204,8
+ xorl %edx,%ecx
+ addl 32(%rsp),%ebp
+ andl %edx,%edi
+ pxor %xmm7,%xmm6
+ andl %ecx,%esi
+ rorl $7,%ebx
+ movdqa %xmm10,%xmm8
+ paddd %xmm5,%xmm10
+ addl %edi,%ebp
+ movl %eax,%edi
+ pxor %xmm9,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movdqa %xmm6,%xmm9
+ movdqa %xmm10,16(%rsp)
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 36(%rsp),%edx
+ andl %ecx,%esi
+ pslld $2,%xmm6
+ andl %ebx,%edi
+ rorl $7,%eax
+ psrld $30,%xmm9
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ por %xmm9,%xmm6
+ movl %eax,%edi
+ xorl %ebx,%eax
+ movdqa %xmm6,%xmm10
+ addl 40(%rsp),%ecx
+ andl %ebx,%edi
+ andl %eax,%esi
+ rorl $7,%ebp
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 44(%rsp),%ebx
+ andl %eax,%esi
+ andl %ebp,%edi
+ rorl $7,%edx
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%edi
+ pxor %xmm3,%xmm7
+.byte 102,68,15,58,15,213,8
+ xorl %ebp,%edx
+ addl 48(%rsp),%eax
+ andl %ebp,%edi
+ pxor %xmm0,%xmm7
+ andl %edx,%esi
+ rorl $7,%ecx
+ movdqa 48(%r11),%xmm9
+ paddd %xmm6,%xmm8
+ addl %edi,%eax
+ movl %ebx,%edi
+ pxor %xmm10,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movdqa %xmm7,%xmm10
+ movdqa %xmm8,32(%rsp)
+ movl %ecx,%esi
+ xorl %edx,%ecx
+ addl 52(%rsp),%ebp
+ andl %edx,%esi
+ pslld $2,%xmm7
+ andl %ecx,%edi
+ rorl $7,%ebx
+ psrld $30,%xmm10
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ por %xmm10,%xmm7
+ movl %ebx,%edi
+ xorl %ecx,%ebx
+ movdqa %xmm7,%xmm8
+ addl 56(%rsp),%edx
+ andl %ecx,%edi
+ andl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 60(%rsp),%ecx
+ andl %ebx,%esi
+ andl %eax,%edi
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movl %ebp,%edi
+ pxor %xmm4,%xmm0
+.byte 102,68,15,58,15,198,8
+ xorl %eax,%ebp
+ addl 0(%rsp),%ebx
+ andl %eax,%edi
+ pxor %xmm1,%xmm0
+ andl %ebp,%esi
+ rorl $7,%edx
+ movdqa %xmm9,%xmm10
+ paddd %xmm7,%xmm9
+ addl %edi,%ebx
+ movl %ecx,%edi
+ pxor %xmm8,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movdqa %xmm0,%xmm8
+ movdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 4(%rsp),%eax
+ andl %ebp,%esi
+ pslld $2,%xmm0
+ andl %edx,%edi
+ rorl $7,%ecx
+ psrld $30,%xmm8
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ por %xmm8,%xmm0
+ movl %ecx,%edi
+ xorl %edx,%ecx
+ movdqa %xmm0,%xmm9
+ addl 8(%rsp),%ebp
+ andl %edx,%edi
+ andl %ecx,%esi
+ rorl $7,%ebx
+ addl %edi,%ebp
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%esi
+ xorl %ecx,%ebx
+ addl 12(%rsp),%edx
+ andl %ecx,%esi
+ andl %ebx,%edi
+ rorl $7,%eax
+ addl %esi,%edx
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movl %eax,%edi
+ pxor %xmm5,%xmm1
+.byte 102,68,15,58,15,207,8
+ xorl %ebx,%eax
+ addl 16(%rsp),%ecx
+ andl %ebx,%edi
+ pxor %xmm2,%xmm1
+ andl %eax,%esi
+ rorl $7,%ebp
+ movdqa %xmm10,%xmm8
+ paddd %xmm0,%xmm10
+ addl %edi,%ecx
+ movl %edx,%edi
+ pxor %xmm9,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ movdqa %xmm1,%xmm9
+ movdqa %xmm10,0(%rsp)
+ movl %ebp,%esi
+ xorl %eax,%ebp
+ addl 20(%rsp),%ebx
+ andl %eax,%esi
+ pslld $2,%xmm1
+ andl %ebp,%edi
+ rorl $7,%edx
+ psrld $30,%xmm9
+ addl %esi,%ebx
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ por %xmm9,%xmm1
+ movl %edx,%edi
+ xorl %ebp,%edx
+ movdqa %xmm1,%xmm10
+ addl 24(%rsp),%eax
+ andl %ebp,%edi
+ andl %edx,%esi
+ rorl $7,%ecx
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ movl %ecx,%esi
+ xorl %edx,%ecx
+ addl 28(%rsp),%ebp
+ andl %edx,%esi
+ andl %ecx,%edi
+ rorl $7,%ebx
+ addl %esi,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %edx,%ecx
+ addl %eax,%ebp
+ movl %ebx,%edi
+ pxor %xmm6,%xmm2
+.byte 102,68,15,58,15,208,8
+ xorl %ecx,%ebx
+ addl 32(%rsp),%edx
+ andl %ecx,%edi
+ pxor %xmm3,%xmm2
+ andl %ebx,%esi
+ rorl $7,%eax
+ movdqa %xmm8,%xmm9
+ paddd %xmm1,%xmm8
+ addl %edi,%edx
+ movl %ebp,%edi
+ pxor %xmm10,%xmm2
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ecx,%ebx
+ addl %ebp,%edx
+ movdqa %xmm2,%xmm10
+ movdqa %xmm8,16(%rsp)
+ movl %eax,%esi
+ xorl %ebx,%eax
+ addl 36(%rsp),%ecx
+ andl %ebx,%esi
+ pslld $2,%xmm2
+ andl %eax,%edi
+ rorl $7,%ebp
+ psrld $30,%xmm10
+ addl %esi,%ecx
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebx,%eax
+ addl %edx,%ecx
+ por %xmm10,%xmm2
+ movl %ebp,%edi
+ xorl %eax,%ebp
+ movdqa %xmm2,%xmm8
+ addl 40(%rsp),%ebx
+ andl %eax,%edi
+ andl %ebp,%esi
+ rorl $7,%edx
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %eax,%ebp
+ addl %ecx,%ebx
+ movl %edx,%esi
+ xorl %ebp,%edx
+ addl 44(%rsp),%eax
+ andl %ebp,%esi
+ andl %edx,%edi
+ rorl $7,%ecx
+ addl %esi,%eax
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ebp,%edx
+ addl %ebx,%eax
+ addl 48(%rsp),%ebp
+ pxor %xmm7,%xmm3
+.byte 102,68,15,58,15,193,8
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ movdqa %xmm9,%xmm10
+ paddd %xmm2,%xmm9
+ rorl $7,%ebx
+ addl %esi,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ movdqa %xmm9,32(%rsp)
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %ebx,%esi
+ psrld $30,%xmm8
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ por %xmm8,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 0(%rsp),%eax
+ paddd %xmm3,%xmm10
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ movdqa %xmm10,48(%rsp)
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 4(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 8(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 12(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ cmpq %r10,%r9
+ je .Ldone_ssse3
+ movdqa 64(%r11),%xmm6
+ movdqa 0(%r11),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+.byte 102,15,56,0,206
+ movl %ecx,%edi
+ roll $5,%ecx
+ paddd %xmm9,%xmm0
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ movdqa %xmm0,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ psubd %xmm9,%xmm0
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+.byte 102,15,56,0,214
+ movl %edx,%edi
+ roll $5,%edx
+ paddd %xmm9,%xmm1
+ xorl %eax,%esi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ movdqa %xmm1,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ psubd %xmm9,%xmm1
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+.byte 102,15,56,0,222
+ movl %ebp,%edi
+ roll $5,%ebp
+ paddd %xmm9,%xmm2
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ movdqa %xmm2,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ psubd %xmm9,%xmm2
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 20(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 24(%rsp),%ebp
+ xorl %edx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ xorl %ecx,%esi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %esi,%ebp
+ addl 28(%rsp),%edx
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ebx,%edi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%rsp),%ecx
+ xorl %ebx,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ xorl %eax,%esi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %esi,%ecx
+ addl 36(%rsp),%ebx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ xorl %ebp,%edi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %edi,%ebx
+ addl 40(%rsp),%eax
+ xorl %ebp,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ xorl %edx,%esi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %esi,%eax
+ addl 44(%rsp),%ebp
+ xorl %edx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ xorl %ecx,%edi
+ addl %eax,%ebp
+ rorl $7,%ebx
+ addl %edi,%ebp
+ addl 48(%rsp),%edx
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%esi
+ addl %ebp,%edx
+ rorl $7,%eax
+ addl %esi,%edx
+ addl 52(%rsp),%ecx
+ xorl %ebx,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%ebp
+ addl %edi,%ecx
+ addl 56(%rsp),%ebx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ xorl %ebp,%esi
+ addl %ecx,%ebx
+ rorl $7,%edx
+ addl %esi,%ebx
+ addl 60(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ xorl %edx,%edi
+ addl %ebx,%eax
+ rorl $7,%ecx
+ addl %edi,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ leaq 64(%rsp),%rsi
+ movq 0(%rsi),%r12
+ movq 8(%rsi),%rbp
+ movq 16(%rsi),%rbx
+ leaq 24(%rsi),%rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
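The prologue of this new sha1-x86_64.S dispatches at run time on OPENSSL_ia32cap_P: "testl $512,%r8d" probes bit 9 of the second capability word, the CPUID SSSE3 flag, before falling back to the integer-only .Lialu path, and the generating script (next file) additionally gates an AVX path on bit 28 of that word plus the "Intel CPU" bit 30 of the first word. Spelling the masks out, with values taken from the test/and/cmp constants in this patch and descriptive names that are mine, not OpenSSL's:

    my $SSSE3 = 1 << 9;             # 512, the "testl" immediate above
    my $AVX   = 1 << 28;            # bit masked out of capability word 1
    my $INTEL = 1 << 30;            # bit masked out of capability word 0
    printf "SSSE3 mask %d, AVX gate 0x%x\n", $SSSE3, $AVX | $INTEL;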
diff --git a/main/openssl/crypto/sha/asm/sha1-x86_64.pl b/main/openssl/crypto/sha/asm/sha1-x86_64.pl
index 4edc5ea9..f15c7ec3 100755
--- a/main/openssl/crypto/sha/asm/sha1-x86_64.pl
+++ b/main/openssl/crypto/sha/asm/sha1-x86_64.pl
@@ -16,7 +16,7 @@
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
-# implementation:-) However! While 64-bit code does performs better
+# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
@@ -29,6 +29,38 @@
# Xeon P4 +65% +0% 9.9
# Core2 +60% +10% 7.0
+# August 2009.
+#
+# The code was revised to minimize code size and to maximize
+# "distance" between instructions producing input to 'lea'
+# instruction and the 'lea' instruction itself, which is essential
+# for Intel Atom core.
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
+# for background and implementation details. The only difference from
+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
+# to free temporary registers.
+
+# April 2011.
+#
+# Add AVX code path. See sha1-586.pl for further information.
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+# x86_64 SSSE3 AVX
+# P4 9.8 -
+# Opteron 6.6 -
+# Core2 6.7 6.1/+10% -
+# Atom 11.0 9.7/+13% -
+# Westmere 7.1 5.6/+27% -
+# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -40,7 +72,18 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+ $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+ $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+ $1>=10);
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$ctx="%rdi"; # 1st arg
$inp="%rsi"; # 2nd arg
@@ -51,196 +94,994 @@ $ctx="%r8";
$inp="%r9";
$num="%r10";
-$xi="%eax";
-$t0="%ebx";
-$t1="%ecx";
-$A="%edx";
-$B="%esi";
-$C="%edi";
-$D="%ebp";
-$E="%r11d";
-$T="%r12d";
-
-@V=($A,$B,$C,$D,$E,$T);
+$t0="%eax";
+$t1="%ebx";
+$t2="%ecx";
+@xi=("%edx","%ebp");
+$A="%esi";
+$B="%edi";
+$C="%r11d";
+$D="%r12d";
+$E="%r13d";
-sub PROLOGUE {
-my $func=shift;
-$code.=<<___;
-.globl $func
-.type $func,\@function,3
-.align 16
-$func:
- push %rbx
- push %rbp
- push %r12
- mov %rsp,%r11
- mov %rdi,$ctx # reassigned argument
- sub \$`8+16*4`,%rsp
- mov %rsi,$inp # reassigned argument
- and \$-64,%rsp
- mov %rdx,$num # reassigned argument
- mov %r11,`16*4`(%rsp)
-.Lprologue:
-
- mov 0($ctx),$A
- mov 4($ctx),$B
- mov 8($ctx),$C
- mov 12($ctx),$D
- mov 16($ctx),$E
-___
-}
-
-sub EPILOGUE {
-my $func=shift;
-$code.=<<___;
- mov `16*4`(%rsp),%rsi
- mov (%rsi),%r12
- mov 8(%rsi),%rbp
- mov 16(%rsi),%rbx
- lea 24(%rsi),%rsp
-.Lepilogue:
- ret
-.size $func,.-$func
-___
-}
+@V=($A,$B,$C,$D,$E);
sub BODY_00_19 {
-my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
- mov `4*$i`($inp),$xi
- `"bswap $xi" if(!defined($host))`
- mov $xi,`4*$i`(%rsp)
+ mov `4*$i`($inp),$xi[0]
+ bswap $xi[0]
+ mov $xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
- lea 0x5a827999($xi,$e),$f
mov $c,$t0
- mov `4*$j`($inp),$xi
- mov $a,$e
+ mov `4*$j`($inp),$xi[1]
+ mov $a,$t2
xor $d,$t0
- `"bswap $xi" if(!defined($host))`
- rol \$5,$e
+ bswap $xi[1]
+ rol \$5,$t2
+ lea 0x5a827999($xi[0],$e),$e
and $b,$t0
- mov $xi,`4*$j`(%rsp)
- add $e,$f
+ mov $xi[1],`4*$j`(%rsp)
+ add $t2,$e
xor $d,$t0
rol \$30,$b
- add $t0,$f
+ add $t0,$e
___
$code.=<<___ if ($i>=15);
- lea 0x5a827999($xi,$e),$f
- mov `4*($j%16)`(%rsp),$xi
+ mov `4*($j%16)`(%rsp),$xi[1]
mov $c,$t0
- mov $a,$e
- xor `4*(($j+2)%16)`(%rsp),$xi
+ mov $a,$t2
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
xor $d,$t0
- rol \$5,$e
- xor `4*(($j+8)%16)`(%rsp),$xi
+ rol \$5,$t2
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
and $b,$t0
- add $e,$f
- xor `4*(($j+13)%16)`(%rsp),$xi
+ lea 0x5a827999($xi[0],$e),$e
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
xor $d,$t0
+ rol \$1,$xi[1]
+ add $t2,$e
rol \$30,$b
- add $t0,$f
- rol \$1,$xi
- mov $xi,`4*($j%16)`(%rsp)
+ mov $xi[1],`4*($j%16)`(%rsp)
+ add $t0,$e
___
+unshift(@xi,pop(@xi));
}
sub BODY_20_39 {
-my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
- lea $K($xi,$e),$f
- mov `4*($j%16)`(%rsp),$xi
+ mov `4*($j%16)`(%rsp),$xi[1]
mov $c,$t0
- mov $a,$e
- xor `4*(($j+2)%16)`(%rsp),$xi
+ mov $a,$t2
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
xor $b,$t0
- rol \$5,$e
- xor `4*(($j+8)%16)`(%rsp),$xi
+ rol \$5,$t2
+ lea $K($xi[0],$e),$e
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
xor $d,$t0
- add $e,$f
- xor `4*(($j+13)%16)`(%rsp),$xi
+ add $t2,$e
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
rol \$30,$b
- add $t0,$f
- rol \$1,$xi
+ add $t0,$e
+ rol \$1,$xi[1]
___
$code.=<<___ if ($i<76);
- mov $xi,`4*($j%16)`(%rsp)
+ mov $xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
- lea $K($xi,$e),$f
mov $c,$t0
- mov $a,$e
+ mov $a,$t2
xor $b,$t0
- rol \$5,$e
+ lea $K($xi[0],$e),$e
+ rol \$5,$t2
xor $d,$t0
- add $e,$f
+ add $t2,$e
rol \$30,$b
- add $t0,$f
+ add $t0,$e
___
+unshift(@xi,pop(@xi));
}
sub BODY_40_59 {
-my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
- lea 0x8f1bbcdc($xi,$e),$f
- mov `4*($j%16)`(%rsp),$xi
- mov $b,$t0
- mov $b,$t1
- xor `4*(($j+2)%16)`(%rsp),$xi
- mov $a,$e
- and $c,$t0
- xor `4*(($j+8)%16)`(%rsp),$xi
- or $c,$t1
- rol \$5,$e
- xor `4*(($j+13)%16)`(%rsp),$xi
- and $d,$t1
- add $e,$f
- rol \$1,$xi
- or $t1,$t0
+ mov `4*($j%16)`(%rsp),$xi[1]
+ mov $c,$t0
+ mov $c,$t1
+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
+ and $d,$t0
+ mov $a,$t2
+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ xor $d,$t1
+ lea 0x8f1bbcdc($xi[0],$e),$e
+ rol \$5,$t2
+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
+ add $t0,$e
+ and $b,$t1
+ rol \$1,$xi[1]
+ add $t1,$e
rol \$30,$b
- mov $xi,`4*($j%16)`(%rsp)
- add $t0,$f
+ mov $xi[1],`4*($j%16)`(%rsp)
+ add $t2,$e
___
+unshift(@xi,pop(@xi));
}
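
A minimal C sketch (reference only, not part of the patch) of the scalar round the three BODY_* generators above emit. The branchless identities are the ones the asm relies on: Ch(b,c,d) = ((c^d)&b)^d for rounds 0-19, plain parity for 20-39 and 60-79, and Maj(b,c,d) decomposed as (c&d) + ((c^d)&b) for 40-59; the two Maj terms can never share a set bit, which is why the asm may add them into $e separately.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* One SHA-1 round; the caller rotates (a,b,c,d,e) -> (e,a,b,c,d)
 * afterwards, as unshift(@V,pop(@V)) does in the Perl driver. */
static void sha1_round(int i, uint32_t a, uint32_t *b, uint32_t c,
                       uint32_t d, uint32_t *e, uint32_t w)
{
	static const uint32_t K[4] =
	    { 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 };
	uint32_t f;

	if (i < 20)			/* Ch(b,c,d), branchless */
		f = ((c ^ d) & *b) ^ d;
	else if (i < 40 || i >= 60)	/* Parity(b,c,d) */
		f = *b ^ c ^ d;
	else				/* Maj(b,c,d), two disjoint terms */
		f = (c & d) + ((c ^ d) & *b);

	*e += rotl32(a, 5) + f + w + K[i / 20];
	*b  = rotl32(*b, 30);
}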
-$code=".text\n";
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
-&PROLOGUE("sha1_block_data_order");
-$code.=".align 4\n.Lloop:\n";
+.globl sha1_block_data_order
+.type sha1_block_data_order,\@function,3
+.align 16
+sha1_block_data_order:
+ mov OPENSSL_ia32cap_P+0(%rip),%r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r8d
+ test \$`1<<9`,%r8d # check SSSE3 bit
+ jz .Lialu
+___
+$code.=<<___ if ($avx);
+ and \$`1<<28`,%r8d # mask AVX bit
+ and \$`1<<30`,%r9d # mask "Intel CPU" bit
+ or %r9d,%r8d
+ cmp \$`1<<28|1<<30`,%r8d
+ je _avx_shortcut
+___
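
In C terms the dispatch above is roughly the sketch below. It assumes the usual OPENSSL_ia32cap_P layout (word 0 holds CPUID.1:EDX plus OpenSSL's synthetic bits, bit 30 marking an Intel CPU; word 1 holds CPUID.1:ECX, bit 9 = SSSE3, bit 28 = AVX), and the three sha1_*() names are only stand-ins for the .Lialu, _ssse3_shortcut and _avx_shortcut paths.

#include <stddef.h>

extern unsigned int OPENSSL_ia32cap_P[2];	/* filled by OPENSSL_cpuid_setup() */

/* Illustrative stand-ins for the three code paths in the asm above. */
void sha1_ialu(void *ctx, const void *inp, size_t num);
void sha1_ssse3(void *ctx, const void *inp, size_t num);
void sha1_avx(void *ctx, const void *inp, size_t num);

static void sha1_dispatch(void *ctx, const void *inp, size_t num)
{
	unsigned int ecx = OPENSSL_ia32cap_P[1];
	unsigned int edx = OPENSSL_ia32cap_P[0];

	if (!(ecx & (1u << 9)))			/* no SSSE3: plain IALU */
		sha1_ialu(ctx, inp, num);
	else if ((ecx & (1u << 28)) && (edx & (1u << 30)))
		sha1_avx(ctx, inp, num);	/* AVX and "Intel CPU" bit */
	else
		sha1_ssse3(ctx, inp, num);
}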
+$code.=<<___;
+ jmp _ssse3_shortcut
+
+.align 16
+.Lialu:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ mov %rsp,%r11
+ mov %rdi,$ctx # reassigned argument
+ sub \$`8+16*4`,%rsp
+ mov %rsi,$inp # reassigned argument
+ and \$-64,%rsp
+ mov %rdx,$num # reassigned argument
+ mov %r11,`16*4`(%rsp)
+.Lprologue:
+
+ mov 0($ctx),$A
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov 16($ctx),$E
+ jmp .Lloop
+
+.align 16
+.Lloop:
+___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
- add 0($ctx),$E
- add 4($ctx),$T
- add 8($ctx),$A
- add 12($ctx),$B
- add 16($ctx),$C
- mov $E,0($ctx)
- mov $T,4($ctx)
- mov $A,8($ctx)
- mov $B,12($ctx)
- mov $C,16($ctx)
-
- xchg $E,$A # mov $E,$A
- xchg $T,$B # mov $T,$B
- xchg $E,$C # mov $A,$C
- xchg $T,$D # mov $B,$D
- # mov $C,$E
- lea `16*4`($inp),$inp
+ add 0($ctx),$A
+ add 4($ctx),$B
+ add 8($ctx),$C
+ add 12($ctx),$D
+ add 16($ctx),$E
+ mov $A,0($ctx)
+ mov $B,4($ctx)
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+
sub \$1,$num
+ lea `16*4`($inp),$inp
jnz .Lloop
+
+ mov `16*4`(%rsp),%rsi
+ mov (%rsi),%r13
+ mov 8(%rsi),%r12
+ mov 16(%rsi),%rbp
+ mov 24(%rsi),%rbx
+ lea 32(%rsi),%rsp
+.Lepilogue:
+ ret
+.size sha1_block_data_order,.-sha1_block_data_order
___
-&EPILOGUE("sha1_block_data_order");
+{{{
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type sha1_block_data_order_ssse3,\@function,3
+.align 16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+ push %rbx
+ push %rbp
+ push %r12
+ lea `-64-($win64?5*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,64+0(%rsp)
+ movaps %xmm7,64+16(%rsp)
+ movaps %xmm8,64+32(%rsp)
+ movaps %xmm9,64+48(%rsp)
+ movaps %xmm10,64+64(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+
+ shl \$6,$num
+ add $inp,$num
+ lea K_XX_XX(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ movdqa 64($K_XX_XX),@X[2] # pbswap mask
+ movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ movdqu 16($inp),@X[-3&7]
+ movdqu 32($inp),@X[-2&7]
+ movdqu 48($inp),@X[-1&7]
+ pshufb @X[2],@X[-4&7] # byte swap
+ add \$64,$inp
+ pshufb @X[2],@X[-3&7]
+ pshufb @X[2],@X[-2&7]
+ pshufb @X[2],@X[-1&7]
+ paddd @Tx[1],@X[-4&7] # add K_00_19
+ paddd @Tx[1],@X[-3&7]
+ paddd @Tx[1],@X[-2&7]
+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
+ psubd @Tx[1],@X[-4&7] # restore X[]
+ movdqa @X[-3&7],16(%rsp)
+ psubd @Tx[1],@X[-3&7]
+ movdqa @X[-2&7],32(%rsp)
+ psubd @Tx[1],@X[-2&7]
+ jmp .Loop_ssse3
+___
+
+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+sub Xupdate_ssse3_16_31()	# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@X[0],@X[-3&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[0],@X[-1&7]);
+ &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (@Tx[2],@X[0]);
+ &movdqa (@Tx[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &psrld (@Tx[2],30);
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pslld (@Tx[1],2);
+ &pxor (@X[0],@Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
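
Scalar form of what this routine vectorizes, four schedule elements per call: the standard SHA-1 recurrence W[i] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) <<< 1 over a 16-word ring, mirroring the %rsp scratch area. The paddd/psrld 31/por trio above builds the vector rotate-by-one, and the pslldq/psrldq shuffling patches up the fact that W[i-3] falls inside the quad currently being computed.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* One scalar step of the schedule Xupdate_ssse3_16_31 produces
 * four lanes at a time (all indices taken mod 16). */
static uint32_t sha1_schedule(uint32_t w[16], int i)
{
	uint32_t x = w[(i - 3) & 15] ^ w[(i - 8) & 15] ^
	             w[(i - 14) & 15] ^ w[i & 15];	/* i-16 == i mod 16 */
	return w[i & 15] = rotl32(x, 1);
}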
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &movdqa (@Tx[0],@X[0]);
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &psrld (@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$num);
+ &je (".Ldone_ssse3");
+
+ unshift(@Tx,pop(@Tx));
+
+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqu (@X[-4&7],"0($inp)"); # load input
+ &movdqu (@X[-3&7],"16($inp)");
+ &movdqu (@X[-2&7],"32($inp)");
+ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pshufb (@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &paddd (@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &psubd (@X[($Xi-4)&7],@Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&xor ($c,$d);', # restore $c
+ '&xor (@T[0],$d);',
+ '&add ($e,$a);',
+ '&$_ror ($b,$j?7:2);', # $b>>>2
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub body_20_39 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&xor (@T[0],$c);', # ($b^$d^$c)
+ '&add ($e,$a);',
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub body_40_59 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
+ '&add ($e,@T[1]);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
+ '&add ($e,@T[0]);',
+ '&xor ($c,$d);', # restore $c
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
$code.=<<___;
-.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
+.Loop_ssse3:
+___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_00_19);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_40_59);
+ &Xupdate_ssse3_32_79(\&body_20_39);
+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+ $j=$saved_j; @V=@saved_V;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps 64+0(%rsp),%xmm6
+ movaps 64+16(%rsp),%xmm7
+ movaps 64+32(%rsp),%xmm8
+ movaps 64+48(%rsp),%xmm9
+ movaps 64+64(%rsp),%xmm10
+___
+$code.=<<___;
+ lea `64+($win64?5*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r12
+ mov 8(%rsi),%rbp
+ mov 16(%rsi),%rbx
+ lea 24(%rsi),%rsp
+.Lepilogue_ssse3:
+ ret
+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+___
+
+if ($avx) {
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type sha1_block_data_order_avx,\@function,3
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+ push %rbx
+ push %rbp
+ push %r12
+ lea `-64-($win64?5*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,64+0(%rsp)
+ movaps %xmm7,64+16(%rsp)
+ movaps %xmm8,64+32(%rsp)
+ movaps %xmm9,64+48(%rsp)
+ movaps %xmm10,64+64(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+ vzeroupper
+
+ shl \$6,$num
+ add $inp,$num
+ lea K_XX_XX(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ mov 4($ctx),$B
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov $B,@T[0] # magic seed
+ mov 16($ctx),$E
+
+ vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
+ vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
+ vmovdqu 16($inp),@X[-3&7]
+ vmovdqu 32($inp),@X[-2&7]
+ vmovdqu 48($inp),@X[-1&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+ add \$64,$inp
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+ vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
+ vpaddd @Tx[1],@X[-3&7],@X[1]
+ vpaddd @Tx[1],@X[-2&7],@X[2]
+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
+ vmovdqa @X[1],16(%rsp)
+ vmovdqa @X[2],32(%rsp)
+ jmp .Loop_avx
+___
+
+sub Xupdate_avx_16_31()	# recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ if ($Xi%5) {
+ &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+ } else { # ... or load next one
+ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ }
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ &cmp ($inp,$num);
+ &je (".Ldone_avx");
+
+ unshift(@Tx,pop(@Tx));
+
+ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
+ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
+ &vmovdqu(@X[-3&7],"16($inp)");
+ &vmovdqu(@X[-2&7],"32($inp)");
+ &vmovdqu(@X[-1&7],"48($inp)");
+ &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &add ($inp,64);
+
+ $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; }
+ $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_00_19);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_40_59);
+ &Xupdate_avx_32_79(\&body_20_39);
+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
+ $saved_j=$j; @saved_V=@V;
+
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ add 12($ctx),$D
+ mov $A,0($ctx)
+ add 16($ctx),$E
+ mov @T[0],4($ctx)
+ mov @T[0],$B # magic seed
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+___
+ $j=$saved_j; @V=@saved_V;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+ vzeroupper
+
+ add 0($ctx),$A # update context
+ add 4($ctx),@T[0]
+ add 8($ctx),$C
+ mov $A,0($ctx)
+ add 12($ctx),$D
+ mov @T[0],4($ctx)
+ add 16($ctx),$E
+ mov $C,8($ctx)
+ mov $D,12($ctx)
+ mov $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps 64+0(%rsp),%xmm6
+ movaps 64+16(%rsp),%xmm7
+ movaps 64+32(%rsp),%xmm8
+ movaps 64+48(%rsp),%xmm9
+ movaps 64+64(%rsp),%xmm10
+___
+$code.=<<___;
+ lea `64+($win64?5*16:0)`(%rsp),%rsi
+ mov 0(%rsi),%r12
+ mov 8(%rsi),%rbp
+ mov 16(%rsi),%rbx
+ lea 24(%rsi),%rsp
+.Lepilogue_avx:
+ ret
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+___
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+___
+}}}
+$code.=<<___;
+.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -272,25 +1113,75 @@ se_handler:
lea .Lprologue(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lin_prologue
+ jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lin_prologue
+ jae .Lcommon_seh_tail
mov `16*4`(%rax),%rax # pull saved stack pointer
- lea 24(%rax),%rax
+ lea 32(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+
+ jmp .Lcommon_seh_tail
+.size se_handler,.-se_handler
-.Lin_prologue:
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 64(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$10,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea `24+64+5*16`(%rax),%rax # adjust stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+
+.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
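
A hedged C rendering of what the new ssse3_handler implements (a sketch only: field offsets such as 152 = Rsp, 248 = Rip and 512 = Xmm6 are the CONTEXT members named in the comments, HandlerData[0..1] are the prologue/epilogue RVAs installed in .xdata below, and the real common tail additionally patches rdi/rsi and runs the language-specific dispatch, omitted here):

#include <windows.h>
#include <string.h>

static EXCEPTION_DISPOSITION
ssse3_handler_sketch(EXCEPTION_RECORD *rec, ULONG64 frame,
                     CONTEXT *context, DISPATCHER_CONTEXT *disp)
{
	DWORD  *info     = (DWORD *)disp->HandlerData;
	ULONG64 prologue = disp->ImageBase + info[0];
	ULONG64 epilogue = disp->ImageBase + info[1];

	(void)rec; (void)frame;

	if (context->Rip >= prologue && context->Rip < epilogue) {
		ULONG64 rax = context->Rsp;	/* frame at fault time */
		/* xmm6-xmm10 were spilled at 64(%rsp) by the prologue */
		memcpy(&context->Xmm6, (const void *)(rax + 64), 5 * 16);
		rax += 24 + 64 + 5 * 16;	/* past spill + push area */
		context->Rbx = ((ULONG64 *)rax)[-1];
		context->Rbp = ((ULONG64 *)rax)[-2];
		context->R12 = ((ULONG64 *)rax)[-3];
		context->Rsp = rax;		/* unwound to the caller */
	}
	return ExceptionContinueSearch;
}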
@@ -328,19 +1219,38 @@ se_handler:
pop %rdi
pop %rsi
ret
-.size se_handler,.-se_handler
+.size ssse3_handler,.-ssse3_handler
.section .pdata
.align 4
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
-
+ .rva .LSEH_begin_sha1_block_data_order_ssse3
+ .rva .LSEH_end_sha1_block_data_order_ssse3
+ .rva .LSEH_info_sha1_block_data_order_ssse3
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_sha1_block_data_order_avx
+ .rva .LSEH_end_sha1_block_data_order_avx
+ .rva .LSEH_info_sha1_block_data_order_avx
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha1_block_data_order:
.byte 9,0,0,0
.rva se_handler
+.LSEH_info_sha1_block_data_order_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha1_block_data_order_avx:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
}
diff --git a/main/openssl/crypto/sha/asm/sha256-586.S b/main/openssl/crypto/sha/asm/sha256-586.S
new file mode 100644
index 00000000..77a89514
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha256-586.S
@@ -0,0 +1,258 @@
+.file "sha512-586.s"
+.text
+.globl sha256_block_data_order
+.type sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+.L_sha256_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .L001K256-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+.align 16
+.L002loop:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ movl 28(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ movl 44(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ movl 60(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ addl $64,%edi
+ subl $32,%esp
+ movl %edi,100(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edi
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %edi,28(%esp)
+.align 16
+.L00300_15:
+ movl 92(%esp),%ebx
+ movl %edx,%ecx
+ rorl $14,%ecx
+ movl 20(%esp),%esi
+ xorl %edx,%ecx
+ rorl $5,%ecx
+ xorl %edx,%ecx
+ rorl $6,%ecx
+ movl 24(%esp),%edi
+ addl %ecx,%ebx
+ xorl %edi,%esi
+ movl %edx,16(%esp)
+ movl %eax,%ecx
+ andl %edx,%esi
+ movl 12(%esp),%edx
+ xorl %edi,%esi
+ movl %eax,%edi
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl 28(%esp),%ebx
+ xorl %eax,%ecx
+ rorl $11,%ecx
+ movl 4(%esp),%esi
+ xorl %eax,%ecx
+ rorl $2,%ecx
+ addl %ebx,%edx
+ movl 8(%esp),%edi
+ addl %ecx,%ebx
+ movl %eax,(%esp)
+ movl %eax,%ecx
+ subl $4,%esp
+ orl %esi,%eax
+ andl %esi,%ecx
+ andl %edi,%eax
+ movl (%ebp),%esi
+ orl %ecx,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ addl %esi,%edx
+ addl %esi,%eax
+ cmpl $3248222580,%esi
+ jne .L00300_15
+ movl 152(%esp),%ebx
+.align 16
+.L00416_63:
+ movl %ebx,%esi
+ movl 100(%esp),%ecx
+ rorl $11,%esi
+ movl %ecx,%edi
+ xorl %ebx,%esi
+ rorl $7,%esi
+ shrl $3,%ebx
+ rorl $2,%edi
+ xorl %esi,%ebx
+ xorl %ecx,%edi
+ rorl $17,%edi
+ shrl $10,%ecx
+ addl 156(%esp),%ebx
+ xorl %ecx,%edi
+ addl 120(%esp),%ebx
+ movl %edx,%ecx
+ addl %edi,%ebx
+ rorl $14,%ecx
+ movl 20(%esp),%esi
+ xorl %edx,%ecx
+ rorl $5,%ecx
+ movl %ebx,92(%esp)
+ xorl %edx,%ecx
+ rorl $6,%ecx
+ movl 24(%esp),%edi
+ addl %ecx,%ebx
+ xorl %edi,%esi
+ movl %edx,16(%esp)
+ movl %eax,%ecx
+ andl %edx,%esi
+ movl 12(%esp),%edx
+ xorl %edi,%esi
+ movl %eax,%edi
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl 28(%esp),%ebx
+ xorl %eax,%ecx
+ rorl $11,%ecx
+ movl 4(%esp),%esi
+ xorl %eax,%ecx
+ rorl $2,%ecx
+ addl %ebx,%edx
+ movl 8(%esp),%edi
+ addl %ecx,%ebx
+ movl %eax,(%esp)
+ movl %eax,%ecx
+ subl $4,%esp
+ orl %esi,%eax
+ andl %esi,%ecx
+ andl %edi,%eax
+ movl (%ebp),%esi
+ orl %ecx,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ movl 152(%esp),%ebx
+ addl %esi,%edx
+ addl %esi,%eax
+ cmpl $3329325298,%esi
+ jne .L00416_63
+ movl 352(%esp),%esi
+ movl 4(%esp),%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edi
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%ecx
+ addl 12(%esi),%edi
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %ecx,8(%esi)
+ movl %edi,12(%esi)
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ movl 356(%esp),%edi
+ addl 16(%esi),%edx
+ addl 20(%esi),%eax
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %eax,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ addl $352,%esp
+ subl $256,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L001K256:
+.long 1116352408,1899447441,3049323471,3921009573
+.long 961987163,1508970993,2453635748,2870763221
+.long 3624381080,310598401,607225278,1426881987
+.long 1925078388,2162078206,2614888103,3248222580
+.long 3835390401,4022224774,264347078,604807628
+.long 770255983,1249150122,1555081692,1996064986
+.long 2554220882,2821834349,2952996808,3210313671
+.long 3336571891,3584528711,113926993,338241895
+.long 666307205,773529912,1294757372,1396182291
+.long 1695183700,1986661051,2177026350,2456956037
+.long 2730485921,2820302411,3259730800,3345764771
+.long 3516065817,3600352804,4094571909,275423344
+.long 430227734,506948616,659060556,883997877
+.long 958139571,1322822218,1537002063,1747873779
+.long 1955562222,2024104815,2227730452,2361852424
+.long 2428436474,2756734187,3204031479,3329325298
+.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
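
One quirk of the generated loops above: they carry no round counter. Each iteration pulls the next K256 word into %esi, and the cmpl $3248222580,%esi and cmpl $3329325298,%esi tests simply compare it against K256[15] = 0xc19bf174 and K256[63] = 0xc67178f2, the last constants of each phase. A sketch of the idea:

#include <stdint.h>

/* Counter-free loop in the style of .L00300_15 above: iterate until
 * the round constant just consumed is the phase's final one. */
static const uint32_t *rounds_00_15_sketch(const uint32_t *kp)
{
	uint32_t k;
	do {
		k = *kp++;
		/* ... one SHA-256 round using k ... */
	} while (k != 0xc19bf174);	/* K256[15] == 3248222580 */
	return kp;
}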
diff --git a/main/openssl/crypto/sha/asm/sha256-586.pl b/main/openssl/crypto/sha/asm/sha256-586.pl
index ecc8b69c..52a7c7f8 100644
--- a/main/openssl/crypto/sha/asm/sha256-586.pl
+++ b/main/openssl/crypto/sha/asm/sha256-586.pl
@@ -14,14 +14,14 @@
# Pentium PIII P4 AMD K8 Core2
# gcc 46 36 41 27 26
# icc 57 33 38 25 23
-# x86 asm 40 30 35 20 20
-# x86_64 asm(*) - - 21 15.8 16.5
+# x86 asm 40 30 33 20 18
+# x86_64 asm(*) - - 21 16 16
#
# (*) x86_64 assembler performance is presented for reference
# purposes.
#
# Performance improvement over compiler generated code varies from
-# 10% to 40% [see above]. Not very impressive on some µ-archs, but
+# 10% to 40% [see above]. Not very impressive on some µ-archs, but
# it's 5 times smaller and optimizes amount of writes.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
@@ -48,20 +48,19 @@ sub BODY_00_15() {
my $in_16_63=shift;
&mov ("ecx",$E);
- &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7]
- &ror ("ecx",6);
- &mov ("edi",$E);
- &ror ("edi",11);
+ &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2])
+ &ror ("ecx",25-11);
&mov ("esi",$Foff);
- &xor ("ecx","edi");
- &ror ("edi",25-11);
+ &xor ("ecx",$E);
+ &ror ("ecx",11-6);
&mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0]
- &xor ("ecx","edi"); # Sigma1(e)
+ &xor ("ecx",$E);
+ &ror ("ecx",6); # Sigma1(e)
&mov ("edi",$Goff);
&add ($T,"ecx"); # T += Sigma1(e)
- &mov ($Eoff,$E); # modulo-scheduled
&xor ("esi","edi");
+ &mov ($Eoff,$E); # modulo-scheduled
&mov ("ecx",$A);
&and ("esi",$E);
&mov ($E,$Doff); # e becomes d, which is e in next iteration
@@ -69,14 +68,14 @@ sub BODY_00_15() {
&mov ("edi",$A);
&add ($T,"esi"); # T += Ch(e,f,g)
- &ror ("ecx",2);
+ &ror ("ecx",22-13);
&add ($T,$Hoff); # T += h
- &ror ("edi",13);
+ &xor ("ecx",$A);
+ &ror ("ecx",13-2);
&mov ("esi",$Boff);
- &xor ("ecx","edi");
- &ror ("edi",22-13);
+ &xor ("ecx",$A);
+ &ror ("ecx",2); # Sigma0(a)
&add ($E,$T); # d += T
- &xor ("ecx","edi"); # Sigma0(a)
&mov ("edi",$Coff);
&add ($T,"ecx"); # T += Sigma0(a)
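
The rewrite above is more than instruction scheduling: it evaluates Sigma1(e) as three dependent rotate/xor steps on one register, using the identity Sigma1(e) = ((((e >>> 14) ^ e) >>> 5) ^ e) >>> 6 (the ror amounts 25-11, 11-6 and 6 in the code), and Sigma0(a) likewise with 22-13, 13-2 and 2. A self-checking C sketch; since both forms are GF(2)-linear in e, testing the 32 single-bit inputs proves the identity for all inputs:

#include <assert.h>
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

static uint32_t Sigma1(uint32_t e)		/* textbook form */
{
	return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
}

static uint32_t Sigma1_folded(uint32_t e)	/* form computed above */
{
	uint32_t t = rotr32(e, 25 - 11);	/* ror 14 */
	t = rotr32(t ^ e, 11 - 6);		/* ror 5  */
	return rotr32(t ^ e, 6);		/* ror 6  */
}

int main(void)
{
	for (uint32_t e = 1; e; e <<= 1)	/* all single-bit inputs */
		assert(Sigma1(e) == Sigma1_folded(e));
	return 0;
}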
@@ -168,23 +167,22 @@ sub BODY_00_15() {
&set_label("16_63",16);
&mov ("esi",$T);
&mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
- &shr ($T,3);
- &ror ("esi",7);
- &xor ($T,"esi");
&ror ("esi",18-7);
&mov ("edi","ecx");
- &xor ($T,"esi"); # T = sigma0(X[-15])
+ &xor ("esi",$T);
+ &ror ("esi",7);
+ &shr ($T,3);
- &shr ("ecx",10);
- &mov ("esi",&DWP(4*(8+15+16),"esp"));
- &ror ("edi",17);
- &xor ("ecx","edi");
&ror ("edi",19-17);
- &add ($T,"esi"); # T += X[-16]
- &xor ("edi","ecx") # sigma1(X[-2])
+ &xor ($T,"esi"); # T = sigma0(X[-15])
+ &xor ("edi","ecx");
+ &ror ("edi",17);
+ &shr ("ecx",10);
+ &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16]
+ &xor ("edi","ecx"); # sigma1(X[-2])
- &add ($T,"edi"); # T += sigma1(X[-2])
- # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1)
+ &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
+ # &add ($T,"edi"); # T += sigma1(X[-2])
# &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
&BODY_00_15(1);
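
For reference, the quantities being reordered here, in C: sigma0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3), sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10), and the schedule step X[0] = sigma1(X[-2]) + X[-7] + sigma0(X[-15]) + X[-16]. The diff only moves where the X[-7] and sigma1(X[-2]) additions happen, not what is computed. A sketch:

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

static inline uint32_t sigma0(uint32_t x)	/* sigma0(X[-15]) */
{
	return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3);
}

static inline uint32_t sigma1(uint32_t x)	/* sigma1(X[-2]) */
{
	return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10);
}

/* One schedule step over the 16-word ring kept on the stack. */
static uint32_t sha256_schedule(uint32_t x[16], int i)
{
	x[i & 15] += sigma1(x[(i - 2) & 15]) + x[(i - 7) & 15]
	           + sigma0(x[(i - 15) & 15]);
	return x[i & 15];
}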
diff --git a/main/openssl/crypto/sha/asm/sha256-armv4.pl b/main/openssl/crypto/sha/asm/sha256-armv4.pl
index 492cb62b..9c84e8d9 100644
--- a/main/openssl/crypto/sha/asm/sha256-armv4.pl
+++ b/main/openssl/crypto/sha/asm/sha256-armv4.pl
@@ -18,11 +18,16 @@
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~17 cycles per processed byte.
+
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
-$inp="r1";
+$inp="r1"; $t3="r1";
$len="r2"; $t1="r2";
$T1="r3";
$A="r4";
@@ -46,6 +51,9 @@ sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ ldr $T1,[$inp],#4
+#else
ldrb $T1,[$inp,#3] @ $i
ldrb $t2,[$inp,#2]
ldrb $t1,[$inp,#1]
@@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
orr $T1,$T1,$t2,lsl#8
orr $T1,$T1,$t1,lsl#16
orr $T1,$T1,$t0,lsl#24
- `"str $inp,[sp,#17*4]" if ($i==15)`
+#endif
___
$code.=<<___;
- ldr $t2,[$Ktbl],#4 @ *K256++
mov $t0,$e,ror#$Sigma1[0]
- str $T1,[sp,#`$i%16`*4]
+ ldr $t2,[$Ktbl],#4 @ *K256++
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $t1,$f,$g
+#if $i>=16
+ add $T1,$T1,$t3 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev $T1,$T1
+#endif
+#if $i==15
+ str $inp,[sp,#17*4] @ leave room for $t3
+#endif
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e
+ str $T1,[sp,#`$i%16`*4]
add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g)
add $T1,$T1,$h
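
The guarded block is an input-load fast path: on __ARM_ARCH__>=7 the four ldrb/orr steps that assemble each big-endian word byte by byte are replaced with a single ldr, plus a rev when the core is little-endian. Roughly, in C (gcc/clang builtins assumed):

#include <stdint.h>

/* Pre-ARMv7 path: assemble the big-endian word from four byte loads. */
static uint32_t load_be32_bytes(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

/* ARMv7 path: one word load (ldr), then a byte swap (rev) when the
 * core runs little-endian. */
static uint32_t load_be32_word(const uint8_t *p)
{
	uint32_t w;
	__builtin_memcpy(&w, p, 4);	/* ldr */
#if defined(__ARMEL__)
	w = __builtin_bswap32(w);	/* rev */
#endif
	return w;
}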
@@ -71,6 +87,9 @@ $code.=<<___;
eor $h,$h,$a,ror#$Sigma0[1]
add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
+#if $i>=15
+ ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
+#endif
orr $t0,$a,$b
and $t1,$a,$b
and $t0,$t0,$c
@@ -85,24 +104,26 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4]
+ mov $t0,$t3,ror#$sigma0[0]
ldr $T1,[sp,#`($i+0)%16`*4]
- mov $t0,$t1,ror#$sigma0[0]
- ldr $inp,[sp,#`($i+9)%16`*4]
- eor $t0,$t0,$t1,ror#$sigma0[1]
- eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
- mov $t1,$t2,ror#$sigma1[0]
+ eor $t0,$t0,$t3,ror#$sigma0[1]
+ ldr $t1,[sp,#`($i+9)%16`*4]
+ eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
+ mov $t3,$t2,ror#$sigma1[0]
add $T1,$T1,$t0
- eor $t1,$t1,$t2,ror#$sigma1[1]
- add $T1,$T1,$inp
- eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
+ eor $t3,$t3,$t2,ror#$sigma1[1]
add $T1,$T1,$t1
+ eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
+ @ add $T1,$T1,$t3
___
&BODY_00_15(@_);
}
$code=<<___;
+#include "arm_arch.h"
+
.text
.code 32
@@ -132,7 +153,7 @@ K256:
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
- stmdb sp!,{$ctx,$inp,$len,r4-r12,lr}
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
@@ -171,10 +192,14 @@ $code.=<<___;
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
- ldmia sp!,{r4-r12,lr}
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
+#endif
.size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
diff --git a/main/openssl/crypto/sha/asm/sha256-armv4.s b/main/openssl/crypto/sha/asm/sha256-armv4.s
index ee903dc4..9c20a63c 100644
--- a/main/openssl/crypto/sha/asm/sha256-armv4.s
+++ b/main/openssl/crypto/sha/asm/sha256-armv4.s
@@ -1,3 +1,5 @@
+#include "arm_arch.h"
+
.text
.code 32
@@ -27,11 +29,14 @@ K256:
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add r2,r1,r2,lsl#6 @ len to point at the end of inp
- stmdb sp!,{r0,r1,r2,r4-r12,lr}
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
sub r14,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 0
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -39,14 +44,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r8,ror#6
- str r3,[sp,#0*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r8,ror#11
eor r2,r9,r10
+#if 0>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 0==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r8,ror#25 @ Sigma1(e)
and r2,r2,r8
+ str r3,[sp,#0*4]
add r3,r3,r0
eor r2,r2,r10 @ Ch(e,f,g)
add r3,r3,r11
@@ -55,6 +68,9 @@ sha256_block_data_order:
eor r11,r11,r4,ror#13
add r3,r3,r12
eor r11,r11,r4,ror#22 @ Sigma0(a)
+#if 0>=15
+ ldr r1,[sp,#2*4] @ from BODY_16_xx
+#endif
orr r0,r4,r5
and r2,r4,r5
and r0,r0,r6
@@ -62,6 +78,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r7,r7,r3
add r11,r11,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 1
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -69,14 +88,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r7,ror#6
- str r3,[sp,#1*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r7,ror#11
eor r2,r8,r9
+#if 1>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 1==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r7,ror#25 @ Sigma1(e)
and r2,r2,r7
+ str r3,[sp,#1*4]
add r3,r3,r0
eor r2,r2,r9 @ Ch(e,f,g)
add r3,r3,r10
@@ -85,6 +112,9 @@ sha256_block_data_order:
eor r10,r10,r11,ror#13
add r3,r3,r12
eor r10,r10,r11,ror#22 @ Sigma0(a)
+#if 1>=15
+ ldr r1,[sp,#3*4] @ from BODY_16_xx
+#endif
orr r0,r11,r4
and r2,r11,r4
and r0,r0,r5
@@ -92,6 +122,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r6,r6,r3
add r10,r10,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 2
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -99,14 +132,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r6,ror#6
- str r3,[sp,#2*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r6,ror#11
eor r2,r7,r8
+#if 2>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 2==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r6,ror#25 @ Sigma1(e)
and r2,r2,r6
+ str r3,[sp,#2*4]
add r3,r3,r0
eor r2,r2,r8 @ Ch(e,f,g)
add r3,r3,r9
@@ -115,6 +156,9 @@ sha256_block_data_order:
eor r9,r9,r10,ror#13
add r3,r3,r12
eor r9,r9,r10,ror#22 @ Sigma0(a)
+#if 2>=15
+ ldr r1,[sp,#4*4] @ from BODY_16_xx
+#endif
orr r0,r10,r11
and r2,r10,r11
and r0,r0,r4
@@ -122,6 +166,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r5,r5,r3
add r9,r9,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 3
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -129,14 +176,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r5,ror#6
- str r3,[sp,#3*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r5,ror#11
eor r2,r6,r7
+#if 3>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 3==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r5,ror#25 @ Sigma1(e)
and r2,r2,r5
+ str r3,[sp,#3*4]
add r3,r3,r0
eor r2,r2,r7 @ Ch(e,f,g)
add r3,r3,r8
@@ -145,6 +200,9 @@ sha256_block_data_order:
eor r8,r8,r9,ror#13
add r3,r3,r12
eor r8,r8,r9,ror#22 @ Sigma0(a)
+#if 3>=15
+ ldr r1,[sp,#5*4] @ from BODY_16_xx
+#endif
orr r0,r9,r10
and r2,r9,r10
and r0,r0,r11
@@ -152,6 +210,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r4,r4,r3
add r8,r8,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 4
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -159,14 +220,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r4,ror#6
- str r3,[sp,#4*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r4,ror#11
eor r2,r5,r6
+#if 4>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 4==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r4,ror#25 @ Sigma1(e)
and r2,r2,r4
+ str r3,[sp,#4*4]
add r3,r3,r0
eor r2,r2,r6 @ Ch(e,f,g)
add r3,r3,r7
@@ -175,6 +244,9 @@ sha256_block_data_order:
eor r7,r7,r8,ror#13
add r3,r3,r12
eor r7,r7,r8,ror#22 @ Sigma0(a)
+#if 4>=15
+ ldr r1,[sp,#6*4] @ from BODY_16_xx
+#endif
orr r0,r8,r9
and r2,r8,r9
and r0,r0,r10
@@ -182,6 +254,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r11,r11,r3
add r7,r7,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 5
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -189,14 +264,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r11,ror#6
- str r3,[sp,#5*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r11,ror#11
eor r2,r4,r5
+#if 5>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 5==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r11,ror#25 @ Sigma1(e)
and r2,r2,r11
+ str r3,[sp,#5*4]
add r3,r3,r0
eor r2,r2,r5 @ Ch(e,f,g)
add r3,r3,r6
@@ -205,6 +288,9 @@ sha256_block_data_order:
eor r6,r6,r7,ror#13
add r3,r3,r12
eor r6,r6,r7,ror#22 @ Sigma0(a)
+#if 5>=15
+ ldr r1,[sp,#7*4] @ from BODY_16_xx
+#endif
orr r0,r7,r8
and r2,r7,r8
and r0,r0,r9
@@ -212,6 +298,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r10,r10,r3
add r6,r6,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 6
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -219,14 +308,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r10,ror#6
- str r3,[sp,#6*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r10,ror#11
eor r2,r11,r4
+#if 6>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 6==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r10,ror#25 @ Sigma1(e)
and r2,r2,r10
+ str r3,[sp,#6*4]
add r3,r3,r0
eor r2,r2,r4 @ Ch(e,f,g)
add r3,r3,r5
@@ -235,6 +332,9 @@ sha256_block_data_order:
eor r5,r5,r6,ror#13
add r3,r3,r12
eor r5,r5,r6,ror#22 @ Sigma0(a)
+#if 6>=15
+ ldr r1,[sp,#8*4] @ from BODY_16_xx
+#endif
orr r0,r6,r7
and r2,r6,r7
and r0,r0,r8
@@ -242,6 +342,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r9,r9,r3
add r5,r5,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 7
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -249,14 +352,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r9,ror#6
- str r3,[sp,#7*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r9,ror#11
eor r2,r10,r11
+#if 7>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 7==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r9,ror#25 @ Sigma1(e)
and r2,r2,r9
+ str r3,[sp,#7*4]
add r3,r3,r0
eor r2,r2,r11 @ Ch(e,f,g)
add r3,r3,r4
@@ -265,6 +376,9 @@ sha256_block_data_order:
eor r4,r4,r5,ror#13
add r3,r3,r12
eor r4,r4,r5,ror#22 @ Sigma0(a)
+#if 7>=15
+ ldr r1,[sp,#9*4] @ from BODY_16_xx
+#endif
orr r0,r5,r6
and r2,r5,r6
and r0,r0,r7
@@ -272,6 +386,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r8,r8,r3
add r4,r4,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 8
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -279,14 +396,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r8,ror#6
- str r3,[sp,#8*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r8,ror#11
eor r2,r9,r10
+#if 8>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 8==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r8,ror#25 @ Sigma1(e)
and r2,r2,r8
+ str r3,[sp,#8*4]
add r3,r3,r0
eor r2,r2,r10 @ Ch(e,f,g)
add r3,r3,r11
@@ -295,6 +420,9 @@ sha256_block_data_order:
eor r11,r11,r4,ror#13
add r3,r3,r12
eor r11,r11,r4,ror#22 @ Sigma0(a)
+#if 8>=15
+ ldr r1,[sp,#10*4] @ from BODY_16_xx
+#endif
orr r0,r4,r5
and r2,r4,r5
and r0,r0,r6
@@ -302,6 +430,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r7,r7,r3
add r11,r11,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 9
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -309,14 +440,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r7,ror#6
- str r3,[sp,#9*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r7,ror#11
eor r2,r8,r9
+#if 9>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 9==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r7,ror#25 @ Sigma1(e)
and r2,r2,r7
+ str r3,[sp,#9*4]
add r3,r3,r0
eor r2,r2,r9 @ Ch(e,f,g)
add r3,r3,r10
@@ -325,6 +464,9 @@ sha256_block_data_order:
eor r10,r10,r11,ror#13
add r3,r3,r12
eor r10,r10,r11,ror#22 @ Sigma0(a)
+#if 9>=15
+ ldr r1,[sp,#11*4] @ from BODY_16_xx
+#endif
orr r0,r11,r4
and r2,r11,r4
and r0,r0,r5
@@ -332,6 +474,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r6,r6,r3
add r10,r10,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 10
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -339,14 +484,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r6,ror#6
- str r3,[sp,#10*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r6,ror#11
eor r2,r7,r8
+#if 10>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 10==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r6,ror#25 @ Sigma1(e)
and r2,r2,r6
+ str r3,[sp,#10*4]
add r3,r3,r0
eor r2,r2,r8 @ Ch(e,f,g)
add r3,r3,r9
@@ -355,6 +508,9 @@ sha256_block_data_order:
eor r9,r9,r10,ror#13
add r3,r3,r12
eor r9,r9,r10,ror#22 @ Sigma0(a)
+#if 10>=15
+ ldr r1,[sp,#12*4] @ from BODY_16_xx
+#endif
orr r0,r10,r11
and r2,r10,r11
and r0,r0,r4
@@ -362,6 +518,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r5,r5,r3
add r9,r9,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 11
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -369,14 +528,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r5,ror#6
- str r3,[sp,#11*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r5,ror#11
eor r2,r6,r7
+#if 11>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 11==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r5,ror#25 @ Sigma1(e)
and r2,r2,r5
+ str r3,[sp,#11*4]
add r3,r3,r0
eor r2,r2,r7 @ Ch(e,f,g)
add r3,r3,r8
@@ -385,6 +552,9 @@ sha256_block_data_order:
eor r8,r8,r9,ror#13
add r3,r3,r12
eor r8,r8,r9,ror#22 @ Sigma0(a)
+#if 11>=15
+ ldr r1,[sp,#13*4] @ from BODY_16_xx
+#endif
orr r0,r9,r10
and r2,r9,r10
and r0,r0,r11
@@ -392,6 +562,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r4,r4,r3
add r8,r8,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 12
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -399,14 +572,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r4,ror#6
- str r3,[sp,#12*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r4,ror#11
eor r2,r5,r6
+#if 12>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 12==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r4,ror#25 @ Sigma1(e)
and r2,r2,r4
+ str r3,[sp,#12*4]
add r3,r3,r0
eor r2,r2,r6 @ Ch(e,f,g)
add r3,r3,r7
@@ -415,6 +596,9 @@ sha256_block_data_order:
eor r7,r7,r8,ror#13
add r3,r3,r12
eor r7,r7,r8,ror#22 @ Sigma0(a)
+#if 12>=15
+ ldr r1,[sp,#14*4] @ from BODY_16_xx
+#endif
orr r0,r8,r9
and r2,r8,r9
and r0,r0,r10
@@ -422,6 +606,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r11,r11,r3
add r7,r7,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 13
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -429,14 +616,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r11,ror#6
- str r3,[sp,#13*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r11,ror#11
eor r2,r4,r5
+#if 13>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 13==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r11,ror#25 @ Sigma1(e)
and r2,r2,r11
+ str r3,[sp,#13*4]
add r3,r3,r0
eor r2,r2,r5 @ Ch(e,f,g)
add r3,r3,r6
@@ -445,6 +640,9 @@ sha256_block_data_order:
eor r6,r6,r7,ror#13
add r3,r3,r12
eor r6,r6,r7,ror#22 @ Sigma0(a)
+#if 13>=15
+ ldr r1,[sp,#15*4] @ from BODY_16_xx
+#endif
orr r0,r7,r8
and r2,r7,r8
and r0,r0,r9
@@ -452,6 +650,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r10,r10,r3
add r6,r6,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 14
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -459,14 +660,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
-
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r10,ror#6
- str r3,[sp,#14*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r10,ror#11
eor r2,r11,r4
+#if 14>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 14==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r10,ror#25 @ Sigma1(e)
and r2,r2,r10
+ str r3,[sp,#14*4]
add r3,r3,r0
eor r2,r2,r4 @ Ch(e,f,g)
add r3,r3,r5
@@ -475,6 +684,9 @@ sha256_block_data_order:
eor r5,r5,r6,ror#13
add r3,r3,r12
eor r5,r5,r6,ror#22 @ Sigma0(a)
+#if 14>=15
+ ldr r1,[sp,#0*4] @ from BODY_16_xx
+#endif
orr r0,r6,r7
and r2,r6,r7
and r0,r0,r8
@@ -482,6 +694,9 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r9,r9,r3
add r5,r5,r0
+#if __ARM_ARCH__>=7
+ ldr r3,[r1],#4
+#else
ldrb r3,[r1,#3] @ 15
ldrb r12,[r1,#2]
ldrb r2,[r1,#1]
@@ -489,14 +704,22 @@ sha256_block_data_order:
orr r3,r3,r12,lsl#8
orr r3,r3,r2,lsl#16
orr r3,r3,r0,lsl#24
- str r1,[sp,#17*4]
- ldr r12,[r14],#4 @ *K256++
+#endif
mov r0,r9,ror#6
- str r3,[sp,#15*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r9,ror#11
eor r2,r10,r11
+#if 15>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 15==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r9,ror#25 @ Sigma1(e)
and r2,r2,r9
+ str r3,[sp,#15*4]
add r3,r3,r0
eor r2,r2,r11 @ Ch(e,f,g)
add r3,r3,r4
@@ -505,6 +728,9 @@ sha256_block_data_order:
eor r4,r4,r5,ror#13
add r3,r3,r12
eor r4,r4,r5,ror#22 @ Sigma0(a)
+#if 15>=15
+ ldr r1,[sp,#1*4] @ from BODY_16_xx
+#endif
orr r0,r5,r6
and r2,r5,r6
and r0,r0,r7
@@ -513,26 +739,34 @@ sha256_block_data_order:
add r8,r8,r3
add r4,r4,r0
.Lrounds_16_xx:
- ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#1*4] @ 16
ldr r12,[sp,#14*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#0*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#9*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#9*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r8,ror#6
- str r3,[sp,#0*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r8,ror#11
eor r2,r9,r10
+#if 16>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 16==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r8,ror#25 @ Sigma1(e)
and r2,r2,r8
+ str r3,[sp,#0*4]
add r3,r3,r0
eor r2,r2,r10 @ Ch(e,f,g)
add r3,r3,r11
@@ -541,6 +775,9 @@ sha256_block_data_order:
eor r11,r11,r4,ror#13
add r3,r3,r12
eor r11,r11,r4,ror#22 @ Sigma0(a)
+#if 16>=15
+ ldr r1,[sp,#2*4] @ from BODY_16_xx
+#endif
orr r0,r4,r5
and r2,r4,r5
and r0,r0,r6
@@ -548,26 +785,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r7,r7,r3
add r11,r11,r0
- ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#2*4] @ 17
ldr r12,[sp,#15*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#1*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#10*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#10*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r7,ror#6
- str r3,[sp,#1*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r7,ror#11
eor r2,r8,r9
+#if 17>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 17==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r7,ror#25 @ Sigma1(e)
and r2,r2,r7
+ str r3,[sp,#1*4]
add r3,r3,r0
eor r2,r2,r9 @ Ch(e,f,g)
add r3,r3,r10
@@ -576,6 +821,9 @@ sha256_block_data_order:
eor r10,r10,r11,ror#13
add r3,r3,r12
eor r10,r10,r11,ror#22 @ Sigma0(a)
+#if 17>=15
+ ldr r1,[sp,#3*4] @ from BODY_16_xx
+#endif
orr r0,r11,r4
and r2,r11,r4
and r0,r0,r5
@@ -583,26 +831,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r6,r6,r3
add r10,r10,r0
- ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#3*4] @ 18
ldr r12,[sp,#0*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#2*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#11*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#11*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r6,ror#6
- str r3,[sp,#2*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r6,ror#11
eor r2,r7,r8
+#if 18>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 18==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r6,ror#25 @ Sigma1(e)
and r2,r2,r6
+ str r3,[sp,#2*4]
add r3,r3,r0
eor r2,r2,r8 @ Ch(e,f,g)
add r3,r3,r9
@@ -611,6 +867,9 @@ sha256_block_data_order:
eor r9,r9,r10,ror#13
add r3,r3,r12
eor r9,r9,r10,ror#22 @ Sigma0(a)
+#if 18>=15
+ ldr r1,[sp,#4*4] @ from BODY_16_xx
+#endif
orr r0,r10,r11
and r2,r10,r11
and r0,r0,r4
@@ -618,26 +877,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r5,r5,r3
add r9,r9,r0
- ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#4*4] @ 19
ldr r12,[sp,#1*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#3*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#12*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#12*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r5,ror#6
- str r3,[sp,#3*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r5,ror#11
eor r2,r6,r7
+#if 19>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 19==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r5,ror#25 @ Sigma1(e)
and r2,r2,r5
+ str r3,[sp,#3*4]
add r3,r3,r0
eor r2,r2,r7 @ Ch(e,f,g)
add r3,r3,r8
@@ -646,6 +913,9 @@ sha256_block_data_order:
eor r8,r8,r9,ror#13
add r3,r3,r12
eor r8,r8,r9,ror#22 @ Sigma0(a)
+#if 19>=15
+ ldr r1,[sp,#5*4] @ from BODY_16_xx
+#endif
orr r0,r9,r10
and r2,r9,r10
and r0,r0,r11
@@ -653,26 +923,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r4,r4,r3
add r8,r8,r0
- ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#5*4] @ 20
ldr r12,[sp,#2*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#4*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#13*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#13*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r4,ror#6
- str r3,[sp,#4*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r4,ror#11
eor r2,r5,r6
+#if 20>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 20==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r4,ror#25 @ Sigma1(e)
and r2,r2,r4
+ str r3,[sp,#4*4]
add r3,r3,r0
eor r2,r2,r6 @ Ch(e,f,g)
add r3,r3,r7
@@ -681,6 +959,9 @@ sha256_block_data_order:
eor r7,r7,r8,ror#13
add r3,r3,r12
eor r7,r7,r8,ror#22 @ Sigma0(a)
+#if 20>=15
+ ldr r1,[sp,#6*4] @ from BODY_16_xx
+#endif
orr r0,r8,r9
and r2,r8,r9
and r0,r0,r10
@@ -688,26 +969,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r11,r11,r3
add r7,r7,r0
- ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#6*4] @ 21
ldr r12,[sp,#3*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#5*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#14*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#14*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r11,ror#6
- str r3,[sp,#5*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r11,ror#11
eor r2,r4,r5
+#if 21>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 21==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r11,ror#25 @ Sigma1(e)
and r2,r2,r11
+ str r3,[sp,#5*4]
add r3,r3,r0
eor r2,r2,r5 @ Ch(e,f,g)
add r3,r3,r6
@@ -716,6 +1005,9 @@ sha256_block_data_order:
eor r6,r6,r7,ror#13
add r3,r3,r12
eor r6,r6,r7,ror#22 @ Sigma0(a)
+#if 21>=15
+ ldr r1,[sp,#7*4] @ from BODY_16_xx
+#endif
orr r0,r7,r8
and r2,r7,r8
and r0,r0,r9
@@ -723,26 +1015,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r10,r10,r3
add r6,r6,r0
- ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#7*4] @ 22
ldr r12,[sp,#4*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#6*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#15*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#15*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r10,ror#6
- str r3,[sp,#6*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r10,ror#11
eor r2,r11,r4
+#if 22>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 22==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r10,ror#25 @ Sigma1(e)
and r2,r2,r10
+ str r3,[sp,#6*4]
add r3,r3,r0
eor r2,r2,r4 @ Ch(e,f,g)
add r3,r3,r5
@@ -751,6 +1051,9 @@ sha256_block_data_order:
eor r5,r5,r6,ror#13
add r3,r3,r12
eor r5,r5,r6,ror#22 @ Sigma0(a)
+#if 22>=15
+ ldr r1,[sp,#8*4] @ from BODY_16_xx
+#endif
orr r0,r6,r7
and r2,r6,r7
and r0,r0,r8
@@ -758,26 +1061,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r9,r9,r3
add r5,r5,r0
- ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#8*4] @ 23
ldr r12,[sp,#5*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#7*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#0*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#0*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r9,ror#6
- str r3,[sp,#7*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r9,ror#11
eor r2,r10,r11
+#if 23>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 23==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r9,ror#25 @ Sigma1(e)
and r2,r2,r9
+ str r3,[sp,#7*4]
add r3,r3,r0
eor r2,r2,r11 @ Ch(e,f,g)
add r3,r3,r4
@@ -786,6 +1097,9 @@ sha256_block_data_order:
eor r4,r4,r5,ror#13
add r3,r3,r12
eor r4,r4,r5,ror#22 @ Sigma0(a)
+#if 23>=15
+ ldr r1,[sp,#9*4] @ from BODY_16_xx
+#endif
orr r0,r5,r6
and r2,r5,r6
and r0,r0,r7
@@ -793,26 +1107,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r8,r8,r3
add r4,r4,r0
- ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#9*4] @ 24
ldr r12,[sp,#6*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#8*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#1*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#1*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r8,ror#6
- str r3,[sp,#8*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r8,ror#11
eor r2,r9,r10
+#if 24>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 24==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r8,ror#25 @ Sigma1(e)
and r2,r2,r8
+ str r3,[sp,#8*4]
add r3,r3,r0
eor r2,r2,r10 @ Ch(e,f,g)
add r3,r3,r11
@@ -821,6 +1143,9 @@ sha256_block_data_order:
eor r11,r11,r4,ror#13
add r3,r3,r12
eor r11,r11,r4,ror#22 @ Sigma0(a)
+#if 24>=15
+ ldr r1,[sp,#10*4] @ from BODY_16_xx
+#endif
orr r0,r4,r5
and r2,r4,r5
and r0,r0,r6
@@ -828,26 +1153,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r7,r7,r3
add r11,r11,r0
- ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#10*4] @ 25
ldr r12,[sp,#7*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#9*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#2*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#2*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r7,ror#6
- str r3,[sp,#9*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r7,ror#11
eor r2,r8,r9
+#if 25>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 25==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r7,ror#25 @ Sigma1(e)
and r2,r2,r7
+ str r3,[sp,#9*4]
add r3,r3,r0
eor r2,r2,r9 @ Ch(e,f,g)
add r3,r3,r10
@@ -856,6 +1189,9 @@ sha256_block_data_order:
eor r10,r10,r11,ror#13
add r3,r3,r12
eor r10,r10,r11,ror#22 @ Sigma0(a)
+#if 25>=15
+ ldr r1,[sp,#11*4] @ from BODY_16_xx
+#endif
orr r0,r11,r4
and r2,r11,r4
and r0,r0,r5
@@ -863,26 +1199,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r6,r6,r3
add r10,r10,r0
- ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#11*4] @ 26
ldr r12,[sp,#8*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#10*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#3*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#3*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r6,ror#6
- str r3,[sp,#10*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r6,ror#11
eor r2,r7,r8
+#if 26>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 26==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r6,ror#25 @ Sigma1(e)
and r2,r2,r6
+ str r3,[sp,#10*4]
add r3,r3,r0
eor r2,r2,r8 @ Ch(e,f,g)
add r3,r3,r9
@@ -891,6 +1235,9 @@ sha256_block_data_order:
eor r9,r9,r10,ror#13
add r3,r3,r12
eor r9,r9,r10,ror#22 @ Sigma0(a)
+#if 26>=15
+ ldr r1,[sp,#12*4] @ from BODY_16_xx
+#endif
orr r0,r10,r11
and r2,r10,r11
and r0,r0,r4
@@ -898,26 +1245,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r5,r5,r3
add r9,r9,r0
- ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#12*4] @ 27
ldr r12,[sp,#9*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#11*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#4*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#4*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r5,ror#6
- str r3,[sp,#11*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r5,ror#11
eor r2,r6,r7
+#if 27>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 27==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r5,ror#25 @ Sigma1(e)
and r2,r2,r5
+ str r3,[sp,#11*4]
add r3,r3,r0
eor r2,r2,r7 @ Ch(e,f,g)
add r3,r3,r8
@@ -926,6 +1281,9 @@ sha256_block_data_order:
eor r8,r8,r9,ror#13
add r3,r3,r12
eor r8,r8,r9,ror#22 @ Sigma0(a)
+#if 27>=15
+ ldr r1,[sp,#13*4] @ from BODY_16_xx
+#endif
orr r0,r9,r10
and r2,r9,r10
and r0,r0,r11
@@ -933,26 +1291,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r4,r4,r3
add r8,r8,r0
- ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#13*4] @ 28
ldr r12,[sp,#10*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#12*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#5*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#5*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r4,ror#6
- str r3,[sp,#12*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r4,ror#11
eor r2,r5,r6
+#if 28>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 28==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r4,ror#25 @ Sigma1(e)
and r2,r2,r4
+ str r3,[sp,#12*4]
add r3,r3,r0
eor r2,r2,r6 @ Ch(e,f,g)
add r3,r3,r7
@@ -961,6 +1327,9 @@ sha256_block_data_order:
eor r7,r7,r8,ror#13
add r3,r3,r12
eor r7,r7,r8,ror#22 @ Sigma0(a)
+#if 28>=15
+ ldr r1,[sp,#14*4] @ from BODY_16_xx
+#endif
orr r0,r8,r9
and r2,r8,r9
and r0,r0,r10
@@ -968,26 +1337,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r11,r11,r3
add r7,r7,r0
- ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#14*4] @ 29
ldr r12,[sp,#11*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#13*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#6*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#6*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r11,ror#6
- str r3,[sp,#13*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r11,ror#11
eor r2,r4,r5
+#if 29>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 29==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r11,ror#25 @ Sigma1(e)
and r2,r2,r11
+ str r3,[sp,#13*4]
add r3,r3,r0
eor r2,r2,r5 @ Ch(e,f,g)
add r3,r3,r6
@@ -996,6 +1373,9 @@ sha256_block_data_order:
eor r6,r6,r7,ror#13
add r3,r3,r12
eor r6,r6,r7,ror#22 @ Sigma0(a)
+#if 29>=15
+ ldr r1,[sp,#15*4] @ from BODY_16_xx
+#endif
orr r0,r7,r8
and r2,r7,r8
and r0,r0,r9
@@ -1003,26 +1383,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r10,r10,r3
add r6,r6,r0
- ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#15*4] @ 30
ldr r12,[sp,#12*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#14*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#7*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#7*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r10,ror#6
- str r3,[sp,#14*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r10,ror#11
eor r2,r11,r4
+#if 30>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 30==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r10,ror#25 @ Sigma1(e)
and r2,r2,r10
+ str r3,[sp,#14*4]
add r3,r3,r0
eor r2,r2,r4 @ Ch(e,f,g)
add r3,r3,r5
@@ -1031,6 +1419,9 @@ sha256_block_data_order:
eor r5,r5,r6,ror#13
add r3,r3,r12
eor r5,r5,r6,ror#22 @ Sigma0(a)
+#if 30>=15
+ ldr r1,[sp,#0*4] @ from BODY_16_xx
+#endif
orr r0,r6,r7
and r2,r6,r7
and r0,r0,r8
@@ -1038,26 +1429,34 @@ sha256_block_data_order:
orr r0,r0,r2 @ Maj(a,b,c)
add r9,r9,r3
add r5,r5,r0
- ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#0*4] @ 31
ldr r12,[sp,#13*4]
+ mov r0,r1,ror#7
ldr r3,[sp,#15*4]
- mov r0,r2,ror#7
- ldr r1,[sp,#8*4]
- eor r0,r0,r2,ror#18
- eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
- mov r2,r12,ror#17
+ eor r0,r0,r1,ror#18
+ ldr r2,[sp,#8*4]
+ eor r0,r0,r1,lsr#3 @ sigma0(X[i+1])
+ mov r1,r12,ror#17
add r3,r3,r0
- eor r2,r2,r12,ror#19
- add r3,r3,r1
- eor r2,r2,r12,lsr#10 @ sigma1(X[i+14])
+ eor r1,r1,r12,ror#19
add r3,r3,r2
- ldr r12,[r14],#4 @ *K256++
+ eor r1,r1,r12,lsr#10 @ sigma1(X[i+14])
+ @ add r3,r3,r1
mov r0,r9,ror#6
- str r3,[sp,#15*4]
+ ldr r12,[r14],#4 @ *K256++
eor r0,r0,r9,ror#11
eor r2,r10,r11
+#if 31>=16
+ add r3,r3,r1 @ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+ rev r3,r3
+#endif
+#if 31==15
+ str r1,[sp,#17*4] @ leave room for r1
+#endif
eor r0,r0,r9,ror#25 @ Sigma1(e)
and r2,r2,r9
+ str r3,[sp,#15*4]
add r3,r3,r0
eor r2,r2,r11 @ Ch(e,f,g)
add r3,r3,r4
@@ -1066,6 +1465,9 @@ sha256_block_data_order:
eor r4,r4,r5,ror#13
add r3,r3,r12
eor r4,r4,r5,ror#22 @ Sigma0(a)
+#if 31>=15
+ ldr r1,[sp,#1*4] @ from BODY_16_xx
+#endif
orr r0,r5,r6
and r2,r5,r6
and r0,r0,r7
@@ -1102,10 +1504,14 @@ sha256_block_data_order:
bne .Loop
add sp,sp,#19*4 @ destroy frame
- ldmia sp!,{r4-r12,lr}
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
.size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align 2
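
The unrolled rounds above all share one pattern. The Sigma1(e), Ch(e,f,g), Sigma0(a) and Maj(a,b,c) comments name the standard FIPS 180-4 SHA-256 round functions; the rotate amounts, 6/11/25 for Sigma1 and 2/13/22 for Sigma0, are visible in the ror# operands. As a reading aid, here is a minimal C model of what each unrolled block computes. It is a sketch, not OpenSSL code; ror32 and sha256_round are hypothetical helpers, and the real assembly interleaves these steps across adjacent rounds for scheduling.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n)
{
        return (x >> n) | (x << (32 - n));
}

/* One SHA-256 compression round: state s[0..7] = a..h, round
 * constant Ki, schedule word Xi.  Mirrors the asm annotations. */
static void sha256_round(uint32_t s[8], uint32_t Ki, uint32_t Xi)
{
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); /* Sigma1(e) */
        uint32_t ch  = (e & f) ^ (~e & g);                        /* Ch(e,f,g) */
        uint32_t t1  = h + S1 + ch + Ki + Xi;
        uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); /* Sigma0(a) */
        /* the asm uses the equivalent ((a|b)&c)|(a&b) form of Maj */
        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);               /* Maj(a,b,c) */

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
}

The epilogue is also worth a word: on ARMv5 and later the function returns by popping pc directly, while the ARMv4 fallback pops lr and falls through to the literal .word 0xe12fff1e, which is the encoding of bx lr, keeping the routine interoperable with Thumb callers as the comment says.
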
diff --git a/main/openssl/crypto/sha/asm/sha256-mips.S b/main/openssl/crypto/sha/asm/sha256-mips.S
new file mode 100644
index 00000000..2bd728e9
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha256-mips.S
@@ -0,0 +1,1998 @@
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+.set noat
+#if !defined(__vxworks) || defined(__pic__)
+.option pic2
+#endif
+
+.align 5
+.globl sha256_block_data_order
+.ent sha256_block_data_order
+sha256_block_data_order:
+ .frame $29,128,$31
+ .mask 3237937152,-4
+ .set noreorder
+ .cpload $25
+ sub $29,128
+ sw $31,128-1*4($29)
+ sw $30,128-2*4($29)
+ sw $23,128-3*4($29)
+ sw $22,128-4*4($29)
+ sw $21,128-5*4($29)
+ sw $20,128-6*4($29)
+ sw $19,128-7*4($29)
+ sw $18,128-8*4($29)
+ sw $17,128-9*4($29)
+ sw $16,128-10*4($29)
+ sll $23,$6,6
+ .set reorder
+ la $6,K256 # PIC-ified 'load address'
+
+ lw $1,0*4($4) # load context
+ lw $2,1*4($4)
+ lw $3,2*4($4)
+ lw $7,3*4($4)
+ lw $24,4*4($4)
+ lw $25,5*4($4)
+ lw $30,6*4($4)
+ lw $31,7*4($4)
+
+ add $23,$5 # pointer to the end of input
+ sw $23,16*4($29)
+ b .Loop
+
+.align 5
+.Loop:
+ lwl $8,3($5)
+ lwr $8,0($5)
+ lwl $9,7($5)
+ lwr $9,4($5)
+ srl $13,$8,24 # byte swap(0)
+ srl $14,$8,8
+ andi $15,$8,0xFF00
+ sll $8,$8,24
+ andi $14,0xFF00
+ sll $15,$15,8
+ or $8,$13
+ or $14,$15
+ or $8,$14
+ addu $12,$8,$31 # 0
+ srl $31,$24,6
+ xor $15,$25,$30
+ sll $14,$24,7
+ and $15,$24
+ srl $13,$24,11
+ xor $31,$14
+ sll $14,$24,21
+ xor $31,$13
+ srl $13,$24,25
+ xor $31,$14
+ sll $14,$24,26
+ xor $31,$13
+ xor $15,$30 # Ch(e,f,g)
+ xor $13,$14,$31 # Sigma1(e)
+
+ srl $31,$1,2
+ addu $12,$15
+ lw $15,0($6) # K[0]
+ sll $14,$1,10
+ addu $12,$13
+ srl $13,$1,13
+ xor $31,$14
+ sll $14,$1,19
+ xor $31,$13
+ srl $13,$1,22
+ xor $31,$14
+ sll $14,$1,30
+ xor $31,$13
+ sw $8,0($29) # offload to ring buffer
+ xor $31,$14 # Sigma0(a)
+
+ or $13,$1,$2
+ and $14,$1,$2
+ and $13,$3
+ or $14,$13 # Maj(a,b,c)
+ addu $12,$15 # +=K[0]
+ addu $31,$14
+
+ addu $7,$12
+ addu $31,$12
+ lwl $10,11($5)
+ lwr $10,8($5)
+ srl $14,$9,24 # byte swap(1)
+ srl $15,$9,8
+ andi $16,$9,0xFF00
+ sll $9,$9,24
+ andi $15,0xFF00
+ sll $16,$16,8
+ or $9,$14
+ or $15,$16
+ or $9,$15
+ addu $13,$9,$30 # 1
+ srl $30,$7,6
+ xor $16,$24,$25
+ sll $15,$7,7
+ and $16,$7
+ srl $14,$7,11
+ xor $30,$15
+ sll $15,$7,21
+ xor $30,$14
+ srl $14,$7,25
+ xor $30,$15
+ sll $15,$7,26
+ xor $30,$14
+ xor $16,$25 # Ch(e,f,g)
+ xor $14,$15,$30 # Sigma1(e)
+
+ srl $30,$31,2
+ addu $13,$16
+ lw $16,4($6) # K[1]
+ sll $15,$31,10
+ addu $13,$14
+ srl $14,$31,13
+ xor $30,$15
+ sll $15,$31,19
+ xor $30,$14
+ srl $14,$31,22
+ xor $30,$15
+ sll $15,$31,30
+ xor $30,$14
+ sw $9,4($29) # offload to ring buffer
+ xor $30,$15 # Sigma0(a)
+
+ or $14,$31,$1
+ and $15,$31,$1
+ and $14,$2
+ or $15,$14 # Maj(a,b,c)
+ addu $13,$16 # +=K[1]
+ addu $30,$15
+
+ addu $3,$13
+ addu $30,$13
+ lwl $11,15($5)
+ lwr $11,12($5)
+ srl $15,$10,24 # byte swap(2)
+ srl $16,$10,8
+ andi $17,$10,0xFF00
+ sll $10,$10,24
+ andi $16,0xFF00
+ sll $17,$17,8
+ or $10,$15
+ or $16,$17
+ or $10,$16
+ addu $14,$10,$25 # 2
+ srl $25,$3,6
+ xor $17,$7,$24
+ sll $16,$3,7
+ and $17,$3
+ srl $15,$3,11
+ xor $25,$16
+ sll $16,$3,21
+ xor $25,$15
+ srl $15,$3,25
+ xor $25,$16
+ sll $16,$3,26
+ xor $25,$15
+ xor $17,$24 # Ch(e,f,g)
+ xor $15,$16,$25 # Sigma1(e)
+
+ srl $25,$30,2
+ addu $14,$17
+ lw $17,8($6) # K[2]
+ sll $16,$30,10
+ addu $14,$15
+ srl $15,$30,13
+ xor $25,$16
+ sll $16,$30,19
+ xor $25,$15
+ srl $15,$30,22
+ xor $25,$16
+ sll $16,$30,30
+ xor $25,$15
+ sw $10,8($29) # offload to ring buffer
+ xor $25,$16 # Sigma0(a)
+
+ or $15,$30,$31
+ and $16,$30,$31
+ and $15,$1
+ or $16,$15 # Maj(a,b,c)
+ addu $14,$17 # +=K[2]
+ addu $25,$16
+
+ addu $2,$14
+ addu $25,$14
+ lwl $12,19($5)
+ lwr $12,16($5)
+ srl $16,$11,24 # byte swap(3)
+ srl $17,$11,8
+ andi $18,$11,0xFF00
+ sll $11,$11,24
+ andi $17,0xFF00
+ sll $18,$18,8
+ or $11,$16
+ or $17,$18
+ or $11,$17
+ addu $15,$11,$24 # 3
+ srl $24,$2,6
+ xor $18,$3,$7
+ sll $17,$2,7
+ and $18,$2
+ srl $16,$2,11
+ xor $24,$17
+ sll $17,$2,21
+ xor $24,$16
+ srl $16,$2,25
+ xor $24,$17
+ sll $17,$2,26
+ xor $24,$16
+ xor $18,$7 # Ch(e,f,g)
+ xor $16,$17,$24 # Sigma1(e)
+
+ srl $24,$25,2
+ addu $15,$18
+ lw $18,12($6) # K[3]
+ sll $17,$25,10
+ addu $15,$16
+ srl $16,$25,13
+ xor $24,$17
+ sll $17,$25,19
+ xor $24,$16
+ srl $16,$25,22
+ xor $24,$17
+ sll $17,$25,30
+ xor $24,$16
+ sw $11,12($29) # offload to ring buffer
+ xor $24,$17 # Sigma0(a)
+
+ or $16,$25,$30
+ and $17,$25,$30
+ and $16,$31
+ or $17,$16 # Maj(a,b,c)
+ addu $15,$18 # +=K[3]
+ addu $24,$17
+
+ addu $1,$15
+ addu $24,$15
+ lwl $13,23($5)
+ lwr $13,20($5)
+ srl $17,$12,24 # byte swap(4)
+ srl $18,$12,8
+ andi $19,$12,0xFF00
+ sll $12,$12,24
+ andi $18,0xFF00
+ sll $19,$19,8
+ or $12,$17
+ or $18,$19
+ or $12,$18
+ addu $16,$12,$7 # 4
+ srl $7,$1,6
+ xor $19,$2,$3
+ sll $18,$1,7
+ and $19,$1
+ srl $17,$1,11
+ xor $7,$18
+ sll $18,$1,21
+ xor $7,$17
+ srl $17,$1,25
+ xor $7,$18
+ sll $18,$1,26
+ xor $7,$17
+ xor $19,$3 # Ch(e,f,g)
+ xor $17,$18,$7 # Sigma1(e)
+
+ srl $7,$24,2
+ addu $16,$19
+ lw $19,16($6) # K[4]
+ sll $18,$24,10
+ addu $16,$17
+ srl $17,$24,13
+ xor $7,$18
+ sll $18,$24,19
+ xor $7,$17
+ srl $17,$24,22
+ xor $7,$18
+ sll $18,$24,30
+ xor $7,$17
+ sw $12,16($29) # offload to ring buffer
+ xor $7,$18 # Sigma0(a)
+
+ or $17,$24,$25
+ and $18,$24,$25
+ and $17,$30
+ or $18,$17 # Maj(a,b,c)
+ addu $16,$19 # +=K[4]
+ addu $7,$18
+
+ addu $31,$16
+ addu $7,$16
+ lwl $14,27($5)
+ lwr $14,24($5)
+ srl $18,$13,24 # byte swap(5)
+ srl $19,$13,8
+ andi $20,$13,0xFF00
+ sll $13,$13,24
+ andi $19,0xFF00
+ sll $20,$20,8
+ or $13,$18
+ or $19,$20
+ or $13,$19
+ addu $17,$13,$3 # 5
+ srl $3,$31,6
+ xor $20,$1,$2
+ sll $19,$31,7
+ and $20,$31
+ srl $18,$31,11
+ xor $3,$19
+ sll $19,$31,21
+ xor $3,$18
+ srl $18,$31,25
+ xor $3,$19
+ sll $19,$31,26
+ xor $3,$18
+ xor $20,$2 # Ch(e,f,g)
+ xor $18,$19,$3 # Sigma1(e)
+
+ srl $3,$7,2
+ addu $17,$20
+ lw $20,20($6) # K[5]
+ sll $19,$7,10
+ addu $17,$18
+ srl $18,$7,13
+ xor $3,$19
+ sll $19,$7,19
+ xor $3,$18
+ srl $18,$7,22
+ xor $3,$19
+ sll $19,$7,30
+ xor $3,$18
+ sw $13,20($29) # offload to ring buffer
+ xor $3,$19 # Sigma0(a)
+
+ or $18,$7,$24
+ and $19,$7,$24
+ and $18,$25
+ or $19,$18 # Maj(a,b,c)
+ addu $17,$20 # +=K[5]
+ addu $3,$19
+
+ addu $30,$17
+ addu $3,$17
+ lwl $15,31($5)
+ lwr $15,28($5)
+ srl $19,$14,24 # byte swap(6)
+ srl $20,$14,8
+ andi $21,$14,0xFF00
+ sll $14,$14,24
+ andi $20,0xFF00
+ sll $21,$21,8
+ or $14,$19
+ or $20,$21
+ or $14,$20
+ addu $18,$14,$2 # 6
+ srl $2,$30,6
+ xor $21,$31,$1
+ sll $20,$30,7
+ and $21,$30
+ srl $19,$30,11
+ xor $2,$20
+ sll $20,$30,21
+ xor $2,$19
+ srl $19,$30,25
+ xor $2,$20
+ sll $20,$30,26
+ xor $2,$19
+ xor $21,$1 # Ch(e,f,g)
+ xor $19,$20,$2 # Sigma1(e)
+
+ srl $2,$3,2
+ addu $18,$21
+ lw $21,24($6) # K[6]
+ sll $20,$3,10
+ addu $18,$19
+ srl $19,$3,13
+ xor $2,$20
+ sll $20,$3,19
+ xor $2,$19
+ srl $19,$3,22
+ xor $2,$20
+ sll $20,$3,30
+ xor $2,$19
+ sw $14,24($29) # offload to ring buffer
+ xor $2,$20 # Sigma0(a)
+
+ or $19,$3,$7
+ and $20,$3,$7
+ and $19,$24
+ or $20,$19 # Maj(a,b,c)
+ addu $18,$21 # +=K[6]
+ addu $2,$20
+
+ addu $25,$18
+ addu $2,$18
+ lwl $16,35($5)
+ lwr $16,32($5)
+ srl $20,$15,24 # byte swap(7)
+ srl $21,$15,8
+ andi $22,$15,0xFF00
+ sll $15,$15,24
+ andi $21,0xFF00
+ sll $22,$22,8
+ or $15,$20
+ or $21,$22
+ or $15,$21
+ addu $19,$15,$1 # 7
+ srl $1,$25,6
+ xor $22,$30,$31
+ sll $21,$25,7
+ and $22,$25
+ srl $20,$25,11
+ xor $1,$21
+ sll $21,$25,21
+ xor $1,$20
+ srl $20,$25,25
+ xor $1,$21
+ sll $21,$25,26
+ xor $1,$20
+ xor $22,$31 # Ch(e,f,g)
+ xor $20,$21,$1 # Sigma1(e)
+
+ srl $1,$2,2
+ addu $19,$22
+ lw $22,28($6) # K[7]
+ sll $21,$2,10
+ addu $19,$20
+ srl $20,$2,13
+ xor $1,$21
+ sll $21,$2,19
+ xor $1,$20
+ srl $20,$2,22
+ xor $1,$21
+ sll $21,$2,30
+ xor $1,$20
+ sw $15,28($29) # offload to ring buffer
+ xor $1,$21 # Sigma0(a)
+
+ or $20,$2,$3
+ and $21,$2,$3
+ and $20,$7
+ or $21,$20 # Maj(a,b,c)
+ addu $19,$22 # +=K[7]
+ addu $1,$21
+
+ addu $24,$19
+ addu $1,$19
+ lwl $17,39($5)
+ lwr $17,36($5)
+ srl $21,$16,24 # byte swap(8)
+ srl $22,$16,8
+ andi $23,$16,0xFF00
+ sll $16,$16,24
+ andi $22,0xFF00
+ sll $23,$23,8
+ or $16,$21
+ or $22,$23
+ or $16,$22
+ addu $20,$16,$31 # 8
+ srl $31,$24,6
+ xor $23,$25,$30
+ sll $22,$24,7
+ and $23,$24
+ srl $21,$24,11
+ xor $31,$22
+ sll $22,$24,21
+ xor $31,$21
+ srl $21,$24,25
+ xor $31,$22
+ sll $22,$24,26
+ xor $31,$21
+ xor $23,$30 # Ch(e,f,g)
+ xor $21,$22,$31 # Sigma1(e)
+
+ srl $31,$1,2
+ addu $20,$23
+ lw $23,32($6) # K[8]
+ sll $22,$1,10
+ addu $20,$21
+ srl $21,$1,13
+ xor $31,$22
+ sll $22,$1,19
+ xor $31,$21
+ srl $21,$1,22
+ xor $31,$22
+ sll $22,$1,30
+ xor $31,$21
+ sw $16,32($29) # offload to ring buffer
+ xor $31,$22 # Sigma0(a)
+
+ or $21,$1,$2
+ and $22,$1,$2
+ and $21,$3
+ or $22,$21 # Maj(a,b,c)
+ addu $20,$23 # +=K[8]
+ addu $31,$22
+
+ addu $7,$20
+ addu $31,$20
+ lwl $18,43($5)
+ lwr $18,40($5)
+ srl $22,$17,24 # byte swap(9)
+ srl $23,$17,8
+ andi $8,$17,0xFF00
+ sll $17,$17,24
+ andi $23,0xFF00
+ sll $8,$8,8
+ or $17,$22
+ or $23,$8
+ or $17,$23
+ addu $21,$17,$30 # 9
+ srl $30,$7,6
+ xor $8,$24,$25
+ sll $23,$7,7
+ and $8,$7
+ srl $22,$7,11
+ xor $30,$23
+ sll $23,$7,21
+ xor $30,$22
+ srl $22,$7,25
+ xor $30,$23
+ sll $23,$7,26
+ xor $30,$22
+ xor $8,$25 # Ch(e,f,g)
+ xor $22,$23,$30 # Sigma1(e)
+
+ srl $30,$31,2
+ addu $21,$8
+ lw $8,36($6) # K[9]
+ sll $23,$31,10
+ addu $21,$22
+ srl $22,$31,13
+ xor $30,$23
+ sll $23,$31,19
+ xor $30,$22
+ srl $22,$31,22
+ xor $30,$23
+ sll $23,$31,30
+ xor $30,$22
+ sw $17,36($29) # offload to ring buffer
+ xor $30,$23 # Sigma0(a)
+
+ or $22,$31,$1
+ and $23,$31,$1
+ and $22,$2
+ or $23,$22 # Maj(a,b,c)
+ addu $21,$8 # +=K[9]
+ addu $30,$23
+
+ addu $3,$21
+ addu $30,$21
+ lwl $19,47($5)
+ lwr $19,44($5)
+ srl $23,$18,24 # byte swap(10)
+ srl $8,$18,8
+ andi $9,$18,0xFF00
+ sll $18,$18,24
+ andi $8,0xFF00
+ sll $9,$9,8
+ or $18,$23
+ or $8,$9
+ or $18,$8
+ addu $22,$18,$25 # 10
+ srl $25,$3,6
+ xor $9,$7,$24
+ sll $8,$3,7
+ and $9,$3
+ srl $23,$3,11
+ xor $25,$8
+ sll $8,$3,21
+ xor $25,$23
+ srl $23,$3,25
+ xor $25,$8
+ sll $8,$3,26
+ xor $25,$23
+ xor $9,$24 # Ch(e,f,g)
+ xor $23,$8,$25 # Sigma1(e)
+
+ srl $25,$30,2
+ addu $22,$9
+ lw $9,40($6) # K[10]
+ sll $8,$30,10
+ addu $22,$23
+ srl $23,$30,13
+ xor $25,$8
+ sll $8,$30,19
+ xor $25,$23
+ srl $23,$30,22
+ xor $25,$8
+ sll $8,$30,30
+ xor $25,$23
+ sw $18,40($29) # offload to ring buffer
+ xor $25,$8 # Sigma0(a)
+
+ or $23,$30,$31
+ and $8,$30,$31
+ and $23,$1
+ or $8,$23 # Maj(a,b,c)
+ addu $22,$9 # +=K[10]
+ addu $25,$8
+
+ addu $2,$22
+ addu $25,$22
+ lwl $20,51($5)
+ lwr $20,48($5)
+ srl $8,$19,24 # byte swap(11)
+ srl $9,$19,8
+ andi $10,$19,0xFF00
+ sll $19,$19,24
+ andi $9,0xFF00
+ sll $10,$10,8
+ or $19,$8
+ or $9,$10
+ or $19,$9
+ addu $23,$19,$24 # 11
+ srl $24,$2,6
+ xor $10,$3,$7
+ sll $9,$2,7
+ and $10,$2
+ srl $8,$2,11
+ xor $24,$9
+ sll $9,$2,21
+ xor $24,$8
+ srl $8,$2,25
+ xor $24,$9
+ sll $9,$2,26
+ xor $24,$8
+ xor $10,$7 # Ch(e,f,g)
+ xor $8,$9,$24 # Sigma1(e)
+
+ srl $24,$25,2
+ addu $23,$10
+ lw $10,44($6) # K[11]
+ sll $9,$25,10
+ addu $23,$8
+ srl $8,$25,13
+ xor $24,$9
+ sll $9,$25,19
+ xor $24,$8
+ srl $8,$25,22
+ xor $24,$9
+ sll $9,$25,30
+ xor $24,$8
+ sw $19,44($29) # offload to ring buffer
+ xor $24,$9 # Sigma0(a)
+
+ or $8,$25,$30
+ and $9,$25,$30
+ and $8,$31
+ or $9,$8 # Maj(a,b,c)
+ addu $23,$10 # +=K[11]
+ addu $24,$9
+
+ addu $1,$23
+ addu $24,$23
+ lwl $21,55($5)
+ lwr $21,52($5)
+ srl $9,$20,24 # byte swap(12)
+ srl $10,$20,8
+ andi $11,$20,0xFF00
+ sll $20,$20,24
+ andi $10,0xFF00
+ sll $11,$11,8
+ or $20,$9
+ or $10,$11
+ or $20,$10
+ addu $8,$20,$7 # 12
+ srl $7,$1,6
+ xor $11,$2,$3
+ sll $10,$1,7
+ and $11,$1
+ srl $9,$1,11
+ xor $7,$10
+ sll $10,$1,21
+ xor $7,$9
+ srl $9,$1,25
+ xor $7,$10
+ sll $10,$1,26
+ xor $7,$9
+ xor $11,$3 # Ch(e,f,g)
+ xor $9,$10,$7 # Sigma1(e)
+
+ srl $7,$24,2
+ addu $8,$11
+ lw $11,48($6) # K[12]
+ sll $10,$24,10
+ addu $8,$9
+ srl $9,$24,13
+ xor $7,$10
+ sll $10,$24,19
+ xor $7,$9
+ srl $9,$24,22
+ xor $7,$10
+ sll $10,$24,30
+ xor $7,$9
+ sw $20,48($29) # offload to ring buffer
+ xor $7,$10 # Sigma0(a)
+
+ or $9,$24,$25
+ and $10,$24,$25
+ and $9,$30
+ or $10,$9 # Maj(a,b,c)
+ addu $8,$11 # +=K[12]
+ addu $7,$10
+
+ addu $31,$8
+ addu $7,$8
+ lwl $22,59($5)
+ lwr $22,56($5)
+ srl $10,$21,24 # byte swap(13)
+ srl $11,$21,8
+ andi $12,$21,0xFF00
+ sll $21,$21,24
+ andi $11,0xFF00
+ sll $12,$12,8
+ or $21,$10
+ or $11,$12
+ or $21,$11
+ addu $9,$21,$3 # 13
+ srl $3,$31,6
+ xor $12,$1,$2
+ sll $11,$31,7
+ and $12,$31
+ srl $10,$31,11
+ xor $3,$11
+ sll $11,$31,21
+ xor $3,$10
+ srl $10,$31,25
+ xor $3,$11
+ sll $11,$31,26
+ xor $3,$10
+ xor $12,$2 # Ch(e,f,g)
+ xor $10,$11,$3 # Sigma1(e)
+
+ srl $3,$7,2
+ addu $9,$12
+ lw $12,52($6) # K[13]
+ sll $11,$7,10
+ addu $9,$10
+ srl $10,$7,13
+ xor $3,$11
+ sll $11,$7,19
+ xor $3,$10
+ srl $10,$7,22
+ xor $3,$11
+ sll $11,$7,30
+ xor $3,$10
+ sw $21,52($29) # offload to ring buffer
+ xor $3,$11 # Sigma0(a)
+
+ or $10,$7,$24
+ and $11,$7,$24
+ and $10,$25
+ or $11,$10 # Maj(a,b,c)
+ addu $9,$12 # +=K[13]
+ addu $3,$11
+
+ addu $30,$9
+ addu $3,$9
+ lw $8,0($29) # prefetch from ring buffer
+ lwl $23,63($5)
+ lwr $23,60($5)
+ srl $11,$22,24 # byte swap(14)
+ srl $12,$22,8
+ andi $13,$22,0xFF00
+ sll $22,$22,24
+ andi $12,0xFF00
+ sll $13,$13,8
+ or $22,$11
+ or $12,$13
+ or $22,$12
+ addu $10,$22,$2 # 14
+ srl $2,$30,6
+ xor $13,$31,$1
+ sll $12,$30,7
+ and $13,$30
+ srl $11,$30,11
+ xor $2,$12
+ sll $12,$30,21
+ xor $2,$11
+ srl $11,$30,25
+ xor $2,$12
+ sll $12,$30,26
+ xor $2,$11
+ xor $13,$1 # Ch(e,f,g)
+ xor $11,$12,$2 # Sigma1(e)
+
+ srl $2,$3,2
+ addu $10,$13
+ lw $13,56($6) # K[14]
+ sll $12,$3,10
+ addu $10,$11
+ srl $11,$3,13
+ xor $2,$12
+ sll $12,$3,19
+ xor $2,$11
+ srl $11,$3,22
+ xor $2,$12
+ sll $12,$3,30
+ xor $2,$11
+ sw $22,56($29) # offload to ring buffer
+ xor $2,$12 # Sigma0(a)
+
+ or $11,$3,$7
+ and $12,$3,$7
+ and $11,$24
+ or $12,$11 # Maj(a,b,c)
+ addu $10,$13 # +=K[14]
+ addu $2,$12
+
+ addu $25,$10
+ addu $2,$10
+ lw $9,4($29) # prefetch from ring buffer
+ srl $12,$23,24 # byte swap(15)
+ srl $13,$23,8
+ andi $14,$23,0xFF00
+ sll $23,$23,24
+ andi $13,0xFF00
+ sll $14,$14,8
+ or $23,$12
+ or $13,$14
+ or $23,$13
+ addu $11,$23,$1 # 15
+ srl $1,$25,6
+ xor $14,$30,$31
+ sll $13,$25,7
+ and $14,$25
+ srl $12,$25,11
+ xor $1,$13
+ sll $13,$25,21
+ xor $1,$12
+ srl $12,$25,25
+ xor $1,$13
+ sll $13,$25,26
+ xor $1,$12
+ xor $14,$31 # Ch(e,f,g)
+ xor $12,$13,$1 # Sigma1(e)
+
+ srl $1,$2,2
+ addu $11,$14
+ lw $14,60($6) # K[15]
+ sll $13,$2,10
+ addu $11,$12
+ srl $12,$2,13
+ xor $1,$13
+ sll $13,$2,19
+ xor $1,$12
+ srl $12,$2,22
+ xor $1,$13
+ sll $13,$2,30
+ xor $1,$12
+ sw $23,60($29) # offload to ring buffer
+ xor $1,$13 # Sigma0(a)
+
+ or $12,$2,$3
+ and $13,$2,$3
+ and $12,$7
+ or $13,$12 # Maj(a,b,c)
+ addu $11,$14 # +=K[15]
+ addu $1,$13
+
+ addu $24,$11
+ addu $1,$11
+ lw $10,8($29) # prefetch from ring buffer
+ b .L16_xx
+.align 4
+.L16_xx:
+ srl $14,$9,3 # Xupdate(16)
+ addu $8,$17 # +=X[i+9]
+ sll $13,$9,14
+ srl $12,$9,7
+ xor $14,$13
+ sll $13,11
+ xor $14,$12
+ srl $12,$9,18
+ xor $14,$13
+
+ srl $15,$22,10
+ xor $14,$12 # sigma0(X[i+1])
+ sll $13,$22,13
+ addu $8,$14
+ srl $12,$22,17
+ xor $15,$13
+ sll $13,2
+ xor $15,$12
+ srl $12,$22,19
+ xor $15,$13
+
+ xor $15,$12 # sigma1(X[i+14])
+ addu $8,$15
+ addu $12,$8,$31 # 16
+ srl $31,$24,6
+ xor $15,$25,$30
+ sll $14,$24,7
+ and $15,$24
+ srl $13,$24,11
+ xor $31,$14
+ sll $14,$24,21
+ xor $31,$13
+ srl $13,$24,25
+ xor $31,$14
+ sll $14,$24,26
+ xor $31,$13
+ xor $15,$30 # Ch(e,f,g)
+ xor $13,$14,$31 # Sigma1(e)
+
+ srl $31,$1,2
+ addu $12,$15
+ lw $15,64($6) # K[16]
+ sll $14,$1,10
+ addu $12,$13
+ srl $13,$1,13
+ xor $31,$14
+ sll $14,$1,19
+ xor $31,$13
+ srl $13,$1,22
+ xor $31,$14
+ sll $14,$1,30
+ xor $31,$13
+ sw $8,0($29) # offload to ring buffer
+ xor $31,$14 # Sigma0(a)
+
+ or $13,$1,$2
+ and $14,$1,$2
+ and $13,$3
+ or $14,$13 # Maj(a,b,c)
+ addu $12,$15 # +=K[16]
+ addu $31,$14
+
+ addu $7,$12
+ addu $31,$12
+ lw $11,12($29) # prefetch from ring buffer
+ srl $15,$10,3 # Xupdate(17)
+ addu $9,$18 # +=X[i+9]
+ sll $14,$10,14
+ srl $13,$10,7
+ xor $15,$14
+ sll $14,11
+ xor $15,$13
+ srl $13,$10,18
+ xor $15,$14
+
+ srl $16,$23,10
+ xor $15,$13 # sigma0(X[i+1])
+ sll $14,$23,13
+ addu $9,$15
+ srl $13,$23,17
+ xor $16,$14
+ sll $14,2
+ xor $16,$13
+ srl $13,$23,19
+ xor $16,$14
+
+ xor $16,$13 # sigma1(X[i+14])
+ addu $9,$16
+ addu $13,$9,$30 # 17
+ srl $30,$7,6
+ xor $16,$24,$25
+ sll $15,$7,7
+ and $16,$7
+ srl $14,$7,11
+ xor $30,$15
+ sll $15,$7,21
+ xor $30,$14
+ srl $14,$7,25
+ xor $30,$15
+ sll $15,$7,26
+ xor $30,$14
+ xor $16,$25 # Ch(e,f,g)
+ xor $14,$15,$30 # Sigma1(e)
+
+ srl $30,$31,2
+ addu $13,$16
+ lw $16,68($6) # K[17]
+ sll $15,$31,10
+ addu $13,$14
+ srl $14,$31,13
+ xor $30,$15
+ sll $15,$31,19
+ xor $30,$14
+ srl $14,$31,22
+ xor $30,$15
+ sll $15,$31,30
+ xor $30,$14
+ sw $9,4($29) # offload to ring buffer
+ xor $30,$15 # Sigma0(a)
+
+ or $14,$31,$1
+ and $15,$31,$1
+ and $14,$2
+ or $15,$14 # Maj(a,b,c)
+ addu $13,$16 # +=K[17]
+ addu $30,$15
+
+ addu $3,$13
+ addu $30,$13
+ lw $12,16($29) # prefetch from ring buffer
+ srl $16,$11,3 # Xupdate(18)
+ addu $10,$19 # +=X[i+9]
+ sll $15,$11,14
+ srl $14,$11,7
+ xor $16,$15
+ sll $15,11
+ xor $16,$14
+ srl $14,$11,18
+ xor $16,$15
+
+ srl $17,$8,10
+ xor $16,$14 # sigma0(X[i+1])
+ sll $15,$8,13
+ addu $10,$16
+ srl $14,$8,17
+ xor $17,$15
+ sll $15,2
+ xor $17,$14
+ srl $14,$8,19
+ xor $17,$15
+
+ xor $17,$14 # sigma1(X[i+14])
+ addu $10,$17
+ addu $14,$10,$25 # 18
+ srl $25,$3,6
+ xor $17,$7,$24
+ sll $16,$3,7
+ and $17,$3
+ srl $15,$3,11
+ xor $25,$16
+ sll $16,$3,21
+ xor $25,$15
+ srl $15,$3,25
+ xor $25,$16
+ sll $16,$3,26
+ xor $25,$15
+ xor $17,$24 # Ch(e,f,g)
+ xor $15,$16,$25 # Sigma1(e)
+
+ srl $25,$30,2
+ addu $14,$17
+ lw $17,72($6) # K[18]
+ sll $16,$30,10
+ addu $14,$15
+ srl $15,$30,13
+ xor $25,$16
+ sll $16,$30,19
+ xor $25,$15
+ srl $15,$30,22
+ xor $25,$16
+ sll $16,$30,30
+ xor $25,$15
+ sw $10,8($29) # offload to ring buffer
+ xor $25,$16 # Sigma0(a)
+
+ or $15,$30,$31
+ and $16,$30,$31
+ and $15,$1
+ or $16,$15 # Maj(a,b,c)
+ addu $14,$17 # +=K[18]
+ addu $25,$16
+
+ addu $2,$14
+ addu $25,$14
+ lw $13,20($29) # prefetch from ring buffer
+ srl $17,$12,3 # Xupdate(19)
+ addu $11,$20 # +=X[i+9]
+ sll $16,$12,14
+ srl $15,$12,7
+ xor $17,$16
+ sll $16,11
+ xor $17,$15
+ srl $15,$12,18
+ xor $17,$16
+
+ srl $18,$9,10
+ xor $17,$15 # sigma0(X[i+1])
+ sll $16,$9,13
+ addu $11,$17
+ srl $15,$9,17
+ xor $18,$16
+ sll $16,2
+ xor $18,$15
+ srl $15,$9,19
+ xor $18,$16
+
+ xor $18,$15 # sigma1(X[i+14])
+ addu $11,$18
+ addu $15,$11,$24 # 19
+ srl $24,$2,6
+ xor $18,$3,$7
+ sll $17,$2,7
+ and $18,$2
+ srl $16,$2,11
+ xor $24,$17
+ sll $17,$2,21
+ xor $24,$16
+ srl $16,$2,25
+ xor $24,$17
+ sll $17,$2,26
+ xor $24,$16
+ xor $18,$7 # Ch(e,f,g)
+ xor $16,$17,$24 # Sigma1(e)
+
+ srl $24,$25,2
+ addu $15,$18
+ lw $18,76($6) # K[19]
+ sll $17,$25,10
+ addu $15,$16
+ srl $16,$25,13
+ xor $24,$17
+ sll $17,$25,19
+ xor $24,$16
+ srl $16,$25,22
+ xor $24,$17
+ sll $17,$25,30
+ xor $24,$16
+ sw $11,12($29) # offload to ring buffer
+ xor $24,$17 # Sigma0(a)
+
+ or $16,$25,$30
+ and $17,$25,$30
+ and $16,$31
+ or $17,$16 # Maj(a,b,c)
+ addu $15,$18 # +=K[19]
+ addu $24,$17
+
+ addu $1,$15
+ addu $24,$15
+ lw $14,24($29) # prefetch from ring buffer
+ srl $18,$13,3 # Xupdate(20)
+ addu $12,$21 # +=X[i+9]
+ sll $17,$13,14
+ srl $16,$13,7
+ xor $18,$17
+ sll $17,11
+ xor $18,$16
+ srl $16,$13,18
+ xor $18,$17
+
+ srl $19,$10,10
+ xor $18,$16 # sigma0(X[i+1])
+ sll $17,$10,13
+ addu $12,$18
+ srl $16,$10,17
+ xor $19,$17
+ sll $17,2
+ xor $19,$16
+ srl $16,$10,19
+ xor $19,$17
+
+ xor $19,$16 # sigma1(X[i+14])
+ addu $12,$19
+ addu $16,$12,$7 # 20
+ srl $7,$1,6
+ xor $19,$2,$3
+ sll $18,$1,7
+ and $19,$1
+ srl $17,$1,11
+ xor $7,$18
+ sll $18,$1,21
+ xor $7,$17
+ srl $17,$1,25
+ xor $7,$18
+ sll $18,$1,26
+ xor $7,$17
+ xor $19,$3 # Ch(e,f,g)
+ xor $17,$18,$7 # Sigma1(e)
+
+ srl $7,$24,2
+ addu $16,$19
+ lw $19,80($6) # K[20]
+ sll $18,$24,10
+ addu $16,$17
+ srl $17,$24,13
+ xor $7,$18
+ sll $18,$24,19
+ xor $7,$17
+ srl $17,$24,22
+ xor $7,$18
+ sll $18,$24,30
+ xor $7,$17
+ sw $12,16($29) # offload to ring buffer
+ xor $7,$18 # Sigma0(a)
+
+ or $17,$24,$25
+ and $18,$24,$25
+ and $17,$30
+ or $18,$17 # Maj(a,b,c)
+ addu $16,$19 # +=K[20]
+ addu $7,$18
+
+ addu $31,$16
+ addu $7,$16
+ lw $15,28($29) # prefetch from ring buffer
+ srl $19,$14,3 # Xupdate(21)
+ addu $13,$22 # +=X[i+9]
+ sll $18,$14,14
+ srl $17,$14,7
+ xor $19,$18
+ sll $18,11
+ xor $19,$17
+ srl $17,$14,18
+ xor $19,$18
+
+ srl $20,$11,10
+ xor $19,$17 # sigma0(X[i+1])
+ sll $18,$11,13
+ addu $13,$19
+ srl $17,$11,17
+ xor $20,$18
+ sll $18,2
+ xor $20,$17
+ srl $17,$11,19
+ xor $20,$18
+
+ xor $20,$17 # sigma1(X[i+14])
+ addu $13,$20
+ addu $17,$13,$3 # 21
+ srl $3,$31,6
+ xor $20,$1,$2
+ sll $19,$31,7
+ and $20,$31
+ srl $18,$31,11
+ xor $3,$19
+ sll $19,$31,21
+ xor $3,$18
+ srl $18,$31,25
+ xor $3,$19
+ sll $19,$31,26
+ xor $3,$18
+ xor $20,$2 # Ch(e,f,g)
+ xor $18,$19,$3 # Sigma1(e)
+
+ srl $3,$7,2
+ addu $17,$20
+ lw $20,84($6) # K[21]
+ sll $19,$7,10
+ addu $17,$18
+ srl $18,$7,13
+ xor $3,$19
+ sll $19,$7,19
+ xor $3,$18
+ srl $18,$7,22
+ xor $3,$19
+ sll $19,$7,30
+ xor $3,$18
+ sw $13,20($29) # offload to ring buffer
+ xor $3,$19 # Sigma0(a)
+
+ or $18,$7,$24
+ and $19,$7,$24
+ and $18,$25
+ or $19,$18 # Maj(a,b,c)
+ addu $17,$20 # +=K[21]
+ addu $3,$19
+
+ addu $30,$17
+ addu $3,$17
+ lw $16,32($29) # prefetch from ring buffer
+ srl $20,$15,3 # Xupdate(22)
+ addu $14,$23 # +=X[i+9]
+ sll $19,$15,14
+ srl $18,$15,7
+ xor $20,$19
+ sll $19,11
+ xor $20,$18
+ srl $18,$15,18
+ xor $20,$19
+
+ srl $21,$12,10
+ xor $20,$18 # sigma0(X[i+1])
+ sll $19,$12,13
+ addu $14,$20
+ srl $18,$12,17
+ xor $21,$19
+ sll $19,2
+ xor $21,$18
+ srl $18,$12,19
+ xor $21,$19
+
+ xor $21,$18 # sigma1(X[i+14])
+ addu $14,$21
+ addu $18,$14,$2 # 22
+ srl $2,$30,6
+ xor $21,$31,$1
+ sll $20,$30,7
+ and $21,$30
+ srl $19,$30,11
+ xor $2,$20
+ sll $20,$30,21
+ xor $2,$19
+ srl $19,$30,25
+ xor $2,$20
+ sll $20,$30,26
+ xor $2,$19
+ xor $21,$1 # Ch(e,f,g)
+ xor $19,$20,$2 # Sigma1(e)
+
+ srl $2,$3,2
+ addu $18,$21
+ lw $21,88($6) # K[22]
+ sll $20,$3,10
+ addu $18,$19
+ srl $19,$3,13
+ xor $2,$20
+ sll $20,$3,19
+ xor $2,$19
+ srl $19,$3,22
+ xor $2,$20
+ sll $20,$3,30
+ xor $2,$19
+ sw $14,24($29) # offload to ring buffer
+ xor $2,$20 # Sigma0(a)
+
+ or $19,$3,$7
+ and $20,$3,$7
+ and $19,$24
+ or $20,$19 # Maj(a,b,c)
+ addu $18,$21 # +=K[22]
+ addu $2,$20
+
+ addu $25,$18
+ addu $2,$18
+ lw $17,36($29) # prefetch from ring buffer
+ srl $21,$16,3 # Xupdate(23)
+ addu $15,$8 # +=X[i+9]
+ sll $20,$16,14
+ srl $19,$16,7
+ xor $21,$20
+ sll $20,11
+ xor $21,$19
+ srl $19,$16,18
+ xor $21,$20
+
+ srl $22,$13,10
+ xor $21,$19 # sigma0(X[i+1])
+ sll $20,$13,13
+ addu $15,$21
+ srl $19,$13,17
+ xor $22,$20
+ sll $20,2
+ xor $22,$19
+ srl $19,$13,19
+ xor $22,$20
+
+ xor $22,$19 # sigma1(X[i+14])
+ addu $15,$22
+ addu $19,$15,$1 # 23
+ srl $1,$25,6
+ xor $22,$30,$31
+ sll $21,$25,7
+ and $22,$25
+ srl $20,$25,11
+ xor $1,$21
+ sll $21,$25,21
+ xor $1,$20
+ srl $20,$25,25
+ xor $1,$21
+ sll $21,$25,26
+ xor $1,$20
+ xor $22,$31 # Ch(e,f,g)
+ xor $20,$21,$1 # Sigma1(e)
+
+ srl $1,$2,2
+ addu $19,$22
+ lw $22,92($6) # K[23]
+ sll $21,$2,10
+ addu $19,$20
+ srl $20,$2,13
+ xor $1,$21
+ sll $21,$2,19
+ xor $1,$20
+ srl $20,$2,22
+ xor $1,$21
+ sll $21,$2,30
+ xor $1,$20
+ sw $15,28($29) # offload to ring buffer
+ xor $1,$21 # Sigma0(a)
+
+ or $20,$2,$3
+ and $21,$2,$3
+ and $20,$7
+ or $21,$20 # Maj(a,b,c)
+ addu $19,$22 # +=K[23]
+ addu $1,$21
+
+ addu $24,$19
+ addu $1,$19
+ lw $18,40($29) # prefetch from ring buffer
+ srl $22,$17,3 # Xupdate(24)
+ addu $16,$9 # +=X[i+9]
+ sll $21,$17,14
+ srl $20,$17,7
+ xor $22,$21
+ sll $21,11
+ xor $22,$20
+ srl $20,$17,18
+ xor $22,$21
+
+ srl $23,$14,10
+ xor $22,$20 # sigma0(X[i+1])
+ sll $21,$14,13
+ addu $16,$22
+ srl $20,$14,17
+ xor $23,$21
+ sll $21,2
+ xor $23,$20
+ srl $20,$14,19
+ xor $23,$21
+
+ xor $23,$20 # sigma1(X[i+14])
+ addu $16,$23
+ addu $20,$16,$31 # 24
+ srl $31,$24,6
+ xor $23,$25,$30
+ sll $22,$24,7
+ and $23,$24
+ srl $21,$24,11
+ xor $31,$22
+ sll $22,$24,21
+ xor $31,$21
+ srl $21,$24,25
+ xor $31,$22
+ sll $22,$24,26
+ xor $31,$21
+ xor $23,$30 # Ch(e,f,g)
+ xor $21,$22,$31 # Sigma1(e)
+
+ srl $31,$1,2
+ addu $20,$23
+ lw $23,96($6) # K[24]
+ sll $22,$1,10
+ addu $20,$21
+ srl $21,$1,13
+ xor $31,$22
+ sll $22,$1,19
+ xor $31,$21
+ srl $21,$1,22
+ xor $31,$22
+ sll $22,$1,30
+ xor $31,$21
+ sw $16,32($29) # offload to ring buffer
+ xor $31,$22 # Sigma0(a)
+
+ or $21,$1,$2
+ and $22,$1,$2
+ and $21,$3
+ or $22,$21 # Maj(a,b,c)
+ addu $20,$23 # +=K[24]
+ addu $31,$22
+
+ addu $7,$20
+ addu $31,$20
+ lw $19,44($29) # prefetch from ring buffer
+ srl $23,$18,3 # Xupdate(25)
+ addu $17,$10 # +=X[i+9]
+ sll $22,$18,14
+ srl $21,$18,7
+ xor $23,$22
+ sll $22,11
+ xor $23,$21
+ srl $21,$18,18
+ xor $23,$22
+
+ srl $8,$15,10
+ xor $23,$21 # sigma0(X[i+1])
+ sll $22,$15,13
+ addu $17,$23
+ srl $21,$15,17
+ xor $8,$22
+ sll $22,2
+ xor $8,$21
+ srl $21,$15,19
+ xor $8,$22
+
+ xor $8,$21 # sigma1(X[i+14])
+ addu $17,$8
+ addu $21,$17,$30 # 25
+ srl $30,$7,6
+ xor $8,$24,$25
+ sll $23,$7,7
+ and $8,$7
+ srl $22,$7,11
+ xor $30,$23
+ sll $23,$7,21
+ xor $30,$22
+ srl $22,$7,25
+ xor $30,$23
+ sll $23,$7,26
+ xor $30,$22
+ xor $8,$25 # Ch(e,f,g)
+ xor $22,$23,$30 # Sigma1(e)
+
+ srl $30,$31,2
+ addu $21,$8
+ lw $8,100($6) # K[25]
+ sll $23,$31,10
+ addu $21,$22
+ srl $22,$31,13
+ xor $30,$23
+ sll $23,$31,19
+ xor $30,$22
+ srl $22,$31,22
+ xor $30,$23
+ sll $23,$31,30
+ xor $30,$22
+ sw $17,36($29) # offload to ring buffer
+ xor $30,$23 # Sigma0(a)
+
+ or $22,$31,$1
+ and $23,$31,$1
+ and $22,$2
+ or $23,$22 # Maj(a,b,c)
+ addu $21,$8 # +=K[25]
+ addu $30,$23
+
+ addu $3,$21
+ addu $30,$21
+ lw $20,48($29) # prefetch from ring buffer
+ srl $8,$19,3 # Xupdate(26)
+ addu $18,$11 # +=X[i+9]
+ sll $23,$19,14
+ srl $22,$19,7
+ xor $8,$23
+ sll $23,11
+ xor $8,$22
+ srl $22,$19,18
+ xor $8,$23
+
+ srl $9,$16,10
+ xor $8,$22 # sigma0(X[i+1])
+ sll $23,$16,13
+ addu $18,$8
+ srl $22,$16,17
+ xor $9,$23
+ sll $23,2
+ xor $9,$22
+ srl $22,$16,19
+ xor $9,$23
+
+ xor $9,$22 # sigma1(X[i+14])
+ addu $18,$9
+ addu $22,$18,$25 # 26
+ srl $25,$3,6
+ xor $9,$7,$24
+ sll $8,$3,7
+ and $9,$3
+ srl $23,$3,11
+ xor $25,$8
+ sll $8,$3,21
+ xor $25,$23
+ srl $23,$3,25
+ xor $25,$8
+ sll $8,$3,26
+ xor $25,$23
+ xor $9,$24 # Ch(e,f,g)
+ xor $23,$8,$25 # Sigma1(e)
+
+ srl $25,$30,2
+ addu $22,$9
+ lw $9,104($6) # K[26]
+ sll $8,$30,10
+ addu $22,$23
+ srl $23,$30,13
+ xor $25,$8
+ sll $8,$30,19
+ xor $25,$23
+ srl $23,$30,22
+ xor $25,$8
+ sll $8,$30,30
+ xor $25,$23
+ sw $18,40($29) # offload to ring buffer
+ xor $25,$8 # Sigma0(a)
+
+ or $23,$30,$31
+ and $8,$30,$31
+ and $23,$1
+ or $8,$23 # Maj(a,b,c)
+ addu $22,$9 # +=K[26]
+ addu $25,$8
+
+ addu $2,$22
+ addu $25,$22
+ lw $21,52($29) # prefetch from ring buffer
+ srl $9,$20,3 # Xupdate(27)
+ addu $19,$12 # +=X[i+9]
+ sll $8,$20,14
+ srl $23,$20,7
+ xor $9,$8
+ sll $8,11
+ xor $9,$23
+ srl $23,$20,18
+ xor $9,$8
+
+ srl $10,$17,10
+ xor $9,$23 # sigma0(X[i+1])
+ sll $8,$17,13
+ addu $19,$9
+ srl $23,$17,17
+ xor $10,$8
+ sll $8,2
+ xor $10,$23
+ srl $23,$17,19
+ xor $10,$8
+
+ xor $10,$23 # sigma1(X[i+14])
+ addu $19,$10
+ addu $23,$19,$24 # 27
+ srl $24,$2,6
+ xor $10,$3,$7
+ sll $9,$2,7
+ and $10,$2
+ srl $8,$2,11
+ xor $24,$9
+ sll $9,$2,21
+ xor $24,$8
+ srl $8,$2,25
+ xor $24,$9
+ sll $9,$2,26
+ xor $24,$8
+ xor $10,$7 # Ch(e,f,g)
+ xor $8,$9,$24 # Sigma1(e)
+
+ srl $24,$25,2
+ addu $23,$10
+ lw $10,108($6) # K[27]
+ sll $9,$25,10
+ addu $23,$8
+ srl $8,$25,13
+ xor $24,$9
+ sll $9,$25,19
+ xor $24,$8
+ srl $8,$25,22
+ xor $24,$9
+ sll $9,$25,30
+ xor $24,$8
+ sw $19,44($29) # offload to ring buffer
+ xor $24,$9 # Sigma0(a)
+
+ or $8,$25,$30
+ and $9,$25,$30
+ and $8,$31
+ or $9,$8 # Maj(a,b,c)
+ addu $23,$10 # +=K[27]
+ addu $24,$9
+
+ addu $1,$23
+ addu $24,$23
+ lw $22,56($29) # prefetch from ring buffer
+ srl $10,$21,3 # Xupdate(28)
+ addu $20,$13 # +=X[i+9]
+ sll $9,$21,14
+ srl $8,$21,7
+ xor $10,$9
+ sll $9,11
+ xor $10,$8
+ srl $8,$21,18
+ xor $10,$9
+
+ srl $11,$18,10
+ xor $10,$8 # sigma0(X[i+1])
+ sll $9,$18,13
+ addu $20,$10
+ srl $8,$18,17
+ xor $11,$9
+ sll $9,2
+ xor $11,$8
+ srl $8,$18,19
+ xor $11,$9
+
+ xor $11,$8 # sigma1(X[i+14])
+ addu $20,$11
+ addu $8,$20,$7 # 28
+ srl $7,$1,6
+ xor $11,$2,$3
+ sll $10,$1,7
+ and $11,$1
+ srl $9,$1,11
+ xor $7,$10
+ sll $10,$1,21
+ xor $7,$9
+ srl $9,$1,25
+ xor $7,$10
+ sll $10,$1,26
+ xor $7,$9
+ xor $11,$3 # Ch(e,f,g)
+ xor $9,$10,$7 # Sigma1(e)
+
+ srl $7,$24,2
+ addu $8,$11
+ lw $11,112($6) # K[28]
+ sll $10,$24,10
+ addu $8,$9
+ srl $9,$24,13
+ xor $7,$10
+ sll $10,$24,19
+ xor $7,$9
+ srl $9,$24,22
+ xor $7,$10
+ sll $10,$24,30
+ xor $7,$9
+ sw $20,48($29) # offload to ring buffer
+ xor $7,$10 # Sigma0(a)
+
+ or $9,$24,$25
+ and $10,$24,$25
+ and $9,$30
+ or $10,$9 # Maj(a,b,c)
+ addu $8,$11 # +=K[28]
+ addu $7,$10
+
+ addu $31,$8
+ addu $7,$8
+ lw $23,60($29) # prefetch from ring buffer
+ srl $11,$22,3 # Xupdate(29)
+ addu $21,$14 # +=X[i+9]
+ sll $10,$22,14
+ srl $9,$22,7
+ xor $11,$10
+ sll $10,11
+ xor $11,$9
+ srl $9,$22,18
+ xor $11,$10
+
+ srl $12,$19,10
+ xor $11,$9 # sigma0(X[i+1])
+ sll $10,$19,13
+ addu $21,$11
+ srl $9,$19,17
+ xor $12,$10
+ sll $10,2
+ xor $12,$9
+ srl $9,$19,19
+ xor $12,$10
+
+ xor $12,$9 # sigma1(X[i+14])
+ addu $21,$12
+ addu $9,$21,$3 # 29
+ srl $3,$31,6
+ xor $12,$1,$2
+ sll $11,$31,7
+ and $12,$31
+ srl $10,$31,11
+ xor $3,$11
+ sll $11,$31,21
+ xor $3,$10
+ srl $10,$31,25
+ xor $3,$11
+ sll $11,$31,26
+ xor $3,$10
+ xor $12,$2 # Ch(e,f,g)
+ xor $10,$11,$3 # Sigma1(e)
+
+ srl $3,$7,2
+ addu $9,$12
+ lw $12,116($6) # K[29]
+ sll $11,$7,10
+ addu $9,$10
+ srl $10,$7,13
+ xor $3,$11
+ sll $11,$7,19
+ xor $3,$10
+ srl $10,$7,22
+ xor $3,$11
+ sll $11,$7,30
+ xor $3,$10
+ sw $21,52($29) # offload to ring buffer
+ xor $3,$11 # Sigma0(a)
+
+ or $10,$7,$24
+ and $11,$7,$24
+ and $10,$25
+ or $11,$10 # Maj(a,b,c)
+ addu $9,$12 # +=K[29]
+ addu $3,$11
+
+ addu $30,$9
+ addu $3,$9
+ lw $8,0($29) # prefetch from ring buffer
+ srl $12,$23,3 # Xupdate(30)
+ addu $22,$15 # +=X[i+9]
+ sll $11,$23,14
+ srl $10,$23,7
+ xor $12,$11
+ sll $11,11
+ xor $12,$10
+ srl $10,$23,18
+ xor $12,$11
+
+ srl $13,$20,10
+ xor $12,$10 # sigma0(X[i+1])
+ sll $11,$20,13
+ addu $22,$12
+ srl $10,$20,17
+ xor $13,$11
+ sll $11,2
+ xor $13,$10
+ srl $10,$20,19
+ xor $13,$11
+
+ xor $13,$10 # sigma1(X[i+14])
+ addu $22,$13
+ addu $10,$22,$2 # 30
+ srl $2,$30,6
+ xor $13,$31,$1
+ sll $12,$30,7
+ and $13,$30
+ srl $11,$30,11
+ xor $2,$12
+ sll $12,$30,21
+ xor $2,$11
+ srl $11,$30,25
+ xor $2,$12
+ sll $12,$30,26
+ xor $2,$11
+ xor $13,$1 # Ch(e,f,g)
+ xor $11,$12,$2 # Sigma1(e)
+
+ srl $2,$3,2
+ addu $10,$13
+ lw $13,120($6) # K[30]
+ sll $12,$3,10
+ addu $10,$11
+ srl $11,$3,13
+ xor $2,$12
+ sll $12,$3,19
+ xor $2,$11
+ srl $11,$3,22
+ xor $2,$12
+ sll $12,$3,30
+ xor $2,$11
+ sw $22,56($29) # offload to ring buffer
+ xor $2,$12 # Sigma0(a)
+
+ or $11,$3,$7
+ and $12,$3,$7
+ and $11,$24
+ or $12,$11 # Maj(a,b,c)
+ addu $10,$13 # +=K[30]
+ addu $2,$12
+
+ addu $25,$10
+ addu $2,$10
+ lw $9,4($29) # prefetch from ring buffer
+ srl $13,$8,3 # Xupdate(31)
+ addu $23,$16 # +=X[i+9]
+ sll $12,$8,14
+ srl $11,$8,7
+ xor $13,$12
+ sll $12,11
+ xor $13,$11
+ srl $11,$8,18
+ xor $13,$12
+
+ srl $14,$21,10
+ xor $13,$11 # sigma0(X[i+1])
+ sll $12,$21,13
+ addu $23,$13
+ srl $11,$21,17
+ xor $14,$12
+ sll $12,2
+ xor $14,$11
+ srl $11,$21,19
+ xor $14,$12
+
+ xor $14,$11 # sigma1(X[i+14])
+ addu $23,$14
+ addu $11,$23,$1 # 31
+ srl $1,$25,6
+ xor $14,$30,$31
+ sll $13,$25,7
+ and $14,$25
+ srl $12,$25,11
+ xor $1,$13
+ sll $13,$25,21
+ xor $1,$12
+ srl $12,$25,25
+ xor $1,$13
+ sll $13,$25,26
+ xor $1,$12
+ xor $14,$31 # Ch(e,f,g)
+ xor $12,$13,$1 # Sigma1(e)
+
+ srl $1,$2,2
+ addu $11,$14
+ lw $14,124($6) # K[31]
+ sll $13,$2,10
+ addu $11,$12
+ srl $12,$2,13
+ xor $1,$13
+ sll $13,$2,19
+ xor $1,$12
+ srl $12,$2,22
+ xor $1,$13
+ sll $13,$2,30
+ xor $1,$12
+ sw $23,60($29) # offload to ring buffer
+ xor $1,$13 # Sigma0(a)
+
+ or $12,$2,$3
+ and $13,$2,$3
+ and $12,$7
+ or $13,$12 # Maj(a,b,c)
+ addu $11,$14 # +=K[31]
+ addu $1,$13
+
+ addu $24,$11
+ addu $1,$11
+ lw $10,8($29) # prefetch from ring buffer
+ and $14,0xfff
+ li $15,2290
+ .set noreorder
+ bne $14,$15,.L16_xx
+ add $6,16*4 # Ktbl+=16
+
+ lw $23,16*4($29) # restore pointer to the end of input
+ lw $8,0*4($4)
+ lw $9,1*4($4)
+ lw $10,2*4($4)
+ add $5,16*4
+ lw $11,3*4($4)
+ addu $1,$8
+ lw $12,4*4($4)
+ addu $2,$9
+ lw $13,5*4($4)
+ addu $3,$10
+ lw $14,6*4($4)
+ addu $7,$11
+ lw $15,7*4($4)
+ addu $24,$12
+ sw $1,0*4($4)
+ addu $25,$13
+ sw $2,1*4($4)
+ addu $30,$14
+ sw $3,2*4($4)
+ addu $31,$15
+ sw $7,3*4($4)
+ sw $24,4*4($4)
+ sw $25,5*4($4)
+ sw $30,6*4($4)
+ sw $31,7*4($4)
+
+ bne $5,$23,.Loop
+ sub $6,192 # rewind $6
+
+ lw $31,128-1*4($29)
+ lw $30,128-2*4($29)
+ lw $23,128-3*4($29)
+ lw $22,128-4*4($29)
+ lw $21,128-5*4($29)
+ lw $20,128-6*4($29)
+ lw $19,128-7*4($29)
+ lw $18,128-8*4($29)
+ lw $17,128-9*4($29)
+ lw $16,128-10*4($29)
+ jr $31
+ add $29,128
+.end sha256_block_data_order
+
+.rdata
+.align 5
+K256:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+.asciiz "SHA256 for MIPS, CRYPTOGAMS by <appro@openssl.org>"
+.align 5
+
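
In the MIPS code above, rounds 0 through 15 byte-swap each input word (the "byte swap(n)" comments; SHA-256 input words are big-endian, so the swap suggests this output targets little-endian MIPS) and park it in a 16-word ring buffer on the stack ("offload to ring buffer"). The .L16_xx loop then derives words 16..63 in place via the "Xupdate(i)" steps, which implement the FIPS 180-4 schedule recurrence with sigma0 = ror7 ^ ror18 ^ shr3 and sigma1 = ror17 ^ ror19 ^ shr10; the asm builds each rotate from a matching srl/sll pair. A minimal C sketch of one such step under the same ring-buffer convention follows; xupdate and ror32 are hypothetical helpers, not part of the generated code.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n)
{
        return (x >> n) | (x << (32 - n));
}

/* Schedule update for round i (i >= 16) over a 16-word ring buffer:
 * X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16]) */
static uint32_t xupdate(uint32_t X[16], unsigned i)
{
        uint32_t x1  = X[(i + 1) & 15];
        uint32_t x14 = X[(i + 14) & 15];

        uint32_t s0 = ror32(x1, 7) ^ ror32(x1, 18) ^ (x1 >> 3);      /* sigma0(X[i+1])  */
        uint32_t s1 = ror32(x14, 17) ^ ror32(x14, 19) ^ (x14 >> 10); /* sigma1(X[i+14]) */

        X[i & 15] += s0 + X[(i + 9) & 15] + s1;   /* "+=X[i+9]" in the asm */
        return X[i & 15];
}

The loop's exit test is a small trick: instead of keeping a counter, the sequence "and $14,0xfff" / "li $15,2290" / "bne $14,$15,.L16_xx" recognizes the final round constant 0xc67178f2 by the low 12 bits (0x8f2 == 2290) of the K[] word just loaded, and only then falls through to the feed-forward additions into the hash context.
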
diff --git a/main/openssl/crypto/sha/asm/sha256-x86_64.S b/main/openssl/crypto/sha/asm/sha256-x86_64.S
new file mode 100644
index 00000000..db5b898f
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha256-x86_64.S
@@ -0,0 +1,1778 @@
+.text
+
+.globl sha256_block_data_order
+.type sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $64+32,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+.Lprologue:
+
+ leaq K256(%rip),%rbp
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ xorq %rdi,%rdi
+ movl 0(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ movl %r12d,0(%rsp)
+
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %eax,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r8d,%r15d
+ movl %ebx,%r11d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ xorl %ecx,%r11d
+ xorl %eax,%r14d
+ addl %r15d,%r12d
+ movl %ebx,%r15d
+
+ rorl $6,%r13d
+ andl %eax,%r11d
+ andl %ecx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r11d
+
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r11d
+
+ movl 4(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%r15d
+ movl %r12d,4(%rsp)
+
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r11d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %edx,%r15d
+ movl %eax,%r10d
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ xorl %ebx,%r10d
+ xorl %r11d,%r14d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+
+ rorl $6,%r13d
+ andl %r11d,%r10d
+ andl %ebx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r10d
+
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r10d
+
+ movl 8(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ movl %r12d,8(%rsp)
+
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r10d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ecx,%r15d
+ movl %r11d,%r9d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ xorl %eax,%r9d
+ xorl %r10d,%r14d
+ addl %r15d,%r12d
+ movl %r11d,%r15d
+
+ rorl $6,%r13d
+ andl %r10d,%r9d
+ andl %eax,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r9d
+
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r9d
+
+ movl 12(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%r15d
+ movl %r12d,12(%rsp)
+
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %r9d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ebx,%r15d
+ movl %r10d,%r8d
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ xorl %r11d,%r8d
+ xorl %r9d,%r14d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+
+ rorl $6,%r13d
+ andl %r9d,%r8d
+ andl %r11d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r8d
+
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r8d
+
+ movl 16(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ movl %r12d,16(%rsp)
+
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %r8d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %eax,%r15d
+ movl %r9d,%edx
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ xorl %r10d,%edx
+ xorl %r8d,%r14d
+ addl %r15d,%r12d
+ movl %r9d,%r15d
+
+ rorl $6,%r13d
+ andl %r8d,%edx
+ andl %r10d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%edx
+
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%edx
+
+ movl 20(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%r15d
+ movl %r12d,20(%rsp)
+
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %edx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r11d,%r15d
+ movl %r8d,%ecx
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ xorl %r9d,%ecx
+ xorl %edx,%r14d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+
+ rorl $6,%r13d
+ andl %edx,%ecx
+ andl %r9d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ecx
+
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ecx
+
+ movl 24(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ movl %r12d,24(%rsp)
+
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %ecx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r10d,%r15d
+ movl %edx,%ebx
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ xorl %r8d,%ebx
+ xorl %ecx,%r14d
+ addl %r15d,%r12d
+ movl %edx,%r15d
+
+ rorl $6,%r13d
+ andl %ecx,%ebx
+ andl %r8d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ebx
+
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ebx
+
+ movl 28(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%r15d
+ movl %r12d,28(%rsp)
+
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %ebx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r9d,%r15d
+ movl %ecx,%eax
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ xorl %edx,%eax
+ xorl %ebx,%r14d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+
+ rorl $6,%r13d
+ andl %ebx,%eax
+ andl %edx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%eax
+
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 1(%rdi),%rdi
+ addl %r14d,%eax
+
+ movl 32(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ movl %r12d,32(%rsp)
+
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %eax,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r8d,%r15d
+ movl %ebx,%r11d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ xorl %ecx,%r11d
+ xorl %eax,%r14d
+ addl %r15d,%r12d
+ movl %ebx,%r15d
+
+ rorl $6,%r13d
+ andl %eax,%r11d
+ andl %ecx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r11d
+
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r11d
+
+ movl 36(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%r15d
+ movl %r12d,36(%rsp)
+
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r11d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %edx,%r15d
+ movl %eax,%r10d
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ xorl %ebx,%r10d
+ xorl %r11d,%r14d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+
+ rorl $6,%r13d
+ andl %r11d,%r10d
+ andl %ebx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r10d
+
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r10d
+
+ movl 40(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ movl %r12d,40(%rsp)
+
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r10d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ecx,%r15d
+ movl %r11d,%r9d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ xorl %eax,%r9d
+ xorl %r10d,%r14d
+ addl %r15d,%r12d
+ movl %r11d,%r15d
+
+ rorl $6,%r13d
+ andl %r10d,%r9d
+ andl %eax,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r9d
+
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r9d
+
+ movl 44(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%r15d
+ movl %r12d,44(%rsp)
+
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %r9d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ebx,%r15d
+ movl %r10d,%r8d
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ xorl %r11d,%r8d
+ xorl %r9d,%r14d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+
+ rorl $6,%r13d
+ andl %r9d,%r8d
+ andl %r11d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r8d
+
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r8d
+
+ movl 48(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ movl %r12d,48(%rsp)
+
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %r8d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %eax,%r15d
+ movl %r9d,%edx
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ xorl %r10d,%edx
+ xorl %r8d,%r14d
+ addl %r15d,%r12d
+ movl %r9d,%r15d
+
+ rorl $6,%r13d
+ andl %r8d,%edx
+ andl %r10d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%edx
+
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%edx
+
+ movl 52(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%r15d
+ movl %r12d,52(%rsp)
+
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %edx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r11d,%r15d
+ movl %r8d,%ecx
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ xorl %r9d,%ecx
+ xorl %edx,%r14d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+
+ rorl $6,%r13d
+ andl %edx,%ecx
+ andl %r9d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ecx
+
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ecx
+
+ movl 56(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ movl %r12d,56(%rsp)
+
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %ecx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r10d,%r15d
+ movl %edx,%ebx
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ xorl %r8d,%ebx
+ xorl %ecx,%r14d
+ addl %r15d,%r12d
+ movl %edx,%r15d
+
+ rorl $6,%r13d
+ andl %ecx,%ebx
+ andl %r8d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ebx
+
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ebx
+
+ movl 60(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%r15d
+ movl %r12d,60(%rsp)
+
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %ebx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r9d,%r15d
+ movl %ecx,%eax
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ xorl %edx,%eax
+ xorl %ebx,%r14d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+
+ rorl $6,%r13d
+ andl %ebx,%eax
+ andl %edx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%eax
+
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 1(%rdi),%rdi
+ addl %r14d,%eax
+
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movl 4(%rsp),%r13d
+ movl 56(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 36(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 0(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r14d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ movl %r12d,0(%rsp)
+
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %eax,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r8d,%r15d
+ movl %ebx,%r11d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ xorl %ecx,%r11d
+ xorl %eax,%r14d
+ addl %r15d,%r12d
+ movl %ebx,%r15d
+
+ rorl $6,%r13d
+ andl %eax,%r11d
+ andl %ecx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r11d
+
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r11d
+
+ movl 8(%rsp),%r13d
+ movl 60(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 40(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 4(%rsp),%r12d
+ movl %edx,%r13d
+ addl %r14d,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%r15d
+ movl %r12d,4(%rsp)
+
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r11d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %edx,%r15d
+ movl %eax,%r10d
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ xorl %ebx,%r10d
+ xorl %r11d,%r14d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+
+ rorl $6,%r13d
+ andl %r11d,%r10d
+ andl %ebx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r10d
+
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r10d
+
+ movl 12(%rsp),%r13d
+ movl 0(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 44(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 8(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r14d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ movl %r12d,8(%rsp)
+
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r10d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ecx,%r15d
+ movl %r11d,%r9d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ xorl %eax,%r9d
+ xorl %r10d,%r14d
+ addl %r15d,%r12d
+ movl %r11d,%r15d
+
+ rorl $6,%r13d
+ andl %r10d,%r9d
+ andl %eax,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r9d
+
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r9d
+
+ movl 16(%rsp),%r13d
+ movl 4(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 48(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 12(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %r14d,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%r15d
+ movl %r12d,12(%rsp)
+
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %r9d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ebx,%r15d
+ movl %r10d,%r8d
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ xorl %r11d,%r8d
+ xorl %r9d,%r14d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+
+ rorl $6,%r13d
+ andl %r9d,%r8d
+ andl %r11d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r8d
+
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r8d
+
+ movl 20(%rsp),%r13d
+ movl 8(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 52(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 16(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r14d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ movl %r12d,16(%rsp)
+
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %r8d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %eax,%r15d
+ movl %r9d,%edx
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ xorl %r10d,%edx
+ xorl %r8d,%r14d
+ addl %r15d,%r12d
+ movl %r9d,%r15d
+
+ rorl $6,%r13d
+ andl %r8d,%edx
+ andl %r10d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%edx
+
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%edx
+
+ movl 24(%rsp),%r13d
+ movl 12(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 56(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 20(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %r14d,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%r15d
+ movl %r12d,20(%rsp)
+
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %edx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r11d,%r15d
+ movl %r8d,%ecx
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ xorl %r9d,%ecx
+ xorl %edx,%r14d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+
+ rorl $6,%r13d
+ andl %edx,%ecx
+ andl %r9d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ecx
+
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ecx
+
+ movl 28(%rsp),%r13d
+ movl 16(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 60(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 24(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r14d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ movl %r12d,24(%rsp)
+
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %ecx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r10d,%r15d
+ movl %edx,%ebx
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ xorl %r8d,%ebx
+ xorl %ecx,%r14d
+ addl %r15d,%r12d
+ movl %edx,%r15d
+
+ rorl $6,%r13d
+ andl %ecx,%ebx
+ andl %r8d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ebx
+
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ebx
+
+ movl 32(%rsp),%r13d
+ movl 20(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 0(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 28(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %r14d,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%r15d
+ movl %r12d,28(%rsp)
+
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %ebx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r9d,%r15d
+ movl %ecx,%eax
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ xorl %edx,%eax
+ xorl %ebx,%r14d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+
+ rorl $6,%r13d
+ andl %ebx,%eax
+ andl %edx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%eax
+
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 1(%rdi),%rdi
+ addl %r14d,%eax
+
+ movl 36(%rsp),%r13d
+ movl 24(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 4(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 32(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r14d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ movl %r12d,32(%rsp)
+
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %eax,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r8d,%r15d
+ movl %ebx,%r11d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r15d
+
+ xorl %ecx,%r11d
+ xorl %eax,%r14d
+ addl %r15d,%r12d
+ movl %ebx,%r15d
+
+ rorl $6,%r13d
+ andl %eax,%r11d
+ andl %ecx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r11d
+
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r11d
+
+ movl 40(%rsp),%r13d
+ movl 28(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 8(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 36(%rsp),%r12d
+ movl %edx,%r13d
+ addl %r14d,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%r15d
+ movl %r12d,36(%rsp)
+
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r11d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %edx,%r15d
+ movl %eax,%r10d
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r15d
+
+ xorl %ebx,%r10d
+ xorl %r11d,%r14d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+
+ rorl $6,%r13d
+ andl %r11d,%r10d
+ andl %ebx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r10d
+
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r10d
+
+ movl 44(%rsp),%r13d
+ movl 32(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 12(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 40(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r14d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ movl %r12d,40(%rsp)
+
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r10d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ecx,%r15d
+ movl %r11d,%r9d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r15d
+
+ xorl %eax,%r9d
+ xorl %r10d,%r14d
+ addl %r15d,%r12d
+ movl %r11d,%r15d
+
+ rorl $6,%r13d
+ andl %r10d,%r9d
+ andl %eax,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r9d
+
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r9d
+
+ movl 48(%rsp),%r13d
+ movl 36(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 16(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 44(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %r14d,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%r15d
+ movl %r12d,44(%rsp)
+
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %r9d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %ebx,%r15d
+ movl %r10d,%r8d
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r15d
+
+ xorl %r11d,%r8d
+ xorl %r9d,%r14d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+
+ rorl $6,%r13d
+ andl %r9d,%r8d
+ andl %r11d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%r8d
+
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 1(%rdi),%rdi
+ addl %r14d,%r8d
+
+ movl 52(%rsp),%r13d
+ movl 40(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 20(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 48(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r14d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ movl %r12d,48(%rsp)
+
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %r8d,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %eax,%r15d
+ movl %r9d,%edx
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r15d
+
+ xorl %r10d,%edx
+ xorl %r8d,%r14d
+ addl %r15d,%r12d
+ movl %r9d,%r15d
+
+ rorl $6,%r13d
+ andl %r8d,%edx
+ andl %r10d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%edx
+
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%edx
+
+ movl 56(%rsp),%r13d
+ movl 44(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 24(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 52(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %r14d,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%r15d
+ movl %r12d,52(%rsp)
+
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %edx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r11d,%r15d
+ movl %r8d,%ecx
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r15d
+
+ xorl %r9d,%ecx
+ xorl %edx,%r14d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+
+ rorl $6,%r13d
+ andl %edx,%ecx
+ andl %r9d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ecx
+
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ecx
+
+ movl 60(%rsp),%r13d
+ movl 48(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 28(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 56(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r14d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ movl %r12d,56(%rsp)
+
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %ecx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r10d,%r15d
+ movl %edx,%ebx
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r15d
+
+ xorl %r8d,%ebx
+ xorl %ecx,%r14d
+ addl %r15d,%r12d
+ movl %edx,%r15d
+
+ rorl $6,%r13d
+ andl %ecx,%ebx
+ andl %r8d,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%ebx
+
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 1(%rdi),%rdi
+ addl %r14d,%ebx
+
+ movl 0(%rsp),%r13d
+ movl 52(%rsp),%r14d
+ movl %r13d,%r12d
+ movl %r14d,%r15d
+
+ rorl $11,%r12d
+ xorl %r13d,%r12d
+ shrl $3,%r13d
+
+ rorl $7,%r12d
+ xorl %r12d,%r13d
+ movl 32(%rsp),%r12d
+
+ rorl $2,%r15d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ addl %r13d,%r12d
+ xorl %r15d,%r14d
+
+ addl 60(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %r14d,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%r15d
+ movl %r12d,60(%rsp)
+
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %ebx,%r14d
+
+ addl (%rbp,%rdi,4),%r12d
+ andl %r9d,%r15d
+ movl %ecx,%eax
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r15d
+
+ xorl %edx,%eax
+ xorl %ebx,%r14d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+
+ rorl $6,%r13d
+ andl %ebx,%eax
+ andl %edx,%r15d
+
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ addl %r15d,%eax
+
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 1(%rdi),%rdi
+ addl %r14d,%eax
+
+ cmpq $64,%rdi
+ jb .Lrounds_16_xx
+
+ movq 64+0(%rsp),%rdi
+ leaq 64(%rsi),%rsi
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop
+
+ movq 64+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size sha256_block_data_order,.-sha256_block_data_order
+.align 64
+.type K256,@object
+K256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
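
The unrolled groups above each perform one SHA-256 round: the rorl/xorl ladder in %r13d accumulates Sigma1(e), the one in %r14d accumulates Sigma0(a), %r15d carries the Ch/Maj temporaries, and K256 holds the round constants (the first 32 bits of the fractional parts of the cube roots of the first 64 primes). A minimal C restatement of the round for reference only; rotr and sha256_round are illustrative names, not part of the patched sources:

#include <stdint.h>

static uint32_t rotr(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* One SHA-256 round: T1 is the value the asm accumulates in %r12d and
 * adds both to the d-slot and to the new a (the paired "addl %r12d"
 * instructions in each group); the generated code renames registers
 * every round instead of shuffling the state array as done here. */
static void sha256_round(uint32_t s[8], uint32_t Wi, uint32_t Ki)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t S1  = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25); /* %r13d ladder */
    uint32_t ch  = (e & f) ^ (~e & g);                     /* %r15d        */
    uint32_t T1  = h + S1 + ch + Ki + Wi;
    uint32_t S0  = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22); /* %r14d ladder */
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    uint32_t T2  = S0 + maj;

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + T2;
}
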
diff --git a/main/openssl/crypto/sha/asm/sha512-586.S b/main/openssl/crypto/sha/asm/sha512-586.S
new file mode 100644
index 00000000..4b806f35
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha512-586.S
@@ -0,0 +1,563 @@
+.file "sha512-586.s"
+.text
+.globl sha512_block_data_order
+.type sha512_block_data_order,@function
+.align 16
+sha512_block_data_order:
+.L_sha512_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .L001K512-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $7,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+.align 16
+.L002loop_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ movl 28(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ movl 44(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ movl 60(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 64(%edi),%eax
+ movl 68(%edi),%ebx
+ movl 72(%edi),%ecx
+ movl 76(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 80(%edi),%eax
+ movl 84(%edi),%ebx
+ movl 88(%edi),%ecx
+ movl 92(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 96(%edi),%eax
+ movl 100(%edi),%ebx
+ movl 104(%edi),%ecx
+ movl 108(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 112(%edi),%eax
+ movl 116(%edi),%ebx
+ movl 120(%edi),%ecx
+ movl 124(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ addl $128,%edi
+ subl $72,%esp
+ movl %edi,204(%esp)
+ leal 8(%esp),%edi
+ movl $16,%ecx
+.long 2784229001
+.align 16
+.L00300_15_x86:
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $148,%dl
+ jne .L00300_15_x86
+.align 16
+.L00416_79_x86:
+ movl 312(%esp),%ecx
+ movl 316(%esp),%edx
+ movl %ecx,%esi
+ shrl $1,%ecx
+ movl %edx,%edi
+ shrl $1,%edx
+ movl %ecx,%eax
+ shll $24,%esi
+ movl %edx,%ebx
+ shll $24,%edi
+ xorl %esi,%ebx
+ shrl $6,%ecx
+ xorl %edi,%eax
+ shrl $6,%edx
+ xorl %ecx,%eax
+ shll $7,%esi
+ xorl %edx,%ebx
+ shll $1,%edi
+ xorl %esi,%ebx
+ shrl $1,%ecx
+ xorl %edi,%eax
+ shrl $1,%edx
+ xorl %ecx,%eax
+ shll $6,%edi
+ xorl %edx,%ebx
+ xorl %edi,%eax
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl 208(%esp),%ecx
+ movl 212(%esp),%edx
+ movl %ecx,%esi
+ shrl $6,%ecx
+ movl %edx,%edi
+ shrl $6,%edx
+ movl %ecx,%eax
+ shll $3,%esi
+ movl %edx,%ebx
+ shll $3,%edi
+ xorl %esi,%eax
+ shrl $13,%ecx
+ xorl %edi,%ebx
+ shrl $13,%edx
+ xorl %ecx,%eax
+ shll $10,%esi
+ xorl %edx,%ebx
+ shll $10,%edi
+ xorl %esi,%ebx
+ shrl $10,%ecx
+ xorl %edi,%eax
+ shrl $10,%edx
+ xorl %ecx,%ebx
+ shll $13,%edi
+ xorl %edx,%eax
+ xorl %edi,%eax
+ movl 320(%esp),%ecx
+ movl 324(%esp),%edx
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ movl 248(%esp),%esi
+ movl 252(%esp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,192(%esp)
+ movl %ebx,196(%esp)
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $23,%dl
+ jne .L00416_79_x86
+ movl 840(%esp),%esi
+ movl 844(%esp),%edi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ addl 8(%esp),%eax
+ adcl 12(%esp),%ebx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ addl 16(%esp),%ecx
+ adcl 20(%esp),%edx
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ addl 24(%esp),%eax
+ adcl 28(%esp),%ebx
+ movl %eax,16(%esi)
+ movl %ebx,20(%esi)
+ addl 32(%esp),%ecx
+ adcl 36(%esp),%edx
+ movl %ecx,24(%esi)
+ movl %edx,28(%esi)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ addl 40(%esp),%eax
+ adcl 44(%esp),%ebx
+ movl %eax,32(%esi)
+ movl %ebx,36(%esi)
+ addl 48(%esp),%ecx
+ adcl 52(%esp),%edx
+ movl %ecx,40(%esi)
+ movl %edx,44(%esi)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ addl 56(%esp),%eax
+ adcl 60(%esp),%ebx
+ movl %eax,48(%esi)
+ movl %ebx,52(%esi)
+ addl 64(%esp),%ecx
+ adcl 68(%esp),%edx
+ movl %ecx,56(%esi)
+ movl %edx,60(%esi)
+ addl $840,%esp
+ subl $640,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop_x86
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L001K512:
+.long 3609767458,1116352408
+.long 602891725,1899447441
+.long 3964484399,3049323471
+.long 2173295548,3921009573
+.long 4081628472,961987163
+.long 3053834265,1508970993
+.long 2937671579,2453635748
+.long 3664609560,2870763221
+.long 2734883394,3624381080
+.long 1164996542,310598401
+.long 1323610764,607225278
+.long 3590304994,1426881987
+.long 4068182383,1925078388
+.long 991336113,2162078206
+.long 633803317,2614888103
+.long 3479774868,3248222580
+.long 2666613458,3835390401
+.long 944711139,4022224774
+.long 2341262773,264347078
+.long 2007800933,604807628
+.long 1495990901,770255983
+.long 1856431235,1249150122
+.long 3175218132,1555081692
+.long 2198950837,1996064986
+.long 3999719339,2554220882
+.long 766784016,2821834349
+.long 2566594879,2952996808
+.long 3203337956,3210313671
+.long 1034457026,3336571891
+.long 2466948901,3584528711
+.long 3758326383,113926993
+.long 168717936,338241895
+.long 1188179964,666307205
+.long 1546045734,773529912
+.long 1522805485,1294757372
+.long 2643833823,1396182291
+.long 2343527390,1695183700
+.long 1014477480,1986661051
+.long 1206759142,2177026350
+.long 344077627,2456956037
+.long 1290863460,2730485921
+.long 3158454273,2820302411
+.long 3505952657,3259730800
+.long 106217008,3345764771
+.long 3606008344,3516065817
+.long 1432725776,3600352804
+.long 1467031594,4094571909
+.long 851169720,275423344
+.long 3100823752,430227734
+.long 1363258195,506948616
+.long 3750685593,659060556
+.long 3785050280,883997877
+.long 3318307427,958139571
+.long 3812723403,1322822218
+.long 2003034995,1537002063
+.long 3602036899,1747873779
+.long 1575990012,1955562222
+.long 1125592928,2024104815
+.long 2716904306,2227730452
+.long 442776044,2361852424
+.long 593698344,2428436474
+.long 3733110249,2756734187
+.long 2999351573,3204031479
+.long 3815920427,3329325298
+.long 3928383900,3391569614
+.long 566280711,3515267271
+.long 3454069534,3940187606
+.long 4000239992,4118630271
+.long 1914138554,116418474
+.long 2731055270,174292421
+.long 3203993006,289380356
+.long 320620315,460393269
+.long 587496836,685471733
+.long 1086792851,852142971
+.long 365543100,1017036298
+.long 2618297676,1126000580
+.long 3409855158,1288033470
+.long 4234509866,1501505948
+.long 987167468,1607167915
+.long 1246189591,1816402316
+.size sha512_block_data_order,.-.L_sha512_block_data_order_begin
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
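
sha512-586.S runs SHA-512 on 32-bit x86, so every 64-bit quantity lives in a (lo,hi) register pair and each Sigma rotation becomes one of the shrl/shll/xorl ladders seen in .L00300_15_x86 and .L00416_79_x86 above. A minimal C model of that pairing; u64pair, rotr64 and Sigma1 are illustrative names, not code from the patch:

#include <stdint.h>

struct u64pair { uint32_t lo, hi; };

/* ROTR on a 64-bit value held as two 32-bit halves, for 0 < n < 32:
 * each output half takes bits from both input halves, which is exactly
 * the paired shift-and-xor pattern in the listing above. */
static struct u64pair rotr64(struct u64pair x, unsigned n)
{
    struct u64pair r;
    r.lo = (x.lo >> n) | (x.hi << (32 - n));
    r.hi = (x.hi >> n) | (x.lo << (32 - n));
    return r;
}

/* Sigma1(x) = ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41); a rotation of 32
 * or more swaps the halves first, so ROTR 41 becomes swap + ROTR 9. */
static struct u64pair Sigma1(struct u64pair e)
{
    struct u64pair s = { e.hi, e.lo };   /* half swap = ROTR by 32 */
    struct u64pair a = rotr64(e, 14), b = rotr64(e, 18), c = rotr64(s, 9);
    struct u64pair r = { a.lo ^ b.lo ^ c.lo, a.hi ^ b.hi ^ c.hi };
    return r;
}
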
diff --git a/main/openssl/crypto/sha/asm/sha512-586.pl b/main/openssl/crypto/sha/asm/sha512-586.pl
index 5b9f3337..9f8c51eb 100644
--- a/main/openssl/crypto/sha/asm/sha512-586.pl
+++ b/main/openssl/crypto/sha/asm/sha512-586.pl
@@ -23,7 +23,7 @@
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
-# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
+# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
@@ -142,9 +142,9 @@ sub BODY_00_15_x86 {
&mov ("edx",$Ehi);
&mov ("esi","ecx");
- &shr ("ecx",9) # lo>>9
+ &shr ("ecx",9); # lo>>9
&mov ("edi","edx");
- &shr ("edx",9) # hi>>9
+ &shr ("edx",9); # hi>>9
&mov ("ebx","ecx");
&shl ("esi",14); # lo<<14
&mov ("eax","edx");
@@ -207,9 +207,9 @@ sub BODY_00_15_x86 {
&mov ($Dhi,"ebx");
&mov ("esi","ecx");
- &shr ("ecx",2) # lo>>2
+ &shr ("ecx",2); # lo>>2
&mov ("edi","edx");
- &shr ("edx",2) # hi>>2
+ &shr ("edx",2); # hi>>2
&mov ("ebx","ecx");
&shl ("esi",4); # lo<<4
&mov ("eax","edx");
@@ -452,9 +452,9 @@ if ($sse2) {
&mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
&mov ("esi","ecx");
- &shr ("ecx",1) # lo>>1
+ &shr ("ecx",1); # lo>>1
&mov ("edi","edx");
- &shr ("edx",1) # hi>>1
+ &shr ("edx",1); # hi>>1
&mov ("eax","ecx");
&shl ("esi",24); # lo<<24
&mov ("ebx","edx");
@@ -488,9 +488,9 @@ if ($sse2) {
&mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
&mov ("esi","ecx");
- &shr ("ecx",6) # lo>>6
+ &shr ("ecx",6); # lo>>6
&mov ("edi","edx");
- &shr ("edx",6) # hi>>6
+ &shr ("edx",6); # hi>>6
&mov ("eax","ecx");
&shl ("esi",3); # lo<<3
&mov ("ebx","edx");
diff --git a/main/openssl/crypto/sha/asm/sha512-armv4.pl b/main/openssl/crypto/sha/asm/sha512-armv4.pl
index 3a35861a..7faf37b1 100644
--- a/main/openssl/crypto/sha/asm/sha512-armv4.pl
+++ b/main/openssl/crypto/sha/asm/sha512-armv4.pl
@@ -18,22 +18,33 @@
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 25.5 cycles or 47% faster than integer-only code.
+
# Byte order [in]dependence. =========================================
#
-# Caller is expected to maintain specific *dword* order in h[0-7],
-# namely with most significant dword at *lower* address, which is
-# reflected in below two parameters. *Byte* order within these dwords
-# in turn is whatever *native* byte order on current platform.
-$hi=0;
-$lo=4;
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
# ====================================================================
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
-$ctx="r0";
+$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
+
$Tlo="r3";
$Thi="r4";
$Alo="r5";
@@ -61,15 +72,17 @@ $Xoff=8*8;
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
- ldr $t2,[sp,#$Hoff+0] @ h.lo
- ldr $t3,[sp,#$Hoff+4] @ h.hi
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
+ str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
+ str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
+ ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
+ ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
@@ -96,25 +109,24 @@ $code.=<<___;
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
- ldr $t2,[$Ktbl,#4] @ K[i].lo
+ ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
- ldr $t3,[$Ktbl,#0] @ K[i].hi
+ ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
+ and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
+ ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
-
- and $t0,$t2,#0xff
teq $t0,#$magic
- orreq $Ktbl,$Ktbl,#1
- ldr $t2,[sp,#$Boff+0] @ b.lo
ldr $t3,[sp,#$Coff+0] @ c.lo
+ orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -131,80 +143,100 @@ $code.=<<___;
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
+ and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
- and $t0,$Alo,$t2
- orr $Alo,$Alo,$t2
ldr $t1,[sp,#$Boff+4] @ b.hi
+ orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
- orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
+ orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
- orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
adds $Alo,$Alo,$Tlo
- adc $Ahi,$Ahi,$Thi @ h += T
-
+ orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
+ adc $Ahi,$Ahi,$Thi @ h += T
+ tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
+#include "arm_arch.h"
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
.text
.code 32
.type K512,%object
.align 5
K512:
-.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
-.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
-.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
-.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
-.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
-.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
-.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
-.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
-.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
-.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
-.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
-.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
-.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
-.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
-.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
-.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
-.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
-.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
-.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
-.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
-.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
-.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
-.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
-.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
-.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
-.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
-.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
-.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
-.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
-.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
-.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
-.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
-.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
-.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
-.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
-.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
-.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
-.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
-.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
-.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha512_block_data_order
+.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#1
+ bne .LNEON
+#endif
stmdb sp!,{r4-r12,lr}
- sub $Ktbl,r3,#640 @ K512
+ sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@@ -238,6 +270,7 @@ sha512_block_data_order:
str $Thi,[sp,#$Foff+4]
.L00_15:
+#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
@@ -252,26 +285,30 @@ sha512_block_data_order:
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
- str $Tlo,[sp,#$Xoff+0]
- str $Thi,[sp,#$Xoff+4]
+#else
+ ldr $Tlo,[$inp,#4]
+ ldr $Thi,[$inp],#8
+#ifdef __ARMEL__
+ rev $Tlo,$Tlo
+ rev $Thi,$Thi
+#endif
+#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
- bic $Ktbl,$Ktbl,#1
-
-.L16_79:
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
- ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
- ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
-
+ bic $Ktbl,$Ktbl,#1
+.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
+ ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
+ ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
@@ -295,25 +332,24 @@ $code.=<<___;
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
+ ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
- ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
+ ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
- ldr $t0,[sp,#`$Xoff+8*16`+0]
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
- str $Tlo,[sp,#$Xoff+0]
- str $Thi,[sp,#$Xoff+4]
___
&BODY_00_15(0x17);
$code.=<<___;
- tst $Ktbl,#1
+ ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
+ ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
@@ -324,12 +360,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
- adc $t1,$Ahi,$t1
- adds $t2,$Tlo,$t2
- adc $t3,$Thi,$t3
str $t0, [$ctx,#$Aoff+$lo]
+ adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
+ adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
+ adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
@@ -341,12 +377,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
- adc $t1,$Ahi,$t1
- adds $t2,$Tlo,$t2
- adc $t3,$Thi,$t3
str $t0, [$ctx,#$Coff+$lo]
+ adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
+ adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
+ adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
@@ -356,12 +392,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
- adc $Ehi,$Ehi,$t1
- adds $t2,$Tlo,$t2
- adc $t3,$Thi,$t3
str $Elo,[$ctx,#$Eoff+$lo]
+ adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
+ adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
+ adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
@@ -373,12 +409,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
- adc $t1,$Ahi,$t1
- adds $t2,$Tlo,$t2
- adc $t3,$Thi,$t3
str $t0, [$ctx,#$Goff+$lo]
+ adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
+ adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
+ adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
@@ -388,13 +424,156 @@ $code.=<<___;
bne .Loop
add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size sha512_block_data_order,.-sha512_block_data_order
-.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+#endif
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
+
+$code.=<<___ if ($i<16 || $i&1);
+ vshr.u64 $t0,$e,#@Sigma1[0] @ $i
+#if $i<16
+ vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
+#endif
+ vshr.u64 $t1,$e,#@Sigma1[1]
+ vshr.u64 $t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+ vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
+ vsli.64 $t0,$e,#`64-@Sigma1[0]`
+ vsli.64 $t1,$e,#`64-@Sigma1[1]`
+ vsli.64 $t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+ vrev64.8 @X[$i],@X[$i]
+#endif
+ vadd.i64 $T1,$K,$h
+ veor $Ch,$f,$g
+ veor $t0,$t1
+ vand $Ch,$e
+ veor $t0,$t2 @ Sigma1(e)
+ veor $Ch,$g @ Ch(e,f,g)
+ vadd.i64 $T1,$t0
+ vshr.u64 $t0,$a,#@Sigma0[0]
+ vadd.i64 $T1,$Ch
+ vshr.u64 $t1,$a,#@Sigma0[1]
+ vshr.u64 $t2,$a,#@Sigma0[2]
+ vsli.64 $t0,$a,#`64-@Sigma0[0]`
+ vsli.64 $t1,$a,#`64-@Sigma0[1]`
+ vsli.64 $t2,$a,#`64-@Sigma0[2]`
+ vadd.i64 $T1,@X[$i%16]
+ vorr $Maj,$a,$c
+ vand $Ch,$a,$c
+ veor $h,$t0,$t1
+ vand $Maj,$b
+ veor $h,$t2 @ Sigma0(a)
+ vorr $Maj,$Ch @ Maj(a,b,c)
+ vadd.i64 $h,$T1
+ vadd.i64 $d,$T1
+ vadd.i64 $h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1) { &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7)); # view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
+my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
+my $e=@_[4]; # $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+ vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
+ vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
+ vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
+ vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
+ vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
+ vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
+ veor $s1,$t0
+ vshr.u64 $t0,$s0,#@sigma0[0]
+ veor $s1,$t1 @ sigma1(X[i+14])
+ vshr.u64 $t1,$s0,#@sigma0[1]
+ vadd.i64 @X[$i%8],$s1
+ vshr.u64 $s1,$s0,#@sigma0[2]
+ vsli.64 $t0,$s0,#`64-@sigma0[0]`
+ vsli.64 $t1,$s0,#`64-@sigma0[1]`
+ vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
+ veor $s1,$t0
+ vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
+ vadd.i64 @X[$i%8],$s0
+ vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
+ veor $s1,$t1 @ sigma0(X[i+1])
+ vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
+ vadd.i64 @X[$i%8],$s1
+___
+ &NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.align 4
+.LNEON:
+ dmb @ errata #451034 on early Cortex A8
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ sub $Ktbl,r3,#672 @ K512
+ vldmia $ctx,{$A-$H} @ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ mov $cnt,#4
+.L16_79_neon:
+ subs $cnt,#1
+___
+for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ bne .L16_79_neon
+
+ vldmia $ctx,{d24-d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia $ctx,{$A-$H} @ save context
+ teq $inp,$len
+ sub $Ktbl,#640 @ rewind K512
+ bne .Loop_neon
+
+ vldmia sp!,{d8-d15} @ epilogue
+ bx lr
+#endif
+___
+}
+$code.=<<___;
+.size sha512_block_data_order,.-sha512_block_data_order
+.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
+.comm OPENSSL_armcap_P,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
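
The new NEON path keeps the eight hash words in d16-d23 and computes the message schedule two rounds per trip through NEON_16_79, interleaving the start of the next round's Sigma1 (the "@ from NEON_00_15" lines), while the WORD64 macro makes K512 byte-order neutral by swapping dword order on little-endian builds. The scalar recurrence the vector code implements, as a reference sketch only; rotr, sigma0, sigma1 and expand are illustrative names:

#include <stdint.h>

static uint64_t rotr(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

static uint64_t sigma0(uint64_t x) { return rotr(x, 1) ^ rotr(x, 8) ^ (x >> 7); }
static uint64_t sigma1(uint64_t x) { return rotr(x, 19) ^ rotr(x, 61) ^ (x >> 6); }

/* W[0..15] come straight from the input block; the NEON code produces
 * two of the remaining 64 entries per iteration of .L16_79_neon. */
static void expand(uint64_t W[80])
{
    int i;
    for (i = 16; i < 80; i++)
        W[i] = W[i - 16] + sigma0(W[i - 15]) + W[i - 7] + sigma1(W[i - 2]);
}
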
diff --git a/main/openssl/crypto/sha/asm/sha512-armv4.s b/main/openssl/crypto/sha/asm/sha512-armv4.s
index b030c16c..57301922 100644
--- a/main/openssl/crypto/sha/asm/sha512-armv4.s
+++ b/main/openssl/crypto/sha/asm/sha512-armv4.s
@@ -1,90 +1,111 @@
+#include "arm_arch.h"
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
.text
.code 32
.type K512,%object
.align 5
K512:
-.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
-.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
-.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
-.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
-.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
-.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
-.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
-.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
-.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
-.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
-.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
-.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
-.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
-.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
-.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
-.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
-.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
-.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
-.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
-.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
-.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
-.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
-.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
-.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
-.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
-.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
-.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
-.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
-.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
-.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
-.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
-.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
-.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
-.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
-.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
-.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
-.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
-.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
-.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
-.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha512_block_data_order
+.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add r2,r1,r2,lsl#7 @ len to point at the end of inp
+#if __ARM_ARCH__>=7
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#1
+ bne .LNEON
+#endif
stmdb sp!,{r4-r12,lr}
- sub r14,r3,#640 @ K512
+ sub r14,r3,#672 @ K512
sub sp,sp,#9*8
- ldr r7,[r0,#32+4]
- ldr r8,[r0,#32+0]
- ldr r9, [r0,#48+4]
- ldr r10, [r0,#48+0]
- ldr r11, [r0,#56+4]
- ldr r12, [r0,#56+0]
+ ldr r7,[r0,#32+LO]
+ ldr r8,[r0,#32+HI]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
.Loop:
str r9, [sp,#48+0]
str r10, [sp,#48+4]
str r11, [sp,#56+0]
str r12, [sp,#56+4]
- ldr r5,[r0,#0+4]
- ldr r6,[r0,#0+0]
- ldr r3,[r0,#8+4]
- ldr r4,[r0,#8+0]
- ldr r9, [r0,#16+4]
- ldr r10, [r0,#16+0]
- ldr r11, [r0,#24+4]
- ldr r12, [r0,#24+0]
+ ldr r5,[r0,#0+LO]
+ ldr r6,[r0,#0+HI]
+ ldr r3,[r0,#8+LO]
+ ldr r4,[r0,#8+HI]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
str r3,[sp,#8+0]
str r4,[sp,#8+4]
str r9, [sp,#16+0]
str r10, [sp,#16+4]
str r11, [sp,#24+0]
str r12, [sp,#24+4]
- ldr r3,[r0,#40+4]
- ldr r4,[r0,#40+0]
+ ldr r3,[r0,#40+LO]
+ ldr r4,[r0,#40+HI]
str r3,[sp,#40+0]
str r4,[sp,#40+4]
.L00_15:
+#if __ARM_ARCH__<7
ldrb r3,[r1,#7]
ldrb r9, [r1,#6]
ldrb r10, [r1,#5]
@@ -99,17 +120,25 @@ sha512_block_data_order:
orr r4,r4,r12,lsl#8
orr r4,r4,r9,lsl#16
orr r4,r4,r10,lsl#24
- str r3,[sp,#64+0]
- str r4,[sp,#64+4]
- ldr r11,[sp,#56+0] @ h.lo
- ldr r12,[sp,#56+4] @ h.hi
+#else
+ ldr r3,[r1,#4]
+ ldr r4,[r1],#8
+#ifdef __ARMEL__
+ rev r3,r3
+ rev r4,r4
+#endif
+#endif
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
eor r9,r9,r7,lsr#18
eor r10,r10,r8,lsr#18
eor r9,r9,r8,lsl#14
@@ -136,25 +165,24 @@ sha512_block_data_order:
and r10,r10,r8
str r6,[sp,#0+4]
eor r9,r9,r11
- ldr r11,[r14,#4] @ K[i].lo
+ ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
- ldr r12,[r14,#0] @ K[i].hi
+ ldr r12,[r14,#HI] @ K[i].hi
adds r3,r3,r9
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adds r3,r3,r11
+ and r9,r11,#0xff
adc r4,r4,r12 @ T += K[i]
adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
-
- and r9,r11,#0xff
teq r9,#148
- orreq r14,r14,#1
- ldr r11,[sp,#8+0] @ b.lo
ldr r12,[sp,#16+0] @ c.lo
+ orreq r14,r14,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -171,38 +199,36 @@ sha512_block_data_order:
eor r9,r9,r5,lsl#25
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adds r3,r3,r9
+ and r9,r5,r11
adc r4,r4,r10 @ T += Sigma0(a)
- and r9,r5,r11
- orr r5,r5,r11
ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
ldr r11,[sp,#16+4] @ c.hi
and r5,r5,r12
- orr r5,r5,r9 @ Maj(a,b,c).lo
and r12,r6,r10
orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
and r6,r6,r11
- orr r6,r6,r12 @ Maj(a,b,c).hi
adds r5,r5,r3
- adc r6,r6,r4 @ h += T
-
+ orr r6,r6,r12 @ Maj(a,b,c).hi
sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
add r14,r14,#8
tst r14,#1
beq .L00_15
- bic r14,r14,#1
-
-.L16_79:
ldr r9,[sp,#184+0]
ldr r10,[sp,#184+4]
- ldr r11,[sp,#80+0]
- ldr r12,[sp,#80+4]
-
+ bic r14,r14,#1
+.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov r3,r9,lsr#1
+ ldr r11,[sp,#80+0]
mov r4,r10,lsr#1
+ ldr r12,[sp,#80+4]
eor r3,r3,r10,lsl#31
eor r4,r4,r9,lsl#31
eor r3,r3,r9,lsr#8
@@ -226,30 +252,30 @@ sha512_block_data_order:
eor r10,r10,r12,lsl#3
eor r9,r9,r11,lsr#6
eor r10,r10,r12,lsr#6
+ ldr r11,[sp,#120+0]
eor r9,r9,r12,lsl#26
- ldr r11,[sp,#120+0]
ldr r12,[sp,#120+4]
adds r3,r3,r9
+ ldr r9,[sp,#192+0]
adc r4,r4,r10
- ldr r9,[sp,#192+0]
ldr r10,[sp,#192+4]
adds r3,r3,r11
adc r4,r4,r12
adds r3,r3,r9
adc r4,r4,r10
- str r3,[sp,#64+0]
- str r4,[sp,#64+4]
- ldr r11,[sp,#56+0] @ h.lo
- ldr r12,[sp,#56+4] @ h.hi
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
eor r9,r9,r7,lsr#18
eor r10,r10,r8,lsr#18
eor r9,r9,r8,lsl#14
@@ -276,25 +302,24 @@ sha512_block_data_order:
and r10,r10,r8
str r6,[sp,#0+4]
eor r9,r9,r11
- ldr r11,[r14,#4] @ K[i].lo
+ ldr r11,[r14,#LO] @ K[i].lo
eor r10,r10,r12 @ Ch(e,f,g)
- ldr r12,[r14,#0] @ K[i].hi
+ ldr r12,[r14,#HI] @ K[i].hi
adds r3,r3,r9
ldr r7,[sp,#24+0] @ d.lo
adc r4,r4,r10 @ T += Ch(e,f,g)
ldr r8,[sp,#24+4] @ d.hi
adds r3,r3,r11
+ and r9,r11,#0xff
adc r4,r4,r12 @ T += K[i]
adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
adc r8,r8,r4 @ d += T
-
- and r9,r11,#0xff
teq r9,#23
- orreq r14,r14,#1
- ldr r11,[sp,#8+0] @ b.lo
ldr r12,[sp,#16+0] @ c.lo
+ orreq r14,r14,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -311,90 +336,91 @@ sha512_block_data_order:
eor r9,r9,r5,lsl#25
eor r10,r10,r6,lsl#25 @ Sigma0(a)
adds r3,r3,r9
+ and r9,r5,r11
adc r4,r4,r10 @ T += Sigma0(a)
- and r9,r5,r11
- orr r5,r5,r11
ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
ldr r11,[sp,#16+4] @ c.hi
and r5,r5,r12
- orr r5,r5,r9 @ Maj(a,b,c).lo
and r12,r6,r10
orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
and r6,r6,r11
- orr r6,r6,r12 @ Maj(a,b,c).hi
adds r5,r5,r3
- adc r6,r6,r4 @ h += T
-
+ orr r6,r6,r12 @ Maj(a,b,c).hi
sub sp,sp,#8
- add r14,r14,#8
+ adc r6,r6,r4 @ h += T
tst r14,#1
+ add r14,r14,#8
+ ldreq r9,[sp,#184+0]
+ ldreq r10,[sp,#184+4]
beq .L16_79
bic r14,r14,#1
ldr r3,[sp,#8+0]
ldr r4,[sp,#8+4]
- ldr r9, [r0,#0+4]
- ldr r10, [r0,#0+0]
- ldr r11, [r0,#8+4]
- ldr r12, [r0,#8+0]
+ ldr r9, [r0,#0+LO]
+ ldr r10, [r0,#0+HI]
+ ldr r11, [r0,#8+LO]
+ ldr r12, [r0,#8+HI]
adds r9,r5,r9
+ str r9, [r0,#0+LO]
adc r10,r6,r10
+ str r10, [r0,#0+HI]
adds r11,r3,r11
+ str r11, [r0,#8+LO]
adc r12,r4,r12
- str r9, [r0,#0+4]
- str r10, [r0,#0+0]
- str r11, [r0,#8+4]
- str r12, [r0,#8+0]
+ str r12, [r0,#8+HI]
ldr r5,[sp,#16+0]
ldr r6,[sp,#16+4]
ldr r3,[sp,#24+0]
ldr r4,[sp,#24+4]
- ldr r9, [r0,#16+4]
- ldr r10, [r0,#16+0]
- ldr r11, [r0,#24+4]
- ldr r12, [r0,#24+0]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
adds r9,r5,r9
+ str r9, [r0,#16+LO]
adc r10,r6,r10
+ str r10, [r0,#16+HI]
adds r11,r3,r11
+ str r11, [r0,#24+LO]
adc r12,r4,r12
- str r9, [r0,#16+4]
- str r10, [r0,#16+0]
- str r11, [r0,#24+4]
- str r12, [r0,#24+0]
+ str r12, [r0,#24+HI]
ldr r3,[sp,#40+0]
ldr r4,[sp,#40+4]
- ldr r9, [r0,#32+4]
- ldr r10, [r0,#32+0]
- ldr r11, [r0,#40+4]
- ldr r12, [r0,#40+0]
+ ldr r9, [r0,#32+LO]
+ ldr r10, [r0,#32+HI]
+ ldr r11, [r0,#40+LO]
+ ldr r12, [r0,#40+HI]
adds r7,r7,r9
+ str r7,[r0,#32+LO]
adc r8,r8,r10
+ str r8,[r0,#32+HI]
adds r11,r3,r11
+ str r11, [r0,#40+LO]
adc r12,r4,r12
- str r7,[r0,#32+4]
- str r8,[r0,#32+0]
- str r11, [r0,#40+4]
- str r12, [r0,#40+0]
+ str r12, [r0,#40+HI]
ldr r5,[sp,#48+0]
ldr r6,[sp,#48+4]
ldr r3,[sp,#56+0]
ldr r4,[sp,#56+4]
- ldr r9, [r0,#48+4]
- ldr r10, [r0,#48+0]
- ldr r11, [r0,#56+4]
- ldr r12, [r0,#56+0]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
adds r9,r5,r9
+ str r9, [r0,#48+LO]
adc r10,r6,r10
+ str r10, [r0,#48+HI]
adds r11,r3,r11
+ str r11, [r0,#56+LO]
adc r12,r4,r12
- str r9, [r0,#48+4]
- str r10, [r0,#48+0]
- str r11, [r0,#56+4]
- str r12, [r0,#56+0]
+ str r12, [r0,#56+HI]
add sp,sp,#640
sub r14,r14,#640
@@ -403,10 +429,1355 @@ sha512_block_data_order:
bne .Loop
add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-.size sha512_block_data_order,.-sha512_block_data_order
-.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
+#endif
+#if __ARM_ARCH__>=7
+.fpu neon
+
+.align 4
+.LNEON:
+ dmb @ errata #451034 on early Cortex A8
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ sub r3,r3,#672 @ K512
+ vldmia r0,{d16-d23} @ load context
+.Loop_neon:
+ vshr.u64 d24,d20,#14 @ 0
+#if 0<16
+ vld1.64 {d0},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vsli.64 d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+ vrev64.8 d0,d0
+#endif
+ vadd.i64 d27,d28,d23
+ veor d29,d21,d22
+ veor d24,d25
+ vand d29,d20
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d22 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d16,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d16,#34
+ vshr.u64 d26,d16,#39
+ vsli.64 d24,d16,#36
+ vsli.64 d25,d16,#30
+ vsli.64 d26,d16,#25
+ vadd.i64 d27,d0
+ vorr d30,d16,d18
+ vand d29,d16,d18
+ veor d23,d24,d25
+ vand d30,d17
+ veor d23,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d23,d27
+ vadd.i64 d19,d27
+ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 1
+#if 1<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vsli.64 d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+ vrev64.8 d1,d1
+#endif
+ vadd.i64 d27,d28,d22
+ veor d29,d20,d21
+ veor d24,d25
+ vand d29,d19
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d21 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d23,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d23,#34
+ vshr.u64 d26,d23,#39
+ vsli.64 d24,d23,#36
+ vsli.64 d25,d23,#30
+ vsli.64 d26,d23,#25
+ vadd.i64 d27,d1
+ vorr d30,d23,d17
+ vand d29,d23,d17
+ veor d22,d24,d25
+ vand d30,d16
+ veor d22,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d22,d27
+ vadd.i64 d18,d27
+ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 2
+#if 2<16
+ vld1.64 {d2},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vsli.64 d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+ vrev64.8 d2,d2
+#endif
+ vadd.i64 d27,d28,d21
+ veor d29,d19,d20
+ veor d24,d25
+ vand d29,d18
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d20 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d22,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d22,#34
+ vshr.u64 d26,d22,#39
+ vsli.64 d24,d22,#36
+ vsli.64 d25,d22,#30
+ vsli.64 d26,d22,#25
+ vadd.i64 d27,d2
+ vorr d30,d22,d16
+ vand d29,d22,d16
+ veor d21,d24,d25
+ vand d30,d23
+ veor d21,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d21,d27
+ vadd.i64 d17,d27
+ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 3
+#if 3<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vsli.64 d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+ vrev64.8 d3,d3
+#endif
+ vadd.i64 d27,d28,d20
+ veor d29,d18,d19
+ veor d24,d25
+ vand d29,d17
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d19 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d21,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d21,#34
+ vshr.u64 d26,d21,#39
+ vsli.64 d24,d21,#36
+ vsli.64 d25,d21,#30
+ vsli.64 d26,d21,#25
+ vadd.i64 d27,d3
+ vorr d30,d21,d23
+ vand d29,d21,d23
+ veor d20,d24,d25
+ vand d30,d22
+ veor d20,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d20,d27
+ vadd.i64 d16,d27
+ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 4
+#if 4<16
+ vld1.64 {d4},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vsli.64 d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+ vrev64.8 d4,d4
+#endif
+ vadd.i64 d27,d28,d19
+ veor d29,d17,d18
+ veor d24,d25
+ vand d29,d16
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d18 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d20,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d20,#34
+ vshr.u64 d26,d20,#39
+ vsli.64 d24,d20,#36
+ vsli.64 d25,d20,#30
+ vsli.64 d26,d20,#25
+ vadd.i64 d27,d4
+ vorr d30,d20,d22
+ vand d29,d20,d22
+ veor d19,d24,d25
+ vand d30,d21
+ veor d19,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d19,d27
+ vadd.i64 d23,d27
+ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 5
+#if 5<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vsli.64 d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+ vrev64.8 d5,d5
+#endif
+ vadd.i64 d27,d28,d18
+ veor d29,d16,d17
+ veor d24,d25
+ vand d29,d23
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d17 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d19,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d19,#34
+ vshr.u64 d26,d19,#39
+ vsli.64 d24,d19,#36
+ vsli.64 d25,d19,#30
+ vsli.64 d26,d19,#25
+ vadd.i64 d27,d5
+ vorr d30,d19,d21
+ vand d29,d19,d21
+ veor d18,d24,d25
+ vand d30,d20
+ veor d18,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d18,d27
+ vadd.i64 d22,d27
+ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 6
+#if 6<16
+ vld1.64 {d6},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vsli.64 d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+ vrev64.8 d6,d6
+#endif
+ vadd.i64 d27,d28,d17
+ veor d29,d23,d16
+ veor d24,d25
+ vand d29,d22
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d16 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d18,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d18,#34
+ vshr.u64 d26,d18,#39
+ vsli.64 d24,d18,#36
+ vsli.64 d25,d18,#30
+ vsli.64 d26,d18,#25
+ vadd.i64 d27,d6
+ vorr d30,d18,d20
+ vand d29,d18,d20
+ veor d17,d24,d25
+ vand d30,d19
+ veor d17,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d17,d27
+ vadd.i64 d21,d27
+ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 7
+#if 7<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vsli.64 d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+ vrev64.8 d7,d7
+#endif
+ vadd.i64 d27,d28,d16
+ veor d29,d22,d23
+ veor d24,d25
+ vand d29,d21
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d23 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d17,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d17,#34
+ vshr.u64 d26,d17,#39
+ vsli.64 d24,d17,#36
+ vsli.64 d25,d17,#30
+ vsli.64 d26,d17,#25
+ vadd.i64 d27,d7
+ vorr d30,d17,d19
+ vand d29,d17,d19
+ veor d16,d24,d25
+ vand d30,d18
+ veor d16,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d16,d27
+ vadd.i64 d20,d27
+ vadd.i64 d16,d30
+ vshr.u64 d24,d20,#14 @ 8
+#if 8<16
+ vld1.64 {d8},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vsli.64 d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+ vrev64.8 d8,d8
+#endif
+ vadd.i64 d27,d28,d23
+ veor d29,d21,d22
+ veor d24,d25
+ vand d29,d20
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d22 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d16,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d16,#34
+ vshr.u64 d26,d16,#39
+ vsli.64 d24,d16,#36
+ vsli.64 d25,d16,#30
+ vsli.64 d26,d16,#25
+ vadd.i64 d27,d8
+ vorr d30,d16,d18
+ vand d29,d16,d18
+ veor d23,d24,d25
+ vand d30,d17
+ veor d23,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d23,d27
+ vadd.i64 d19,d27
+ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 9
+#if 9<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vsli.64 d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+ vrev64.8 d9,d9
+#endif
+ vadd.i64 d27,d28,d22
+ veor d29,d20,d21
+ veor d24,d25
+ vand d29,d19
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d21 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d23,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d23,#34
+ vshr.u64 d26,d23,#39
+ vsli.64 d24,d23,#36
+ vsli.64 d25,d23,#30
+ vsli.64 d26,d23,#25
+ vadd.i64 d27,d9
+ vorr d30,d23,d17
+ vand d29,d23,d17
+ veor d22,d24,d25
+ vand d30,d16
+ veor d22,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d22,d27
+ vadd.i64 d18,d27
+ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 10
+#if 10<16
+ vld1.64 {d10},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vsli.64 d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+ vrev64.8 d10,d10
+#endif
+ vadd.i64 d27,d28,d21
+ veor d29,d19,d20
+ veor d24,d25
+ vand d29,d18
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d20 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d22,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d22,#34
+ vshr.u64 d26,d22,#39
+ vsli.64 d24,d22,#36
+ vsli.64 d25,d22,#30
+ vsli.64 d26,d22,#25
+ vadd.i64 d27,d10
+ vorr d30,d22,d16
+ vand d29,d22,d16
+ veor d21,d24,d25
+ vand d30,d23
+ veor d21,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d21,d27
+ vadd.i64 d17,d27
+ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 11
+#if 11<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vsli.64 d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+ vrev64.8 d11,d11
+#endif
+ vadd.i64 d27,d28,d20
+ veor d29,d18,d19
+ veor d24,d25
+ vand d29,d17
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d19 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d21,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d21,#34
+ vshr.u64 d26,d21,#39
+ vsli.64 d24,d21,#36
+ vsli.64 d25,d21,#30
+ vsli.64 d26,d21,#25
+ vadd.i64 d27,d11
+ vorr d30,d21,d23
+ vand d29,d21,d23
+ veor d20,d24,d25
+ vand d30,d22
+ veor d20,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d20,d27
+ vadd.i64 d16,d27
+ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 12
+#if 12<16
+ vld1.64 {d12},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vsli.64 d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+ vrev64.8 d12,d12
+#endif
+ vadd.i64 d27,d28,d19
+ veor d29,d17,d18
+ veor d24,d25
+ vand d29,d16
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d18 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d20,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d20,#34
+ vshr.u64 d26,d20,#39
+ vsli.64 d24,d20,#36
+ vsli.64 d25,d20,#30
+ vsli.64 d26,d20,#25
+ vadd.i64 d27,d12
+ vorr d30,d20,d22
+ vand d29,d20,d22
+ veor d19,d24,d25
+ vand d30,d21
+ veor d19,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d19,d27
+ vadd.i64 d23,d27
+ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 13
+#if 13<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vsli.64 d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+ vrev64.8 d13,d13
+#endif
+ vadd.i64 d27,d28,d18
+ veor d29,d16,d17
+ veor d24,d25
+ vand d29,d23
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d17 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d19,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d19,#34
+ vshr.u64 d26,d19,#39
+ vsli.64 d24,d19,#36
+ vsli.64 d25,d19,#30
+ vsli.64 d26,d19,#25
+ vadd.i64 d27,d13
+ vorr d30,d19,d21
+ vand d29,d19,d21
+ veor d18,d24,d25
+ vand d30,d20
+ veor d18,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d18,d27
+ vadd.i64 d22,d27
+ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 14
+#if 14<16
+ vld1.64 {d14},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vsli.64 d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+ vrev64.8 d14,d14
+#endif
+ vadd.i64 d27,d28,d17
+ veor d29,d23,d16
+ veor d24,d25
+ vand d29,d22
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d16 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d18,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d18,#34
+ vshr.u64 d26,d18,#39
+ vsli.64 d24,d18,#36
+ vsli.64 d25,d18,#30
+ vsli.64 d26,d18,#25
+ vadd.i64 d27,d14
+ vorr d30,d18,d20
+ vand d29,d18,d20
+ veor d17,d24,d25
+ vand d30,d19
+ veor d17,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d17,d27
+ vadd.i64 d21,d27
+ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 15
+#if 15<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vsli.64 d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+ vrev64.8 d15,d15
+#endif
+ vadd.i64 d27,d28,d16
+ veor d29,d22,d23
+ veor d24,d25
+ vand d29,d21
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d23 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d17,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d17,#34
+ vshr.u64 d26,d17,#39
+ vsli.64 d24,d17,#36
+ vsli.64 d25,d17,#30
+ vsli.64 d26,d17,#25
+ vadd.i64 d27,d15
+ vorr d30,d17,d19
+ vand d29,d17,d19
+ veor d16,d24,d25
+ vand d30,d18
+ veor d16,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d16,d27
+ vadd.i64 d20,d27
+ vadd.i64 d16,d30
+ mov r12,#4
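+ @ rounds 16..79: four passes through the 16-round body below, with
+ @ the message schedule updated two 64-bit lanes at a time in q regs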
+.L16_79_neon:
+ subs r12,#1
+ vshr.u64 q12,q7,#19
+ vshr.u64 q13,q7,#61
+ vshr.u64 q15,q7,#6
+ vsli.64 q12,q7,#45
+ vext.8 q14,q0,q1,#8 @ X[i+1]
+ vsli.64 q13,q7,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q0,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q4,q5,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q0,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q0,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vsli.64 d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d23
+ veor d29,d21,d22
+ veor d24,d25
+ vand d29,d20
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d22 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d16,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d16,#34
+ vshr.u64 d26,d16,#39
+ vsli.64 d24,d16,#36
+ vsli.64 d25,d16,#30
+ vsli.64 d26,d16,#25
+ vadd.i64 d27,d0
+ vorr d30,d16,d18
+ vand d29,d16,d18
+ veor d23,d24,d25
+ vand d30,d17
+ veor d23,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d23,d27
+ vadd.i64 d19,d27
+ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 17
+#if 17<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vsli.64 d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d22
+ veor d29,d20,d21
+ veor d24,d25
+ vand d29,d19
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d21 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d23,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d23,#34
+ vshr.u64 d26,d23,#39
+ vsli.64 d24,d23,#36
+ vsli.64 d25,d23,#30
+ vsli.64 d26,d23,#25
+ vadd.i64 d27,d1
+ vorr d30,d23,d17
+ vand d29,d23,d17
+ veor d22,d24,d25
+ vand d30,d16
+ veor d22,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d22,d27
+ vadd.i64 d18,d27
+ vadd.i64 d22,d30
+ vshr.u64 q12,q0,#19
+ vshr.u64 q13,q0,#61
+ vshr.u64 q15,q0,#6
+ vsli.64 q12,q0,#45
+ vext.8 q14,q1,q2,#8 @ X[i+1]
+ vsli.64 q13,q0,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q1,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q5,q6,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q1,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q1,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vsli.64 d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d21
+ veor d29,d19,d20
+ veor d24,d25
+ vand d29,d18
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d20 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d22,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d22,#34
+ vshr.u64 d26,d22,#39
+ vsli.64 d24,d22,#36
+ vsli.64 d25,d22,#30
+ vsli.64 d26,d22,#25
+ vadd.i64 d27,d2
+ vorr d30,d22,d16
+ vand d29,d22,d16
+ veor d21,d24,d25
+ vand d30,d23
+ veor d21,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d21,d27
+ vadd.i64 d17,d27
+ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 19
+#if 19<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vsli.64 d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d20
+ veor d29,d18,d19
+ veor d24,d25
+ vand d29,d17
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d19 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d21,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d21,#34
+ vshr.u64 d26,d21,#39
+ vsli.64 d24,d21,#36
+ vsli.64 d25,d21,#30
+ vsli.64 d26,d21,#25
+ vadd.i64 d27,d3
+ vorr d30,d21,d23
+ vand d29,d21,d23
+ veor d20,d24,d25
+ vand d30,d22
+ veor d20,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d20,d27
+ vadd.i64 d16,d27
+ vadd.i64 d20,d30
+ vshr.u64 q12,q1,#19
+ vshr.u64 q13,q1,#61
+ vshr.u64 q15,q1,#6
+ vsli.64 q12,q1,#45
+ vext.8 q14,q2,q3,#8 @ X[i+1]
+ vsli.64 q13,q1,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q2,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q6,q7,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q2,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q2,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vsli.64 d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d19
+ veor d29,d17,d18
+ veor d24,d25
+ vand d29,d16
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d18 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d20,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d20,#34
+ vshr.u64 d26,d20,#39
+ vsli.64 d24,d20,#36
+ vsli.64 d25,d20,#30
+ vsli.64 d26,d20,#25
+ vadd.i64 d27,d4
+ vorr d30,d20,d22
+ vand d29,d20,d22
+ veor d19,d24,d25
+ vand d30,d21
+ veor d19,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d19,d27
+ vadd.i64 d23,d27
+ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 21
+#if 21<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vsli.64 d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d18
+ veor d29,d16,d17
+ veor d24,d25
+ vand d29,d23
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d17 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d19,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d19,#34
+ vshr.u64 d26,d19,#39
+ vsli.64 d24,d19,#36
+ vsli.64 d25,d19,#30
+ vsli.64 d26,d19,#25
+ vadd.i64 d27,d5
+ vorr d30,d19,d21
+ vand d29,d19,d21
+ veor d18,d24,d25
+ vand d30,d20
+ veor d18,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d18,d27
+ vadd.i64 d22,d27
+ vadd.i64 d18,d30
+ vshr.u64 q12,q2,#19
+ vshr.u64 q13,q2,#61
+ vshr.u64 q15,q2,#6
+ vsli.64 q12,q2,#45
+ vext.8 q14,q3,q4,#8 @ X[i+1]
+ vsli.64 q13,q2,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q3,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q7,q0,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q3,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q3,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vsli.64 d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d17
+ veor d29,d23,d16
+ veor d24,d25
+ vand d29,d22
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d16 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d18,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d18,#34
+ vshr.u64 d26,d18,#39
+ vsli.64 d24,d18,#36
+ vsli.64 d25,d18,#30
+ vsli.64 d26,d18,#25
+ vadd.i64 d27,d6
+ vorr d30,d18,d20
+ vand d29,d18,d20
+ veor d17,d24,d25
+ vand d30,d19
+ veor d17,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d17,d27
+ vadd.i64 d21,d27
+ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 23
+#if 23<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vsli.64 d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d16
+ veor d29,d22,d23
+ veor d24,d25
+ vand d29,d21
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d23 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d17,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d17,#34
+ vshr.u64 d26,d17,#39
+ vsli.64 d24,d17,#36
+ vsli.64 d25,d17,#30
+ vsli.64 d26,d17,#25
+ vadd.i64 d27,d7
+ vorr d30,d17,d19
+ vand d29,d17,d19
+ veor d16,d24,d25
+ vand d30,d18
+ veor d16,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d16,d27
+ vadd.i64 d20,d27
+ vadd.i64 d16,d30
+ vshr.u64 q12,q3,#19
+ vshr.u64 q13,q3,#61
+ vshr.u64 q15,q3,#6
+ vsli.64 q12,q3,#45
+ vext.8 q14,q4,q5,#8 @ X[i+1]
+ vsli.64 q13,q3,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q4,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q0,q1,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q4,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q4,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vsli.64 d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d23
+ veor d29,d21,d22
+ veor d24,d25
+ vand d29,d20
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d22 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d16,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d16,#34
+ vshr.u64 d26,d16,#39
+ vsli.64 d24,d16,#36
+ vsli.64 d25,d16,#30
+ vsli.64 d26,d16,#25
+ vadd.i64 d27,d8
+ vorr d30,d16,d18
+ vand d29,d16,d18
+ veor d23,d24,d25
+ vand d30,d17
+ veor d23,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d23,d27
+ vadd.i64 d19,d27
+ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 25
+#if 25<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vsli.64 d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d22
+ veor d29,d20,d21
+ veor d24,d25
+ vand d29,d19
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d21 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d23,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d23,#34
+ vshr.u64 d26,d23,#39
+ vsli.64 d24,d23,#36
+ vsli.64 d25,d23,#30
+ vsli.64 d26,d23,#25
+ vadd.i64 d27,d9
+ vorr d30,d23,d17
+ vand d29,d23,d17
+ veor d22,d24,d25
+ vand d30,d16
+ veor d22,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d22,d27
+ vadd.i64 d18,d27
+ vadd.i64 d22,d30
+ vshr.u64 q12,q4,#19
+ vshr.u64 q13,q4,#61
+ vshr.u64 q15,q4,#6
+ vsli.64 q12,q4,#45
+ vext.8 q14,q5,q6,#8 @ X[i+1]
+ vsli.64 q13,q4,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q5,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q1,q2,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q5,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q5,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vsli.64 d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d21
+ veor d29,d19,d20
+ veor d24,d25
+ vand d29,d18
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d20 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d22,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d22,#34
+ vshr.u64 d26,d22,#39
+ vsli.64 d24,d22,#36
+ vsli.64 d25,d22,#30
+ vsli.64 d26,d22,#25
+ vadd.i64 d27,d10
+ vorr d30,d22,d16
+ vand d29,d22,d16
+ veor d21,d24,d25
+ vand d30,d23
+ veor d21,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d21,d27
+ vadd.i64 d17,d27
+ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 27
+#if 27<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vsli.64 d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d20
+ veor d29,d18,d19
+ veor d24,d25
+ vand d29,d17
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d19 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d21,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d21,#34
+ vshr.u64 d26,d21,#39
+ vsli.64 d24,d21,#36
+ vsli.64 d25,d21,#30
+ vsli.64 d26,d21,#25
+ vadd.i64 d27,d11
+ vorr d30,d21,d23
+ vand d29,d21,d23
+ veor d20,d24,d25
+ vand d30,d22
+ veor d20,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d20,d27
+ vadd.i64 d16,d27
+ vadd.i64 d20,d30
+ vshr.u64 q12,q5,#19
+ vshr.u64 q13,q5,#61
+ vshr.u64 q15,q5,#6
+ vsli.64 q12,q5,#45
+ vext.8 q14,q6,q7,#8 @ X[i+1]
+ vsli.64 q13,q5,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q6,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q2,q3,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q6,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q6,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vsli.64 d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d19
+ veor d29,d17,d18
+ veor d24,d25
+ vand d29,d16
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d18 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d20,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d20,#34
+ vshr.u64 d26,d20,#39
+ vsli.64 d24,d20,#36
+ vsli.64 d25,d20,#30
+ vsli.64 d26,d20,#25
+ vadd.i64 d27,d12
+ vorr d30,d20,d22
+ vand d29,d20,d22
+ veor d19,d24,d25
+ vand d30,d21
+ veor d19,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d19,d27
+ vadd.i64 d23,d27
+ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 29
+#if 29<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vsli.64 d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d18
+ veor d29,d16,d17
+ veor d24,d25
+ vand d29,d23
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d17 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d19,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d19,#34
+ vshr.u64 d26,d19,#39
+ vsli.64 d24,d19,#36
+ vsli.64 d25,d19,#30
+ vsli.64 d26,d19,#25
+ vadd.i64 d27,d13
+ vorr d30,d19,d21
+ vand d29,d19,d21
+ veor d18,d24,d25
+ vand d30,d20
+ veor d18,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d18,d27
+ vadd.i64 d22,d27
+ vadd.i64 d18,d30
+ vshr.u64 q12,q6,#19
+ vshr.u64 q13,q6,#61
+ vshr.u64 q15,q6,#6
+ vsli.64 q12,q6,#45
+ vext.8 q14,q7,q0,#8 @ X[i+1]
+ vsli.64 q13,q6,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q7,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q3,q4,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q7,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q7,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vsli.64 d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d17
+ veor d29,d23,d16
+ veor d24,d25
+ vand d29,d22
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d16 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d18,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d18,#34
+ vshr.u64 d26,d18,#39
+ vsli.64 d24,d18,#36
+ vsli.64 d25,d18,#30
+ vsli.64 d26,d18,#25
+ vadd.i64 d27,d14
+ vorr d30,d18,d20
+ vand d29,d18,d20
+ veor d17,d24,d25
+ vand d30,d19
+ veor d17,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d17,d27
+ vadd.i64 d21,d27
+ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 31
+#if 31<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vsli.64 d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ vadd.i64 d27,d28,d16
+ veor d29,d22,d23
+ veor d24,d25
+ vand d29,d21
+ veor d24,d26 @ Sigma1(e)
+ veor d29,d23 @ Ch(e,f,g)
+ vadd.i64 d27,d24
+ vshr.u64 d24,d17,#28
+ vadd.i64 d27,d29
+ vshr.u64 d25,d17,#34
+ vshr.u64 d26,d17,#39
+ vsli.64 d24,d17,#36
+ vsli.64 d25,d17,#30
+ vsli.64 d26,d17,#25
+ vadd.i64 d27,d15
+ vorr d30,d17,d19
+ vand d29,d17,d19
+ veor d16,d24,d25
+ vand d30,d18
+ veor d16,d26 @ Sigma0(a)
+ vorr d30,d29 @ Maj(a,b,c)
+ vadd.i64 d16,d27
+ vadd.i64 d20,d27
+ vadd.i64 d16,d30
+ bne .L16_79_neon
+
+ vldmia r0,{d24-d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia r0,{d16-d23} @ save context
+ teq r1,r2
+ sub r3,#640 @ rewind K512
+ bne .Loop_neon
+
+ vldmia sp!,{d8-d15} @ epilogue
+ .word 0xe12fff1e @ "bx lr"
+#endif
+.size sha512_block_data_order,.-sha512_block_data_order
+.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 2
+.comm OPENSSL_armcap_P,4,4
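
The scalar ARMv4 path above never uses a 64-bit register: every SHA-512
quantity lives in a lo/hi pair of 32-bit registers, and the "@ LO" / "@ HI"
comments before each round record how each 64-bit rotation in Sigma1 splits
into 32-bit shift pairs (a rotation by 41 = 32+9 swaps the halves first).
Below is a small self-checking C sketch of that identity; the names are
illustrative, not part of the patch:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t rotr64(uint64_t x, unsigned n)
    {
        return x >> n | x << (64 - n);      /* n is 14, 18 or 41 here */
    }

    /* Sigma1 as specified for SHA-512 (FIPS 180-4) */
    static uint64_t Sigma1(uint64_t x)
    {
        return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41);
    }

    /* the same value computed the way the ARMv4 code does it: from the
     * 32-bit halves, using exactly the shift pairs listed in the
     * "@ LO" / "@ HI" comments */
    static uint64_t Sigma1_split(uint32_t hi, uint32_t lo)
    {
        uint32_t rlo = (lo >> 14 ^ hi << 18) ^ (lo >> 18 ^ hi << 14)
                     ^ (hi >> 9 ^ lo << 23);
        uint32_t rhi = (hi >> 14 ^ lo << 18) ^ (hi >> 18 ^ lo << 14)
                     ^ (lo >> 9 ^ hi << 23);
        return (uint64_t)rhi << 32 | rlo;
    }

    int main(void)
    {
        uint64_t e = 0x510e527fade682d1ULL; /* SHA-512 initial e */
        assert(Sigma1(e) == Sigma1_split((uint32_t)(e >> 32), (uint32_t)e));
        printf("split and 64-bit forms agree\n");
        return 0;
    }

Because the two members of each shift pair cover disjoint bit ranges, the
assembly is free to merge them with eor instead of orr and fold the whole
Sigma chain into one run of XORs, which is what the eor sequences after each
"mov r9,r7,lsr#14" do.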
diff --git a/main/openssl/crypto/sha/asm/sha512-mips.pl b/main/openssl/crypto/sha/asm/sha512-mips.pl
new file mode 100644
index 00000000..ffa053bb
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha512-mips.pl
@@ -0,0 +1,455 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA2 block procedures for MIPS.
+
+# October 2010.
+#
+# SHA256 performance improvement on a MIPS R5000 CPU is ~27% over gcc-
+# generated code in an o32 build and ~55% in an n32/64 build. The
+# SHA512 improvement [SHA512 can for now only be compiled for the
+# MIPS64 ISA] is a more modest ~17%, but it comes for free, because it
+# is the same instruction sequence. Improvement coefficients are for
+# aligned input.
+
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. The following coding rules
+# facilitate interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+# excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference, here is the register layout for the N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+ $PTR_ADD="dadd"; # incidentally works even on n32
+ $PTR_SUB="dsub"; # incidentally works even on n32
+ $REG_S="sd";
+ $REG_L="ld";
+ $PTR_SLL="dsll"; # incidentally works even on n32
+ $SZREG=8;
+} else {
+ $PTR_ADD="add";
+ $PTR_SUB="sub";
+ $REG_S="sw";
+ $REG_L="lw";
+ $PTR_SLL="sll";
+ $SZREG=4;
+}
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
+
+if ($output =~ /512/) {
+ $label="512";
+ $SZ=8;
+ $LD="ld"; # load from memory
+ $ST="sd"; # store to memory
+ $SLL="dsll"; # shift left logical
+ $SRL="dsrl"; # shift right logical
+ $ADDU="daddu";
+ @Sigma0=(28,34,39);
+ @Sigma1=(14,18,41);
+ @sigma0=( 7, 1, 8); # right shift first
+ @sigma1=( 6,19,61); # right shift first
+ $lastK=0x817;
+ $rounds=80;
+} else {
+ $label="256";
+ $SZ=4;
+ $LD="lw"; # load from memory
+ $ST="sw"; # store to memory
+ $SLL="sll"; # shift left logical
+ $SRL="srl"; # shift right logical
+ $ADDU="addu";
+ @Sigma0=( 2,13,22);
+ @Sigma1=( 6,11,25);
+ @sigma0=( 3, 7,18); # right shift first
+ @sigma1=(10,17,19); # right shift first
+ $lastK=0x8f2;
+ $rounds=64;
+}
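+# (one script emits either hash: the parameter set above is selected by
+# whether the output file name contains "512")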
+
+$MSB = $big_endian ? 0 : ($SZ-1);
+$LSB = ($SZ-1)&~$MSB;
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
+@X=map("\$$_",(8..23));
+
+$ctx=$a0;
+$inp=$a1;
+$len=$a2; $Ktbl=$len;
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
+
+$code.=<<___ if ($i<15);
+ ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
+ ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
+___
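+# the emitted left/right load pair (lwl/lwr, or ldl/ldr for SHA512)
+# performs one possibly unaligned load of the next message word X[i+1]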
+$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
+ srl $tmp0,@X[0],24 # byte swap($i)
+ srl $tmp1,@X[0],8
+ andi $tmp2,@X[0],0xFF00
+ sll @X[0],@X[0],24
+ andi $tmp1,0xFF00
+ sll $tmp2,$tmp2,8
+ or @X[0],$tmp0
+ or $tmp1,$tmp2
+ or @X[0],$tmp1
+___
+$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+ and $tmp1,@X[0],$tmp0 # byte swap($i)
+ dsrl $tmp2,@X[0],24
+ dsll $tmp1,24
+ and $tmp2,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ and $tmp2,@X[0],$tmp0
+ dsrl @X[0],8
+ dsll $tmp2,8
+ and @X[0],$tmp0
+ or $tmp1,$tmp2
+ or @X[0],$tmp1
+ dsrl $tmp1,@X[0],32
+ dsll @X[0],32
+ or @X[0],$tmp1
+___
+$code.=<<___;
+ $ADDU $T1,@X[0],$h # $i
+ $SRL $h,$e,@Sigma1[0]
+ xor $tmp2,$f,$g
+ $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
+ and $tmp2,$e
+ $SRL $tmp0,$e,@Sigma1[1]
+ xor $h,$tmp1
+ $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
+ xor $h,$tmp0
+ $SRL $tmp0,$e,@Sigma1[2]
+ xor $h,$tmp1
+ $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
+ xor $h,$tmp0
+ xor $tmp2,$g # Ch(e,f,g)
+ xor $tmp0,$tmp1,$h # Sigma1(e)
+
+ $SRL $h,$a,@Sigma0[0]
+ $ADDU $T1,$tmp2
+ $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
+ $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
+ $ADDU $T1,$tmp0
+ $SRL $tmp0,$a,@Sigma0[1]
+ xor $h,$tmp1
+ $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
+ xor $h,$tmp0
+ $SRL $tmp0,$a,@Sigma0[2]
+ xor $h,$tmp1
+ $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
+ xor $h,$tmp0
+ $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
+ xor $h,$tmp1 # Sigma0(a)
+
+ or $tmp0,$a,$b
+ and $tmp1,$a,$b
+ and $tmp0,$c
+ or $tmp1,$tmp0 # Maj(a,b,c)
+ $ADDU $T1,$tmp2 # +=K[$i]
+ $ADDU $h,$tmp1
+
+ $ADDU $d,$T1
+ $ADDU $h,$T1
+___
+$code.=<<___ if ($i>=13);
+ $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
+___
+}
+
+sub BODY_16_XX {
+my $i=@_[0];
+my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
+
+$code.=<<___;
+ $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
+ $ADDU @X[0],@X[9] # +=X[i+9]
+ $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
+ $SRL $tmp0,@X[1],@sigma0[1]
+ xor $tmp2,$tmp1
+ $SLL $tmp1,`@sigma0[2]-@sigma0[1]`
+ xor $tmp2,$tmp0
+ $SRL $tmp0,@X[1],@sigma0[2]
+ xor $tmp2,$tmp1
+
+ $SRL $tmp3,@X[14],@sigma1[0]
+ xor $tmp2,$tmp0 # sigma0(X[i+1])
+ $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
+ $ADDU @X[0],$tmp2
+ $SRL $tmp0,@X[14],@sigma1[1]
+ xor $tmp3,$tmp1
+ $SLL $tmp1,`@sigma1[2]-@sigma1[1]`
+ xor $tmp3,$tmp0
+ $SRL $tmp0,@X[14],@sigma1[2]
+ xor $tmp3,$tmp1
+
+ xor $tmp3,$tmp0 # sigma1(X[i+14])
+ $ADDU @X[0],$tmp3
+___
+ &BODY_00_15(@_);
+}
+
+$FRAMESIZE=16*$SZ+16*$SZREG;
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+$code.=<<___;
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+.set noat
+#if !defined(__vxworks) || defined(__pic__)
+.option pic2
+#endif
+
+.align 5
+.globl sha${label}_block_data_order
+.ent sha${label}_block_data_order
+sha${label}_block_data_order:
+ .frame $sp,$FRAMESIZE,$ra
+ .mask $SAVED_REGS_MASK,-$SZREG
+ .set noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
+ .cpload $pf
+___
+$code.=<<___;
+ $PTR_SUB $sp,$FRAMESIZE
+ $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ $REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
+ $REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
+ $REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
+ $REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
+ $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
+___
+$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
+ .cplocal $Ktbl
+ .cpsetup $pf,$zero,sha${label}_block_data_order
+___
+$code.=<<___;
+ .set reorder
+ la $Ktbl,K${label} # PIC-ified 'load address'
+
+ $LD $A,0*$SZ($ctx) # load context
+ $LD $B,1*$SZ($ctx)
+ $LD $C,2*$SZ($ctx)
+ $LD $D,3*$SZ($ctx)
+ $LD $E,4*$SZ($ctx)
+ $LD $F,5*$SZ($ctx)
+ $LD $G,6*$SZ($ctx)
+ $LD $H,7*$SZ($ctx)
+
+ $PTR_ADD @X[15],$inp # pointer to the end of input
+ $REG_S @X[15],16*$SZ($sp)
+ b .Loop
+
+.align 5
+.Loop:
+ ${LD}l @X[0],$MSB($inp)
+ ${LD}r @X[0],$LSB($inp)
+___
+for ($i=0;$i<16;$i++)
+{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
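+# unshift(@V,pop(@V)) rotates the a..h register names right by one for
+# the next round and push(@X,shift(@X)) slides the 16-word message
+# window, so the generated code never moves working variables between
+# registers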
+$code.=<<___;
+ b .L16_xx
+.align 4
+.L16_xx:
+___
+for (;$i<32;$i++)
+{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
+$code.=<<___;
+ and @X[6],0xfff
+ li @X[7],$lastK
+ .set noreorder
+ bne @X[6],@X[7],.L16_xx
+ $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
+
+ $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
+ $LD @X[0],0*$SZ($ctx)
+ $LD @X[1],1*$SZ($ctx)
+ $LD @X[2],2*$SZ($ctx)
+ $PTR_ADD $inp,16*$SZ
+ $LD @X[3],3*$SZ($ctx)
+ $ADDU $A,@X[0]
+ $LD @X[4],4*$SZ($ctx)
+ $ADDU $B,@X[1]
+ $LD @X[5],5*$SZ($ctx)
+ $ADDU $C,@X[2]
+ $LD @X[6],6*$SZ($ctx)
+ $ADDU $D,@X[3]
+ $LD @X[7],7*$SZ($ctx)
+ $ADDU $E,@X[4]
+ $ST $A,0*$SZ($ctx)
+ $ADDU $F,@X[5]
+ $ST $B,1*$SZ($ctx)
+ $ADDU $G,@X[6]
+ $ST $C,2*$SZ($ctx)
+ $ADDU $H,@X[7]
+ $ST $D,3*$SZ($ctx)
+ $ST $E,4*$SZ($ctx)
+ $ST $F,5*$SZ($ctx)
+ $ST $G,6*$SZ($ctx)
+ $ST $H,7*$SZ($ctx)
+
+ bne $inp,@X[15],.Loop
+ $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
+
+ $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
+ $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
+ $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
+ $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
+ $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
+ $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
+ $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
+ $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
+ $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
+ $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+ $REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
+ $REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
+ $REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
+ $REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
+ $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+ jr $ra
+ $PTR_ADD $sp,$FRAMESIZE
+.end sha${label}_block_data_order
+
+.rdata
+.align 5
+K${label}:
+___
+if ($SZ==4) {
+$code.=<<___;
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+___
+} else {
+$code.=<<___;
+ .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .dword 0x3956c25bf348b538, 0x59f111f1b605d019
+ .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .dword 0xd807aa98a3030242, 0x12835b0145706fbe
+ .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .dword 0x06ca6351e003826f, 0x142929670a0e6e70
+ .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .dword 0x81c2c92e47edaee6, 0x92722c851482353b
+ .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .dword 0xd192e819d6ef5218, 0xd69906245565a910
+ .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .dword 0x90befffa23631e28, 0xa4506cebde82bde9
+ .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .dword 0xca273eceea26619c, 0xd186b8c721c0c207
+ .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .dword 0x113f9804bef90dae, 0x1b710b35131c471b
+ .dword 0x28db77f523047d84, 0x32caab7b40c72493
+ .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+___
+}
+$code.=<<___;
+.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+.align 5
+
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
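
For little-endian targets (!$big_endian) BODY_00_15 emits a mask-and-shift
byte swap for each 64-bit message word: reverse the bytes inside each 32-bit
half, then exchange the halves. Below is a self-checking C rendition; it
assumes nothing beyond the two mask constants the MIPS code builds with
ori/dsll, and the helper names are hypothetical:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t bswap64_ref(uint64_t x)     /* plain byte reversal */
    {
        uint64_t r = 0;
        for (int i = 0; i < 8; i++)
            r = r << 8 | (x >> (8 * i) & 0xFF);
        return r;
    }

    static uint64_t bswap64_mips_style(uint64_t x)
    {
        const uint64_t m1 = 0x000000FF000000FFULL;  /* built with ori+dsll+or */
        const uint64_t m2 = 0x0000FF000000FF00ULL;  /* m1 shifted left by 8 */
        /* byte-reverse each 32-bit half with masked shifts ... */
        uint64_t t = (x & m1) << 24 | (x >> 24 & m1)
                   | (x & m2) << 8  | (x >> 8 & m2);
        return t >> 32 | t << 32;                   /* ... then swap halves */
    }

    int main(void)
    {
        uint64_t x = 0x0123456789ABCDEFULL;
        assert(bswap64_mips_style(x) == 0xEFCDAB8967452301ULL);
        assert(bswap64_mips_style(x) == bswap64_ref(x));
        return 0;
    }

The SHA256 configuration ($SZ==4) needs only the within-word reversal, which
is why its swap sequence in BODY_00_15 is shorter.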
diff --git a/main/openssl/crypto/sha/asm/sha512-parisc.pl b/main/openssl/crypto/sha/asm/sha512-parisc.pl
new file mode 100755
index 00000000..fc0e15b3
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha512-parisc.pl
@@ -0,0 +1,793 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256/512 block procedure for PA-RISC.
+
+# June 2009.
+#
+# SHA256 performance is >75% better than gcc 3.2 generated code on
+# PA-7100LC. Compared to code generated by the vendor compiler this
+# implementation is almost 70% faster in a 64-bit build, but delivers
+# virtually the same performance in a 32-bit build on PA-8600.
+#
+# SHA512 performance is >2.9x better than gcc 3.2 generated code on
+# PA-7100LC, a PA-RISC 1.1 processor. The implementation also detects
+# whether it is executing on a PA-RISC 2.0 processor and switches to a
+# 64-bit code path, delivering adequate performance even in a "blended"
+# 32-bit build. The 64-bit code is, however, not any faster than code
+# generated by the vendor compiler on PA-8600...
+#
+# Special thanks to polarhome.com for providing an HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+} else {
+ $LEVEL ="1.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+}
+
+if ($output =~ /512/) {
+ $func="sha512_block_data_order";
+ $SZ=8;
+ @Sigma0=(28,34,39);
+ @Sigma1=(14,18,41);
+ @sigma0=(1, 8, 7);
+ @sigma1=(19,61, 6);
+ $rounds=80;
+ $LAST10BITS=0x017;
+ $LD="ldd";
+ $LDM="ldd,ma";
+ $ST="std";
+} else {
+ $func="sha256_block_data_order";
+ $SZ=4;
+ @Sigma0=( 2,13,22);
+ @Sigma1=( 6,11,25);
+ @sigma0=( 7,18, 3);
+ @sigma1=(17,19,10);
+ $rounds=64;
+ $LAST10BITS=0x0f2;
+ $LD="ldw";
+ $LDM="ldwm";
+ $ST="stw";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+ # [+ argument transfer]
+$XOFF=16*$SZ+32; # local variables
+$FRAME+=$XOFF;
+$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
+
+$ctx="%r26"; # zapped by $a0
+$inp="%r25"; # zapped by $a1
+$num="%r24"; # zapped by $t0
+
+$a0 ="%r26";
+$a1 ="%r25";
+$t0 ="%r24";
+$t1 ="%r29";
+$Tbl="%r31";
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+ "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
+
+sub ROUND_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+$code.=<<___;
+ _ror $e,$Sigma1[0],$a0
+ and $f,$e,$t0
+ _ror $e,$Sigma1[1],$a1
+ addl $t1,$h,$h
+ andcm $g,$e,$t1
+ xor $a1,$a0,$a0
+ _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
+ or $t0,$t1,$t1 ; Ch(e,f,g)
+ addl @X[$i%16],$h,$h
+ xor $a0,$a1,$a1 ; Sigma1(e)
+ addl $t1,$h,$h
+ _ror $a,$Sigma0[0],$a0
+ addl $a1,$h,$h
+
+ _ror $a,$Sigma0[1],$a1
+ and $a,$b,$t0
+ and $a,$c,$t1
+ xor $a1,$a0,$a0
+ _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
+ xor $t1,$t0,$t0
+ and $b,$c,$t1
+ xor $a0,$a1,$a1 ; Sigma0(a)
+ addl $h,$d,$d
+ xor $t1,$t0,$t0 ; Maj(a,b,c)
+ `"$LDM $SZ($Tbl),$t1" if ($i<15)`
+ addl $a1,$h,$h
+ addl $t0,$h,$h
+
+___
+}
+
+sub ROUND_16_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+$i-=16;
+$code.=<<___;
+ _ror @X[($i+1)%16],$sigma0[0],$a0
+ _ror @X[($i+1)%16],$sigma0[1],$a1
+ addl @X[($i+9)%16],@X[$i],@X[$i]
+ _ror @X[($i+14)%16],$sigma1[0],$t0
+ _ror @X[($i+14)%16],$sigma1[1],$t1
+ xor $a1,$a0,$a0
+ _shr @X[($i+1)%16],$sigma0[2],$a1
+ xor $t1,$t0,$t0
+ _shr @X[($i+14)%16],$sigma1[2],$t1
+ xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
+ xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
+ $LDM $SZ($Tbl),$t1
+ addl $a0,@X[$i],@X[$i]
+ addl $t0,@X[$i],@X[$i]
+___
+$code.=<<___ if ($i==15);
+ extru $t1,31,10,$a1
+ comiclr,<> $LAST10BITS,$a1,%r0
+ ldo 1($Tbl),$Tbl ; signal end of $Tbl
+___
+&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
+}
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .ALIGN 64
+L\$table
+___
+$code.=<<___ if ($SZ==8);
+ .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
+ .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
+ .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
+ .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
+ .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
+ .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
+ .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
+ .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
+ .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
+ .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
+ .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
+ .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
+ .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
+ .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
+ .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
+ .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
+ .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
+ .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
+ .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
+ .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
+ .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
+ .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
+ .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
+ .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
+ .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
+ .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
+ .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
+ .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
+ .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
+ .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
+ .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
+ .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
+ .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
+ .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
+ .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
+ .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
+ .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
+ .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
+ .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
+ .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
+___
+$code.=<<___ if ($SZ==4);
+ .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+___
+$code.=<<___;
+
+ .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+ .ALIGN 64
+$func
+ .PROC
+ .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
+ $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
+ $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
+ $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
+ $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
+ $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
+ $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
+ $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+ _shl $num,`log(16*$SZ)/log(2)`,$num
+ addl $inp,$num,$num ; $num to point at the end of $inp
+
+ $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
+ $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
+ $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
+
+ blr %r0,$Tbl
+ ldi 3,$t1
+L\$pic
+ andcm $Tbl,$t1,$Tbl ; wipe privilege level
+ ldo L\$table-L\$pic($Tbl),$Tbl
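+	; ($Tbl now points at L\$table: blr deposited the address of L\$pic,
+	; andcm stripped its privilege-level bits, and ldo added the offset)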
+___
+$code.=<<___ if ($SZ==8 && $SIZE_T==4);
+ ldi 31,$t1
+ mtctl $t1,%cr11
+ extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
+ b L\$parisc1
+ nop
+___
+$code.=<<___;
+ $LD `0*$SZ`($ctx),$A ; load context
+ $LD `1*$SZ`($ctx),$B
+ $LD `2*$SZ`($ctx),$C
+ $LD `3*$SZ`($ctx),$D
+ $LD `4*$SZ`($ctx),$E
+ $LD `5*$SZ`($ctx),$F
+ $LD `6*$SZ`($ctx),$G
+ $LD `7*$SZ`($ctx),$H
+
+ extru $inp,31,`log($SZ)/log(2)`,$t0
+ sh3addl $t0,%r0,$t0
+ subi `8*$SZ`,$t0,$t0
+ mtctl $t0,%cr11 ; load %sar with align factor
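+	; (e.g. in the SHA512 build, an $inp 3 bytes past an 8-byte boundary
+	; yields %sar = 64-3*8 = 40, so each _align below funnel-shifts its
+	; register pair right by 40 bits)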
+
+L\$oop
+ ldi `$SZ-1`,$t0
+ $LDM $SZ($Tbl),$t1
+ andcm $inp,$t0,$t0 ; align $inp
+___
+ for ($i=0;$i<15;$i++) { # load input block
+ $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
+$code.=<<___;
+ cmpb,*= $inp,$t0,L\$aligned
+ $LD `$SZ*15`($t0),@X[15]
+ $LD `$SZ*16`($t0),@X[16]
+___
+ for ($i=0;$i<16;$i++) { # align data
+ $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
+$code.=<<___;
+L\$aligned
+	nop	; otherwise /usr/ccs/bin/as is confused by the .WORD below
+___
+
+for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+L\$rounds
+	nop	; otherwise /usr/ccs/bin/as is confused by the .WORD below
+___
+for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
+ nop
+
+ $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
+ $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
+ $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
+ ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
+
+ $LD `0*$SZ`($ctx),@X[0] ; load context
+ $LD `1*$SZ`($ctx),@X[1]
+ $LD `2*$SZ`($ctx),@X[2]
+ $LD `3*$SZ`($ctx),@X[3]
+ $LD `4*$SZ`($ctx),@X[4]
+ $LD `5*$SZ`($ctx),@X[5]
+ addl @X[0],$A,$A
+ $LD `6*$SZ`($ctx),@X[6]
+ addl @X[1],$B,$B
+ $LD `7*$SZ`($ctx),@X[7]
+ ldo `16*$SZ`($inp),$inp ; advance $inp
+
+ $ST $A,`0*$SZ`($ctx) ; save context
+ addl @X[2],$C,$C
+ $ST $B,`1*$SZ`($ctx)
+ addl @X[3],$D,$D
+ $ST $C,`2*$SZ`($ctx)
+ addl @X[4],$E,$E
+ $ST $D,`3*$SZ`($ctx)
+ addl @X[5],$F,$F
+ $ST $E,`4*$SZ`($ctx)
+ addl @X[6],$G,$G
+ $ST $F,`5*$SZ`($ctx)
+ addl @X[7],$H,$H
+ $ST $G,`6*$SZ`($ctx)
+ $ST $H,`7*$SZ`($ctx)
+
+ cmpb,*<>,n $inp,$num,L\$oop
+ $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
+___
+if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
+{{
+$code.=<<___;
+ b L\$done
+ nop
+
+ .ALIGN 64
+L\$parisc1
+___
+
+@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
+ $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
+ ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+ "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
+$a0 ="%r17";
+$a1 ="%r18";
+$a2 ="%r19";
+$a3 ="%r20";
+$t0 ="%r21";
+$t1 ="%r22";
+$t2 ="%r28";
+$t3 ="%r29";
+$Tbl="%r31";
+
+@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
+
+sub ROUND_00_15_pa1 {
+my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
+ $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
+my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
+
+$code.=<<___ if (!$flag);
+ ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
+ ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
+___
+$code.=<<___;
+ shd $ehi,$elo,$Sigma1[0],$t0
+ add $Xlo,$hlo,$hlo
+ shd $elo,$ehi,$Sigma1[0],$t1
+ addc $Xhi,$hhi,$hhi ; h += X[i]
+ shd $ehi,$elo,$Sigma1[1],$t2
+ ldwm 8($Tbl),$Xhi
+ shd $elo,$ehi,$Sigma1[1],$t3
+ ldw -4($Tbl),$Xlo ; load K[i]
+ xor $t2,$t0,$t0
+ xor $t3,$t1,$t1
+ and $flo,$elo,$a0
+ and $fhi,$ehi,$a1
+ shd $ehi,$elo,$Sigma1[2],$t2
+ andcm $glo,$elo,$a2
+ shd $elo,$ehi,$Sigma1[2],$t3
+ andcm $ghi,$ehi,$a3
+ xor $t2,$t0,$t0
+ xor $t3,$t1,$t1 ; Sigma1(e)
+ add $Xlo,$hlo,$hlo
+ xor $a2,$a0,$a0
+ addc $Xhi,$hhi,$hhi ; h += K[i]
+ xor $a3,$a1,$a1 ; Ch(e,f,g)
+
+ add $t0,$hlo,$hlo
+ shd $ahi,$alo,$Sigma0[0],$t0
+ addc $t1,$hhi,$hhi ; h += Sigma1(e)
+ shd $alo,$ahi,$Sigma0[0],$t1
+ add $a0,$hlo,$hlo
+ shd $ahi,$alo,$Sigma0[1],$t2
+ addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
+ shd $alo,$ahi,$Sigma0[1],$t3
+
+ xor $t2,$t0,$t0
+ xor $t3,$t1,$t1
+ shd $ahi,$alo,$Sigma0[2],$t2
+ and $alo,$blo,$a0
+ shd $alo,$ahi,$Sigma0[2],$t3
+ and $ahi,$bhi,$a1
+ xor $t2,$t0,$t0
+ xor $t3,$t1,$t1 ; Sigma0(a)
+
+ and $alo,$clo,$a2
+ and $ahi,$chi,$a3
+ xor $a2,$a0,$a0
+ add $hlo,$dlo,$dlo
+ xor $a3,$a1,$a1
+ addc $hhi,$dhi,$dhi ; d += h
+ and $blo,$clo,$a2
+ add $t0,$hlo,$hlo
+ and $bhi,$chi,$a3
+ addc $t1,$hhi,$hhi ; h += Sigma0(a)
+ xor $a2,$a0,$a0
+ add $a0,$hlo,$hlo
+ xor $a3,$a1,$a1 ; Maj(a,b,c)
+ addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
+
+___
+$code.=<<___ if ($i==15 && $flag);
+ extru $Xlo,31,10,$Xlo
+ comiclr,= $LAST10BITS,$Xlo,%r0
+ b L\$rounds_pa1
+ nop
+___
+push(@X,shift(@X)); push(@X,shift(@X));
+}
+
+sub ROUND_16_xx_pa1 {
+my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
+my ($i)=shift;
+$i-=16;
+$code.=<<___;
+ ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
+ ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
+ ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
+ ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
+ ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
+ ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
+ shd $Xnhi,$Xnlo,$sigma0[0],$t0
+ shd $Xnlo,$Xnhi,$sigma0[0],$t1
+ add $a0,$Xlo,$Xlo
+ shd $Xnhi,$Xnlo,$sigma0[1],$t2
+ addc $a1,$Xhi,$Xhi
+ shd $Xnlo,$Xnhi,$sigma0[1],$t3
+ xor $t2,$t0,$t0
+ shd $Xnhi,$Xnlo,$sigma0[2],$t2
+ xor $t3,$t1,$t1
+ extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
+ xor $t2,$t0,$t0
+ shd $a3,$a2,$sigma1[0],$a0
+	xor	$t3,$t1,$t1		; sigma0(X[(i+1)&0x0f])
+ shd $a2,$a3,$sigma1[0],$a1
+ add $t0,$Xlo,$Xlo
+ shd $a3,$a2,$sigma1[1],$t2
+ addc $t1,$Xhi,$Xhi
+ shd $a2,$a3,$sigma1[1],$t3
+ xor $t2,$a0,$a0
+ shd $a3,$a2,$sigma1[2],$t2
+ xor $t3,$a1,$a1
+ extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
+ xor $t2,$a0,$a0
+	xor	$t3,$a1,$a1		; sigma1(X[(i+14)&0x0f])
+ add $a0,$Xlo,$Xlo
+ addc $a1,$Xhi,$Xhi
+
+ stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
+ stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
+___
+&ROUND_00_15_pa1($i,@_,1);
+}
+$code.=<<___;
+ ldw `0*4`($ctx),$Ahi ; load context
+ ldw `1*4`($ctx),$Alo
+ ldw `2*4`($ctx),$Bhi
+ ldw `3*4`($ctx),$Blo
+ ldw `4*4`($ctx),$Chi
+ ldw `5*4`($ctx),$Clo
+ ldw `6*4`($ctx),$Dhi
+ ldw `7*4`($ctx),$Dlo
+ ldw `8*4`($ctx),$Ehi
+ ldw `9*4`($ctx),$Elo
+ ldw `10*4`($ctx),$Fhi
+ ldw `11*4`($ctx),$Flo
+ ldw `12*4`($ctx),$Ghi
+ ldw `13*4`($ctx),$Glo
+ ldw `14*4`($ctx),$Hhi
+ ldw `15*4`($ctx),$Hlo
+
+ extru $inp,31,2,$t0
+ sh3addl $t0,%r0,$t0
+ subi 32,$t0,$t0
+ mtctl $t0,%cr11 ; load %sar with align factor
+
+L\$oop_pa1
+ extru $inp,31,2,$a3
+ comib,= 0,$a3,L\$aligned_pa1
+ sub $inp,$a3,$inp
+
+ ldw `0*4`($inp),$X[0]
+ ldw `1*4`($inp),$X[1]
+ ldw `2*4`($inp),$t2
+ ldw `3*4`($inp),$t3
+ ldw `4*4`($inp),$a0
+ ldw `5*4`($inp),$a1
+ ldw `6*4`($inp),$a2
+ ldw `7*4`($inp),$a3
+ vshd $X[0],$X[1],$X[0]
+ vshd $X[1],$t2,$X[1]
+ stw $X[0],`-$XOFF+0*4`(%sp)
+ ldw `8*4`($inp),$t0
+ vshd $t2,$t3,$t2
+ stw $X[1],`-$XOFF+1*4`(%sp)
+ ldw `9*4`($inp),$t1
+ vshd $t3,$a0,$t3
+___
+{
+my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
+for ($i=2;$i<=(128/4-8);$i++) {
+$code.=<<___;
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+ ldw `(8+$i)*4`($inp),$t[0]
+ vshd $t[1],$t[2],$t[1]
+___
+push(@t,shift(@t));
+}
+for (;$i<(128/4-1);$i++) {
+$code.=<<___;
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+ vshd $t[1],$t[2],$t[1]
+___
+push(@t,shift(@t));
+}
+$code.=<<___;
+ b L\$collected_pa1
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+
+___
+}
+$code.=<<___;
+L\$aligned_pa1
+ ldw `0*4`($inp),$X[0]
+ ldw `1*4`($inp),$X[1]
+ ldw `2*4`($inp),$t2
+ ldw `3*4`($inp),$t3
+ ldw `4*4`($inp),$a0
+ ldw `5*4`($inp),$a1
+ ldw `6*4`($inp),$a2
+ ldw `7*4`($inp),$a3
+ stw $X[0],`-$XOFF+0*4`(%sp)
+ ldw `8*4`($inp),$t0
+ stw $X[1],`-$XOFF+1*4`(%sp)
+ ldw `9*4`($inp),$t1
+___
+{
+my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
+for ($i=2;$i<(128/4-8);$i++) {
+$code.=<<___;
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+ ldw `(8+$i)*4`($inp),$t[0]
+___
+push(@t,shift(@t));
+}
+for (;$i<128/4;$i++) {
+$code.=<<___;
+ stw $t[0],`-$XOFF+$i*4`(%sp)
+___
+push(@t,shift(@t));
+}
+$code.="L\$collected_pa1\n";
+}
+
+for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
+$code.="L\$rounds_pa1\n";
+for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+ $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
+ $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
+ $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
+ ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
+
+ ldw `0*4`($ctx),$t1 ; update context
+ ldw `1*4`($ctx),$t0
+ ldw `2*4`($ctx),$t3
+ ldw `3*4`($ctx),$t2
+ ldw `4*4`($ctx),$a1
+ ldw `5*4`($ctx),$a0
+ ldw `6*4`($ctx),$a3
+ add $t0,$Alo,$Alo
+ ldw `7*4`($ctx),$a2
+ addc $t1,$Ahi,$Ahi
+ ldw `8*4`($ctx),$t1
+ add $t2,$Blo,$Blo
+ ldw `9*4`($ctx),$t0
+ addc $t3,$Bhi,$Bhi
+ ldw `10*4`($ctx),$t3
+ add $a0,$Clo,$Clo
+ ldw `11*4`($ctx),$t2
+ addc $a1,$Chi,$Chi
+ ldw `12*4`($ctx),$a1
+ add $a2,$Dlo,$Dlo
+ ldw `13*4`($ctx),$a0
+ addc $a3,$Dhi,$Dhi
+ ldw `14*4`($ctx),$a3
+ add $t0,$Elo,$Elo
+ ldw `15*4`($ctx),$a2
+ addc $t1,$Ehi,$Ehi
+ stw $Ahi,`0*4`($ctx)
+ add $t2,$Flo,$Flo
+ stw $Alo,`1*4`($ctx)
+ addc $t3,$Fhi,$Fhi
+ stw $Bhi,`2*4`($ctx)
+ add $a0,$Glo,$Glo
+ stw $Blo,`3*4`($ctx)
+ addc $a1,$Ghi,$Ghi
+ stw $Chi,`4*4`($ctx)
+ add $a2,$Hlo,$Hlo
+ stw $Clo,`5*4`($ctx)
+ addc $a3,$Hhi,$Hhi
+ stw $Dhi,`6*4`($ctx)
+ ldo `16*$SZ`($inp),$inp ; advance $inp
+ stw $Dlo,`7*4`($ctx)
+ stw $Ehi,`8*4`($ctx)
+ stw $Elo,`9*4`($ctx)
+ stw $Fhi,`10*4`($ctx)
+ stw $Flo,`11*4`($ctx)
+ stw $Ghi,`12*4`($ctx)
+ stw $Glo,`13*4`($ctx)
+ stw $Hhi,`14*4`($ctx)
+ comb,= $inp,$num,L\$done
+ stw $Hlo,`15*4`($ctx)
+ b L\$oop_pa1
+ $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
+L\$done
+___
+}}
+$code.=<<___;
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+ $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
+ $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
+ $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
+ $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
+ $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
+ $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
+ $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
+ $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this if the GNU assembler understood the
+# .ALLOW 2.0 directive...
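+# As an illustration (operands hypothetical): for "ldd 8(%r5),%r6" the
+# format 3 path below computes
+#	(0x14<<26)|(5<<21)|(6<<16)|((8&0x1FF8)<<1) = 0x50a60010
+# and emits ".WORD 0x50a60010	; ldd 8(%r5),%r6".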
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
+ { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
+ $opcode|=(1<<3) if ($mod =~ /^,m/);
+ $opcode|=(1<<2) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+ { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+	# Only the ",u" completer is used here; it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
+ { sprintf "\t.WORD\t0x%08x\t; %s",
+ (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+ }
+ else { "\t".$orig; }
+};
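+# Likewise, a "shrpd %r5,%r6,%sar,%r7" (operands hypothetical) matches
+# format 11 above and comes out as
+#	(0x34<<26)|(6<<21)|(5<<16)|(1<<9)|7 = 0xd0c50207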
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
+ $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
+ : sprintf("shd\t%$1,%$2,%d",$3)/e or
+	# translate made-up instructions: _ror, _shr, _align, _shl
+ s/_ror(\s+)(%r[0-9]+),/
+ ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
+
+ s/_shr(\s+%r[0-9]+),([0-9]+),/
+ $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
+ : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
+
+ s/_align(\s+%r[0-9]+,%r[0-9]+),/
+ ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
+
+ s/_shl(\s+%r[0-9]+),([0-9]+),/
+ $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
+ : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
+
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
+
+ s/cmpb,\*/comb,/ if ($SIZE_T==4);
+
+ s/\bbv\b/bve/ if ($SIZE_T==8);
+
+ print $_,"\n";
+}
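+# By way of example, a made-up "_ror %r5,14,%r6" (operands hypothetical)
+# is rewritten above to "shrpd %r5,%r5,14,%r6" in the SHA512 build and
+# to "shd %r5,%r5,14,%r6" in the SHA256 build; on 32-bit targets the
+# result is then also passed through &assemble.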
+
+close STDOUT;
diff --git a/main/openssl/crypto/sha/asm/sha512-ppc.pl b/main/openssl/crypto/sha/asm/sha512-ppc.pl
index 768a6a6f..6b44a68e 100755
--- a/main/openssl/crypto/sha/asm/sha512-ppc.pl
+++ b/main/openssl/crypto/sha/asm/sha512-ppc.pl
@@ -40,6 +40,7 @@ $output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
+ $LRSAVE=2*$SIZE_T;
$STU="stdu";
$UCMP="cmpld";
$SHL="sldi";
@@ -47,6 +48,7 @@ if ($flavour =~ /64/) {
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
+ $LRSAVE=$SIZE_T;
$STU="stwu";
$UCMP="cmplw";
$SHL="slwi";
@@ -87,7 +89,8 @@ if ($output =~ /512/) {
$SHR="srwi";
}
-$FRAME=32*$SIZE_T;
+$FRAME=32*$SIZE_T+16*$SZ;
+$LOCALS=6*$SIZE_T;
$sp ="r1";
$toc="r2";
@@ -179,13 +182,12 @@ $code=<<___;
.globl $func
.align 6
$func:
+ $STU $sp,-$FRAME($sp)
mflr r0
- $STU $sp,`-($FRAME+16*$SZ)`($sp)
$SHL $num,$num,`log(16*$SZ)/log(2)`
$PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
- $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -206,6 +208,7 @@ $func:
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
$LD $A,`0*$SZ`($ctx)
mr $inp,r4 ; incarnate $inp
@@ -217,7 +220,7 @@ $func:
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
- b LPICmeup
+ bl LPICmeup
LPICedup:
andi. r0,$inp,3
bne Lunaligned
@@ -226,40 +229,14 @@ Laligned:
$PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
-Ldone:
- $POP r0,`$FRAME-$SIZE_T*21`($sp)
- $POP $toc,`$FRAME-$SIZE_T*20`($sp)
- $POP r13,`$FRAME-$SIZE_T*19`($sp)
- $POP r14,`$FRAME-$SIZE_T*18`($sp)
- $POP r15,`$FRAME-$SIZE_T*17`($sp)
- $POP r16,`$FRAME-$SIZE_T*16`($sp)
- $POP r17,`$FRAME-$SIZE_T*15`($sp)
- $POP r18,`$FRAME-$SIZE_T*14`($sp)
- $POP r19,`$FRAME-$SIZE_T*13`($sp)
- $POP r20,`$FRAME-$SIZE_T*12`($sp)
- $POP r21,`$FRAME-$SIZE_T*11`($sp)
- $POP r22,`$FRAME-$SIZE_T*10`($sp)
- $POP r23,`$FRAME-$SIZE_T*9`($sp)
- $POP r24,`$FRAME-$SIZE_T*8`($sp)
- $POP r25,`$FRAME-$SIZE_T*7`($sp)
- $POP r26,`$FRAME-$SIZE_T*6`($sp)
- $POP r27,`$FRAME-$SIZE_T*5`($sp)
- $POP r28,`$FRAME-$SIZE_T*4`($sp)
- $POP r29,`$FRAME-$SIZE_T*3`($sp)
- $POP r30,`$FRAME-$SIZE_T*2`($sp)
- $POP r31,`$FRAME-$SIZE_T*1`($sp)
- mtlr r0
- addi $sp,$sp,`$FRAME+16*$SZ`
- blr
-___
+ b Ldone
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for the input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; The PowerPC specification allows an implementation to be ill-behaved
+; upon an unaligned access that crosses a page boundary. The "better
+; safe than sorry" principle makes me treat this case specially. I
+; don't look for the particular offending word, but rather for the
+; input block that crosses the boundary; once found, that block is
+; aligned and hashed separately...
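+; As a worked example (offsets illustrative): with SHA512's 128-byte
+; blocks, an input block starting at byte 0xf90 of a page has only 0x70
+; bytes left before the page boundary, so it is copied into the aligned
+; scratch area on the stack and hashed from there.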
.align 4
Lunaligned:
subfic $t1,$inp,4096
@@ -278,7 +255,7 @@ Lunaligned:
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
- addi r20,$sp,$FRAME ; aligned spot below the frame
+ addi r20,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
@@ -293,8 +270,8 @@ Lmemcpy:
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
- addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer
- addi $inp,$sp,$FRAME ; fictitious inp pointer
+ addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
+ addi $inp,$sp,$LOCALS ; fictitious inp pointer
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
@@ -303,10 +280,36 @@ Lmemcpy:
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
addic. $num,$num,`-16*$SZ` ; num--
bne- Lunaligned
- b Ldone
-___
-$code.=<<___;
+Ldone:
+ $POP r0,`$FRAME+$LRSAVE`($sp)
+ $POP $toc,`$FRAME-$SIZE_T*20`($sp)
+ $POP r13,`$FRAME-$SIZE_T*19`($sp)
+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,3,0
+ .long 0
+
.align 4
Lsha2_block_private:
___
@@ -372,6 +375,8 @@ $code.=<<___;
$ST $H,`7*$SZ`($ctx)
bne Lsha2_block_private
blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
___
# Ugly hack here, because PPC assembler syntax seems to vary too
@@ -379,22 +384,15 @@ ___
$code.=<<___;
.align 6
LPICmeup:
- bl LPIC
- addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop
- b LPICedup
- nop
- nop
- nop
- nop
- nop
-LPIC: mflr $Tbl
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
+ addi $Tbl,$Tbl,`64-8`
+ mtlr r0
blr
- nop
- nop
- nop
- nop
- nop
- nop
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
___
$code.=<<___ if ($SZ==8);
.long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
diff --git a/main/openssl/crypto/sha/asm/sha512-s390x.pl b/main/openssl/crypto/sha/asm/sha512-s390x.pl
index e7ef2d5a..079a3fc7 100644
--- a/main/openssl/crypto/sha/asm/sha512-s390x.pl
+++ b/main/openssl/crypto/sha/asm/sha512-s390x.pl
@@ -26,6 +26,26 @@
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
# than software.
+# November 2010.
+#
+# Adapt for the -m31 build. If the kernel supports what is called the
+# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
+# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
+# legacy application context. The feature is not specific to any
+# particular processor, as long as it's a "z-CPU". The latter implies
+# that the code remains z/Architecture specific. On z900, SHA256 was
+# measured to perform 2.4x and SHA512 13x better than code generated by
+# gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
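+# With this in place, e.g. "stm${g}" below assembles as "stm" for the
+# -m31 flavour and as "stmg" otherwise; l${g}, st${g}, cl${g} and
+# lm${g} likewise select 4- or 8-byte variants.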
+
$t0="%r0";
$t1="%r1";
$ctx="%r2"; $t2="%r2";
@@ -44,7 +64,7 @@ $tbl="%r13";
$T1="%r14";
$sp="%r15";
-$output=shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($output =~ /512/) {
@@ -78,7 +98,8 @@ if ($output =~ /512/) {
}
$Func="sha${label}_block_data_order";
$Table="K${label}";
-$frame=160+16*$SZ;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*$SZ;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
@@ -93,9 +114,9 @@ $code.=<<___;
xgr $t0,$t1
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
xgr $t2,$g
- $ST $T1,`160+$SZ*($i%16)`($sp)
+ $ST $T1,`$stdframe+$SZ*($i%16)`($sp)
xgr $t0,$t1 # Sigma1(e)
- la $T1,0($T1,$h) # T1+=h
+ algr $T1,$h # T1+=h
ngr $t2,$e
lgr $t1,$a
algr $T1,$t0 # T1+=Sigma1(e)
@@ -113,7 +134,7 @@ $code.=<<___;
ngr $t2,$b
algr $h,$T1 # h+=T1
ogr $t2,$t1 # Maj(a,b,c)
- la $d,0($d,$T1) # d+=T1
+ algr $d,$T1 # d+=T1
algr $h,$t2 # h+=Maj(a,b,c)
___
}
@@ -122,19 +143,19 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i
- $LD $t1,`160+$SZ*(($i+14)%16)`($sp)
+ $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
+ $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
$ROT $t0,$T1,$sigma0[0]
$SHR $T1,$sigma0[2]
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0
$ROT $t0,$t1,$sigma1[0]
- xgr $T1,$t2 # sigma0(X[i+1])
+ xgr $T1,$t2 # sigma0(X[i+1])
$SHR $t1,$sigma1[2]
- $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
+ $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
xgr $t1,$t0
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
- $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
+ $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
xgr $t1,$t0 # sigma1(X[i+14])
algr $T1,$t1 # +=sigma1(X[i+14])
___
@@ -212,6 +233,7 @@ $code.=<<___;
.globl $Func
.type $Func,\@function
$Func:
+ sllg $len,$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
@@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc);
tmhl %r0,0x4000 # check for message-security assist
jz .Lsoftware
lghi %r0,0
- la %r1,16($sp)
+ la %r1,`2*$SIZE_T`($sp)
.long 0xb93e0002 # kimd %r0,%r2
- lg %r0,16($sp)
+ lg %r0,`2*$SIZE_T`($sp)
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
lgr %r1,$ctx
lgr %r2,$inp
- sllg %r3,$len,`log(16*$SZ)/log(2)`
+ lgr %r3,$len
.long 0xb93e0002 # kimd %r0,%r2
brc 1,.-4 # pay attention to "partial completion"
br %r14
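# kimd signals "partial completion" with condition code 3 when it has
# processed only part of the message, so "brc 1,.-4" re-executes it
# until the whole message is consumed.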
@@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc);
.Lsoftware:
___
$code.=<<___;
- sllg $len,$len,`log(16*$SZ)/log(2)`
lghi %r1,-$frame
- agr $len,$inp
- stmg $ctx,%r15,16($sp)
+ la $len,0($len,$inp)
+ stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
- stg %r0,0($sp)
+ st${g} %r0,0($sp)
larl $tbl,$Table
$LD $A,`0*$SZ`($ctx)
@@ -265,7 +286,7 @@ $code.=<<___;
clgr $len,$t0
jne .Lrounds_16_xx
- lg $ctx,`$frame+16`($sp)
+ l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,`16*$SZ`($inp)
$ADD $A,`0*$SZ`($ctx)
$ADD $B,`1*$SZ`($ctx)
@@ -283,14 +304,14 @@ $code.=<<___;
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
- clg $inp,`$frame+32`($sp)
+ cl${g} $inp,`$frame+4*$SIZE_T`($sp)
jne .Lloop
- lmg %r6,%r15,`$frame+48`($sp)
+ lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size $Func,.-$Func
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm OPENSSL_s390xcap_P,8,8
+.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/main/openssl/crypto/sha/asm/sha512-sparcv9.pl b/main/openssl/crypto/sha/asm/sha512-sparcv9.pl
index ec5d7813..58574078 100644
--- a/main/openssl/crypto/sha/asm/sha512-sparcv9.pl
+++ b/main/openssl/crypto/sha/asm/sha512-sparcv9.pl
@@ -305,9 +305,9 @@ $code.=<<___;
srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
srl @X[($i/2)%8],0,$tmp0
+ add $tmp2,$tmp1,$tmp1
add $xi,$T1,$T1 ! +=X[i]
xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
- add $tmp2,$T1,$T1
add $tmp1,$T1,$T1
srl $T1,0,$T1
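+	! (summing the two sigma terms into $tmp1 first shortens the
+	! dependency chain through $T1)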
@@ -318,9 +318,9 @@ ___
$code.=<<___;
srlx @X[($i/2)%8],32,$tmp1 ! X[i]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
- srl @X[($i/2)%8],0,@X[($i/2)%8]
add $xi,$T1,$T1 ! +=X[i+9]
- add $tmp2,$T1,$T1
+ add $tmp2,$tmp1,$tmp1
+ srl @X[($i/2)%8],0,@X[($i/2)%8]
add $tmp1,$T1,$T1
sllx $T1,32,$tmp0
diff --git a/main/openssl/crypto/sha/asm/sha512-x86_64.S b/main/openssl/crypto/sha/asm/sha512-x86_64.S
new file mode 100644
index 00000000..2d3294e0
--- /dev/null
+++ b/main/openssl/crypto/sha/asm/sha512-x86_64.S
@@ -0,0 +1,1802 @@
+.text
+
+.globl sha512_block_data_order
+.type sha512_block_data_order,@function
+.align 16
+sha512_block_data_order:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $128+32,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %r11,128+24(%rsp)
+.Lprologue:
+
+ leaq K512(%rip),%rbp
+
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ xorq %rdi,%rdi
+ movq 0(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+ movq %r12,0(%rsp)
+
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %rax,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r8,%r15
+ movq %rbx,%r11
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ xorq %rcx,%r11
+ xorq %rax,%r14
+ addq %r15,%r12
+ movq %rbx,%r15
+
+ rorq $14,%r13
+ andq %rax,%r11
+ andq %rcx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r11
+
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 1(%rdi),%rdi
+ addq %r14,%r11
+
+ movq 8(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%r15
+ movq %r12,8(%rsp)
+
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r11,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rdx,%r15
+ movq %rax,%r10
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ xorq %rbx,%r10
+ xorq %r11,%r14
+ addq %r15,%r12
+ movq %rax,%r15
+
+ rorq $14,%r13
+ andq %r11,%r10
+ andq %rbx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r10
+
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 1(%rdi),%rdi
+ addq %r14,%r10
+
+ movq 16(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+ movq %r12,16(%rsp)
+
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r10,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rcx,%r15
+ movq %r11,%r9
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ xorq %rax,%r9
+ xorq %r10,%r14
+ addq %r15,%r12
+ movq %r11,%r15
+
+ rorq $14,%r13
+ andq %r10,%r9
+ andq %rax,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r9
+
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 1(%rdi),%rdi
+ addq %r14,%r9
+
+ movq 24(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%r15
+ movq %r12,24(%rsp)
+
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %r9,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rbx,%r15
+ movq %r10,%r8
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ xorq %r11,%r8
+ xorq %r9,%r14
+ addq %r15,%r12
+ movq %r10,%r15
+
+ rorq $14,%r13
+ andq %r9,%r8
+ andq %r11,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r8
+
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 1(%rdi),%rdi
+ addq %r14,%r8
+
+ movq 32(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+ movq %r12,32(%rsp)
+
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %r8,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rax,%r15
+ movq %r9,%rdx
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ xorq %r10,%rdx
+ xorq %r8,%r14
+ addq %r15,%r12
+ movq %r9,%r15
+
+ rorq $14,%r13
+ andq %r8,%rdx
+ andq %r10,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rdx
+
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rdx
+
+ movq 40(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%r15
+ movq %r12,40(%rsp)
+
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rdx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r11,%r15
+ movq %r8,%rcx
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ xorq %r9,%rcx
+ xorq %rdx,%r14
+ addq %r15,%r12
+ movq %r8,%r15
+
+ rorq $14,%r13
+ andq %rdx,%rcx
+ andq %r9,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rcx
+
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rcx
+
+ movq 48(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+ movq %r12,48(%rsp)
+
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rcx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r10,%r15
+ movq %rdx,%rbx
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ xorq %r8,%rbx
+ xorq %rcx,%r14
+ addq %r15,%r12
+ movq %rdx,%r15
+
+ rorq $14,%r13
+ andq %rcx,%rbx
+ andq %r8,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rbx
+
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rbx
+
+ movq 56(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%r15
+ movq %r12,56(%rsp)
+
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %rbx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r9,%r15
+ movq %rcx,%rax
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ xorq %rdx,%rax
+ xorq %rbx,%r14
+ addq %r15,%r12
+ movq %rcx,%r15
+
+ rorq $14,%r13
+ andq %rbx,%rax
+ andq %rdx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rax
+
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 1(%rdi),%rdi
+ addq %r14,%rax
+
+ movq 64(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+ movq %r12,64(%rsp)
+
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %rax,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r8,%r15
+ movq %rbx,%r11
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ xorq %rcx,%r11
+ xorq %rax,%r14
+ addq %r15,%r12
+ movq %rbx,%r15
+
+ rorq $14,%r13
+ andq %rax,%r11
+ andq %rcx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r11
+
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 1(%rdi),%rdi
+ addq %r14,%r11
+
+ movq 72(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%r15
+ movq %r12,72(%rsp)
+
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r11,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rdx,%r15
+ movq %rax,%r10
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ xorq %rbx,%r10
+ xorq %r11,%r14
+ addq %r15,%r12
+ movq %rax,%r15
+
+ rorq $14,%r13
+ andq %r11,%r10
+ andq %rbx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r10
+
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 1(%rdi),%rdi
+ addq %r14,%r10
+
+ movq 80(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+ movq %r12,80(%rsp)
+
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r10,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rcx,%r15
+ movq %r11,%r9
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ xorq %rax,%r9
+ xorq %r10,%r14
+ addq %r15,%r12
+ movq %r11,%r15
+
+ rorq $14,%r13
+ andq %r10,%r9
+ andq %rax,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r9
+
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 1(%rdi),%rdi
+ addq %r14,%r9
+
+ movq 88(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%r15
+ movq %r12,88(%rsp)
+
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %r9,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rbx,%r15
+ movq %r10,%r8
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ xorq %r11,%r8
+ xorq %r9,%r14
+ addq %r15,%r12
+ movq %r10,%r15
+
+ rorq $14,%r13
+ andq %r9,%r8
+ andq %r11,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r8
+
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 1(%rdi),%rdi
+ addq %r14,%r8
+
+ movq 96(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+ movq %r12,96(%rsp)
+
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %r8,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rax,%r15
+ movq %r9,%rdx
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ xorq %r10,%rdx
+ xorq %r8,%r14
+ addq %r15,%r12
+ movq %r9,%r15
+
+ rorq $14,%r13
+ andq %r8,%rdx
+ andq %r10,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rdx
+
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rdx
+
+ movq 104(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%r15
+ movq %r12,104(%rsp)
+
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rdx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r11,%r15
+ movq %r8,%rcx
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ xorq %r9,%rcx
+ xorq %rdx,%r14
+ addq %r15,%r12
+ movq %r8,%r15
+
+ rorq $14,%r13
+ andq %rdx,%rcx
+ andq %r9,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rcx
+
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rcx
+
+ movq 112(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+ movq %r12,112(%rsp)
+
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rcx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r10,%r15
+ movq %rdx,%rbx
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ xorq %r8,%rbx
+ xorq %rcx,%r14
+ addq %r15,%r12
+ movq %rdx,%r15
+
+ rorq $14,%r13
+ andq %rcx,%rbx
+ andq %r8,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rbx
+
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rbx
+
+ movq 120(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%r15
+ movq %r12,120(%rsp)
+
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %rbx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r9,%r15
+ movq %rcx,%rax
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ xorq %rdx,%rax
+ xorq %rbx,%r14
+ addq %r15,%r12
+ movq %rcx,%r15
+
+ rorq $14,%r13
+ andq %rbx,%rax
+ andq %rdx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rax
+
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 1(%rdi),%rdi
+ addq %r14,%rax
+
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movq 8(%rsp),%r13
+ movq 112(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 72(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 0(%rsp),%r12
+ movq %r8,%r13
+ addq %r14,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+ movq %r12,0(%rsp)
+
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %rax,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r8,%r15
+ movq %rbx,%r11
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ xorq %rcx,%r11
+ xorq %rax,%r14
+ addq %r15,%r12
+ movq %rbx,%r15
+
+ rorq $14,%r13
+ andq %rax,%r11
+ andq %rcx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r11
+
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 1(%rdi),%rdi
+ addq %r14,%r11
+
+ movq 16(%rsp),%r13
+ movq 120(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 80(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 8(%rsp),%r12
+ movq %rdx,%r13
+ addq %r14,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%r15
+ movq %r12,8(%rsp)
+
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r11,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rdx,%r15
+ movq %rax,%r10
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ xorq %rbx,%r10
+ xorq %r11,%r14
+ addq %r15,%r12
+ movq %rax,%r15
+
+ rorq $14,%r13
+ andq %r11,%r10
+ andq %rbx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r10
+
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 1(%rdi),%rdi
+ addq %r14,%r10
+
+ movq 24(%rsp),%r13
+ movq 0(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 88(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 16(%rsp),%r12
+ movq %rcx,%r13
+ addq %r14,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+ movq %r12,16(%rsp)
+
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r10,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rcx,%r15
+ movq %r11,%r9
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ xorq %rax,%r9
+ xorq %r10,%r14
+ addq %r15,%r12
+ movq %r11,%r15
+
+ rorq $14,%r13
+ andq %r10,%r9
+ andq %rax,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r9
+
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 1(%rdi),%rdi
+ addq %r14,%r9
+
+ movq 32(%rsp),%r13
+ movq 8(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 96(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 24(%rsp),%r12
+ movq %rbx,%r13
+ addq %r14,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%r15
+ movq %r12,24(%rsp)
+
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %r9,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rbx,%r15
+ movq %r10,%r8
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ xorq %r11,%r8
+ xorq %r9,%r14
+ addq %r15,%r12
+ movq %r10,%r15
+
+ rorq $14,%r13
+ andq %r9,%r8
+ andq %r11,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r8
+
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 1(%rdi),%rdi
+ addq %r14,%r8
+
+ movq 40(%rsp),%r13
+ movq 16(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 104(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 32(%rsp),%r12
+ movq %rax,%r13
+ addq %r14,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+ movq %r12,32(%rsp)
+
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %r8,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rax,%r15
+ movq %r9,%rdx
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ xorq %r10,%rdx
+ xorq %r8,%r14
+ addq %r15,%r12
+ movq %r9,%r15
+
+ rorq $14,%r13
+ andq %r8,%rdx
+ andq %r10,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rdx
+
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rdx
+
+ movq 48(%rsp),%r13
+ movq 24(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 112(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 40(%rsp),%r12
+ movq %r11,%r13
+ addq %r14,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%r15
+ movq %r12,40(%rsp)
+
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rdx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r11,%r15
+ movq %r8,%rcx
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ xorq %r9,%rcx
+ xorq %rdx,%r14
+ addq %r15,%r12
+ movq %r8,%r15
+
+ rorq $14,%r13
+ andq %rdx,%rcx
+ andq %r9,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rcx
+
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rcx
+
+ movq 56(%rsp),%r13
+ movq 32(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 120(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 48(%rsp),%r12
+ movq %r10,%r13
+ addq %r14,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+ movq %r12,48(%rsp)
+
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rcx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r10,%r15
+ movq %rdx,%rbx
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ xorq %r8,%rbx
+ xorq %rcx,%r14
+ addq %r15,%r12
+ movq %rdx,%r15
+
+ rorq $14,%r13
+ andq %rcx,%rbx
+ andq %r8,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rbx
+
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rbx
+
+ movq 64(%rsp),%r13
+ movq 40(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 0(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 56(%rsp),%r12
+ movq %r9,%r13
+ addq %r14,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%r15
+ movq %r12,56(%rsp)
+
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %rbx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r9,%r15
+ movq %rcx,%rax
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ xorq %rdx,%rax
+ xorq %rbx,%r14
+ addq %r15,%r12
+ movq %rcx,%r15
+
+ rorq $14,%r13
+ andq %rbx,%rax
+ andq %rdx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rax
+
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 1(%rdi),%rdi
+ addq %r14,%rax
+
+ movq 72(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 8(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 64(%rsp),%r12
+ movq %r8,%r13
+ addq %r14,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+ movq %r12,64(%rsp)
+
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %rax,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r8,%r15
+ movq %rbx,%r11
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ xorq %r10,%r15
+
+ xorq %rcx,%r11
+ xorq %rax,%r14
+ addq %r15,%r12
+ movq %rbx,%r15
+
+ rorq $14,%r13
+ andq %rax,%r11
+ andq %rcx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r11
+
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 1(%rdi),%rdi
+ addq %r14,%r11
+
+ movq 80(%rsp),%r13
+ movq 56(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 16(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 72(%rsp),%r12
+ movq %rdx,%r13
+ addq %r14,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%r15
+ movq %r12,72(%rsp)
+
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r11,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rdx,%r15
+ movq %rax,%r10
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r15
+
+ xorq %rbx,%r10
+ xorq %r11,%r14
+ addq %r15,%r12
+ movq %rax,%r15
+
+ rorq $14,%r13
+ andq %r11,%r10
+ andq %rbx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r10
+
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 1(%rdi),%rdi
+ addq %r14,%r10
+
+ movq 88(%rsp),%r13
+ movq 64(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 24(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 80(%rsp),%r12
+ movq %rcx,%r13
+ addq %r14,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+ movq %r12,80(%rsp)
+
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r10,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rcx,%r15
+ movq %r11,%r9
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r15
+
+ xorq %rax,%r9
+ xorq %r10,%r14
+ addq %r15,%r12
+ movq %r11,%r15
+
+ rorq $14,%r13
+ andq %r10,%r9
+ andq %rax,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r9
+
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 1(%rdi),%rdi
+ addq %r14,%r9
+
+ movq 96(%rsp),%r13
+ movq 72(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 32(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 88(%rsp),%r12
+ movq %rbx,%r13
+ addq %r14,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%r15
+ movq %r12,88(%rsp)
+
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %r9,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rbx,%r15
+ movq %r10,%r8
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r15
+
+ xorq %r11,%r8
+ xorq %r9,%r14
+ addq %r15,%r12
+ movq %r10,%r15
+
+ rorq $14,%r13
+ andq %r9,%r8
+ andq %r11,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%r8
+
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 1(%rdi),%rdi
+ addq %r14,%r8
+
+ movq 104(%rsp),%r13
+ movq 80(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 40(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 96(%rsp),%r12
+ movq %rax,%r13
+ addq %r14,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+ movq %r12,96(%rsp)
+
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %r8,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %rax,%r15
+ movq %r9,%rdx
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r15
+
+ xorq %r10,%rdx
+ xorq %r8,%r14
+ addq %r15,%r12
+ movq %r9,%r15
+
+ rorq $14,%r13
+ andq %r8,%rdx
+ andq %r10,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rdx
+
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rdx
+
+ movq 112(%rsp),%r13
+ movq 88(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 48(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 104(%rsp),%r12
+ movq %r11,%r13
+ addq %r14,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%r15
+ movq %r12,104(%rsp)
+
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rdx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r11,%r15
+ movq %r8,%rcx
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r15
+
+ xorq %r9,%rcx
+ xorq %rdx,%r14
+ addq %r15,%r12
+ movq %r8,%r15
+
+ rorq $14,%r13
+ andq %rdx,%rcx
+ andq %r9,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rcx
+
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rcx
+
+ movq 120(%rsp),%r13
+ movq 96(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 56(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 112(%rsp),%r12
+ movq %r10,%r13
+ addq %r14,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+ movq %r12,112(%rsp)
+
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rcx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r10,%r15
+ movq %rdx,%rbx
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ xorq %rax,%r15
+
+ xorq %r8,%rbx
+ xorq %rcx,%r14
+ addq %r15,%r12
+ movq %rdx,%r15
+
+ rorq $14,%r13
+ andq %rcx,%rbx
+ andq %r8,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rbx
+
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 1(%rdi),%rdi
+ addq %r14,%rbx
+
+ movq 0(%rsp),%r13
+ movq 104(%rsp),%r14
+ movq %r13,%r12
+ movq %r14,%r15
+
+ rorq $7,%r12
+ xorq %r13,%r12
+ shrq $7,%r13
+
+ rorq $1,%r12
+ xorq %r12,%r13
+ movq 64(%rsp),%r12
+
+ rorq $42,%r15
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ addq %r13,%r12
+ xorq %r15,%r14
+
+ addq 120(%rsp),%r12
+ movq %r9,%r13
+ addq %r14,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%r15
+ movq %r12,120(%rsp)
+
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %rbx,%r14
+
+ addq (%rbp,%rdi,8),%r12
+ andq %r9,%r15
+ movq %rcx,%rax
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ xorq %r11,%r15
+
+ xorq %rdx,%rax
+ xorq %rbx,%r14
+ addq %r15,%r12
+ movq %rcx,%r15
+
+ rorq $14,%r13
+ andq %rbx,%rax
+ andq %rdx,%r15
+
+ rorq $28,%r14
+ addq %r13,%r12
+ addq %r15,%rax
+
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 1(%rdi),%rdi
+ addq %r14,%rax
+
+ cmpq $80,%rdi
+ jb .Lrounds_16_xx
+
+ movq 128+0(%rsp),%rdi
+ leaq 128(%rsi),%rsi
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop
+
+ movq 128+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size sha512_block_data_order,.-sha512_block_data_order
+.align 64
+.type K512,@object
+K512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/main/openssl/crypto/sha/asm/sha512-x86_64.pl b/main/openssl/crypto/sha/asm/sha512-x86_64.pl
index e6643f8c..8d516785 100755
--- a/main/openssl/crypto/sha/asm/sha512-x86_64.pl
+++ b/main/openssl/crypto/sha/asm/sha512-x86_64.pl
@@ -51,7 +51,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
if ($output =~ /512/) {
$func="sha512_block_data_order";
@@ -95,50 +96,44 @@ sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- mov $e,$a0
- mov $e,$a1
+ ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $f,$a2
+ mov $T1,`$SZ*($i&0xf)`(%rsp)
- ror \$$Sigma1[0],$a0
- ror \$$Sigma1[1],$a1
+ ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
+ xor $e,$a0
xor $g,$a2 # f^g
- xor $a1,$a0
- ror \$`$Sigma1[2]-$Sigma1[1]`,$a1
+ ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
+ add $h,$T1 # T1+=h
+ xor $a,$a1
+
+ add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
and $e,$a2 # (f^g)&e
- mov $T1,`$SZ*($i&0xf)`(%rsp)
+ mov $b,$h
- xor $a1,$a0 # Sigma1(e)
+ ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
+ xor $e,$a0
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
- add $h,$T1 # T1+=h
-
- mov $a,$h
- add $a0,$T1 # T1+=Sigma1(e)
+ xor $c,$h # b^c
+ xor $a,$a1
add $a2,$T1 # T1+=Ch(e,f,g)
- mov $a,$a0
- mov $a,$a1
+ mov $b,$a2
- ror \$$Sigma0[0],$h
- ror \$$Sigma0[1],$a0
- mov $a,$a2
- add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
+ ror \$$Sigma1[0],$a0 # Sigma1(e)
+ and $a,$h # h=(b^c)&a
+ and $c,$a2 # b&c
- xor $a0,$h
- ror \$`$Sigma0[2]-$Sigma0[1]`,$a0
- or $c,$a1 # a|c
+ ror \$$Sigma0[0],$a1 # Sigma0(a)
+ add $a0,$T1 # T1+=Sigma1(e)
+	add	$a2,$h			# h+=b&c (completes +=Maj(a,b,c))
- xor $a0,$h # h=Sigma0(a)
- and $c,$a2 # a&c
add $T1,$d # d+=T1
-
- and $b,$a1 # (a|c)&b
add $T1,$h # h+=T1
-
- or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c)
lea 1($round),$round # round++
+ add $a1,$h # h+=Sigma0(a)
- add $a1,$h # h+=Maj(a,b,c)
___
}
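# (The rewritten round relies on the identity
#	Maj(a,b,c) = ((b^c)&a) ^ (b&c)
# where the two terms never share a set bit, so the xor can be carried
# out with "add" and folded into the h update.)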
@@ -147,32 +142,30 @@ sub ROUND_16_XX()
$code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
- mov `$SZ*(($i+14)&0xf)`(%rsp),$T1
-
- mov $a0,$a2
+ mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
+ mov $a0,$T1
+ mov $a1,$a2
+ ror \$`$sigma0[1]-$sigma0[0]`,$T1
+ xor $a0,$T1
shr \$$sigma0[2],$a0
- ror \$$sigma0[0],$a2
-
- xor $a2,$a0
- ror \$`$sigma0[1]-$sigma0[0]`,$a2
- xor $a2,$a0 # sigma0(X[(i+1)&0xf])
- mov $T1,$a1
+ ror \$$sigma0[0],$T1
+ xor $T1,$a0 # sigma0(X[(i+1)&0xf])
+ mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
- shr \$$sigma1[2],$T1
- ror \$$sigma1[0],$a1
-
- xor $a1,$T1
- ror \$`$sigma1[1]-$sigma1[0]`,$a1
-
- xor $a1,$T1 # sigma1(X[(i+14)&0xf])
+ ror \$`$sigma1[1]-$sigma1[0]`,$a2
+ xor $a1,$a2
+ shr \$$sigma1[2],$a1
+ ror \$$sigma1[0],$a2
add $a0,$T1
-
- add `$SZ*(($i+9)&0xf)`(%rsp),$T1
+ xor $a2,$a1 # sigma1(X[(i+14)&0xf])
add `$SZ*($i&0xf)`(%rsp),$T1
+ mov $e,$a0
+ add $a1,$T1
+ mov $a,$a1
___
&ROUND_00_15(@_);
}
@@ -219,6 +212,8 @@ $func:
___
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
+ $code.=" mov @ROT[4],$a0\n";
+ $code.=" mov @ROT[0],$a1\n";
$code.=" bswap $T1\n";
&ROUND_00_15($i,@ROT);
unshift(@ROT,pop(@ROT));
diff --git a/main/openssl/crypto/sha/sha.h b/main/openssl/crypto/sha/sha.h
index 16cacf9f..8a6bf4bb 100644
--- a/main/openssl/crypto/sha/sha.h
+++ b/main/openssl/crypto/sha/sha.h
@@ -106,6 +106,9 @@ typedef struct SHAstate_st
} SHA_CTX;
#ifndef OPENSSL_NO_SHA0
+#ifdef OPENSSL_FIPS
+int private_SHA_Init(SHA_CTX *c);
+#endif
int SHA_Init(SHA_CTX *c);
int SHA_Update(SHA_CTX *c, const void *data, size_t len);
int SHA_Final(unsigned char *md, SHA_CTX *c);
@@ -113,6 +116,9 @@ unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md);
void SHA_Transform(SHA_CTX *c, const unsigned char *data);
#endif
#ifndef OPENSSL_NO_SHA1
+#ifdef OPENSSL_FIPS
+int private_SHA1_Init(SHA_CTX *c);
+#endif
int SHA1_Init(SHA_CTX *c);
int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
int SHA1_Final(unsigned char *md, SHA_CTX *c);
@@ -135,6 +141,10 @@ typedef struct SHA256state_st
} SHA256_CTX;
#ifndef OPENSSL_NO_SHA256
+#ifdef OPENSSL_FIPS
+int private_SHA224_Init(SHA256_CTX *c);
+int private_SHA256_Init(SHA256_CTX *c);
+#endif
int SHA224_Init(SHA256_CTX *c);
int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
int SHA224_Final(unsigned char *md, SHA256_CTX *c);
@@ -182,6 +192,10 @@ typedef struct SHA512state_st
#endif
#ifndef OPENSSL_NO_SHA512
+#ifdef OPENSSL_FIPS
+int private_SHA384_Init(SHA512_CTX *c);
+int private_SHA512_Init(SHA512_CTX *c);
+#endif
int SHA384_Init(SHA512_CTX *c);
int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
int SHA384_Final(unsigned char *md, SHA512_CTX *c);
diff --git a/main/openssl/crypto/sha/sha1_one.c b/main/openssl/crypto/sha/sha1_one.c
index 7c65b602..c56ec940 100644
--- a/main/openssl/crypto/sha/sha1_one.c
+++ b/main/openssl/crypto/sha/sha1_one.c
@@ -58,8 +58,8 @@
#include <stdio.h>
#include <string.h>
-#include <openssl/sha.h>
#include <openssl/crypto.h>
+#include <openssl/sha.h>
#ifndef OPENSSL_NO_SHA1
unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
diff --git a/main/openssl/crypto/sha/sha1dgst.c b/main/openssl/crypto/sha/sha1dgst.c
index 50d1925c..a9869022 100644
--- a/main/openssl/crypto/sha/sha1dgst.c
+++ b/main/openssl/crypto/sha/sha1dgst.c
@@ -56,6 +56,7 @@
* [including the GNU Public Licence.]
*/
+#include <openssl/crypto.h>
#include <openssl/opensslconf.h>
#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
diff --git a/main/openssl/crypto/sha/sha256.c b/main/openssl/crypto/sha/sha256.c
index 8952d876..4eae0748 100644
--- a/main/openssl/crypto/sha/sha256.c
+++ b/main/openssl/crypto/sha/sha256.c
@@ -16,7 +16,7 @@
const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
-int SHA224_Init (SHA256_CTX *c)
+fips_md_init_ctx(SHA224, SHA256)
{
memset (c,0,sizeof(*c));
c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL;
@@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c)
return 1;
}
-int SHA256_Init (SHA256_CTX *c)
+fips_md_init(SHA256)
{
memset (c,0,sizeof(*c));
c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
@@ -88,17 +88,17 @@ int SHA224_Final (unsigned char *md, SHA256_CTX *c)
switch ((c)->md_len) \
{ case SHA224_DIGEST_LENGTH: \
for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
- { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
+ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
case SHA256_DIGEST_LENGTH: \
for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
- { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
+ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
default: \
if ((c)->md_len > SHA256_DIGEST_LENGTH) \
return 0; \
for (nn=0;nn<(c)->md_len/4;nn++) \
- { ll=(c)->h[nn]; HOST_l2c(ll,(s)); } \
+ { ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
} \
} while (0)
diff --git a/main/openssl/crypto/sha/sha512.c b/main/openssl/crypto/sha/sha512.c
index cbc0e58c..50c229dd 100644
--- a/main/openssl/crypto/sha/sha512.c
+++ b/main/openssl/crypto/sha/sha512.c
@@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
#endif
-int SHA384_Init (SHA512_CTX *c)
+fips_md_init_ctx(SHA384, SHA512)
{
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
- /* maintain dword order required by assembler module */
- unsigned int *h = (unsigned int *)c->h;
-
- h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
- h[2] = 0x629a292a; h[3] = 0x367cd507;
- h[4] = 0x9159015a; h[5] = 0x3070dd17;
- h[6] = 0x152fecd8; h[7] = 0xf70e5939;
- h[8] = 0x67332667; h[9] = 0xffc00b31;
- h[10] = 0x8eb44a87; h[11] = 0x68581511;
- h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
- h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
-#else
c->h[0]=U64(0xcbbb9d5dc1059ed8);
c->h[1]=U64(0x629a292a367cd507);
c->h[2]=U64(0x9159015a3070dd17);
@@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c)
c->h[5]=U64(0x8eb44a8768581511);
c->h[6]=U64(0xdb0c2e0d64f98fa7);
c->h[7]=U64(0x47b5481dbefa4fa4);
-#endif
+
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
return 1;
}
-int SHA512_Init (SHA512_CTX *c)
+fips_md_init(SHA512)
{
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
- /* maintain dword order required by assembler module */
- unsigned int *h = (unsigned int *)c->h;
-
- h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
- h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
- h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
- h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
- h[8] = 0x510e527f; h[9] = 0xade682d1;
- h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
- h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
- h[14] = 0x5be0cd19; h[15] = 0x137e2179;
-#else
c->h[0]=U64(0x6a09e667f3bcc908);
c->h[1]=U64(0xbb67ae8584caa73b);
c->h[2]=U64(0x3c6ef372fe94f82b);
@@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c)
c->h[5]=U64(0x9b05688c2b3e6c1f);
c->h[6]=U64(0x1f83d9abfb41bd6b);
c->h[7]=U64(0x5be0cd19137e2179);
-#endif
+
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
return 1;
@@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
if (md==0) return 0;
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
- /* recall assembler dword order... */
- n = c->md_len;
- if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
- {
- unsigned int *h = (unsigned int *)c->h, t;
-
- for (n/=4;n;n--)
- {
- t = *(h++);
- *(md++) = (unsigned char)(t>>24);
- *(md++) = (unsigned char)(t>>16);
- *(md++) = (unsigned char)(t>>8);
- *(md++) = (unsigned char)(t);
- }
- }
- else return 0;
-#else
switch (c->md_len)
{
/* Let compiler decide if it's appropriate to unroll... */
@@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
/* ... as well as make sure md_len is not abused. */
default: return 0;
}
-#endif
+
return 1;
}
@@ -276,7 +232,14 @@ int SHA384_Update (SHA512_CTX *c, const void *data, size_t len)
{ return SHA512_Update (c,data,len); }
void SHA512_Transform (SHA512_CTX *c, const unsigned char *data)
-{ sha512_block_data_order (c,data,1); }
+ {
+#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+ if ((size_t)data%sizeof(c->u.d[0]) != 0)
+ memcpy(c->u.p,data,sizeof(c->u.p)),
+ data = c->u.p;
+#endif
+ sha512_block_data_order (c,data,1);
+ }
unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
{
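
The new SHA512_Transform body above bounces misaligned input through the context's internal block buffer (c->u.p aliases c->u.d inside SHA512_CTX) before handing it to the assembler routine, which on strict-alignment targets may assume 8-byte-aligned loads. A caller-side sketch of the case being guarded; the buffer here is hypothetical:

unsigned char buf[SHA512_CBLOCK + 1];
SHA512_CTX c;

SHA512_Init(&c);
/* buf + 1 is misaligned for 64-bit loads; without the memcpy fallback
 * an asm sha512_block_data_order could fault here on, say, SPARC */
SHA512_Transform(&c, buf + 1);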
diff --git a/main/openssl/crypto/sha/sha_dgst.c b/main/openssl/crypto/sha/sha_dgst.c
index 70eb5603..fb63b17f 100644
--- a/main/openssl/crypto/sha/sha_dgst.c
+++ b/main/openssl/crypto/sha/sha_dgst.c
@@ -56,6 +56,7 @@
* [including the GNU Public Licence.]
*/
+#include <openssl/crypto.h>
#include <openssl/opensslconf.h>
#if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
diff --git a/main/openssl/crypto/sha/sha_locl.h b/main/openssl/crypto/sha/sha_locl.h
index 672c26ee..d673255f 100644
--- a/main/openssl/crypto/sha/sha_locl.h
+++ b/main/openssl/crypto/sha/sha_locl.h
@@ -69,11 +69,11 @@
#define HASH_CBLOCK SHA_CBLOCK
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
- ll=(c)->h0; HOST_l2c(ll,(s)); \
- ll=(c)->h1; HOST_l2c(ll,(s)); \
- ll=(c)->h2; HOST_l2c(ll,(s)); \
- ll=(c)->h3; HOST_l2c(ll,(s)); \
- ll=(c)->h4; HOST_l2c(ll,(s)); \
+ ll=(c)->h0; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->h1; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->h2; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->h3; (void)HOST_l2c(ll,(s)); \
+ ll=(c)->h4; (void)HOST_l2c(ll,(s)); \
} while (0)
#if defined(SHA_0)
@@ -122,7 +122,11 @@ void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
#define INIT_DATA_h3 0x10325476UL
#define INIT_DATA_h4 0xc3d2e1f0UL
-int HASH_INIT (SHA_CTX *c)
+#ifdef SHA_0
+fips_md_init(SHA)
+#else
+fips_md_init_ctx(SHA1, SHA)
+#endif
{
memset (c,0,sizeof(*c));
c->h0=INIT_DATA_h0;
@@ -252,21 +256,21 @@ static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
}
else
{
- HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l;
- BODY_00_15( 0,A,B,C,D,E,T,X( 0)); HOST_c2l(data,l); X( 2)=l;
- BODY_00_15( 1,T,A,B,C,D,E,X( 1)); HOST_c2l(data,l); X( 3)=l;
- BODY_00_15( 2,E,T,A,B,C,D,X( 2)); HOST_c2l(data,l); X( 4)=l;
- BODY_00_15( 3,D,E,T,A,B,C,X( 3)); HOST_c2l(data,l); X( 5)=l;
- BODY_00_15( 4,C,D,E,T,A,B,X( 4)); HOST_c2l(data,l); X( 6)=l;
- BODY_00_15( 5,B,C,D,E,T,A,X( 5)); HOST_c2l(data,l); X( 7)=l;
- BODY_00_15( 6,A,B,C,D,E,T,X( 6)); HOST_c2l(data,l); X( 8)=l;
- BODY_00_15( 7,T,A,B,C,D,E,X( 7)); HOST_c2l(data,l); X( 9)=l;
- BODY_00_15( 8,E,T,A,B,C,D,X( 8)); HOST_c2l(data,l); X(10)=l;
- BODY_00_15( 9,D,E,T,A,B,C,X( 9)); HOST_c2l(data,l); X(11)=l;
- BODY_00_15(10,C,D,E,T,A,B,X(10)); HOST_c2l(data,l); X(12)=l;
- BODY_00_15(11,B,C,D,E,T,A,X(11)); HOST_c2l(data,l); X(13)=l;
- BODY_00_15(12,A,B,C,D,E,T,X(12)); HOST_c2l(data,l); X(14)=l;
- BODY_00_15(13,T,A,B,C,D,E,X(13)); HOST_c2l(data,l); X(15)=l;
+ (void)HOST_c2l(data,l); X( 0)=l; (void)HOST_c2l(data,l); X( 1)=l;
+ BODY_00_15( 0,A,B,C,D,E,T,X( 0)); (void)HOST_c2l(data,l); X( 2)=l;
+ BODY_00_15( 1,T,A,B,C,D,E,X( 1)); (void)HOST_c2l(data,l); X( 3)=l;
+ BODY_00_15( 2,E,T,A,B,C,D,X( 2)); (void)HOST_c2l(data,l); X( 4)=l;
+ BODY_00_15( 3,D,E,T,A,B,C,X( 3)); (void)HOST_c2l(data,l); X( 5)=l;
+ BODY_00_15( 4,C,D,E,T,A,B,X( 4)); (void)HOST_c2l(data,l); X( 6)=l;
+ BODY_00_15( 5,B,C,D,E,T,A,X( 5)); (void)HOST_c2l(data,l); X( 7)=l;
+ BODY_00_15( 6,A,B,C,D,E,T,X( 6)); (void)HOST_c2l(data,l); X( 8)=l;
+ BODY_00_15( 7,T,A,B,C,D,E,X( 7)); (void)HOST_c2l(data,l); X( 9)=l;
+ BODY_00_15( 8,E,T,A,B,C,D,X( 8)); (void)HOST_c2l(data,l); X(10)=l;
+ BODY_00_15( 9,D,E,T,A,B,C,X( 9)); (void)HOST_c2l(data,l); X(11)=l;
+ BODY_00_15(10,C,D,E,T,A,B,X(10)); (void)HOST_c2l(data,l); X(12)=l;
+ BODY_00_15(11,B,C,D,E,T,A,X(11)); (void)HOST_c2l(data,l); X(13)=l;
+ BODY_00_15(12,A,B,C,D,E,T,X(12)); (void)HOST_c2l(data,l); X(14)=l;
+ BODY_00_15(13,T,A,B,C,D,E,X(13)); (void)HOST_c2l(data,l); X(15)=l;
BODY_00_15(14,E,T,A,B,C,D,X(14));
BODY_00_15(15,D,E,T,A,B,C,X(15));
}
diff --git a/main/openssl/crypto/sparccpuid.S b/main/openssl/crypto/sparccpuid.S
index ae61f7f5..c63d5da4 100644
--- a/main/openssl/crypto/sparccpuid.S
+++ b/main/openssl/crypto/sparccpuid.S
@@ -123,7 +123,7 @@ OPENSSL_wipe_cpu:
fmovs %f1,%f3
fmovs %f0,%f2
-	add	%fp,BIAS,%i0	! return pointer to caller's top of stack
+	add	%fp,BIAS,%i0	! return pointer to caller's top of stack
ret
restore
@@ -235,10 +235,10 @@ _sparcv9_rdtick:
.global _sparcv9_vis1_probe
.align 8
_sparcv9_vis1_probe:
- .word 0x81b00d80 !fxor %f0,%f0,%f0
add %sp,BIAS+2,%o1
- retl
.word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
+ retl
+ .word 0x81b00d80 !fxor %f0,%f0,%f0
.type _sparcv9_vis1_probe,#function
.size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
diff --git a/main/openssl/crypto/sparcv9cap.c b/main/openssl/crypto/sparcv9cap.c
index ed195ab4..43b3ac6f 100644
--- a/main/openssl/crypto/sparcv9cap.c
+++ b/main/openssl/crypto/sparcv9cap.c
@@ -19,7 +19,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
- if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
+ if (num>=8 && !(num&1) &&
+ (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
else
@@ -169,7 +170,6 @@ void OPENSSL_cpuid_setup(void)
char *e;
struct sigaction common_act,ill_oact,bus_oact;
sigset_t all_masked,oset;
- int sig;
static int trigger=0;
if (trigger) return;
diff --git a/main/openssl/crypto/srp/srp.h b/main/openssl/crypto/srp/srp.h
new file mode 100644
index 00000000..7ec7825c
--- /dev/null
+++ b/main/openssl/crypto/srp/srp.h
@@ -0,0 +1,172 @@
+/* crypto/srp/srp.h */
+/* Written by Christophe Renou (christophe.renou@edelweb.fr) with
+ * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr)
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+#ifndef __SRP_H__
+#define __SRP_H__
+
+#ifndef OPENSSL_NO_SRP
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/safestack.h>
+#include <openssl/bn.h>
+#include <openssl/crypto.h>
+
+typedef struct SRP_gN_cache_st
+ {
+ char *b64_bn;
+ BIGNUM *bn;
+ } SRP_gN_cache;
+
+
+DECLARE_STACK_OF(SRP_gN_cache)
+
+typedef struct SRP_user_pwd_st
+ {
+ char *id;
+ BIGNUM *s;
+ BIGNUM *v;
+ const BIGNUM *g;
+ const BIGNUM *N;
+ char *info;
+ } SRP_user_pwd;
+
+DECLARE_STACK_OF(SRP_user_pwd)
+
+typedef struct SRP_VBASE_st
+ {
+ STACK_OF(SRP_user_pwd) *users_pwd;
+ STACK_OF(SRP_gN_cache) *gN_cache;
+/* to simulate a user */
+ char *seed_key;
+ BIGNUM *default_g;
+ BIGNUM *default_N;
+ } SRP_VBASE;
+
+
+/* Internal structure for holding the (N, g) pairs */
+typedef struct SRP_gN_st
+ {
+ char *id;
+ BIGNUM *g;
+ BIGNUM *N;
+ } SRP_gN;
+
+DECLARE_STACK_OF(SRP_gN)
+
+SRP_VBASE *SRP_VBASE_new(char *seed_key);
+int SRP_VBASE_free(SRP_VBASE *vb);
+int SRP_VBASE_init(SRP_VBASE *vb, char * verifier_file);
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username);
+char *SRP_create_verifier(const char *user, const char *pass, char **salt,
+ char **verifier, const char *N, const char *g);
+int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g);
+
+
+#define SRP_NO_ERROR 0
+#define SRP_ERR_VBASE_INCOMPLETE_FILE 1
+#define SRP_ERR_VBASE_BN_LIB 2
+#define SRP_ERR_OPEN_FILE 3
+#define SRP_ERR_MEMORY 4
+
+#define DB_srptype 0
+#define DB_srpverifier 1
+#define DB_srpsalt 2
+#define DB_srpid 3
+#define DB_srpgN 4
+#define DB_srpinfo 5
+#undef DB_NUMBER
+#define DB_NUMBER 6
+
+#define DB_SRP_INDEX 'I'
+#define DB_SRP_VALID 'V'
+#define DB_SRP_REVOKED 'R'
+#define DB_SRP_MODIF 'v'
+
+
+/* see srp.c */
+char * SRP_check_known_gN_param(BIGNUM* g, BIGNUM* N);
+SRP_gN *SRP_get_default_gN(const char * id) ;
+
+/* server side .... */
+BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N);
+BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v);
+int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N);
+BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N) ;
+
+
+
+/* client side .... */
+BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass);
+BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g);
+BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u);
+int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N);
+
+#define SRP_MINIMAL_N 1024
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+#endif
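
A minimal server-side sketch of the verifier-database API declared above; the file name is hypothetical and error paths are trimmed:

#include <openssl/srp.h>

int user_known(const char *name)
	{
	SRP_VBASE *vb = SRP_VBASE_new(NULL);	/* NULL seed_key: unknown users yield NULL */
	SRP_user_pwd *p;
	int found = 0;

	if (vb == NULL)
		return -1;
	if (SRP_VBASE_init(vb, (char *)"passwd.srpv") != SRP_NO_ERROR)
		{
		SRP_VBASE_free(vb);
		return -1;
		}
	p = SRP_VBASE_get_by_user(vb, (char *)name);
	if (p != NULL)
		found = 1;	/* p->s, p->v, p->g and p->N are what SRP_Calc_B() needs */
	SRP_VBASE_free(vb);	/* frees the entries it owns, p included */
	return found;
	}

SRP_VBASE_get_by_user returns a pointer owned by the SRP_VBASE, so the entry has to be consumed before SRP_VBASE_free.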
diff --git a/main/openssl/crypto/srp/srp_grps.h b/main/openssl/crypto/srp/srp_grps.h
new file mode 100644
index 00000000..8e3c35e3
--- /dev/null
+++ b/main/openssl/crypto/srp/srp_grps.h
@@ -0,0 +1,517 @@
+/* start of generated data */
+
+static BN_ULONG bn_group_1024_value[] = {
+ bn_pack4(0x9FC6,0x1D2F,0xC0EB,0x06E3),
+ bn_pack4(0xFD51,0x38FE,0x8376,0x435B),
+ bn_pack4(0x2FD4,0xCBF4,0x976E,0xAA9A),
+ bn_pack4(0x68ED,0xBC3C,0x0572,0x6CC0),
+ bn_pack4(0xC529,0xF566,0x660E,0x57EC),
+ bn_pack4(0x8255,0x9B29,0x7BCF,0x1885),
+ bn_pack4(0xCE8E,0xF4AD,0x69B1,0x5D49),
+ bn_pack4(0x5DC7,0xD7B4,0x6154,0xD6B6),
+ bn_pack4(0x8E49,0x5C1D,0x6089,0xDAD1),
+ bn_pack4(0xE0D5,0xD8E2,0x50B9,0x8BE4),
+ bn_pack4(0x383B,0x4813,0xD692,0xC6E0),
+ bn_pack4(0xD674,0xDF74,0x96EA,0x81D3),
+ bn_pack4(0x9EA2,0x314C,0x9C25,0x6576),
+ bn_pack4(0x6072,0x6187,0x75FF,0x3C0B),
+ bn_pack4(0x9C33,0xF80A,0xFA8F,0xC5E8),
+ bn_pack4(0xEEAF,0x0AB9,0xADB3,0x8DD6)
+};
+static BIGNUM bn_group_1024 = {
+ bn_group_1024_value,
+ (sizeof bn_group_1024_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_1024_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_1536_value[] = {
+ bn_pack4(0xCF76,0xE3FE,0xD135,0xF9BB),
+ bn_pack4(0x1518,0x0F93,0x499A,0x234D),
+ bn_pack4(0x8CE7,0xA28C,0x2442,0xC6F3),
+ bn_pack4(0x5A02,0x1FFF,0x5E91,0x479E),
+ bn_pack4(0x7F8A,0x2FE9,0xB8B5,0x292E),
+ bn_pack4(0x837C,0x264A,0xE3A9,0xBEB8),
+ bn_pack4(0xE442,0x734A,0xF7CC,0xB7AE),
+ bn_pack4(0x6577,0x2E43,0x7D6C,0x7F8C),
+ bn_pack4(0xDB2F,0xD53D,0x24B7,0xC486),
+ bn_pack4(0x6EDF,0x0195,0x3934,0x9627),
+ bn_pack4(0x158B,0xFD3E,0x2B9C,0x8CF5),
+ bn_pack4(0x764E,0x3F4B,0x53DD,0x9DA1),
+ bn_pack4(0x4754,0x8381,0xDBC5,0xB1FC),
+ bn_pack4(0x9B60,0x9E0B,0xE3BA,0xB63D),
+ bn_pack4(0x8134,0xB1C8,0xB979,0x8914),
+ bn_pack4(0xDF02,0x8A7C,0xEC67,0xF0D0),
+ bn_pack4(0x80B6,0x55BB,0x9A22,0xE8DC),
+ bn_pack4(0x1558,0x903B,0xA0D0,0xF843),
+ bn_pack4(0x51C6,0xA94B,0xE460,0x7A29),
+ bn_pack4(0x5F4F,0x5F55,0x6E27,0xCBDE),
+ bn_pack4(0xBEEE,0xA961,0x4B19,0xCC4D),
+ bn_pack4(0xDBA5,0x1DF4,0x99AC,0x4C80),
+ bn_pack4(0xB1F1,0x2A86,0x17A4,0x7BBB),
+ bn_pack4(0x9DEF,0x3CAF,0xB939,0x277A)
+};
+static BIGNUM bn_group_1536 = {
+ bn_group_1536_value,
+ (sizeof bn_group_1536_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_1536_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_2048_value[] = {
+ bn_pack4(0x0FA7,0x111F,0x9E4A,0xFF73),
+ bn_pack4(0x9B65,0xE372,0xFCD6,0x8EF2),
+ bn_pack4(0x35DE,0x236D,0x525F,0x5475),
+ bn_pack4(0x94B5,0xC803,0xD89F,0x7AE4),
+ bn_pack4(0x71AE,0x35F8,0xE9DB,0xFBB6),
+ bn_pack4(0x2A56,0x98F3,0xA8D0,0xC382),
+ bn_pack4(0x9CCC,0x041C,0x7BC3,0x08D8),
+ bn_pack4(0xAF87,0x4E73,0x03CE,0x5329),
+ bn_pack4(0x6160,0x2790,0x04E5,0x7AE6),
+ bn_pack4(0x032C,0xFBDB,0xF52F,0xB378),
+ bn_pack4(0x5EA7,0x7A27,0x75D2,0xECFA),
+ bn_pack4(0x5445,0x23B5,0x24B0,0xD57D),
+ bn_pack4(0x5B9D,0x32E6,0x88F8,0x7748),
+ bn_pack4(0xF1D2,0xB907,0x8717,0x461A),
+ bn_pack4(0x76BD,0x207A,0x436C,0x6481),
+ bn_pack4(0xCA97,0xB43A,0x23FB,0x8016),
+ bn_pack4(0x1D28,0x1E44,0x6B14,0x773B),
+ bn_pack4(0x7359,0xD041,0xD5C3,0x3EA7),
+ bn_pack4(0xA80D,0x740A,0xDBF4,0xFF74),
+ bn_pack4(0x55F9,0x7993,0xEC97,0x5EEA),
+ bn_pack4(0x2918,0xA996,0x2F0B,0x93B8),
+ bn_pack4(0x661A,0x05FB,0xD5FA,0xAAE8),
+ bn_pack4(0xCF60,0x9517,0x9A16,0x3AB3),
+ bn_pack4(0xE808,0x3969,0xEDB7,0x67B0),
+ bn_pack4(0xCD7F,0x48A9,0xDA04,0xFD50),
+ bn_pack4(0xD523,0x12AB,0x4B03,0x310D),
+ bn_pack4(0x8193,0xE075,0x7767,0xA13D),
+ bn_pack4(0xA373,0x29CB,0xB4A0,0x99ED),
+ bn_pack4(0xFC31,0x9294,0x3DB5,0x6050),
+ bn_pack4(0xAF72,0xB665,0x1987,0xEE07),
+ bn_pack4(0xF166,0xDE5E,0x1389,0x582F),
+ bn_pack4(0xAC6B,0xDB41,0x324A,0x9A9B)
+};
+static BIGNUM bn_group_2048 = {
+ bn_group_2048_value,
+ (sizeof bn_group_2048_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_2048_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_3072_value[] = {
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF),
+ bn_pack4(0x4B82,0xD120,0xA93A,0xD2CA),
+ bn_pack4(0x43DB,0x5BFC,0xE0FD,0x108E),
+ bn_pack4(0x08E2,0x4FA0,0x74E5,0xAB31),
+ bn_pack4(0x7709,0x88C0,0xBAD9,0x46E2),
+ bn_pack4(0xBBE1,0x1757,0x7A61,0x5D6C),
+ bn_pack4(0x521F,0x2B18,0x177B,0x200C),
+ bn_pack4(0xD876,0x0273,0x3EC8,0x6A64),
+ bn_pack4(0xF12F,0xFA06,0xD98A,0x0864),
+ bn_pack4(0xCEE3,0xD226,0x1AD2,0xEE6B),
+ bn_pack4(0x1E8C,0x94E0,0x4A25,0x619D),
+ bn_pack4(0xABF5,0xAE8C,0xDB09,0x33D7),
+ bn_pack4(0xB397,0x0F85,0xA6E1,0xE4C7),
+ bn_pack4(0x8AEA,0x7157,0x5D06,0x0C7D),
+ bn_pack4(0xECFB,0x8504,0x58DB,0xEF0A),
+ bn_pack4(0xA855,0x21AB,0xDF1C,0xBA64),
+ bn_pack4(0xAD33,0x170D,0x0450,0x7A33),
+ bn_pack4(0x1572,0x8E5A,0x8AAA,0xC42D),
+ bn_pack4(0x15D2,0x2618,0x98FA,0x0510),
+ bn_pack4(0x3995,0x497C,0xEA95,0x6AE5),
+ bn_pack4(0xDE2B,0xCBF6,0x9558,0x1718),
+ bn_pack4(0xB5C5,0x5DF0,0x6F4C,0x52C9),
+ bn_pack4(0x9B27,0x83A2,0xEC07,0xA28F),
+ bn_pack4(0xE39E,0x772C,0x180E,0x8603),
+ bn_pack4(0x3290,0x5E46,0x2E36,0xCE3B),
+ bn_pack4(0xF174,0x6C08,0xCA18,0x217C),
+ bn_pack4(0x670C,0x354E,0x4ABC,0x9804),
+ bn_pack4(0x9ED5,0x2907,0x7096,0x966D),
+ bn_pack4(0x1C62,0xF356,0x2085,0x52BB),
+ bn_pack4(0x8365,0x5D23,0xDCA3,0xAD96),
+ bn_pack4(0x6916,0x3FA8,0xFD24,0xCF5F),
+ bn_pack4(0x98DA,0x4836,0x1C55,0xD39A),
+ bn_pack4(0xC200,0x7CB8,0xA163,0xBF05),
+ bn_pack4(0x4928,0x6651,0xECE4,0x5B3D),
+ bn_pack4(0xAE9F,0x2411,0x7C4B,0x1FE6),
+ bn_pack4(0xEE38,0x6BFB,0x5A89,0x9FA5),
+ bn_pack4(0x0BFF,0x5CB6,0xF406,0xB7ED),
+ bn_pack4(0xF44C,0x42E9,0xA637,0xED6B),
+ bn_pack4(0xE485,0xB576,0x625E,0x7EC6),
+ bn_pack4(0x4FE1,0x356D,0x6D51,0xC245),
+ bn_pack4(0x302B,0x0A6D,0xF25F,0x1437),
+ bn_pack4(0xEF95,0x19B3,0xCD3A,0x431B),
+ bn_pack4(0x514A,0x0879,0x8E34,0x04DD),
+ bn_pack4(0x020B,0xBEA6,0x3B13,0x9B22),
+ bn_pack4(0x2902,0x4E08,0x8A67,0xCC74),
+ bn_pack4(0xC4C6,0x628B,0x80DC,0x1CD1),
+ bn_pack4(0xC90F,0xDAA2,0x2168,0xC234),
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF)
+};
+static BIGNUM bn_group_3072 = {
+ bn_group_3072_value,
+ (sizeof bn_group_3072_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_3072_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_4096_value[] = {
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF),
+ bn_pack4(0x4DF4,0x35C9,0x3406,0x3199),
+ bn_pack4(0x86FF,0xB7DC,0x90A6,0xC08F),
+ bn_pack4(0x93B4,0xEA98,0x8D8F,0xDDC1),
+ bn_pack4(0xD006,0x9127,0xD5B0,0x5AA9),
+ bn_pack4(0xB81B,0xDD76,0x2170,0x481C),
+ bn_pack4(0x1F61,0x2970,0xCEE2,0xD7AF),
+ bn_pack4(0x233B,0xA186,0x515B,0xE7ED),
+ bn_pack4(0x99B2,0x964F,0xA090,0xC3A2),
+ bn_pack4(0x287C,0x5947,0x4E6B,0xC05D),
+ bn_pack4(0x2E8E,0xFC14,0x1FBE,0xCAA6),
+ bn_pack4(0xDBBB,0xC2DB,0x04DE,0x8EF9),
+ bn_pack4(0x2583,0xE9CA,0x2AD4,0x4CE8),
+ bn_pack4(0x1A94,0x6834,0xB615,0x0BDA),
+ bn_pack4(0x99C3,0x2718,0x6AF4,0xE23C),
+ bn_pack4(0x8871,0x9A10,0xBDBA,0x5B26),
+ bn_pack4(0x1A72,0x3C12,0xA787,0xE6D7),
+ bn_pack4(0x4B82,0xD120,0xA921,0x0801),
+ bn_pack4(0x43DB,0x5BFC,0xE0FD,0x108E),
+ bn_pack4(0x08E2,0x4FA0,0x74E5,0xAB31),
+ bn_pack4(0x7709,0x88C0,0xBAD9,0x46E2),
+ bn_pack4(0xBBE1,0x1757,0x7A61,0x5D6C),
+ bn_pack4(0x521F,0x2B18,0x177B,0x200C),
+ bn_pack4(0xD876,0x0273,0x3EC8,0x6A64),
+ bn_pack4(0xF12F,0xFA06,0xD98A,0x0864),
+ bn_pack4(0xCEE3,0xD226,0x1AD2,0xEE6B),
+ bn_pack4(0x1E8C,0x94E0,0x4A25,0x619D),
+ bn_pack4(0xABF5,0xAE8C,0xDB09,0x33D7),
+ bn_pack4(0xB397,0x0F85,0xA6E1,0xE4C7),
+ bn_pack4(0x8AEA,0x7157,0x5D06,0x0C7D),
+ bn_pack4(0xECFB,0x8504,0x58DB,0xEF0A),
+ bn_pack4(0xA855,0x21AB,0xDF1C,0xBA64),
+ bn_pack4(0xAD33,0x170D,0x0450,0x7A33),
+ bn_pack4(0x1572,0x8E5A,0x8AAA,0xC42D),
+ bn_pack4(0x15D2,0x2618,0x98FA,0x0510),
+ bn_pack4(0x3995,0x497C,0xEA95,0x6AE5),
+ bn_pack4(0xDE2B,0xCBF6,0x9558,0x1718),
+ bn_pack4(0xB5C5,0x5DF0,0x6F4C,0x52C9),
+ bn_pack4(0x9B27,0x83A2,0xEC07,0xA28F),
+ bn_pack4(0xE39E,0x772C,0x180E,0x8603),
+ bn_pack4(0x3290,0x5E46,0x2E36,0xCE3B),
+ bn_pack4(0xF174,0x6C08,0xCA18,0x217C),
+ bn_pack4(0x670C,0x354E,0x4ABC,0x9804),
+ bn_pack4(0x9ED5,0x2907,0x7096,0x966D),
+ bn_pack4(0x1C62,0xF356,0x2085,0x52BB),
+ bn_pack4(0x8365,0x5D23,0xDCA3,0xAD96),
+ bn_pack4(0x6916,0x3FA8,0xFD24,0xCF5F),
+ bn_pack4(0x98DA,0x4836,0x1C55,0xD39A),
+ bn_pack4(0xC200,0x7CB8,0xA163,0xBF05),
+ bn_pack4(0x4928,0x6651,0xECE4,0x5B3D),
+ bn_pack4(0xAE9F,0x2411,0x7C4B,0x1FE6),
+ bn_pack4(0xEE38,0x6BFB,0x5A89,0x9FA5),
+ bn_pack4(0x0BFF,0x5CB6,0xF406,0xB7ED),
+ bn_pack4(0xF44C,0x42E9,0xA637,0xED6B),
+ bn_pack4(0xE485,0xB576,0x625E,0x7EC6),
+ bn_pack4(0x4FE1,0x356D,0x6D51,0xC245),
+ bn_pack4(0x302B,0x0A6D,0xF25F,0x1437),
+ bn_pack4(0xEF95,0x19B3,0xCD3A,0x431B),
+ bn_pack4(0x514A,0x0879,0x8E34,0x04DD),
+ bn_pack4(0x020B,0xBEA6,0x3B13,0x9B22),
+ bn_pack4(0x2902,0x4E08,0x8A67,0xCC74),
+ bn_pack4(0xC4C6,0x628B,0x80DC,0x1CD1),
+ bn_pack4(0xC90F,0xDAA2,0x2168,0xC234),
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF)
+};
+static BIGNUM bn_group_4096 = {
+ bn_group_4096_value,
+ (sizeof bn_group_4096_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_4096_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_6144_value[] = {
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF),
+ bn_pack4(0xE694,0xF91E,0x6DCC,0x4024),
+ bn_pack4(0x12BF,0x2D5B,0x0B74,0x74D6),
+ bn_pack4(0x043E,0x8F66,0x3F48,0x60EE),
+ bn_pack4(0x387F,0xE8D7,0x6E3C,0x0468),
+ bn_pack4(0xDA56,0xC9EC,0x2EF2,0x9632),
+ bn_pack4(0xEB19,0xCCB1,0xA313,0xD55C),
+ bn_pack4(0xF550,0xAA3D,0x8A1F,0xBFF0),
+ bn_pack4(0x06A1,0xD58B,0xB7C5,0xDA76),
+ bn_pack4(0xA797,0x15EE,0xF29B,0xE328),
+ bn_pack4(0x14CC,0x5ED2,0x0F80,0x37E0),
+ bn_pack4(0xCC8F,0x6D7E,0xBF48,0xE1D8),
+ bn_pack4(0x4BD4,0x07B2,0x2B41,0x54AA),
+ bn_pack4(0x0F1D,0x45B7,0xFF58,0x5AC5),
+ bn_pack4(0x23A9,0x7A7E,0x36CC,0x88BE),
+ bn_pack4(0x59E7,0xC97F,0xBEC7,0xE8F3),
+ bn_pack4(0xB5A8,0x4031,0x900B,0x1C9E),
+ bn_pack4(0xD55E,0x702F,0x4698,0x0C82),
+ bn_pack4(0xF482,0xD7CE,0x6E74,0xFEF6),
+ bn_pack4(0xF032,0xEA15,0xD172,0x1D03),
+ bn_pack4(0x5983,0xCA01,0xC64B,0x92EC),
+ bn_pack4(0x6FB8,0xF401,0x378C,0xD2BF),
+ bn_pack4(0x3320,0x5151,0x2BD7,0xAF42),
+ bn_pack4(0xDB7F,0x1447,0xE6CC,0x254B),
+ bn_pack4(0x44CE,0x6CBA,0xCED4,0xBB1B),
+ bn_pack4(0xDA3E,0xDBEB,0xCF9B,0x14ED),
+ bn_pack4(0x1797,0x27B0,0x865A,0x8918),
+ bn_pack4(0xB06A,0x53ED,0x9027,0xD831),
+ bn_pack4(0xE5DB,0x382F,0x4130,0x01AE),
+ bn_pack4(0xF8FF,0x9406,0xAD9E,0x530E),
+ bn_pack4(0xC975,0x1E76,0x3DBA,0x37BD),
+ bn_pack4(0xC1D4,0xDCB2,0x6026,0x46DE),
+ bn_pack4(0x36C3,0xFAB4,0xD27C,0x7026),
+ bn_pack4(0x4DF4,0x35C9,0x3402,0x8492),
+ bn_pack4(0x86FF,0xB7DC,0x90A6,0xC08F),
+ bn_pack4(0x93B4,0xEA98,0x8D8F,0xDDC1),
+ bn_pack4(0xD006,0x9127,0xD5B0,0x5AA9),
+ bn_pack4(0xB81B,0xDD76,0x2170,0x481C),
+ bn_pack4(0x1F61,0x2970,0xCEE2,0xD7AF),
+ bn_pack4(0x233B,0xA186,0x515B,0xE7ED),
+ bn_pack4(0x99B2,0x964F,0xA090,0xC3A2),
+ bn_pack4(0x287C,0x5947,0x4E6B,0xC05D),
+ bn_pack4(0x2E8E,0xFC14,0x1FBE,0xCAA6),
+ bn_pack4(0xDBBB,0xC2DB,0x04DE,0x8EF9),
+ bn_pack4(0x2583,0xE9CA,0x2AD4,0x4CE8),
+ bn_pack4(0x1A94,0x6834,0xB615,0x0BDA),
+ bn_pack4(0x99C3,0x2718,0x6AF4,0xE23C),
+ bn_pack4(0x8871,0x9A10,0xBDBA,0x5B26),
+ bn_pack4(0x1A72,0x3C12,0xA787,0xE6D7),
+ bn_pack4(0x4B82,0xD120,0xA921,0x0801),
+ bn_pack4(0x43DB,0x5BFC,0xE0FD,0x108E),
+ bn_pack4(0x08E2,0x4FA0,0x74E5,0xAB31),
+ bn_pack4(0x7709,0x88C0,0xBAD9,0x46E2),
+ bn_pack4(0xBBE1,0x1757,0x7A61,0x5D6C),
+ bn_pack4(0x521F,0x2B18,0x177B,0x200C),
+ bn_pack4(0xD876,0x0273,0x3EC8,0x6A64),
+ bn_pack4(0xF12F,0xFA06,0xD98A,0x0864),
+ bn_pack4(0xCEE3,0xD226,0x1AD2,0xEE6B),
+ bn_pack4(0x1E8C,0x94E0,0x4A25,0x619D),
+ bn_pack4(0xABF5,0xAE8C,0xDB09,0x33D7),
+ bn_pack4(0xB397,0x0F85,0xA6E1,0xE4C7),
+ bn_pack4(0x8AEA,0x7157,0x5D06,0x0C7D),
+ bn_pack4(0xECFB,0x8504,0x58DB,0xEF0A),
+ bn_pack4(0xA855,0x21AB,0xDF1C,0xBA64),
+ bn_pack4(0xAD33,0x170D,0x0450,0x7A33),
+ bn_pack4(0x1572,0x8E5A,0x8AAA,0xC42D),
+ bn_pack4(0x15D2,0x2618,0x98FA,0x0510),
+ bn_pack4(0x3995,0x497C,0xEA95,0x6AE5),
+ bn_pack4(0xDE2B,0xCBF6,0x9558,0x1718),
+ bn_pack4(0xB5C5,0x5DF0,0x6F4C,0x52C9),
+ bn_pack4(0x9B27,0x83A2,0xEC07,0xA28F),
+ bn_pack4(0xE39E,0x772C,0x180E,0x8603),
+ bn_pack4(0x3290,0x5E46,0x2E36,0xCE3B),
+ bn_pack4(0xF174,0x6C08,0xCA18,0x217C),
+ bn_pack4(0x670C,0x354E,0x4ABC,0x9804),
+ bn_pack4(0x9ED5,0x2907,0x7096,0x966D),
+ bn_pack4(0x1C62,0xF356,0x2085,0x52BB),
+ bn_pack4(0x8365,0x5D23,0xDCA3,0xAD96),
+ bn_pack4(0x6916,0x3FA8,0xFD24,0xCF5F),
+ bn_pack4(0x98DA,0x4836,0x1C55,0xD39A),
+ bn_pack4(0xC200,0x7CB8,0xA163,0xBF05),
+ bn_pack4(0x4928,0x6651,0xECE4,0x5B3D),
+ bn_pack4(0xAE9F,0x2411,0x7C4B,0x1FE6),
+ bn_pack4(0xEE38,0x6BFB,0x5A89,0x9FA5),
+ bn_pack4(0x0BFF,0x5CB6,0xF406,0xB7ED),
+ bn_pack4(0xF44C,0x42E9,0xA637,0xED6B),
+ bn_pack4(0xE485,0xB576,0x625E,0x7EC6),
+ bn_pack4(0x4FE1,0x356D,0x6D51,0xC245),
+ bn_pack4(0x302B,0x0A6D,0xF25F,0x1437),
+ bn_pack4(0xEF95,0x19B3,0xCD3A,0x431B),
+ bn_pack4(0x514A,0x0879,0x8E34,0x04DD),
+ bn_pack4(0x020B,0xBEA6,0x3B13,0x9B22),
+ bn_pack4(0x2902,0x4E08,0x8A67,0xCC74),
+ bn_pack4(0xC4C6,0x628B,0x80DC,0x1CD1),
+ bn_pack4(0xC90F,0xDAA2,0x2168,0xC234),
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF)
+};
+static BIGNUM bn_group_6144 = {
+ bn_group_6144_value,
+ (sizeof bn_group_6144_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_6144_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_group_8192_value[] = {
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF),
+ bn_pack4(0x60C9,0x80DD,0x98ED,0xD3DF),
+ bn_pack4(0xC81F,0x56E8,0x80B9,0x6E71),
+ bn_pack4(0x9E30,0x50E2,0x7656,0x94DF),
+ bn_pack4(0x9558,0xE447,0x5677,0xE9AA),
+ bn_pack4(0xC919,0x0DA6,0xFC02,0x6E47),
+ bn_pack4(0x889A,0x002E,0xD5EE,0x382B),
+ bn_pack4(0x4009,0x438B,0x481C,0x6CD7),
+ bn_pack4(0x3590,0x46F4,0xEB87,0x9F92),
+ bn_pack4(0xFAF3,0x6BC3,0x1ECF,0xA268),
+ bn_pack4(0xB1D5,0x10BD,0x7EE7,0x4D73),
+ bn_pack4(0xF9AB,0x4819,0x5DED,0x7EA1),
+ bn_pack4(0x64F3,0x1CC5,0x0846,0x851D),
+ bn_pack4(0x4597,0xE899,0xA025,0x5DC1),
+ bn_pack4(0xDF31,0x0EE0,0x74AB,0x6A36),
+ bn_pack4(0x6D2A,0x13F8,0x3F44,0xF82D),
+ bn_pack4(0x062B,0x3CF5,0xB3A2,0x78A6),
+ bn_pack4(0x7968,0x3303,0xED5B,0xDD3A),
+ bn_pack4(0xFA9D,0x4B7F,0xA2C0,0x87E8),
+ bn_pack4(0x4BCB,0xC886,0x2F83,0x85DD),
+ bn_pack4(0x3473,0xFC64,0x6CEA,0x306B),
+ bn_pack4(0x13EB,0x57A8,0x1A23,0xF0C7),
+ bn_pack4(0x2222,0x2E04,0xA403,0x7C07),
+ bn_pack4(0xE3FD,0xB8BE,0xFC84,0x8AD9),
+ bn_pack4(0x238F,0x16CB,0xE39D,0x652D),
+ bn_pack4(0x3423,0xB474,0x2BF1,0xC978),
+ bn_pack4(0x3AAB,0x639C,0x5AE4,0xF568),
+ bn_pack4(0x2576,0xF693,0x6BA4,0x2466),
+ bn_pack4(0x741F,0xA7BF,0x8AFC,0x47ED),
+ bn_pack4(0x3BC8,0x32B6,0x8D9D,0xD300),
+ bn_pack4(0xD8BE,0xC4D0,0x73B9,0x31BA),
+ bn_pack4(0x3877,0x7CB6,0xA932,0xDF8C),
+ bn_pack4(0x74A3,0x926F,0x12FE,0xE5E4),
+ bn_pack4(0xE694,0xF91E,0x6DBE,0x1159),
+ bn_pack4(0x12BF,0x2D5B,0x0B74,0x74D6),
+ bn_pack4(0x043E,0x8F66,0x3F48,0x60EE),
+ bn_pack4(0x387F,0xE8D7,0x6E3C,0x0468),
+ bn_pack4(0xDA56,0xC9EC,0x2EF2,0x9632),
+ bn_pack4(0xEB19,0xCCB1,0xA313,0xD55C),
+ bn_pack4(0xF550,0xAA3D,0x8A1F,0xBFF0),
+ bn_pack4(0x06A1,0xD58B,0xB7C5,0xDA76),
+ bn_pack4(0xA797,0x15EE,0xF29B,0xE328),
+ bn_pack4(0x14CC,0x5ED2,0x0F80,0x37E0),
+ bn_pack4(0xCC8F,0x6D7E,0xBF48,0xE1D8),
+ bn_pack4(0x4BD4,0x07B2,0x2B41,0x54AA),
+ bn_pack4(0x0F1D,0x45B7,0xFF58,0x5AC5),
+ bn_pack4(0x23A9,0x7A7E,0x36CC,0x88BE),
+ bn_pack4(0x59E7,0xC97F,0xBEC7,0xE8F3),
+ bn_pack4(0xB5A8,0x4031,0x900B,0x1C9E),
+ bn_pack4(0xD55E,0x702F,0x4698,0x0C82),
+ bn_pack4(0xF482,0xD7CE,0x6E74,0xFEF6),
+ bn_pack4(0xF032,0xEA15,0xD172,0x1D03),
+ bn_pack4(0x5983,0xCA01,0xC64B,0x92EC),
+ bn_pack4(0x6FB8,0xF401,0x378C,0xD2BF),
+ bn_pack4(0x3320,0x5151,0x2BD7,0xAF42),
+ bn_pack4(0xDB7F,0x1447,0xE6CC,0x254B),
+ bn_pack4(0x44CE,0x6CBA,0xCED4,0xBB1B),
+ bn_pack4(0xDA3E,0xDBEB,0xCF9B,0x14ED),
+ bn_pack4(0x1797,0x27B0,0x865A,0x8918),
+ bn_pack4(0xB06A,0x53ED,0x9027,0xD831),
+ bn_pack4(0xE5DB,0x382F,0x4130,0x01AE),
+ bn_pack4(0xF8FF,0x9406,0xAD9E,0x530E),
+ bn_pack4(0xC975,0x1E76,0x3DBA,0x37BD),
+ bn_pack4(0xC1D4,0xDCB2,0x6026,0x46DE),
+ bn_pack4(0x36C3,0xFAB4,0xD27C,0x7026),
+ bn_pack4(0x4DF4,0x35C9,0x3402,0x8492),
+ bn_pack4(0x86FF,0xB7DC,0x90A6,0xC08F),
+ bn_pack4(0x93B4,0xEA98,0x8D8F,0xDDC1),
+ bn_pack4(0xD006,0x9127,0xD5B0,0x5AA9),
+ bn_pack4(0xB81B,0xDD76,0x2170,0x481C),
+ bn_pack4(0x1F61,0x2970,0xCEE2,0xD7AF),
+ bn_pack4(0x233B,0xA186,0x515B,0xE7ED),
+ bn_pack4(0x99B2,0x964F,0xA090,0xC3A2),
+ bn_pack4(0x287C,0x5947,0x4E6B,0xC05D),
+ bn_pack4(0x2E8E,0xFC14,0x1FBE,0xCAA6),
+ bn_pack4(0xDBBB,0xC2DB,0x04DE,0x8EF9),
+ bn_pack4(0x2583,0xE9CA,0x2AD4,0x4CE8),
+ bn_pack4(0x1A94,0x6834,0xB615,0x0BDA),
+ bn_pack4(0x99C3,0x2718,0x6AF4,0xE23C),
+ bn_pack4(0x8871,0x9A10,0xBDBA,0x5B26),
+ bn_pack4(0x1A72,0x3C12,0xA787,0xE6D7),
+ bn_pack4(0x4B82,0xD120,0xA921,0x0801),
+ bn_pack4(0x43DB,0x5BFC,0xE0FD,0x108E),
+ bn_pack4(0x08E2,0x4FA0,0x74E5,0xAB31),
+ bn_pack4(0x7709,0x88C0,0xBAD9,0x46E2),
+ bn_pack4(0xBBE1,0x1757,0x7A61,0x5D6C),
+ bn_pack4(0x521F,0x2B18,0x177B,0x200C),
+ bn_pack4(0xD876,0x0273,0x3EC8,0x6A64),
+ bn_pack4(0xF12F,0xFA06,0xD98A,0x0864),
+ bn_pack4(0xCEE3,0xD226,0x1AD2,0xEE6B),
+ bn_pack4(0x1E8C,0x94E0,0x4A25,0x619D),
+ bn_pack4(0xABF5,0xAE8C,0xDB09,0x33D7),
+ bn_pack4(0xB397,0x0F85,0xA6E1,0xE4C7),
+ bn_pack4(0x8AEA,0x7157,0x5D06,0x0C7D),
+ bn_pack4(0xECFB,0x8504,0x58DB,0xEF0A),
+ bn_pack4(0xA855,0x21AB,0xDF1C,0xBA64),
+ bn_pack4(0xAD33,0x170D,0x0450,0x7A33),
+ bn_pack4(0x1572,0x8E5A,0x8AAA,0xC42D),
+ bn_pack4(0x15D2,0x2618,0x98FA,0x0510),
+ bn_pack4(0x3995,0x497C,0xEA95,0x6AE5),
+ bn_pack4(0xDE2B,0xCBF6,0x9558,0x1718),
+ bn_pack4(0xB5C5,0x5DF0,0x6F4C,0x52C9),
+ bn_pack4(0x9B27,0x83A2,0xEC07,0xA28F),
+ bn_pack4(0xE39E,0x772C,0x180E,0x8603),
+ bn_pack4(0x3290,0x5E46,0x2E36,0xCE3B),
+ bn_pack4(0xF174,0x6C08,0xCA18,0x217C),
+ bn_pack4(0x670C,0x354E,0x4ABC,0x9804),
+ bn_pack4(0x9ED5,0x2907,0x7096,0x966D),
+ bn_pack4(0x1C62,0xF356,0x2085,0x52BB),
+ bn_pack4(0x8365,0x5D23,0xDCA3,0xAD96),
+ bn_pack4(0x6916,0x3FA8,0xFD24,0xCF5F),
+ bn_pack4(0x98DA,0x4836,0x1C55,0xD39A),
+ bn_pack4(0xC200,0x7CB8,0xA163,0xBF05),
+ bn_pack4(0x4928,0x6651,0xECE4,0x5B3D),
+ bn_pack4(0xAE9F,0x2411,0x7C4B,0x1FE6),
+ bn_pack4(0xEE38,0x6BFB,0x5A89,0x9FA5),
+ bn_pack4(0x0BFF,0x5CB6,0xF406,0xB7ED),
+ bn_pack4(0xF44C,0x42E9,0xA637,0xED6B),
+ bn_pack4(0xE485,0xB576,0x625E,0x7EC6),
+ bn_pack4(0x4FE1,0x356D,0x6D51,0xC245),
+ bn_pack4(0x302B,0x0A6D,0xF25F,0x1437),
+ bn_pack4(0xEF95,0x19B3,0xCD3A,0x431B),
+ bn_pack4(0x514A,0x0879,0x8E34,0x04DD),
+ bn_pack4(0x020B,0xBEA6,0x3B13,0x9B22),
+ bn_pack4(0x2902,0x4E08,0x8A67,0xCC74),
+ bn_pack4(0xC4C6,0x628B,0x80DC,0x1CD1),
+ bn_pack4(0xC90F,0xDAA2,0x2168,0xC234),
+ bn_pack4(0xFFFF,0xFFFF,0xFFFF,0xFFFF)
+};
+static BIGNUM bn_group_8192 = {
+ bn_group_8192_value,
+ (sizeof bn_group_8192_value)/sizeof(BN_ULONG),
+ (sizeof bn_group_8192_value)/sizeof(BN_ULONG),
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static BN_ULONG bn_generator_19_value[] = {19} ;
+static BIGNUM bn_generator_19 = {
+ bn_generator_19_value,
+ 1,
+ 1,
+ 0,
+ BN_FLG_STATIC_DATA
+};
+static BN_ULONG bn_generator_5_value[] = {5} ;
+static BIGNUM bn_generator_5 = {
+ bn_generator_5_value,
+ 1,
+ 1,
+ 0,
+ BN_FLG_STATIC_DATA
+};
+static BN_ULONG bn_generator_2_value[] = {2} ;
+static BIGNUM bn_generator_2 = {
+ bn_generator_2_value,
+ 1,
+ 1,
+ 0,
+ BN_FLG_STATIC_DATA
+};
+
+static SRP_gN knowngN[] = {
+ {"8192",&bn_generator_19 , &bn_group_8192},
+ {"6144",&bn_generator_5 , &bn_group_6144},
+ {"4096",&bn_generator_5 , &bn_group_4096},
+ {"3072",&bn_generator_5 , &bn_group_3072},
+ {"2048",&bn_generator_2 , &bn_group_2048},
+ {"1536",&bn_generator_2 , &bn_group_1536},
+ {"1024",&bn_generator_2 , &bn_group_1024},
+};
+#define KNOWN_GN_NUMBER sizeof(knowngN) / sizeof(SRP_gN)
+
+/* end of generated data */
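
Each static initializer above fills struct bignum_st in declaration order, which the pre-1.1.0 public BIGNUM layout permits. A one-word illustration of the pattern (the demo names are mine, not generated data):

static BN_ULONG demo_value[] = { 5 };
static BIGNUM demo = {
	demo_value,				/* d:    words, least significant first */
	sizeof(demo_value)/sizeof(BN_ULONG),	/* top:  words in use */
	sizeof(demo_value)/sizeof(BN_ULONG),	/* dmax: words available in d */
	0,					/* neg:  0 = non-negative */
	BN_FLG_STATIC_DATA			/* tells BN_free not to touch the storage */
};

BN_FLG_STATIC_DATA is what allows srp_lib.c to hand these groups out directly (for instance from SRP_get_default_gN) without any copying or reference counting.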
diff --git a/main/openssl/crypto/srp/srp_lcl.h b/main/openssl/crypto/srp/srp_lcl.h
new file mode 100644
index 00000000..42bda3f1
--- /dev/null
+++ b/main/openssl/crypto/srp/srp_lcl.h
@@ -0,0 +1,83 @@
+/* crypto/srp/srp_lcl.h */
+/* Written by Peter Sylvester (peter.sylvester@edelweb.fr)
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+#ifndef HEADER_SRP_LCL_H
+#define HEADER_SRP_LCL_H
+
+#include <openssl/srp.h>
+#include <openssl/sha.h>
+
+#if 0
+#define srp_bn_print(a) {fprintf(stderr, #a "="); BN_print_fp(stderr,a); \
+ fprintf(stderr,"\n");}
+#else
+#define srp_bn_print(a)
+#endif
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/main/openssl/crypto/srp/srp_lib.c b/main/openssl/crypto/srp/srp_lib.c
new file mode 100644
index 00000000..7c1dcc51
--- /dev/null
+++ b/main/openssl/crypto/srp/srp_lib.c
@@ -0,0 +1,361 @@
+/* crypto/srp/srp_lib.c */
+/* Written by Christophe Renou (christophe.renou@edelweb.fr) with
+ * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr)
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+#ifndef OPENSSL_NO_SRP
+#include "cryptlib.h"
+#include "srp_lcl.h"
+#include <openssl/srp.h>
+#include <openssl/evp.h>
+
+#if (BN_BYTES == 8)
+# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+# define bn_pack4(a1,a2,a3,a4) ((a1##UI64<<48)|(a2##UI64<<32)|(a3##UI64<<16)|a4##UI64)
+# elif defined(__arch64__)
+# define bn_pack4(a1,a2,a3,a4) ((a1##UL<<48)|(a2##UL<<32)|(a3##UL<<16)|a4##UL)
+# else
+# define bn_pack4(a1,a2,a3,a4) ((a1##ULL<<48)|(a2##ULL<<32)|(a3##ULL<<16)|a4##ULL)
+# endif
+#elif (BN_BYTES == 4)
+# define bn_pack4(a1,a2,a3,a4) ((a3##UL<<16)|a4##UL), ((a1##UL<<16)|a2##UL)
+#else
+# error "unsupported BN_BYTES"
+#endif
+
+
+#include "srp_grps.h"
+
+static BIGNUM *srp_Calc_k(BIGNUM *N, BIGNUM *g)
+ {
+ /* k = SHA1(N | PAD(g)) -- tls-srp draft 8 */
+
+ unsigned char digest[SHA_DIGEST_LENGTH];
+ unsigned char *tmp;
+ EVP_MD_CTX ctxt;
+ int longg ;
+ int longN = BN_num_bytes(N);
+
+ if ((tmp = OPENSSL_malloc(longN)) == NULL)
+ return NULL;
+ BN_bn2bin(N,tmp) ;
+
+ EVP_MD_CTX_init(&ctxt);
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ EVP_DigestUpdate(&ctxt, tmp, longN);
+
+ memset(tmp, 0, longN);
+ longg = BN_bn2bin(g,tmp) ;
+	/* use the trailing zeros as left padding */
+ EVP_DigestUpdate(&ctxt, tmp + longg, longN-longg);
+ EVP_DigestUpdate(&ctxt, tmp, longg);
+ OPENSSL_free(tmp);
+
+ EVP_DigestFinal_ex(&ctxt, digest, NULL);
+ EVP_MD_CTX_cleanup(&ctxt);
+ return BN_bin2bn(digest, sizeof(digest), NULL);
+ }
+
+BIGNUM *SRP_Calc_u(BIGNUM *A, BIGNUM *B, BIGNUM *N)
+ {
+	/* u = SHA1(PAD(A) || PAD(B)) -- tls-srp draft 8 */
+
+ BIGNUM *u;
+ unsigned char cu[SHA_DIGEST_LENGTH];
+ unsigned char *cAB;
+ EVP_MD_CTX ctxt;
+ int longN;
+ if ((A == NULL) ||(B == NULL) || (N == NULL))
+ return NULL;
+
+ longN= BN_num_bytes(N);
+
+ if ((cAB = OPENSSL_malloc(2*longN)) == NULL)
+ return NULL;
+
+ memset(cAB, 0, longN);
+
+ EVP_MD_CTX_init(&ctxt);
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ EVP_DigestUpdate(&ctxt, cAB + BN_bn2bin(A,cAB+longN), longN);
+ EVP_DigestUpdate(&ctxt, cAB + BN_bn2bin(B,cAB+longN), longN);
+ OPENSSL_free(cAB);
+ EVP_DigestFinal_ex(&ctxt, cu, NULL);
+ EVP_MD_CTX_cleanup(&ctxt);
+
+ if (!(u = BN_bin2bn(cu, sizeof(cu), NULL)))
+ return NULL;
+ if (!BN_is_zero(u))
+ return u;
+ BN_free(u);
+ return NULL;
+}
+
+BIGNUM *SRP_Calc_server_key(BIGNUM *A, BIGNUM *v, BIGNUM *u, BIGNUM *b, BIGNUM *N)
+ {
+ BIGNUM *tmp = NULL, *S = NULL;
+ BN_CTX *bn_ctx;
+
+ if (u == NULL || A == NULL || v == NULL || b == NULL || N == NULL)
+ return NULL;
+
+ if ((bn_ctx = BN_CTX_new()) == NULL ||
+ (tmp = BN_new()) == NULL ||
+ (S = BN_new()) == NULL )
+ goto err;
+
+ /* S = (A*v**u) ** b */
+
+ if (!BN_mod_exp(tmp,v,u,N,bn_ctx))
+ goto err;
+ if (!BN_mod_mul(tmp,A,tmp,N,bn_ctx))
+ goto err;
+ if (!BN_mod_exp(S,tmp,b,N,bn_ctx))
+ goto err;
+err:
+ BN_CTX_free(bn_ctx);
+ BN_clear_free(tmp);
+ return S;
+ }
+
+BIGNUM *SRP_Calc_B(BIGNUM *b, BIGNUM *N, BIGNUM *g, BIGNUM *v)
+ {
+ BIGNUM *kv = NULL, *gb = NULL;
+ BIGNUM *B = NULL, *k = NULL;
+ BN_CTX *bn_ctx;
+
+ if (b == NULL || N == NULL || g == NULL || v == NULL ||
+ (bn_ctx = BN_CTX_new()) == NULL)
+ return NULL;
+
+ if ( (kv = BN_new()) == NULL ||
+ (gb = BN_new()) == NULL ||
+ (B = BN_new())== NULL)
+ goto err;
+
+ /* B = g**b + k*v */
+
+ if (!BN_mod_exp(gb,g,b,N,bn_ctx) ||
+ !(k = srp_Calc_k(N,g)) ||
+ !BN_mod_mul(kv,v,k,N,bn_ctx) ||
+ !BN_mod_add(B,gb,kv,N,bn_ctx))
+ {
+ BN_free(B);
+ B = NULL;
+ }
+err:
+ BN_CTX_free(bn_ctx);
+ BN_clear_free(kv);
+ BN_clear_free(gb);
+ BN_free(k);
+ return B;
+ }
+
+BIGNUM *SRP_Calc_x(BIGNUM *s, const char *user, const char *pass)
+ {
+ unsigned char dig[SHA_DIGEST_LENGTH];
+ EVP_MD_CTX ctxt;
+ unsigned char *cs;
+
+ if ((s == NULL) ||
+ (user == NULL) ||
+ (pass == NULL))
+ return NULL;
+
+ if ((cs = OPENSSL_malloc(BN_num_bytes(s))) == NULL)
+ return NULL;
+
+ EVP_MD_CTX_init(&ctxt);
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ EVP_DigestUpdate(&ctxt, user, strlen(user));
+ EVP_DigestUpdate(&ctxt, ":", 1);
+ EVP_DigestUpdate(&ctxt, pass, strlen(pass));
+ EVP_DigestFinal_ex(&ctxt, dig, NULL);
+
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ BN_bn2bin(s,cs);
+ EVP_DigestUpdate(&ctxt, cs, BN_num_bytes(s));
+ OPENSSL_free(cs);
+ EVP_DigestUpdate(&ctxt, dig, sizeof(dig));
+ EVP_DigestFinal_ex(&ctxt, dig, NULL);
+ EVP_MD_CTX_cleanup(&ctxt);
+
+ return BN_bin2bn(dig, sizeof(dig), NULL);
+ }
+
+BIGNUM *SRP_Calc_A(BIGNUM *a, BIGNUM *N, BIGNUM *g)
+ {
+ BN_CTX *bn_ctx;
+ BIGNUM * A = NULL;
+
+ if (a == NULL || N == NULL || g == NULL ||
+ (bn_ctx = BN_CTX_new()) == NULL)
+ return NULL;
+
+ if ((A = BN_new()) != NULL &&
+ !BN_mod_exp(A,g,a,N,bn_ctx))
+ {
+ BN_free(A);
+ A = NULL;
+ }
+ BN_CTX_free(bn_ctx);
+ return A;
+ }
+
+
+BIGNUM *SRP_Calc_client_key(BIGNUM *N, BIGNUM *B, BIGNUM *g, BIGNUM *x, BIGNUM *a, BIGNUM *u)
+ {
+ BIGNUM *tmp = NULL, *tmp2 = NULL, *tmp3 = NULL , *k = NULL, *K = NULL;
+ BN_CTX *bn_ctx;
+
+ if (u == NULL || B == NULL || N == NULL || g == NULL || x == NULL || a == NULL ||
+ (bn_ctx = BN_CTX_new()) == NULL)
+ return NULL;
+
+ if ((tmp = BN_new()) == NULL ||
+ (tmp2 = BN_new())== NULL ||
+ (tmp3 = BN_new())== NULL ||
+ (K = BN_new()) == NULL)
+ goto err;
+
+ if (!BN_mod_exp(tmp,g,x,N,bn_ctx))
+ goto err;
+ if (!(k = srp_Calc_k(N,g)))
+ goto err;
+ if (!BN_mod_mul(tmp2,tmp,k,N,bn_ctx))
+ goto err;
+ if (!BN_mod_sub(tmp,B,tmp2,N,bn_ctx))
+ goto err;
+
+ if (!BN_mod_mul(tmp3,u,x,N,bn_ctx))
+ goto err;
+ if (!BN_mod_add(tmp2,a,tmp3,N,bn_ctx))
+ goto err;
+ if (!BN_mod_exp(K,tmp,tmp2,N,bn_ctx))
+ goto err;
+
+err :
+ BN_CTX_free(bn_ctx);
+ BN_clear_free(tmp);
+ BN_clear_free(tmp2);
+ BN_clear_free(tmp3);
+ BN_free(k);
+ return K;
+ }
+
+int SRP_Verify_B_mod_N(BIGNUM *B, BIGNUM *N)
+ {
+ BIGNUM *r;
+ BN_CTX *bn_ctx;
+ int ret = 0;
+
+ if (B == NULL || N == NULL ||
+ (bn_ctx = BN_CTX_new()) == NULL)
+ return 0;
+
+ if ((r = BN_new()) == NULL)
+ goto err;
+	/* reject B when B % N == 0 */
+ if (!BN_nnmod(r,B,N,bn_ctx))
+ goto err;
+ ret = !BN_is_zero(r);
+err:
+ BN_CTX_free(bn_ctx);
+ BN_free(r);
+ return ret;
+ }
+
+int SRP_Verify_A_mod_N(BIGNUM *A, BIGNUM *N)
+ {
+	/* likewise rejects A when A % N == 0 */
+ return SRP_Verify_B_mod_N(A,N) ;
+ }
+
+
+/* Check whether g and N are known parameters.
+   The values have been generated from the ietf-tls-srp draft version 8.
+*/
+char *SRP_check_known_gN_param(BIGNUM *g, BIGNUM *N)
+ {
+ size_t i;
+ if ((g == NULL) || (N == NULL))
+ return 0;
+
+ srp_bn_print(g);
+ srp_bn_print(N);
+
+ for(i = 0; i < KNOWN_GN_NUMBER; i++)
+ {
+ if (BN_cmp(knowngN[i].g, g) == 0 && BN_cmp(knowngN[i].N, N) == 0)
+ return knowngN[i].id;
+ }
+ return NULL;
+ }
+
+SRP_gN *SRP_get_default_gN(const char *id)
+ {
+ size_t i;
+
+ if (id == NULL)
+ return knowngN;
+ for(i = 0; i < KNOWN_GN_NUMBER; i++)
+ {
+ if (strcmp(knowngN[i].id, id)==0)
+ return knowngN + i;
+ }
+ return NULL;
+ }
+#endif
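
Taken together, the routines above cover both halves of the SRP-6a exchange. A self-contained round trip against this API; error handling and BN_free cleanup are elided, and the identity/password are placeholders:

#include <openssl/bn.h>
#include <openssl/srp.h>

int srp_roundtrip(void)
	{
	SRP_gN *gN = SRP_get_default_gN("1024");
	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *s = BN_new(), *a = BN_new(), *b = BN_new(), *v = BN_new();
	BIGNUM *x, *A, *B, *u, *Kclient, *Kserver;

	BN_rand(s, 64, 0, 0);			/* salt */
	BN_rand(a, 256, 0, 0);			/* client ephemeral secret */
	BN_rand(b, 256, 0, 0);			/* server ephemeral secret */

	x = SRP_Calc_x(s, "alice", "password");	/* x = H(s | H(user ":" pass)) */
	BN_mod_exp(v, gN->g, x, gN->N, ctx);	/* enrolment: verifier v = g^x mod N */

	A = SRP_Calc_A(a, gN->N, gN->g);	/* client -> server */
	B = SRP_Calc_B(b, gN->N, gN->g, v);	/* server -> client */
	u = SRP_Calc_u(A, B, gN->N);		/* both ends derive u from A and B */

	Kclient = SRP_Calc_client_key(gN->N, B, gN->g, x, a, u);
	Kserver = SRP_Calc_server_key(A, v, u, b, gN->N);
	return BN_cmp(Kclient, Kserver) == 0;	/* 1 if both sides agree */
	}

In a real handshake each side would also call SRP_Verify_A_mod_N or SRP_Verify_B_mod_N on the value it received, precisely to reject the A % N == 0 degenerate case the comments above mention.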
diff --git a/main/openssl/crypto/srp/srp_vfy.c b/main/openssl/crypto/srp/srp_vfy.c
new file mode 100644
index 00000000..4a3d13ed
--- /dev/null
+++ b/main/openssl/crypto/srp/srp_vfy.c
@@ -0,0 +1,658 @@
+/* crypto/srp/srp_vfy.c */
+/* Written by Christophe Renou (christophe.renou@edelweb.fr) with
+ * the precious help of Peter Sylvester (peter.sylvester@edelweb.fr)
+ * for the EdelKey project and contributed to the OpenSSL project 2004.
+ */
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+#ifndef OPENSSL_NO_SRP
+#include "cryptlib.h"
+#include "srp_lcl.h"
+#include <openssl/srp.h>
+#include <openssl/evp.h>
+#include <openssl/buffer.h>
+#include <openssl/rand.h>
+#include <openssl/txt_db.h>
+
+#define SRP_RANDOM_SALT_LEN 20
+#define MAX_LEN 2500
+
+static char b64table[] =
+ "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz./";
+
+/* the following two conversion routines have been inspired by code from Stanford */
+
+/*
+ * Convert a base64 string into raw byte array representation.
+ */
+static int t_fromb64(unsigned char *a, const char *src)
+ {
+ char *loc;
+ int i, j;
+ int size;
+
+ while(*src && (*src == ' ' || *src == '\t' || *src == '\n'))
+ ++src;
+ size = strlen(src);
+ i = 0;
+ while(i < size)
+ {
+ loc = strchr(b64table, src[i]);
+ if(loc == (char *) 0) break;
+ else a[i] = loc - b64table;
+ ++i;
+ }
+ size = i;
+ i = size - 1;
+ j = size;
+ while(1)
+ {
+ a[j] = a[i];
+ if(--i < 0) break;
+ a[j] |= (a[i] & 3) << 6;
+ --j;
+ a[j] = (unsigned char) ((a[i] & 0x3c) >> 2);
+ if(--i < 0) break;
+ a[j] |= (a[i] & 0xf) << 4;
+ --j;
+ a[j] = (unsigned char) ((a[i] & 0x30) >> 4);
+ if(--i < 0) break;
+ a[j] |= (a[i] << 2);
+
+ a[--j] = 0;
+ if(--i < 0) break;
+ }
+ while(a[j] == 0 && j <= size) ++j;
+ i = 0;
+ while (j <= size) a[i++] = a[j++];
+ return i;
+ }
+
+
+/*
+ * Convert a raw byte string into a null-terminated base64 ASCII string.
+ */
+static char *t_tob64(char *dst, const unsigned char *src, int size)
+ {
+ int c, pos = size % 3;
+ unsigned char b0 = 0, b1 = 0, b2 = 0, notleading = 0;
+ char *olddst = dst;
+
+ switch(pos)
+ {
+ case 1:
+ b2 = src[0];
+ break;
+ case 2:
+ b1 = src[0];
+ b2 = src[1];
+ break;
+ }
+
+ while(1)
+ {
+ c = (b0 & 0xfc) >> 2;
+ if(notleading || c != 0)
+ {
+ *dst++ = b64table[c];
+ notleading = 1;
+ }
+ c = ((b0 & 3) << 4) | ((b1 & 0xf0) >> 4);
+ if(notleading || c != 0)
+ {
+ *dst++ = b64table[c];
+ notleading = 1;
+ }
+ c = ((b1 & 0xf) << 2) | ((b2 & 0xc0) >> 6);
+ if(notleading || c != 0)
+ {
+ *dst++ = b64table[c];
+ notleading = 1;
+ }
+ c = b2 & 0x3f;
+ if(notleading || c != 0)
+ {
+ *dst++ = b64table[c];
+ notleading = 1;
+ }
+ if(pos >= size) break;
+ else
+ {
+ b0 = src[pos++];
+ b1 = src[pos++];
+ b2 = src[pos++];
+ }
+ }
+
+ *dst++ = '\0';
+ return olddst;
+ }
+
+static void SRP_user_pwd_free(SRP_user_pwd *user_pwd)
+ {
+ if (user_pwd == NULL)
+ return;
+ BN_free(user_pwd->s);
+ BN_clear_free(user_pwd->v);
+ OPENSSL_free(user_pwd->id);
+ OPENSSL_free(user_pwd->info);
+ OPENSSL_free(user_pwd);
+ }
+
+static SRP_user_pwd *SRP_user_pwd_new()
+ {
+ SRP_user_pwd *ret = OPENSSL_malloc(sizeof(SRP_user_pwd));
+ if (ret == NULL)
+ return NULL;
+ ret->N = NULL;
+ ret->g = NULL;
+ ret->s = NULL;
+ ret->v = NULL;
+ ret->id = NULL ;
+ ret->info = NULL;
+ return ret;
+ }
+
+static void SRP_user_pwd_set_gN(SRP_user_pwd *vinfo, const BIGNUM *g,
+ const BIGNUM *N)
+ {
+ vinfo->N = N;
+ vinfo->g = g;
+ }
+
+static int SRP_user_pwd_set_ids(SRP_user_pwd *vinfo, const char *id,
+ const char *info)
+ {
+ if (id != NULL && NULL == (vinfo->id = BUF_strdup(id)))
+ return 0;
+ return (info == NULL || NULL != (vinfo->info = BUF_strdup(info))) ;
+ }
+
+static int SRP_user_pwd_set_sv(SRP_user_pwd *vinfo, const char *s,
+ const char *v)
+ {
+ unsigned char tmp[MAX_LEN];
+ int len;
+
+ if (strlen(s) > MAX_LEN || strlen(v) > MAX_LEN)
+ return 0;
+ len = t_fromb64(tmp, v);
+ if (NULL == (vinfo->v = BN_bin2bn(tmp, len, NULL)) )
+ return 0;
+ len = t_fromb64(tmp, s);
+ return ((vinfo->s = BN_bin2bn(tmp, len, NULL)) != NULL) ;
+ }
+
+static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v)
+ {
+ vinfo->v = v;
+ vinfo->s = s;
+ return (vinfo->s != NULL && vinfo->v != NULL) ;
+ }
+
+SRP_VBASE *SRP_VBASE_new(char *seed_key)
+ {
+ SRP_VBASE *vb = (SRP_VBASE *) OPENSSL_malloc(sizeof(SRP_VBASE));
+
+ if (vb == NULL)
+ return NULL;
+ if (!(vb->users_pwd = sk_SRP_user_pwd_new_null()) ||
+ !(vb->gN_cache = sk_SRP_gN_cache_new_null()))
+ {
+ OPENSSL_free(vb);
+ return NULL;
+ }
+ vb->default_g = NULL;
+ vb->default_N = NULL;
+ vb->seed_key = NULL;
+ if ((seed_key != NULL) &&
+ (vb->seed_key = BUF_strdup(seed_key)) == NULL)
+ {
+ sk_SRP_user_pwd_free(vb->users_pwd);
+ sk_SRP_gN_cache_free(vb->gN_cache);
+ OPENSSL_free(vb);
+ return NULL;
+ }
+ return vb;
+ }
+
+
+int SRP_VBASE_free(SRP_VBASE *vb)
+ {
+ sk_SRP_user_pwd_pop_free(vb->users_pwd,SRP_user_pwd_free);
+ sk_SRP_gN_cache_free(vb->gN_cache);
+ OPENSSL_free(vb->seed_key);
+ OPENSSL_free(vb);
+ return 0;
+ }
+
+
+static SRP_gN_cache *SRP_gN_new_init(const char *ch)
+ {
+ unsigned char tmp[MAX_LEN];
+ int len;
+
+ SRP_gN_cache *newgN = (SRP_gN_cache *)OPENSSL_malloc(sizeof(SRP_gN_cache));
+ if (newgN == NULL)
+ return NULL;
+
+ if ((newgN->b64_bn = BUF_strdup(ch)) == NULL)
+ goto err;
+
+ len = t_fromb64(tmp, ch);
+ if ((newgN->bn = BN_bin2bn(tmp, len, NULL)))
+ return newgN;
+
+ OPENSSL_free(newgN->b64_bn);
+err:
+ OPENSSL_free(newgN);
+ return NULL;
+ }
+
+
+static void SRP_gN_free(SRP_gN_cache *gN_cache)
+ {
+ if (gN_cache == NULL)
+ return;
+ OPENSSL_free(gN_cache->b64_bn);
+ BN_free(gN_cache->bn);
+ OPENSSL_free(gN_cache);
+ }
+
+static SRP_gN *SRP_get_gN_by_id(const char *id, STACK_OF(SRP_gN) *gN_tab)
+ {
+ int i;
+
+ SRP_gN *gN;
+ if (gN_tab != NULL)
+ for(i = 0; i < sk_SRP_gN_num(gN_tab); i++)
+ {
+ gN = sk_SRP_gN_value(gN_tab, i);
+ if (gN && (id == NULL || strcmp(gN->id,id)==0))
+ return gN;
+ }
+
+ return SRP_get_default_gN(id);
+ }
+
+static BIGNUM *SRP_gN_place_bn(STACK_OF(SRP_gN_cache) *gN_cache, char *ch)
+ {
+ int i;
+ if (gN_cache == NULL)
+ return NULL;
+
+	/* check whether we already have one... */
+ for(i = 0; i < sk_SRP_gN_cache_num(gN_cache); i++)
+ {
+ SRP_gN_cache *cache = sk_SRP_gN_cache_value(gN_cache, i);
+ if (strcmp(cache->b64_bn,ch)==0)
+ return cache->bn;
+ }
+ { /* it is the first time that we find it */
+ SRP_gN_cache *newgN = SRP_gN_new_init(ch);
+ if (newgN)
+ {
+ if (sk_SRP_gN_cache_insert(gN_cache,newgN,0)>0)
+ return newgN->bn;
+ SRP_gN_free(newgN);
+ }
+ }
+ return NULL;
+ }
+
+/* This function parses the verifier file. The format is:
+ * string(index):base64(N):base64(g):0
+ * string(username):base64(v):base64(salt):int(index)
+ */
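+/* A hypothetical example (base64 fields abbreviated, not real values):
+ *   2048:AJbkVZ...:Ag==:0
+ *   alice:A5QCyL...:PsfCVQ==:2048
+ * The first line defines index "2048" with its N and g parameters; the
+ * second defines user "alice" with her verifier, salt, and that index.
+ */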
+
+
+int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file)
+ {
+ int error_code ;
+ STACK_OF(SRP_gN) *SRP_gN_tab = sk_SRP_gN_new_null();
+ char *last_index = NULL;
+ int i;
+ char **pp;
+
+ SRP_gN *gN = NULL;
+ SRP_user_pwd *user_pwd = NULL ;
+
+ TXT_DB *tmpdb = NULL;
+ BIO *in = BIO_new(BIO_s_file());
+
+ error_code = SRP_ERR_OPEN_FILE;
+
+ if (in == NULL || BIO_read_filename(in,verifier_file) <= 0)
+ goto err;
+
+ error_code = SRP_ERR_VBASE_INCOMPLETE_FILE;
+
+ if ((tmpdb =TXT_DB_read(in,DB_NUMBER)) == NULL)
+ goto err;
+
+ error_code = SRP_ERR_MEMORY;
+
+
+ if (vb->seed_key)
+ {
+ last_index = SRP_get_default_gN(NULL)->id;
+ }
+ for (i = 0; i < sk_OPENSSL_PSTRING_num(tmpdb->data); i++)
+ {
+ pp = sk_OPENSSL_PSTRING_value(tmpdb->data,i);
+ if (pp[DB_srptype][0] == DB_SRP_INDEX)
+ {
+			/* add this (g, N) pair to the internal stack */
+
+ if ((gN = (SRP_gN *)OPENSSL_malloc(sizeof(SRP_gN))) == NULL)
+ goto err;
+
+ if (!(gN->id = BUF_strdup(pp[DB_srpid]))
+ || !(gN->N = SRP_gN_place_bn(vb->gN_cache,pp[DB_srpverifier]))
+ || !(gN->g = SRP_gN_place_bn(vb->gN_cache,pp[DB_srpsalt]))
+ || sk_SRP_gN_insert(SRP_gN_tab,gN,0) == 0)
+ goto err;
+
+ gN = NULL;
+
+ if (vb->seed_key != NULL)
+ {
+ last_index = pp[DB_srpid];
+ }
+ }
+ else if (pp[DB_srptype][0] == DB_SRP_VALID)
+ {
+			/* this is a user entry */
+ SRP_gN *lgN;
+ if ((lgN = SRP_get_gN_by_id(pp[DB_srpgN],SRP_gN_tab))!=NULL)
+ {
+ error_code = SRP_ERR_MEMORY;
+ if ((user_pwd = SRP_user_pwd_new()) == NULL)
+ goto err;
+
+ SRP_user_pwd_set_gN(user_pwd,lgN->g,lgN->N);
+ if (!SRP_user_pwd_set_ids(user_pwd, pp[DB_srpid],pp[DB_srpinfo]))
+ goto err;
+
+ error_code = SRP_ERR_VBASE_BN_LIB;
+ if (!SRP_user_pwd_set_sv(user_pwd, pp[DB_srpsalt],pp[DB_srpverifier]))
+ goto err;
+
+ if (sk_SRP_user_pwd_insert(vb->users_pwd, user_pwd, 0) == 0)
+ goto err;
+				user_pwd = NULL; /* ownership passed to the stack */
+ }
+ }
+ }
+
+ if (last_index != NULL)
+ {
+		/* set up the default g and N used to simulate a default user */
+
+ if (((gN = SRP_get_gN_by_id(last_index,SRP_gN_tab))==NULL))
+ {
+ error_code = SRP_ERR_VBASE_BN_LIB;
+ goto err;
+ }
+ vb->default_g = gN->g ;
+ vb->default_N = gN->N ;
+ gN = NULL ;
+ }
+ error_code = SRP_NO_ERROR;
+
+ err:
+	/* there may still be some leaks to fix; if this fails, the application most likely terminates */
+
+ if (gN != NULL)
+ {
+ OPENSSL_free(gN->id);
+ OPENSSL_free(gN);
+ }
+
+ SRP_user_pwd_free(user_pwd);
+
+ if (tmpdb) TXT_DB_free(tmpdb);
+ if (in) BIO_free_all(in);
+
+ sk_SRP_gN_free(SRP_gN_tab);
+
+ return error_code;
+
+ }
+
+
+SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username)
+ {
+ int i;
+ SRP_user_pwd *user;
+ unsigned char digv[SHA_DIGEST_LENGTH];
+ unsigned char digs[SHA_DIGEST_LENGTH];
+ EVP_MD_CTX ctxt;
+
+ if (vb == NULL)
+ return NULL;
+ for(i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++)
+ {
+ user = sk_SRP_user_pwd_value(vb->users_pwd, i);
+ if (strcmp(user->id,username)==0)
+ return user;
+ }
+ if ((vb->seed_key == NULL) ||
+ (vb->default_g == NULL) ||
+ (vb->default_N == NULL))
+ return NULL;
+
+/* if the user is unknown and a seed_key is set, we still return parameters,
+ * so the caller cannot tell existing users from unknown ones */
+
+ if ((user = SRP_user_pwd_new()) == NULL)
+ return NULL;
+
+ SRP_user_pwd_set_gN(user,vb->default_g,vb->default_N);
+
+ if (!SRP_user_pwd_set_ids(user,username,NULL))
+ goto err;
+
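+	/* the fake verifier is random, while the fake salt is a hash of
+	 * seed_key and username, so repeated lookups of the same unknown
+	 * user return the same salt */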
+ RAND_pseudo_bytes(digv, SHA_DIGEST_LENGTH);
+ EVP_MD_CTX_init(&ctxt);
+ EVP_DigestInit_ex(&ctxt, EVP_sha1(), NULL);
+ EVP_DigestUpdate(&ctxt, vb->seed_key, strlen(vb->seed_key));
+ EVP_DigestUpdate(&ctxt, username, strlen(username));
+ EVP_DigestFinal_ex(&ctxt, digs, NULL);
+ EVP_MD_CTX_cleanup(&ctxt);
+ if (SRP_user_pwd_set_sv_BN(user, BN_bin2bn(digs,SHA_DIGEST_LENGTH,NULL), BN_bin2bn(digv,SHA_DIGEST_LENGTH, NULL)))
+ return user;
+
+err: SRP_user_pwd_free(user);
+ return NULL;
+ }
+
+
+/*
 create a verifier (*salt, *verifier, g and N are in base64)
+*/
+char *SRP_create_verifier(const char *user, const char *pass, char **salt,
+ char **verifier, const char *N, const char *g)
+ {
+ int len;
+ char * result=NULL;
+ char *vf;
+ BIGNUM *N_bn = NULL, *g_bn = NULL, *s = NULL, *v = NULL;
+ unsigned char tmp[MAX_LEN];
+ unsigned char tmp2[MAX_LEN];
+ char * defgNid = NULL;
+
+ if ((user == NULL)||
+ (pass == NULL)||
+ (salt == NULL)||
+ (verifier == NULL))
+ goto err;
+
+ if (N)
+ {
+ if (!(len = t_fromb64(tmp, N))) goto err;
+ N_bn = BN_bin2bn(tmp, len, NULL);
+ if (!(len = t_fromb64(tmp, g))) goto err;
+ g_bn = BN_bin2bn(tmp, len, NULL);
+ defgNid = "*";
+ }
+ else
+ {
+ SRP_gN * gN = SRP_get_gN_by_id(g, NULL) ;
+ if (gN == NULL)
+ goto err;
+ N_bn = gN->N;
+ g_bn = gN->g;
+ defgNid = gN->id;
+ }
+
+ if (*salt == NULL)
+ {
+ RAND_pseudo_bytes(tmp2, SRP_RANDOM_SALT_LEN);
+
+ s = BN_bin2bn(tmp2, SRP_RANDOM_SALT_LEN, NULL);
+ }
+ else
+ {
+ if (!(len = t_fromb64(tmp2, *salt)))
+ goto err;
+ s = BN_bin2bn(tmp2, len, NULL);
+ }
+
+
+ if(!SRP_create_verifier_BN(user, pass, &s, &v, N_bn, g_bn)) goto err;
+
+ BN_bn2bin(v,tmp);
+ if (((vf = OPENSSL_malloc(BN_num_bytes(v)*2)) == NULL))
+ goto err;
+ t_tob64(vf, tmp, BN_num_bytes(v));
+
+ *verifier = vf;
+ if (*salt == NULL)
+ {
+ char *tmp_salt;
+
+ if ((tmp_salt = OPENSSL_malloc(SRP_RANDOM_SALT_LEN * 2)) == NULL)
+ {
+ OPENSSL_free(vf);
+ goto err;
+ }
+ t_tob64(tmp_salt, tmp2, SRP_RANDOM_SALT_LEN);
+ *salt = tmp_salt;
+ }
+
+ result=defgNid;
+
+err:
+ if(N)
+ {
+ BN_free(N_bn);
+ BN_free(g_bn);
+ }
+ return result;
+ }
+
+/*
 create a verifier (*salt, *verifier, g and N are BIGNUMs)
+*/
+int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, BIGNUM **verifier, BIGNUM *N, BIGNUM *g)
+ {
+ int result=0;
+ BIGNUM *x = NULL;
+ BN_CTX *bn_ctx = BN_CTX_new();
+ unsigned char tmp2[MAX_LEN];
+
+ if ((user == NULL)||
+ (pass == NULL)||
+ (salt == NULL)||
+ (verifier == NULL)||
+ (N == NULL)||
+ (g == NULL)||
+ (bn_ctx == NULL))
+ goto err;
+
+ srp_bn_print(N);
+ srp_bn_print(g);
+
+ if (*salt == NULL)
+ {
+ RAND_pseudo_bytes(tmp2, SRP_RANDOM_SALT_LEN);
+
+ *salt = BN_bin2bn(tmp2,SRP_RANDOM_SALT_LEN,NULL);
+ }
+
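+	/* v = g^x (mod N), where x is derived from the salt, the username
+	 * and the password */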
+ x = SRP_Calc_x(*salt,user,pass);
+
+ *verifier = BN_new();
+ if(*verifier == NULL) goto err;
+
+ if (!BN_mod_exp(*verifier,g,x,N,bn_ctx))
+ {
+ BN_clear_free(*verifier);
+ goto err;
+ }
+
+ srp_bn_print(*verifier);
+
+ result=1;
+
+err:
+
+ BN_clear_free(x);
+ BN_CTX_free(bn_ctx);
+ return result;
+ }
+
+
+
+#endif
diff --git a/main/openssl/crypto/stack/safestack.h b/main/openssl/crypto/stack/safestack.h
index 3e76aa58..ea3aa0d8 100644
--- a/main/openssl/crypto/stack/safestack.h
+++ b/main/openssl/crypto/stack/safestack.h
@@ -1459,6 +1459,94 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
#define sk_POLICY_MAPPING_sort(st) SKM_sk_sort(POLICY_MAPPING, (st))
#define sk_POLICY_MAPPING_is_sorted(st) SKM_sk_is_sorted(POLICY_MAPPING, (st))
+#define sk_SRP_gN_new(cmp) SKM_sk_new(SRP_gN, (cmp))
+#define sk_SRP_gN_new_null() SKM_sk_new_null(SRP_gN)
+#define sk_SRP_gN_free(st) SKM_sk_free(SRP_gN, (st))
+#define sk_SRP_gN_num(st) SKM_sk_num(SRP_gN, (st))
+#define sk_SRP_gN_value(st, i) SKM_sk_value(SRP_gN, (st), (i))
+#define sk_SRP_gN_set(st, i, val) SKM_sk_set(SRP_gN, (st), (i), (val))
+#define sk_SRP_gN_zero(st) SKM_sk_zero(SRP_gN, (st))
+#define sk_SRP_gN_push(st, val) SKM_sk_push(SRP_gN, (st), (val))
+#define sk_SRP_gN_unshift(st, val) SKM_sk_unshift(SRP_gN, (st), (val))
+#define sk_SRP_gN_find(st, val) SKM_sk_find(SRP_gN, (st), (val))
+#define sk_SRP_gN_find_ex(st, val) SKM_sk_find_ex(SRP_gN, (st), (val))
+#define sk_SRP_gN_delete(st, i) SKM_sk_delete(SRP_gN, (st), (i))
+#define sk_SRP_gN_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN, (st), (ptr))
+#define sk_SRP_gN_insert(st, val, i) SKM_sk_insert(SRP_gN, (st), (val), (i))
+#define sk_SRP_gN_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN, (st), (cmp))
+#define sk_SRP_gN_dup(st) SKM_sk_dup(SRP_gN, st)
+#define sk_SRP_gN_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN, (st), (free_func))
+#define sk_SRP_gN_shift(st) SKM_sk_shift(SRP_gN, (st))
+#define sk_SRP_gN_pop(st) SKM_sk_pop(SRP_gN, (st))
+#define sk_SRP_gN_sort(st) SKM_sk_sort(SRP_gN, (st))
+#define sk_SRP_gN_is_sorted(st) SKM_sk_is_sorted(SRP_gN, (st))
+
+#define sk_SRP_gN_cache_new(cmp) SKM_sk_new(SRP_gN_cache, (cmp))
+#define sk_SRP_gN_cache_new_null() SKM_sk_new_null(SRP_gN_cache)
+#define sk_SRP_gN_cache_free(st) SKM_sk_free(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_num(st) SKM_sk_num(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_value(st, i) SKM_sk_value(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_set(st, i, val) SKM_sk_set(SRP_gN_cache, (st), (i), (val))
+#define sk_SRP_gN_cache_zero(st) SKM_sk_zero(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_push(st, val) SKM_sk_push(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_unshift(st, val) SKM_sk_unshift(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find(st, val) SKM_sk_find(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find_ex(st, val) SKM_sk_find_ex(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_delete(st, i) SKM_sk_delete(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN_cache, (st), (ptr))
+#define sk_SRP_gN_cache_insert(st, val, i) SKM_sk_insert(SRP_gN_cache, (st), (val), (i))
+#define sk_SRP_gN_cache_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN_cache, (st), (cmp))
+#define sk_SRP_gN_cache_dup(st) SKM_sk_dup(SRP_gN_cache, st)
+#define sk_SRP_gN_cache_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN_cache, (st), (free_func))
+#define sk_SRP_gN_cache_shift(st) SKM_sk_shift(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_pop(st) SKM_sk_pop(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_sort(st) SKM_sk_sort(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_is_sorted(st) SKM_sk_is_sorted(SRP_gN_cache, (st))
+
+#define sk_SRP_user_pwd_new(cmp) SKM_sk_new(SRP_user_pwd, (cmp))
+#define sk_SRP_user_pwd_new_null() SKM_sk_new_null(SRP_user_pwd)
+#define sk_SRP_user_pwd_free(st) SKM_sk_free(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_num(st) SKM_sk_num(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_value(st, i) SKM_sk_value(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_set(st, i, val) SKM_sk_set(SRP_user_pwd, (st), (i), (val))
+#define sk_SRP_user_pwd_zero(st) SKM_sk_zero(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_push(st, val) SKM_sk_push(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_unshift(st, val) SKM_sk_unshift(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find(st, val) SKM_sk_find(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find_ex(st, val) SKM_sk_find_ex(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_delete(st, i) SKM_sk_delete(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_user_pwd, (st), (ptr))
+#define sk_SRP_user_pwd_insert(st, val, i) SKM_sk_insert(SRP_user_pwd, (st), (val), (i))
+#define sk_SRP_user_pwd_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_user_pwd, (st), (cmp))
+#define sk_SRP_user_pwd_dup(st) SKM_sk_dup(SRP_user_pwd, st)
+#define sk_SRP_user_pwd_pop_free(st, free_func) SKM_sk_pop_free(SRP_user_pwd, (st), (free_func))
+#define sk_SRP_user_pwd_shift(st) SKM_sk_shift(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_pop(st) SKM_sk_pop(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_sort(st) SKM_sk_sort(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_is_sorted(st) SKM_sk_is_sorted(SRP_user_pwd, (st))
+
+#define sk_SRTP_PROTECTION_PROFILE_new(cmp) SKM_sk_new(SRTP_PROTECTION_PROFILE, (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_new_null() SKM_sk_new_null(SRTP_PROTECTION_PROFILE)
+#define sk_SRTP_PROTECTION_PROFILE_free(st) SKM_sk_free(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_num(st) SKM_sk_num(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_value(st, i) SKM_sk_value(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set(st, i, val) SKM_sk_set(SRTP_PROTECTION_PROFILE, (st), (i), (val))
+#define sk_SRTP_PROTECTION_PROFILE_zero(st) SKM_sk_zero(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_push(st, val) SKM_sk_push(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_unshift(st, val) SKM_sk_unshift(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find(st, val) SKM_sk_find(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find_ex(st, val) SKM_sk_find_ex(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_delete(st, i) SKM_sk_delete(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRTP_PROTECTION_PROFILE, (st), (ptr))
+#define sk_SRTP_PROTECTION_PROFILE_insert(st, val, i) SKM_sk_insert(SRTP_PROTECTION_PROFILE, (st), (val), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRTP_PROTECTION_PROFILE, (st), (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_dup(st) SKM_sk_dup(SRTP_PROTECTION_PROFILE, st)
+#define sk_SRTP_PROTECTION_PROFILE_pop_free(st, free_func) SKM_sk_pop_free(SRTP_PROTECTION_PROFILE, (st), (free_func))
+#define sk_SRTP_PROTECTION_PROFILE_shift(st) SKM_sk_shift(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_pop(st) SKM_sk_pop(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_sort(st) SKM_sk_sort(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_is_sorted(st) SKM_sk_is_sorted(SRTP_PROTECTION_PROFILE, (st))
+
#define sk_SSL_CIPHER_new(cmp) SKM_sk_new(SSL_CIPHER, (cmp))
#define sk_SSL_CIPHER_new_null() SKM_sk_new_null(SSL_CIPHER)
#define sk_SSL_CIPHER_free(st) SKM_sk_free(SSL_CIPHER, (st))
@@ -2056,31 +2144,6 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
#define sk_OPENSSL_STRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_STRING, (st))
-#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
-#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
-#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
-#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
-#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
-#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
-#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp) \
- ((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
- sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
-#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
-
-
#define sk_OPENSSL_BLOCK_new(cmp) ((STACK_OF(OPENSSL_BLOCK) *)sk_new(CHECKED_SK_CMP_FUNC(void, cmp)))
#define sk_OPENSSL_BLOCK_new_null() ((STACK_OF(OPENSSL_BLOCK) *)sk_new_null())
#define sk_OPENSSL_BLOCK_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val))
@@ -2106,6 +2169,31 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
#define sk_OPENSSL_BLOCK_is_sorted(st) SKM_sk_is_sorted(OPENSSL_BLOCK, (st))
+#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
+#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
+#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
+#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
+#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
+#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
+#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp) \
+ ((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
+ sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
+#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
+
+
#define d2i_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \
SKM_ASN1_SET_OF_d2i(ACCESS_DESCRIPTION, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class))
#define i2d_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, i2d_func, ex_tag, ex_class, is_set) \
diff --git a/main/openssl/crypto/symhacks.h b/main/openssl/crypto/symhacks.h
index 3fd4a816..07a412f8 100644
--- a/main/openssl/crypto/symhacks.h
+++ b/main/openssl/crypto/symhacks.h
@@ -176,7 +176,6 @@
#define SSL_CTX_set_default_passwd_cb_userdata SSL_CTX_set_def_passwd_cb_ud
#undef SSL_COMP_get_compression_methods
#define SSL_COMP_get_compression_methods SSL_COMP_get_compress_methods
-
#undef ssl_add_clienthello_renegotiate_ext
#define ssl_add_clienthello_renegotiate_ext ssl_add_clienthello_reneg_ext
#undef ssl_add_serverhello_renegotiate_ext
@@ -185,6 +184,26 @@
#define ssl_parse_clienthello_renegotiate_ext ssl_parse_clienthello_reneg_ext
#undef ssl_parse_serverhello_renegotiate_ext
#define ssl_parse_serverhello_renegotiate_ext ssl_parse_serverhello_reneg_ext
+#undef SSL_srp_server_param_with_username
+#define SSL_srp_server_param_with_username SSL_srp_server_param_with_un
+#undef SSL_CTX_set_srp_client_pwd_callback
+#define SSL_CTX_set_srp_client_pwd_callback SSL_CTX_set_srp_client_pwd_cb
+#undef SSL_CTX_set_srp_verify_param_callback
+#define SSL_CTX_set_srp_verify_param_callback SSL_CTX_set_srp_vfy_param_cb
+#undef SSL_CTX_set_srp_username_callback
+#define SSL_CTX_set_srp_username_callback SSL_CTX_set_srp_un_cb
+#undef ssl_add_clienthello_use_srtp_ext
+#define ssl_add_clienthello_use_srtp_ext ssl_add_clihello_use_srtp_ext
+#undef ssl_add_serverhello_use_srtp_ext
+#define ssl_add_serverhello_use_srtp_ext ssl_add_serhello_use_srtp_ext
+#undef ssl_parse_clienthello_use_srtp_ext
+#define ssl_parse_clienthello_use_srtp_ext ssl_parse_clihello_use_srtp_ext
+#undef ssl_parse_serverhello_use_srtp_ext
+#define ssl_parse_serverhello_use_srtp_ext ssl_parse_serhello_use_srtp_ext
+#undef SSL_CTX_set_next_protos_advertised_cb
+#define SSL_CTX_set_next_protos_advertised_cb SSL_CTX_set_next_protos_adv_cb
+#undef SSL_CTX_set_next_proto_select_cb
+#define SSL_CTX_set_next_proto_select_cb SSL_CTX_set_next_proto_sel_cb
/* Hack some long ENGINE names */
#undef ENGINE_get_default_BN_mod_exp_crt
@@ -238,6 +257,9 @@
#define EC_GROUP_get_point_conversion_form EC_GROUP_get_point_conv_form
#undef EC_GROUP_clear_free_all_extra_data
#define EC_GROUP_clear_free_all_extra_data EC_GROUP_clr_free_all_xtra_data
+#undef EC_KEY_set_public_key_affine_coordinates
+#define EC_KEY_set_public_key_affine_coordinates \
+ EC_KEY_set_pub_key_aff_coords
#undef EC_POINT_set_Jprojective_coordinates_GFp
#define EC_POINT_set_Jprojective_coordinates_GFp \
EC_POINT_set_Jproj_coords_GFp
@@ -294,8 +316,6 @@
#define ec_GFp_simple_point_set_to_infinity ec_GFp_simple_pt_set_to_inf
#undef ec_GFp_simple_points_make_affine
#define ec_GFp_simple_points_make_affine ec_GFp_simple_pts_make_affine
-#undef ec_GFp_simple_group_get_curve_GFp
-#define ec_GFp_simple_group_get_curve_GFp ec_GFp_simple_grp_get_curve_GFp
#undef ec_GFp_simple_set_Jprojective_coordinates_GFp
#define ec_GFp_simple_set_Jprojective_coordinates_GFp \
ec_GFp_smp_set_Jproj_coords_GFp
@@ -399,6 +419,12 @@
#undef dtls1_retransmit_buffered_messages
#define dtls1_retransmit_buffered_messages dtls1_retransmit_buffered_msgs
+/* Hack some long SRP names */
+#undef SRP_generate_server_master_secret
+#define SRP_generate_server_master_secret SRP_gen_server_master_secret
+#undef SRP_generate_client_master_secret
+#define SRP_generate_client_master_secret SRP_gen_client_master_secret
+
/* Hack some long UI names */
#undef UI_method_get_prompt_constructor
#define UI_method_get_prompt_constructor UI_method_get_prompt_constructr
diff --git a/main/openssl/crypto/ui/ui.h b/main/openssl/crypto/ui/ui.h
index 2b1cfa22..bd78aa41 100644
--- a/main/openssl/crypto/ui/ui.h
+++ b/main/openssl/crypto/ui/ui.h
@@ -316,7 +316,7 @@ int (*UI_method_get_writer(UI_METHOD *method))(UI*,UI_STRING*);
int (*UI_method_get_flusher(UI_METHOD *method))(UI*);
int (*UI_method_get_reader(UI_METHOD *method))(UI*,UI_STRING*);
int (*UI_method_get_closer(UI_METHOD *method))(UI*);
-char* (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
+char * (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
/* The following functions are helpers for method writers to access relevant
data from a UI_STRING. */
diff --git a/main/openssl/crypto/ui/ui_openssl.c b/main/openssl/crypto/ui/ui_openssl.c
index b05cbf34..e6ccd34a 100644
--- a/main/openssl/crypto/ui/ui_openssl.c
+++ b/main/openssl/crypto/ui/ui_openssl.c
@@ -122,9 +122,15 @@
* sigaction and fileno included. -pedantic would be more appropriate for
* the intended purposes, but we can't prevent users from adding -ansi.
*/
+#if defined(OPENSSL_SYSNAME_VXWORKS)
+#include <sys/types.h>
+#endif
+
+#if !defined(_POSIX_C_SOURCE) && defined(OPENSSL_SYS_VMS)
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 2
#endif
+#endif
#include <signal.h>
#include <stdio.h>
#include <string.h>
diff --git a/main/openssl/crypto/x509/x509.h b/main/openssl/crypto/x509/x509.h
index e6f8a403..092dd745 100644
--- a/main/openssl/crypto/x509/x509.h
+++ b/main/openssl/crypto/x509/x509.h
@@ -657,11 +657,15 @@ int NETSCAPE_SPKI_set_pubkey(NETSCAPE_SPKI *x, EVP_PKEY *pkey);
int NETSCAPE_SPKI_print(BIO *out, NETSCAPE_SPKI *spki);
+int X509_signature_dump(BIO *bp,const ASN1_STRING *sig, int indent);
int X509_signature_print(BIO *bp,X509_ALGOR *alg, ASN1_STRING *sig);
int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx);
int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx);
int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx);
int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md);
int X509_pubkey_digest(const X509 *data,const EVP_MD *type,
@@ -763,6 +767,7 @@ X509_ALGOR *X509_ALGOR_dup(X509_ALGOR *xn);
int X509_ALGOR_set0(X509_ALGOR *alg, ASN1_OBJECT *aobj, int ptype, void *pval);
void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
X509_ALGOR *algor);
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md);
X509_NAME *X509_NAME_dup(X509_NAME *xn);
X509_NAME_ENTRY *X509_NAME_ENTRY_dup(X509_NAME_ENTRY *ne);
@@ -896,6 +901,9 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *algor1,
int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
ASN1_BIT_STRING *signature,
void *data, EVP_PKEY *pkey, const EVP_MD *type);
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+ X509_ALGOR *algor1, X509_ALGOR *algor2,
+ ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx);
#endif
int X509_set_version(X509 *x,long version);
@@ -1161,6 +1169,9 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
unsigned char *salt, int saltlen,
unsigned char *aiv, int prf_nid);
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+ int prf_nid, int keylen);
+
/* PKCS#8 utilities */
DECLARE_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO)
diff --git a/main/openssl/crypto/x509/x509_cmp.c b/main/openssl/crypto/x509/x509_cmp.c
index 4bc9da07..352aa374 100644
--- a/main/openssl/crypto/x509/x509_cmp.c
+++ b/main/openssl/crypto/x509/x509_cmp.c
@@ -86,16 +86,20 @@ unsigned long X509_issuer_and_serial_hash(X509 *a)
EVP_MD_CTX_init(&ctx);
f=X509_NAME_oneline(a->cert_info->issuer,NULL,0);
- ret=strlen(f);
- EVP_DigestInit_ex(&ctx, EVP_md5(), NULL);
- EVP_DigestUpdate(&ctx,(unsigned char *)f,ret);
+ if (!EVP_DigestInit_ex(&ctx, EVP_md5(), NULL))
+ goto err;
+ if (!EVP_DigestUpdate(&ctx,(unsigned char *)f,strlen(f)))
+ goto err;
OPENSSL_free(f);
- EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data,
- (unsigned long)a->cert_info->serialNumber->length);
- EVP_DigestFinal_ex(&ctx,&(md[0]),NULL);
+ if(!EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data,
+ (unsigned long)a->cert_info->serialNumber->length))
+ goto err;
+ if (!EVP_DigestFinal_ex(&ctx,&(md[0]),NULL))
+ goto err;
ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)|
((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
)&0xffffffffL;
+ err:
EVP_MD_CTX_cleanup(&ctx);
return(ret);
}
@@ -219,7 +223,9 @@ unsigned long X509_NAME_hash(X509_NAME *x)
/* Make sure X509_NAME structure contains valid cached encoding */
i2d_X509_NAME(x,NULL);
- EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(), NULL);
+ if (!EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(),
+ NULL))
+ return 0;
ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)|
((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
@@ -234,16 +240,22 @@ unsigned long X509_NAME_hash(X509_NAME *x)
unsigned long X509_NAME_hash_old(X509_NAME *x)
{
+ EVP_MD_CTX md_ctx;
unsigned long ret=0;
unsigned char md[16];
/* Make sure X509_NAME structure contains valid cached encoding */
i2d_X509_NAME(x,NULL);
- EVP_Digest(x->bytes->data, x->bytes->length, md, NULL, EVP_md5(), NULL);
+ EVP_MD_CTX_init(&md_ctx);
+ EVP_MD_CTX_set_flags(&md_ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ if (EVP_DigestInit_ex(&md_ctx, EVP_md5(), NULL)
+ && EVP_DigestUpdate(&md_ctx, x->bytes->data, x->bytes->length)
+ && EVP_DigestFinal_ex(&md_ctx,md,NULL))
+ ret=(((unsigned long)md[0] )|((unsigned long)md[1]<<8L)|
+ ((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
+ )&0xffffffffL;
+ EVP_MD_CTX_cleanup(&md_ctx);
- ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)|
- ((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
- )&0xffffffffL;
return(ret);
}
#endif
diff --git a/main/openssl/crypto/x509/x509_lu.c b/main/openssl/crypto/x509/x509_lu.c
index 3a6e04a1..38525a8c 100644
--- a/main/openssl/crypto/x509/x509_lu.c
+++ b/main/openssl/crypto/x509/x509_lu.c
@@ -87,7 +87,7 @@ void X509_LOOKUP_free(X509_LOOKUP *ctx)
if (ctx == NULL) return;
if ( (ctx->method != NULL) &&
(ctx->method->free != NULL))
- ctx->method->free(ctx);
+ (*ctx->method->free)(ctx);
OPENSSL_free(ctx);
}
diff --git a/main/openssl/crypto/x509/x509_vfy.c b/main/openssl/crypto/x509/x509_vfy.c
index 5a0b0249..5195ffef 100644
--- a/main/openssl/crypto/x509/x509_vfy.c
+++ b/main/openssl/crypto/x509/x509_vfy.c
@@ -153,7 +153,6 @@ static int x509_subject_cmp(X509 **a, X509 **b)
int X509_verify_cert(X509_STORE_CTX *ctx)
{
X509 *x,*xtmp,*chain_ss=NULL;
- X509_NAME *xn;
int bad_chain = 0;
X509_VERIFY_PARAM *param = ctx->param;
int depth,i,ok=0;
@@ -205,7 +204,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
*/
/* If we are self signed, we break */
- xn=X509_get_issuer_name(x);
if (ctx->check_issued(ctx, x,x)) break;
/* If we were passed a cert chain, use it first */
@@ -242,7 +240,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
i=sk_X509_num(ctx->chain);
x=sk_X509_value(ctx->chain,i-1);
- xn = X509_get_subject_name(x);
if (ctx->check_issued(ctx, x, x))
{
/* we have a self signed certificate */
@@ -291,7 +288,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
if (depth < num) break;
/* If we are self signed, we break */
- xn=X509_get_issuer_name(x);
if (ctx->check_issued(ctx,x,x)) break;
ok = ctx->get_issuer(&xtmp, ctx, x);
@@ -310,7 +306,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
}
/* we now have our chain, lets check it... */
- xn=X509_get_issuer_name(x);
/* Is last certificate looked up self signed? */
if (!ctx->check_issued(ctx,x,x))
@@ -699,6 +694,7 @@ static int check_cert(X509_STORE_CTX *ctx)
X509_CRL *crl = NULL, *dcrl = NULL;
X509 *x;
int ok, cnum;
+ unsigned int last_reasons;
cnum = ctx->error_depth;
x = sk_X509_value(ctx->chain, cnum);
ctx->current_cert = x;
@@ -707,6 +703,7 @@ static int check_cert(X509_STORE_CTX *ctx)
ctx->current_reasons = 0;
while (ctx->current_reasons != CRLDP_ALL_REASONS)
{
+ last_reasons = ctx->current_reasons;
/* Try to retrieve relevant CRL */
if (ctx->get_crl)
ok = ctx->get_crl(ctx, &crl, x);
@@ -750,6 +747,15 @@ static int check_cert(X509_STORE_CTX *ctx)
X509_CRL_free(dcrl);
crl = NULL;
dcrl = NULL;
+			/* If the reasons were not updated, another iteration
+			 * won't get us anywhere, so exit the loop.
+			 */
+ if (last_reasons == ctx->current_reasons)
+ {
+ ctx->error = X509_V_ERR_UNABLE_TO_GET_CRL;
+ ok = ctx->verify_cb(0, ctx);
+ goto err;
+ }
}
err:
X509_CRL_free(crl);
@@ -877,7 +883,7 @@ static int crl_extension_match(X509_CRL *a, X509_CRL *b, int nid)
{
ASN1_OCTET_STRING *exta, *extb;
int i;
- i = X509_CRL_get_ext_by_NID(a, nid, 0);
+ i = X509_CRL_get_ext_by_NID(a, nid, -1);
if (i >= 0)
{
/* Can't have multiple occurrences */
@@ -888,7 +894,7 @@ static int crl_extension_match(X509_CRL *a, X509_CRL *b, int nid)
else
exta = NULL;
- i = X509_CRL_get_ext_by_NID(b, nid, 0);
+ i = X509_CRL_get_ext_by_NID(b, nid, -1);
if (i >= 0)
{
@@ -1732,7 +1738,7 @@ int X509_cmp_time(const ASN1_TIME *ctm, time_t *cmp_time)
atm.length=sizeof(buff2);
atm.data=(unsigned char *)buff2;
- if (X509_time_adj(&atm,-offset*60, cmp_time) == NULL)
+ if (X509_time_adj(&atm, offset*60, cmp_time) == NULL)
return 0;
if (ctm->type == V_ASN1_UTCTIME)
diff --git a/main/openssl/crypto/x509/x509type.c b/main/openssl/crypto/x509/x509type.c
index 3385ad3f..9702ec53 100644
--- a/main/openssl/crypto/x509/x509type.c
+++ b/main/openssl/crypto/x509/x509type.c
@@ -100,20 +100,26 @@ int X509_certificate_type(X509 *x, EVP_PKEY *pkey)
break;
}
- i=X509_get_signature_type(x);
- switch (i)
+ i=OBJ_obj2nid(x->sig_alg->algorithm);
+ if (i && OBJ_find_sigid_algs(i, NULL, &i))
{
- case EVP_PKEY_RSA:
- ret|=EVP_PKS_RSA;
- break;
- case EVP_PKEY_DSA:
- ret|=EVP_PKS_DSA;
- break;
- case EVP_PKEY_EC:
- ret|=EVP_PKS_EC;
- break;
- default:
- break;
+
+ switch (i)
+ {
+ case NID_rsaEncryption:
+ case NID_rsa:
+ ret|=EVP_PKS_RSA;
+ break;
+ case NID_dsa:
+ case NID_dsa_2:
+ ret|=EVP_PKS_DSA;
+ break;
+ case NID_X9_62_id_ecPublicKey:
+ ret|=EVP_PKS_EC;
+ break;
+ default:
+ break;
+ }
}
if (EVP_PKEY_size(pk) <= 1024/8)/* /8 because it's 1024 bits we look
diff --git a/main/openssl/crypto/x509/x_all.c b/main/openssl/crypto/x509/x_all.c
index 8ec88c21..e06602d6 100644
--- a/main/openssl/crypto/x509/x_all.c
+++ b/main/openssl/crypto/x509/x_all.c
@@ -95,12 +95,26 @@ int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md)
x->sig_alg, x->signature, x->cert_info,pkey,md));
}
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx)
+ {
+ x->cert_info->enc.modified = 1;
+ return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CINF),
+ x->cert_info->signature,
+ x->sig_alg, x->signature, x->cert_info, ctx);
+ }
+
int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md)
{
return(ASN1_item_sign(ASN1_ITEM_rptr(X509_REQ_INFO),x->sig_alg, NULL,
x->signature, x->req_info,pkey,md));
}
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx)
+ {
+ return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_REQ_INFO),
+ x->sig_alg, NULL, x->signature, x->req_info, ctx);
+ }
+
int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md)
{
x->crl->enc.modified = 1;
@@ -108,6 +122,13 @@ int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md)
x->sig_alg, x->signature, x->crl,pkey,md));
}
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx)
+ {
+ x->crl->enc.modified = 1;
+ return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CRL_INFO),
+ x->crl->sig_alg, x->sig_alg, x->signature, x->crl, ctx);
+ }
+
int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md)
{
return(ASN1_item_sign(ASN1_ITEM_rptr(NETSCAPE_SPKAC), x->sig_algor,NULL,
diff --git a/main/openssl/crypto/x509v3/v3_addr.c b/main/openssl/crypto/x509v3/v3_addr.c
index 0d70e869..df46a498 100644
--- a/main/openssl/crypto/x509v3/v3_addr.c
+++ b/main/openssl/crypto/x509v3/v3_addr.c
@@ -142,12 +142,13 @@ unsigned int v3_addr_get_afi(const IPAddressFamily *f)
* Expand the bitstring form of an address into a raw byte array.
* At the moment this is coded for simplicity, not speed.
*/
-static void addr_expand(unsigned char *addr,
+static int addr_expand(unsigned char *addr,
const ASN1_BIT_STRING *bs,
const int length,
const unsigned char fill)
{
- OPENSSL_assert(bs->length >= 0 && bs->length <= length);
+ if (bs->length < 0 || bs->length > length)
+ return 0;
if (bs->length > 0) {
memcpy(addr, bs->data, bs->length);
if ((bs->flags & 7) != 0) {
@@ -159,6 +160,7 @@ static void addr_expand(unsigned char *addr,
}
}
memset(addr + bs->length, fill, length - bs->length);
+ return 1;
}
/*
@@ -181,15 +183,13 @@ static int i2r_address(BIO *out,
return 0;
switch (afi) {
case IANA_AFI_IPV4:
- if (bs->length > 4)
+ if (!addr_expand(addr, bs, 4, fill))
return 0;
- addr_expand(addr, bs, 4, fill);
BIO_printf(out, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]);
break;
case IANA_AFI_IPV6:
- if (bs->length > 16)
+ if (!addr_expand(addr, bs, 16, fill))
return 0;
- addr_expand(addr, bs, 16, fill);
for (n = 16; n > 1 && addr[n-1] == 0x00 && addr[n-2] == 0x00; n -= 2)
;
for (i = 0; i < n; i += 2)
@@ -315,6 +315,12 @@ static int i2r_IPAddrBlocks(const X509V3_EXT_METHOD *method,
/*
* Sort comparison function for a sequence of IPAddressOrRange
* elements.
+ *
+ * There's no sane answer we can give if addr_expand() fails, and an
+ * assertion failure on externally supplied data is seriously uncool,
+ * so we just arbitrarily declare that if given invalid inputs this
+ * function returns -1. If this messes up your preferred sort order
+ * for garbage input, tough noogies.
*/
static int IPAddressOrRange_cmp(const IPAddressOrRange *a,
const IPAddressOrRange *b,
@@ -326,22 +332,26 @@ static int IPAddressOrRange_cmp(const IPAddressOrRange *a,
switch (a->type) {
case IPAddressOrRange_addressPrefix:
- addr_expand(addr_a, a->u.addressPrefix, length, 0x00);
+ if (!addr_expand(addr_a, a->u.addressPrefix, length, 0x00))
+ return -1;
prefixlen_a = addr_prefixlen(a->u.addressPrefix);
break;
case IPAddressOrRange_addressRange:
- addr_expand(addr_a, a->u.addressRange->min, length, 0x00);
+ if (!addr_expand(addr_a, a->u.addressRange->min, length, 0x00))
+ return -1;
prefixlen_a = length * 8;
break;
}
switch (b->type) {
case IPAddressOrRange_addressPrefix:
- addr_expand(addr_b, b->u.addressPrefix, length, 0x00);
+ if (!addr_expand(addr_b, b->u.addressPrefix, length, 0x00))
+ return -1;
prefixlen_b = addr_prefixlen(b->u.addressPrefix);
break;
case IPAddressOrRange_addressRange:
- addr_expand(addr_b, b->u.addressRange->min, length, 0x00);
+ if (!addr_expand(addr_b, b->u.addressRange->min, length, 0x00))
+ return -1;
prefixlen_b = length * 8;
break;
}
@@ -383,6 +393,7 @@ static int range_should_be_prefix(const unsigned char *min,
unsigned char mask;
int i, j;
+ OPENSSL_assert(memcmp(min, max, length) <= 0);
for (i = 0; i < length && min[i] == max[i]; i++)
;
for (j = length - 1; j >= 0 && min[j] == 0x00 && max[j] == 0xFF; j--)
@@ -601,10 +612,10 @@ static IPAddressOrRanges *make_prefix_or_range(IPAddrBlocks *addr,
return NULL;
switch (afi) {
case IANA_AFI_IPV4:
- sk_IPAddressOrRange_set_cmp_func(aors, v4IPAddressOrRange_cmp);
+ (void) sk_IPAddressOrRange_set_cmp_func(aors, v4IPAddressOrRange_cmp);
break;
case IANA_AFI_IPV6:
- sk_IPAddressOrRange_set_cmp_func(aors, v6IPAddressOrRange_cmp);
+ (void) sk_IPAddressOrRange_set_cmp_func(aors, v6IPAddressOrRange_cmp);
break;
}
f->ipAddressChoice->type = IPAddressChoice_addressesOrRanges;
@@ -656,22 +667,22 @@ int v3_addr_add_range(IPAddrBlocks *addr,
/*
* Extract min and max values from an IPAddressOrRange.
*/
-static void extract_min_max(IPAddressOrRange *aor,
+static int extract_min_max(IPAddressOrRange *aor,
unsigned char *min,
unsigned char *max,
int length)
{
- OPENSSL_assert(aor != NULL && min != NULL && max != NULL);
+ if (aor == NULL || min == NULL || max == NULL)
+ return 0;
switch (aor->type) {
case IPAddressOrRange_addressPrefix:
- addr_expand(min, aor->u.addressPrefix, length, 0x00);
- addr_expand(max, aor->u.addressPrefix, length, 0xFF);
- return;
+ return (addr_expand(min, aor->u.addressPrefix, length, 0x00) &&
+ addr_expand(max, aor->u.addressPrefix, length, 0xFF));
case IPAddressOrRange_addressRange:
- addr_expand(min, aor->u.addressRange->min, length, 0x00);
- addr_expand(max, aor->u.addressRange->max, length, 0xFF);
- return;
+ return (addr_expand(min, aor->u.addressRange->min, length, 0x00) &&
+ addr_expand(max, aor->u.addressRange->max, length, 0xFF));
}
+ return 0;
}
/*
@@ -687,9 +698,10 @@ int v3_addr_get_range(IPAddressOrRange *aor,
if (aor == NULL || min == NULL || max == NULL ||
afi_length == 0 || length < afi_length ||
(aor->type != IPAddressOrRange_addressPrefix &&
- aor->type != IPAddressOrRange_addressRange))
+ aor->type != IPAddressOrRange_addressRange) ||
+ !extract_min_max(aor, min, max, afi_length))
return 0;
- extract_min_max(aor, min, max, afi_length);
+
return afi_length;
}
@@ -771,8 +783,9 @@ int v3_addr_is_canonical(IPAddrBlocks *addr)
IPAddressOrRange *a = sk_IPAddressOrRange_value(aors, j);
IPAddressOrRange *b = sk_IPAddressOrRange_value(aors, j + 1);
- extract_min_max(a, a_min, a_max, length);
- extract_min_max(b, b_min, b_max, length);
+ if (!extract_min_max(a, a_min, a_max, length) ||
+ !extract_min_max(b, b_min, b_max, length))
+ return 0;
/*
* Punt misordered list, overlapping start, or inverted range.
@@ -800,14 +813,17 @@ int v3_addr_is_canonical(IPAddrBlocks *addr)
}
/*
- * Check final range to see if it should be a prefix.
+ * Check range to see if it's inverted or should be a
+ * prefix.
*/
j = sk_IPAddressOrRange_num(aors) - 1;
{
IPAddressOrRange *a = sk_IPAddressOrRange_value(aors, j);
- if (a->type == IPAddressOrRange_addressRange) {
- extract_min_max(a, a_min, a_max, length);
- if (range_should_be_prefix(a_min, a_max, length) >= 0)
+ if (a != NULL && a->type == IPAddressOrRange_addressRange) {
+ if (!extract_min_max(a, a_min, a_max, length))
+ return 0;
+ if (memcmp(a_min, a_max, length) > 0 ||
+ range_should_be_prefix(a_min, a_max, length) >= 0)
return 0;
}
}
@@ -841,8 +857,16 @@ static int IPAddressOrRanges_canonize(IPAddressOrRanges *aors,
unsigned char a_min[ADDR_RAW_BUF_LEN], a_max[ADDR_RAW_BUF_LEN];
unsigned char b_min[ADDR_RAW_BUF_LEN], b_max[ADDR_RAW_BUF_LEN];
- extract_min_max(a, a_min, a_max, length);
- extract_min_max(b, b_min, b_max, length);
+ if (!extract_min_max(a, a_min, a_max, length) ||
+ !extract_min_max(b, b_min, b_max, length))
+ return 0;
+
+ /*
+ * Punt inverted ranges.
+ */
+ if (memcmp(a_min, a_max, length) > 0 ||
+ memcmp(b_min, b_max, length) > 0)
+ return 0;
/*
* Punt overlaps.
@@ -860,8 +884,8 @@ static int IPAddressOrRanges_canonize(IPAddressOrRanges *aors,
IPAddressOrRange *merged;
if (!make_addressRange(&merged, a_min, b_max, length))
return 0;
- sk_IPAddressOrRange_set(aors, i, merged);
- sk_IPAddressOrRange_delete(aors, i + 1);
+ (void) sk_IPAddressOrRange_set(aors, i, merged);
+ (void) sk_IPAddressOrRange_delete(aors, i + 1);
IPAddressOrRange_free(a);
IPAddressOrRange_free(b);
--i;
@@ -869,6 +893,20 @@ static int IPAddressOrRanges_canonize(IPAddressOrRanges *aors,
}
}
+ /*
+ * Check for inverted final range.
+ */
+ j = sk_IPAddressOrRange_num(aors) - 1;
+ {
+ IPAddressOrRange *a = sk_IPAddressOrRange_value(aors, j);
+ if (a != NULL && a->type == IPAddressOrRange_addressRange) {
+ unsigned char a_min[ADDR_RAW_BUF_LEN], a_max[ADDR_RAW_BUF_LEN];
+ extract_min_max(a, a_min, a_max, length);
+ if (memcmp(a_min, a_max, length) > 0)
+ return 0;
+ }
+ }
+
return 1;
}
@@ -885,7 +923,7 @@ int v3_addr_canonize(IPAddrBlocks *addr)
v3_addr_get_afi(f)))
return 0;
}
- sk_IPAddressFamily_set_cmp_func(addr, IPAddressFamily_cmp);
+ (void) sk_IPAddressFamily_set_cmp_func(addr, IPAddressFamily_cmp);
sk_IPAddressFamily_sort(addr);
OPENSSL_assert(v3_addr_is_canonical(addr));
return 1;
@@ -1017,6 +1055,11 @@ static void *v2i_IPAddrBlocks(const struct v3_ext_method *method,
X509V3_conf_err(val);
goto err;
}
+ if (memcmp(min, max, length_from_afi(afi)) > 0) {
+ X509V3err(X509V3_F_V2I_IPADDRBLOCKS, X509V3_R_EXTENSION_VALUE_ERROR);
+ X509V3_conf_err(val);
+ goto err;
+ }
if (!v3_addr_add_range(addr, afi, safi, min, max)) {
X509V3err(X509V3_F_V2I_IPADDRBLOCKS, ERR_R_MALLOC_FAILURE);
goto err;
@@ -1102,13 +1145,15 @@ static int addr_contains(IPAddressOrRanges *parent,
p = 0;
for (c = 0; c < sk_IPAddressOrRange_num(child); c++) {
- extract_min_max(sk_IPAddressOrRange_value(child, c),
- c_min, c_max, length);
+ if (!extract_min_max(sk_IPAddressOrRange_value(child, c),
+ c_min, c_max, length))
+ return -1;
for (;; p++) {
if (p >= sk_IPAddressOrRange_num(parent))
return 0;
- extract_min_max(sk_IPAddressOrRange_value(parent, p),
- p_min, p_max, length);
+ if (!extract_min_max(sk_IPAddressOrRange_value(parent, p),
+ p_min, p_max, length))
+ return 0;
if (memcmp(p_max, c_max, length) < 0)
continue;
if (memcmp(p_min, c_min, length) > 0)
@@ -1130,7 +1175,7 @@ int v3_addr_subset(IPAddrBlocks *a, IPAddrBlocks *b)
return 1;
if (b == NULL || v3_addr_inherits(a) || v3_addr_inherits(b))
return 0;
- sk_IPAddressFamily_set_cmp_func(b, IPAddressFamily_cmp);
+ (void) sk_IPAddressFamily_set_cmp_func(b, IPAddressFamily_cmp);
for (i = 0; i < sk_IPAddressFamily_num(a); i++) {
IPAddressFamily *fa = sk_IPAddressFamily_value(a, i);
int j = sk_IPAddressFamily_find(b, fa);
@@ -1195,7 +1240,7 @@ static int v3_addr_validate_path_internal(X509_STORE_CTX *ctx,
}
if (!v3_addr_is_canonical(ext))
validation_err(X509_V_ERR_INVALID_EXTENSION);
- sk_IPAddressFamily_set_cmp_func(ext, IPAddressFamily_cmp);
+ (void) sk_IPAddressFamily_set_cmp_func(ext, IPAddressFamily_cmp);
if ((child = sk_IPAddressFamily_dup(ext)) == NULL) {
X509V3err(X509V3_F_V3_ADDR_VALIDATE_PATH_INTERNAL, ERR_R_MALLOC_FAILURE);
ret = 0;
@@ -1221,7 +1266,7 @@ static int v3_addr_validate_path_internal(X509_STORE_CTX *ctx,
}
continue;
}
- sk_IPAddressFamily_set_cmp_func(x->rfc3779_addr, IPAddressFamily_cmp);
+ (void) sk_IPAddressFamily_set_cmp_func(x->rfc3779_addr, IPAddressFamily_cmp);
for (j = 0; j < sk_IPAddressFamily_num(child); j++) {
IPAddressFamily *fc = sk_IPAddressFamily_value(child, j);
int k = sk_IPAddressFamily_find(x->rfc3779_addr, fc);
diff --git a/main/openssl/crypto/x509v3/v3_asid.c b/main/openssl/crypto/x509v3/v3_asid.c
index 3f434c06..1587e8ed 100644
--- a/main/openssl/crypto/x509v3/v3_asid.c
+++ b/main/openssl/crypto/x509v3/v3_asid.c
@@ -358,6 +358,20 @@ static int ASIdentifierChoice_is_canonical(ASIdentifierChoice *choice)
goto done;
}
+ /*
+ * Check for inverted range.
+ */
+ i = sk_ASIdOrRange_num(choice->u.asIdsOrRanges) - 1;
+ {
+ ASIdOrRange *a = sk_ASIdOrRange_value(choice->u.asIdsOrRanges, i);
+ ASN1_INTEGER *a_min, *a_max;
+ if (a != NULL && a->type == ASIdOrRange_range) {
+ extract_min_max(a, &a_min, &a_max);
+ if (ASN1_INTEGER_cmp(a_min, a_max) > 0)
+ goto done;
+ }
+ }
+
ret = 1;
done:
@@ -392,9 +406,18 @@ static int ASIdentifierChoice_canonize(ASIdentifierChoice *choice)
return 1;
/*
- * We have a list. Sort it.
+ * If not a list, or if empty list, it's broken.
+ */
+ if (choice->type != ASIdentifierChoice_asIdsOrRanges ||
+ sk_ASIdOrRange_num(choice->u.asIdsOrRanges) == 0) {
+ X509V3err(X509V3_F_ASIDENTIFIERCHOICE_CANONIZE,
+ X509V3_R_EXTENSION_VALUE_ERROR);
+ return 0;
+ }
+
+ /*
+ * We have a non-empty list. Sort it.
*/
- OPENSSL_assert(choice->type == ASIdentifierChoice_asIdsOrRanges);
sk_ASIdOrRange_sort(choice->u.asIdsOrRanges);
/*
@@ -415,6 +438,13 @@ static int ASIdentifierChoice_canonize(ASIdentifierChoice *choice)
OPENSSL_assert(ASN1_INTEGER_cmp(a_min, b_min) <= 0);
/*
+ * Punt inverted ranges.
+ */
+ if (ASN1_INTEGER_cmp(a_min, a_max) > 0 ||
+ ASN1_INTEGER_cmp(b_min, b_max) > 0)
+ goto done;
+
+ /*
* Check for overlaps.
*/
if (ASN1_INTEGER_cmp(a_max, b_min) >= 0) {
@@ -465,12 +495,26 @@ static int ASIdentifierChoice_canonize(ASIdentifierChoice *choice)
break;
}
ASIdOrRange_free(b);
- sk_ASIdOrRange_delete(choice->u.asIdsOrRanges, i + 1);
+ (void) sk_ASIdOrRange_delete(choice->u.asIdsOrRanges, i + 1);
i--;
continue;
}
}
+ /*
+ * Check for final inverted range.
+ */
+ i = sk_ASIdOrRange_num(choice->u.asIdsOrRanges) - 1;
+ {
+ ASIdOrRange *a = sk_ASIdOrRange_value(choice->u.asIdsOrRanges, i);
+ ASN1_INTEGER *a_min, *a_max;
+ if (a != NULL && a->type == ASIdOrRange_range) {
+ extract_min_max(a, &a_min, &a_max);
+ if (ASN1_INTEGER_cmp(a_min, a_max) > 0)
+ goto done;
+ }
+ }
+
OPENSSL_assert(ASIdentifierChoice_is_canonical(choice)); /* Paranoia */
ret = 1;
@@ -498,6 +542,7 @@ static void *v2i_ASIdentifiers(const struct v3_ext_method *method,
struct v3_ext_ctx *ctx,
STACK_OF(CONF_VALUE) *values)
{
+ ASN1_INTEGER *min = NULL, *max = NULL;
ASIdentifiers *asid = NULL;
int i;
@@ -508,7 +553,6 @@ static void *v2i_ASIdentifiers(const struct v3_ext_method *method,
for (i = 0; i < sk_CONF_VALUE_num(values); i++) {
CONF_VALUE *val = sk_CONF_VALUE_value(values, i);
- ASN1_INTEGER *min = NULL, *max = NULL;
int i1, i2, i3, is_range, which;
/*
@@ -578,18 +622,19 @@ static void *v2i_ASIdentifiers(const struct v3_ext_method *method,
max = s2i_ASN1_INTEGER(NULL, s + i2);
OPENSSL_free(s);
if (min == NULL || max == NULL) {
- ASN1_INTEGER_free(min);
- ASN1_INTEGER_free(max);
X509V3err(X509V3_F_V2I_ASIDENTIFIERS, ERR_R_MALLOC_FAILURE);
goto err;
}
+ if (ASN1_INTEGER_cmp(min, max) > 0) {
+ X509V3err(X509V3_F_V2I_ASIDENTIFIERS, X509V3_R_EXTENSION_VALUE_ERROR);
+ goto err;
+ }
}
if (!v3_asid_add_id_or_range(asid, which, min, max)) {
- ASN1_INTEGER_free(min);
- ASN1_INTEGER_free(max);
X509V3err(X509V3_F_V2I_ASIDENTIFIERS, ERR_R_MALLOC_FAILURE);
goto err;
}
+ min = max = NULL;
}
/*
@@ -601,6 +646,8 @@ static void *v2i_ASIdentifiers(const struct v3_ext_method *method,
err:
ASIdentifiers_free(asid);
+ ASN1_INTEGER_free(min);
+ ASN1_INTEGER_free(max);
return NULL;
}
diff --git a/main/openssl/crypto/x509v3/v3_pci.c b/main/openssl/crypto/x509v3/v3_pci.c
index 0dcfa004..f7b733ae 100644
--- a/main/openssl/crypto/x509v3/v3_pci.c
+++ b/main/openssl/crypto/x509v3/v3_pci.c
@@ -2,7 +2,7 @@
/* Contributed to the OpenSSL Project 2004
* by Richard Levitte (richard@levitte.org)
*/
-/* Copyright (c) 2004 Kungliga Tekniska Högskolan
+/* Copyright (c) 2004 Kungliga Tekniska Högskolan
* (Royal Institute of Technology, Stockholm, Sweden).
* All rights reserved.
*
diff --git a/main/openssl/crypto/x509v3/v3_pcia.c b/main/openssl/crypto/x509v3/v3_pcia.c
index bb362e0e..eb082739 100644
--- a/main/openssl/crypto/x509v3/v3_pcia.c
+++ b/main/openssl/crypto/x509v3/v3_pcia.c
@@ -2,7 +2,7 @@
/* Contributed to the OpenSSL Project 2004
* by Richard Levitte (richard@levitte.org)
*/
-/* Copyright (c) 2004 Kungliga Tekniska Högskolan
+/* Copyright (c) 2004 Kungliga Tekniska Högskolan
* (Royal Institute of Technology, Stockholm, Sweden).
* All rights reserved.
*
diff --git a/main/openssl/crypto/x509v3/v3_purp.c b/main/openssl/crypto/x509v3/v3_purp.c
index 181bd349..ad688657 100644
--- a/main/openssl/crypto/x509v3/v3_purp.c
+++ b/main/openssl/crypto/x509v3/v3_purp.c
@@ -474,11 +474,11 @@ static void x509v3_cache_extensions(X509 *x)
for (i = 0; i < X509_get_ext_count(x); i++)
{
ex = X509_get_ext(x, i);
- if (!X509_EXTENSION_get_critical(ex))
- continue;
if (OBJ_obj2nid(X509_EXTENSION_get_object(ex))
== NID_freshest_crl)
x->ex_flags |= EXFLAG_FRESHEST;
+ if (!X509_EXTENSION_get_critical(ex))
+ continue;
if (!X509_supported_extension(ex))
{
x->ex_flags |= EXFLAG_CRITICAL;
diff --git a/main/openssl/crypto/x509v3/v3_skey.c b/main/openssl/crypto/x509v3/v3_skey.c
index 202c9e48..0a984fba 100644
--- a/main/openssl/crypto/x509v3/v3_skey.c
+++ b/main/openssl/crypto/x509v3/v3_skey.c
@@ -129,7 +129,8 @@ static ASN1_OCTET_STRING *s2i_skey_id(X509V3_EXT_METHOD *method,
goto err;
}
- EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL);
+ if (!EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL))
+ goto err;
if(!M_ASN1_OCTET_STRING_set(oct, pkey_dig, diglen)) {
X509V3err(X509V3_F_S2I_SKEY_ID,ERR_R_MALLOC_FAILURE);
diff --git a/main/openssl/crypto/x509v3/v3_utl.c b/main/openssl/crypto/x509v3/v3_utl.c
index e0302345..87cfceb4 100644
--- a/main/openssl/crypto/x509v3/v3_utl.c
+++ b/main/openssl/crypto/x509v3/v3_utl.c
@@ -365,7 +365,7 @@ char *hex_to_string(const unsigned char *buffer, long len)
char *tmp, *q;
const unsigned char *p;
int i;
- const static char hexdig[] = "0123456789ABCDEF";
+ static const char hexdig[] = "0123456789ABCDEF";
if(!buffer || !len) return NULL;
if(!(tmp = OPENSSL_malloc(len * 3 + 1))) {
X509V3err(X509V3_F_HEX_TO_STRING,ERR_R_MALLOC_FAILURE);
diff --git a/main/openssl/crypto/x86_64cpuid.S b/main/openssl/crypto/x86_64cpuid.S
new file mode 100644
index 00000000..562b03ab
--- /dev/null
+++ b/main/openssl/crypto/x86_64cpuid.S
@@ -0,0 +1,234 @@
+
+.hidden OPENSSL_cpuid_setup
+.section .init
+ call OPENSSL_cpuid_setup
+
+.hidden OPENSSL_ia32cap_P
+.comm OPENSSL_ia32cap_P,8,4
+
+.text
+
+.globl OPENSSL_atomic_add
+.type OPENSSL_atomic_add,@function
+.align 16
+OPENSSL_atomic_add:
+ movl (%rdi),%eax
+.Lspin: leaq (%rsi,%rax,1),%r8
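+	# the 0xf0 byte below is a LOCK prefix for the following cmpxchgl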
+.byte 0xf0
+ cmpxchgl %r8d,(%rdi)
+ jne .Lspin
+ movl %r8d,%eax
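+	# 0x48,0x98 is cltq (sign-extend %eax into %rax); the 0xf3,0xc3
+	# sequences throughout this file encode "repz ret"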
+.byte 0x48,0x98
+ .byte 0xf3,0xc3
+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.globl OPENSSL_rdtsc
+.type OPENSSL_rdtsc,@function
+.align 16
+OPENSSL_rdtsc:
+ rdtsc
+ shlq $32,%rdx
+ orq %rdx,%rax
+ .byte 0xf3,0xc3
+.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
+
+.globl OPENSSL_ia32_cpuid
+.type OPENSSL_ia32_cpuid,@function
+.align 16
+OPENSSL_ia32_cpuid:
+ movq %rbx,%r8
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,%r11d
+
+ xorl %eax,%eax
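+	# "GenuineIntel" vendor check: 0x756e6547 = "Genu",
+	# 0x49656e69 = "ineI", 0x6c65746e = "ntel"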
+ cmpl $1970169159,%ebx
+ setne %al
+ movl %eax,%r9d
+ cmpl $1231384169,%edx
+ setne %al
+ orl %eax,%r9d
+ cmpl $1818588270,%ecx
+ setne %al
+ orl %eax,%r9d
+ jz .Lintel
+
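+	# "AuthenticAMD" vendor check ("Auth"/"enti"/"cAMD")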
+ cmpl $1752462657,%ebx
+ setne %al
+ movl %eax,%r10d
+ cmpl $1769238117,%edx
+ setne %al
+ orl %eax,%r10d
+ cmpl $1145913699,%ecx
+ setne %al
+ orl %eax,%r10d
+ jnz .Lintel
+
+
+ movl $2147483648,%eax
+ cpuid
+ cmpl $2147483649,%eax
+ jb .Lintel
+ movl %eax,%r10d
+ movl $2147483649,%eax
+ cpuid
+ orl %ecx,%r9d
+ andl $2049,%r9d
+
+ cmpl $2147483656,%r10d
+ jb .Lintel
+
+ movl $2147483656,%eax
+ cpuid
+ movzbq %cl,%r10
+ incq %r10
+
+ movl $1,%eax
+ cpuid
+ btl $28,%edx
+ jnc .Lgeneric
+ shrl $16,%ebx
+ cmpb %r10b,%bl
+ ja .Lgeneric
+ andl $4026531839,%edx
+ jmp .Lgeneric
+
+.Lintel:
+ cmpl $4,%r11d
+ movl $-1,%r10d
+ jb .Lnocacheinfo
+
+ movl $4,%eax
+ movl $0,%ecx
+ cpuid
+ movl %eax,%r10d
+ shrl $14,%r10d
+ andl $4095,%r10d
+
+.Lnocacheinfo:
+ movl $1,%eax
+ cpuid
+ andl $3220176895,%edx
+ cmpl $0,%r9d
+ jne .Lnotintel
+ orl $1073741824,%edx
+ andb $15,%ah
+ cmpb $15,%ah
+ jne .Lnotintel
+ orl $1048576,%edx
+.Lnotintel:
+ btl $28,%edx
+ jnc .Lgeneric
+ andl $4026531839,%edx
+ cmpl $0,%r10d
+ je .Lgeneric
+
+ orl $268435456,%edx
+ shrl $16,%ebx
+ cmpb $1,%bl
+ ja .Lgeneric
+ andl $4026531839,%edx
+.Lgeneric:
+ andl $2048,%r9d
+ andl $4294965247,%ecx
+ orl %ecx,%r9d
+
+ movl %edx,%r10d
+ btl $27,%r9d
+ jnc .Lclear_avx
+ xorl %ecx,%ecx
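+	# xgetbv: read XCR0 into %edx:%eax (byte-encoded for older assemblers)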
+.byte 0x0f,0x01,0xd0
+ andl $6,%eax
+ cmpl $6,%eax
+ je .Ldone
+.Lclear_avx:
+ movl $4026525695,%eax
+ andl %eax,%r9d
+.Ldone:
+ shlq $32,%r9
+ movl %r10d,%eax
+ movq %r8,%rbx
+ orq %r9,%rax
+ .byte 0xf3,0xc3
+.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+
+.globl OPENSSL_cleanse
+.type OPENSSL_cleanse,@function
+.align 16
+OPENSSL_cleanse:
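+	# zero len (%rsi) bytes at ptr (%rdi): a byte loop handles short
+	# or unaligned data, 8-byte stores handle the aligned bulk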
+ xorq %rax,%rax
+ cmpq $15,%rsi
+ jae .Lot
+ cmpq $0,%rsi
+ je .Lret
+.Little:
+ movb %al,(%rdi)
+ subq $1,%rsi
+ leaq 1(%rdi),%rdi
+ jnz .Little
+.Lret:
+ .byte 0xf3,0xc3
+.align 16
+.Lot:
+ testq $7,%rdi
+ jz .Laligned
+ movb %al,(%rdi)
+ leaq -1(%rsi),%rsi
+ leaq 1(%rdi),%rdi
+ jmp .Lot
+.Laligned:
+ movq %rax,(%rdi)
+ leaq -8(%rsi),%rsi
+ testq $-8,%rsi
+ leaq 8(%rdi),%rdi
+ jnz .Laligned
+ cmpq $0,%rsi
+ jne .Little
+ .byte 0xf3,0xc3
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
+.globl OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,@function
+.align 16
+OPENSSL_wipe_cpu:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ xorq %rcx,%rcx
+ xorq %rdx,%rdx
+ xorq %rsi,%rsi
+ xorq %rdi,%rdi
+ xorq %r8,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+ leaq 8(%rsp),%rax
+ .byte 0xf3,0xc3
+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+.globl OPENSSL_ia32_rdrand
+.type OPENSSL_ia32_rdrand,@function
+.align 16
+OPENSSL_ia32_rdrand:
+ movl $8,%ecx
+.Loop_rdrand:
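+	# rdrand %rax, byte-encoded for assemblers without RDRAND support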
+.byte 72,15,199,240
+ jc .Lbreak_rdrand
+ loop .Loop_rdrand
+.Lbreak_rdrand:
+ cmpq $0,%rax
+ cmoveq %rcx,%rax
+ .byte 0xf3,0xc3
+.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
diff --git a/main/openssl/crypto/x86_64cpuid.pl b/main/openssl/crypto/x86_64cpuid.pl
index c96821a3..6ebfd017 100644
--- a/main/openssl/crypto/x86_64cpuid.pl
+++ b/main/openssl/crypto/x86_64cpuid.pl
@@ -7,15 +7,25 @@ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-if ($win64) { $arg1="%rcx"; $arg2="%rdx"; }
-else { $arg1="%rdi"; $arg2="%rsi"; }
print<<___;
.extern OPENSSL_cpuid_setup
+.hidden OPENSSL_cpuid_setup
.section .init
call OPENSSL_cpuid_setup
+.hidden OPENSSL_ia32cap_P
+.comm OPENSSL_ia32cap_P,8,4
+
.text
.globl OPENSSL_atomic_add
@@ -46,7 +56,7 @@ OPENSSL_rdtsc:
.type OPENSSL_ia32_cpuid,\@abi-omnipotent
.align 16
OPENSSL_ia32_cpuid:
- mov %rbx,%r8
+ mov %rbx,%r8 # save %rbx
xor %eax,%eax
cpuid
@@ -78,7 +88,15 @@ OPENSSL_ia32_cpuid:
# AMD specific
mov \$0x80000000,%eax
cpuid
- cmp \$0x80000008,%eax
+ cmp \$0x80000001,%eax
+ jb .Lintel
+ mov %eax,%r10d
+ mov \$0x80000001,%eax
+ cpuid
+ or %ecx,%r9d
+ and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11
+
+ cmp \$0x80000008,%r10d
jb .Lintel
mov \$0x80000008,%eax
@@ -89,12 +107,12 @@ OPENSSL_ia32_cpuid:
mov \$1,%eax
cpuid
bt \$28,%edx # test hyper-threading bit
- jnc .Ldone
+ jnc .Lgeneric
shr \$16,%ebx # number of logical processors
cmp %r10b,%bl
- ja .Ldone
+ ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
- jmp .Ldone
+ jmp .Lgeneric
.Lintel:
cmp \$4,%r11d
@@ -111,30 +129,47 @@ OPENSSL_ia32_cpuid:
.Lnocacheinfo:
mov \$1,%eax
cpuid
+ and \$0xbfefffff,%edx # force reserved bits to 0
cmp \$0,%r9d
jne .Lnotintel
- or \$0x00100000,%edx # use reserved 20th bit to engage RC4_CHAR
+ or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs
and \$15,%ah
cmp \$15,%ah # examine Family ID
- je .Lnotintel
- or \$0x40000000,%edx # use reserved bit to skip unrolled loop
+ jne .Lnotintel
+ or \$0x00100000,%edx # set reserved bit#20 to engage RC4_CHAR
.Lnotintel:
bt \$28,%edx # test hyper-threading bit
- jnc .Ldone
+ jnc .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
cmp \$0,%r10d
- je .Ldone
+ je .Lgeneric
or \$0x10000000,%edx # 1<<28
shr \$16,%ebx
cmp \$1,%bl # see if cache is shared
- ja .Ldone
+ ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
+.Lgeneric:
+ and \$0x00000800,%r9d # isolate AMD XOP flag
+ and \$0xfffff7ff,%ecx
+ or %ecx,%r9d # merge AMD XOP flag
+
+ mov %edx,%r10d # %r9d:%r10d is copy of %ecx:%edx
+ bt \$27,%r9d # check OSXSAVE bit
+ jnc .Lclear_avx
+ xor %ecx,%ecx # XCR0
+ .byte 0x0f,0x01,0xd0 # xgetbv
+ and \$6,%eax # isolate XMM and YMM state support
+ cmp \$6,%eax
+ je .Ldone
+.Lclear_avx:
+ mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
+ and %eax,%r9d # clear AVX, FMA and AMD XOP bits
.Ldone:
- shl \$32,%rcx
- mov %edx,%eax
- mov %r8,%rbx
- or %rcx,%rax
+ shl \$32,%r9
+ mov %r10d,%eax
+ mov %r8,%rbx # restore %rbx
+ or %r9,%rax
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
@@ -229,4 +264,21 @@ OPENSSL_wipe_cpu:
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
+print<<___;
+.globl OPENSSL_ia32_rdrand
+.type OPENSSL_ia32_rdrand,\@abi-omnipotent
+.align 16
+OPENSSL_ia32_rdrand:
+ mov \$8,%ecx
+.Loop_rdrand:
+ rdrand %rax
+ jc .Lbreak_rdrand
+ loop .Loop_rdrand
+.Lbreak_rdrand:
+ cmp \$0,%rax
+ cmove %rcx,%rax
+ ret
+.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
+___
+
close STDOUT; # flush
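
For reference, the OPENSSL_ia32_rdrand routine added above amounts to the following retry loop. This C sketch is illustrative only (the function name is hypothetical, and an RDRAND-capable toolchain with -mrdrnd is assumed):

    #include <stdint.h>
    #include <immintrin.h>

    static uint64_t ia32_rdrand(void)
    {
        unsigned long long v = 0;
        for (unsigned i = 8; i != 0; i--) {     /* mov $8,%ecx / loop */
            if (_rdrand64_step(&v))             /* CF set: valid value */
                return v != 0 ? v : i;          /* cmove: never 0 on success */
        }
        return 0;                               /* exhausted: signal failure */
    }
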
diff --git a/main/openssl/crypto/x86cpuid.S b/main/openssl/crypto/x86cpuid.S
new file mode 100644
index 00000000..73b5d98e
--- /dev/null
+++ b/main/openssl/crypto/x86cpuid.S
@@ -0,0 +1,334 @@
+.file "x86cpuid.s"
+.text
+.globl OPENSSL_ia32_cpuid
+.type OPENSSL_ia32_cpuid,@function
+.align 16
+OPENSSL_ia32_cpuid:
+.L_OPENSSL_ia32_cpuid_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %edx,%edx
+ pushfl
+ popl %eax
+ movl %eax,%ecx
+ xorl $2097152,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ xorl %eax,%ecx
+ xorl %eax,%eax
+ btl $21,%ecx
+ jnc .L000nocpuid
+	.byte	0x0f,0xa2		# cpuid
+	movl	%eax,%edi		# max standard query level
+	xorl	%eax,%eax
+	cmpl	$1970169159,%ebx	# "Genu"
+	setne	%al
+	movl	%eax,%ebp
+	cmpl	$1231384169,%edx	# "ineI"
+	setne	%al
+	orl	%eax,%ebp
+	cmpl	$1818588270,%ecx	# "ntel"
+	setne	%al
+	orl	%eax,%ebp		# %ebp==0 means GenuineIntel
+	jz	.L001intel
+	cmpl	$1752462657,%ebx	# "Auth"
+	setne	%al
+	movl	%eax,%esi
+	cmpl	$1769238117,%edx	# "enti"
+	setne	%al
+	orl	%eax,%esi
+	cmpl	$1145913699,%ecx	# "cAMD"
+	setne	%al
+	orl	%eax,%esi		# %esi==0 means AuthenticAMD
+	jnz	.L001intel
+	movl	$2147483648,%eax	# 0x80000000, max extended level
+	.byte	0x0f,0xa2		# cpuid
+	cmpl	$2147483649,%eax	# 0x80000001 supported?
+	jb	.L001intel
+	movl	%eax,%esi
+	movl	$2147483649,%eax	# 0x80000001
+	.byte	0x0f,0xa2		# cpuid
+	orl	%ecx,%ebp
+	andl	$2049,%ebp		# isolate AMD XOP bit, 1<<11|1
+	cmpl	$2147483656,%esi	# 0x80000008 supported?
+	jb	.L001intel
+	movl	$2147483656,%eax	# 0x80000008
+	.byte	0x0f,0xa2		# cpuid
+	movzbl	%cl,%esi
+	incl	%esi			# number of cores
+	movl	$1,%eax
+	xorl	%ecx,%ecx
+	.byte	0x0f,0xa2		# cpuid
+	btl	$28,%edx		# test hyper-threading bit
+	jnc	.L002generic
+	shrl	$16,%ebx
+	andl	$255,%ebx		# number of logical processors
+	cmpl	%esi,%ebx
+	ja	.L002generic
+	andl	$4026531839,%edx	# ~(1<<28)
+	jmp	.L002generic
+.L001intel:
+	cmpl	$4,%edi
+	movl	$-1,%edi
+	jb	.L003nocacheinfo
+	movl	$4,%eax			# query cache parameters
+	movl	$0,%ecx
+	.byte	0x0f,0xa2		# cpuid
+	movl	%eax,%edi
+	shrl	$14,%edi
+	andl	$4095,%edi		# number of cores -1 per L1D
+.L003nocacheinfo:
+	movl	$1,%eax
+	xorl	%ecx,%ecx
+	.byte	0x0f,0xa2		# cpuid
+	andl	$3220176895,%edx	# force reserved bits #20, #30 to 0
+	cmpl	$0,%ebp
+	jne	.L004notintel
+	orl	$1073741824,%edx	# set reserved bit#30 on Intel CPUs
+	andb	$15,%ah
+	cmpb	$15,%ah			# examine Family ID
+	jne	.L004notintel
+	orl	$1048576,%edx		# set reserved bit#20 to engage RC4_CHAR
+.L004notintel:
+ btl $28,%edx
+ jnc .L002generic
+ andl $4026531839,%edx
+ cmpl $0,%edi
+ je .L002generic
+ orl $268435456,%edx
+ shrl $16,%ebx
+ cmpb $1,%bl
+ ja .L002generic
+ andl $4026531839,%edx
+.L002generic:
+	andl	$2048,%ebp		# isolate AMD XOP flag
+	andl	$4294965247,%ecx	# force 11th bit to 0
+	movl	%edx,%esi
+	orl	%ecx,%ebp		# merge AMD XOP flag
+	btl	$27,%ecx		# check OSXSAVE bit
+	jnc	.L005clear_avx
+	xorl	%ecx,%ecx		# XCR0
+.byte	15,1,208			# xgetbv
+	andl	$6,%eax			# isolate XMM and YMM state support
+	cmpl	$6,%eax
+	je	.L006done
+	cmpl	$2,%eax			# XMM state only?
+	je	.L005clear_avx
+.L007clear_xmm:
+	andl	$4261412861,%ebp	# clear AESNI and PCLMULQDQ bits
+	andl	$4278190079,%esi	# clear FXSR
+.L005clear_avx:
+	andl	$4026525695,%ebp	# clear AVX, FMA and AMD XOP bits
+.L006done:
+	movl	%esi,%eax
+	movl	%ebp,%edx
+.L000nocpuid:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size OPENSSL_ia32_cpuid,.-.L_OPENSSL_ia32_cpuid_begin
+.globl OPENSSL_rdtsc
+.type OPENSSL_rdtsc,@function
+.align 16
+OPENSSL_rdtsc:
+.L_OPENSSL_rdtsc_begin:
+ xorl %eax,%eax
+ xorl %edx,%edx
+ call .L008PIC_me_up
+.L008PIC_me_up:
+ popl %ecx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L008PIC_me_up](%ecx),%ecx
+ movl OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+	btl	$4,(%ecx)		# test TSC capability bit
+	jnc	.L009notsc
+	.byte	0x0f,0x31		# rdtsc
+.L009notsc:
+ ret
+.size OPENSSL_rdtsc,.-.L_OPENSSL_rdtsc_begin
+.globl OPENSSL_instrument_halt
+.type OPENSSL_instrument_halt,@function
+.align 16
+OPENSSL_instrument_halt:
+.L_OPENSSL_instrument_halt_begin:
+ call .L010PIC_me_up
+.L010PIC_me_up:
+ popl %ecx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%ecx),%ecx
+ movl OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+ btl $4,(%ecx)
+ jnc .L011nohalt
+.long	2421723150			# push %cs; pop %eax
+	andl	$3,%eax			# check privilege level
+	jnz	.L011nohalt		# not enough privileges
+	pushfl
+	popl	%eax
+	btl	$9,%eax			# interrupt flag
+	jnc	.L011nohalt		# interrupts are disabled
+	.byte	0x0f,0x31		# rdtsc
+	pushl	%edx
+	pushl	%eax
+	hlt
+	.byte	0x0f,0x31		# rdtsc
+	subl	(%esp),%eax		# cycles spent in halted state
+	sbbl	4(%esp),%edx
+ addl $8,%esp
+ ret
+.L011nohalt:
+ xorl %eax,%eax
+ xorl %edx,%edx
+ ret
+.size OPENSSL_instrument_halt,.-.L_OPENSSL_instrument_halt_begin
+.globl OPENSSL_far_spin
+.type OPENSSL_far_spin,@function
+.align 16
+OPENSSL_far_spin:
+.L_OPENSSL_far_spin_begin:
+ pushfl
+ popl %eax
+ btl $9,%eax
+ jnc .L012nospin
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+.long 2430111262
+ xorl %eax,%eax
+ movl (%ecx),%edx
+ jmp .L013spin
+.align 16
+.L013spin:
+ incl %eax
+ cmpl (%ecx),%edx
+ je .L013spin
+.long 529567888
+ ret
+.L012nospin:
+ xorl %eax,%eax
+ xorl %edx,%edx
+ ret
+.size OPENSSL_far_spin,.-.L_OPENSSL_far_spin_begin
+.globl OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,@function
+.align 16
+OPENSSL_wipe_cpu:
+.L_OPENSSL_wipe_cpu_begin:
+ xorl %eax,%eax
+ xorl %edx,%edx
+ call .L014PIC_me_up
+.L014PIC_me_up:
+ popl %ecx
+ leal _GLOBAL_OFFSET_TABLE_+[.-.L014PIC_me_up](%ecx),%ecx
+ movl OPENSSL_ia32cap_P@GOT(%ecx),%ecx
+ movl (%ecx),%ecx
+ btl $1,(%ecx)
+ jnc .L015no_x87
+.long	4007259865,4007259865,4007259865,4007259865,2430851995	# fldz x8; fwait; fninit; nop
+.L015no_x87:
+ leal 4(%esp),%eax
+ ret
+.size OPENSSL_wipe_cpu,.-.L_OPENSSL_wipe_cpu_begin
+.globl OPENSSL_atomic_add
+.type OPENSSL_atomic_add,@function
+.align 16
+OPENSSL_atomic_add:
+.L_OPENSSL_atomic_add_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%ecx
+ pushl %ebx
+ nop
+	movl	(%edx),%eax
+.L016spin:
+	leal	(%eax,%ecx,1),%ebx
+	nop
+.long	447811568			# lock cmpxchgl %ebx,(%edx)
+	jne	.L016spin
+	movl	%ebx,%eax
+ popl %ebx
+ ret
+.size OPENSSL_atomic_add,.-.L_OPENSSL_atomic_add_begin
+.globl OPENSSL_indirect_call
+.type OPENSSL_indirect_call,@function
+.align 16
+OPENSSL_indirect_call:
+.L_OPENSSL_indirect_call_begin:
+ pushl %ebp
+ movl %esp,%ebp
+ subl $28,%esp
+ movl 12(%ebp),%ecx
+ movl %ecx,(%esp)
+ movl 16(%ebp),%edx
+ movl %edx,4(%esp)
+ movl 20(%ebp),%eax
+ movl %eax,8(%esp)
+ movl 24(%ebp),%eax
+ movl %eax,12(%esp)
+ movl 28(%ebp),%eax
+ movl %eax,16(%esp)
+ movl 32(%ebp),%eax
+ movl %eax,20(%esp)
+ movl 36(%ebp),%eax
+ movl %eax,24(%esp)
+ call *8(%ebp)
+ movl %ebp,%esp
+ popl %ebp
+ ret
+.size OPENSSL_indirect_call,.-.L_OPENSSL_indirect_call_begin
+.globl OPENSSL_cleanse
+.type OPENSSL_cleanse,@function
+.align 16
+OPENSSL_cleanse:
+.L_OPENSSL_cleanse_begin:
+	movl	4(%esp),%edx		# buffer
+	movl	8(%esp),%ecx		# length
+	xorl	%eax,%eax		# zero filler
+	cmpl	$7,%ecx			# 7 bytes or more: align first
+ jae .L017lot
+ cmpl $0,%ecx
+ je .L018ret
+.L019little:
+ movb %al,(%edx)
+ subl $1,%ecx
+ leal 1(%edx),%edx
+ jnz .L019little
+.L018ret:
+ ret
+.align 16
+.L017lot:
+ testl $3,%edx
+ jz .L020aligned
+ movb %al,(%edx)
+ leal -1(%ecx),%ecx
+ leal 1(%edx),%edx
+ jmp .L017lot
+.L020aligned:
+	movl	%eax,(%edx)		# clear 4 bytes at a time
+ leal -4(%ecx),%ecx
+ testl $-4,%ecx
+ leal 4(%edx),%edx
+ jnz .L020aligned
+ cmpl $0,%ecx
+ jne .L019little
+ ret
+.size OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
+.globl OPENSSL_ia32_rdrand
+.type OPENSSL_ia32_rdrand,@function
+.align 16
+OPENSSL_ia32_rdrand:
+.L_OPENSSL_ia32_rdrand_begin:
+	movl	$8,%ecx			# up to 8 attempts
+.L021loop:
+.byte	15,199,240			# rdrand %eax
+	jc	.L022break		# CF set means a valid value
+	loop	.L021loop
+.L022break:
+	cmpl	$0,%eax
+	cmovel	%ecx,%eax		# never return 0 on success
+ ret
+.size OPENSSL_ia32_rdrand,.-.L_OPENSSL_ia32_rdrand_begin
+.comm OPENSSL_ia32cap_P,8,4
+.section .init
+ call OPENSSL_cpuid_setup
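
The OPENSSL_cleanse code above (and its x86_64 twin earlier in this patch) follows a byte/word/byte pattern; being assembly, the stores cannot be elided by a compiler. A simplified C restatement, illustrative only (the real code skips the alignment phase for short buffers, and the function name here is hypothetical):

    #include <stddef.h>

    static void cleanse_sketch(void *ptr, size_t len)
    {
        volatile unsigned char *p = ptr;
        /* clear single bytes until the pointer is word-aligned */
        while (len > 0 && ((size_t)p & (sizeof(size_t) - 1))) {
            *p++ = 0; len--;
        }
        /* clear a full word per iteration (4 bytes on x86, 8 on x86_64) */
        while (len >= sizeof(size_t)) {
            *(volatile size_t *)p = 0;
            p += sizeof(size_t); len -= sizeof(size_t);
        }
        /* clear the remaining tail bytes */
        while (len--)
            *p++ = 0;
    }
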
diff --git a/main/openssl/crypto/x86cpuid.pl b/main/openssl/crypto/x86cpuid.pl
index a7464af1..b270b443 100644
--- a/main/openssl/crypto/x86cpuid.pl
+++ b/main/openssl/crypto/x86cpuid.pl
@@ -19,9 +19,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&pushf ();
&pop ("eax");
&xor ("ecx","eax");
- &bt ("ecx",21);
- &jnc (&label("done"));
&xor ("eax","eax");
+ &bt ("ecx",21);
+ &jnc (&label("nocpuid"));
&cpuid ();
&mov ("edi","eax"); # max value for standard query level
@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
# AMD specific
&mov ("eax",0x80000000);
&cpuid ();
- &cmp ("eax",0x80000008);
+ &cmp ("eax",0x80000001);
+ &jb (&label("intel"));
+ &mov ("esi","eax");
+ &mov ("eax",0x80000001);
+ &cpuid ();
+ &or ("ebp","ecx");
+ &and ("ebp",1<<11|1); # isolate XOP bit
+ &cmp ("esi",0x80000008);
&jb (&label("intel"));
&mov ("eax",0x80000008);
@@ -60,15 +67,16 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&inc ("esi"); # number of cores
&mov ("eax",1);
+ &xor ("ecx","ecx");
&cpuid ();
&bt ("edx",28);
- &jnc (&label("done"));
+ &jnc (&label("generic"));
&shr ("ebx",16);
&and ("ebx",0xff);
&cmp ("ebx","esi");
- &ja (&label("done"));
+ &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit
- &jmp (&label("done"));
+ &jmp (&label("generic"));
&set_label("intel");
&cmp ("edi",4);
@@ -84,28 +92,53 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&set_label("nocacheinfo");
&mov ("eax",1);
+ &xor ("ecx","ecx");
&cpuid ();
+ &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0
&cmp ("ebp",0);
- &jne (&label("notP4"));
+ &jne (&label("notintel"));
+ &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs
 	&and	(&HB("eax"),15);	# family ID
&cmp (&HB("eax"),15); # P4?
- &jne (&label("notP4"));
- &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
-&set_label("notP4");
+ &jne (&label("notintel"));
+ &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR
+&set_label("notintel");
&bt ("edx",28); # test hyper-threading bit
- &jnc (&label("done"));
+ &jnc (&label("generic"));
&and ("edx",0xefffffff);
&cmp ("edi",0);
- &je (&label("done"));
+ &je (&label("generic"));
&or ("edx",0x10000000);
&shr ("ebx",16);
&cmp (&LB("ebx"),1);
- &ja (&label("done"));
+ &ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit if not
+
+&set_label("generic");
+ &and ("ebp",1<<11); # isolate AMD XOP flag
+ &and ("ecx",0xfffff7ff); # force 11th bit to 0
+ &mov ("esi","edx");
+ &or ("ebp","ecx"); # merge AMD XOP flag
+
+ &bt ("ecx",27); # check OSXSAVE bit
+ &jnc (&label("clear_avx"));
+ &xor ("ecx","ecx");
+ &data_byte(0x0f,0x01,0xd0); # xgetbv
+ &and ("eax",6);
+ &cmp ("eax",6);
+ &je (&label("done"));
+ &cmp ("eax",2);
+ &je (&label("clear_avx"));
+&set_label("clear_xmm");
+ &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits
+ &and ("esi",0xfeffffff); # clear FXSR
+&set_label("clear_avx");
+ &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
&set_label("done");
- &mov ("eax","edx");
- &mov ("edx","ecx");
+ &mov ("eax","esi");
+ &mov ("edx","ebp");
+&set_label("nocpuid");
&function_end("OPENSSL_ia32_cpuid");
&external_label("OPENSSL_ia32cap_P");
@@ -134,7 +167,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&jnz (&label("nohalt")); # not enough privileges
&pushf ();
- &pop ("eax")
+ &pop ("eax");
&bt ("eax",9);
&jnc (&label("nohalt")); # interrupts are disabled
@@ -199,8 +232,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&bt (&DWP(0,"ecx"),1);
&jnc (&label("no_x87"));
if ($sse2) {
- &bt (&DWP(0,"ecx"),26);
- &jnc (&label("no_sse2"));
+ &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
+ &cmp ("ecx",1<<26|1<<24);
+ &jne (&label("no_sse2"));
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
@@ -248,7 +282,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
# arguments is 1 or 2!
&function_begin_B("OPENSSL_indirect_call");
{
- my $i,$max=7; # $max has to be chosen as 4*n-1
+ my ($max,$i)=(7,); # $max has to be chosen as 4*n-1
# in order to preserve eventual
# stack alignment
&push ("ebp");
@@ -307,6 +341,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&ret ();
&function_end_B("OPENSSL_cleanse");
+&function_begin_B("OPENSSL_ia32_rdrand");
+ &mov ("ecx",8);
+&set_label("loop");
+ &rdrand ("eax");
+ &jc (&label("break"));
+ &loop (&label("loop"));
+&set_label("break");
+ &cmp ("eax",0);
+ &cmove ("eax","ecx");
+ &ret ();
+&function_end_B("OPENSSL_ia32_rdrand");
+
&initseg("OPENSSL_cpuid_setup");
&asm_finish();
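
The OSXSAVE/XGETBV gate this patch adds to both cpuid scripts reads as: even when CPUID advertises AVX, the bits are kept only if the OS has enabled XMM and YMM state saving in XCR0. A hedged C equivalent using the same opcode bytes as the patch (illustrative only; GCC-style inline asm and a hypothetical function name assumed):

    #include <stdint.h>

    static int os_supports_avx(uint32_t cpuid_ecx)
    {
        uint32_t eax, edx;
        if (!(cpuid_ecx & (1u << 27)))   /* OSXSAVE not set: XGETBV would fault */
            return 0;
        __asm__ volatile (".byte 0x0f,0x01,0xd0"   /* xgetbv, reads XCR0 */
                          : "=a"(eax), "=d"(edx) : "c"(0));
        (void)edx;
        return (eax & 6) == 6;           /* XMM (bit 1) and YMM (bit 2) saved */
    }
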