From 3e4d8f433239c40311037616b1b8833a06651ae0 Mon Sep 17 00:00:00 2001 From: Arne Schwabe Date: Mon, 16 Apr 2012 19:21:14 +0200 Subject: Initial import --- openssl/crypto/des/asm/crypt586.pl | 209 ++++ openssl/crypto/des/asm/des-586.pl | 453 ++++++++ openssl/crypto/des/asm/des_enc.m4 | 2099 ++++++++++++++++++++++++++++++++++++ openssl/crypto/des/asm/desboth.pl | 79 ++ openssl/crypto/des/asm/readme | 131 +++ 5 files changed, 2971 insertions(+) create mode 100644 openssl/crypto/des/asm/crypt586.pl create mode 100644 openssl/crypto/des/asm/des-586.pl create mode 100644 openssl/crypto/des/asm/des_enc.m4 create mode 100644 openssl/crypto/des/asm/desboth.pl create mode 100644 openssl/crypto/des/asm/readme (limited to 'openssl/crypto/des/asm') diff --git a/openssl/crypto/des/asm/crypt586.pl b/openssl/crypto/des/asm/crypt586.pl new file mode 100644 index 00000000..e36f7d44 --- /dev/null +++ b/openssl/crypto/des/asm/crypt586.pl @@ -0,0 +1,209 @@ +#!/usr/local/bin/perl +# +# The inner loop instruction sequence and the IP/FP modifications are from +# Svend Olaf Mikkelsen +# I've added the stuff needed for crypt() but I've not worried about making +# things perfect. +# + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"crypt586.pl"); + +$L="edi"; +$R="esi"; + +&external_label("DES_SPtrans"); +&fcrypt_body("fcrypt_body"); +&asm_finish(); + +sub fcrypt_body + { + local($name,$do_ip)=@_; + + &function_begin($name); + + &comment(""); + &comment("Load the 2 words"); + $trans="ebp"; + + &xor( $L, $L); + &xor( $R, $R); + + # PIC-ification:-) + &picmeup("edx","DES_SPtrans"); + #if ($cpp) { &picmeup("edx","DES_SPtrans"); } + #else { &lea("edx",&DWP("DES_SPtrans")); } + &push("edx"); # becomes &swtmp(1) + # + &mov($trans,&wparam(1)); # reloaded with DES_SPtrans in D_ENCRYPT + + &push(&DWC(25)); # add a variable + + &set_label("start"); + for ($i=0; $i<16; $i+=2) + { + &comment(""); + &comment("Round $i"); + &D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx"); + + &comment(""); + &comment("Round ".sprintf("%d",$i+1)); + &D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx"); + } + &mov("ebx", &swtmp(0)); + &mov("eax", $L); + &dec("ebx"); + &mov($L, $R); + &mov($R, "eax"); + &mov(&swtmp(0), "ebx"); + &jnz(&label("start")); + + &comment(""); + &comment("FP"); + &mov("edx",&wparam(0)); + + &FP_new($R,$L,"eax",3); + &mov(&DWP(0,"edx","",0),"eax"); + &mov(&DWP(4,"edx","",0),$L); + + &add("esp",8); # remove variables + + &function_end($name); + } + +sub D_ENCRYPT + { + local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t)=@_; + + &mov( $u, &wparam(2)); # 2 + &mov( $t, $R); + &shr( $t, 16); # 1 + &mov( $tmp2, &wparam(3)); # 2 + &xor( $t, $R); # 1 + + &and( $u, $t); # 2 + &and( $t, $tmp2); # 2 + + &mov( $tmp1, $u); + &shl( $tmp1, 16); # 1 + &mov( $tmp2, $t); + &shl( $tmp2, 16); # 1 + &xor( $u, $tmp1); # 2 + &xor( $t, $tmp2); # 2 + &mov( $tmp1, &DWP(&n2a($S*4),$trans,"",0)); # 2 + &xor( $u, $tmp1); + &mov( $tmp2, &DWP(&n2a(($S+1)*4),$trans,"",0)); # 2 + &xor( $u, $R); + &xor( $t, $R); + &xor( $t, $tmp2); + + &and( $u, "0xfcfcfcfc" ); # 2 + &xor( $tmp1, $tmp1); # 1 + &and( $t, "0xcfcfcfcf" ); # 2 + &xor( $tmp2, $tmp2); + &movb( &LB($tmp1), &LB($u) ); + &movb( &LB($tmp2), &HB($u) ); + &rotr( $t, 4 ); + &mov( $trans, &swtmp(1)); + &xor( $L, &DWP(" ",$trans,$tmp1,0)); + &movb( &LB($tmp1), &LB($t) ); + &xor( $L, &DWP("0x200",$trans,$tmp2,0)); + &movb( &LB($tmp2), &HB($t) ); + &shr( $u, 16); + &xor( $L, &DWP("0x100",$trans,$tmp1,0)); + &movb( &LB($tmp1), &HB($u) ); + &shr( $t, 16); + &xor( $L, &DWP("0x300",$trans,$tmp2,0)); + &movb( &LB($tmp2), &HB($t) ); + &and( $u, "0xff" ); + &and( $t, "0xff" ); + &mov( $tmp1, &DWP("0x600",$trans,$tmp1,0)); + &xor( $L, $tmp1); + &mov( $tmp1, &DWP("0x700",$trans,$tmp2,0)); + &xor( $L, $tmp1); + &mov( $tmp1, &DWP("0x400",$trans,$u,0)); + &xor( $L, $tmp1); + &mov( $tmp1, &DWP("0x500",$trans,$t,0)); + &xor( $L, $tmp1); + &mov( $trans, &wparam(1)); + } + +sub n2a + { + sprintf("%d",$_[0]); + } + +# now has a side affect of rotating $a by $shift +sub R_PERM_OP + { + local($a,$b,$tt,$shift,$mask,$last)=@_; + + &rotl( $a, $shift ) if ($shift != 0); + &mov( $tt, $a ); + &xor( $a, $b ); + &and( $a, $mask ); + if ($notlast eq $b) + { + &xor( $b, $a ); + &xor( $tt, $a ); + } + else + { + &xor( $tt, $a ); + &xor( $b, $a ); + } + &comment(""); + } + +sub IP_new + { + local($l,$r,$tt,$lr)=@_; + + &R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l); + &R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l); + &R_PERM_OP($l,$tt,$r,14,"0x33333333",$r); + &R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r); + &R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r); + + if ($lr != 3) + { + if (($lr-3) < 0) + { &rotr($tt, 3-$lr); } + else { &rotl($tt, $lr-3); } + } + if ($lr != 2) + { + if (($lr-2) < 0) + { &rotr($r, 2-$lr); } + else { &rotl($r, $lr-2); } + } + } + +sub FP_new + { + local($l,$r,$tt,$lr)=@_; + + if ($lr != 2) + { + if (($lr-2) < 0) + { &rotl($r, 2-$lr); } + else { &rotr($r, $lr-2); } + } + if ($lr != 3) + { + if (($lr-3) < 0) + { &rotl($l, 3-$lr); } + else { &rotr($l, $lr-3); } + } + + &R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r); + &R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r); + &R_PERM_OP($l,$r,$tt,10,"0x33333333",$l); + &R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l); + &R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r); + &rotr($tt , 4); + } + diff --git a/openssl/crypto/des/asm/des-586.pl b/openssl/crypto/des/asm/des-586.pl new file mode 100644 index 00000000..5b5f39ce --- /dev/null +++ b/openssl/crypto/des/asm/des-586.pl @@ -0,0 +1,453 @@ +#!/usr/local/bin/perl +# +# The inner loop instruction sequence and the IP/FP modifications are from +# Svend Olaf Mikkelsen +# + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; +require "cbc.pl"; +require "desboth.pl"; + +# base code is in microsft +# op dest, source +# format. +# + +&asm_init($ARGV[0],"des-586.pl"); + +$L="edi"; +$R="esi"; +$trans="ebp"; +$small_footprint=1 if (grep(/\-DOPENSSL_SMALL_FOOTPRINT/,@ARGV)); +# one can discuss setting this variable to 1 unconditionally, as +# the folded loop is only 3% slower than unrolled, but >7 times smaller + +&public_label("DES_SPtrans"); + +&DES_encrypt_internal(); +&DES_decrypt_internal(); +&DES_encrypt("DES_encrypt1",1); +&DES_encrypt("DES_encrypt2",0); +&DES_encrypt3("DES_encrypt3",1); +&DES_encrypt3("DES_decrypt3",0); +&cbc("DES_ncbc_encrypt","DES_encrypt1","DES_encrypt1",0,4,5,3,5,-1); +&cbc("DES_ede3_cbc_encrypt","DES_encrypt3","DES_decrypt3",0,6,7,3,4,5); +&DES_SPtrans(); + +&asm_finish(); + +sub DES_encrypt_internal() + { + &function_begin_B("_x86_DES_encrypt"); + + if ($small_footprint) + { + &lea("edx",&DWP(128,"ecx")); + &push("edx"); + &push("ecx"); + &set_label("eloop"); + &D_ENCRYPT(0,$L,$R,0,$trans,"eax","ebx","ecx","edx",&swtmp(0)); + &comment(""); + &D_ENCRYPT(1,$R,$L,2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); + &comment(""); + &add("ecx",16); + &cmp("ecx",&swtmp(1)); + &mov(&swtmp(0),"ecx"); + &jb(&label("eloop")); + &add("esp",8); + } + else + { + &push("ecx"); + for ($i=0; $i<16; $i+=2) + { + &comment("Round $i"); + &D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); + &comment("Round ".sprintf("%d",$i+1)); + &D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); + } + &add("esp",4); + } + &ret(); + + &function_end_B("_x86_DES_encrypt"); + } + +sub DES_decrypt_internal() + { + &function_begin_B("_x86_DES_decrypt"); + + if ($small_footprint) + { + &push("ecx"); + &lea("ecx",&DWP(128,"ecx")); + &push("ecx"); + &set_label("dloop"); + &D_ENCRYPT(0,$L,$R,-2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); + &comment(""); + &D_ENCRYPT(1,$R,$L,-4,$trans,"eax","ebx","ecx","edx",&swtmp(0)); + &comment(""); + &sub("ecx",16); + &cmp("ecx",&swtmp(1)); + &mov(&swtmp(0),"ecx"); + &ja(&label("dloop")); + &add("esp",8); + } + else + { + &push("ecx"); + for ($i=15; $i>0; $i-=2) + { + &comment("Round $i"); + &D_ENCRYPT(15-$i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); + &comment("Round ".sprintf("%d",$i-1)); + &D_ENCRYPT(15-$i+1,$R,$L,($i-1)*2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); + } + &add("esp",4); + } + &ret(); + + &function_end_B("_x86_DES_decrypt"); + } + +sub DES_encrypt + { + local($name,$do_ip)=@_; + + &function_begin_B($name); + + &push("esi"); + &push("edi"); + + &comment(""); + &comment("Load the 2 words"); + + if ($do_ip) + { + &mov($R,&wparam(0)); + &xor( "ecx", "ecx" ); + + &push("ebx"); + &push("ebp"); + + &mov("eax",&DWP(0,$R,"",0)); + &mov("ebx",&wparam(2)); # get encrypt flag + &mov($L,&DWP(4,$R,"",0)); + &comment(""); + &comment("IP"); + &IP_new("eax",$L,$R,3); + } + else + { + &mov("eax",&wparam(0)); + &xor( "ecx", "ecx" ); + + &push("ebx"); + &push("ebp"); + + &mov($R,&DWP(0,"eax","",0)); + &mov("ebx",&wparam(2)); # get encrypt flag + &rotl($R,3); + &mov($L,&DWP(4,"eax","",0)); + &rotl($L,3); + } + + # PIC-ification:-) + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop($trans); + &lea ($trans,&DWP(&label("DES_SPtrans")."-".&label("pic_point"),$trans)); + + &mov( "ecx", &wparam(1) ); + + &cmp("ebx","0"); + &je(&label("decrypt")); + &call("_x86_DES_encrypt"); + &jmp(&label("done")); + &set_label("decrypt"); + &call("_x86_DES_decrypt"); + &set_label("done"); + + if ($do_ip) + { + &comment(""); + &comment("FP"); + &mov("edx",&wparam(0)); + &FP_new($L,$R,"eax",3); + + &mov(&DWP(0,"edx","",0),"eax"); + &mov(&DWP(4,"edx","",0),$R); + } + else + { + &comment(""); + &comment("Fixup"); + &rotr($L,3); # r + &mov("eax",&wparam(0)); + &rotr($R,3); # l + &mov(&DWP(0,"eax","",0),$L); + &mov(&DWP(4,"eax","",0),$R); + } + + &pop("ebp"); + &pop("ebx"); + &pop("edi"); + &pop("esi"); + &ret(); + + &function_end_B($name); + } + +sub D_ENCRYPT + { + local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t,$wp1)=@_; + + &mov( $u, &DWP(&n2a($S*4),$tmp2,"",0)); + &xor( $tmp1, $tmp1); + &mov( $t, &DWP(&n2a(($S+1)*4),$tmp2,"",0)); + &xor( $u, $R); + &xor( $tmp2, $tmp2); + &xor( $t, $R); + &and( $u, "0xfcfcfcfc" ); + &and( $t, "0xcfcfcfcf" ); + &movb( &LB($tmp1), &LB($u) ); + &movb( &LB($tmp2), &HB($u) ); + &rotr( $t, 4 ); + &xor( $L, &DWP(" ",$trans,$tmp1,0)); + &movb( &LB($tmp1), &LB($t) ); + &xor( $L, &DWP("0x200",$trans,$tmp2,0)); + &movb( &LB($tmp2), &HB($t) ); + &shr( $u, 16); + &xor( $L, &DWP("0x100",$trans,$tmp1,0)); + &movb( &LB($tmp1), &HB($u) ); + &shr( $t, 16); + &xor( $L, &DWP("0x300",$trans,$tmp2,0)); + &movb( &LB($tmp2), &HB($t) ); + &and( $u, "0xff" ); + &and( $t, "0xff" ); + &xor( $L, &DWP("0x600",$trans,$tmp1,0)); + &xor( $L, &DWP("0x700",$trans,$tmp2,0)); + &mov( $tmp2, $wp1 ); + &xor( $L, &DWP("0x400",$trans,$u,0)); + &xor( $L, &DWP("0x500",$trans,$t,0)); + } + +sub n2a + { + sprintf("%d",$_[0]); + } + +# now has a side affect of rotating $a by $shift +sub R_PERM_OP + { + local($a,$b,$tt,$shift,$mask,$last)=@_; + + &rotl( $a, $shift ) if ($shift != 0); + &mov( $tt, $a ); + &xor( $a, $b ); + &and( $a, $mask ); + # This can never succeed, and besides it is difficult to see what the + # idea was - Ben 13 Feb 99 + if (!$last eq $b) + { + &xor( $b, $a ); + &xor( $tt, $a ); + } + else + { + &xor( $tt, $a ); + &xor( $b, $a ); + } + &comment(""); + } + +sub IP_new + { + local($l,$r,$tt,$lr)=@_; + + &R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l); + &R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l); + &R_PERM_OP($l,$tt,$r,14,"0x33333333",$r); + &R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r); + &R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r); + + if ($lr != 3) + { + if (($lr-3) < 0) + { &rotr($tt, 3-$lr); } + else { &rotl($tt, $lr-3); } + } + if ($lr != 2) + { + if (($lr-2) < 0) + { &rotr($r, 2-$lr); } + else { &rotl($r, $lr-2); } + } + } + +sub FP_new + { + local($l,$r,$tt,$lr)=@_; + + if ($lr != 2) + { + if (($lr-2) < 0) + { &rotl($r, 2-$lr); } + else { &rotr($r, $lr-2); } + } + if ($lr != 3) + { + if (($lr-3) < 0) + { &rotl($l, 3-$lr); } + else { &rotr($l, $lr-3); } + } + + &R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r); + &R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r); + &R_PERM_OP($l,$r,$tt,10,"0x33333333",$l); + &R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l); + &R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r); + &rotr($tt , 4); + } + +sub DES_SPtrans + { + &set_label("DES_SPtrans",64); + &data_word(0x02080800, 0x00080000, 0x02000002, 0x02080802); + &data_word(0x02000000, 0x00080802, 0x00080002, 0x02000002); + &data_word(0x00080802, 0x02080800, 0x02080000, 0x00000802); + &data_word(0x02000802, 0x02000000, 0x00000000, 0x00080002); + &data_word(0x00080000, 0x00000002, 0x02000800, 0x00080800); + &data_word(0x02080802, 0x02080000, 0x00000802, 0x02000800); + &data_word(0x00000002, 0x00000800, 0x00080800, 0x02080002); + &data_word(0x00000800, 0x02000802, 0x02080002, 0x00000000); + &data_word(0x00000000, 0x02080802, 0x02000800, 0x00080002); + &data_word(0x02080800, 0x00080000, 0x00000802, 0x02000800); + &data_word(0x02080002, 0x00000800, 0x00080800, 0x02000002); + &data_word(0x00080802, 0x00000002, 0x02000002, 0x02080000); + &data_word(0x02080802, 0x00080800, 0x02080000, 0x02000802); + &data_word(0x02000000, 0x00000802, 0x00080002, 0x00000000); + &data_word(0x00080000, 0x02000000, 0x02000802, 0x02080800); + &data_word(0x00000002, 0x02080002, 0x00000800, 0x00080802); + # nibble 1 + &data_word(0x40108010, 0x00000000, 0x00108000, 0x40100000); + &data_word(0x40000010, 0x00008010, 0x40008000, 0x00108000); + &data_word(0x00008000, 0x40100010, 0x00000010, 0x40008000); + &data_word(0x00100010, 0x40108000, 0x40100000, 0x00000010); + &data_word(0x00100000, 0x40008010, 0x40100010, 0x00008000); + &data_word(0x00108010, 0x40000000, 0x00000000, 0x00100010); + &data_word(0x40008010, 0x00108010, 0x40108000, 0x40000010); + &data_word(0x40000000, 0x00100000, 0x00008010, 0x40108010); + &data_word(0x00100010, 0x40108000, 0x40008000, 0x00108010); + &data_word(0x40108010, 0x00100010, 0x40000010, 0x00000000); + &data_word(0x40000000, 0x00008010, 0x00100000, 0x40100010); + &data_word(0x00008000, 0x40000000, 0x00108010, 0x40008010); + &data_word(0x40108000, 0x00008000, 0x00000000, 0x40000010); + &data_word(0x00000010, 0x40108010, 0x00108000, 0x40100000); + &data_word(0x40100010, 0x00100000, 0x00008010, 0x40008000); + &data_word(0x40008010, 0x00000010, 0x40100000, 0x00108000); + # nibble 2 + &data_word(0x04000001, 0x04040100, 0x00000100, 0x04000101); + &data_word(0x00040001, 0x04000000, 0x04000101, 0x00040100); + &data_word(0x04000100, 0x00040000, 0x04040000, 0x00000001); + &data_word(0x04040101, 0x00000101, 0x00000001, 0x04040001); + &data_word(0x00000000, 0x00040001, 0x04040100, 0x00000100); + &data_word(0x00000101, 0x04040101, 0x00040000, 0x04000001); + &data_word(0x04040001, 0x04000100, 0x00040101, 0x04040000); + &data_word(0x00040100, 0x00000000, 0x04000000, 0x00040101); + &data_word(0x04040100, 0x00000100, 0x00000001, 0x00040000); + &data_word(0x00000101, 0x00040001, 0x04040000, 0x04000101); + &data_word(0x00000000, 0x04040100, 0x00040100, 0x04040001); + &data_word(0x00040001, 0x04000000, 0x04040101, 0x00000001); + &data_word(0x00040101, 0x04000001, 0x04000000, 0x04040101); + &data_word(0x00040000, 0x04000100, 0x04000101, 0x00040100); + &data_word(0x04000100, 0x00000000, 0x04040001, 0x00000101); + &data_word(0x04000001, 0x00040101, 0x00000100, 0x04040000); + # nibble 3 + &data_word(0x00401008, 0x10001000, 0x00000008, 0x10401008); + &data_word(0x00000000, 0x10400000, 0x10001008, 0x00400008); + &data_word(0x10401000, 0x10000008, 0x10000000, 0x00001008); + &data_word(0x10000008, 0x00401008, 0x00400000, 0x10000000); + &data_word(0x10400008, 0x00401000, 0x00001000, 0x00000008); + &data_word(0x00401000, 0x10001008, 0x10400000, 0x00001000); + &data_word(0x00001008, 0x00000000, 0x00400008, 0x10401000); + &data_word(0x10001000, 0x10400008, 0x10401008, 0x00400000); + &data_word(0x10400008, 0x00001008, 0x00400000, 0x10000008); + &data_word(0x00401000, 0x10001000, 0x00000008, 0x10400000); + &data_word(0x10001008, 0x00000000, 0x00001000, 0x00400008); + &data_word(0x00000000, 0x10400008, 0x10401000, 0x00001000); + &data_word(0x10000000, 0x10401008, 0x00401008, 0x00400000); + &data_word(0x10401008, 0x00000008, 0x10001000, 0x00401008); + &data_word(0x00400008, 0x00401000, 0x10400000, 0x10001008); + &data_word(0x00001008, 0x10000000, 0x10000008, 0x10401000); + # nibble 4 + &data_word(0x08000000, 0x00010000, 0x00000400, 0x08010420); + &data_word(0x08010020, 0x08000400, 0x00010420, 0x08010000); + &data_word(0x00010000, 0x00000020, 0x08000020, 0x00010400); + &data_word(0x08000420, 0x08010020, 0x08010400, 0x00000000); + &data_word(0x00010400, 0x08000000, 0x00010020, 0x00000420); + &data_word(0x08000400, 0x00010420, 0x00000000, 0x08000020); + &data_word(0x00000020, 0x08000420, 0x08010420, 0x00010020); + &data_word(0x08010000, 0x00000400, 0x00000420, 0x08010400); + &data_word(0x08010400, 0x08000420, 0x00010020, 0x08010000); + &data_word(0x00010000, 0x00000020, 0x08000020, 0x08000400); + &data_word(0x08000000, 0x00010400, 0x08010420, 0x00000000); + &data_word(0x00010420, 0x08000000, 0x00000400, 0x00010020); + &data_word(0x08000420, 0x00000400, 0x00000000, 0x08010420); + &data_word(0x08010020, 0x08010400, 0x00000420, 0x00010000); + &data_word(0x00010400, 0x08010020, 0x08000400, 0x00000420); + &data_word(0x00000020, 0x00010420, 0x08010000, 0x08000020); + # nibble 5 + &data_word(0x80000040, 0x00200040, 0x00000000, 0x80202000); + &data_word(0x00200040, 0x00002000, 0x80002040, 0x00200000); + &data_word(0x00002040, 0x80202040, 0x00202000, 0x80000000); + &data_word(0x80002000, 0x80000040, 0x80200000, 0x00202040); + &data_word(0x00200000, 0x80002040, 0x80200040, 0x00000000); + &data_word(0x00002000, 0x00000040, 0x80202000, 0x80200040); + &data_word(0x80202040, 0x80200000, 0x80000000, 0x00002040); + &data_word(0x00000040, 0x00202000, 0x00202040, 0x80002000); + &data_word(0x00002040, 0x80000000, 0x80002000, 0x00202040); + &data_word(0x80202000, 0x00200040, 0x00000000, 0x80002000); + &data_word(0x80000000, 0x00002000, 0x80200040, 0x00200000); + &data_word(0x00200040, 0x80202040, 0x00202000, 0x00000040); + &data_word(0x80202040, 0x00202000, 0x00200000, 0x80002040); + &data_word(0x80000040, 0x80200000, 0x00202040, 0x00000000); + &data_word(0x00002000, 0x80000040, 0x80002040, 0x80202000); + &data_word(0x80200000, 0x00002040, 0x00000040, 0x80200040); + # nibble 6 + &data_word(0x00004000, 0x00000200, 0x01000200, 0x01000004); + &data_word(0x01004204, 0x00004004, 0x00004200, 0x00000000); + &data_word(0x01000000, 0x01000204, 0x00000204, 0x01004000); + &data_word(0x00000004, 0x01004200, 0x01004000, 0x00000204); + &data_word(0x01000204, 0x00004000, 0x00004004, 0x01004204); + &data_word(0x00000000, 0x01000200, 0x01000004, 0x00004200); + &data_word(0x01004004, 0x00004204, 0x01004200, 0x00000004); + &data_word(0x00004204, 0x01004004, 0x00000200, 0x01000000); + &data_word(0x00004204, 0x01004000, 0x01004004, 0x00000204); + &data_word(0x00004000, 0x00000200, 0x01000000, 0x01004004); + &data_word(0x01000204, 0x00004204, 0x00004200, 0x00000000); + &data_word(0x00000200, 0x01000004, 0x00000004, 0x01000200); + &data_word(0x00000000, 0x01000204, 0x01000200, 0x00004200); + &data_word(0x00000204, 0x00004000, 0x01004204, 0x01000000); + &data_word(0x01004200, 0x00000004, 0x00004004, 0x01004204); + &data_word(0x01000004, 0x01004200, 0x01004000, 0x00004004); + # nibble 7 + &data_word(0x20800080, 0x20820000, 0x00020080, 0x00000000); + &data_word(0x20020000, 0x00800080, 0x20800000, 0x20820080); + &data_word(0x00000080, 0x20000000, 0x00820000, 0x00020080); + &data_word(0x00820080, 0x20020080, 0x20000080, 0x20800000); + &data_word(0x00020000, 0x00820080, 0x00800080, 0x20020000); + &data_word(0x20820080, 0x20000080, 0x00000000, 0x00820000); + &data_word(0x20000000, 0x00800000, 0x20020080, 0x20800080); + &data_word(0x00800000, 0x00020000, 0x20820000, 0x00000080); + &data_word(0x00800000, 0x00020000, 0x20000080, 0x20820080); + &data_word(0x00020080, 0x20000000, 0x00000000, 0x00820000); + &data_word(0x20800080, 0x20020080, 0x20020000, 0x00800080); + &data_word(0x20820000, 0x00000080, 0x00800080, 0x20020000); + &data_word(0x20820080, 0x00800000, 0x20800000, 0x20000080); + &data_word(0x00820000, 0x00020080, 0x20020080, 0x20800000); + &data_word(0x00000080, 0x20820000, 0x00820080, 0x00000000); + &data_word(0x20000000, 0x20800080, 0x00020000, 0x00820080); + } diff --git a/openssl/crypto/des/asm/des_enc.m4 b/openssl/crypto/des/asm/des_enc.m4 new file mode 100644 index 00000000..32805954 --- /dev/null +++ b/openssl/crypto/des/asm/des_enc.m4 @@ -0,0 +1,2099 @@ +! des_enc.m4 +! des_enc.S (generated from des_enc.m4) +! +! UltraSPARC assembler version of the LibDES/SSLeay/OpenSSL des_enc.c file. +! +! Version 1.0. 32-bit version. +! +! June 8, 2000. +! +! Version 2.0. 32/64-bit, PIC-ification, blended CPU adaptation +! by Andy Polyakov. +! +! January 1, 2003. +! +! Assembler version: Copyright Svend Olaf Mikkelsen. +! +! Original C code: Copyright Eric A. Young. +! +! This code can be freely used by LibDES/SSLeay/OpenSSL users. +! +! The LibDES/SSLeay/OpenSSL copyright notices must be respected. +! +! This version can be redistributed. +! +! To expand the m4 macros: m4 -B 8192 des_enc.m4 > des_enc.S +! +! Global registers 1 to 5 are used. This is the same as done by the +! cc compiler. The UltraSPARC load/store little endian feature is used. +! +! Instruction grouping often refers to one CPU cycle. +! +! Assemble through gcc: gcc -c -mcpu=ultrasparc -o des_enc.o des_enc.S +! +! Assemble through cc: cc -c -xarch=v8plusa -o des_enc.o des_enc.S +! +! Performance improvement according to './apps/openssl speed des' +! +! 32-bit build: +! 23% faster than cc-5.2 -xarch=v8plus -xO5 +! 115% faster than gcc-3.2.1 -m32 -mcpu=ultrasparc -O5 +! 64-bit build: +! 50% faster than cc-5.2 -xarch=v9 -xO5 +! 100% faster than gcc-3.2.1 -m64 -mcpu=ultrasparc -O5 +! + +.ident "des_enc.m4 2.1" +.file "des_enc-sparc.S" + +#if defined(__SUNPRO_C) && defined(__sparcv9) +# define ABI64 /* They've said -xarch=v9 at command line */ +#elif defined(__GNUC__) && defined(__arch64__) +# define ABI64 /* They've said -m64 at command line */ +#endif + +#ifdef ABI64 + .register %g2,#scratch + .register %g3,#scratch +# define FRAME -192 +# define BIAS 2047 +# define LDPTR ldx +# define STPTR stx +# define ARG0 128 +# define ARGSZ 8 +# ifndef OPENSSL_SYSNAME_ULTRASPARC +# define OPENSSL_SYSNAME_ULTRASPARC +# endif +#else +# define FRAME -96 +# define BIAS 0 +# define LDPTR ld +# define STPTR st +# define ARG0 68 +# define ARGSZ 4 +#endif + +#define LOOPS 7 + +#define global0 %g0 +#define global1 %g1 +#define global2 %g2 +#define global3 %g3 +#define global4 %g4 +#define global5 %g5 + +#define local0 %l0 +#define local1 %l1 +#define local2 %l2 +#define local3 %l3 +#define local4 %l4 +#define local5 %l5 +#define local7 %l6 +#define local6 %l7 + +#define in0 %i0 +#define in1 %i1 +#define in2 %i2 +#define in3 %i3 +#define in4 %i4 +#define in5 %i5 +#define in6 %i6 +#define in7 %i7 + +#define out0 %o0 +#define out1 %o1 +#define out2 %o2 +#define out3 %o3 +#define out4 %o4 +#define out5 %o5 +#define out6 %o6 +#define out7 %o7 + +#define stub stb + +changequote({,}) + + +! Macro definitions: + + +! {ip_macro} +! +! The logic used in initial and final permutations is the same as in +! the C code. The permutations are done with a clever shift, xor, and +! technique. +! +! The macro also loads address sbox 1 to 5 to global 1 to 5, address +! sbox 6 to local6, and addres sbox 8 to out3. +! +! Rotates the halfs 3 left to bring the sbox bits in convenient positions. +! +! Loads key first round from address in parameter 5 to out0, out1. +! +! After the the original LibDES initial permutation, the resulting left +! is in the variable initially used for right and vice versa. The macro +! implements the possibility to keep the halfs in the original registers. +! +! parameter 1 left +! parameter 2 right +! parameter 3 result left (modify in first round) +! parameter 4 result right (use in first round) +! parameter 5 key address +! parameter 6 1/2 for include encryption/decryption +! parameter 7 1 for move in1 to in3 +! parameter 8 1 for move in3 to in4, 2 for move in4 to in3 +! parameter 9 1 for load ks3 and ks2 to in4 and in3 + +define(ip_macro, { + +! {ip_macro} +! $1 $2 $4 $3 $5 $6 $7 $8 $9 + + ld [out2+256], local1 + srl $2, 4, local4 + + xor local4, $1, local4 + ifelse($7,1,{mov in1, in3},{nop}) + + ld [out2+260], local2 + and local4, local1, local4 + ifelse($8,1,{mov in3, in4},{}) + ifelse($8,2,{mov in4, in3},{}) + + ld [out2+280], out4 ! loop counter + sll local4, 4, local1 + xor $1, local4, $1 + + ld [out2+264], local3 + srl $1, 16, local4 + xor $2, local1, $2 + + ifelse($9,1,{LDPTR KS3, in4},{}) + xor local4, $2, local4 + nop !sethi %hi(DES_SPtrans), global1 ! sbox addr + + ifelse($9,1,{LDPTR KS2, in3},{}) + and local4, local2, local4 + nop !or global1, %lo(DES_SPtrans), global1 ! sbox addr + + sll local4, 16, local1 + xor $2, local4, $2 + + srl $2, 2, local4 + xor $1, local1, $1 + + sethi %hi(16711680), local5 + xor local4, $1, local4 + + and local4, local3, local4 + or local5, 255, local5 + + sll local4, 2, local2 + xor $1, local4, $1 + + srl $1, 8, local4 + xor $2, local2, $2 + + xor local4, $2, local4 + add global1, 768, global4 + + and local4, local5, local4 + add global1, 1024, global5 + + ld [out2+272], local7 + sll local4, 8, local1 + xor $2, local4, $2 + + srl $2, 1, local4 + xor $1, local1, $1 + + ld [$5], out0 ! key 7531 + xor local4, $1, local4 + add global1, 256, global2 + + ld [$5+4], out1 ! key 8642 + and local4, local7, local4 + add global1, 512, global3 + + sll local4, 1, local1 + xor $1, local4, $1 + + sll $1, 3, local3 + xor $2, local1, $2 + + sll $2, 3, local2 + add global1, 1280, local6 ! address sbox 8 + + srl $1, 29, local4 + add global1, 1792, out3 ! address sbox 8 + + srl $2, 29, local1 + or local4, local3, $4 + + or local2, local1, $3 + + ifelse($6, 1, { + + ld [out2+284], local5 ! 0x0000FC00 used in the rounds + or local2, local1, $3 + xor $4, out0, local1 + + call .des_enc.1 + and local1, 252, local1 + + },{}) + + ifelse($6, 2, { + + ld [out2+284], local5 ! 0x0000FC00 used in the rounds + or local2, local1, $3 + xor $4, out0, local1 + + call .des_dec.1 + and local1, 252, local1 + + },{}) +}) + + +! {rounds_macro} +! +! The logic used in the DES rounds is the same as in the C code, +! except that calculations for sbox 1 and sbox 5 begin before +! the previous round is finished. +! +! In each round one half (work) is modified based on key and the +! other half (use). +! +! In this version we do two rounds in a loop repeated 7 times +! and two rounds seperately. +! +! One half has the bits for the sboxes in the following positions: +! +! 777777xx555555xx333333xx111111xx +! +! 88xx666666xx444444xx222222xx8888 +! +! The bits for each sbox are xor-ed with the key bits for that box. +! The above xx bits are cleared, and the result used for lookup in +! the sbox table. Each sbox entry contains the 4 output bits permuted +! into 32 bits according to the P permutation. +! +! In the description of DES, left and right are switched after +! each round, except after last round. In this code the original +! left and right are kept in the same register in all rounds, meaning +! that after the 16 rounds the result for right is in the register +! originally used for left. +! +! parameter 1 first work (left in first round) +! parameter 2 first use (right in first round) +! parameter 3 enc/dec 1/-1 +! parameter 4 loop label +! parameter 5 key address register +! parameter 6 optional address for key next encryption/decryption +! parameter 7 not empty for include retl +! +! also compares in2 to 8 + +define(rounds_macro, { + +! {rounds_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + xor $2, out0, local1 + + ld [out2+284], local5 ! 0x0000FC00 + ba $4 + and local1, 252, local1 + + .align 32 + +$4: + ! local6 is address sbox 6 + ! out3 is address sbox 8 + ! out4 is loop counter + + ld [global1+local1], local1 + xor $2, out1, out1 ! 8642 + xor $2, out0, out0 ! 7531 + ! fmovs %f0, %f0 ! fxor used for alignment + + srl out1, 4, local0 ! rotate 4 right + and out0, local5, local3 ! 3 + ! fmovs %f0, %f0 + + ld [$5+$3*8], local7 ! key 7531 next round + srl local3, 8, local3 ! 3 + and local0, 252, local2 ! 2 + ! fmovs %f0, %f0 + + ld [global3+local3],local3 ! 3 + sll out1, 28, out1 ! rotate + xor $1, local1, $1 ! 1 finished, local1 now sbox 7 + + ld [global2+local2], local2 ! 2 + srl out0, 24, local1 ! 7 + or out1, local0, out1 ! rotate + + ldub [out2+local1], local1 ! 7 (and 0xFC) + srl out1, 24, local0 ! 8 + and out1, local5, local4 ! 4 + + ldub [out2+local0], local0 ! 8 (and 0xFC) + srl local4, 8, local4 ! 4 + xor $1, local2, $1 ! 2 finished local2 now sbox 6 + + ld [global4+local4],local4 ! 4 + srl out1, 16, local2 ! 6 + xor $1, local3, $1 ! 3 finished local3 now sbox 5 + + ld [out3+local0],local0 ! 8 + and local2, 252, local2 ! 6 + add global1, 1536, local5 ! address sbox 7 + + ld [local6+local2], local2 ! 6 + srl out0, 16, local3 ! 5 + xor $1, local4, $1 ! 4 finished + + ld [local5+local1],local1 ! 7 + and local3, 252, local3 ! 5 + xor $1, local0, $1 ! 8 finished + + ld [global5+local3],local3 ! 5 + xor $1, local2, $1 ! 6 finished + subcc out4, 1, out4 + + ld [$5+$3*8+4], out0 ! key 8642 next round + xor $1, local7, local2 ! sbox 5 next round + xor $1, local1, $1 ! 7 finished + + srl local2, 16, local2 ! sbox 5 next round + xor $1, local3, $1 ! 5 finished + + ld [$5+$3*16+4], out1 ! key 8642 next round again + and local2, 252, local2 ! sbox5 next round +! next round + xor $1, local7, local7 ! 7531 + + ld [global5+local2], local2 ! 5 + srl local7, 24, local3 ! 7 + xor $1, out0, out0 ! 8642 + + ldub [out2+local3], local3 ! 7 (and 0xFC) + srl out0, 4, local0 ! rotate 4 right + and local7, 252, local1 ! 1 + + sll out0, 28, out0 ! rotate + xor $2, local2, $2 ! 5 finished local2 used + + srl local0, 8, local4 ! 4 + and local0, 252, local2 ! 2 + ld [local5+local3], local3 ! 7 + + srl local0, 16, local5 ! 6 + or out0, local0, out0 ! rotate + ld [global2+local2], local2 ! 2 + + srl out0, 24, local0 + ld [$5+$3*16], out0 ! key 7531 next round + and local4, 252, local4 ! 4 + + and local5, 252, local5 ! 6 + ld [global4+local4], local4 ! 4 + xor $2, local3, $2 ! 7 finished local3 used + + and local0, 252, local0 ! 8 + ld [local6+local5], local5 ! 6 + xor $2, local2, $2 ! 2 finished local2 now sbox 3 + + srl local7, 8, local2 ! 3 start + ld [out3+local0], local0 ! 8 + xor $2, local4, $2 ! 4 finished + + and local2, 252, local2 ! 3 + ld [global1+local1], local1 ! 1 + xor $2, local5, $2 ! 6 finished local5 used + + ld [global3+local2], local2 ! 3 + xor $2, local0, $2 ! 8 finished + add $5, $3*16, $5 ! enc add 8, dec add -8 to key pointer + + ld [out2+284], local5 ! 0x0000FC00 + xor $2, out0, local4 ! sbox 1 next round + xor $2, local1, $2 ! 1 finished + + xor $2, local2, $2 ! 3 finished +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bne,pt %icc, $4 +#else + bne $4 +#endif + and local4, 252, local1 ! sbox 1 next round + +! two rounds more: + + ld [global1+local1], local1 + xor $2, out1, out1 + xor $2, out0, out0 + + srl out1, 4, local0 ! rotate + and out0, local5, local3 + + ld [$5+$3*8], local7 ! key 7531 + srl local3, 8, local3 + and local0, 252, local2 + + ld [global3+local3],local3 + sll out1, 28, out1 ! rotate + xor $1, local1, $1 ! 1 finished, local1 now sbox 7 + + ld [global2+local2], local2 + srl out0, 24, local1 + or out1, local0, out1 ! rotate + + ldub [out2+local1], local1 + srl out1, 24, local0 + and out1, local5, local4 + + ldub [out2+local0], local0 + srl local4, 8, local4 + xor $1, local2, $1 ! 2 finished local2 now sbox 6 + + ld [global4+local4],local4 + srl out1, 16, local2 + xor $1, local3, $1 ! 3 finished local3 now sbox 5 + + ld [out3+local0],local0 + and local2, 252, local2 + add global1, 1536, local5 ! address sbox 7 + + ld [local6+local2], local2 + srl out0, 16, local3 + xor $1, local4, $1 ! 4 finished + + ld [local5+local1],local1 + and local3, 252, local3 + xor $1, local0, $1 + + ld [global5+local3],local3 + xor $1, local2, $1 ! 6 finished + cmp in2, 8 + + ifelse($6,{}, {}, {ld [out2+280], out4}) ! loop counter + xor $1, local7, local2 ! sbox 5 next round + xor $1, local1, $1 ! 7 finished + + ld [$5+$3*8+4], out0 + srl local2, 16, local2 ! sbox 5 next round + xor $1, local3, $1 ! 5 finished + + and local2, 252, local2 +! next round (two rounds more) + xor $1, local7, local7 ! 7531 + + ld [global5+local2], local2 + srl local7, 24, local3 + xor $1, out0, out0 ! 8642 + + ldub [out2+local3], local3 + srl out0, 4, local0 ! rotate + and local7, 252, local1 + + sll out0, 28, out0 ! rotate + xor $2, local2, $2 ! 5 finished local2 used + + srl local0, 8, local4 + and local0, 252, local2 + ld [local5+local3], local3 + + srl local0, 16, local5 + or out0, local0, out0 ! rotate + ld [global2+local2], local2 + + srl out0, 24, local0 + ifelse($6,{}, {}, {ld [$6], out0}) ! key next encryption/decryption + and local4, 252, local4 + + and local5, 252, local5 + ld [global4+local4], local4 + xor $2, local3, $2 ! 7 finished local3 used + + and local0, 252, local0 + ld [local6+local5], local5 + xor $2, local2, $2 ! 2 finished local2 now sbox 3 + + srl local7, 8, local2 ! 3 start + ld [out3+local0], local0 + xor $2, local4, $2 + + and local2, 252, local2 + ld [global1+local1], local1 + xor $2, local5, $2 ! 6 finished local5 used + + ld [global3+local2], local2 + srl $1, 3, local3 + xor $2, local0, $2 + + ifelse($6,{}, {}, {ld [$6+4], out1}) ! key next encryption/decryption + sll $1, 29, local4 + xor $2, local1, $2 + + ifelse($7,{}, {}, {retl}) + xor $2, local2, $2 +}) + + +! {fp_macro} +! +! parameter 1 right (original left) +! parameter 2 left (original right) +! parameter 3 1 for optional store to [in0] +! parameter 4 1 for load input/output address to local5/7 +! +! The final permutation logic switches the halfes, meaning that +! left and right ends up the the registers originally used. + +define(fp_macro, { + +! {fp_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + ! initially undo the rotate 3 left done after initial permutation + ! original left is received shifted 3 right and 29 left in local3/4 + + sll $2, 29, local1 + or local3, local4, $1 + + srl $2, 3, $2 + sethi %hi(0x55555555), local2 + + or $2, local1, $2 + or local2, %lo(0x55555555), local2 + + srl $2, 1, local3 + sethi %hi(0x00ff00ff), local1 + xor local3, $1, local3 + or local1, %lo(0x00ff00ff), local1 + and local3, local2, local3 + sethi %hi(0x33333333), local4 + sll local3, 1, local2 + + xor $1, local3, $1 + + srl $1, 8, local3 + xor $2, local2, $2 + xor local3, $2, local3 + or local4, %lo(0x33333333), local4 + and local3, local1, local3 + sethi %hi(0x0000ffff), local1 + sll local3, 8, local2 + + xor $2, local3, $2 + + srl $2, 2, local3 + xor $1, local2, $1 + xor local3, $1, local3 + or local1, %lo(0x0000ffff), local1 + and local3, local4, local3 + sethi %hi(0x0f0f0f0f), local4 + sll local3, 2, local2 + + ifelse($4,1, {LDPTR INPUT, local5}) + xor $1, local3, $1 + + ifelse($4,1, {LDPTR OUTPUT, local7}) + srl $1, 16, local3 + xor $2, local2, $2 + xor local3, $2, local3 + or local4, %lo(0x0f0f0f0f), local4 + and local3, local1, local3 + sll local3, 16, local2 + + xor $2, local3, local1 + + srl local1, 4, local3 + xor $1, local2, $1 + xor local3, $1, local3 + and local3, local4, local3 + sll local3, 4, local2 + + xor $1, local3, $1 + + ! optional store: + + ifelse($3,1, {st $1, [in0]}) + + xor local1, local2, $2 + + ifelse($3,1, {st $2, [in0+4]}) + +}) + + +! {fp_ip_macro} +! +! Does initial permutation for next block mixed with +! final permutation for current block. +! +! parameter 1 original left +! parameter 2 original right +! parameter 3 left ip +! parameter 4 right ip +! parameter 5 1: load ks1/ks2 to in3/in4, add 120 to in4 +! 2: mov in4 to in3 +! +! also adds -8 to length in2 and loads loop counter to out4 + +define(fp_ip_macro, { + +! {fp_ip_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + define({temp1},{out4}) + define({temp2},{local3}) + + define({ip1},{local1}) + define({ip2},{local2}) + define({ip4},{local4}) + define({ip5},{local5}) + + ! $1 in local3, local4 + + ld [out2+256], ip1 + sll out5, 29, temp1 + or local3, local4, $1 + + srl out5, 3, $2 + ifelse($5,2,{mov in4, in3}) + + ld [out2+272], ip5 + srl $4, 4, local0 + or $2, temp1, $2 + + srl $2, 1, temp1 + xor temp1, $1, temp1 + + and temp1, ip5, temp1 + xor local0, $3, local0 + + sll temp1, 1, temp2 + xor $1, temp1, $1 + + and local0, ip1, local0 + add in2, -8, in2 + + sll local0, 4, local7 + xor $3, local0, $3 + + ld [out2+268], ip4 + srl $1, 8, temp1 + xor $2, temp2, $2 + ld [out2+260], ip2 + srl $3, 16, local0 + xor $4, local7, $4 + xor temp1, $2, temp1 + xor local0, $4, local0 + and temp1, ip4, temp1 + and local0, ip2, local0 + sll temp1, 8, temp2 + xor $2, temp1, $2 + sll local0, 16, local7 + xor $4, local0, $4 + + srl $2, 2, temp1 + xor $1, temp2, $1 + + ld [out2+264], temp2 ! ip3 + srl $4, 2, local0 + xor $3, local7, $3 + xor temp1, $1, temp1 + xor local0, $3, local0 + and temp1, temp2, temp1 + and local0, temp2, local0 + sll temp1, 2, temp2 + xor $1, temp1, $1 + sll local0, 2, local7 + xor $3, local0, $3 + + srl $1, 16, temp1 + xor $2, temp2, $2 + srl $3, 8, local0 + xor $4, local7, $4 + xor temp1, $2, temp1 + xor local0, $4, local0 + and temp1, ip2, temp1 + and local0, ip4, local0 + sll temp1, 16, temp2 + xor $2, temp1, local4 + sll local0, 8, local7 + xor $4, local0, $4 + + srl $4, 1, local0 + xor $3, local7, $3 + + srl local4, 4, temp1 + xor local0, $3, local0 + + xor $1, temp2, $1 + and local0, ip5, local0 + + sll local0, 1, local7 + xor temp1, $1, temp1 + + xor $3, local0, $3 + xor $4, local7, $4 + + sll $3, 3, local5 + and temp1, ip1, temp1 + + sll temp1, 4, temp2 + xor $1, temp1, $1 + + ifelse($5,1,{LDPTR KS2, in4}) + sll $4, 3, local2 + xor local4, temp2, $2 + + ! reload since used as temporar: + + ld [out2+280], out4 ! loop counter + + srl $3, 29, local0 + ifelse($5,1,{add in4, 120, in4}) + + ifelse($5,1,{LDPTR KS1, in3}) + srl $4, 29, local7 + + or local0, local5, $4 + or local2, local7, $3 + +}) + + + +! {load_little_endian} +! +! parameter 1 address +! parameter 2 destination left +! parameter 3 destination right +! parameter 4 temporar +! parameter 5 label + +define(load_little_endian, { + +! {load_little_endian} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + ! first in memory to rightmost in register + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + andcc $1, 3, global0 + bne,pn %icc, $5 + nop + + lda [$1] 0x88, $2 + add $1, 4, $4 + + ba,pt %icc, $5a + lda [$4] 0x88, $3 +#endif + +$5: + ldub [$1+3], $2 + + ldub [$1+2], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+1], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+0], $4 + sll $2, 8, $2 + or $2, $4, $2 + + + ldub [$1+3+4], $3 + + ldub [$1+2+4], $4 + sll $3, 8, $3 + or $3, $4, $3 + + ldub [$1+1+4], $4 + sll $3, 8, $3 + or $3, $4, $3 + + ldub [$1+0+4], $4 + sll $3, 8, $3 + or $3, $4, $3 +$5a: + +}) + + +! {load_little_endian_inc} +! +! parameter 1 address +! parameter 2 destination left +! parameter 3 destination right +! parameter 4 temporar +! parameter 4 label +! +! adds 8 to address + +define(load_little_endian_inc, { + +! {load_little_endian_inc} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + ! first in memory to rightmost in register + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + andcc $1, 3, global0 + bne,pn %icc, $5 + nop + + lda [$1] 0x88, $2 + add $1, 4, $1 + + lda [$1] 0x88, $3 + ba,pt %icc, $5a + add $1, 4, $1 +#endif + +$5: + ldub [$1+3], $2 + + ldub [$1+2], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+1], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+0], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+3+4], $3 + add $1, 8, $1 + + ldub [$1+2+4-8], $4 + sll $3, 8, $3 + or $3, $4, $3 + + ldub [$1+1+4-8], $4 + sll $3, 8, $3 + or $3, $4, $3 + + ldub [$1+0+4-8], $4 + sll $3, 8, $3 + or $3, $4, $3 +$5a: + +}) + + +! {load_n_bytes} +! +! Loads 1 to 7 bytes little endian +! Remaining bytes are zeroed. +! +! parameter 1 address +! parameter 2 length +! parameter 3 destination register left +! parameter 4 destination register right +! parameter 5 temp +! parameter 6 temp2 +! parameter 7 label +! parameter 8 return label + +define(load_n_bytes, { + +! {load_n_bytes} +! $1 $2 $5 $6 $7 $8 $7 $8 $9 + +$7.0: call .+8 + sll $2, 2, $6 + + add %o7,$7.jmp.table-$7.0,$5 + + add $5, $6, $5 + mov 0, $4 + + ld [$5], $5 + + jmp %o7+$5 + mov 0, $3 + +$7.7: + ldub [$1+6], $5 + sll $5, 16, $5 + or $3, $5, $3 +$7.6: + ldub [$1+5], $5 + sll $5, 8, $5 + or $3, $5, $3 +$7.5: + ldub [$1+4], $5 + or $3, $5, $3 +$7.4: + ldub [$1+3], $5 + sll $5, 24, $5 + or $4, $5, $4 +$7.3: + ldub [$1+2], $5 + sll $5, 16, $5 + or $4, $5, $4 +$7.2: + ldub [$1+1], $5 + sll $5, 8, $5 + or $4, $5, $4 +$7.1: + ldub [$1+0], $5 + ba $8 + or $4, $5, $4 + + .align 4 + +$7.jmp.table: + .word 0 + .word $7.1-$7.0 + .word $7.2-$7.0 + .word $7.3-$7.0 + .word $7.4-$7.0 + .word $7.5-$7.0 + .word $7.6-$7.0 + .word $7.7-$7.0 +}) + + +! {store_little_endian} +! +! parameter 1 address +! parameter 2 source left +! parameter 3 source right +! parameter 4 temporar + +define(store_little_endian, { + +! {store_little_endian} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + ! rightmost in register to first in memory + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + andcc $1, 3, global0 + bne,pn %icc, $5 + nop + + sta $2, [$1] 0x88 + add $1, 4, $4 + + ba,pt %icc, $5a + sta $3, [$4] 0x88 +#endif + +$5: + and $2, 255, $4 + stub $4, [$1+0] + + srl $2, 8, $4 + and $4, 255, $4 + stub $4, [$1+1] + + srl $2, 16, $4 + and $4, 255, $4 + stub $4, [$1+2] + + srl $2, 24, $4 + stub $4, [$1+3] + + + and $3, 255, $4 + stub $4, [$1+0+4] + + srl $3, 8, $4 + and $4, 255, $4 + stub $4, [$1+1+4] + + srl $3, 16, $4 + and $4, 255, $4 + stub $4, [$1+2+4] + + srl $3, 24, $4 + stub $4, [$1+3+4] + +$5a: + +}) + + +! {store_n_bytes} +! +! Stores 1 to 7 bytes little endian +! +! parameter 1 address +! parameter 2 length +! parameter 3 source register left +! parameter 4 source register right +! parameter 5 temp +! parameter 6 temp2 +! parameter 7 label +! parameter 8 return label + +define(store_n_bytes, { + +! {store_n_bytes} +! $1 $2 $5 $6 $7 $8 $7 $8 $9 + +$7.0: call .+8 + sll $2, 2, $6 + + add %o7,$7.jmp.table-$7.0,$5 + + add $5, $6, $5 + + ld [$5], $5 + + jmp %o7+$5 + nop + +$7.7: + srl $3, 16, $5 + and $5, 0xff, $5 + stub $5, [$1+6] +$7.6: + srl $3, 8, $5 + and $5, 0xff, $5 + stub $5, [$1+5] +$7.5: + and $3, 0xff, $5 + stub $5, [$1+4] +$7.4: + srl $4, 24, $5 + stub $5, [$1+3] +$7.3: + srl $4, 16, $5 + and $5, 0xff, $5 + stub $5, [$1+2] +$7.2: + srl $4, 8, $5 + and $5, 0xff, $5 + stub $5, [$1+1] +$7.1: + and $4, 0xff, $5 + + + ba $8 + stub $5, [$1] + + .align 4 + +$7.jmp.table: + + .word 0 + .word $7.1-$7.0 + .word $7.2-$7.0 + .word $7.3-$7.0 + .word $7.4-$7.0 + .word $7.5-$7.0 + .word $7.6-$7.0 + .word $7.7-$7.0 +}) + + +define(testvalue,{1}) + +define(register_init, { + +! For test purposes: + + sethi %hi(testvalue), local0 + or local0, %lo(testvalue), local0 + + ifelse($1,{},{}, {mov local0, $1}) + ifelse($2,{},{}, {mov local0, $2}) + ifelse($3,{},{}, {mov local0, $3}) + ifelse($4,{},{}, {mov local0, $4}) + ifelse($5,{},{}, {mov local0, $5}) + ifelse($6,{},{}, {mov local0, $6}) + ifelse($7,{},{}, {mov local0, $7}) + ifelse($8,{},{}, {mov local0, $8}) + + mov local0, local1 + mov local0, local2 + mov local0, local3 + mov local0, local4 + mov local0, local5 + mov local0, local7 + mov local0, local6 + mov local0, out0 + mov local0, out1 + mov local0, out2 + mov local0, out3 + mov local0, out4 + mov local0, out5 + mov local0, global1 + mov local0, global2 + mov local0, global3 + mov local0, global4 + mov local0, global5 + +}) + +.section ".text" + + .align 32 + +.des_enc: + + ! key address in3 + ! loads key next encryption/decryption first round from [in4] + + rounds_macro(in5, out5, 1, .des_enc.1, in3, in4, retl) + + + .align 32 + +.des_dec: + + ! implemented with out5 as first parameter to avoid + ! register exchange in ede modes + + ! key address in4 + ! loads key next encryption/decryption first round from [in3] + + rounds_macro(out5, in5, -1, .des_dec.1, in4, in3, retl) + + + +! void DES_encrypt1(data, ks, enc) +! ******************************* + + .align 32 + .global DES_encrypt1 + .type DES_encrypt1,#function + +DES_encrypt1: + + save %sp, FRAME, %sp + + sethi %hi(.PIC.DES_SPtrans-1f),global1 + or global1,%lo(.PIC.DES_SPtrans-1f),global1 +1: call .+8 + add %o7,global1,global1 + sub global1,.PIC.DES_SPtrans-.des_and,out2 + + ld [in0], in5 ! left + cmp in2, 0 ! enc + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + be,pn %icc, .encrypt.dec ! enc/dec +#else + be .encrypt.dec +#endif + ld [in0+4], out5 ! right + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for move in1 to in3 + ! parameter 8 1 for move in3 to in4, 2 for move in4 to in3 + + ip_macro(in5, out5, in5, out5, in3, 0, 1, 1) + + rounds_macro(in5, out5, 1, .des_encrypt1.1, in3, in4) ! in4 not used + + fp_macro(in5, out5, 1) ! 1 for store to [in0] + + ret + restore + +.encrypt.dec: + + add in1, 120, in3 ! use last subkey for first round + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for move in1 to in3 + ! parameter 8 1 for move in3 to in4, 2 for move in4 to in3 + + ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include dec, ks in4 + + fp_macro(out5, in5, 1) ! 1 for store to [in0] + + ret + restore + +.DES_encrypt1.end: + .size DES_encrypt1,.DES_encrypt1.end-DES_encrypt1 + + +! void DES_encrypt2(data, ks, enc) +!********************************* + + ! encrypts/decrypts without initial/final permutation + + .align 32 + .global DES_encrypt2 + .type DES_encrypt2,#function + +DES_encrypt2: + + save %sp, FRAME, %sp + + sethi %hi(.PIC.DES_SPtrans-1f),global1 + or global1,%lo(.PIC.DES_SPtrans-1f),global1 +1: call .+8 + add %o7,global1,global1 + sub global1,.PIC.DES_SPtrans-.des_and,out2 + + ! Set sbox address 1 to 6 and rotate halfs 3 left + ! Errors caught by destest? Yes. Still? *NO* + + !sethi %hi(DES_SPtrans), global1 ! address sbox 1 + + !or global1, %lo(DES_SPtrans), global1 ! sbox 1 + + add global1, 256, global2 ! sbox 2 + add global1, 512, global3 ! sbox 3 + + ld [in0], out5 ! right + add global1, 768, global4 ! sbox 4 + add global1, 1024, global5 ! sbox 5 + + ld [in0+4], in5 ! left + add global1, 1280, local6 ! sbox 6 + add global1, 1792, out3 ! sbox 8 + + ! rotate + + sll in5, 3, local5 + mov in1, in3 ! key address to in3 + + sll out5, 3, local7 + srl in5, 29, in5 + + srl out5, 29, out5 + add in5, local5, in5 + + add out5, local7, out5 + cmp in2, 0 + + ! we use our own stackframe + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + be,pn %icc, .encrypt2.dec ! decryption +#else + be .encrypt2.dec +#endif + STPTR in0, [%sp+BIAS+ARG0+0*ARGSZ] + + ld [in3], out0 ! key 7531 first round + mov LOOPS, out4 ! loop counter + + ld [in3+4], out1 ! key 8642 first round + sethi %hi(0x0000FC00), local5 + + call .des_enc + mov in3, in4 + + ! rotate + sll in5, 29, in0 + srl in5, 3, in5 + sll out5, 29, in1 + add in5, in0, in5 + srl out5, 3, out5 + LDPTR [%sp+BIAS+ARG0+0*ARGSZ], in0 + add out5, in1, out5 + st in5, [in0] + st out5, [in0+4] + + ret + restore + + +.encrypt2.dec: + + add in3, 120, in4 + + ld [in4], out0 ! key 7531 first round + mov LOOPS, out4 ! loop counter + + ld [in4+4], out1 ! key 8642 first round + sethi %hi(0x0000FC00), local5 + + mov in5, local1 ! left expected in out5 + mov out5, in5 + + call .des_dec + mov local1, out5 + +.encrypt2.finish: + + ! rotate + sll in5, 29, in0 + srl in5, 3, in5 + sll out5, 29, in1 + add in5, in0, in5 + srl out5, 3, out5 + LDPTR [%sp+BIAS+ARG0+0*ARGSZ], in0 + add out5, in1, out5 + st out5, [in0] + st in5, [in0+4] + + ret + restore + +.DES_encrypt2.end: + .size DES_encrypt2, .DES_encrypt2.end-DES_encrypt2 + + +! void DES_encrypt3(data, ks1, ks2, ks3) +! ************************************** + + .align 32 + .global DES_encrypt3 + .type DES_encrypt3,#function + +DES_encrypt3: + + save %sp, FRAME, %sp + + sethi %hi(.PIC.DES_SPtrans-1f),global1 + or global1,%lo(.PIC.DES_SPtrans-1f),global1 +1: call .+8 + add %o7,global1,global1 + sub global1,.PIC.DES_SPtrans-.des_and,out2 + + ld [in0], in5 ! left + add in2, 120, in4 ! ks2 + + ld [in0+4], out5 ! right + mov in3, in2 ! save ks3 + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for mov in1 to in3 + ! parameter 8 1 for mov in3 to in4 + ! parameter 9 1 for load ks3 and ks2 to in4 and in3 + + ip_macro(in5, out5, in5, out5, in3, 1, 1, 0, 0) + + call .des_dec + mov in2, in3 ! preload ks3 + + call .des_enc + nop + + fp_macro(in5, out5, 1) + + ret + restore + +.DES_encrypt3.end: + .size DES_encrypt3,.DES_encrypt3.end-DES_encrypt3 + + +! void DES_decrypt3(data, ks1, ks2, ks3) +! ************************************** + + .align 32 + .global DES_decrypt3 + .type DES_decrypt3,#function + +DES_decrypt3: + + save %sp, FRAME, %sp + + sethi %hi(.PIC.DES_SPtrans-1f),global1 + or global1,%lo(.PIC.DES_SPtrans-1f),global1 +1: call .+8 + add %o7,global1,global1 + sub global1,.PIC.DES_SPtrans-.des_and,out2 + + ld [in0], in5 ! left + add in3, 120, in4 ! ks3 + + ld [in0+4], out5 ! right + mov in2, in3 ! ks2 + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for mov in1 to in3 + ! parameter 8 1 for mov in3 to in4 + ! parameter 9 1 for load ks3 and ks2 to in4 and in3 + + ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 0) + + call .des_enc + add in1, 120, in4 ! preload ks1 + + call .des_dec + nop + + fp_macro(out5, in5, 1) + + ret + restore + +.DES_decrypt3.end: + .size DES_decrypt3,.DES_decrypt3.end-DES_decrypt3 + +! void DES_ncbc_encrypt(input, output, length, schedule, ivec, enc) +! ***************************************************************** + + + .align 32 + .global DES_ncbc_encrypt + .type DES_ncbc_encrypt,#function + +DES_ncbc_encrypt: + + save %sp, FRAME, %sp + + define({INPUT}, { [%sp+BIAS+ARG0+0*ARGSZ] }) + define({OUTPUT}, { [%sp+BIAS+ARG0+1*ARGSZ] }) + define({IVEC}, { [%sp+BIAS+ARG0+4*ARGSZ] }) + + sethi %hi(.PIC.DES_SPtrans-1f),global1 + or global1,%lo(.PIC.DES_SPtrans-1f),global1 +1: call .+8 + add %o7,global1,global1 + sub global1,.PIC.DES_SPtrans-.des_and,out2 + + cmp in5, 0 ! enc + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + be,pn %icc, .ncbc.dec +#else + be .ncbc.dec +#endif + STPTR in4, IVEC + + ! addr left right temp label + load_little_endian(in4, in5, out5, local3, .LLE1) ! iv + + addcc in2, -8, in2 ! bytes missing when first block done + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bl,pn %icc, .ncbc.enc.seven.or.less +#else + bl .ncbc.enc.seven.or.less +#endif + mov in3, in4 ! schedule + +.ncbc.enc.next.block: + + load_little_endian(in0, out4, global4, local3, .LLE2) ! block + +.ncbc.enc.next.block_1: + + xor in5, out4, in5 ! iv xor + xor out5, global4, out5 ! iv xor + + ! parameter 8 1 for move in3 to in4, 2 for move in4 to in3 + ip_macro(in5, out5, in5, out5, in3, 0, 0, 2) + +.ncbc.enc.next.block_2: + +!// call .des_enc ! compares in2 to 8 +! rounds inlined for alignment purposes + + add global1, 768, global4 ! address sbox 4 since register used below + + rounds_macro(in5, out5, 1, .ncbc.enc.1, in3, in4) ! include encryption ks in3 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bl,pn %icc, .ncbc.enc.next.block_fp +#else + bl .ncbc.enc.next.block_fp +#endif + add in0, 8, in0 ! input address + + ! If 8 or more bytes are to be encrypted after this block, + ! we combine final permutation for this block with initial + ! permutation for next block. Load next block: + + load_little_endian(in0, global3, global4, local5, .LLE12) + + ! parameter 1 original left + ! parameter 2 original right + ! parameter 3 left ip + ! parameter 4 right ip + ! parameter 5 1: load ks1/ks2 to in3/in4, add 120 to in4 + ! 2: mov in4 to in3 + ! + ! also adds -8 to length in2 and loads loop counter to out4 + + fp_ip_macro(out0, out1, global3, global4, 2) + + store_little_endian(in1, out0, out1, local3, .SLE10) ! block + + ld [in3], out0 ! key 7531 first round next block + mov in5, local1 + xor global3, out5, in5 ! iv xor next block + + ld [in3+4], out1 ! key 8642 + add global1, 512, global3 ! address sbox 3 since register used + xor global4, local1, out5 ! iv xor next block + + ba .ncbc.enc.next.block_2 + add in1, 8, in1 ! output adress + +.ncbc.enc.next.block_fp: + + fp_macro(in5, out5) + + store_little_endian(in1, in5, out5, local3, .SLE1) ! block + + addcc in2, -8, in2 ! bytes missing when next block done + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bpos,pt %icc, .ncbc.enc.next.block ! also jumps if 0 +#else + bpos .ncbc.enc.next.block +#endif + add in1, 8, in1 + +.ncbc.enc.seven.or.less: + + cmp in2, -8 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + ble,pt %icc, .ncbc.enc.finish +#else + ble .ncbc.enc.finish +#endif + nop + + add in2, 8, local1 ! bytes to load + + ! addr, length, dest left, dest right, temp, temp2, label, ret label + load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB1, .ncbc.enc.next.block_1) + + ! Loads 1 to 7 bytes little endian to global4, out4 + + +.ncbc.enc.finish: + + LDPTR IVEC, local4 + store_little_endian(local4, in5, out5, local5, .SLE2) ! ivec + + ret + restore + + +.ncbc.dec: + + STPTR in0, INPUT + cmp in2, 0 ! length + add in3, 120, in3 + + LDPTR IVEC, local7 ! ivec +#ifdef OPENSSL_SYSNAME_ULTRASPARC + ble,pn %icc, .ncbc.dec.finish +#else + ble .ncbc.dec.finish +#endif + mov in3, in4 ! schedule + + STPTR in1, OUTPUT + mov in0, local5 ! input + + load_little_endian(local7, in0, in1, local3, .LLE3) ! ivec + +.ncbc.dec.next.block: + + load_little_endian(local5, in5, out5, local3, .LLE4) ! block + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for mov in1 to in3 + ! parameter 8 1 for mov in3 to in4 + + ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include decryprion ks in4 + + fp_macro(out5, in5, 0, 1) ! 1 for input and output address to local5/7 + + ! in2 is bytes left to be stored + ! in2 is compared to 8 in the rounds + + xor out5, in0, out4 ! iv xor +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bl,pn %icc, .ncbc.dec.seven.or.less +#else + bl .ncbc.dec.seven.or.less +#endif + xor in5, in1, global4 ! iv xor + + ! Load ivec next block now, since input and output address might be the same. + + load_little_endian_inc(local5, in0, in1, local3, .LLE5) ! iv + + store_little_endian(local7, out4, global4, local3, .SLE3) + + STPTR local5, INPUT + add local7, 8, local7 + addcc in2, -8, in2 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bg,pt %icc, .ncbc.dec.next.block +#else + bg .ncbc.dec.next.block +#endif + STPTR local7, OUTPUT + + +.ncbc.dec.store.iv: + + LDPTR IVEC, local4 ! ivec + store_little_endian(local4, in0, in1, local5, .SLE4) + +.ncbc.dec.finish: + + ret + restore + +.ncbc.dec.seven.or.less: + + load_little_endian_inc(local5, in0, in1, local3, .LLE13) ! ivec + + store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB1, .ncbc.dec.store.iv) + + +.DES_ncbc_encrypt.end: + .size DES_ncbc_encrypt, .DES_ncbc_encrypt.end-DES_ncbc_encrypt + + +! void DES_ede3_cbc_encrypt(input, output, lenght, ks1, ks2, ks3, ivec, enc) +! ************************************************************************** + + + .align 32 + .global DES_ede3_cbc_encrypt + .type DES_ede3_cbc_encrypt,#function + +DES_ede3_cbc_encrypt: + + save %sp, FRAME, %sp + + define({KS1}, { [%sp+BIAS+ARG0+3*ARGSZ] }) + define({KS2}, { [%sp+BIAS+ARG0+4*ARGSZ] }) + define({KS3}, { [%sp+BIAS+ARG0+5*ARGSZ] }) + + sethi %hi(.PIC.DES_SPtrans-1f),global1 + or global1,%lo(.PIC.DES_SPtrans-1f),global1 +1: call .+8 + add %o7,global1,global1 + sub global1,.PIC.DES_SPtrans-.des_and,out2 + + LDPTR [%fp+BIAS+ARG0+7*ARGSZ], local3 ! enc + LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local4 ! ivec + cmp local3, 0 ! enc + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + be,pn %icc, .ede3.dec +#else + be .ede3.dec +#endif + STPTR in4, KS2 + + STPTR in5, KS3 + + load_little_endian(local4, in5, out5, local3, .LLE6) ! ivec + + addcc in2, -8, in2 ! bytes missing after next block + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bl,pn %icc, .ede3.enc.seven.or.less +#else + bl .ede3.enc.seven.or.less +#endif + STPTR in3, KS1 + +.ede3.enc.next.block: + + load_little_endian(in0, out4, global4, local3, .LLE7) + +.ede3.enc.next.block_1: + + LDPTR KS2, in4 + xor in5, out4, in5 ! iv xor + xor out5, global4, out5 ! iv xor + + LDPTR KS1, in3 + add in4, 120, in4 ! for decryption we use last subkey first + nop + + ip_macro(in5, out5, in5, out5, in3) + +.ede3.enc.next.block_2: + + call .des_enc ! ks1 in3 + nop + + call .des_dec ! ks2 in4 + LDPTR KS3, in3 + + call .des_enc ! ks3 in3 compares in2 to 8 + nop + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bl,pn %icc, .ede3.enc.next.block_fp +#else + bl .ede3.enc.next.block_fp +#endif + add in0, 8, in0 + + ! If 8 or more bytes are to be encrypted after this block, + ! we combine final permutation for this block with initial + ! permutation for next block. Load next block: + + load_little_endian(in0, global3, global4, local5, .LLE11) + + ! parameter 1 original left + ! parameter 2 original right + ! parameter 3 left ip + ! parameter 4 right ip + ! parameter 5 1: load ks1/ks2 to in3/in4, add 120 to in4 + ! 2: mov in4 to in3 + ! + ! also adds -8 to length in2 and loads loop counter to out4 + + fp_ip_macro(out0, out1, global3, global4, 1) + + store_little_endian(in1, out0, out1, local3, .SLE9) ! block + + mov in5, local1 + xor global3, out5, in5 ! iv xor next block + + ld [in3], out0 ! key 7531 + add global1, 512, global3 ! address sbox 3 + xor global4, local1, out5 ! iv xor next block + + ld [in3+4], out1 ! key 8642 + add global1, 768, global4 ! address sbox 4 + ba .ede3.enc.next.block_2 + add in1, 8, in1 + +.ede3.enc.next.block_fp: + + fp_macro(in5, out5) + + store_little_endian(in1, in5, out5, local3, .SLE5) ! block + + addcc in2, -8, in2 ! bytes missing when next block done + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bpos,pt %icc, .ede3.enc.next.block +#else + bpos .ede3.enc.next.block +#endif + add in1, 8, in1 + +.ede3.enc.seven.or.less: + + cmp in2, -8 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + ble,pt %icc, .ede3.enc.finish +#else + ble .ede3.enc.finish +#endif + nop + + add in2, 8, local1 ! bytes to load + + ! addr, length, dest left, dest right, temp, temp2, label, ret label + load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB2, .ede3.enc.next.block_1) + +.ede3.enc.finish: + + LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local4 ! ivec + store_little_endian(local4, in5, out5, local5, .SLE6) ! ivec + + ret + restore + +.ede3.dec: + + STPTR in0, INPUT + add in5, 120, in5 + + STPTR in1, OUTPUT + mov in0, local5 + add in3, 120, in3 + + STPTR in3, KS1 + cmp in2, 0 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + ble %icc, .ede3.dec.finish +#else + ble .ede3.dec.finish +#endif + STPTR in5, KS3 + + LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local7 ! iv + load_little_endian(local7, in0, in1, local3, .LLE8) + +.ede3.dec.next.block: + + load_little_endian(local5, in5, out5, local3, .LLE9) + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for mov in1 to in3 + ! parameter 8 1 for mov in3 to in4 + ! parameter 9 1 for load ks3 and ks2 to in4 and in3 + + ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 1) ! inc .des_dec ks3 in4 + + call .des_enc ! ks2 in3 + LDPTR KS1, in4 + + call .des_dec ! ks1 in4 + nop + + fp_macro(out5, in5, 0, 1) ! 1 for input and output address local5/7 + + ! in2 is bytes left to be stored + ! in2 is compared to 8 in the rounds + + xor out5, in0, out4 +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bl,pn %icc, .ede3.dec.seven.or.less +#else + bl .ede3.dec.seven.or.less +#endif + xor in5, in1, global4 + + load_little_endian_inc(local5, in0, in1, local3, .LLE10) ! iv next block + + store_little_endian(local7, out4, global4, local3, .SLE7) ! block + + STPTR local5, INPUT + addcc in2, -8, in2 + add local7, 8, local7 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC + bg,pt %icc, .ede3.dec.next.block +#else + bg .ede3.dec.next.block +#endif + STPTR local7, OUTPUT + +.ede3.dec.store.iv: + + LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local4 ! ivec + store_little_endian(local4, in0, in1, local5, .SLE8) ! ivec + +.ede3.dec.finish: + + ret + restore + +.ede3.dec.seven.or.less: + + load_little_endian_inc(local5, in0, in1, local3, .LLE14) ! iv + + store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB2, .ede3.dec.store.iv) + + +.DES_ede3_cbc_encrypt.end: + .size DES_ede3_cbc_encrypt,.DES_ede3_cbc_encrypt.end-DES_ede3_cbc_encrypt + + .align 256 + .type .des_and,#object + .size .des_and,284 + +.des_and: + +! This table is used for AND 0xFC when it is known that register +! bits 8-31 are zero. Makes it possible to do three arithmetic +! operations in one cycle. + + .byte 0, 0, 0, 0, 4, 4, 4, 4 + .byte 8, 8, 8, 8, 12, 12, 12, 12 + .byte 16, 16, 16, 16, 20, 20, 20, 20 + .byte 24, 24, 24, 24, 28, 28, 28, 28 + .byte 32, 32, 32, 32, 36, 36, 36, 36 + .byte 40, 40, 40, 40, 44, 44, 44, 44 + .byte 48, 48, 48, 48, 52, 52, 52, 52 + .byte 56, 56, 56, 56, 60, 60, 60, 60 + .byte 64, 64, 64, 64, 68, 68, 68, 68 + .byte 72, 72, 72, 72, 76, 76, 76, 76 + .byte 80, 80, 80, 80, 84, 84, 84, 84 + .byte 88, 88, 88, 88, 92, 92, 92, 92 + .byte 96, 96, 96, 96, 100, 100, 100, 100 + .byte 104, 104, 104, 104, 108, 108, 108, 108 + .byte 112, 112, 112, 112, 116, 116, 116, 116 + .byte 120, 120, 120, 120, 124, 124, 124, 124 + .byte 128, 128, 128, 128, 132, 132, 132, 132 + .byte 136, 136, 136, 136, 140, 140, 140, 140 + .byte 144, 144, 144, 144, 148, 148, 148, 148 + .byte 152, 152, 152, 152, 156, 156, 156, 156 + .byte 160, 160, 160, 160, 164, 164, 164, 164 + .byte 168, 168, 168, 168, 172, 172, 172, 172 + .byte 176, 176, 176, 176, 180, 180, 180, 180 + .byte 184, 184, 184, 184, 188, 188, 188, 188 + .byte 192, 192, 192, 192, 196, 196, 196, 196 + .byte 200, 200, 200, 200, 204, 204, 204, 204 + .byte 208, 208, 208, 208, 212, 212, 212, 212 + .byte 216, 216, 216, 216, 220, 220, 220, 220 + .byte 224, 224, 224, 224, 228, 228, 228, 228 + .byte 232, 232, 232, 232, 236, 236, 236, 236 + .byte 240, 240, 240, 240, 244, 244, 244, 244 + .byte 248, 248, 248, 248, 252, 252, 252, 252 + + ! 5 numbers for initil/final permutation + + .word 0x0f0f0f0f ! offset 256 + .word 0x0000ffff ! 260 + .word 0x33333333 ! 264 + .word 0x00ff00ff ! 268 + .word 0x55555555 ! 272 + + .word 0 ! 276 + .word LOOPS ! 280 + .word 0x0000FC00 ! 284 + + .global DES_SPtrans + .type DES_SPtrans,#object + .size DES_SPtrans,2048 +.align 64 +DES_SPtrans: +.PIC.DES_SPtrans: + ! nibble 0 + .word 0x02080800, 0x00080000, 0x02000002, 0x02080802 + .word 0x02000000, 0x00080802, 0x00080002, 0x02000002 + .word 0x00080802, 0x02080800, 0x02080000, 0x00000802 + .word 0x02000802, 0x02000000, 0x00000000, 0x00080002 + .word 0x00080000, 0x00000002, 0x02000800, 0x00080800 + .word 0x02080802, 0x02080000, 0x00000802, 0x02000800 + .word 0x00000002, 0x00000800, 0x00080800, 0x02080002 + .word 0x00000800, 0x02000802, 0x02080002, 0x00000000 + .word 0x00000000, 0x02080802, 0x02000800, 0x00080002 + .word 0x02080800, 0x00080000, 0x00000802, 0x02000800 + .word 0x02080002, 0x00000800, 0x00080800, 0x02000002 + .word 0x00080802, 0x00000002, 0x02000002, 0x02080000 + .word 0x02080802, 0x00080800, 0x02080000, 0x02000802 + .word 0x02000000, 0x00000802, 0x00080002, 0x00000000 + .word 0x00080000, 0x02000000, 0x02000802, 0x02080800 + .word 0x00000002, 0x02080002, 0x00000800, 0x00080802 + ! nibble 1 + .word 0x40108010, 0x00000000, 0x00108000, 0x40100000 + .word 0x40000010, 0x00008010, 0x40008000, 0x00108000 + .word 0x00008000, 0x40100010, 0x00000010, 0x40008000 + .word 0x00100010, 0x40108000, 0x40100000, 0x00000010 + .word 0x00100000, 0x40008010, 0x40100010, 0x00008000 + .word 0x00108010, 0x40000000, 0x00000000, 0x00100010 + .word 0x40008010, 0x00108010, 0x40108000, 0x40000010 + .word 0x40000000, 0x00100000, 0x00008010, 0x40108010 + .word 0x00100010, 0x40108000, 0x40008000, 0x00108010 + .word 0x40108010, 0x00100010, 0x40000010, 0x00000000 + .word 0x40000000, 0x00008010, 0x00100000, 0x40100010 + .word 0x00008000, 0x40000000, 0x00108010, 0x40008010 + .word 0x40108000, 0x00008000, 0x00000000, 0x40000010 + .word 0x00000010, 0x40108010, 0x00108000, 0x40100000 + .word 0x40100010, 0x00100000, 0x00008010, 0x40008000 + .word 0x40008010, 0x00000010, 0x40100000, 0x00108000 + ! nibble 2 + .word 0x04000001, 0x04040100, 0x00000100, 0x04000101 + .word 0x00040001, 0x04000000, 0x04000101, 0x00040100 + .word 0x04000100, 0x00040000, 0x04040000, 0x00000001 + .word 0x04040101, 0x00000101, 0x00000001, 0x04040001 + .word 0x00000000, 0x00040001, 0x04040100, 0x00000100 + .word 0x00000101, 0x04040101, 0x00040000, 0x04000001 + .word 0x04040001, 0x04000100, 0x00040101, 0x04040000 + .word 0x00040100, 0x00000000, 0x04000000, 0x00040101 + .word 0x04040100, 0x00000100, 0x00000001, 0x00040000 + .word 0x00000101, 0x00040001, 0x04040000, 0x04000101 + .word 0x00000000, 0x04040100, 0x00040100, 0x04040001 + .word 0x00040001, 0x04000000, 0x04040101, 0x00000001 + .word 0x00040101, 0x04000001, 0x04000000, 0x04040101 + .word 0x00040000, 0x04000100, 0x04000101, 0x00040100 + .word 0x04000100, 0x00000000, 0x04040001, 0x00000101 + .word 0x04000001, 0x00040101, 0x00000100, 0x04040000 + ! nibble 3 + .word 0x00401008, 0x10001000, 0x00000008, 0x10401008 + .word 0x00000000, 0x10400000, 0x10001008, 0x00400008 + .word 0x10401000, 0x10000008, 0x10000000, 0x00001008 + .word 0x10000008, 0x00401008, 0x00400000, 0x10000000 + .word 0x10400008, 0x00401000, 0x00001000, 0x00000008 + .word 0x00401000, 0x10001008, 0x10400000, 0x00001000 + .word 0x00001008, 0x00000000, 0x00400008, 0x10401000 + .word 0x10001000, 0x10400008, 0x10401008, 0x00400000 + .word 0x10400008, 0x00001008, 0x00400000, 0x10000008 + .word 0x00401000, 0x10001000, 0x00000008, 0x10400000 + .word 0x10001008, 0x00000000, 0x00001000, 0x00400008 + .word 0x00000000, 0x10400008, 0x10401000, 0x00001000 + .word 0x10000000, 0x10401008, 0x00401008, 0x00400000 + .word 0x10401008, 0x00000008, 0x10001000, 0x00401008 + .word 0x00400008, 0x00401000, 0x10400000, 0x10001008 + .word 0x00001008, 0x10000000, 0x10000008, 0x10401000 + ! nibble 4 + .word 0x08000000, 0x00010000, 0x00000400, 0x08010420 + .word 0x08010020, 0x08000400, 0x00010420, 0x08010000 + .word 0x00010000, 0x00000020, 0x08000020, 0x00010400 + .word 0x08000420, 0x08010020, 0x08010400, 0x00000000 + .word 0x00010400, 0x08000000, 0x00010020, 0x00000420 + .word 0x08000400, 0x00010420, 0x00000000, 0x08000020 + .word 0x00000020, 0x08000420, 0x08010420, 0x00010020 + .word 0x08010000, 0x00000400, 0x00000420, 0x08010400 + .word 0x08010400, 0x08000420, 0x00010020, 0x08010000 + .word 0x00010000, 0x00000020, 0x08000020, 0x08000400 + .word 0x08000000, 0x00010400, 0x08010420, 0x00000000 + .word 0x00010420, 0x08000000, 0x00000400, 0x00010020 + .word 0x08000420, 0x00000400, 0x00000000, 0x08010420 + .word 0x08010020, 0x08010400, 0x00000420, 0x00010000 + .word 0x00010400, 0x08010020, 0x08000400, 0x00000420 + .word 0x00000020, 0x00010420, 0x08010000, 0x08000020 + ! nibble 5 + .word 0x80000040, 0x00200040, 0x00000000, 0x80202000 + .word 0x00200040, 0x00002000, 0x80002040, 0x00200000 + .word 0x00002040, 0x80202040, 0x00202000, 0x80000000 + .word 0x80002000, 0x80000040, 0x80200000, 0x00202040 + .word 0x00200000, 0x80002040, 0x80200040, 0x00000000 + .word 0x00002000, 0x00000040, 0x80202000, 0x80200040 + .word 0x80202040, 0x80200000, 0x80000000, 0x00002040 + .word 0x00000040, 0x00202000, 0x00202040, 0x80002000 + .word 0x00002040, 0x80000000, 0x80002000, 0x00202040 + .word 0x80202000, 0x00200040, 0x00000000, 0x80002000 + .word 0x80000000, 0x00002000, 0x80200040, 0x00200000 + .word 0x00200040, 0x80202040, 0x00202000, 0x00000040 + .word 0x80202040, 0x00202000, 0x00200000, 0x80002040 + .word 0x80000040, 0x80200000, 0x00202040, 0x00000000 + .word 0x00002000, 0x80000040, 0x80002040, 0x80202000 + .word 0x80200000, 0x00002040, 0x00000040, 0x80200040 + ! nibble 6 + .word 0x00004000, 0x00000200, 0x01000200, 0x01000004 + .word 0x01004204, 0x00004004, 0x00004200, 0x00000000 + .word 0x01000000, 0x01000204, 0x00000204, 0x01004000 + .word 0x00000004, 0x01004200, 0x01004000, 0x00000204 + .word 0x01000204, 0x00004000, 0x00004004, 0x01004204 + .word 0x00000000, 0x01000200, 0x01000004, 0x00004200 + .word 0x01004004, 0x00004204, 0x01004200, 0x00000004 + .word 0x00004204, 0x01004004, 0x00000200, 0x01000000 + .word 0x00004204, 0x01004000, 0x01004004, 0x00000204 + .word 0x00004000, 0x00000200, 0x01000000, 0x01004004 + .word 0x01000204, 0x00004204, 0x00004200, 0x00000000 + .word 0x00000200, 0x01000004, 0x00000004, 0x01000200 + .word 0x00000000, 0x01000204, 0x01000200, 0x00004200 + .word 0x00000204, 0x00004000, 0x01004204, 0x01000000 + .word 0x01004200, 0x00000004, 0x00004004, 0x01004204 + .word 0x01000004, 0x01004200, 0x01004000, 0x00004004 + ! nibble 7 + .word 0x20800080, 0x20820000, 0x00020080, 0x00000000 + .word 0x20020000, 0x00800080, 0x20800000, 0x20820080 + .word 0x00000080, 0x20000000, 0x00820000, 0x00020080 + .word 0x00820080, 0x20020080, 0x20000080, 0x20800000 + .word 0x00020000, 0x00820080, 0x00800080, 0x20020000 + .word 0x20820080, 0x20000080, 0x00000000, 0x00820000 + .word 0x20000000, 0x00800000, 0x20020080, 0x20800080 + .word 0x00800000, 0x00020000, 0x20820000, 0x00000080 + .word 0x00800000, 0x00020000, 0x20000080, 0x20820080 + .word 0x00020080, 0x20000000, 0x00000000, 0x00820000 + .word 0x20800080, 0x20020080, 0x20020000, 0x00800080 + .word 0x20820000, 0x00000080, 0x00800080, 0x20020000 + .word 0x20820080, 0x00800000, 0x20800000, 0x20000080 + .word 0x00820000, 0x00020080, 0x20020080, 0x20800000 + .word 0x00000080, 0x20820000, 0x00820080, 0x00000000 + .word 0x20000000, 0x20800080, 0x00020000, 0x00820080 + diff --git a/openssl/crypto/des/asm/desboth.pl b/openssl/crypto/des/asm/desboth.pl new file mode 100644 index 00000000..eec00886 --- /dev/null +++ b/openssl/crypto/des/asm/desboth.pl @@ -0,0 +1,79 @@ +#!/usr/local/bin/perl + +$L="edi"; +$R="esi"; + +sub DES_encrypt3 + { + local($name,$enc)=@_; + + &function_begin_B($name,""); + &push("ebx"); + &mov("ebx",&wparam(0)); + + &push("ebp"); + &push("esi"); + + &push("edi"); + + &comment(""); + &comment("Load the data words"); + &mov($L,&DWP(0,"ebx","",0)); + &mov($R,&DWP(4,"ebx","",0)); + &stack_push(3); + + &comment(""); + &comment("IP"); + &IP_new($L,$R,"edx",0); + + # put them back + + if ($enc) + { + &mov(&DWP(4,"ebx","",0),$R); + &mov("eax",&wparam(1)); + &mov(&DWP(0,"ebx","",0),"edx"); + &mov("edi",&wparam(2)); + &mov("esi",&wparam(3)); + } + else + { + &mov(&DWP(4,"ebx","",0),$R); + &mov("esi",&wparam(1)); + &mov(&DWP(0,"ebx","",0),"edx"); + &mov("edi",&wparam(2)); + &mov("eax",&wparam(3)); + } + &mov(&swtmp(2), (DWC(($enc)?"1":"0"))); + &mov(&swtmp(1), "eax"); + &mov(&swtmp(0), "ebx"); + &call("DES_encrypt2"); + &mov(&swtmp(2), (DWC(($enc)?"0":"1"))); + &mov(&swtmp(1), "edi"); + &mov(&swtmp(0), "ebx"); + &call("DES_encrypt2"); + &mov(&swtmp(2), (DWC(($enc)?"1":"0"))); + &mov(&swtmp(1), "esi"); + &mov(&swtmp(0), "ebx"); + &call("DES_encrypt2"); + + &stack_pop(3); + &mov($L,&DWP(0,"ebx","",0)); + &mov($R,&DWP(4,"ebx","",0)); + + &comment(""); + &comment("FP"); + &FP_new($L,$R,"eax",0); + + &mov(&DWP(0,"ebx","",0),"eax"); + &mov(&DWP(4,"ebx","",0),$R); + + &pop("edi"); + &pop("esi"); + &pop("ebp"); + &pop("ebx"); + &ret(); + &function_end_B($name); + } + + diff --git a/openssl/crypto/des/asm/readme b/openssl/crypto/des/asm/readme new file mode 100644 index 00000000..1beafe25 --- /dev/null +++ b/openssl/crypto/des/asm/readme @@ -0,0 +1,131 @@ +First up, let me say I don't like writing in assembler. It is not portable, +dependant on the particular CPU architecture release and is generally a pig +to debug and get right. Having said that, the x86 architecture is probably +the most important for speed due to number of boxes and since +it appears to be the worst architecture to to get +good C compilers for. So due to this, I have lowered myself to do +assembler for the inner DES routines in libdes :-). + +The file to implement in assembler is des_enc.c. Replace the following +4 functions +des_encrypt1(DES_LONG data[2],des_key_schedule ks, int encrypt); +des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt); +des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); +des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); + +They encrypt/decrypt the 64 bits held in 'data' using +the 'ks' key schedules. The only difference between the 4 functions is that +des_encrypt2() does not perform IP() or FP() on the data (this is an +optimization for when doing triple DES and des_encrypt3() and des_decrypt3() +perform triple des. The triple DES routines are in here because it does +make a big difference to have them located near the des_encrypt2 function +at link time.. + +Now as we all know, there are lots of different operating systems running on +x86 boxes, and unfortunately they normally try to make sure their assembler +formating is not the same as the other peoples. +The 4 main formats I know of are +Microsoft Windows 95/Windows NT +Elf Includes Linux and FreeBSD(?). +a.out The older Linux. +Solaris Same as Elf but different comments :-(. + +Now I was not overly keen to write 4 different copies of the same code, +so I wrote a few perl routines to output the correct assembler, given +a target assembler type. This code is ugly and is just a hack. +The libraries are x86unix.pl and x86ms.pl. +des586.pl, des686.pl and des-som[23].pl are the programs to actually +generate the assembler. + +So to generate elf assembler +perl des-som3.pl elf >dx86-elf.s +For Windows 95/NT +perl des-som2.pl win32 >win32.asm + +[ update 4 Jan 1996 ] +I have added another way to do things. +perl des-som3.pl cpp >dx86-cpp.s +generates a file that will be included by dx86unix.cpp when it is compiled. +To build for elf, a.out, solaris, bsdi etc, +cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o +cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o +cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o +cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o +This was done to cut down the number of files in the distribution. + +Now the ugly part. I acquired my copy of Intels +"Optimization's For Intel's 32-Bit Processors" and found a few interesting +things. First, the aim of the exersize is to 'extract' one byte at a time +from a word and do an array lookup. This involves getting the byte from +the 4 locations in the word and moving it to a new word and doing the lookup. +The most obvious way to do this is +xor eax, eax # clear word +movb al, cl # get low byte +xor edi DWORD PTR 0x100+des_SP[eax] # xor in word +movb al, ch # get next byte +xor edi DWORD PTR 0x300+des_SP[eax] # xor in word +shr ecx 16 +which seems ok. For the pentium, this system appears to be the best. +One has to do instruction interleaving to keep both functional units +operating, but it is basically very efficient. + +Now the crunch. When a full register is used after a partial write, eg. +mov al, cl +xor edi, DWORD PTR 0x100+des_SP[eax] +386 - 1 cycle stall +486 - 1 cycle stall +586 - 0 cycle stall +686 - at least 7 cycle stall (page 22 of the above mentioned document). + +So the technique that produces the best results on a pentium, according to +the documentation, will produce hideous results on a pentium pro. + +To get around this, des686.pl will generate code that is not as fast on +a pentium, should be very good on a pentium pro. +mov eax, ecx # copy word +shr ecx, 8 # line up next byte +and eax, 0fch # mask byte +xor edi DWORD PTR 0x100+des_SP[eax] # xor in array lookup +mov eax, ecx # get word +shr ecx 8 # line up next byte +and eax, 0fch # mask byte +xor edi DWORD PTR 0x300+des_SP[eax] # xor in array lookup + +Due to the execution units in the pentium, this actually works quite well. +For a pentium pro it should be very good. This is the type of output +Visual C++ generates. + +There is a third option. instead of using +mov al, ch +which is bad on the pentium pro, one may be able to use +movzx eax, ch +which may not incur the partial write penalty. On the pentium, +this instruction takes 4 cycles so is not worth using but on the +pentium pro it appears it may be worth while. I need access to one to +experiment :-). + +eric (20 Oct 1996) + +22 Nov 1996 - I have asked people to run the 2 different version on pentium +pros and it appears that the intel documentation is wrong. The +mov al,bh is still faster on a pentium pro, so just use the des586.pl +install des686.pl + +3 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these +functions into des_enc.c because it does make a massive performance +difference on some boxes to have the functions code located close to +the des_encrypt2() function. + +9 Jan 1997 - des-som2.pl is now the correct perl script to use for +pentiums. It contains an inner loop from +Svend Olaf Mikkelsen which does raw ecb DES calls at +273,000 per second. He had a previous version at 250,000 and the best +I was able to get was 203,000. The content has not changed, this is all +due to instruction sequencing (and actual instructions choice) which is able +to keep both functional units of the pentium going. +We may have lost the ugly register usage restrictions when x86 went 32 bit +but for the pentium it has been replaced by evil instruction ordering tricks. + +13 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf. +raw DES at 281,000 per second on a pentium 100. + -- cgit v1.2.3