diff options
Diffstat (limited to 'main/openssl/crypto/des/asm')
| -rw-r--r-- | main/openssl/crypto/des/asm/crypt586.pl | 209 | ||||
| -rw-r--r-- | main/openssl/crypto/des/asm/des-586.pl | 453 | ||||
| -rw-r--r-- | main/openssl/crypto/des/asm/des_enc.m4 | 2099 | ||||
| -rw-r--r-- | main/openssl/crypto/des/asm/desboth.pl | 79 | ||||
| -rw-r--r-- | main/openssl/crypto/des/asm/readme | 131 | 
5 files changed, 2971 insertions, 0 deletions
| diff --git a/main/openssl/crypto/des/asm/crypt586.pl b/main/openssl/crypto/des/asm/crypt586.pl new file mode 100644 index 00000000..e36f7d44 --- /dev/null +++ b/main/openssl/crypto/des/asm/crypt586.pl @@ -0,0 +1,209 @@ +#!/usr/local/bin/perl +# +# The inner loop instruction sequence and the IP/FP modifications are from +# Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> +# I've added the stuff needed for crypt() but I've not worried about making +# things perfect. +# + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"crypt586.pl"); + +$L="edi"; +$R="esi"; + +&external_label("DES_SPtrans"); +&fcrypt_body("fcrypt_body"); +&asm_finish(); + +sub fcrypt_body +	{ +	local($name,$do_ip)=@_; + +	&function_begin($name); + +	&comment(""); +	&comment("Load the 2 words"); +	$trans="ebp"; + +	&xor(	$L,	$L); +	&xor(	$R,	$R); + +	# PIC-ification:-) +	&picmeup("edx","DES_SPtrans"); +	#if ($cpp)	{ &picmeup("edx","DES_SPtrans");   } +	#else		{ &lea("edx",&DWP("DES_SPtrans")); } +	&push("edx");	# becomes &swtmp(1) +	# +	&mov($trans,&wparam(1)); # reloaded with DES_SPtrans in D_ENCRYPT + +	&push(&DWC(25)); # add a variable + +	&set_label("start"); +	for ($i=0; $i<16; $i+=2) +		{ +		&comment(""); +		&comment("Round $i"); +		&D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx"); + +		&comment(""); +		&comment("Round ".sprintf("%d",$i+1)); +		&D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx"); +		} +	 &mov("ebx",	&swtmp(0)); +	&mov("eax",	$L); +	 &dec("ebx"); +	&mov($L,	$R); +	 &mov($R,	"eax"); +	&mov(&swtmp(0),	"ebx"); +	 &jnz(&label("start")); + +	&comment(""); +	&comment("FP"); +	&mov("edx",&wparam(0)); + +	&FP_new($R,$L,"eax",3); +	&mov(&DWP(0,"edx","",0),"eax"); +	&mov(&DWP(4,"edx","",0),$L); + +	&add("esp",8);	# remove variables + +	&function_end($name); +	} + +sub D_ENCRYPT +	{ +	local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t)=@_; + +	&mov(	$u,		&wparam(2));			# 2 +	&mov(	$t,		$R); +	&shr(	$t,		16);				# 1 +	&mov(	$tmp2,		&wparam(3));			# 2 +	&xor(	$t,		$R);				# 1 + +	&and(	$u,		$t);				# 2 +	&and(	$t,		$tmp2);				# 2 + +	&mov(	$tmp1,		$u); +	&shl(	$tmp1,		16); 				# 1 +	&mov(	$tmp2,		$t); +	&shl(	$tmp2,		16); 				# 1 +	&xor(	$u,		$tmp1);				# 2 +	&xor(	$t,		$tmp2);				# 2 +	&mov(	$tmp1,		&DWP(&n2a($S*4),$trans,"",0));	# 2 +	&xor(	$u,		$tmp1); +	&mov(	$tmp2,		&DWP(&n2a(($S+1)*4),$trans,"",0));	# 2 +	&xor(	$u,		$R); +	&xor(	$t,		$R); +	&xor(	$t,		$tmp2); + +	&and(	$u,		"0xfcfcfcfc"	);		# 2 +	&xor(	$tmp1,		$tmp1);				# 1 +	&and(	$t,		"0xcfcfcfcf"	);		# 2 +	&xor(	$tmp2,		$tmp2);	 +	&movb(	&LB($tmp1),	&LB($u)	); +	&movb(	&LB($tmp2),	&HB($u)	); +	&rotr(	$t,		4		); +	&mov(	$trans,		&swtmp(1)); +	&xor(	$L,		&DWP("     ",$trans,$tmp1,0)); +	&movb(	&LB($tmp1),	&LB($t)	); +	&xor(	$L,		&DWP("0x200",$trans,$tmp2,0)); +	&movb(	&LB($tmp2),	&HB($t)	); +	&shr(	$u,		16); +	&xor(	$L,		&DWP("0x100",$trans,$tmp1,0)); +	&movb(	&LB($tmp1),	&HB($u)	); +	&shr(	$t,		16); +	&xor(	$L,		&DWP("0x300",$trans,$tmp2,0)); +	&movb(	&LB($tmp2),	&HB($t)	); +	&and(	$u,		"0xff"	); +	&and(	$t,		"0xff"	); +	&mov(	$tmp1,		&DWP("0x600",$trans,$tmp1,0)); +	&xor(	$L,		$tmp1); +	&mov(	$tmp1,		&DWP("0x700",$trans,$tmp2,0)); +	&xor(	$L,		$tmp1); +	&mov(	$tmp1,		&DWP("0x400",$trans,$u,0)); +	&xor(	$L,		$tmp1); +	&mov(	$tmp1,		&DWP("0x500",$trans,$t,0)); +	&xor(	$L,		$tmp1); +	&mov(	$trans,		&wparam(1)); +	} + +sub n2a +	{ +	sprintf("%d",$_[0]); +	} + +# now has a side affect of rotating $a by $shift +sub R_PERM_OP +	{ +	local($a,$b,$tt,$shift,$mask,$last)=@_; + +	&rotl(	$a,		$shift		) if ($shift != 0); +	&mov(	$tt,		$a		); +	&xor(	$a,		$b		); +	&and(	$a,		$mask		); +	if ($notlast eq $b) +		{ +		&xor(	$b,		$a		); +		&xor(	$tt,		$a		); +		} +	else +		{ +		&xor(	$tt,		$a		); +		&xor(	$b,		$a		); +		} +	&comment(""); +	} + +sub IP_new +	{ +	local($l,$r,$tt,$lr)=@_; + +	&R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l); +	&R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l); +	&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r); +	&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r); +	&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r); +	 +	if ($lr != 3) +		{ +		if (($lr-3) < 0) +			{ &rotr($tt,	3-$lr); } +		else	{ &rotl($tt,	$lr-3); } +		} +	if ($lr != 2) +		{ +		if (($lr-2) < 0) +			{ &rotr($r,	2-$lr); } +		else	{ &rotl($r,	$lr-2); } +		} +	} + +sub FP_new +	{ +	local($l,$r,$tt,$lr)=@_; + +	if ($lr != 2) +		{ +		if (($lr-2) < 0) +			{ &rotl($r,	2-$lr); } +		else	{ &rotr($r,	$lr-2); } +		} +	if ($lr != 3) +		{ +		if (($lr-3) < 0) +			{ &rotl($l,	3-$lr); } +		else	{ &rotr($l,	$lr-3); } +		} + +	&R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r); +	&R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r); +	&R_PERM_OP($l,$r,$tt,10,"0x33333333",$l); +	&R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l); +	&R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r); +	&rotr($tt	, 4); +	} + diff --git a/main/openssl/crypto/des/asm/des-586.pl b/main/openssl/crypto/des/asm/des-586.pl new file mode 100644 index 00000000..5b5f39ce --- /dev/null +++ b/main/openssl/crypto/des/asm/des-586.pl @@ -0,0 +1,453 @@ +#!/usr/local/bin/perl +# +# The inner loop instruction sequence and the IP/FP modifications are from +# Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> +# + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; +require "cbc.pl"; +require "desboth.pl"; + +# base code is in microsft +# op dest, source +# format. +# + +&asm_init($ARGV[0],"des-586.pl"); + +$L="edi"; +$R="esi"; +$trans="ebp"; +$small_footprint=1 if (grep(/\-DOPENSSL_SMALL_FOOTPRINT/,@ARGV)); +# one can discuss setting this variable to 1 unconditionally, as +# the folded loop is only 3% slower than unrolled, but >7 times smaller + +&public_label("DES_SPtrans"); + +&DES_encrypt_internal(); +&DES_decrypt_internal(); +&DES_encrypt("DES_encrypt1",1); +&DES_encrypt("DES_encrypt2",0); +&DES_encrypt3("DES_encrypt3",1); +&DES_encrypt3("DES_decrypt3",0); +&cbc("DES_ncbc_encrypt","DES_encrypt1","DES_encrypt1",0,4,5,3,5,-1); +&cbc("DES_ede3_cbc_encrypt","DES_encrypt3","DES_decrypt3",0,6,7,3,4,5); +&DES_SPtrans(); + +&asm_finish(); + +sub DES_encrypt_internal() +	{ +	&function_begin_B("_x86_DES_encrypt"); + +	if ($small_footprint) +	    { +	    &lea("edx",&DWP(128,"ecx")); +	    &push("edx"); +	    &push("ecx"); +	    &set_label("eloop"); +		&D_ENCRYPT(0,$L,$R,0,$trans,"eax","ebx","ecx","edx",&swtmp(0)); +		&comment(""); +		&D_ENCRYPT(1,$R,$L,2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); +		&comment(""); +		&add("ecx",16); +		&cmp("ecx",&swtmp(1)); +		&mov(&swtmp(0),"ecx"); +		&jb(&label("eloop")); +	    &add("esp",8); +	    } +	else +	    { +	    &push("ecx"); +	    for ($i=0; $i<16; $i+=2) +		{ +		&comment("Round $i"); +		&D_ENCRYPT($i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); +		&comment("Round ".sprintf("%d",$i+1)); +		&D_ENCRYPT($i+1,$R,$L,($i+1)*2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); +		} +	    &add("esp",4); +	} +	&ret(); + +	&function_end_B("_x86_DES_encrypt"); +	} +	 +sub DES_decrypt_internal() +	{ +	&function_begin_B("_x86_DES_decrypt"); + +	if ($small_footprint) +	    { +	    &push("ecx"); +	    &lea("ecx",&DWP(128,"ecx")); +	    &push("ecx"); +	    &set_label("dloop"); +		&D_ENCRYPT(0,$L,$R,-2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); +		&comment(""); +		&D_ENCRYPT(1,$R,$L,-4,$trans,"eax","ebx","ecx","edx",&swtmp(0)); +		&comment(""); +		&sub("ecx",16); +		&cmp("ecx",&swtmp(1)); +		&mov(&swtmp(0),"ecx"); +		&ja(&label("dloop")); +	    &add("esp",8); +	    } +	else +	    { +	    &push("ecx"); +	    for ($i=15; $i>0; $i-=2) +		{ +		&comment("Round $i"); +		&D_ENCRYPT(15-$i,$L,$R,$i*2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); +		&comment("Round ".sprintf("%d",$i-1)); +		&D_ENCRYPT(15-$i+1,$R,$L,($i-1)*2,$trans,"eax","ebx","ecx","edx",&swtmp(0)); +		} +	    &add("esp",4); +	    } +	&ret(); + +	&function_end_B("_x86_DES_decrypt"); +	} +	 +sub DES_encrypt +	{ +	local($name,$do_ip)=@_; + +	&function_begin_B($name); + +	&push("esi"); +	&push("edi"); + +	&comment(""); +	&comment("Load the 2 words"); + +	if ($do_ip) +		{ +		&mov($R,&wparam(0)); +		 &xor(	"ecx",		"ecx"		); + +		&push("ebx"); +		&push("ebp"); + +		&mov("eax",&DWP(0,$R,"",0)); +		 &mov("ebx",&wparam(2));	# get encrypt flag +		&mov($L,&DWP(4,$R,"",0)); +		&comment(""); +		&comment("IP"); +		&IP_new("eax",$L,$R,3); +		} +	else +		{ +		&mov("eax",&wparam(0)); +		 &xor(	"ecx",		"ecx"		); + +		&push("ebx"); +		&push("ebp"); + +		&mov($R,&DWP(0,"eax","",0)); +		 &mov("ebx",&wparam(2));	# get encrypt flag +		&rotl($R,3); +		&mov($L,&DWP(4,"eax","",0)); +		&rotl($L,3); +		} + +	# PIC-ification:-) +	&call	(&label("pic_point")); +	&set_label("pic_point"); +	&blindpop($trans); +	&lea	($trans,&DWP(&label("DES_SPtrans")."-".&label("pic_point"),$trans)); + +	&mov(	"ecx",	&wparam(1)	); + +	&cmp("ebx","0"); +	&je(&label("decrypt")); +	&call("_x86_DES_encrypt"); +	&jmp(&label("done")); +	&set_label("decrypt"); +	&call("_x86_DES_decrypt"); +	&set_label("done"); + +	if ($do_ip) +		{ +		&comment(""); +		&comment("FP"); +		&mov("edx",&wparam(0)); +		&FP_new($L,$R,"eax",3); + +		&mov(&DWP(0,"edx","",0),"eax"); +		&mov(&DWP(4,"edx","",0),$R); +		} +	else +		{ +		&comment(""); +		&comment("Fixup"); +		&rotr($L,3);		# r +		 &mov("eax",&wparam(0)); +		&rotr($R,3);		# l +		 &mov(&DWP(0,"eax","",0),$L); +		 &mov(&DWP(4,"eax","",0),$R); +		} + +	&pop("ebp"); +	&pop("ebx"); +	&pop("edi"); +	&pop("esi"); +	&ret(); + +	&function_end_B($name); +	} + +sub D_ENCRYPT +	{ +	local($r,$L,$R,$S,$trans,$u,$tmp1,$tmp2,$t,$wp1)=@_; + +	 &mov(	$u,		&DWP(&n2a($S*4),$tmp2,"",0)); +	&xor(	$tmp1,		$tmp1); +	 &mov(	$t,		&DWP(&n2a(($S+1)*4),$tmp2,"",0)); +	&xor(	$u,		$R); +	&xor(	$tmp2,		$tmp2); +	 &xor(	$t,		$R); +	&and(	$u,		"0xfcfcfcfc"	); +	 &and(	$t,		"0xcfcfcfcf"	); +	&movb(	&LB($tmp1),	&LB($u)	); +	 &movb(	&LB($tmp2),	&HB($u)	); +	&rotr(	$t,		4		); +	&xor(	$L,		&DWP("     ",$trans,$tmp1,0)); +	 &movb(	&LB($tmp1),	&LB($t)	); +	 &xor(	$L,		&DWP("0x200",$trans,$tmp2,0)); +	 &movb(	&LB($tmp2),	&HB($t)	); +	&shr(	$u,		16); +	 &xor(	$L,		&DWP("0x100",$trans,$tmp1,0)); +	 &movb(	&LB($tmp1),	&HB($u)	); +	&shr(	$t,		16); +	 &xor(	$L,		&DWP("0x300",$trans,$tmp2,0)); +	&movb(	&LB($tmp2),	&HB($t)	); +	 &and(	$u,		"0xff"	); +	&and(	$t,		"0xff"	); +	 &xor(	$L,		&DWP("0x600",$trans,$tmp1,0)); +	 &xor(	$L,		&DWP("0x700",$trans,$tmp2,0)); +	&mov(	$tmp2,		$wp1	); +	 &xor(	$L,		&DWP("0x400",$trans,$u,0)); +	 &xor(	$L,		&DWP("0x500",$trans,$t,0)); +	} + +sub n2a +	{ +	sprintf("%d",$_[0]); +	} + +# now has a side affect of rotating $a by $shift +sub R_PERM_OP +	{ +	local($a,$b,$tt,$shift,$mask,$last)=@_; + +	&rotl(	$a,		$shift		) if ($shift != 0); +	&mov(	$tt,		$a		); +	&xor(	$a,		$b		); +	&and(	$a,		$mask		); +	# This can never succeed, and besides it is difficult to see what the +	# idea was - Ben 13 Feb 99 +	if (!$last eq $b) +		{ +		&xor(	$b,		$a		); +		&xor(	$tt,		$a		); +		} +	else +		{ +		&xor(	$tt,		$a		); +		&xor(	$b,		$a		); +		} +	&comment(""); +	} + +sub IP_new +	{ +	local($l,$r,$tt,$lr)=@_; + +	&R_PERM_OP($l,$r,$tt, 4,"0xf0f0f0f0",$l); +	&R_PERM_OP($r,$tt,$l,20,"0xfff0000f",$l); +	&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r); +	&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r); +	&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r); +	 +	if ($lr != 3) +		{ +		if (($lr-3) < 0) +			{ &rotr($tt,	3-$lr); } +		else	{ &rotl($tt,	$lr-3); } +		} +	if ($lr != 2) +		{ +		if (($lr-2) < 0) +			{ &rotr($r,	2-$lr); } +		else	{ &rotl($r,	$lr-2); } +		} +	} + +sub FP_new +	{ +	local($l,$r,$tt,$lr)=@_; + +	if ($lr != 2) +		{ +		if (($lr-2) < 0) +			{ &rotl($r,	2-$lr); } +		else	{ &rotr($r,	$lr-2); } +		} +	if ($lr != 3) +		{ +		if (($lr-3) < 0) +			{ &rotl($l,	3-$lr); } +		else	{ &rotr($l,	$lr-3); } +		} + +	&R_PERM_OP($l,$r,$tt, 0,"0xaaaaaaaa",$r); +	&R_PERM_OP($tt,$r,$l,23,"0x03fc03fc",$r); +	&R_PERM_OP($l,$r,$tt,10,"0x33333333",$l); +	&R_PERM_OP($r,$tt,$l,18,"0xfff0000f",$l); +	&R_PERM_OP($l,$tt,$r,12,"0xf0f0f0f0",$r); +	&rotr($tt	, 4); +	} + +sub DES_SPtrans +	{ +	&set_label("DES_SPtrans",64); +	&data_word(0x02080800, 0x00080000, 0x02000002, 0x02080802); +	&data_word(0x02000000, 0x00080802, 0x00080002, 0x02000002); +	&data_word(0x00080802, 0x02080800, 0x02080000, 0x00000802); +	&data_word(0x02000802, 0x02000000, 0x00000000, 0x00080002); +	&data_word(0x00080000, 0x00000002, 0x02000800, 0x00080800); +	&data_word(0x02080802, 0x02080000, 0x00000802, 0x02000800); +	&data_word(0x00000002, 0x00000800, 0x00080800, 0x02080002); +	&data_word(0x00000800, 0x02000802, 0x02080002, 0x00000000); +	&data_word(0x00000000, 0x02080802, 0x02000800, 0x00080002); +	&data_word(0x02080800, 0x00080000, 0x00000802, 0x02000800); +	&data_word(0x02080002, 0x00000800, 0x00080800, 0x02000002); +	&data_word(0x00080802, 0x00000002, 0x02000002, 0x02080000); +	&data_word(0x02080802, 0x00080800, 0x02080000, 0x02000802); +	&data_word(0x02000000, 0x00000802, 0x00080002, 0x00000000); +	&data_word(0x00080000, 0x02000000, 0x02000802, 0x02080800); +	&data_word(0x00000002, 0x02080002, 0x00000800, 0x00080802); +	# nibble 1 +	&data_word(0x40108010, 0x00000000, 0x00108000, 0x40100000); +	&data_word(0x40000010, 0x00008010, 0x40008000, 0x00108000); +	&data_word(0x00008000, 0x40100010, 0x00000010, 0x40008000); +	&data_word(0x00100010, 0x40108000, 0x40100000, 0x00000010); +	&data_word(0x00100000, 0x40008010, 0x40100010, 0x00008000); +	&data_word(0x00108010, 0x40000000, 0x00000000, 0x00100010); +	&data_word(0x40008010, 0x00108010, 0x40108000, 0x40000010); +	&data_word(0x40000000, 0x00100000, 0x00008010, 0x40108010); +	&data_word(0x00100010, 0x40108000, 0x40008000, 0x00108010); +	&data_word(0x40108010, 0x00100010, 0x40000010, 0x00000000); +	&data_word(0x40000000, 0x00008010, 0x00100000, 0x40100010); +	&data_word(0x00008000, 0x40000000, 0x00108010, 0x40008010); +	&data_word(0x40108000, 0x00008000, 0x00000000, 0x40000010); +	&data_word(0x00000010, 0x40108010, 0x00108000, 0x40100000); +	&data_word(0x40100010, 0x00100000, 0x00008010, 0x40008000); +	&data_word(0x40008010, 0x00000010, 0x40100000, 0x00108000); +	# nibble 2 +	&data_word(0x04000001, 0x04040100, 0x00000100, 0x04000101); +	&data_word(0x00040001, 0x04000000, 0x04000101, 0x00040100); +	&data_word(0x04000100, 0x00040000, 0x04040000, 0x00000001); +	&data_word(0x04040101, 0x00000101, 0x00000001, 0x04040001); +	&data_word(0x00000000, 0x00040001, 0x04040100, 0x00000100); +	&data_word(0x00000101, 0x04040101, 0x00040000, 0x04000001); +	&data_word(0x04040001, 0x04000100, 0x00040101, 0x04040000); +	&data_word(0x00040100, 0x00000000, 0x04000000, 0x00040101); +	&data_word(0x04040100, 0x00000100, 0x00000001, 0x00040000); +	&data_word(0x00000101, 0x00040001, 0x04040000, 0x04000101); +	&data_word(0x00000000, 0x04040100, 0x00040100, 0x04040001); +	&data_word(0x00040001, 0x04000000, 0x04040101, 0x00000001); +	&data_word(0x00040101, 0x04000001, 0x04000000, 0x04040101); +	&data_word(0x00040000, 0x04000100, 0x04000101, 0x00040100); +	&data_word(0x04000100, 0x00000000, 0x04040001, 0x00000101); +	&data_word(0x04000001, 0x00040101, 0x00000100, 0x04040000); +	# nibble 3 +	&data_word(0x00401008, 0x10001000, 0x00000008, 0x10401008); +	&data_word(0x00000000, 0x10400000, 0x10001008, 0x00400008); +	&data_word(0x10401000, 0x10000008, 0x10000000, 0x00001008); +	&data_word(0x10000008, 0x00401008, 0x00400000, 0x10000000); +	&data_word(0x10400008, 0x00401000, 0x00001000, 0x00000008); +	&data_word(0x00401000, 0x10001008, 0x10400000, 0x00001000); +	&data_word(0x00001008, 0x00000000, 0x00400008, 0x10401000); +	&data_word(0x10001000, 0x10400008, 0x10401008, 0x00400000); +	&data_word(0x10400008, 0x00001008, 0x00400000, 0x10000008); +	&data_word(0x00401000, 0x10001000, 0x00000008, 0x10400000); +	&data_word(0x10001008, 0x00000000, 0x00001000, 0x00400008); +	&data_word(0x00000000, 0x10400008, 0x10401000, 0x00001000); +	&data_word(0x10000000, 0x10401008, 0x00401008, 0x00400000); +	&data_word(0x10401008, 0x00000008, 0x10001000, 0x00401008); +	&data_word(0x00400008, 0x00401000, 0x10400000, 0x10001008); +	&data_word(0x00001008, 0x10000000, 0x10000008, 0x10401000); +	# nibble 4 +	&data_word(0x08000000, 0x00010000, 0x00000400, 0x08010420); +	&data_word(0x08010020, 0x08000400, 0x00010420, 0x08010000); +	&data_word(0x00010000, 0x00000020, 0x08000020, 0x00010400); +	&data_word(0x08000420, 0x08010020, 0x08010400, 0x00000000); +	&data_word(0x00010400, 0x08000000, 0x00010020, 0x00000420); +	&data_word(0x08000400, 0x00010420, 0x00000000, 0x08000020); +	&data_word(0x00000020, 0x08000420, 0x08010420, 0x00010020); +	&data_word(0x08010000, 0x00000400, 0x00000420, 0x08010400); +	&data_word(0x08010400, 0x08000420, 0x00010020, 0x08010000); +	&data_word(0x00010000, 0x00000020, 0x08000020, 0x08000400); +	&data_word(0x08000000, 0x00010400, 0x08010420, 0x00000000); +	&data_word(0x00010420, 0x08000000, 0x00000400, 0x00010020); +	&data_word(0x08000420, 0x00000400, 0x00000000, 0x08010420); +	&data_word(0x08010020, 0x08010400, 0x00000420, 0x00010000); +	&data_word(0x00010400, 0x08010020, 0x08000400, 0x00000420); +	&data_word(0x00000020, 0x00010420, 0x08010000, 0x08000020); +	# nibble 5 +	&data_word(0x80000040, 0x00200040, 0x00000000, 0x80202000); +	&data_word(0x00200040, 0x00002000, 0x80002040, 0x00200000); +	&data_word(0x00002040, 0x80202040, 0x00202000, 0x80000000); +	&data_word(0x80002000, 0x80000040, 0x80200000, 0x00202040); +	&data_word(0x00200000, 0x80002040, 0x80200040, 0x00000000); +	&data_word(0x00002000, 0x00000040, 0x80202000, 0x80200040); +	&data_word(0x80202040, 0x80200000, 0x80000000, 0x00002040); +	&data_word(0x00000040, 0x00202000, 0x00202040, 0x80002000); +	&data_word(0x00002040, 0x80000000, 0x80002000, 0x00202040); +	&data_word(0x80202000, 0x00200040, 0x00000000, 0x80002000); +	&data_word(0x80000000, 0x00002000, 0x80200040, 0x00200000); +	&data_word(0x00200040, 0x80202040, 0x00202000, 0x00000040); +	&data_word(0x80202040, 0x00202000, 0x00200000, 0x80002040); +	&data_word(0x80000040, 0x80200000, 0x00202040, 0x00000000); +	&data_word(0x00002000, 0x80000040, 0x80002040, 0x80202000); +	&data_word(0x80200000, 0x00002040, 0x00000040, 0x80200040); +	# nibble 6 +	&data_word(0x00004000, 0x00000200, 0x01000200, 0x01000004); +	&data_word(0x01004204, 0x00004004, 0x00004200, 0x00000000); +	&data_word(0x01000000, 0x01000204, 0x00000204, 0x01004000); +	&data_word(0x00000004, 0x01004200, 0x01004000, 0x00000204); +	&data_word(0x01000204, 0x00004000, 0x00004004, 0x01004204); +	&data_word(0x00000000, 0x01000200, 0x01000004, 0x00004200); +	&data_word(0x01004004, 0x00004204, 0x01004200, 0x00000004); +	&data_word(0x00004204, 0x01004004, 0x00000200, 0x01000000); +	&data_word(0x00004204, 0x01004000, 0x01004004, 0x00000204); +	&data_word(0x00004000, 0x00000200, 0x01000000, 0x01004004); +	&data_word(0x01000204, 0x00004204, 0x00004200, 0x00000000); +	&data_word(0x00000200, 0x01000004, 0x00000004, 0x01000200); +	&data_word(0x00000000, 0x01000204, 0x01000200, 0x00004200); +	&data_word(0x00000204, 0x00004000, 0x01004204, 0x01000000); +	&data_word(0x01004200, 0x00000004, 0x00004004, 0x01004204); +	&data_word(0x01000004, 0x01004200, 0x01004000, 0x00004004); +	# nibble 7 +	&data_word(0x20800080, 0x20820000, 0x00020080, 0x00000000); +	&data_word(0x20020000, 0x00800080, 0x20800000, 0x20820080); +	&data_word(0x00000080, 0x20000000, 0x00820000, 0x00020080); +	&data_word(0x00820080, 0x20020080, 0x20000080, 0x20800000); +	&data_word(0x00020000, 0x00820080, 0x00800080, 0x20020000); +	&data_word(0x20820080, 0x20000080, 0x00000000, 0x00820000); +	&data_word(0x20000000, 0x00800000, 0x20020080, 0x20800080); +	&data_word(0x00800000, 0x00020000, 0x20820000, 0x00000080); +	&data_word(0x00800000, 0x00020000, 0x20000080, 0x20820080); +	&data_word(0x00020080, 0x20000000, 0x00000000, 0x00820000); +	&data_word(0x20800080, 0x20020080, 0x20020000, 0x00800080); +	&data_word(0x20820000, 0x00000080, 0x00800080, 0x20020000); +	&data_word(0x20820080, 0x00800000, 0x20800000, 0x20000080); +	&data_word(0x00820000, 0x00020080, 0x20020080, 0x20800000); +	&data_word(0x00000080, 0x20820000, 0x00820080, 0x00000000); +	&data_word(0x20000000, 0x20800080, 0x00020000, 0x00820080); +	} diff --git a/main/openssl/crypto/des/asm/des_enc.m4 b/main/openssl/crypto/des/asm/des_enc.m4 new file mode 100644 index 00000000..32805954 --- /dev/null +++ b/main/openssl/crypto/des/asm/des_enc.m4 @@ -0,0 +1,2099 @@ +!  des_enc.m4 +!  des_enc.S  (generated from des_enc.m4) +! +!  UltraSPARC assembler version of the LibDES/SSLeay/OpenSSL des_enc.c file. +! +!  Version 1.0. 32-bit version. +! +!  June 8, 2000. +! +!  Version 2.0. 32/64-bit, PIC-ification, blended CPU adaptation +!		by Andy Polyakov. +! +!  January 1, 2003. +! +!  Assembler version: Copyright Svend Olaf Mikkelsen. +! +!  Original C code: Copyright Eric A. Young. +! +!  This code can be freely used by LibDES/SSLeay/OpenSSL users. +! +!  The LibDES/SSLeay/OpenSSL copyright notices must be respected. +! +!  This version can be redistributed. +! +!  To expand the m4 macros: m4 -B 8192 des_enc.m4 > des_enc.S +! +!  Global registers 1 to 5 are used. This is the same as done by the +!  cc compiler. The UltraSPARC load/store little endian feature is used. +! +!  Instruction grouping often refers to one CPU cycle. +! +!  Assemble through gcc: gcc -c -mcpu=ultrasparc -o des_enc.o des_enc.S +! +!  Assemble through cc:  cc -c -xarch=v8plusa -o des_enc.o des_enc.S +! +!  Performance improvement according to './apps/openssl speed des' +! +!	32-bit build: +!		23%  faster than cc-5.2 -xarch=v8plus -xO5 +!		115% faster than gcc-3.2.1 -m32 -mcpu=ultrasparc -O5 +!	64-bit build: +!		50%  faster than cc-5.2 -xarch=v9 -xO5 +!		100% faster than gcc-3.2.1 -m64 -mcpu=ultrasparc -O5 +! + +.ident "des_enc.m4 2.1" +.file  "des_enc-sparc.S" + +#if defined(__SUNPRO_C) && defined(__sparcv9) +# define ABI64  /* They've said -xarch=v9 at command line */ +#elif defined(__GNUC__) && defined(__arch64__) +# define ABI64  /* They've said -m64 at command line */ +#endif + +#ifdef ABI64 +  .register	%g2,#scratch +  .register	%g3,#scratch +# define	FRAME	-192 +# define	BIAS	2047 +# define	LDPTR	ldx +# define	STPTR	stx +# define	ARG0	128 +# define	ARGSZ	8 +# ifndef OPENSSL_SYSNAME_ULTRASPARC +# define OPENSSL_SYSNAME_ULTRASPARC +# endif +#else +# define	FRAME	-96 +# define	BIAS	0 +# define	LDPTR	ld +# define	STPTR	st +# define	ARG0	68 +# define	ARGSZ	4 +#endif + +#define LOOPS 7 + +#define global0 %g0 +#define global1 %g1 +#define global2 %g2 +#define global3 %g3 +#define global4 %g4 +#define global5 %g5 + +#define local0 %l0 +#define local1 %l1 +#define local2 %l2 +#define local3 %l3 +#define local4 %l4 +#define local5 %l5 +#define local7 %l6 +#define local6 %l7 + +#define in0 %i0 +#define in1 %i1 +#define in2 %i2 +#define in3 %i3 +#define in4 %i4 +#define in5 %i5 +#define in6 %i6 +#define in7 %i7 + +#define out0 %o0 +#define out1 %o1 +#define out2 %o2 +#define out3 %o3 +#define out4 %o4 +#define out5 %o5 +#define out6 %o6 +#define out7 %o7 + +#define stub stb + +changequote({,}) + + +! Macro definitions: + + +! {ip_macro} +! +! The logic used in initial and final permutations is the same as in +! the C code. The permutations are done with a clever shift, xor, and +! technique. +! +! The macro also loads address sbox 1 to 5 to global 1 to 5, address +! sbox 6 to local6, and addres sbox 8 to out3. +! +! Rotates the halfs 3 left to bring the sbox bits in convenient positions. +! +! Loads key first round from address in parameter 5 to out0, out1. +! +! After the the original LibDES initial permutation, the resulting left +! is in the variable initially used for right and vice versa. The macro +! implements the possibility to keep the halfs in the original registers. +! +! parameter 1  left +! parameter 2  right +! parameter 3  result left (modify in first round) +! parameter 4  result right (use in first round) +! parameter 5  key address +! parameter 6  1/2 for include encryption/decryption +! parameter 7  1 for move in1 to in3 +! parameter 8  1 for move in3 to in4, 2 for move in4 to in3 +! parameter 9  1 for load ks3 and ks2 to in4 and in3 + +define(ip_macro, { + +! {ip_macro} +! $1 $2 $4 $3 $5 $6 $7 $8 $9 + +	ld	[out2+256], local1 +	srl	$2, 4, local4 + +	xor	local4, $1, local4 +	ifelse($7,1,{mov in1, in3},{nop}) + +	ld	[out2+260], local2 +	and	local4, local1, local4 +	ifelse($8,1,{mov in3, in4},{}) +	ifelse($8,2,{mov in4, in3},{}) + +	ld	[out2+280], out4          ! loop counter +	sll	local4, 4, local1 +	xor	$1, local4, $1 + +	ld	[out2+264], local3 +	srl	$1, 16, local4 +	xor	$2, local1, $2 + +	ifelse($9,1,{LDPTR	KS3, in4},{}) +	xor	local4, $2, local4 +	nop	!sethi	%hi(DES_SPtrans), global1 ! sbox addr + +	ifelse($9,1,{LDPTR	KS2, in3},{}) +	and	local4, local2, local4 +	nop	!or	global1, %lo(DES_SPtrans), global1   ! sbox addr + +	sll	local4, 16, local1 +	xor	$2, local4, $2 + +	srl	$2, 2, local4 +	xor	$1, local1, $1 + +	sethi	%hi(16711680), local5 +	xor	local4, $1, local4 + +	and	local4, local3, local4 +	or	local5, 255, local5 + +	sll	local4, 2, local2 +	xor	$1, local4, $1 + +	srl	$1, 8, local4 +	xor	$2, local2, $2 + +	xor	local4, $2, local4 +	add	global1, 768, global4 + +	and	local4, local5, local4 +	add	global1, 1024, global5 + +	ld	[out2+272], local7 +	sll	local4, 8, local1 +	xor	$2, local4, $2 + +	srl	$2, 1, local4 +	xor	$1, local1, $1 + +	ld	[$5], out0                ! key 7531 +	xor	local4, $1, local4 +	add	global1, 256, global2 + +	ld	[$5+4], out1              ! key 8642 +	and	local4, local7, local4 +	add	global1, 512, global3 + +	sll	local4, 1, local1 +	xor	$1, local4, $1 + +	sll	$1, 3, local3 +	xor	$2, local1, $2 + +	sll	$2, 3, local2 +	add	global1, 1280, local6     ! address sbox 8 + +	srl	$1, 29, local4 +	add	global1, 1792, out3       ! address sbox 8 + +	srl	$2, 29, local1 +	or	local4, local3, $4 + +	or	local2, local1, $3 + +	ifelse($6, 1, { + +		ld	[out2+284], local5     ! 0x0000FC00 used in the rounds +		or	local2, local1, $3 +		xor	$4, out0, local1 + +		call .des_enc.1 +		and	local1, 252, local1 + +	},{}) + +	ifelse($6, 2, { + +		ld	[out2+284], local5     ! 0x0000FC00 used in the rounds +		or	local2, local1, $3 +		xor	$4, out0, local1 + +		call .des_dec.1 +		and	local1, 252, local1 + +	},{}) +}) + + +! {rounds_macro} +! +! The logic used in the DES rounds is the same as in the C code, +! except that calculations for sbox 1 and sbox 5 begin before +! the previous round is finished. +! +! In each round one half (work) is modified based on key and the +! other half (use). +! +! In this version we do two rounds in a loop repeated 7 times +! and two rounds seperately. +! +! One half has the bits for the sboxes in the following positions: +! +!	777777xx555555xx333333xx111111xx +! +!	88xx666666xx444444xx222222xx8888 +! +! The bits for each sbox are xor-ed with the key bits for that box. +! The above xx bits are cleared, and the result used for lookup in +! the sbox table. Each sbox entry contains the 4 output bits permuted +! into 32 bits according to the P permutation. +! +! In the description of DES, left and right are switched after +! each round, except after last round. In this code the original +! left and right are kept in the same register in all rounds, meaning +! that after the 16 rounds the result for right is in the register +! originally used for left. +! +! parameter 1  first work (left in first round) +! parameter 2  first use (right in first round) +! parameter 3  enc/dec  1/-1 +! parameter 4  loop label +! parameter 5  key address register +! parameter 6  optional address for key next encryption/decryption +! parameter 7  not empty for include retl +! +! also compares in2 to 8 + +define(rounds_macro, { + +! {rounds_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + +	xor	$2, out0, local1 + +	ld	[out2+284], local5        ! 0x0000FC00 +	ba	$4 +	and	local1, 252, local1 + +	.align 32 + +$4: +	! local6 is address sbox 6 +	! out3   is address sbox 8 +	! out4   is loop counter + +	ld	[global1+local1], local1 +	xor	$2, out1, out1            ! 8642 +	xor	$2, out0, out0            ! 7531 +	! fmovs	%f0, %f0                  ! fxor used for alignment + +	srl	out1, 4, local0           ! rotate 4 right +	and	out0, local5, local3      ! 3 +	! fmovs	%f0, %f0 + +	ld	[$5+$3*8], local7         ! key 7531 next round +	srl	local3, 8, local3         ! 3 +	and	local0, 252, local2       ! 2 +	! fmovs	%f0, %f0 + +	ld	[global3+local3],local3   ! 3 +	sll	out1, 28, out1            ! rotate +	xor	$1, local1, $1            ! 1 finished, local1 now sbox 7 + +	ld	[global2+local2], local2  ! 2  +	srl	out0, 24, local1          ! 7 +	or	out1, local0, out1        ! rotate + +	ldub	[out2+local1], local1     ! 7 (and 0xFC) +	srl	out1, 24, local0          ! 8 +	and	out1, local5, local4      ! 4 + +	ldub	[out2+local0], local0     ! 8 (and 0xFC) +	srl	local4, 8, local4         ! 4 +	xor	$1, local2, $1            ! 2 finished local2 now sbox 6 + +	ld	[global4+local4],local4   ! 4 +	srl	out1, 16, local2          ! 6 +	xor	$1, local3, $1            ! 3 finished local3 now sbox 5 + +	ld	[out3+local0],local0      ! 8 +	and	local2, 252, local2       ! 6 +	add	global1, 1536, local5     ! address sbox 7 + +	ld	[local6+local2], local2   ! 6 +	srl	out0, 16, local3          ! 5 +	xor	$1, local4, $1            ! 4 finished + +	ld	[local5+local1],local1    ! 7 +	and	local3, 252, local3       ! 5 +	xor	$1, local0, $1            ! 8 finished + +	ld	[global5+local3],local3   ! 5 +	xor	$1, local2, $1            ! 6 finished +	subcc	out4, 1, out4 + +	ld	[$5+$3*8+4], out0         ! key 8642 next round +	xor	$1, local7, local2        ! sbox 5 next round +	xor	$1, local1, $1            ! 7 finished + +	srl	local2, 16, local2        ! sbox 5 next round +	xor	$1, local3, $1            ! 5 finished + +	ld	[$5+$3*16+4], out1        ! key 8642 next round again +	and	local2, 252, local2       ! sbox5 next round +! next round +	xor	$1, local7, local7        ! 7531 + +	ld	[global5+local2], local2  ! 5 +	srl	local7, 24, local3        ! 7 +	xor	$1, out0, out0            ! 8642 + +	ldub	[out2+local3], local3     ! 7 (and 0xFC) +	srl	out0, 4, local0           ! rotate 4 right +	and	local7, 252, local1       ! 1 + +	sll	out0, 28, out0            ! rotate +	xor	$2, local2, $2            ! 5 finished local2 used + +	srl	local0, 8, local4         ! 4 +	and	local0, 252, local2       ! 2 +	ld	[local5+local3], local3   ! 7 + +	srl	local0, 16, local5        ! 6 +	or	out0, local0, out0        ! rotate +	ld	[global2+local2], local2  ! 2 + +	srl	out0, 24, local0 +	ld	[$5+$3*16], out0          ! key 7531 next round +	and	local4, 252, local4	  ! 4 + +	and	local5, 252, local5       ! 6 +	ld	[global4+local4], local4  ! 4 +	xor	$2, local3, $2            ! 7 finished local3 used + +	and	local0, 252, local0       ! 8 +	ld	[local6+local5], local5   ! 6 +	xor	$2, local2, $2            ! 2 finished local2 now sbox 3 + +	srl	local7, 8, local2         ! 3 start +	ld	[out3+local0], local0     ! 8 +	xor	$2, local4, $2            ! 4 finished + +	and	local2, 252, local2       ! 3 +	ld	[global1+local1], local1  ! 1 +	xor	$2, local5, $2            ! 6 finished local5 used + +	ld	[global3+local2], local2  ! 3 +	xor	$2, local0, $2            ! 8 finished +	add	$5, $3*16, $5             ! enc add 8, dec add -8 to key pointer + +	ld	[out2+284], local5        ! 0x0000FC00 +	xor	$2, out0, local4          ! sbox 1 next round +	xor	$2, local1, $2            ! 1 finished + +	xor	$2, local2, $2            ! 3 finished +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bne,pt	%icc, $4 +#else +	bne	$4 +#endif +	and	local4, 252, local1       ! sbox 1 next round + +! two rounds more: + +	ld	[global1+local1], local1 +	xor	$2, out1, out1 +	xor	$2, out0, out0 + +	srl	out1, 4, local0           ! rotate +	and	out0, local5, local3 + +	ld	[$5+$3*8], local7         ! key 7531 +	srl	local3, 8, local3 +	and	local0, 252, local2 + +	ld	[global3+local3],local3 +	sll	out1, 28, out1            ! rotate +	xor	$1, local1, $1            ! 1 finished, local1 now sbox 7 + +	ld	[global2+local2], local2 +	srl	out0, 24, local1 +	or	out1, local0, out1        ! rotate + +	ldub	[out2+local1], local1 +	srl	out1, 24, local0 +	and	out1, local5, local4 + +	ldub	[out2+local0], local0 +	srl	local4, 8, local4 +	xor	$1, local2, $1            ! 2 finished local2 now sbox 6 + +	ld	[global4+local4],local4 +	srl	out1, 16, local2 +	xor	$1, local3, $1            ! 3 finished local3 now sbox 5 + +	ld	[out3+local0],local0 +	and	local2, 252, local2 +	add	global1, 1536, local5     ! address sbox 7 + +	ld	[local6+local2], local2 +	srl	out0, 16, local3 +	xor	$1, local4, $1            ! 4 finished + +	ld	[local5+local1],local1 +	and	local3, 252, local3 +	xor	$1, local0, $1 + +	ld	[global5+local3],local3 +	xor	$1, local2, $1            ! 6 finished +	cmp	in2, 8 + +	ifelse($6,{}, {}, {ld	[out2+280], out4})  ! loop counter +	xor	$1, local7, local2        ! sbox 5 next round +	xor	$1, local1, $1            ! 7 finished + +	ld	[$5+$3*8+4], out0 +	srl	local2, 16, local2        ! sbox 5 next round +	xor	$1, local3, $1            ! 5 finished + +	and	local2, 252, local2 +! next round (two rounds more) +	xor	$1, local7, local7        ! 7531 + +	ld	[global5+local2], local2 +	srl	local7, 24, local3 +	xor	$1, out0, out0            ! 8642 + +	ldub	[out2+local3], local3 +	srl	out0, 4, local0           ! rotate +	and	local7, 252, local1 + +	sll	out0, 28, out0            ! rotate +	xor	$2, local2, $2            ! 5 finished local2 used + +	srl	local0, 8, local4 +	and	local0, 252, local2 +	ld	[local5+local3], local3 + +	srl	local0, 16, local5 +	or	out0, local0, out0        ! rotate +	ld	[global2+local2], local2 + +	srl	out0, 24, local0 +	ifelse($6,{}, {}, {ld	[$6], out0})   ! key next encryption/decryption +	and	local4, 252, local4 + +	and	local5, 252, local5 +	ld	[global4+local4], local4 +	xor	$2, local3, $2            ! 7 finished local3 used + +	and	local0, 252, local0 +	ld	[local6+local5], local5 +	xor	$2, local2, $2            ! 2 finished local2 now sbox 3 + +	srl	local7, 8, local2         ! 3 start +	ld	[out3+local0], local0 +	xor	$2, local4, $2 + +	and	local2, 252, local2 +	ld	[global1+local1], local1 +	xor	$2, local5, $2            ! 6 finished local5 used + +	ld	[global3+local2], local2 +	srl	$1, 3, local3 +	xor	$2, local0, $2 + +	ifelse($6,{}, {}, {ld	[$6+4], out1}) ! key next encryption/decryption +	sll	$1, 29, local4 +	xor	$2, local1, $2 + +	ifelse($7,{}, {}, {retl}) +	xor	$2, local2, $2 +}) + + +! {fp_macro} +! +!  parameter 1   right (original left) +!  parameter 2   left (original right) +!  parameter 3   1 for optional store to [in0] +!  parameter 4   1 for load input/output address to local5/7 +! +!  The final permutation logic switches the halfes, meaning that +!  left and right ends up the the registers originally used. + +define(fp_macro, { + +! {fp_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + +	! initially undo the rotate 3 left done after initial permutation +	! original left is received shifted 3 right and 29 left in local3/4 + +	sll	$2, 29, local1 +	or	local3, local4, $1 + +	srl	$2, 3, $2 +	sethi	%hi(0x55555555), local2 + +	or	$2, local1, $2 +	or	local2, %lo(0x55555555), local2 + +	srl	$2, 1, local3 +	sethi	%hi(0x00ff00ff), local1 +	xor	local3, $1, local3 +	or	local1, %lo(0x00ff00ff), local1 +	and	local3, local2, local3 +	sethi	%hi(0x33333333), local4 +	sll	local3, 1, local2 + +	xor	$1, local3, $1 + +	srl	$1, 8, local3 +	xor	$2, local2, $2 +	xor	local3, $2, local3 +	or	local4, %lo(0x33333333), local4 +	and	local3, local1, local3 +	sethi	%hi(0x0000ffff), local1 +	sll	local3, 8, local2 + +	xor	$2, local3, $2 + +	srl	$2, 2, local3 +	xor	$1, local2, $1 +	xor	local3, $1, local3 +	or	local1, %lo(0x0000ffff), local1 +	and	local3, local4, local3 +	sethi	%hi(0x0f0f0f0f), local4 +	sll	local3, 2, local2 + +	ifelse($4,1, {LDPTR INPUT, local5}) +	xor	$1, local3, $1 + +	ifelse($4,1, {LDPTR OUTPUT, local7}) +	srl	$1, 16, local3 +	xor	$2, local2, $2 +	xor	local3, $2, local3 +	or	local4, %lo(0x0f0f0f0f), local4 +	and	local3, local1, local3 +	sll	local3, 16, local2 + +	xor	$2, local3, local1 + +	srl	local1, 4, local3 +	xor	$1, local2, $1 +	xor	local3, $1, local3 +	and	local3, local4, local3 +	sll	local3, 4, local2 + +	xor	$1, local3, $1 + +	! optional store: + +	ifelse($3,1, {st $1, [in0]}) + +	xor	local1, local2, $2 + +	ifelse($3,1, {st $2, [in0+4]}) + +}) + + +! {fp_ip_macro} +! +! Does initial permutation for next block mixed with +! final permutation for current block. +! +! parameter 1   original left +! parameter 2   original right +! parameter 3   left ip +! parameter 4   right ip +! parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4 +!                2: mov in4 to in3 +! +! also adds -8 to length in2 and loads loop counter to out4 + +define(fp_ip_macro, { + +! {fp_ip_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + +	define({temp1},{out4}) +	define({temp2},{local3}) + +	define({ip1},{local1}) +	define({ip2},{local2}) +	define({ip4},{local4}) +	define({ip5},{local5}) + +	! $1 in local3, local4 + +	ld	[out2+256], ip1 +	sll	out5, 29, temp1 +	or	local3, local4, $1 + +	srl	out5, 3, $2 +	ifelse($5,2,{mov in4, in3}) + +	ld	[out2+272], ip5 +	srl	$4, 4, local0 +	or	$2, temp1, $2 + +	srl	$2, 1, temp1 +	xor	temp1, $1, temp1 + +	and	temp1, ip5, temp1 +	xor	local0, $3, local0 + +	sll	temp1, 1, temp2 +	xor	$1, temp1, $1 + +	and	local0, ip1, local0 +	add	in2, -8, in2 + +	sll	local0, 4, local7 +	xor	$3, local0, $3 + +	ld	[out2+268], ip4 +	srl	$1, 8, temp1 +	xor	$2, temp2, $2 +	ld	[out2+260], ip2 +	srl	$3, 16, local0 +	xor	$4, local7, $4 +	xor	temp1, $2, temp1 +	xor	local0, $4, local0 +	and	temp1, ip4, temp1 +	and	local0, ip2, local0 +	sll	temp1, 8, temp2 +	xor	$2, temp1, $2 +	sll	local0, 16, local7 +	xor	$4, local0, $4 + +	srl	$2, 2, temp1 +	xor	$1, temp2, $1 + +	ld	[out2+264], temp2         ! ip3 +	srl	$4, 2, local0 +	xor	$3, local7, $3 +	xor	temp1, $1, temp1 +	xor	local0, $3, local0 +	and	temp1, temp2, temp1 +	and	local0, temp2, local0 +	sll	temp1, 2, temp2 +	xor	$1, temp1, $1 +	sll	local0, 2, local7 +	xor	$3, local0, $3 + +	srl	$1, 16, temp1 +	xor	$2, temp2, $2 +	srl	$3, 8, local0 +	xor	$4, local7, $4 +	xor	temp1, $2, temp1 +	xor	local0, $4, local0 +	and	temp1, ip2, temp1 +	and	local0, ip4, local0 +	sll	temp1, 16, temp2 +	xor	$2, temp1, local4 +	sll	local0, 8, local7 +	xor	$4, local0, $4 + +	srl	$4, 1, local0 +	xor	$3, local7, $3 + +	srl	local4, 4, temp1 +	xor	local0, $3, local0 + +	xor	$1, temp2, $1 +	and	local0, ip5, local0 + +	sll	local0, 1, local7 +	xor	temp1, $1, temp1 + +	xor	$3, local0, $3 +	xor	$4, local7, $4 + +	sll	$3, 3, local5 +	and	temp1, ip1, temp1 + +	sll	temp1, 4, temp2 +	xor	$1, temp1, $1 + +	ifelse($5,1,{LDPTR	KS2, in4}) +	sll	$4, 3, local2 +	xor	local4, temp2, $2 + +	! reload since used as temporar: + +	ld	[out2+280], out4          ! loop counter + +	srl	$3, 29, local0 +	ifelse($5,1,{add in4, 120, in4}) + +	ifelse($5,1,{LDPTR	KS1, in3}) +	srl	$4, 29, local7 + +	or	local0, local5, $4 +	or	local2, local7, $3 + +}) + + + +! {load_little_endian} +! +! parameter 1  address +! parameter 2  destination left +! parameter 3  destination right +! parameter 4  temporar +! parameter 5  label + +define(load_little_endian, { + +! {load_little_endian} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + +	! first in memory to rightmost in register + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	andcc	$1, 3, global0 +	bne,pn	%icc, $5 +	nop + +	lda	[$1] 0x88, $2 +	add	$1, 4, $4 + +	ba,pt	%icc, $5a +	lda	[$4] 0x88, $3 +#endif + +$5: +	ldub	[$1+3], $2 + +	ldub	[$1+2], $4 +	sll	$2, 8, $2 +	or	$2, $4, $2 + +	ldub	[$1+1], $4 +	sll	$2, 8, $2 +	or	$2, $4, $2 + +	ldub	[$1+0], $4 +	sll	$2, 8, $2 +	or	$2, $4, $2 + + +	ldub	[$1+3+4], $3 + +	ldub	[$1+2+4], $4 +	sll	$3, 8, $3 +	or	$3, $4, $3 + +	ldub	[$1+1+4], $4 +	sll	$3, 8, $3 +	or	$3, $4, $3 + +	ldub	[$1+0+4], $4 +	sll	$3, 8, $3 +	or	$3, $4, $3 +$5a: + +}) + + +! {load_little_endian_inc} +! +! parameter 1  address +! parameter 2  destination left +! parameter 3  destination right +! parameter 4  temporar +! parameter 4  label +! +! adds 8 to address + +define(load_little_endian_inc, { + +! {load_little_endian_inc} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + +	! first in memory to rightmost in register + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	andcc	$1, 3, global0 +	bne,pn	%icc, $5 +	nop + +	lda	[$1] 0x88, $2 +	add	$1, 4, $1 + +	lda	[$1] 0x88, $3 +	ba,pt	%icc, $5a +	add	$1, 4, $1 +#endif + +$5: +	ldub	[$1+3], $2 + +	ldub	[$1+2], $4 +	sll	$2, 8, $2 +	or	$2, $4, $2 + +	ldub	[$1+1], $4 +	sll	$2, 8, $2 +	or	$2, $4, $2 + +	ldub	[$1+0], $4 +	sll	$2, 8, $2 +	or	$2, $4, $2 + +	ldub	[$1+3+4], $3 +	add	$1, 8, $1 + +	ldub	[$1+2+4-8], $4 +	sll	$3, 8, $3 +	or	$3, $4, $3 + +	ldub	[$1+1+4-8], $4 +	sll	$3, 8, $3 +	or	$3, $4, $3 + +	ldub	[$1+0+4-8], $4 +	sll	$3, 8, $3 +	or	$3, $4, $3 +$5a: + +}) + + +! {load_n_bytes} +! +! Loads 1 to 7 bytes little endian +! Remaining bytes are zeroed. +! +! parameter 1  address +! parameter 2  length +! parameter 3  destination register left +! parameter 4  destination register right +! parameter 5  temp +! parameter 6  temp2 +! parameter 7  label +! parameter 8  return label + +define(load_n_bytes, { + +! {load_n_bytes} +! $1 $2 $5 $6 $7 $8 $7 $8 $9 + +$7.0:	call	.+8 +	sll	$2, 2, $6 + +	add	%o7,$7.jmp.table-$7.0,$5 + +	add	$5, $6, $5 +	mov	0, $4 + +	ld	[$5], $5 + +	jmp	%o7+$5 +	mov	0, $3 + +$7.7: +	ldub	[$1+6], $5 +	sll	$5, 16, $5 +	or	$3, $5, $3 +$7.6: +	ldub	[$1+5], $5 +	sll	$5, 8, $5 +	or	$3, $5, $3 +$7.5: +	ldub	[$1+4], $5 +	or	$3, $5, $3 +$7.4: +	ldub	[$1+3], $5 +	sll	$5, 24, $5 +	or	$4, $5, $4 +$7.3: +	ldub	[$1+2], $5 +	sll	$5, 16, $5 +	or	$4, $5, $4 +$7.2: +	ldub	[$1+1], $5 +	sll	$5, 8, $5 +	or	$4, $5, $4 +$7.1: +	ldub	[$1+0], $5 +	ba	$8 +	or	$4, $5, $4 + +	.align 4 + +$7.jmp.table: +	.word	0 +	.word	$7.1-$7.0 +	.word	$7.2-$7.0 +	.word	$7.3-$7.0 +	.word	$7.4-$7.0 +	.word	$7.5-$7.0 +	.word	$7.6-$7.0 +	.word	$7.7-$7.0 +}) + + +! {store_little_endian} +! +! parameter 1  address +! parameter 2  source left +! parameter 3  source right +! parameter 4  temporar + +define(store_little_endian, { + +! {store_little_endian} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + +	! rightmost in register to first in memory + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	andcc	$1, 3, global0 +	bne,pn	%icc, $5 +	nop + +	sta	$2, [$1] 0x88 +	add	$1, 4, $4 + +	ba,pt	%icc, $5a +	sta	$3, [$4] 0x88 +#endif + +$5: +	and	$2, 255, $4 +	stub	$4, [$1+0] + +	srl	$2, 8, $4 +	and	$4, 255, $4 +	stub	$4, [$1+1] + +	srl	$2, 16, $4 +	and	$4, 255, $4 +	stub	$4, [$1+2] + +	srl	$2, 24, $4 +	stub	$4, [$1+3] + + +	and	$3, 255, $4 +	stub	$4, [$1+0+4] + +	srl	$3, 8, $4 +	and	$4, 255, $4 +	stub	$4, [$1+1+4] + +	srl	$3, 16, $4 +	and	$4, 255, $4 +	stub	$4, [$1+2+4] + +	srl	$3, 24, $4 +	stub	$4, [$1+3+4] + +$5a: + +}) + + +! {store_n_bytes} +! +! Stores 1 to 7 bytes little endian +! +! parameter 1  address +! parameter 2  length +! parameter 3  source register left +! parameter 4  source register right +! parameter 5  temp +! parameter 6  temp2 +! parameter 7  label +! parameter 8  return label + +define(store_n_bytes, { + +! {store_n_bytes} +! $1 $2 $5 $6 $7 $8 $7 $8 $9 + +$7.0:	call	.+8 +	sll	$2, 2, $6 + +	add	%o7,$7.jmp.table-$7.0,$5 + +	add	$5, $6, $5 + +	ld	[$5], $5 + +	jmp	%o7+$5 +	nop + +$7.7: +	srl	$3, 16, $5 +	and	$5, 0xff, $5 +	stub	$5, [$1+6] +$7.6: +	srl	$3, 8, $5 +	and	$5, 0xff, $5 +	stub	$5, [$1+5] +$7.5: +	and	$3, 0xff, $5 +	stub	$5, [$1+4] +$7.4: +	srl	$4, 24, $5 +	stub	$5, [$1+3] +$7.3: +	srl	$4, 16, $5 +	and	$5, 0xff, $5 +	stub	$5, [$1+2] +$7.2: +	srl	$4, 8, $5 +	and	$5, 0xff, $5 +	stub	$5, [$1+1] +$7.1: +	and	$4, 0xff, $5 + + +	ba	$8 +	stub	$5, [$1] + +	.align 4 + +$7.jmp.table: + +	.word	0 +	.word	$7.1-$7.0 +	.word	$7.2-$7.0 +	.word	$7.3-$7.0 +	.word	$7.4-$7.0 +	.word	$7.5-$7.0 +	.word	$7.6-$7.0 +	.word	$7.7-$7.0 +}) + + +define(testvalue,{1}) + +define(register_init, { + +! For test purposes: + +	sethi	%hi(testvalue), local0 +	or	local0, %lo(testvalue), local0 + +	ifelse($1,{},{}, {mov	local0, $1}) +	ifelse($2,{},{}, {mov	local0, $2}) +	ifelse($3,{},{}, {mov	local0, $3}) +	ifelse($4,{},{}, {mov	local0, $4}) +	ifelse($5,{},{}, {mov	local0, $5}) +	ifelse($6,{},{}, {mov	local0, $6}) +	ifelse($7,{},{}, {mov	local0, $7}) +	ifelse($8,{},{}, {mov	local0, $8}) + +	mov	local0, local1 +	mov	local0, local2 +	mov	local0, local3 +	mov	local0, local4 +	mov	local0, local5 +	mov	local0, local7 +	mov	local0, local6 +	mov	local0, out0 +	mov	local0, out1 +	mov	local0, out2 +	mov	local0, out3 +	mov	local0, out4 +	mov	local0, out5 +	mov	local0, global1 +	mov	local0, global2 +	mov	local0, global3 +	mov	local0, global4 +	mov	local0, global5 + +}) + +.section	".text" + +	.align 32 + +.des_enc: + +	! key address in3 +	! loads key next encryption/decryption first round from [in4] + +	rounds_macro(in5, out5, 1, .des_enc.1, in3, in4, retl) + + +	.align 32 + +.des_dec: + +	! implemented with out5 as first parameter to avoid +	! register exchange in ede modes + +	! key address in4 +	! loads key next encryption/decryption first round from [in3] + +	rounds_macro(out5, in5, -1, .des_dec.1, in4, in3, retl) + + + +! void DES_encrypt1(data, ks, enc) +! ******************************* + +	.align 32 +	.global DES_encrypt1 +	.type	 DES_encrypt1,#function + +DES_encrypt1: + +	save	%sp, FRAME, %sp + +	sethi	%hi(.PIC.DES_SPtrans-1f),global1 +	or	global1,%lo(.PIC.DES_SPtrans-1f),global1 +1:	call	.+8 +	add	%o7,global1,global1 +	sub	global1,.PIC.DES_SPtrans-.des_and,out2 + +	ld	[in0], in5                ! left +	cmp	in2, 0                    ! enc + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	be,pn	%icc, .encrypt.dec        ! enc/dec +#else +	be	.encrypt.dec +#endif +	ld	[in0+4], out5             ! right + +	! parameter 6  1/2 for include encryption/decryption +	! parameter 7  1 for move in1 to in3 +	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3 + +	ip_macro(in5, out5, in5, out5, in3, 0, 1, 1) + +	rounds_macro(in5, out5, 1, .des_encrypt1.1, in3, in4) ! in4 not used + +	fp_macro(in5, out5, 1)            ! 1 for store to [in0] + +	ret +	restore + +.encrypt.dec: + +	add	in1, 120, in3             ! use last subkey for first round + +	! parameter 6  1/2 for include encryption/decryption +	! parameter 7  1 for move in1 to in3 +	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3 + +	ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include dec,  ks in4 + +	fp_macro(out5, in5, 1)            ! 1 for store to [in0] + +	ret +	restore + +.DES_encrypt1.end: +	.size	 DES_encrypt1,.DES_encrypt1.end-DES_encrypt1 + + +! void DES_encrypt2(data, ks, enc) +!********************************* + +	! encrypts/decrypts without initial/final permutation + +	.align 32 +	.global DES_encrypt2 +	.type	 DES_encrypt2,#function + +DES_encrypt2: + +	save	%sp, FRAME, %sp + +	sethi	%hi(.PIC.DES_SPtrans-1f),global1 +	or	global1,%lo(.PIC.DES_SPtrans-1f),global1 +1:	call	.+8 +	add	%o7,global1,global1 +	sub	global1,.PIC.DES_SPtrans-.des_and,out2 + +	! Set sbox address 1 to 6 and rotate halfs 3 left +	! Errors caught by destest? Yes. Still? *NO* + +	!sethi	%hi(DES_SPtrans), global1 ! address sbox 1 + +	!or	global1, %lo(DES_SPtrans), global1  ! sbox 1 + +	add	global1, 256, global2     ! sbox 2 +	add	global1, 512, global3     ! sbox 3 + +	ld	[in0], out5               ! right +	add	global1, 768, global4     ! sbox 4 +	add	global1, 1024, global5    ! sbox 5 + +	ld	[in0+4], in5              ! left +	add	global1, 1280, local6     ! sbox 6 +	add	global1, 1792, out3       ! sbox 8 + +	! rotate + +	sll	in5, 3, local5 +	mov	in1, in3                  ! key address to in3 + +	sll	out5, 3, local7 +	srl	in5, 29, in5 + +	srl	out5, 29, out5 +	add	in5, local5, in5 + +	add	out5, local7, out5 +	cmp	in2, 0 + +	! we use our own stackframe + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	be,pn	%icc, .encrypt2.dec       ! decryption +#else +	be	.encrypt2.dec +#endif +	STPTR	in0, [%sp+BIAS+ARG0+0*ARGSZ] + +	ld	[in3], out0               ! key 7531 first round +	mov	LOOPS, out4               ! loop counter + +	ld	[in3+4], out1             ! key 8642 first round +	sethi	%hi(0x0000FC00), local5 + +	call .des_enc +	mov	in3, in4 + +	! rotate +	sll	in5, 29, in0 +	srl	in5, 3, in5 +	sll	out5, 29, in1 +	add	in5, in0, in5 +	srl	out5, 3, out5 +	LDPTR	[%sp+BIAS+ARG0+0*ARGSZ], in0 +	add	out5, in1, out5 +	st	in5, [in0] +	st	out5, [in0+4] + +	ret +	restore + + +.encrypt2.dec: + +	add in3, 120, in4 + +	ld	[in4], out0               ! key 7531 first round +	mov	LOOPS, out4               ! loop counter + +	ld	[in4+4], out1             ! key 8642 first round +	sethi	%hi(0x0000FC00), local5 + +	mov	in5, local1               ! left expected in out5 +	mov	out5, in5 + +	call .des_dec +	mov	local1, out5 + +.encrypt2.finish: + +	! rotate +	sll	in5, 29, in0 +	srl	in5, 3, in5 +	sll	out5, 29, in1 +	add	in5, in0, in5 +	srl	out5, 3, out5 +	LDPTR	[%sp+BIAS+ARG0+0*ARGSZ], in0 +	add	out5, in1, out5 +	st	out5, [in0] +	st	in5, [in0+4] + +	ret +	restore + +.DES_encrypt2.end: +	.size	 DES_encrypt2, .DES_encrypt2.end-DES_encrypt2 + + +! void DES_encrypt3(data, ks1, ks2, ks3) +! ************************************** + +	.align 32 +	.global DES_encrypt3 +	.type	 DES_encrypt3,#function + +DES_encrypt3: + +	save	%sp, FRAME, %sp +	 +	sethi	%hi(.PIC.DES_SPtrans-1f),global1 +	or	global1,%lo(.PIC.DES_SPtrans-1f),global1 +1:	call	.+8 +	add	%o7,global1,global1 +	sub	global1,.PIC.DES_SPtrans-.des_and,out2 + +	ld	[in0], in5                ! left +	add	in2, 120, in4             ! ks2 + +	ld	[in0+4], out5             ! right +	mov	in3, in2                  ! save ks3 + +	! parameter 6  1/2 for include encryption/decryption +	! parameter 7  1 for mov in1 to in3 +	! parameter 8  1 for mov in3 to in4 +	! parameter 9  1 for load ks3 and ks2 to in4 and in3 + +	ip_macro(in5, out5, in5, out5, in3, 1, 1, 0, 0) + +	call	.des_dec +	mov	in2, in3                  ! preload ks3 + +	call	.des_enc +	nop + +	fp_macro(in5, out5, 1) + +	ret +	restore + +.DES_encrypt3.end: +	.size	 DES_encrypt3,.DES_encrypt3.end-DES_encrypt3 + + +! void DES_decrypt3(data, ks1, ks2, ks3) +! ************************************** + +	.align 32 +	.global DES_decrypt3 +	.type	 DES_decrypt3,#function + +DES_decrypt3: + +	save	%sp, FRAME, %sp +	 +	sethi	%hi(.PIC.DES_SPtrans-1f),global1 +	or	global1,%lo(.PIC.DES_SPtrans-1f),global1 +1:	call	.+8 +	add	%o7,global1,global1 +	sub	global1,.PIC.DES_SPtrans-.des_and,out2 + +	ld	[in0], in5                ! left +	add	in3, 120, in4             ! ks3 + +	ld	[in0+4], out5             ! right +	mov	in2, in3                  ! ks2 + +	! parameter 6  1/2 for include encryption/decryption +	! parameter 7  1 for mov in1 to in3 +	! parameter 8  1 for mov in3 to in4 +	! parameter 9  1 for load ks3 and ks2 to in4 and in3 + +	ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 0) + +	call	.des_enc +	add	in1, 120, in4             ! preload ks1 + +	call	.des_dec +	nop + +	fp_macro(out5, in5, 1) + +	ret +	restore + +.DES_decrypt3.end: +	.size	 DES_decrypt3,.DES_decrypt3.end-DES_decrypt3 + +! void DES_ncbc_encrypt(input, output, length, schedule, ivec, enc) +! ***************************************************************** + + +	.align 32 +	.global DES_ncbc_encrypt +	.type	 DES_ncbc_encrypt,#function + +DES_ncbc_encrypt: + +	save	%sp, FRAME, %sp +	 +	define({INPUT},  { [%sp+BIAS+ARG0+0*ARGSZ] }) +	define({OUTPUT}, { [%sp+BIAS+ARG0+1*ARGSZ] }) +	define({IVEC},   { [%sp+BIAS+ARG0+4*ARGSZ] }) + +	sethi	%hi(.PIC.DES_SPtrans-1f),global1 +	or	global1,%lo(.PIC.DES_SPtrans-1f),global1 +1:	call	.+8 +	add	%o7,global1,global1 +	sub	global1,.PIC.DES_SPtrans-.des_and,out2 + +	cmp	in5, 0                    ! enc    + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	be,pn	%icc, .ncbc.dec +#else +	be	.ncbc.dec +#endif +	STPTR	in4, IVEC + +	! addr  left  right  temp  label +	load_little_endian(in4, in5, out5, local3, .LLE1)  ! iv + +	addcc	in2, -8, in2              ! bytes missing when first block done + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bl,pn	%icc, .ncbc.enc.seven.or.less +#else +	bl	.ncbc.enc.seven.or.less +#endif +	mov	in3, in4                  ! schedule + +.ncbc.enc.next.block: + +	load_little_endian(in0, out4, global4, local3, .LLE2)  ! block + +.ncbc.enc.next.block_1: + +	xor	in5, out4, in5            ! iv xor +	xor	out5, global4, out5       ! iv xor + +	! parameter 8  1 for move in3 to in4, 2 for move in4 to in3 +	ip_macro(in5, out5, in5, out5, in3, 0, 0, 2) + +.ncbc.enc.next.block_2: + +!//	call .des_enc                     ! compares in2 to 8 +!	rounds inlined for alignment purposes + +	add	global1, 768, global4     ! address sbox 4 since register used below + +	rounds_macro(in5, out5, 1, .ncbc.enc.1, in3, in4) ! include encryption  ks in3 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bl,pn	%icc, .ncbc.enc.next.block_fp +#else +	bl	.ncbc.enc.next.block_fp +#endif +	add	in0, 8, in0               ! input address + +	! If 8 or more bytes are to be encrypted after this block, +	! we combine final permutation for this block with initial +	! permutation for next block. Load next block: + +	load_little_endian(in0, global3, global4, local5, .LLE12) + +	!  parameter 1   original left +	!  parameter 2   original right +	!  parameter 3   left ip +	!  parameter 4   right ip +	!  parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4 +	!                2: mov in4 to in3 +	! +	! also adds -8 to length in2 and loads loop counter to out4 + +	fp_ip_macro(out0, out1, global3, global4, 2) + +	store_little_endian(in1, out0, out1, local3, .SLE10)  ! block + +	ld	[in3], out0               ! key 7531 first round next block +	mov 	in5, local1 +	xor	global3, out5, in5        ! iv xor next block + +	ld	[in3+4], out1             ! key 8642 +	add	global1, 512, global3     ! address sbox 3 since register used +	xor	global4, local1, out5     ! iv xor next block + +	ba	.ncbc.enc.next.block_2 +	add	in1, 8, in1               ! output adress + +.ncbc.enc.next.block_fp: + +	fp_macro(in5, out5) + +	store_little_endian(in1, in5, out5, local3, .SLE1)  ! block + +	addcc   in2, -8, in2              ! bytes missing when next block done + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bpos,pt	%icc, .ncbc.enc.next.block  ! also jumps if 0 +#else +	bpos	.ncbc.enc.next.block +#endif +	add	in1, 8, in1 + +.ncbc.enc.seven.or.less: + +	cmp	in2, -8 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	ble,pt	%icc, .ncbc.enc.finish +#else +	ble	.ncbc.enc.finish +#endif +	nop + +	add	in2, 8, local1            ! bytes to load + +	! addr, length, dest left, dest right, temp, temp2, label, ret label +	load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB1, .ncbc.enc.next.block_1) + +	! Loads 1 to 7 bytes little endian to global4, out4 + + +.ncbc.enc.finish: + +	LDPTR	IVEC, local4 +	store_little_endian(local4, in5, out5, local5, .SLE2)  ! ivec + +	ret +	restore + + +.ncbc.dec: + +	STPTR	in0, INPUT +	cmp	in2, 0                    ! length +	add	in3, 120, in3 + +	LDPTR	IVEC, local7              ! ivec +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	ble,pn	%icc, .ncbc.dec.finish +#else +	ble	.ncbc.dec.finish +#endif +	mov	in3, in4                  ! schedule + +	STPTR	in1, OUTPUT +	mov	in0, local5               ! input + +	load_little_endian(local7, in0, in1, local3, .LLE3)   ! ivec + +.ncbc.dec.next.block: + +	load_little_endian(local5, in5, out5, local3, .LLE4)  ! block + +	! parameter 6  1/2 for include encryption/decryption +	! parameter 7  1 for mov in1 to in3 +	! parameter 8  1 for mov in3 to in4 + +	ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include decryprion  ks in4 + +	fp_macro(out5, in5, 0, 1) ! 1 for input and output address to local5/7 + +	! in2 is bytes left to be stored +	! in2 is compared to 8 in the rounds + +	xor	out5, in0, out4           ! iv xor +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bl,pn	%icc, .ncbc.dec.seven.or.less +#else +	bl	.ncbc.dec.seven.or.less +#endif +	xor	in5, in1, global4         ! iv xor + +	! Load ivec next block now, since input and output address might be the same. + +	load_little_endian_inc(local5, in0, in1, local3, .LLE5)  ! iv + +	store_little_endian(local7, out4, global4, local3, .SLE3) + +	STPTR	local5, INPUT +	add	local7, 8, local7 +	addcc   in2, -8, in2 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bg,pt	%icc, .ncbc.dec.next.block +#else +	bg	.ncbc.dec.next.block +#endif +	STPTR	local7, OUTPUT + + +.ncbc.dec.store.iv: + +	LDPTR	IVEC, local4              ! ivec +	store_little_endian(local4, in0, in1, local5, .SLE4) + +.ncbc.dec.finish: + +	ret +	restore + +.ncbc.dec.seven.or.less: + +	load_little_endian_inc(local5, in0, in1, local3, .LLE13)     ! ivec + +	store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB1, .ncbc.dec.store.iv) + + +.DES_ncbc_encrypt.end: +	.size	 DES_ncbc_encrypt, .DES_ncbc_encrypt.end-DES_ncbc_encrypt + + +! void DES_ede3_cbc_encrypt(input, output, lenght, ks1, ks2, ks3, ivec, enc) +! ************************************************************************** + + +	.align 32 +	.global DES_ede3_cbc_encrypt +	.type	 DES_ede3_cbc_encrypt,#function + +DES_ede3_cbc_encrypt: + +	save	%sp, FRAME, %sp + +	define({KS1}, { [%sp+BIAS+ARG0+3*ARGSZ] }) +	define({KS2}, { [%sp+BIAS+ARG0+4*ARGSZ] }) +	define({KS3}, { [%sp+BIAS+ARG0+5*ARGSZ] }) + +	sethi	%hi(.PIC.DES_SPtrans-1f),global1 +	or	global1,%lo(.PIC.DES_SPtrans-1f),global1 +1:	call	.+8 +	add	%o7,global1,global1 +	sub	global1,.PIC.DES_SPtrans-.des_and,out2 + +	LDPTR	[%fp+BIAS+ARG0+7*ARGSZ], local3          ! enc +	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec +	cmp	local3, 0                 ! enc + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	be,pn	%icc, .ede3.dec +#else +	be	.ede3.dec +#endif +	STPTR	in4, KS2 + +	STPTR	in5, KS3 + +	load_little_endian(local4, in5, out5, local3, .LLE6)  ! ivec + +	addcc	in2, -8, in2              ! bytes missing after next block + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bl,pn	%icc,  .ede3.enc.seven.or.less +#else +	bl	.ede3.enc.seven.or.less +#endif +	STPTR	in3, KS1 + +.ede3.enc.next.block: + +	load_little_endian(in0, out4, global4, local3, .LLE7) + +.ede3.enc.next.block_1: + +	LDPTR	KS2, in4 +	xor	in5, out4, in5            ! iv xor +	xor	out5, global4, out5       ! iv xor + +	LDPTR	KS1, in3 +	add	in4, 120, in4             ! for decryption we use last subkey first +	nop + +	ip_macro(in5, out5, in5, out5, in3) + +.ede3.enc.next.block_2: + +	call .des_enc                     ! ks1 in3 +	nop + +	call .des_dec                     ! ks2 in4 +	LDPTR	KS3, in3 + +	call .des_enc                     ! ks3 in3  compares in2 to 8 +	nop + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bl,pn	%icc, .ede3.enc.next.block_fp +#else +	bl	.ede3.enc.next.block_fp +#endif +	add	in0, 8, in0 + +	! If 8 or more bytes are to be encrypted after this block, +	! we combine final permutation for this block with initial +	! permutation for next block. Load next block: + +	load_little_endian(in0, global3, global4, local5, .LLE11) + +	!  parameter 1   original left +	!  parameter 2   original right +	!  parameter 3   left ip +	!  parameter 4   right ip +	!  parameter 5   1: load ks1/ks2 to in3/in4, add 120 to in4 +	!                2: mov in4 to in3 +	! +	! also adds -8 to length in2 and loads loop counter to out4 + +	fp_ip_macro(out0, out1, global3, global4, 1) + +	store_little_endian(in1, out0, out1, local3, .SLE9)  ! block + +	mov 	in5, local1 +	xor	global3, out5, in5        ! iv xor next block + +	ld	[in3], out0               ! key 7531 +	add	global1, 512, global3     ! address sbox 3 +	xor	global4, local1, out5     ! iv xor next block + +	ld	[in3+4], out1             ! key 8642 +	add	global1, 768, global4     ! address sbox 4 +	ba	.ede3.enc.next.block_2 +	add	in1, 8, in1 + +.ede3.enc.next.block_fp: + +	fp_macro(in5, out5) + +	store_little_endian(in1, in5, out5, local3, .SLE5)  ! block + +	addcc   in2, -8, in2              ! bytes missing when next block done + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bpos,pt	%icc, .ede3.enc.next.block +#else +	bpos	.ede3.enc.next.block +#endif +	add	in1, 8, in1 + +.ede3.enc.seven.or.less: + +	cmp	in2, -8 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	ble,pt	%icc, .ede3.enc.finish +#else +	ble	.ede3.enc.finish +#endif +	nop + +	add	in2, 8, local1            ! bytes to load + +	! addr, length, dest left, dest right, temp, temp2, label, ret label +	load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB2, .ede3.enc.next.block_1) + +.ede3.enc.finish: + +	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec +	store_little_endian(local4, in5, out5, local5, .SLE6)  ! ivec + +	ret +	restore + +.ede3.dec: + +	STPTR	in0, INPUT +	add	in5, 120, in5 + +	STPTR	in1, OUTPUT +	mov	in0, local5 +	add	in3, 120, in3 + +	STPTR	in3, KS1 +	cmp	in2, 0 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	ble	%icc, .ede3.dec.finish +#else +	ble	.ede3.dec.finish +#endif +	STPTR	in5, KS3 + +	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local7          ! iv +	load_little_endian(local7, in0, in1, local3, .LLE8) + +.ede3.dec.next.block: + +	load_little_endian(local5, in5, out5, local3, .LLE9) + +	! parameter 6  1/2 for include encryption/decryption +	! parameter 7  1 for mov in1 to in3 +	! parameter 8  1 for mov in3 to in4 +	! parameter 9  1 for load ks3 and ks2 to in4 and in3 + +	ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 1) ! inc .des_dec ks3 in4 + +	call .des_enc                     ! ks2 in3 +	LDPTR	KS1, in4 + +	call .des_dec                     ! ks1 in4 +	nop + +	fp_macro(out5, in5, 0, 1)   ! 1 for input and output address local5/7 + +	! in2 is bytes left to be stored +	! in2 is compared to 8 in the rounds + +	xor	out5, in0, out4 +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bl,pn	%icc, .ede3.dec.seven.or.less +#else +	bl	.ede3.dec.seven.or.less +#endif +	xor	in5, in1, global4 + +	load_little_endian_inc(local5, in0, in1, local3, .LLE10)   ! iv next block + +	store_little_endian(local7, out4, global4, local3, .SLE7)  ! block + +	STPTR	local5, INPUT +	addcc   in2, -8, in2 +	add	local7, 8, local7 + +#ifdef OPENSSL_SYSNAME_ULTRASPARC +	bg,pt	%icc, .ede3.dec.next.block +#else +	bg	.ede3.dec.next.block +#endif +	STPTR	local7, OUTPUT + +.ede3.dec.store.iv: + +	LDPTR	[%fp+BIAS+ARG0+6*ARGSZ], local4          ! ivec +	store_little_endian(local4, in0, in1, local5, .SLE8)  ! ivec + +.ede3.dec.finish: + +	ret +	restore + +.ede3.dec.seven.or.less: + +	load_little_endian_inc(local5, in0, in1, local3, .LLE14)     ! iv + +	store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB2, .ede3.dec.store.iv) + + +.DES_ede3_cbc_encrypt.end: +	.size	 DES_ede3_cbc_encrypt,.DES_ede3_cbc_encrypt.end-DES_ede3_cbc_encrypt + +	.align	256 +	.type	 .des_and,#object +	.size	 .des_and,284 + +.des_and: + +! This table is used for AND 0xFC when it is known that register +! bits 8-31 are zero. Makes it possible to do three arithmetic +! operations in one cycle. + +	.byte  0, 0, 0, 0, 4, 4, 4, 4 +	.byte  8, 8, 8, 8, 12, 12, 12, 12 +	.byte  16, 16, 16, 16, 20, 20, 20, 20 +	.byte  24, 24, 24, 24, 28, 28, 28, 28 +	.byte  32, 32, 32, 32, 36, 36, 36, 36 +	.byte  40, 40, 40, 40, 44, 44, 44, 44 +	.byte  48, 48, 48, 48, 52, 52, 52, 52 +	.byte  56, 56, 56, 56, 60, 60, 60, 60 +	.byte  64, 64, 64, 64, 68, 68, 68, 68 +	.byte  72, 72, 72, 72, 76, 76, 76, 76 +	.byte  80, 80, 80, 80, 84, 84, 84, 84 +	.byte  88, 88, 88, 88, 92, 92, 92, 92 +	.byte  96, 96, 96, 96, 100, 100, 100, 100 +	.byte  104, 104, 104, 104, 108, 108, 108, 108 +	.byte  112, 112, 112, 112, 116, 116, 116, 116 +	.byte  120, 120, 120, 120, 124, 124, 124, 124 +	.byte  128, 128, 128, 128, 132, 132, 132, 132 +	.byte  136, 136, 136, 136, 140, 140, 140, 140 +	.byte  144, 144, 144, 144, 148, 148, 148, 148 +	.byte  152, 152, 152, 152, 156, 156, 156, 156 +	.byte  160, 160, 160, 160, 164, 164, 164, 164 +	.byte  168, 168, 168, 168, 172, 172, 172, 172 +	.byte  176, 176, 176, 176, 180, 180, 180, 180 +	.byte  184, 184, 184, 184, 188, 188, 188, 188 +	.byte  192, 192, 192, 192, 196, 196, 196, 196 +	.byte  200, 200, 200, 200, 204, 204, 204, 204 +	.byte  208, 208, 208, 208, 212, 212, 212, 212 +	.byte  216, 216, 216, 216, 220, 220, 220, 220 +	.byte  224, 224, 224, 224, 228, 228, 228, 228 +	.byte  232, 232, 232, 232, 236, 236, 236, 236 +	.byte  240, 240, 240, 240, 244, 244, 244, 244 +	.byte  248, 248, 248, 248, 252, 252, 252, 252 + +	! 5 numbers for initil/final permutation + +	.word   0x0f0f0f0f                ! offset 256 +	.word	0x0000ffff                ! 260 +	.word	0x33333333                ! 264 +	.word	0x00ff00ff                ! 268 +	.word	0x55555555                ! 272 + +	.word	0                         ! 276 +	.word	LOOPS                     ! 280 +	.word	0x0000FC00                ! 284 + +	.global	DES_SPtrans +	.type	DES_SPtrans,#object +	.size	DES_SPtrans,2048 +.align	64 +DES_SPtrans: +.PIC.DES_SPtrans: +	! nibble 0 +	.word	0x02080800, 0x00080000, 0x02000002, 0x02080802 +	.word	0x02000000, 0x00080802, 0x00080002, 0x02000002 +	.word	0x00080802, 0x02080800, 0x02080000, 0x00000802 +	.word	0x02000802, 0x02000000, 0x00000000, 0x00080002 +	.word	0x00080000, 0x00000002, 0x02000800, 0x00080800 +	.word	0x02080802, 0x02080000, 0x00000802, 0x02000800 +	.word	0x00000002, 0x00000800, 0x00080800, 0x02080002 +	.word	0x00000800, 0x02000802, 0x02080002, 0x00000000 +	.word	0x00000000, 0x02080802, 0x02000800, 0x00080002 +	.word	0x02080800, 0x00080000, 0x00000802, 0x02000800 +	.word	0x02080002, 0x00000800, 0x00080800, 0x02000002 +	.word	0x00080802, 0x00000002, 0x02000002, 0x02080000 +	.word	0x02080802, 0x00080800, 0x02080000, 0x02000802 +	.word	0x02000000, 0x00000802, 0x00080002, 0x00000000 +	.word	0x00080000, 0x02000000, 0x02000802, 0x02080800 +	.word	0x00000002, 0x02080002, 0x00000800, 0x00080802 +	! nibble 1 +	.word	0x40108010, 0x00000000, 0x00108000, 0x40100000 +	.word	0x40000010, 0x00008010, 0x40008000, 0x00108000 +	.word	0x00008000, 0x40100010, 0x00000010, 0x40008000 +	.word	0x00100010, 0x40108000, 0x40100000, 0x00000010 +	.word	0x00100000, 0x40008010, 0x40100010, 0x00008000 +	.word	0x00108010, 0x40000000, 0x00000000, 0x00100010 +	.word	0x40008010, 0x00108010, 0x40108000, 0x40000010 +	.word	0x40000000, 0x00100000, 0x00008010, 0x40108010 +	.word	0x00100010, 0x40108000, 0x40008000, 0x00108010 +	.word	0x40108010, 0x00100010, 0x40000010, 0x00000000 +	.word	0x40000000, 0x00008010, 0x00100000, 0x40100010 +	.word	0x00008000, 0x40000000, 0x00108010, 0x40008010 +	.word	0x40108000, 0x00008000, 0x00000000, 0x40000010 +	.word	0x00000010, 0x40108010, 0x00108000, 0x40100000 +	.word	0x40100010, 0x00100000, 0x00008010, 0x40008000 +	.word	0x40008010, 0x00000010, 0x40100000, 0x00108000 +	! nibble 2 +	.word	0x04000001, 0x04040100, 0x00000100, 0x04000101 +	.word	0x00040001, 0x04000000, 0x04000101, 0x00040100 +	.word	0x04000100, 0x00040000, 0x04040000, 0x00000001 +	.word	0x04040101, 0x00000101, 0x00000001, 0x04040001 +	.word	0x00000000, 0x00040001, 0x04040100, 0x00000100 +	.word	0x00000101, 0x04040101, 0x00040000, 0x04000001 +	.word	0x04040001, 0x04000100, 0x00040101, 0x04040000 +	.word	0x00040100, 0x00000000, 0x04000000, 0x00040101 +	.word	0x04040100, 0x00000100, 0x00000001, 0x00040000 +	.word	0x00000101, 0x00040001, 0x04040000, 0x04000101 +	.word	0x00000000, 0x04040100, 0x00040100, 0x04040001 +	.word	0x00040001, 0x04000000, 0x04040101, 0x00000001 +	.word	0x00040101, 0x04000001, 0x04000000, 0x04040101 +	.word	0x00040000, 0x04000100, 0x04000101, 0x00040100 +	.word	0x04000100, 0x00000000, 0x04040001, 0x00000101 +	.word	0x04000001, 0x00040101, 0x00000100, 0x04040000 +	! nibble 3 +	.word	0x00401008, 0x10001000, 0x00000008, 0x10401008 +	.word	0x00000000, 0x10400000, 0x10001008, 0x00400008 +	.word	0x10401000, 0x10000008, 0x10000000, 0x00001008 +	.word	0x10000008, 0x00401008, 0x00400000, 0x10000000 +	.word	0x10400008, 0x00401000, 0x00001000, 0x00000008 +	.word	0x00401000, 0x10001008, 0x10400000, 0x00001000 +	.word	0x00001008, 0x00000000, 0x00400008, 0x10401000 +	.word	0x10001000, 0x10400008, 0x10401008, 0x00400000 +	.word	0x10400008, 0x00001008, 0x00400000, 0x10000008 +	.word	0x00401000, 0x10001000, 0x00000008, 0x10400000 +	.word	0x10001008, 0x00000000, 0x00001000, 0x00400008 +	.word	0x00000000, 0x10400008, 0x10401000, 0x00001000 +	.word	0x10000000, 0x10401008, 0x00401008, 0x00400000 +	.word	0x10401008, 0x00000008, 0x10001000, 0x00401008 +	.word	0x00400008, 0x00401000, 0x10400000, 0x10001008 +	.word	0x00001008, 0x10000000, 0x10000008, 0x10401000 +	! nibble 4 +	.word	0x08000000, 0x00010000, 0x00000400, 0x08010420 +	.word	0x08010020, 0x08000400, 0x00010420, 0x08010000 +	.word	0x00010000, 0x00000020, 0x08000020, 0x00010400 +	.word	0x08000420, 0x08010020, 0x08010400, 0x00000000 +	.word	0x00010400, 0x08000000, 0x00010020, 0x00000420 +	.word	0x08000400, 0x00010420, 0x00000000, 0x08000020 +	.word	0x00000020, 0x08000420, 0x08010420, 0x00010020 +	.word	0x08010000, 0x00000400, 0x00000420, 0x08010400 +	.word	0x08010400, 0x08000420, 0x00010020, 0x08010000 +	.word	0x00010000, 0x00000020, 0x08000020, 0x08000400 +	.word	0x08000000, 0x00010400, 0x08010420, 0x00000000 +	.word	0x00010420, 0x08000000, 0x00000400, 0x00010020 +	.word	0x08000420, 0x00000400, 0x00000000, 0x08010420 +	.word	0x08010020, 0x08010400, 0x00000420, 0x00010000 +	.word	0x00010400, 0x08010020, 0x08000400, 0x00000420 +	.word	0x00000020, 0x00010420, 0x08010000, 0x08000020 +	! nibble 5 +	.word	0x80000040, 0x00200040, 0x00000000, 0x80202000 +	.word	0x00200040, 0x00002000, 0x80002040, 0x00200000 +	.word	0x00002040, 0x80202040, 0x00202000, 0x80000000 +	.word	0x80002000, 0x80000040, 0x80200000, 0x00202040 +	.word	0x00200000, 0x80002040, 0x80200040, 0x00000000 +	.word	0x00002000, 0x00000040, 0x80202000, 0x80200040 +	.word	0x80202040, 0x80200000, 0x80000000, 0x00002040 +	.word	0x00000040, 0x00202000, 0x00202040, 0x80002000 +	.word	0x00002040, 0x80000000, 0x80002000, 0x00202040 +	.word	0x80202000, 0x00200040, 0x00000000, 0x80002000 +	.word	0x80000000, 0x00002000, 0x80200040, 0x00200000 +	.word	0x00200040, 0x80202040, 0x00202000, 0x00000040 +	.word	0x80202040, 0x00202000, 0x00200000, 0x80002040 +	.word	0x80000040, 0x80200000, 0x00202040, 0x00000000 +	.word	0x00002000, 0x80000040, 0x80002040, 0x80202000 +	.word	0x80200000, 0x00002040, 0x00000040, 0x80200040 +	! nibble 6 +	.word	0x00004000, 0x00000200, 0x01000200, 0x01000004 +	.word	0x01004204, 0x00004004, 0x00004200, 0x00000000 +	.word	0x01000000, 0x01000204, 0x00000204, 0x01004000 +	.word	0x00000004, 0x01004200, 0x01004000, 0x00000204 +	.word	0x01000204, 0x00004000, 0x00004004, 0x01004204 +	.word	0x00000000, 0x01000200, 0x01000004, 0x00004200 +	.word	0x01004004, 0x00004204, 0x01004200, 0x00000004 +	.word	0x00004204, 0x01004004, 0x00000200, 0x01000000 +	.word	0x00004204, 0x01004000, 0x01004004, 0x00000204 +	.word	0x00004000, 0x00000200, 0x01000000, 0x01004004 +	.word	0x01000204, 0x00004204, 0x00004200, 0x00000000 +	.word	0x00000200, 0x01000004, 0x00000004, 0x01000200 +	.word	0x00000000, 0x01000204, 0x01000200, 0x00004200 +	.word	0x00000204, 0x00004000, 0x01004204, 0x01000000 +	.word	0x01004200, 0x00000004, 0x00004004, 0x01004204 +	.word	0x01000004, 0x01004200, 0x01004000, 0x00004004 +	! nibble 7 +	.word	0x20800080, 0x20820000, 0x00020080, 0x00000000 +	.word	0x20020000, 0x00800080, 0x20800000, 0x20820080 +	.word	0x00000080, 0x20000000, 0x00820000, 0x00020080 +	.word	0x00820080, 0x20020080, 0x20000080, 0x20800000 +	.word	0x00020000, 0x00820080, 0x00800080, 0x20020000 +	.word	0x20820080, 0x20000080, 0x00000000, 0x00820000 +	.word	0x20000000, 0x00800000, 0x20020080, 0x20800080 +	.word	0x00800000, 0x00020000, 0x20820000, 0x00000080 +	.word	0x00800000, 0x00020000, 0x20000080, 0x20820080 +	.word	0x00020080, 0x20000000, 0x00000000, 0x00820000 +	.word	0x20800080, 0x20020080, 0x20020000, 0x00800080 +	.word	0x20820000, 0x00000080, 0x00800080, 0x20020000 +	.word	0x20820080, 0x00800000, 0x20800000, 0x20000080 +	.word	0x00820000, 0x00020080, 0x20020080, 0x20800000 +	.word	0x00000080, 0x20820000, 0x00820080, 0x00000000 +	.word	0x20000000, 0x20800080, 0x00020000, 0x00820080 + diff --git a/main/openssl/crypto/des/asm/desboth.pl b/main/openssl/crypto/des/asm/desboth.pl new file mode 100644 index 00000000..eec00886 --- /dev/null +++ b/main/openssl/crypto/des/asm/desboth.pl @@ -0,0 +1,79 @@ +#!/usr/local/bin/perl + +$L="edi"; +$R="esi"; + +sub DES_encrypt3 +	{ +	local($name,$enc)=@_; + +	&function_begin_B($name,""); +	&push("ebx"); +	&mov("ebx",&wparam(0)); + +	&push("ebp"); +	&push("esi"); + +	&push("edi"); + +	&comment(""); +	&comment("Load the data words"); +	&mov($L,&DWP(0,"ebx","",0)); +	&mov($R,&DWP(4,"ebx","",0)); +	&stack_push(3); + +	&comment(""); +	&comment("IP"); +	&IP_new($L,$R,"edx",0); + +	# put them back +	 +	if ($enc) +		{ +		&mov(&DWP(4,"ebx","",0),$R); +		 &mov("eax",&wparam(1)); +		&mov(&DWP(0,"ebx","",0),"edx"); +		 &mov("edi",&wparam(2)); +		 &mov("esi",&wparam(3)); +		} +	else +		{ +		&mov(&DWP(4,"ebx","",0),$R); +		 &mov("esi",&wparam(1)); +		&mov(&DWP(0,"ebx","",0),"edx"); +		 &mov("edi",&wparam(2)); +		 &mov("eax",&wparam(3)); +		} +	&mov(&swtmp(2),	(DWC(($enc)?"1":"0"))); +	&mov(&swtmp(1),	"eax"); +	&mov(&swtmp(0),	"ebx"); +	&call("DES_encrypt2"); +	&mov(&swtmp(2),	(DWC(($enc)?"0":"1"))); +	&mov(&swtmp(1),	"edi"); +	&mov(&swtmp(0),	"ebx"); +	&call("DES_encrypt2"); +	&mov(&swtmp(2),	(DWC(($enc)?"1":"0"))); +	&mov(&swtmp(1),	"esi"); +	&mov(&swtmp(0),	"ebx"); +	&call("DES_encrypt2"); + +	&stack_pop(3); +	&mov($L,&DWP(0,"ebx","",0)); +	&mov($R,&DWP(4,"ebx","",0)); + +	&comment(""); +	&comment("FP"); +	&FP_new($L,$R,"eax",0); + +	&mov(&DWP(0,"ebx","",0),"eax"); +	&mov(&DWP(4,"ebx","",0),$R); + +	&pop("edi"); +	&pop("esi"); +	&pop("ebp"); +	&pop("ebx"); +	&ret(); +	&function_end_B($name); +	} + + diff --git a/main/openssl/crypto/des/asm/readme b/main/openssl/crypto/des/asm/readme new file mode 100644 index 00000000..1beafe25 --- /dev/null +++ b/main/openssl/crypto/des/asm/readme @@ -0,0 +1,131 @@ +First up, let me say I don't like writing in assembler.  It is not portable, +dependant on the particular CPU architecture release and is generally a pig +to debug and get right.  Having said that, the x86 architecture is probably +the most important for speed due to number of boxes and since +it appears to be the worst architecture to to get +good C compilers for.  So due to this, I have lowered myself to do +assembler for the inner DES routines in libdes :-). + +The file to implement in assembler is des_enc.c.  Replace the following +4 functions +des_encrypt1(DES_LONG data[2],des_key_schedule ks, int encrypt); +des_encrypt2(DES_LONG data[2],des_key_schedule ks, int encrypt); +des_encrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); +des_decrypt3(DES_LONG data[2],des_key_schedule ks1,ks2,ks3); + +They encrypt/decrypt the 64 bits held in 'data' using +the 'ks' key schedules.   The only difference between the 4 functions is that +des_encrypt2() does not perform IP() or FP() on the data (this is an +optimization for when doing triple DES and des_encrypt3() and des_decrypt3() +perform triple des.  The triple DES routines are in here because it does +make a big difference to have them located near the des_encrypt2 function +at link time.. + +Now as we all know, there are lots of different operating systems running on +x86 boxes, and unfortunately they normally try to make sure their assembler +formating is not the same as the other peoples. +The 4 main formats I know of are +Microsoft	Windows 95/Windows NT +Elf		Includes Linux and FreeBSD(?). +a.out		The older Linux. +Solaris		Same as Elf but different comments :-(. + +Now I was not overly keen to write 4 different copies of the same code, +so I wrote a few perl routines to output the correct assembler, given +a target assembler type.  This code is ugly and is just a hack. +The libraries are x86unix.pl and x86ms.pl. +des586.pl, des686.pl and des-som[23].pl are the programs to actually +generate the assembler. + +So to generate elf assembler +perl des-som3.pl elf >dx86-elf.s +For Windows 95/NT +perl des-som2.pl win32 >win32.asm + +[ update 4 Jan 1996 ] +I have added another way to do things. +perl des-som3.pl cpp >dx86-cpp.s +generates a file that will be included by dx86unix.cpp when it is compiled. +To build for elf, a.out, solaris, bsdi etc, +cc -E -DELF asm/dx86unix.cpp | as -o asm/dx86-elf.o +cc -E -DSOL asm/dx86unix.cpp | as -o asm/dx86-sol.o +cc -E -DOUT asm/dx86unix.cpp | as -o asm/dx86-out.o +cc -E -DBSDI asm/dx86unix.cpp | as -o asm/dx86bsdi.o +This was done to cut down the number of files in the distribution. + +Now the ugly part.  I acquired my copy of Intels +"Optimization's For Intel's 32-Bit Processors" and found a few interesting +things.  First, the aim of the exersize is to 'extract' one byte at a time +from a word and do an array lookup.  This involves getting the byte from +the 4 locations in the word and moving it to a new word and doing the lookup. +The most obvious way to do this is +xor	eax,	eax				# clear word +movb	al,	cl				# get low byte +xor	edi	DWORD PTR 0x100+des_SP[eax] 	# xor in word +movb	al,	ch				# get next byte +xor	edi	DWORD PTR 0x300+des_SP[eax] 	# xor in word +shr	ecx	16 +which seems ok.  For the pentium, this system appears to be the best. +One has to do instruction interleaving to keep both functional units +operating, but it is basically very efficient. + +Now the crunch.  When a full register is used after a partial write, eg. +mov	al,	cl +xor	edi,	DWORD PTR 0x100+des_SP[eax] +386	- 1 cycle stall +486	- 1 cycle stall +586	- 0 cycle stall +686	- at least 7 cycle stall (page 22 of the above mentioned document). + +So the technique that produces the best results on a pentium, according to +the documentation, will produce hideous results on a pentium pro. + +To get around this, des686.pl will generate code that is not as fast on +a pentium, should be very good on a pentium pro. +mov	eax,	ecx				# copy word  +shr	ecx,	8				# line up next byte +and	eax,	0fch				# mask byte +xor	edi	DWORD PTR 0x100+des_SP[eax] 	# xor in array lookup +mov	eax,	ecx				# get word +shr	ecx	8				# line up next byte +and	eax,	0fch				# mask byte +xor	edi	DWORD PTR 0x300+des_SP[eax] 	# xor in array lookup + +Due to the execution units in the pentium, this actually works quite well. +For a pentium pro it should be very good.  This is the type of output +Visual C++ generates. + +There is a third option.  instead of using +mov	al,	ch +which is bad on the pentium pro, one may be able to use +movzx	eax,	ch +which may not incur the partial write penalty.  On the pentium, +this instruction takes 4 cycles so is not worth using but on the +pentium pro it appears it may be worth while.  I need access to one to +experiment :-). + +eric (20 Oct 1996) + +22 Nov 1996 - I have asked people to run the 2 different version on pentium +pros and it appears that the intel documentation is wrong.  The +mov al,bh is still faster on a pentium pro, so just use the des586.pl +install des686.pl + +3 Dec 1996 - I added des_encrypt3/des_decrypt3 because I have moved these +functions into des_enc.c because it does make a massive performance +difference on some boxes to have the functions code located close to +the des_encrypt2() function. + +9 Jan 1997 - des-som2.pl is now the correct perl script to use for +pentiums.  It contains an inner loop from +Svend Olaf Mikkelsen <svolaf@inet.uni-c.dk> which does raw ecb DES calls at +273,000 per second.  He had a previous version at 250,000 and the best +I was able to get was 203,000.  The content has not changed, this is all +due to instruction sequencing (and actual instructions choice) which is able +to keep both functional units of the pentium going. +We may have lost the ugly register usage restrictions when x86 went 32 bit +but for the pentium it has been replaced by evil instruction ordering tricks. + +13 Jan 1997 - des-som3.pl, more optimizations from Svend Olaf. +raw DES at 281,000 per second on a pentium 100. + | 
