author    Arne Schwabe <arne@rfc2549.org>  2012-04-16 19:21:14 +0200
committer Arne Schwabe <arne@rfc2549.org>  2012-04-16 19:21:14 +0200
commit    3e4d8f433239c40311037616b1b8833a06651ae0 (patch)
tree      98ab7fce0d011d34677b0beb762d389cb5c39199 /openssl/crypto/bn/asm/alpha-mont.pl
Initial import
Diffstat (limited to 'openssl/crypto/bn/asm/alpha-mont.pl')
-rw-r--r--   openssl/crypto/bn/asm/alpha-mont.pl   321
1 file changed, 321 insertions, 0 deletions
diff --git a/openssl/crypto/bn/asm/alpha-mont.pl b/openssl/crypto/bn/asm/alpha-mont.pl
new file mode 100644
index 00000000..03596e20
--- /dev/null
+++ b/openssl/crypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,321 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264, RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096-bit key lengths. This is measured against code
+# produced by the vendor compiler with '-tune host' and its in-line
+# assembler. Other benchmarks improve by 15-20%. As an external
+# reference point, the code delivers approximately the same performance
+# per GHz as AMD64: comparing a 1GHz 21264 with a 2GHz Opteron, you
+# will observe roughly a 2x difference.
+
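+# bn_mul_mont computes rp[] = ap[] * bp[] * R^-1 mod np[], where
+# R = 2^(64*num) and n0 points to the single precomputed word
+# -np[0]^-1 mod 2^64.  It returns 1 on success and 0 when num < 4,
+# so the caller can fall back to generic code.
+#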
+# int bn_mul_mont(
+$rp="a0"; # BN_ULONG *rp,
+$ap="a1"; # const BN_ULONG *ap,
+$bp="a2"; # const BN_ULONG *bp,
+$np="a3"; # const BN_ULONG *np,
+$n0="a4"; # const BN_ULONG *n0,
+$num="a5"; # int num);
+
+$lo0="t0";
+$hi0="t1";
+$lo1="t2";
+$hi1="t3";
+$aj="t4";
+$bi="t5";
+$nj="t6";
+$tp="t7";
+$alo="t8";
+$ahi="t9";
+$nlo="t10";
+$nhi="t11";
+$tj="t12";
+$i="s3";
+$j="s4";
+$m1="s5";
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set noat
+.set noreorder
+
+.globl bn_mul_mont
+.align 5
+.ent bn_mul_mont
+bn_mul_mont:
+ lda sp,-48(sp)
+ stq ra,0(sp)
+ stq s3,8(sp)
+ stq s4,16(sp)
+ stq s5,24(sp)
+ stq fp,32(sp)
+ mov sp,fp
+ .mask 0x0400f000,-48
+ .frame fp,48,ra
+ .prologue 0
+
+ .align 4
+ .set reorder
+ sextl $num,$num
+ mov 0,v0
+ cmplt $num,4,AT
+ bne AT,.Lexit
+
+ ldq $hi0,0($ap) # ap[0]
+ s8addq $num,16,AT
+ ldq $aj,8($ap)
+ subq sp,AT,sp
+ ldq $bi,0($bp) # bp[0]
+ lda AT,-4096(zero) # mov -4096,AT
+ ldq $n0,0($n0)
+ and sp,AT,sp
+
+ mulq $hi0,$bi,$lo0
+ ldq $hi1,0($np) # np[0]
+ umulh $hi0,$bi,$hi0
+ ldq $nj,8($np)
+
+ mulq $lo0,$n0,$m1
+
+ mulq $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ addq $hi1,AT,$hi1
+
+ mulq $aj,$bi,$alo
+ mov 2,$j
+ umulh $aj,$bi,$ahi
+ mov sp,$tp
+
+ mulq $nj,$m1,$nlo
+ s8addq $j,$ap,$aj
+ umulh $nj,$m1,$nhi
+ s8addq $j,$np,$nj
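+
+/* .L1st: first outer pass: accumulate ap[]*bp[0] + np[]*m1 column by
+ * column into tp[], where m1 = (ap[0]*bp[0])*n0 mod 2^64 was chosen
+ * above so that the lowest word of the running sum cancels. */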
+.align 4
+.L1st:
+ .set noreorder
+ ldq $aj,0($aj)
+ addl $j,1,$j
+ ldq $nj,0($nj)
+ lda $tp,8($tp)
+
+ addq $alo,$hi0,$lo0
+ mulq $aj,$bi,$alo
+ cmpult $lo0,$hi0,AT
+ addq $nlo,$hi1,$lo1
+
+ mulq $nj,$m1,$nlo
+ addq $ahi,AT,$hi0
+ cmpult $lo1,$hi1,v0
+ cmplt $j,$num,$tj
+
+ umulh $aj,$bi,$ahi
+ addq $nhi,v0,$hi1
+ addq $lo1,$lo0,$lo1
+ s8addq $j,$ap,$aj
+
+ umulh $nj,$m1,$nhi
+ cmpult $lo1,$lo0,v0
+ addq $hi1,v0,$hi1
+ s8addq $j,$np,$nj
+
+ stq $lo1,-8($tp)
+ nop
+ unop
+ bne $tj,.L1st
+ .set reorder
+
+ addq $alo,$hi0,$lo0
+ addq $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addq $ahi,AT,$hi0
+ addq $nhi,v0,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,v0
+ addq $hi1,v0,$hi1
+
+ stq $lo1,0($tp)
+
+ addq $hi1,$hi0,$hi1
+ cmpult $hi1,$hi0,AT
+ stq $hi1,8($tp)
+ stq AT,16($tp)
+
+ mov 1,$i
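+
+/* Outer loop, i = 1..num-1: fold ap[]*bp[i] and np[]*m1 into the
+ * previous tp[] and drop the (now zero) low word, i.e. compute
+ * tp = (tp + ap*bp[i] + np*m1) / 2^64 with a fresh m1 each pass. */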
+.align 4
+.Louter:
+ s8addq $i,$bp,$bi
+ ldq $hi0,0($ap)
+ ldq $aj,8($ap)
+ ldq $bi,0($bi)
+ ldq $hi1,0($np)
+ ldq $nj,8($np)
+ ldq $tj,0(sp)
+
+ mulq $hi0,$bi,$lo0
+ umulh $hi0,$bi,$hi0
+
+ addq $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addq $hi0,AT,$hi0
+
+ mulq $lo0,$n0,$m1
+
+ mulq $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ mov 2,$j
+ addq $hi1,AT,$hi1
+
+ mulq $aj,$bi,$alo
+ mov sp,$tp
+ umulh $aj,$bi,$ahi
+
+ mulq $nj,$m1,$nlo
+ s8addq $j,$ap,$aj
+ umulh $nj,$m1,$nhi
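+
+/* .Linner: same column-by-column accumulation as .L1st, except that
+ * the previous pass's tp[j], read one slot ahead via tj, is folded
+ * into the running sum as well. */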
+.align 4
+.Linner:
+ .set noreorder
+ ldq $tj,8($tp) #L0
+ nop #U1
+ ldq $aj,0($aj) #L1
+ s8addq $j,$np,$nj #U0
+
+ ldq $nj,0($nj) #L0
+ nop #U1
+ addq $alo,$hi0,$lo0 #L1
+ lda $tp,8($tp)
+
+ mulq $aj,$bi,$alo #U1
+ cmpult $lo0,$hi0,AT #L0
+ addq $nlo,$hi1,$lo1 #L1
+ addl $j,1,$j
+
+ mulq $nj,$m1,$nlo #U1
+ addq $ahi,AT,$hi0 #L0
+ addq $lo0,$tj,$lo0 #L1
+ cmpult $lo1,$hi1,v0 #U0
+
+ umulh $aj,$bi,$ahi #U1
+ cmpult $lo0,$tj,AT #L0
+ addq $lo1,$lo0,$lo1 #L1
+ addq $nhi,v0,$hi1 #U0
+
+ umulh $nj,$m1,$nhi #U1
+ s8addq $j,$ap,$aj #L0
+ cmpult $lo1,$lo0,v0 #L1
+ cmplt $j,$num,$tj #U0 # borrow $tj
+
+ addq $hi0,AT,$hi0 #L0
+ addq $hi1,v0,$hi1 #U1
+ stq $lo1,-8($tp) #L1
+ bne $tj,.Linner #U0
+ .set reorder
+
+ ldq $tj,8($tp)
+ addq $alo,$hi0,$lo0
+ addq $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addq $ahi,AT,$hi0
+ addq $nhi,v0,$hi1
+
+ addq $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addq $hi0,AT,$hi0
+
+ ldq $tj,16($tp)
+ addq $lo1,$lo0,$j
+ cmpult $j,$lo0,v0
+ addq $hi1,v0,$hi1
+
+ addq $hi1,$hi0,$lo1
+ stq $j,0($tp)
+ cmpult $lo1,$hi0,$hi1
+ addq $lo1,$tj,$lo1
+ cmpult $lo1,$tj,AT
+ addl $i,1,$i
+ addq $hi1,AT,$hi1
+ stq $lo1,8($tp)
+ cmplt $i,$num,$tj # borrow $tj
+ stq $hi1,16($tp)
+ bne $tj,.Louter
+
+ s8addq $num,sp,$tj # &tp[num]
+ mov $rp,$bp # put rp aside
+ mov sp,$tp
+ mov sp,$ap
+ mov 0,$hi0 # clear borrow bit
+
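+/* Final reduction: subtract np[] from tp[]; the resulting borrow,
+ * combined with the top carry word, decides below whether tp[] or
+ * the difference is the answer. */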
+.align 4
+.Lsub: ldq $lo0,0($tp)
+ ldq $lo1,0($np)
+ lda $tp,8($tp)
+ lda $np,8($np)
+ subq $lo0,$lo1,$lo1 # tp[i]-np[i]
+ cmpult $lo0,$lo1,AT
+ subq $lo1,$hi0,$lo0
+ cmpult $lo1,$lo0,$hi0
+ or $hi0,AT,$hi0
+ stq $lo0,0($rp)
+ cmpult $tp,$tj,v0
+ lda $rp,8($rp)
+ bne v0,.Lsub
+
+ subq $hi1,$hi0,$hi0 # handle upmost overflow bit
+ mov sp,$tp
+ mov $bp,$rp # restore rp
+
+ and sp,$hi0,$ap
+ bic $bp,$hi0,$bp
+ bis $bp,$ap,$ap # ap=borrow?tp:rp
+
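+/* Copy the selected value out to rp[] and wipe the temporary tp[]
+ * on the stack as it is read. */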
+.align 4
+.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
+ lda $tp,8($tp)
+ lda $rp,8($rp)
+ lda $ap,8($ap)
+ stq zero,-8($tp) # zap tp
+ cmpult $tp,$tj,AT
+ stq $aj,-8($rp)
+ bne AT,.Lcopy
+ mov 1,v0
+
+.Lexit:
+ .set noreorder
+ mov fp,sp
+ /*ldq ra,0(sp)*/
+ ldq s3,8(sp)
+ ldq s4,16(sp)
+ ldq s5,24(sp)
+ ldq fp,32(sp)
+ lda sp,48(sp)
+ ret (ra)
+.end bn_mul_mont
+.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+print $code;
+close STDOUT;
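
For orientation only (this sketch is not part of the imported file), the word-by-word Montgomery multiplication that the assembly implements can be modelled in portable C roughly as follows. The function name mont_mul_ref is invented here; the sketch assumes a 64-bit BN_ULONG as on Alpha, uses a compiler-provided 128-bit unsigned type where the assembly uses the mulq/umulh pair, keeps the temporary in a variable-length array rather than the carved-out stack frame, and splits into two inner loops what the assembly fuses into one.

#include <stdint.h>
#include <string.h>

typedef uint64_t BN_ULONG;              /* 64-bit limb, as on Alpha          */
typedef unsigned __int128 u128;         /* stands in for the mulq/umulh pair */

/* rp[] = ap[] * bp[] * 2^(-64*num) mod np[]; n0[0] must be
 * -np[0]^-1 mod 2^64.  Returns 0 for num < 4 (mirroring the early
 * exit in the assembly) and 1 otherwise. */
static int mont_mul_ref(BN_ULONG *rp, const BN_ULONG *ap,
                        const BN_ULONG *bp, const BN_ULONG *np,
                        const BN_ULONG *n0, int num)
{
    if (num < 4)
        return 0;

    BN_ULONG tp[num + 2];               /* temporary, num+2 limbs */
    memset(tp, 0, sizeof(tp));

    for (int i = 0; i < num; i++) {
        BN_ULONG carry = 0;
        u128 t;

        /* tp[] += ap[] * bp[i] */
        for (int j = 0; j < num; j++) {
            t     = (u128)ap[j] * bp[i] + tp[j] + carry;
            tp[j] = (BN_ULONG)t;
            carry = (BN_ULONG)(t >> 64);
        }
        t           = (u128)tp[num] + carry;
        tp[num]     = (BN_ULONG)t;
        tp[num + 1] = (BN_ULONG)(t >> 64);

        /* m1 makes the low limb of tp + m1*np vanish */
        BN_ULONG m1 = tp[0] * n0[0];

        /* tp[] = (tp[] + m1 * np[]) >> 64 */
        t     = (u128)np[0] * m1 + tp[0];   /* low limb cancels */
        carry = (BN_ULONG)(t >> 64);
        for (int j = 1; j < num; j++) {
            t         = (u128)np[j] * m1 + tp[j] + carry;
            tp[j - 1] = (BN_ULONG)t;
            carry     = (BN_ULONG)(t >> 64);
        }
        t           = (u128)tp[num] + carry;
        tp[num - 1] = (BN_ULONG)t;
        tp[num]     = tp[num + 1] + (BN_ULONG)(t >> 64);
    }

    /* conditional final subtraction, cf. .Lsub/.Lcopy */
    BN_ULONG borrow = 0;
    for (int j = 0; j < num; j++) {
        u128 d = (u128)tp[j] - np[j] - borrow;
        rp[j]  = (BN_ULONG)d;
        borrow = (BN_ULONG)(d >> 64) & 1;   /* 1 if the subtraction wrapped */
    }
    if (tp[num] < borrow)                   /* tp < np: keep tp itself */
        memcpy(rp, tp, num * sizeof(BN_ULONG));
    return 1;
}

As in the assembly, n0 is passed as a pointer to the one precomputed word. Note that the assembly performs the final selection between tp and tp - np with and/bic/bis rather than a branch; the if above is for readability only.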