summaryrefslogtreecommitdiff
path: root/app/openssl/crypto/bn/asm
diff options
context:
space:
mode:
authorParménides GV <parmegv@sdf.org>2014-09-26 09:46:26 +0200
committerParménides GV <parmegv@sdf.org>2014-09-26 09:46:26 +0200
commit394451dbae3e71282611058e00b5fd16c865f147 (patch)
tree17b71034d9350a2848603f5edf0a8b13025909be /app/openssl/crypto/bn/asm
parent644fd02cf8da95b0b5a99fb9f2142628dd27f7c2 (diff)
Revert "Updated native subprojects from ics-openvpn."
This reverts commit d0e7ba3029b2fd42582413aa95773fe7dbdede90. I'll postpone this work for the next cycle, it's not trivial because it doesn't link properly.
Diffstat (limited to 'app/openssl/crypto/bn/asm')
-rw-r--r--app/openssl/crypto/bn/asm/armv4-gf2m.S106
-rw-r--r--app/openssl/crypto/bn/asm/armv4-gf2m.pl139
-rw-r--r--app/openssl/crypto/bn/asm/armv4-mont.pl483
-rw-r--r--app/openssl/crypto/bn/asm/armv4-mont.s444
-rw-r--r--app/openssl/crypto/bn/asm/mips3.S2201
-rw-r--r--app/openssl/crypto/bn/asm/pa-risc2.S1618
-rw-r--r--app/openssl/crypto/bn/asm/pa-risc2W.S1605
7 files changed, 138 insertions, 6458 deletions
diff --git a/app/openssl/crypto/bn/asm/armv4-gf2m.S b/app/openssl/crypto/bn/asm/armv4-gf2m.S
index 0fa25b26..038f0864 100644
--- a/app/openssl/crypto/bn/asm/armv4-gf2m.S
+++ b/app/openssl/crypto/bn/asm/armv4-gf2m.S
@@ -5,6 +5,31 @@
#if __ARM_ARCH__>=7
.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+ vshl.u64 d2,d16,#8 @ q1-q3 are slided
+ vmull.p8 q0,d16,d17 @ a·bb
+ vshl.u64 d4,d16,#16
+ vmull.p8 q1,d2,d17 @ a<<8·bb
+ vshl.u64 d6,d16,#24
+ vmull.p8 q2,d4,d17 @ a<<16·bb
+ vshr.u64 d2,#8
+ vmull.p8 q3,d6,d17 @ a<<24·bb
+ vshl.u64 d3,#24
+ veor d0,d2
+ vshr.u64 d4,#16
+ veor d0,d3
+ vshl.u64 d5,#16
+ veor d0,d4
+ vshr.u64 d6,#24
+ veor d0,d5
+ vshl.u64 d7,#8
+ veor d0,d6
+ veor d0,d7
+ .word 0xe12fff1e
+.size mul_1x1_neon,.-mul_1x1_neon
#endif
.type mul_1x1_ialu,%function
.align 5
@@ -95,53 +120,40 @@ bn_GF2m_mul_2x2:
tst r12,#1
beq .Lialu
- ldr r12, [sp] @ 5th argument
- vmov.32 d26, r2, r1
- vmov.32 d27, r12, r3
- vmov.i64 d28, #0x0000ffffffffffff
- vmov.i64 d29, #0x00000000ffffffff
- vmov.i64 d30, #0x000000000000ffff
-
- vext.8 d2, d26, d26, #1 @ A1
- vmull.p8 q1, d2, d27 @ F = A1*B
- vext.8 d0, d27, d27, #1 @ B1
- vmull.p8 q0, d26, d0 @ E = A*B1
- vext.8 d4, d26, d26, #2 @ A2
- vmull.p8 q2, d4, d27 @ H = A2*B
- vext.8 d16, d27, d27, #2 @ B2
- vmull.p8 q8, d26, d16 @ G = A*B2
- vext.8 d6, d26, d26, #3 @ A3
- veor q1, q1, q0 @ L = E + F
- vmull.p8 q3, d6, d27 @ J = A3*B
- vext.8 d0, d27, d27, #3 @ B3
- veor q2, q2, q8 @ M = G + H
- vmull.p8 q0, d26, d0 @ I = A*B3
- veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8
- vand d3, d3, d28
- vext.8 d16, d27, d27, #4 @ B4
- veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16
- vand d5, d5, d29
- vmull.p8 q8, d26, d16 @ K = A*B4
- veor q3, q3, q0 @ N = I + J
- veor d2, d2, d3
- veor d4, d4, d5
- veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24
- vand d7, d7, d30
- vext.8 q1, q1, q1, #15
- veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32
- vmov.i64 d17, #0
- vext.8 q2, q2, q2, #14
- veor d6, d6, d7
- vmull.p8 q0, d26, d27 @ D = A*B
- vext.8 q8, q8, q8, #12
- vext.8 q3, q3, q3, #13
- veor q1, q1, q2
- veor q3, q3, q8
- veor q0, q0, q1
- veor q0, q0, q3
-
- vst1.32 {q0}, [r0]
- bx lr @ bx lr
+ veor d18,d18
+ vmov.32 d19,r3,r3 @ two copies of b1
+ vmov.32 d18[0],r1 @ a1
+
+ veor d20,d20
+ vld1.32 d21[],[sp,:32] @ two copies of b0
+ vmov.32 d20[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,d18
+ vmov d17,d19
+ bl mul_1x1_neon @ a1·b1
+ vmov d22,d0
+
+ vmov d16,d20
+ vmov d17,d21
+ bl mul_1x1_neon @ a0·b0
+ vmov d23,d0
+
+ veor d16,d20,d18
+ veor d17,d21,d19
+ veor d20,d23,d22
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,d20 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor d23,d1
+ veor d22,d0
+ vst1.32 {d23[0]},[r0,:32]!
+ vst1.32 {d23[1]},[r0,:32]!
+ vst1.32 {d22[0]},[r0,:32]!
+ vst1.32 {d22[1]},[r0,:32]
+ bx r12
.align 4
.Lialu:
#endif
diff --git a/app/openssl/crypto/bn/asm/armv4-gf2m.pl b/app/openssl/crypto/bn/asm/armv4-gf2m.pl
index 3f1f4f67..22ad1f85 100644
--- a/app/openssl/crypto/bn/asm/armv4-gf2m.pl
+++ b/app/openssl/crypto/bn/asm/armv4-gf2m.pl
@@ -20,21 +20,14 @@
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
-#
-# April 2014
-#
-# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
-# referred below, which improves ECDH and ECDSA verify benchmarks
-# by 18-40%.
-#
-# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
-# Polynomial Multiplication on ARM Processors using the NEON Engine.
-#
-# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
$code=<<___;
#include "arm_arch.h"
@@ -43,6 +36,31 @@ $code=<<___;
#if __ARM_ARCH__>=7
.fpu neon
+
+.type mul_1x1_neon,%function
+.align 5
+mul_1x1_neon:
+ vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
+ vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
+ vshl.u64 `&Dlo("q2")`,d16,#16
+ vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
+ vshl.u64 `&Dlo("q3")`,d16,#24
+ vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
+ vshr.u64 `&Dlo("q1")`,#8
+ vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
+ vshl.u64 `&Dhi("q1")`,#24
+ veor d0,`&Dlo("q1")`
+ vshr.u64 `&Dlo("q2")`,#16
+ veor d0,`&Dhi("q1")`
+ vshl.u64 `&Dhi("q2")`,#16
+ veor d0,`&Dlo("q2")`
+ vshr.u64 `&Dlo("q3")`,#24
+ veor d0,`&Dhi("q2")`
+ vshl.u64 `&Dhi("q3")`,#8
+ veor d0,`&Dlo("q3")`
+ veor d0,`&Dhi("q3")`
+ bx lr
+.size mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
@@ -141,9 +159,8 @@ ___
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
-{
-my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
-my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
+
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
$code.=<<___;
.global bn_GF2m_mul_2x2
@@ -156,58 +173,44 @@ bn_GF2m_mul_2x2:
tst r12,#1
beq .Lialu
- ldr r12, [sp] @ 5th argument
- vmov.32 $a, r2, r1
- vmov.32 $b, r12, r3
- vmov.i64 $k48, #0x0000ffffffffffff
- vmov.i64 $k32, #0x00000000ffffffff
- vmov.i64 $k16, #0x000000000000ffff
-
- vext.8 $t0#lo, $a, $a, #1 @ A1
- vmull.p8 $t0, $t0#lo, $b @ F = A1*B
- vext.8 $r#lo, $b, $b, #1 @ B1
- vmull.p8 $r, $a, $r#lo @ E = A*B1
- vext.8 $t1#lo, $a, $a, #2 @ A2
- vmull.p8 $t1, $t1#lo, $b @ H = A2*B
- vext.8 $t3#lo, $b, $b, #2 @ B2
- vmull.p8 $t3, $a, $t3#lo @ G = A*B2
- vext.8 $t2#lo, $a, $a, #3 @ A3
- veor $t0, $t0, $r @ L = E + F
- vmull.p8 $t2, $t2#lo, $b @ J = A3*B
- vext.8 $r#lo, $b, $b, #3 @ B3
- veor $t1, $t1, $t3 @ M = G + H
- vmull.p8 $r, $a, $r#lo @ I = A*B3
- veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
- vand $t0#hi, $t0#hi, $k48
- vext.8 $t3#lo, $b, $b, #4 @ B4
- veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
- vand $t1#hi, $t1#hi, $k32
- vmull.p8 $t3, $a, $t3#lo @ K = A*B4
- veor $t2, $t2, $r @ N = I + J
- veor $t0#lo, $t0#lo, $t0#hi
- veor $t1#lo, $t1#lo, $t1#hi
- veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
- vand $t2#hi, $t2#hi, $k16
- vext.8 $t0, $t0, $t0, #15
- veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
- vmov.i64 $t3#hi, #0
- vext.8 $t1, $t1, $t1, #14
- veor $t2#lo, $t2#lo, $t2#hi
- vmull.p8 $r, $a, $b @ D = A*B
- vext.8 $t3, $t3, $t3, #12
- vext.8 $t2, $t2, $t2, #13
- veor $t0, $t0, $t1
- veor $t2, $t2, $t3
- veor $r, $r, $t0
- veor $r, $r, $t2
-
- vst1.32 {$r}, [r0]
- ret @ bx lr
+ veor $A1,$A1
+ vmov.32 $B1,r3,r3 @ two copies of b1
+ vmov.32 ${A1}[0],r1 @ a1
+
+ veor $A0,$A0
+ vld1.32 ${B0}[],[sp,:32] @ two copies of b0
+ vmov.32 ${A0}[0],r2 @ a0
+ mov r12,lr
+
+ vmov d16,$A1
+ vmov d17,$B1
+ bl mul_1x1_neon @ a1·b1
+ vmov $A1B1,d0
+
+ vmov d16,$A0
+ vmov d17,$B0
+ bl mul_1x1_neon @ a0·b0
+ vmov $A0B0,d0
+
+ veor d16,$A0,$A1
+ veor d17,$B0,$B1
+ veor $A0,$A0B0,$A1B1
+ bl mul_1x1_neon @ (a0+a1)·(b0+b1)
+
+ veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
+ vshl.u64 d1,d0,#32
+ vshr.u64 d0,d0,#32
+ veor $A0B0,d1
+ veor $A1B1,d0
+ vst1.32 {${A0B0}[0]},[r0,:32]!
+ vst1.32 {${A0B0}[1]},[r0,:32]!
+ vst1.32 {${A1B1}[0]},[r0,:32]!
+ vst1.32 {${A1B1}[1]},[r0,:32]
+ bx r12
.align 4
.Lialu:
#endif
___
-}
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
@@ -269,13 +272,7 @@ $code.=<<___;
.comm OPENSSL_armcap_P,4,4
___
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
close STDOUT; # enforce flush
diff --git a/app/openssl/crypto/bn/asm/armv4-mont.pl b/app/openssl/crypto/bn/asm/armv4-mont.pl
index 72bad8e3..f78a8b5f 100644
--- a/app/openssl/crypto/bn/asm/armv4-mont.pl
+++ b/app/openssl/crypto/bn/asm/armv4-mont.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -23,21 +23,6 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
-# November 2013
-#
-# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
-# performance improvement on Cortex-A8 is ~45-100% depending on key
-# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
-# On Snapdragon S4 improvement was measured to vary from ~70% to
-# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
-# rather because original integer-only code seems to perform
-# suboptimally on S4. Situation on Cortex-A9 is unfortunately
-# different. It's being looked into, but the trouble is that
-# performance for vectors longer than 256 bits is actually couple
-# of percent worse than for integer-only code. The code is chosen
-# for execution on all NEON-capable processors, because gain on
-# others outweighs the marginal loss on Cortex-A9.
-
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -67,40 +52,16 @@ $_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
-#include "arm_arch.h"
-
.text
-.code 32
-
-#if __ARM_ARCH__>=7
-.align 5
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-bn_mul_mont
-#endif
.global bn_mul_mont
.type bn_mul_mont,%function
-.align 5
+.align 2
bn_mul_mont:
- ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
-#if __ARM_ARCH__>=7
- tst ip,#7
- bne .Lialu
- adr r0,bn_mul_mont
- ldr r2,.LOPENSSL_armcap
- ldr r0,[r0,r2]
- tst r0,#1 @ NEON available?
- ldmia sp, {r0,r2}
- beq .Lialu
- add sp,sp,#8
- b bn_mul8x_mont_neon
-.align 4
-.Lialu:
-#endif
- cmp ip,#2
- mov $num,ip @ load num
+ ldr $num,[sp,#3*4] @ load num
+ cmp $num,#2
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
@@ -230,446 +191,14 @@ bn_mul_mont:
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
-.Labrt:
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
+.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-#endif
.size bn_mul_mont,.-bn_mul_mont
-___
-{
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-
-my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
-my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
-my ($Z,$Temp)=("q4","q5");
-my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
-my ($Bi,$Ni,$M0)=map("d$_",(28..31));
-my $zero=&Dlo($Z);
-my $temp=&Dlo($Temp);
-
-my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
-my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
-
-$code.=<<___;
-#if __ARM_ARCH__>=7
-.fpu neon
-
-.type bn_mul8x_mont_neon,%function
-.align 5
-bn_mul8x_mont_neon:
- mov ip,sp
- stmdb sp!,{r4-r11}
- vstmdb sp!,{d8-d15} @ ABI specification says so
- ldmia ip,{r4-r5} @ load rest of parameter block
-
- sub $toutptr,sp,#16
- vld1.32 {${Bi}[0]}, [$bptr,:32]!
- sub $toutptr,$toutptr,$num,lsl#4
- vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
- and $toutptr,$toutptr,#-64
- vld1.32 {${M0}[0]}, [$n0,:32]
- mov sp,$toutptr @ alloca
- veor $zero,$zero,$zero
- subs $inner,$num,#8
- vzip.16 $Bi,$zero
-
- vmull.u32 $A0xB,$Bi,${A0}[0]
- vmull.u32 $A1xB,$Bi,${A0}[1]
- vmull.u32 $A2xB,$Bi,${A1}[0]
- vshl.i64 $temp,`&Dhi("$A0xB")`,#16
- vmull.u32 $A3xB,$Bi,${A1}[1]
-
- vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
- veor $zero,$zero,$zero
- vmul.u32 $Ni,$temp,$M0
-
- vmull.u32 $A4xB,$Bi,${A2}[0]
- vld1.32 {$N0-$N3}, [$nptr]!
- vmull.u32 $A5xB,$Bi,${A2}[1]
- vmull.u32 $A6xB,$Bi,${A3}[0]
- vzip.16 $Ni,$zero
- vmull.u32 $A7xB,$Bi,${A3}[1]
-
- bne .LNEON_1st
-
- @ special case for num=8, everything is in register bank...
-
- vmlal.u32 $A0xB,$Ni,${N0}[0]
- sub $outer,$num,#1
- vmlal.u32 $A1xB,$Ni,${N0}[1]
- vmlal.u32 $A2xB,$Ni,${N1}[0]
- vmlal.u32 $A3xB,$Ni,${N1}[1]
-
- vmlal.u32 $A4xB,$Ni,${N2}[0]
- vmov $Temp,$A0xB
- vmlal.u32 $A5xB,$Ni,${N2}[1]
- vmov $A0xB,$A1xB
- vmlal.u32 $A6xB,$Ni,${N3}[0]
- vmov $A1xB,$A2xB
- vmlal.u32 $A7xB,$Ni,${N3}[1]
- vmov $A2xB,$A3xB
- vmov $A3xB,$A4xB
- vshr.u64 $temp,$temp,#16
- vmov $A4xB,$A5xB
- vmov $A5xB,$A6xB
- vadd.u64 $temp,$temp,`&Dhi("$Temp")`
- vmov $A6xB,$A7xB
- veor $A7xB,$A7xB
- vshr.u64 $temp,$temp,#16
-
- b .LNEON_outer8
-
-.align 4
-.LNEON_outer8:
- vld1.32 {${Bi}[0]}, [$bptr,:32]!
- veor $zero,$zero,$zero
- vzip.16 $Bi,$zero
- vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
-
- vmlal.u32 $A0xB,$Bi,${A0}[0]
- vmlal.u32 $A1xB,$Bi,${A0}[1]
- vmlal.u32 $A2xB,$Bi,${A1}[0]
- vshl.i64 $temp,`&Dhi("$A0xB")`,#16
- vmlal.u32 $A3xB,$Bi,${A1}[1]
-
- vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
- veor $zero,$zero,$zero
- subs $outer,$outer,#1
- vmul.u32 $Ni,$temp,$M0
-
- vmlal.u32 $A4xB,$Bi,${A2}[0]
- vmlal.u32 $A5xB,$Bi,${A2}[1]
- vmlal.u32 $A6xB,$Bi,${A3}[0]
- vzip.16 $Ni,$zero
- vmlal.u32 $A7xB,$Bi,${A3}[1]
-
- vmlal.u32 $A0xB,$Ni,${N0}[0]
- vmlal.u32 $A1xB,$Ni,${N0}[1]
- vmlal.u32 $A2xB,$Ni,${N1}[0]
- vmlal.u32 $A3xB,$Ni,${N1}[1]
-
- vmlal.u32 $A4xB,$Ni,${N2}[0]
- vmov $Temp,$A0xB
- vmlal.u32 $A5xB,$Ni,${N2}[1]
- vmov $A0xB,$A1xB
- vmlal.u32 $A6xB,$Ni,${N3}[0]
- vmov $A1xB,$A2xB
- vmlal.u32 $A7xB,$Ni,${N3}[1]
- vmov $A2xB,$A3xB
- vmov $A3xB,$A4xB
- vshr.u64 $temp,$temp,#16
- vmov $A4xB,$A5xB
- vmov $A5xB,$A6xB
- vadd.u64 $temp,$temp,`&Dhi("$Temp")`
- vmov $A6xB,$A7xB
- veor $A7xB,$A7xB
- vshr.u64 $temp,$temp,#16
-
- bne .LNEON_outer8
-
- vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
- mov $toutptr,sp
- vshr.u64 $temp,`&Dlo("$A0xB")`,#16
- mov $inner,$num
- vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
- add $tinptr,sp,#16
- vshr.u64 $temp,`&Dhi("$A0xB")`,#16
- vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
-
- b .LNEON_tail2
-
-.align 4
-.LNEON_1st:
- vmlal.u32 $A0xB,$Ni,${N0}[0]
- vld1.32 {$A0-$A3}, [$aptr]!
- vmlal.u32 $A1xB,$Ni,${N0}[1]
- subs $inner,$inner,#8
- vmlal.u32 $A2xB,$Ni,${N1}[0]
- vmlal.u32 $A3xB,$Ni,${N1}[1]
-
- vmlal.u32 $A4xB,$Ni,${N2}[0]
- vld1.32 {$N0-$N1}, [$nptr]!
- vmlal.u32 $A5xB,$Ni,${N2}[1]
- vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
- vmlal.u32 $A6xB,$Ni,${N3}[0]
- vmlal.u32 $A7xB,$Ni,${N3}[1]
- vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
-
- vmull.u32 $A0xB,$Bi,${A0}[0]
- vld1.32 {$N2-$N3}, [$nptr]!
- vmull.u32 $A1xB,$Bi,${A0}[1]
- vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
- vmull.u32 $A2xB,$Bi,${A1}[0]
- vmull.u32 $A3xB,$Bi,${A1}[1]
- vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
-
- vmull.u32 $A4xB,$Bi,${A2}[0]
- vmull.u32 $A5xB,$Bi,${A2}[1]
- vmull.u32 $A6xB,$Bi,${A3}[0]
- vmull.u32 $A7xB,$Bi,${A3}[1]
-
- bne .LNEON_1st
-
- vmlal.u32 $A0xB,$Ni,${N0}[0]
- add $tinptr,sp,#16
- vmlal.u32 $A1xB,$Ni,${N0}[1]
- sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
- vmlal.u32 $A2xB,$Ni,${N1}[0]
- vld1.64 {$Temp}, [sp,:128]
- vmlal.u32 $A3xB,$Ni,${N1}[1]
- sub $outer,$num,#1
-
- vmlal.u32 $A4xB,$Ni,${N2}[0]
- vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
- vmlal.u32 $A5xB,$Ni,${N2}[1]
- vshr.u64 $temp,$temp,#16
- vld1.64 {$A0xB}, [$tinptr, :128]!
- vmlal.u32 $A6xB,$Ni,${N3}[0]
- vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
- vmlal.u32 $A7xB,$Ni,${N3}[1]
-
- vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
- vadd.u64 $temp,$temp,`&Dhi("$Temp")`
- veor $Z,$Z,$Z
- vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
- vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
- vst1.64 {$Z}, [$toutptr,:128]
- vshr.u64 $temp,$temp,#16
-
- b .LNEON_outer
-
-.align 4
-.LNEON_outer:
- vld1.32 {${Bi}[0]}, [$bptr,:32]!
- sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
- vld1.32 {$A0-$A3}, [$aptr]!
- veor $zero,$zero,$zero
- mov $toutptr,sp
- vzip.16 $Bi,$zero
- sub $inner,$num,#8
- vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
-
- vmlal.u32 $A0xB,$Bi,${A0}[0]
- vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
- vmlal.u32 $A1xB,$Bi,${A0}[1]
- vmlal.u32 $A2xB,$Bi,${A1}[0]
- vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
- vmlal.u32 $A3xB,$Bi,${A1}[1]
-
- vshl.i64 $temp,`&Dhi("$A0xB")`,#16
- veor $zero,$zero,$zero
- vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
- vld1.64 {$A7xB},[$tinptr,:128]!
- vmul.u32 $Ni,$temp,$M0
-
- vmlal.u32 $A4xB,$Bi,${A2}[0]
- vld1.32 {$N0-$N3}, [$nptr]!
- vmlal.u32 $A5xB,$Bi,${A2}[1]
- vmlal.u32 $A6xB,$Bi,${A3}[0]
- vzip.16 $Ni,$zero
- vmlal.u32 $A7xB,$Bi,${A3}[1]
-
-.LNEON_inner:
- vmlal.u32 $A0xB,$Ni,${N0}[0]
- vld1.32 {$A0-$A3}, [$aptr]!
- vmlal.u32 $A1xB,$Ni,${N0}[1]
- subs $inner,$inner,#8
- vmlal.u32 $A2xB,$Ni,${N1}[0]
- vmlal.u32 $A3xB,$Ni,${N1}[1]
- vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
-
- vmlal.u32 $A4xB,$Ni,${N2}[0]
- vld1.64 {$A0xB}, [$tinptr, :128]!
- vmlal.u32 $A5xB,$Ni,${N2}[1]
- vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
- vmlal.u32 $A6xB,$Ni,${N3}[0]
- vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
- vmlal.u32 $A7xB,$Ni,${N3}[1]
- vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
-
- vmlal.u32 $A0xB,$Bi,${A0}[0]
- vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
- vmlal.u32 $A1xB,$Bi,${A0}[1]
- vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
- vmlal.u32 $A2xB,$Bi,${A1}[0]
- vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
- vmlal.u32 $A3xB,$Bi,${A1}[1]
- vld1.32 {$N0-$N3}, [$nptr]!
-
- vmlal.u32 $A4xB,$Bi,${A2}[0]
- vld1.64 {$A7xB}, [$tinptr, :128]!
- vmlal.u32 $A5xB,$Bi,${A2}[1]
- vmlal.u32 $A6xB,$Bi,${A3}[0]
- vmlal.u32 $A7xB,$Bi,${A3}[1]
-
- bne .LNEON_inner
-
- vmlal.u32 $A0xB,$Ni,${N0}[0]
- add $tinptr,sp,#16
- vmlal.u32 $A1xB,$Ni,${N0}[1]
- sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
- vmlal.u32 $A2xB,$Ni,${N1}[0]
- vld1.64 {$Temp}, [sp,:128]
- vmlal.u32 $A3xB,$Ni,${N1}[1]
- subs $outer,$outer,#1
-
- vmlal.u32 $A4xB,$Ni,${N2}[0]
- vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
- vmlal.u32 $A5xB,$Ni,${N2}[1]
- vld1.64 {$A0xB}, [$tinptr, :128]!
- vshr.u64 $temp,$temp,#16
- vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
- vmlal.u32 $A6xB,$Ni,${N3}[0]
- vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
- vmlal.u32 $A7xB,$Ni,${N3}[1]
-
- vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
- vadd.u64 $temp,$temp,`&Dhi("$Temp")`
- vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
- vshr.u64 $temp,$temp,#16
-
- bne .LNEON_outer
-
- mov $toutptr,sp
- mov $inner,$num
-
-.LNEON_tail:
- vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
- vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
- vshr.u64 $temp,`&Dlo("$A0xB")`,#16
- vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
- vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
- vshr.u64 $temp,`&Dhi("$A0xB")`,#16
- vld1.64 {$A7xB}, [$tinptr, :128]!
- vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
-
-.LNEON_tail2:
- vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
- vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
- vshr.u64 $temp,`&Dlo("$A1xB")`,#16
- vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
- vshr.u64 $temp,`&Dhi("$A1xB")`,#16
- vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
-
- vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
- vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
- vshr.u64 $temp,`&Dlo("$A2xB")`,#16
- vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
- vshr.u64 $temp,`&Dhi("$A2xB")`,#16
- vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
-
- vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
- vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
- vshr.u64 $temp,`&Dlo("$A3xB")`,#16
- vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
- vshr.u64 $temp,`&Dhi("$A3xB")`,#16
- vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
-
- vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
- vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
- vshr.u64 $temp,`&Dlo("$A4xB")`,#16
- vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
- vshr.u64 $temp,`&Dhi("$A4xB")`,#16
- vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
-
- vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
- vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
- vshr.u64 $temp,`&Dlo("$A5xB")`,#16
- vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
- vshr.u64 $temp,`&Dhi("$A5xB")`,#16
- vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
-
- vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
- vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
- vshr.u64 $temp,`&Dlo("$A6xB")`,#16
- vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
- vld1.64 {$A0xB}, [$tinptr, :128]!
- vshr.u64 $temp,`&Dhi("$A6xB")`,#16
- vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
-
- vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
- vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
- vshr.u64 $temp,`&Dlo("$A7xB")`,#16
- vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
- vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
- vshr.u64 $temp,`&Dhi("$A7xB")`,#16
- vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
- subs $inner,$inner,#8
- vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
-
- bne .LNEON_tail
-
- vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
- sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
- subs $aptr,sp,#0 @ clear carry flag
- add $bptr,sp,$num,lsl#2
-
-.LNEON_sub:
- ldmia $aptr!, {r4-r7}
- ldmia $nptr!, {r8-r11}
- sbcs r8, r4,r8
- sbcs r9, r5,r9
- sbcs r10,r6,r10
- sbcs r11,r7,r11
- teq $aptr,$bptr @ preserves carry
- stmia $rptr!, {r8-r11}
- bne .LNEON_sub
-
- ldr r10, [$aptr] @ load top-most bit
- veor q0,q0,q0
- sub r11,$bptr,sp @ this is num*4
- veor q1,q1,q1
- mov $aptr,sp
- sub $rptr,$rptr,r11 @ rewind $rptr
- mov $nptr,$bptr @ second 3/4th of frame
- sbcs r10,r10,#0 @ result is carry flag
-
-.LNEON_copy_n_zap:
- ldmia $aptr!, {r4-r7}
- ldmia $rptr, {r8-r11}
- movcc r8, r4
- vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
- movcc r9, r5
- movcc r10,r6
- vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
- movcc r11,r7
- ldmia $aptr, {r4-r7}
- stmia $rptr!, {r8-r11}
- sub $aptr,$aptr,#16
- ldmia $rptr, {r8-r11}
- movcc r8, r4
- vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
- movcc r9, r5
- movcc r10,r6
- vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
- movcc r11,r7
- teq $aptr,$bptr @ preserves carry
- stmia $rptr!, {r8-r11}
- bne .LNEON_copy_n_zap
-
- sub sp,ip,#96
- vldmia sp!,{d8-d15}
- ldmia sp!,{r4-r11}
- ret @ bx lr
-.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
-#endif
-___
-}
-$code.=<<___;
-.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_ARCH__>=7
-.comm OPENSSL_armcap_P,4,4
-#endif
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;
diff --git a/app/openssl/crypto/bn/asm/armv4-mont.s b/app/openssl/crypto/bn/asm/armv4-mont.s
index fecae15e..64c220b5 100644
--- a/app/openssl/crypto/bn/asm/armv4-mont.s
+++ b/app/openssl/crypto/bn/asm/armv4-mont.s
@@ -1,37 +1,13 @@
-#include "arm_arch.h"
-
.text
-.code 32
-
-#if __ARM_ARCH__>=7
-.align 5
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-bn_mul_mont
-#endif
.global bn_mul_mont
.type bn_mul_mont,%function
-.align 5
+.align 2
bn_mul_mont:
- ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
-#if __ARM_ARCH__>=7
- tst ip,#7
- bne .Lialu
- adr r0,bn_mul_mont
- ldr r2,.LOPENSSL_armcap
- ldr r0,[r0,r2]
- tst r0,#1 @ NEON available?
- ldmia sp, {r0,r2}
- beq .Lialu
- add sp,sp,#8
- b bn_mul8x_mont_neon
-.align 4
-.Lialu:
-#endif
- cmp ip,#2
- mov r0,ip @ load num
+ ldr r0,[sp,#3*4] @ load num
+ cmp r0,#2
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
@@ -161,419 +137,9 @@ bn_mul_mont:
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
-.Labrt:
-#if __ARM_ARCH__>=5
- bx lr @ .word 0xe12fff1e
-#else
- tst lr,#1
+.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
.size bn_mul_mont,.-bn_mul_mont
-#if __ARM_ARCH__>=7
-.fpu neon
-
-.type bn_mul8x_mont_neon,%function
-.align 5
-bn_mul8x_mont_neon:
- mov ip,sp
- stmdb sp!,{r4-r11}
- vstmdb sp!,{d8-d15} @ ABI specification says so
- ldmia ip,{r4-r5} @ load rest of parameter block
-
- sub r7,sp,#16
- vld1.32 {d28[0]}, [r2,:32]!
- sub r7,r7,r5,lsl#4
- vld1.32 {d0-d3}, [r1]! @ can't specify :32 :-(
- and r7,r7,#-64
- vld1.32 {d30[0]}, [r4,:32]
- mov sp,r7 @ alloca
- veor d8,d8,d8
- subs r8,r5,#8
- vzip.16 d28,d8
-
- vmull.u32 q6,d28,d0[0]
- vmull.u32 q7,d28,d0[1]
- vmull.u32 q8,d28,d1[0]
- vshl.i64 d10,d13,#16
- vmull.u32 q9,d28,d1[1]
-
- vadd.u64 d10,d10,d12
- veor d8,d8,d8
- vmul.u32 d29,d10,d30
-
- vmull.u32 q10,d28,d2[0]
- vld1.32 {d4-d7}, [r3]!
- vmull.u32 q11,d28,d2[1]
- vmull.u32 q12,d28,d3[0]
- vzip.16 d29,d8
- vmull.u32 q13,d28,d3[1]
-
- bne .LNEON_1st
-
- @ special case for num=8, everything is in register bank...
-
- vmlal.u32 q6,d29,d4[0]
- sub r9,r5,#1
- vmlal.u32 q7,d29,d4[1]
- vmlal.u32 q8,d29,d5[0]
- vmlal.u32 q9,d29,d5[1]
-
- vmlal.u32 q10,d29,d6[0]
- vmov q5,q6
- vmlal.u32 q11,d29,d6[1]
- vmov q6,q7
- vmlal.u32 q12,d29,d7[0]
- vmov q7,q8
- vmlal.u32 q13,d29,d7[1]
- vmov q8,q9
- vmov q9,q10
- vshr.u64 d10,d10,#16
- vmov q10,q11
- vmov q11,q12
- vadd.u64 d10,d10,d11
- vmov q12,q13
- veor q13,q13
- vshr.u64 d10,d10,#16
-
- b .LNEON_outer8
-
-.align 4
-.LNEON_outer8:
- vld1.32 {d28[0]}, [r2,:32]!
- veor d8,d8,d8
- vzip.16 d28,d8
- vadd.u64 d12,d12,d10
-
- vmlal.u32 q6,d28,d0[0]
- vmlal.u32 q7,d28,d0[1]
- vmlal.u32 q8,d28,d1[0]
- vshl.i64 d10,d13,#16
- vmlal.u32 q9,d28,d1[1]
-
- vadd.u64 d10,d10,d12
- veor d8,d8,d8
- subs r9,r9,#1
- vmul.u32 d29,d10,d30
-
- vmlal.u32 q10,d28,d2[0]
- vmlal.u32 q11,d28,d2[1]
- vmlal.u32 q12,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q13,d28,d3[1]
-
- vmlal.u32 q6,d29,d4[0]
- vmlal.u32 q7,d29,d4[1]
- vmlal.u32 q8,d29,d5[0]
- vmlal.u32 q9,d29,d5[1]
-
- vmlal.u32 q10,d29,d6[0]
- vmov q5,q6
- vmlal.u32 q11,d29,d6[1]
- vmov q6,q7
- vmlal.u32 q12,d29,d7[0]
- vmov q7,q8
- vmlal.u32 q13,d29,d7[1]
- vmov q8,q9
- vmov q9,q10
- vshr.u64 d10,d10,#16
- vmov q10,q11
- vmov q11,q12
- vadd.u64 d10,d10,d11
- vmov q12,q13
- veor q13,q13
- vshr.u64 d10,d10,#16
-
- bne .LNEON_outer8
-
- vadd.u64 d12,d12,d10
- mov r7,sp
- vshr.u64 d10,d12,#16
- mov r8,r5
- vadd.u64 d13,d13,d10
- add r6,sp,#16
- vshr.u64 d10,d13,#16
- vzip.16 d12,d13
-
- b .LNEON_tail2
-
-.align 4
-.LNEON_1st:
- vmlal.u32 q6,d29,d4[0]
- vld1.32 {d0-d3}, [r1]!
- vmlal.u32 q7,d29,d4[1]
- subs r8,r8,#8
- vmlal.u32 q8,d29,d5[0]
- vmlal.u32 q9,d29,d5[1]
-
- vmlal.u32 q10,d29,d6[0]
- vld1.32 {d4-d5}, [r3]!
- vmlal.u32 q11,d29,d6[1]
- vst1.64 {q6-q7}, [r7,:256]!
- vmlal.u32 q12,d29,d7[0]
- vmlal.u32 q13,d29,d7[1]
- vst1.64 {q8-q9}, [r7,:256]!
-
- vmull.u32 q6,d28,d0[0]
- vld1.32 {d6-d7}, [r3]!
- vmull.u32 q7,d28,d0[1]
- vst1.64 {q10-q11}, [r7,:256]!
- vmull.u32 q8,d28,d1[0]
- vmull.u32 q9,d28,d1[1]
- vst1.64 {q12-q13}, [r7,:256]!
-
- vmull.u32 q10,d28,d2[0]
- vmull.u32 q11,d28,d2[1]
- vmull.u32 q12,d28,d3[0]
- vmull.u32 q13,d28,d3[1]
-
- bne .LNEON_1st
-
- vmlal.u32 q6,d29,d4[0]
- add r6,sp,#16
- vmlal.u32 q7,d29,d4[1]
- sub r1,r1,r5,lsl#2 @ rewind r1
- vmlal.u32 q8,d29,d5[0]
- vld1.64 {q5}, [sp,:128]
- vmlal.u32 q9,d29,d5[1]
- sub r9,r5,#1
-
- vmlal.u32 q10,d29,d6[0]
- vst1.64 {q6-q7}, [r7,:256]!
- vmlal.u32 q11,d29,d6[1]
- vshr.u64 d10,d10,#16
- vld1.64 {q6}, [r6, :128]!
- vmlal.u32 q12,d29,d7[0]
- vst1.64 {q8-q9}, [r7,:256]!
- vmlal.u32 q13,d29,d7[1]
-
- vst1.64 {q10-q11}, [r7,:256]!
- vadd.u64 d10,d10,d11
- veor q4,q4,q4
- vst1.64 {q12-q13}, [r7,:256]!
- vld1.64 {q7-q8}, [r6, :256]!
- vst1.64 {q4}, [r7,:128]
- vshr.u64 d10,d10,#16
-
- b .LNEON_outer
-
-.align 4
-.LNEON_outer:
- vld1.32 {d28[0]}, [r2,:32]!
- sub r3,r3,r5,lsl#2 @ rewind r3
- vld1.32 {d0-d3}, [r1]!
- veor d8,d8,d8
- mov r7,sp
- vzip.16 d28,d8
- sub r8,r5,#8
- vadd.u64 d12,d12,d10
-
- vmlal.u32 q6,d28,d0[0]
- vld1.64 {q9-q10},[r6,:256]!
- vmlal.u32 q7,d28,d0[1]
- vmlal.u32 q8,d28,d1[0]
- vld1.64 {q11-q12},[r6,:256]!
- vmlal.u32 q9,d28,d1[1]
-
- vshl.i64 d10,d13,#16
- veor d8,d8,d8
- vadd.u64 d10,d10,d12
- vld1.64 {q13},[r6,:128]!
- vmul.u32 d29,d10,d30
-
- vmlal.u32 q10,d28,d2[0]
- vld1.32 {d4-d7}, [r3]!
- vmlal.u32 q11,d28,d2[1]
- vmlal.u32 q12,d28,d3[0]
- vzip.16 d29,d8
- vmlal.u32 q13,d28,d3[1]
-
-.LNEON_inner:
- vmlal.u32 q6,d29,d4[0]
- vld1.32 {d0-d3}, [r1]!
- vmlal.u32 q7,d29,d4[1]
- subs r8,r8,#8
- vmlal.u32 q8,d29,d5[0]
- vmlal.u32 q9,d29,d5[1]
- vst1.64 {q6-q7}, [r7,:256]!
-
- vmlal.u32 q10,d29,d6[0]
- vld1.64 {q6}, [r6, :128]!
- vmlal.u32 q11,d29,d6[1]
- vst1.64 {q8-q9}, [r7,:256]!
- vmlal.u32 q12,d29,d7[0]
- vld1.64 {q7-q8}, [r6, :256]!
- vmlal.u32 q13,d29,d7[1]
- vst1.64 {q10-q11}, [r7,:256]!
-
- vmlal.u32 q6,d28,d0[0]
- vld1.64 {q9-q10}, [r6, :256]!
- vmlal.u32 q7,d28,d0[1]
- vst1.64 {q12-q13}, [r7,:256]!
- vmlal.u32 q8,d28,d1[0]
- vld1.64 {q11-q12}, [r6, :256]!
- vmlal.u32 q9,d28,d1[1]
- vld1.32 {d4-d7}, [r3]!
-
- vmlal.u32 q10,d28,d2[0]
- vld1.64 {q13}, [r6, :128]!
- vmlal.u32 q11,d28,d2[1]
- vmlal.u32 q12,d28,d3[0]
- vmlal.u32 q13,d28,d3[1]
-
- bne .LNEON_inner
-
- vmlal.u32 q6,d29,d4[0]
- add r6,sp,#16
- vmlal.u32 q7,d29,d4[1]
- sub r1,r1,r5,lsl#2 @ rewind r1
- vmlal.u32 q8,d29,d5[0]
- vld1.64 {q5}, [sp,:128]
- vmlal.u32 q9,d29,d5[1]
- subs r9,r9,#1
-
- vmlal.u32 q10,d29,d6[0]
- vst1.64 {q6-q7}, [r7,:256]!
- vmlal.u32 q11,d29,d6[1]
- vld1.64 {q6}, [r6, :128]!
- vshr.u64 d10,d10,#16
- vst1.64 {q8-q9}, [r7,:256]!
- vmlal.u32 q12,d29,d7[0]
- vld1.64 {q7-q8}, [r6, :256]!
- vmlal.u32 q13,d29,d7[1]
-
- vst1.64 {q10-q11}, [r7,:256]!
- vadd.u64 d10,d10,d11
- vst1.64 {q12-q13}, [r7,:256]!
- vshr.u64 d10,d10,#16
-
- bne .LNEON_outer
-
- mov r7,sp
- mov r8,r5
-
-.LNEON_tail:
- vadd.u64 d12,d12,d10
- vld1.64 {q9-q10}, [r6, :256]!
- vshr.u64 d10,d12,#16
- vadd.u64 d13,d13,d10
- vld1.64 {q11-q12}, [r6, :256]!
- vshr.u64 d10,d13,#16
- vld1.64 {q13}, [r6, :128]!
- vzip.16 d12,d13
-
-.LNEON_tail2:
- vadd.u64 d14,d14,d10
- vst1.32 {d12[0]}, [r7, :32]!
- vshr.u64 d10,d14,#16
- vadd.u64 d15,d15,d10
- vshr.u64 d10,d15,#16
- vzip.16 d14,d15
-
- vadd.u64 d16,d16,d10
- vst1.32 {d14[0]}, [r7, :32]!
- vshr.u64 d10,d16,#16
- vadd.u64 d17,d17,d10
- vshr.u64 d10,d17,#16
- vzip.16 d16,d17
-
- vadd.u64 d18,d18,d10
- vst1.32 {d16[0]}, [r7, :32]!
- vshr.u64 d10,d18,#16
- vadd.u64 d19,d19,d10
- vshr.u64 d10,d19,#16
- vzip.16 d18,d19
-
- vadd.u64 d20,d20,d10
- vst1.32 {d18[0]}, [r7, :32]!
- vshr.u64 d10,d20,#16
- vadd.u64 d21,d21,d10
- vshr.u64 d10,d21,#16
- vzip.16 d20,d21
-
- vadd.u64 d22,d22,d10
- vst1.32 {d20[0]}, [r7, :32]!
- vshr.u64 d10,d22,#16
- vadd.u64 d23,d23,d10
- vshr.u64 d10,d23,#16
- vzip.16 d22,d23
-
- vadd.u64 d24,d24,d10
- vst1.32 {d22[0]}, [r7, :32]!
- vshr.u64 d10,d24,#16
- vadd.u64 d25,d25,d10
- vld1.64 {q6}, [r6, :128]!
- vshr.u64 d10,d25,#16
- vzip.16 d24,d25
-
- vadd.u64 d26,d26,d10
- vst1.32 {d24[0]}, [r7, :32]!
- vshr.u64 d10,d26,#16
- vadd.u64 d27,d27,d10
- vld1.64 {q7-q8}, [r6, :256]!
- vshr.u64 d10,d27,#16
- vzip.16 d26,d27
- subs r8,r8,#8
- vst1.32 {d26[0]}, [r7, :32]!
-
- bne .LNEON_tail
-
- vst1.32 {d10[0]}, [r7, :32] @ top-most bit
- sub r3,r3,r5,lsl#2 @ rewind r3
- subs r1,sp,#0 @ clear carry flag
- add r2,sp,r5,lsl#2
-
-.LNEON_sub:
- ldmia r1!, {r4-r7}
- ldmia r3!, {r8-r11}
- sbcs r8, r4,r8
- sbcs r9, r5,r9
- sbcs r10,r6,r10
- sbcs r11,r7,r11
- teq r1,r2 @ preserves carry
- stmia r0!, {r8-r11}
- bne .LNEON_sub
-
- ldr r10, [r1] @ load top-most bit
- veor q0,q0,q0
- sub r11,r2,sp @ this is num*4
- veor q1,q1,q1
- mov r1,sp
- sub r0,r0,r11 @ rewind r0
- mov r3,r2 @ second 3/4th of frame
- sbcs r10,r10,#0 @ result is carry flag
-
-.LNEON_copy_n_zap:
- ldmia r1!, {r4-r7}
- ldmia r0, {r8-r11}
- movcc r8, r4
- vst1.64 {q0-q1}, [r3,:256]! @ wipe
- movcc r9, r5
- movcc r10,r6
- vst1.64 {q0-q1}, [r3,:256]! @ wipe
- movcc r11,r7
- ldmia r1, {r4-r7}
- stmia r0!, {r8-r11}
- sub r1,r1,#16
- ldmia r0, {r8-r11}
- movcc r8, r4
- vst1.64 {q0-q1}, [r1,:256]! @ wipe
- movcc r9, r5
- movcc r10,r6
- vst1.64 {q0-q1}, [r3,:256]! @ wipe
- movcc r11,r7
- teq r1,r2 @ preserves carry
- stmia r0!, {r8-r11}
- bne .LNEON_copy_n_zap
-
- sub sp,ip,#96
- vldmia sp!,{d8-d15}
- ldmia sp!,{r4-r11}
- bx lr @ .word 0xe12fff1e
-.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
-#endif
-.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align 2
-#if __ARM_ARCH__>=7
-.comm OPENSSL_armcap_P,4,4
-#endif
diff --git a/app/openssl/crypto/bn/asm/mips3.S b/app/openssl/crypto/bn/asm/mips3.S
deleted file mode 100644
index dca4105c..00000000
--- a/app/openssl/crypto/bn/asm/mips3.S
+++ /dev/null
@@ -1,2201 +0,0 @@
-.rdata
-.asciiz "mips3.s, Version 1.1"
-.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
-
-/*
- * ====================================================================
- * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
- * project.
- *
- * Rights for redistribution and usage in source and binary forms are
- * granted according to the OpenSSL license. Warranty of any kind is
- * disclaimed.
- * ====================================================================
- */
-
-/*
- * This is my modest contributon to the OpenSSL project (see
- * http://www.openssl.org/ for more information about it) and is
- * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
- * module. For updates see http://fy.chalmers.se/~appro/hpe/.
- *
- * The module is designed to work with either of the "new" MIPS ABI(5),
- * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
- * IRIX 5.x not only because it doesn't support new ABIs but also
- * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
- * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
- * cause illegal instruction exception:-(
- *
- * In addition the code depends on preprocessor flags set up by MIPSpro
- * compiler driver (either as or cc) and therefore (probably?) can't be
- * compiled by the GNU assembler. GNU C driver manages fine though...
- * I mean as long as -mmips-as is specified or is the default option,
- * because then it simply invokes /usr/bin/as which in turn takes
- * perfect care of the preprocessor definitions. Another neat feature
- * offered by the MIPSpro assembler is an optimization pass. This gave
- * me the opportunity to have the code looking more regular as all those
- * architecture dependent instruction rescheduling details were left to
- * the assembler. Cool, huh?
- *
- * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
- * goes way over 3 times faster!
- *
- * <appro@fy.chalmers.se>
- */
-#include <asm.h>
-#include <regdef.h>
-
-#if _MIPS_ISA>=4
-#define MOVNZ(cond,dst,src) \
- movn dst,src,cond
-#else
-#define MOVNZ(cond,dst,src) \
- .set noreorder; \
- bnezl cond,.+8; \
- move dst,src; \
- .set reorder
-#endif
-
-.text
-
-.set noat
-.set reorder
-
-#define MINUS4 v1
-
-.align 5
-LEAF(bn_mul_add_words)
- .set noreorder
- bgtzl a2,.L_bn_mul_add_words_proceed
- ld t0,0(a1)
- jr ra
- move v0,zero
- .set reorder
-
-.L_bn_mul_add_words_proceed:
- li MINUS4,-4
- and ta0,a2,MINUS4
- move v0,zero
- beqz ta0,.L_bn_mul_add_words_tail
-
-.L_bn_mul_add_words_loop:
- dmultu t0,a3
- ld t1,0(a0)
- ld t2,8(a1)
- ld t3,8(a0)
- ld ta0,16(a1)
- ld ta1,16(a0)
- daddu t1,v0
- sltu v0,t1,v0 /* All manuals say it "compares 32-bit
- * values", but it seems to work fine
- * even on 64-bit registers. */
- mflo AT
- mfhi t0
- daddu t1,AT
- daddu v0,t0
- sltu AT,t1,AT
- sd t1,0(a0)
- daddu v0,AT
-
- dmultu t2,a3
- ld ta2,24(a1)
- ld ta3,24(a0)
- daddu t3,v0
- sltu v0,t3,v0
- mflo AT
- mfhi t2
- daddu t3,AT
- daddu v0,t2
- sltu AT,t3,AT
- sd t3,8(a0)
- daddu v0,AT
-
- dmultu ta0,a3
- subu a2,4
- PTR_ADD a0,32
- PTR_ADD a1,32
- daddu ta1,v0
- sltu v0,ta1,v0
- mflo AT
- mfhi ta0
- daddu ta1,AT
- daddu v0,ta0
- sltu AT,ta1,AT
- sd ta1,-16(a0)
- daddu v0,AT
-
-
- dmultu ta2,a3
- and ta0,a2,MINUS4
- daddu ta3,v0
- sltu v0,ta3,v0
- mflo AT
- mfhi ta2
- daddu ta3,AT
- daddu v0,ta2
- sltu AT,ta3,AT
- sd ta3,-8(a0)
- daddu v0,AT
- .set noreorder
- bgtzl ta0,.L_bn_mul_add_words_loop
- ld t0,0(a1)
-
- bnezl a2,.L_bn_mul_add_words_tail
- ld t0,0(a1)
- .set reorder
-
-.L_bn_mul_add_words_return:
- jr ra
-
-.L_bn_mul_add_words_tail:
- dmultu t0,a3
- ld t1,0(a0)
- subu a2,1
- daddu t1,v0
- sltu v0,t1,v0
- mflo AT
- mfhi t0
- daddu t1,AT
- daddu v0,t0
- sltu AT,t1,AT
- sd t1,0(a0)
- daddu v0,AT
- beqz a2,.L_bn_mul_add_words_return
-
- ld t0,8(a1)
- dmultu t0,a3
- ld t1,8(a0)
- subu a2,1
- daddu t1,v0
- sltu v0,t1,v0
- mflo AT
- mfhi t0
- daddu t1,AT
- daddu v0,t0
- sltu AT,t1,AT
- sd t1,8(a0)
- daddu v0,AT
- beqz a2,.L_bn_mul_add_words_return
-
- ld t0,16(a1)
- dmultu t0,a3
- ld t1,16(a0)
- daddu t1,v0
- sltu v0,t1,v0
- mflo AT
- mfhi t0
- daddu t1,AT
- daddu v0,t0
- sltu AT,t1,AT
- sd t1,16(a0)
- daddu v0,AT
- jr ra
-END(bn_mul_add_words)
-
-.align 5
-LEAF(bn_mul_words)
- .set noreorder
- bgtzl a2,.L_bn_mul_words_proceed
- ld t0,0(a1)
- jr ra
- move v0,zero
- .set reorder
-
-.L_bn_mul_words_proceed:
- li MINUS4,-4
- and ta0,a2,MINUS4
- move v0,zero
- beqz ta0,.L_bn_mul_words_tail
-
-.L_bn_mul_words_loop:
- dmultu t0,a3
- ld t2,8(a1)
- ld ta0,16(a1)
- ld ta2,24(a1)
- mflo AT
- mfhi t0
- daddu v0,AT
- sltu t1,v0,AT
- sd v0,0(a0)
- daddu v0,t1,t0
-
- dmultu t2,a3
- subu a2,4
- PTR_ADD a0,32
- PTR_ADD a1,32
- mflo AT
- mfhi t2
- daddu v0,AT
- sltu t3,v0,AT
- sd v0,-24(a0)
- daddu v0,t3,t2
-
- dmultu ta0,a3
- mflo AT
- mfhi ta0
- daddu v0,AT
- sltu ta1,v0,AT
- sd v0,-16(a0)
- daddu v0,ta1,ta0
-
-
- dmultu ta2,a3
- and ta0,a2,MINUS4
- mflo AT
- mfhi ta2
- daddu v0,AT
- sltu ta3,v0,AT
- sd v0,-8(a0)
- daddu v0,ta3,ta2
- .set noreorder
- bgtzl ta0,.L_bn_mul_words_loop
- ld t0,0(a1)
-
- bnezl a2,.L_bn_mul_words_tail
- ld t0,0(a1)
- .set reorder
-
-.L_bn_mul_words_return:
- jr ra
-
-.L_bn_mul_words_tail:
- dmultu t0,a3
- subu a2,1
- mflo AT
- mfhi t0
- daddu v0,AT
- sltu t1,v0,AT
- sd v0,0(a0)
- daddu v0,t1,t0
- beqz a2,.L_bn_mul_words_return
-
- ld t0,8(a1)
- dmultu t0,a3
- subu a2,1
- mflo AT
- mfhi t0
- daddu v0,AT
- sltu t1,v0,AT
- sd v0,8(a0)
- daddu v0,t1,t0
- beqz a2,.L_bn_mul_words_return
-
- ld t0,16(a1)
- dmultu t0,a3
- mflo AT
- mfhi t0
- daddu v0,AT
- sltu t1,v0,AT
- sd v0,16(a0)
- daddu v0,t1,t0
- jr ra
-END(bn_mul_words)
-
-.align 5
-LEAF(bn_sqr_words)
- .set noreorder
- bgtzl a2,.L_bn_sqr_words_proceed
- ld t0,0(a1)
- jr ra
- move v0,zero
- .set reorder
-
-.L_bn_sqr_words_proceed:
- li MINUS4,-4
- and ta0,a2,MINUS4
- move v0,zero
- beqz ta0,.L_bn_sqr_words_tail
-
-.L_bn_sqr_words_loop:
- dmultu t0,t0
- ld t2,8(a1)
- ld ta0,16(a1)
- ld ta2,24(a1)
- mflo t1
- mfhi t0
- sd t1,0(a0)
- sd t0,8(a0)
-
- dmultu t2,t2
- subu a2,4
- PTR_ADD a0,64
- PTR_ADD a1,32
- mflo t3
- mfhi t2
- sd t3,-48(a0)
- sd t2,-40(a0)
-
- dmultu ta0,ta0
- mflo ta1
- mfhi ta0
- sd ta1,-32(a0)
- sd ta0,-24(a0)
-
-
- dmultu ta2,ta2
- and ta0,a2,MINUS4
- mflo ta3
- mfhi ta2
- sd ta3,-16(a0)
- sd ta2,-8(a0)
-
- .set noreorder
- bgtzl ta0,.L_bn_sqr_words_loop
- ld t0,0(a1)
-
- bnezl a2,.L_bn_sqr_words_tail
- ld t0,0(a1)
- .set reorder
-
-.L_bn_sqr_words_return:
- move v0,zero
- jr ra
-
-.L_bn_sqr_words_tail:
- dmultu t0,t0
- subu a2,1
- mflo t1
- mfhi t0
- sd t1,0(a0)
- sd t0,8(a0)
- beqz a2,.L_bn_sqr_words_return
-
- ld t0,8(a1)
- dmultu t0,t0
- subu a2,1
- mflo t1
- mfhi t0
- sd t1,16(a0)
- sd t0,24(a0)
- beqz a2,.L_bn_sqr_words_return
-
- ld t0,16(a1)
- dmultu t0,t0
- mflo t1
- mfhi t0
- sd t1,32(a0)
- sd t0,40(a0)
- jr ra
-END(bn_sqr_words)
-
-.align 5
-LEAF(bn_add_words)
- .set noreorder
- bgtzl a3,.L_bn_add_words_proceed
- ld t0,0(a1)
- jr ra
- move v0,zero
- .set reorder
-
-.L_bn_add_words_proceed:
- li MINUS4,-4
- and AT,a3,MINUS4
- move v0,zero
- beqz AT,.L_bn_add_words_tail
-
-.L_bn_add_words_loop:
- ld ta0,0(a2)
- subu a3,4
- ld t1,8(a1)
- and AT,a3,MINUS4
- ld t2,16(a1)
- PTR_ADD a2,32
- ld t3,24(a1)
- PTR_ADD a0,32
- ld ta1,-24(a2)
- PTR_ADD a1,32
- ld ta2,-16(a2)
- ld ta3,-8(a2)
- daddu ta0,t0
- sltu t8,ta0,t0
- daddu t0,ta0,v0
- sltu v0,t0,ta0
- sd t0,-32(a0)
- daddu v0,t8
-
- daddu ta1,t1
- sltu t9,ta1,t1
- daddu t1,ta1,v0
- sltu v0,t1,ta1
- sd t1,-24(a0)
- daddu v0,t9
-
- daddu ta2,t2
- sltu t8,ta2,t2
- daddu t2,ta2,v0
- sltu v0,t2,ta2
- sd t2,-16(a0)
- daddu v0,t8
-
- daddu ta3,t3
- sltu t9,ta3,t3
- daddu t3,ta3,v0
- sltu v0,t3,ta3
- sd t3,-8(a0)
- daddu v0,t9
-
- .set noreorder
- bgtzl AT,.L_bn_add_words_loop
- ld t0,0(a1)
-
- bnezl a3,.L_bn_add_words_tail
- ld t0,0(a1)
- .set reorder
-
-.L_bn_add_words_return:
- jr ra
-
-.L_bn_add_words_tail:
- ld ta0,0(a2)
- daddu ta0,t0
- subu a3,1
- sltu t8,ta0,t0
- daddu t0,ta0,v0
- sltu v0,t0,ta0
- sd t0,0(a0)
- daddu v0,t8
- beqz a3,.L_bn_add_words_return
-
- ld t1,8(a1)
- ld ta1,8(a2)
- daddu ta1,t1
- subu a3,1
- sltu t9,ta1,t1
- daddu t1,ta1,v0
- sltu v0,t1,ta1
- sd t1,8(a0)
- daddu v0,t9
- beqz a3,.L_bn_add_words_return
-
- ld t2,16(a1)
- ld ta2,16(a2)
- daddu ta2,t2
- sltu t8,ta2,t2
- daddu t2,ta2,v0
- sltu v0,t2,ta2
- sd t2,16(a0)
- daddu v0,t8
- jr ra
-END(bn_add_words)
-
-.align 5
-LEAF(bn_sub_words)
- .set noreorder
- bgtzl a3,.L_bn_sub_words_proceed
- ld t0,0(a1)
- jr ra
- move v0,zero
- .set reorder
-
-.L_bn_sub_words_proceed:
- li MINUS4,-4
- and AT,a3,MINUS4
- move v0,zero
- beqz AT,.L_bn_sub_words_tail
-
-.L_bn_sub_words_loop:
- ld ta0,0(a2)
- subu a3,4
- ld t1,8(a1)
- and AT,a3,MINUS4
- ld t2,16(a1)
- PTR_ADD a2,32
- ld t3,24(a1)
- PTR_ADD a0,32
- ld ta1,-24(a2)
- PTR_ADD a1,32
- ld ta2,-16(a2)
- ld ta3,-8(a2)
- sltu t8,t0,ta0
- dsubu t0,ta0
- dsubu ta0,t0,v0
- sd ta0,-32(a0)
- MOVNZ (t0,v0,t8)
-
- sltu t9,t1,ta1
- dsubu t1,ta1
- dsubu ta1,t1,v0
- sd ta1,-24(a0)
- MOVNZ (t1,v0,t9)
-
-
- sltu t8,t2,ta2
- dsubu t2,ta2
- dsubu ta2,t2,v0
- sd ta2,-16(a0)
- MOVNZ (t2,v0,t8)
-
- sltu t9,t3,ta3
- dsubu t3,ta3
- dsubu ta3,t3,v0
- sd ta3,-8(a0)
- MOVNZ (t3,v0,t9)
-
- .set noreorder
- bgtzl AT,.L_bn_sub_words_loop
- ld t0,0(a1)
-
- bnezl a3,.L_bn_sub_words_tail
- ld t0,0(a1)
- .set reorder
-
-.L_bn_sub_words_return:
- jr ra
-
-.L_bn_sub_words_tail:
- ld ta0,0(a2)
- subu a3,1
- sltu t8,t0,ta0
- dsubu t0,ta0
- dsubu ta0,t0,v0
- MOVNZ (t0,v0,t8)
- sd ta0,0(a0)
- beqz a3,.L_bn_sub_words_return
-
- ld t1,8(a1)
- subu a3,1
- ld ta1,8(a2)
- sltu t9,t1,ta1
- dsubu t1,ta1
- dsubu ta1,t1,v0
- MOVNZ (t1,v0,t9)
- sd ta1,8(a0)
- beqz a3,.L_bn_sub_words_return
-
- ld t2,16(a1)
- ld ta2,16(a2)
- sltu t8,t2,ta2
- dsubu t2,ta2
- dsubu ta2,t2,v0
- MOVNZ (t2,v0,t8)
- sd ta2,16(a0)
- jr ra
-END(bn_sub_words)
-
-#undef MINUS4
-
-.align 5
-LEAF(bn_div_3_words)
- .set reorder
- move a3,a0 /* we know that bn_div_words doesn't
- * touch a3, ta2, ta3 and preserves a2
- * so that we can save two arguments
- * and return address in registers
- * instead of stack:-)
- */
- ld a0,(a3)
- move ta2,a1
- ld a1,-8(a3)
- bne a0,a2,.L_bn_div_3_words_proceed
- li v0,-1
- jr ra
-.L_bn_div_3_words_proceed:
- move ta3,ra
- bal bn_div_words
- move ra,ta3
- dmultu ta2,v0
- ld t2,-16(a3)
- move ta0,zero
- mfhi t1
- mflo t0
- sltu t8,t1,v1
-.L_bn_div_3_words_inner_loop:
- bnez t8,.L_bn_div_3_words_inner_loop_done
- sgeu AT,t2,t0
- seq t9,t1,v1
- and AT,t9
- sltu t3,t0,ta2
- daddu v1,a2
- dsubu t1,t3
- dsubu t0,ta2
- sltu t8,t1,v1
- sltu ta0,v1,a2
- or t8,ta0
- .set noreorder
- beqzl AT,.L_bn_div_3_words_inner_loop
- dsubu v0,1
- .set reorder
-.L_bn_div_3_words_inner_loop_done:
- jr ra
-END(bn_div_3_words)
-
-.align 5
-LEAF(bn_div_words)
- .set noreorder
- bnezl a2,.L_bn_div_words_proceed
- move v1,zero
- jr ra
- li v0,-1 /* I'd rather signal div-by-zero
- * which can be done with 'break 7' */
-
-.L_bn_div_words_proceed:
- bltz a2,.L_bn_div_words_body
- move t9,v1
- dsll a2,1
- bgtz a2,.-4
- addu t9,1
-
- .set reorder
- negu t1,t9
- li t2,-1
- dsll t2,t1
- and t2,a0
- dsrl AT,a1,t1
- .set noreorder
- bnezl t2,.+8
- break 6 /* signal overflow */
- .set reorder
- dsll a0,t9
- dsll a1,t9
- or a0,AT
-
-#define QT ta0
-#define HH ta1
-#define DH v1
-.L_bn_div_words_body:
- dsrl DH,a2,32
- sgeu AT,a0,a2
- .set noreorder
- bnezl AT,.+8
- dsubu a0,a2
- .set reorder
-
- li QT,-1
- dsrl HH,a0,32
- dsrl QT,32 /* q=0xffffffff */
- beq DH,HH,.L_bn_div_words_skip_div1
- ddivu zero,a0,DH
- mflo QT
-.L_bn_div_words_skip_div1:
- dmultu a2,QT
- dsll t3,a0,32
- dsrl AT,a1,32
- or t3,AT
- mflo t0
- mfhi t1
-.L_bn_div_words_inner_loop1:
- sltu t2,t3,t0
- seq t8,HH,t1
- sltu AT,HH,t1
- and t2,t8
- sltu v0,t0,a2
- or AT,t2
- .set noreorder
- beqz AT,.L_bn_div_words_inner_loop1_done
- dsubu t1,v0
- dsubu t0,a2
- b .L_bn_div_words_inner_loop1
- dsubu QT,1
- .set reorder
-.L_bn_div_words_inner_loop1_done:
-
- dsll a1,32
- dsubu a0,t3,t0
- dsll v0,QT,32
-
- li QT,-1
- dsrl HH,a0,32
- dsrl QT,32 /* q=0xffffffff */
- beq DH,HH,.L_bn_div_words_skip_div2
- ddivu zero,a0,DH
- mflo QT
-.L_bn_div_words_skip_div2:
-#undef DH
- dmultu a2,QT
- dsll t3,a0,32
- dsrl AT,a1,32
- or t3,AT
- mflo t0
- mfhi t1
-.L_bn_div_words_inner_loop2:
- sltu t2,t3,t0
- seq t8,HH,t1
- sltu AT,HH,t1
- and t2,t8
- sltu v1,t0,a2
- or AT,t2
- .set noreorder
- beqz AT,.L_bn_div_words_inner_loop2_done
- dsubu t1,v1
- dsubu t0,a2
- b .L_bn_div_words_inner_loop2
- dsubu QT,1
- .set reorder
-.L_bn_div_words_inner_loop2_done:
-#undef HH
-
- dsubu a0,t3,t0
- or v0,QT
- dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */
- dsrl a2,t9 /* restore a2 */
- jr ra
-#undef QT
-END(bn_div_words)
-
-#define a_0 t0
-#define a_1 t1
-#define a_2 t2
-#define a_3 t3
-#define b_0 ta0
-#define b_1 ta1
-#define b_2 ta2
-#define b_3 ta3
-
-#define a_4 s0
-#define a_5 s2
-#define a_6 s4
-#define a_7 a1 /* once we load a[7] we don't need a anymore */
-#define b_4 s1
-#define b_5 s3
-#define b_6 s5
-#define b_7 a2 /* once we load b[7] we don't need b anymore */
-
-#define t_1 t8
-#define t_2 t9
-
-#define c_1 v0
-#define c_2 v1
-#define c_3 a3
-
-#define FRAME_SIZE 48
-
-.align 5
-LEAF(bn_mul_comba8)
- .set noreorder
- PTR_SUB sp,FRAME_SIZE
- .frame sp,64,ra
- .set reorder
- ld a_0,0(a1) /* If compiled with -mips3 option on
- * R5000 box assembler barks on this
- * line with "shouldn't have mult/div
- * as last instruction in bb (R10K
- * bug)" warning. If anybody out there
- * has a clue about how to circumvent
- * this do send me a note.
- * <appro@fy.chalmers.se>
- */
- ld b_0,0(a2)
- ld a_1,8(a1)
- ld a_2,16(a1)
- ld a_3,24(a1)
- ld b_1,8(a2)
- ld b_2,16(a2)
- ld b_3,24(a2)
- dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
- sd s0,0(sp)
- sd s1,8(sp)
- sd s2,16(sp)
- sd s3,24(sp)
- sd s4,32(sp)
- sd s5,40(sp)
- mflo c_1
- mfhi c_2
-
- dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
- ld a_4,32(a1)
- ld a_5,40(a1)
- ld a_6,48(a1)
- ld a_7,56(a1)
- ld b_4,32(a2)
- ld b_5,40(a2)
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu c_3,t_2,AT
- dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
- ld b_6,48(a2)
- ld b_7,56(a2)
- sd c_1,0(a0) /* r[0]=c1; */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu c_1,c_3,t_2
- sd c_2,8(a0) /* r[1]=c2; */
-
- dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu c_2,c_1,t_2
- dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,16(a0) /* r[2]=c3; */
-
- dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu c_3,c_2,t_2
- dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,24(a0) /* r[3]=c1; */
-
- dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu c_1,c_3,t_2
- dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,32(a0) /* r[4]=c2; */
-
- dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu c_2,c_1,t_2
- dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,40(a0) /* r[5]=c3; */
-
- dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu c_3,c_2,t_2
- dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,48(a0) /* r[6]=c1; */
-
- dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu c_1,c_3,t_2
- dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,56(a0) /* r[7]=c2; */
-
- dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu c_2,c_1,t_2
- dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,64(a0) /* r[8]=c3; */
-
- dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu c_3,c_2,t_2
- dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,72(a0) /* r[9]=c1; */
-
- dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu c_1,c_3,t_2
- dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,80(a0) /* r[10]=c2; */
-
- dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu c_2,c_1,t_2
- dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,88(a0) /* r[11]=c3; */
-
- dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu c_3,c_2,t_2
- dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,96(a0) /* r[12]=c1; */
-
- dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu c_1,c_3,t_2
- dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,104(a0) /* r[13]=c2; */
-
- dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
- ld s0,0(sp)
- ld s1,8(sp)
- ld s2,16(sp)
- ld s3,24(sp)
- ld s4,32(sp)
- ld s5,40(sp)
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sd c_3,112(a0) /* r[14]=c3; */
- sd c_1,120(a0) /* r[15]=c1; */
-
- PTR_ADD sp,FRAME_SIZE
-
- jr ra
-END(bn_mul_comba8)
-
-.align 5
-LEAF(bn_mul_comba4)
- .set reorder
- ld a_0,0(a1)
- ld b_0,0(a2)
- ld a_1,8(a1)
- ld a_2,16(a1)
- dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
- ld a_3,24(a1)
- ld b_1,8(a2)
- ld b_2,16(a2)
- ld b_3,24(a2)
- mflo c_1
- mfhi c_2
- sd c_1,0(a0)
-
- dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu c_3,t_2,AT
- dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu c_1,c_3,t_2
- sd c_2,8(a0)
-
- dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu c_2,c_1,t_2
- dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,16(a0)
-
- dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu c_3,c_2,t_2
- dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,24(a0)
-
- dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu c_1,c_3,t_2
- dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,32(a0)
-
- dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu c_2,c_1,t_2
- dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,40(a0)
-
- dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sd c_1,48(a0)
- sd c_2,56(a0)
-
- jr ra
-END(bn_mul_comba4)
-
-#undef a_4
-#undef a_5
-#undef a_6
-#undef a_7
-#define a_4 b_0
-#define a_5 b_1
-#define a_6 b_2
-#define a_7 b_3
-
-.align 5
-LEAF(bn_sqr_comba8)
- .set reorder
- ld a_0,0(a1)
- ld a_1,8(a1)
- ld a_2,16(a1)
- ld a_3,24(a1)
-
- dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
- ld a_4,32(a1)
- ld a_5,40(a1)
- ld a_6,48(a1)
- ld a_7,56(a1)
- mflo c_1
- mfhi c_2
- sd c_1,0(a0)
-
- dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt c_1,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu c_3,t_2,AT
- sd c_2,8(a0)
-
- dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt c_2,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,16(a0)
-
- dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt c_3,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_3,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,24(a0)
-
- dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt c_1,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_1,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,32(a0)
-
- dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt c_2,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_2,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_2,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,40(a0)
-
- dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt c_3,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_3,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_3,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,48(a0)
-
- dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt c_1,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_1,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_1,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_1,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,56(a0)
-
- dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt c_2,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_2,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_2,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,64(a0)
-
- dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt c_3,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_3,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_3,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,72(a0)
-
- dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt c_1,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_1,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,80(a0)
-
- dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt c_2,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_2,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,88(a0)
-
- dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt c_3,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,96(a0)
-
- dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt c_1,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,104(a0)
-
- dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sd c_3,112(a0)
- sd c_1,120(a0)
-
- jr ra
-END(bn_sqr_comba8)
-
-.align 5
-LEAF(bn_sqr_comba4)
- .set reorder
- ld a_0,0(a1)
- ld a_1,8(a1)
- ld a_2,16(a1)
- ld a_3,24(a1)
- dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
- mflo c_1
- mfhi c_2
- sd c_1,0(a0)
-
- dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt c_1,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu c_3,t_2,AT
- sd c_2,8(a0)
-
- dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt c_2,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,16(a0)
-
- dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt c_3,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- dmultu a_1,a_2 /* mul_add_c(a2[1],b[2],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- slt AT,t_2,zero
- daddu c_3,AT
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sltu AT,c_2,t_2
- daddu c_3,AT
- sd c_1,24(a0)
-
- dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- slt c_1,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
- mflo t_1
- mfhi t_2
- daddu c_2,t_1
- sltu AT,c_2,t_1
- daddu t_2,AT
- daddu c_3,t_2
- sltu AT,c_3,t_2
- daddu c_1,AT
- sd c_2,32(a0)
-
- dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
- mflo t_1
- mfhi t_2
- slt c_2,t_2,zero
- dsll t_2,1
- slt a2,t_1,zero
- daddu t_2,a2
- dsll t_1,1
- daddu c_3,t_1
- sltu AT,c_3,t_1
- daddu t_2,AT
- daddu c_1,t_2
- sltu AT,c_1,t_2
- daddu c_2,AT
- sd c_3,40(a0)
-
- dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
- mflo t_1
- mfhi t_2
- daddu c_1,t_1
- sltu AT,c_1,t_1
- daddu t_2,AT
- daddu c_2,t_2
- sd c_1,48(a0)
- sd c_2,56(a0)
-
- jr ra
-END(bn_sqr_comba4)
diff --git a/app/openssl/crypto/bn/asm/pa-risc2.S b/app/openssl/crypto/bn/asm/pa-risc2.S
deleted file mode 100644
index f3b16290..00000000
--- a/app/openssl/crypto/bn/asm/pa-risc2.S
+++ /dev/null
@@ -1,1618 +0,0 @@
-;
-; PA-RISC 2.0 implementation of bn_asm code, based on the
-; 64-bit version of the code. This code is effectively the
-; same as the 64-bit version except the register model is
-; slightly different given all values must be 32-bit between
-; function calls. Thus the 64-bit return values are returned
-; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
-;
-;
-; This code is approximately 2x faster than the C version
-; for RSA/DSA.
-;
-; See http://devresource.hp.com/ for more details on the PA-RISC
-; architecture. Also see the book "PA-RISC 2.0 Architecture"
-; by Gerry Kane for information on the instruction set architecture.
-;
-; Code written by Chris Ruemmler (with some help from the HP C
-; compiler).
-;
-; The code compiles with HP's assembler
-;
-
- .level 2.0N
- .space $TEXT$
- .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
-
-;
-; Global Register definitions used for the routines.
-;
-; Some information about HP's runtime architecture for 32-bits.
-;
-; "Caller save" means the calling function must save the register
-; if it wants the register to be preserved.
-; "Callee save" means if a function uses the register, it must save
-; the value before using it.
-;
-; For the floating point registers
-;
-; "caller save" registers: fr4-fr11, fr22-fr31
-; "callee save" registers: fr12-fr21
-; "special" registers: fr0-fr3 (status and exception registers)
-;
-; For the integer registers
-; value zero : r0
-; "caller save" registers: r1,r19-r26
-; "callee save" registers: r3-r18
-; return register : r2 (rp)
-; return values ; r28,r29 (ret0,ret1)
-; Stack pointer ; r30 (sp)
-; millicode return ptr ; r31 (also a caller save register)
-
-
-;
-; Arguments to the routines
-;
-r_ptr .reg %r26
-a_ptr .reg %r25
-b_ptr .reg %r24
-num .reg %r24
-n .reg %r23
-
-;
-; Note that the "w" argument for bn_mul_add_words and bn_mul_words
-; is passed on the stack at a delta of -56 from the top of stack
-; as the routine is entered.
-;
-
-;
-; Globals used in some routines
-;
-
-top_overflow .reg %r23
-high_mask .reg %r22 ; value 0xffffffff80000000L
-
-
-;------------------------------------------------------------------------------
-;
-; bn_mul_add_words
-;
-;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
-; int num, BN_ULONG w)
-;
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg3 = num
-; -56(sp) = w
-;
-; Local register definitions
-;
-
-fm1 .reg %fr22
-fm .reg %fr23
-ht_temp .reg %fr24
-ht_temp_1 .reg %fr25
-lt_temp .reg %fr26
-lt_temp_1 .reg %fr27
-fm1_1 .reg %fr28
-fm_1 .reg %fr29
-
-fw_h .reg %fr7L
-fw_l .reg %fr7R
-fw .reg %fr7
-
-fht_0 .reg %fr8L
-flt_0 .reg %fr8R
-t_float_0 .reg %fr8
-
-fht_1 .reg %fr9L
-flt_1 .reg %fr9R
-t_float_1 .reg %fr9
-
-tmp_0 .reg %r31
-tmp_1 .reg %r21
-m_0 .reg %r20
-m_1 .reg %r19
-ht_0 .reg %r1
-ht_1 .reg %r3
-lt_0 .reg %r4
-lt_1 .reg %r5
-m1_0 .reg %r6
-m1_1 .reg %r7
-rp_val .reg %r8
-rp_val_1 .reg %r9
-
-bn_mul_add_words
- .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
- .proc
- .callinfo frame=128
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP ; Needed to make the loop 16-byte aligned
- NOP ; needed to make the loop 16-byte aligned
-
- STD %r5,16(%sp) ; save r5
- NOP
- STD %r6,24(%sp) ; save r6
- STD %r7,32(%sp) ; save r7
-
- STD %r8,40(%sp) ; save r8
- STD %r9,48(%sp) ; save r9
- COPY %r0,%ret1 ; return 0 by default
- DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
-
- CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; The loop is unrolled twice, so if there is only 1 number
- ; then go straight to the cleanup code.
- ;
- CMPIB,= 1,num,bn_mul_add_words_single_top
- FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
- ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
- ; two 32-bit mutiplies can be issued per cycle.
- ;
-bn_mul_add_words_unroll2
-
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- LDD 0(r_ptr),rp_val ; rp[0]
- LDD 8(r_ptr),rp_val_1 ; rp[1]
-
- XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
- XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
- FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
-
- XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
- XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m[0]
- FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
-
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
- XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
- FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
-
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
- FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
-
- LDD -8(%sp),m_0 ; m[0]
- LDD -40(%sp),m_1 ; m[1]
- LDD -16(%sp),m1_0 ; m1[0]
- LDD -48(%sp),m1_1 ; m1[1]
-
- LDD -24(%sp),ht_0 ; ht[0]
- LDD -56(%sp),ht_1 ; ht[1]
- ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
- ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
-
- LDD -32(%sp),lt_0
- LDD -64(%sp),lt_1
- CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
- ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
-
- CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
- ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
- EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
-
- EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
- DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
- ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
- ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
-
- ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
-
- ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c;
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
-
- LDO -2(num),num ; num = num - 2;
- ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
- STD lt_0,0(r_ptr) ; rp[0] = lt[0]
-
- ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
- ADD,DC ht_1,%r0,%ret1 ; ht[1]++
- LDO 16(a_ptr),a_ptr ; a_ptr += 2
-
- STD lt_1,8(r_ptr) ; rp[1] = lt[1]
- CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
- LDO 16(r_ptr),r_ptr ; r_ptr += 2
-
- CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_mul_add_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- LDD 0(r_ptr),rp_val ; rp[0]
- LDO 8(a_ptr),a_ptr ; a_ptr++
- XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
-
- LDD -8(%sp),m_0
- LDD -16(%sp),m1_0 ; m1 = temp1
- ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
- LDD -24(%sp),ht_0
- LDD -32(%sp),lt_0
-
- CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
- ADD %ret1,tmp_0,lt_0 ; lt = lt + c;
- ADD,DC ht_0,%r0,ht_0 ; ht++
- ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
- ADD,DC ht_0,%r0,%ret1 ; ht++
- STD lt_0,0(r_ptr) ; rp[0] = lt
-
-bn_mul_add_words_exit
- .EXIT
-
- EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
- LDD -80(%sp),%r9 ; restore r9
- LDD -88(%sp),%r8 ; restore r8
- LDD -96(%sp),%r7 ; restore r7
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3 ; restore r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
-;
-; arg0 = rp
-; arg1 = ap
-; arg3 = num
-; w on stack at -56(sp)
-
-bn_mul_words
- .proc
- .callinfo frame=128
- .entry
- .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP
- STD %r5,16(%sp) ; save r5
-
- STD %r6,24(%sp) ; save r6
- STD %r7,32(%sp) ; save r7
- COPY %r0,%ret1 ; return 0 by default
- DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
-
- CMPIB,>= 0,num,bn_mul_words_exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; See if only 1 word to do, thus just do cleanup
- ;
- CMPIB,= 1,num,bn_mul_words_single_top
- FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l)
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
- ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
- ; two 32-bit mutiplies can be issued per cycle.
- ;
-bn_mul_words_unroll2
-
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
- XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
-
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- FSTD fm1_1,-48(%sp) ; -48(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
-
- FSTD fm,-8(%sp) ; -8(sp) = m
- FSTD fm_1,-40(%sp) ; -40(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
- XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
-
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
-
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
- FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
- LDD -8(%sp),m_0
- LDD -40(%sp),m_1
-
- LDD -16(%sp),m1_0
- LDD -48(%sp),m1_1
- LDD -24(%sp),ht_0
- LDD -56(%sp),ht_1
-
- ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
- ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
- LDD -32(%sp),lt_0
- LDD -64(%sp),lt_1
-
- CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
- CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
- ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
- EXTRD,U tmp_1,31,32,m_1 ; m>>32
- DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
- ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
- ADD,DC ht_1,%r0,ht_1 ; ht++
- ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1);
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
- ADD,DC ht_1,%r0,ht_1 ; ht++
- STD lt_0,0(r_ptr) ; rp[0] = lt
- STD lt_1,8(r_ptr) ; rp[1] = lt
-
- COPY ht_1,%ret1 ; carry = ht
- LDO -2(num),num ; num = num - 2;
- LDO 16(a_ptr),a_ptr ; ap += 2
- CMPIB,<= 2,num,bn_mul_words_unroll2
- LDO 16(r_ptr),r_ptr ; rp++
-
- CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_mul_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
-
- XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
-
- LDD -8(%sp),m_0
- LDD -16(%sp),m1_0
- ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
- LDD -24(%sp),ht_0
- LDD -32(%sp),lt_0
-
- CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD %ret1,lt_0,lt_0 ; lt = lt + c;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- COPY ht_0,%ret1 ; copy carry
- STD lt_0,0(r_ptr) ; rp[0] = lt
-
-bn_mul_words_exit
- .EXIT
- EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
- LDD -96(%sp),%r7 ; restore r7
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3 ; restore r3
- .PROCEND
-
-;----------------------------------------------------------------------------
-;
-;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = num
-;
-
-bn_sqr_words
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP
- STD %r5,16(%sp) ; save r5
-
- CMPIB,>= 0,num,bn_sqr_words_exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; If only 1, the goto straight to cleanup
- ;
- CMPIB,= 1,num,bn_sqr_words_single_top
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-
-bn_sqr_words_unroll2
- FLDD 0(a_ptr),t_float_0 ; a[0]
- FLDD 8(a_ptr),t_float_1 ; a[1]
- XMPYU fht_0,flt_0,fm ; m[0]
- XMPYU fht_1,flt_1,fm_1 ; m[1]
-
- FSTD fm,-24(%sp) ; store m[0]
- FSTD fm_1,-56(%sp) ; store m[1]
- XMPYU flt_0,flt_0,lt_temp ; lt[0]
- XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
-
- FSTD lt_temp,-16(%sp) ; store lt[0]
- FSTD lt_temp_1,-48(%sp) ; store lt[1]
- XMPYU fht_0,fht_0,ht_temp ; ht[0]
- XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
-
- FSTD ht_temp,-8(%sp) ; store ht[0]
- FSTD ht_temp_1,-40(%sp) ; store ht[1]
- LDD -24(%sp),m_0
- LDD -56(%sp),m_1
-
- AND m_0,high_mask,tmp_0 ; m[0] & Mask
- AND m_1,high_mask,tmp_1 ; m[1] & Mask
- DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
- DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
-
- LDD -16(%sp),lt_0
- LDD -48(%sp),lt_1
- EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
- EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
-
- LDD -8(%sp),ht_0
- LDD -40(%sp),ht_1
- ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
- ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
-
- ADD lt_0,m_0,lt_0 ; lt = lt+m
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- STD lt_0,0(r_ptr) ; rp[0] = lt[0]
- STD ht_0,8(r_ptr) ; rp[1] = ht[1]
-
- ADD lt_1,m_1,lt_1 ; lt = lt+m
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
- STD lt_1,16(r_ptr) ; rp[2] = lt[1]
- STD ht_1,24(r_ptr) ; rp[3] = ht[1]
-
- LDO -2(num),num ; num = num - 2;
- LDO 16(a_ptr),a_ptr ; ap += 2
- CMPIB,<= 2,num,bn_sqr_words_unroll2
- LDO 32(r_ptr),r_ptr ; rp += 4
-
- CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_sqr_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
-
- XMPYU fht_0,flt_0,fm ; m
- FSTD fm,-24(%sp) ; store m
-
- XMPYU flt_0,flt_0,lt_temp ; lt
- FSTD lt_temp,-16(%sp) ; store lt
-
- XMPYU fht_0,fht_0,ht_temp ; ht
- FSTD ht_temp,-8(%sp) ; store ht
-
- LDD -24(%sp),m_0 ; load m
- AND m_0,high_mask,tmp_0 ; m & Mask
- DEPD,Z m_0,30,31,m_0 ; m << 32+1
- LDD -16(%sp),lt_0 ; lt
-
- LDD -8(%sp),ht_0 ; ht
- EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
- ADD m_0,lt_0,lt_0 ; lt = lt+m
- ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- STD lt_0,0(r_ptr) ; rp[0] = lt
- STD ht_0,8(r_ptr) ; rp[1] = ht
-
-bn_sqr_words_exit
- .EXIT
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = bp
-; arg3 = n
-
-t .reg %r22
-b .reg %r21
-l .reg %r20
-
-bn_add_words
- .proc
- .entry
- .callinfo
- .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .align 64
-
- CMPIB,>= 0,n,bn_add_words_exit
- COPY %r0,%ret1 ; return 0 by default
-
- ;
- ; If 2 or more numbers do the loop
- ;
- CMPIB,= 1,n,bn_add_words_single_top
- NOP
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-bn_add_words_unroll2
- LDD 0(a_ptr),t
- LDD 0(b_ptr),b
- ADD t,%ret1,t ; t = t+c;
- ADD,DC %r0,%r0,%ret1 ; set c to carry
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret1,%r0,%ret1 ; c+= carry
- STD l,0(r_ptr)
-
- LDD 8(a_ptr),t
- LDD 8(b_ptr),b
- ADD t,%ret1,t ; t = t+c;
- ADD,DC %r0,%r0,%ret1 ; set c to carry
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret1,%r0,%ret1 ; c+= carry
- STD l,8(r_ptr)
-
- LDO -2(n),n
- LDO 16(a_ptr),a_ptr
- LDO 16(b_ptr),b_ptr
-
- CMPIB,<= 2,n,bn_add_words_unroll2
- LDO 16(r_ptr),r_ptr
-
- CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
-
-bn_add_words_single_top
- LDD 0(a_ptr),t
- LDD 0(b_ptr),b
-
- ADD t,%ret1,t ; t = t+c;
- ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??)
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret1,%r0,%ret1 ; c+= carry
- STD l,0(r_ptr)
-
-bn_add_words_exit
- .EXIT
- BVE (%rp)
- EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = bp
-; arg3 = n
-
-t1 .reg %r22
-t2 .reg %r21
-sub_tmp1 .reg %r20
-sub_tmp2 .reg %r19
-
-
-bn_sub_words
- .proc
- .callinfo
- .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- CMPIB,>= 0,n,bn_sub_words_exit
- COPY %r0,%ret1 ; return 0 by default
-
- ;
- ; If 2 or more numbers do the loop
- ;
- CMPIB,= 1,n,bn_sub_words_single_top
- NOP
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-bn_sub_words_unroll2
- LDD 0(a_ptr),t1
- LDD 0(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
-
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret1
- STD sub_tmp1,0(r_ptr)
-
- LDD 8(a_ptr),t1
- LDD 8(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret1
- STD sub_tmp1,8(r_ptr)
-
- LDO -2(n),n
- LDO 16(a_ptr),a_ptr
- LDO 16(b_ptr),b_ptr
-
- CMPIB,<= 2,n,bn_sub_words_unroll2
- LDO 16(r_ptr),r_ptr
-
- CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
-
-bn_sub_words_single_top
- LDD 0(a_ptr),t1
- LDD 0(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c;
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret1
-
- STD sub_tmp1,0(r_ptr)
-
-bn_sub_words_exit
- .EXIT
- BVE (%rp)
- EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;------------------------------------------------------------------------------
-;
-; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
-;
-; arg0 = h
-; arg1 = l
-; arg2 = d
-;
-; This is mainly just output from the HP C compiler.
-;
-;------------------------------------------------------------------------------
-bn_div_words
- .PROC
- .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
- .IMPORT BN_num_bits_word,CODE
- ;--- not PIC .IMPORT __iob,DATA
- ;--- not PIC .IMPORT fprintf,CODE
- .IMPORT abort,CODE
- .IMPORT $$div2U,MILLICODE
- .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
- .ENTRY
- STW %r2,-20(%r30) ;offset 0x8ec
- STW,MA %r3,192(%r30) ;offset 0x8f0
- STW %r4,-188(%r30) ;offset 0x8f4
- DEPD %r5,31,32,%r6 ;offset 0x8f8
- STD %r6,-184(%r30) ;offset 0x8fc
- DEPD %r7,31,32,%r8 ;offset 0x900
- STD %r8,-176(%r30) ;offset 0x904
- STW %r9,-168(%r30) ;offset 0x908
- LDD -248(%r30),%r3 ;offset 0x90c
- COPY %r26,%r4 ;offset 0x910
- COPY %r24,%r5 ;offset 0x914
- DEPD %r25,31,32,%r4 ;offset 0x918
- CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c
- DEPD %r23,31,32,%r5 ;offset 0x920
- MOVIB,TR -1,%r29,$00060002 ;offset 0x924
- EXTRD,U %r29,31,32,%r28 ;offset 0x928
-$0006002A
- LDO -1(%r29),%r29 ;offset 0x92c
- SUB %r23,%r7,%r23 ;offset 0x930
-$00060024
- SUB %r4,%r31,%r25 ;offset 0x934
- AND %r25,%r19,%r26 ;offset 0x938
- CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c
- DEPD,Z %r25,31,32,%r20 ;offset 0x940
- OR %r20,%r24,%r21 ;offset 0x944
- CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948
- SUB %r31,%r2,%r31 ;offset 0x94c
-$00060046
-$0006002E
- DEPD,Z %r23,31,32,%r25 ;offset 0x950
- EXTRD,U %r23,31,32,%r26 ;offset 0x954
- AND %r25,%r19,%r24 ;offset 0x958
- ADD,L %r31,%r26,%r31 ;offset 0x95c
- CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960
- LDO 1(%r31),%r31 ;offset 0x964
-$00060032
- CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968
- LDO -1(%r29),%r29 ;offset 0x96c
- ADD,L %r4,%r3,%r4 ;offset 0x970
-$00060036
- ADDIB,=,N -1,%r8,$D0 ;offset 0x974
- SUB %r5,%r24,%r28 ;offset 0x978
-$0006003A
- SUB %r4,%r31,%r24 ;offset 0x97c
- SHRPD %r24,%r28,32,%r4 ;offset 0x980
- DEPD,Z %r29,31,32,%r9 ;offset 0x984
- DEPD,Z %r28,31,32,%r5 ;offset 0x988
-$0006001C
- EXTRD,U %r4,31,32,%r31 ;offset 0x98c
- CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990
- MOVB,TR %r6,%r29,$D1 ;offset 0x994
- STD %r29,-152(%r30) ;offset 0x998
-$0006000C
- EXTRD,U %r3,31,32,%r25 ;offset 0x99c
- COPY %r3,%r26 ;offset 0x9a0
- EXTRD,U %r3,31,32,%r9 ;offset 0x9a4
- EXTRD,U %r4,31,32,%r8 ;offset 0x9a8
- .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28;
- B,L BN_num_bits_word,%r2 ;offset 0x9ac
- EXTRD,U %r5,31,32,%r7 ;offset 0x9b0
- LDI 64,%r20 ;offset 0x9b4
- DEPD %r7,31,32,%r5 ;offset 0x9b8
- DEPD %r8,31,32,%r4 ;offset 0x9bc
- DEPD %r9,31,32,%r3 ;offset 0x9c0
- CMPB,= %r28,%r20,$00060012 ;offset 0x9c4
- COPY %r28,%r24 ;offset 0x9c8
- MTSARCM %r24 ;offset 0x9cc
- DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0
- CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4
-$00060012
- SUBI 64,%r24,%r31 ;offset 0x9d8
- CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc
- SUB %r4,%r3,%r4 ;offset 0x9e0
-$00060016
- CMPB,= %r31,%r0,$0006001A ;offset 0x9e4
- COPY %r0,%r9 ;offset 0x9e8
- MTSARCM %r31 ;offset 0x9ec
- DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0
- SUBI 64,%r31,%r26 ;offset 0x9f4
- MTSAR %r26 ;offset 0x9f8
- SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc
- MTSARCM %r31 ;offset 0xa00
- DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04
-$0006001A
- DEPDI,Z -1,31,32,%r19 ;offset 0xa08
- AND %r3,%r19,%r29 ;offset 0xa0c
- EXTRD,U %r29,31,32,%r2 ;offset 0xa10
- DEPDI,Z -1,63,32,%r6 ;offset 0xa14
- MOVIB,TR 2,%r8,$0006001C ;offset 0xa18
- EXTRD,U %r3,63,32,%r7 ;offset 0xa1c
-$D2
- ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20
- ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24
- ;--- not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28
- ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28;
- ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c
- ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30
- .CALL ;
- B,L abort,%r2 ;offset 0xa34
- NOP ;offset 0xa38
- B $D3 ;offset 0xa3c
- LDW -212(%r30),%r2 ;offset 0xa40
-$00060020
- COPY %r4,%r26 ;offset 0xa44
- EXTRD,U %r4,31,32,%r25 ;offset 0xa48
- COPY %r2,%r24 ;offset 0xa4c
- .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
- B,L $$div2U,%r31 ;offset 0xa50
- EXTRD,U %r2,31,32,%r23 ;offset 0xa54
- DEPD %r28,31,32,%r29 ;offset 0xa58
-$00060022
- STD %r29,-152(%r30) ;offset 0xa5c
-$D1
- AND %r5,%r19,%r24 ;offset 0xa60
- EXTRD,U %r24,31,32,%r24 ;offset 0xa64
- STW %r2,-160(%r30) ;offset 0xa68
- STW %r7,-128(%r30) ;offset 0xa6c
- FLDD -152(%r30),%fr4 ;offset 0xa70
- FLDD -152(%r30),%fr7 ;offset 0xa74
- FLDW -160(%r30),%fr8L ;offset 0xa78
- FLDW -128(%r30),%fr5L ;offset 0xa7c
- XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80
- FSTD %fr10,-136(%r30) ;offset 0xa84
- XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88
- FSTD %fr22,-144(%r30) ;offset 0xa8c
- XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90
- XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94
- FSTD %fr11,-112(%r30) ;offset 0xa98
- FSTD %fr23,-120(%r30) ;offset 0xa9c
- LDD -136(%r30),%r28 ;offset 0xaa0
- DEPD,Z %r28,31,32,%r31 ;offset 0xaa4
- LDD -144(%r30),%r20 ;offset 0xaa8
- ADD,L %r20,%r31,%r31 ;offset 0xaac
- LDD -112(%r30),%r22 ;offset 0xab0
- DEPD,Z %r22,31,32,%r22 ;offset 0xab4
- LDD -120(%r30),%r21 ;offset 0xab8
- B $00060024 ;offset 0xabc
- ADD,L %r21,%r22,%r23 ;offset 0xac0
-$D0
- OR %r9,%r29,%r29 ;offset 0xac4
-$00060040
- EXTRD,U %r29,31,32,%r28 ;offset 0xac8
-$00060002
-$L2
- LDW -212(%r30),%r2 ;offset 0xacc
-$D3
- LDW -168(%r30),%r9 ;offset 0xad0
- LDD -176(%r30),%r8 ;offset 0xad4
- EXTRD,U %r8,31,32,%r7 ;offset 0xad8
- LDD -184(%r30),%r6 ;offset 0xadc
- EXTRD,U %r6,31,32,%r5 ;offset 0xae0
- LDW -188(%r30),%r4 ;offset 0xae4
- BVE (%r2) ;offset 0xae8
- .EXIT
- LDW,MB -192(%r30),%r3 ;offset 0xaec
- .PROCEND ;in=23,25;out=28,29;fpin=105,107;
-
-
-
-
-;----------------------------------------------------------------------------
-;
-; Registers to hold 64-bit values to manipulate. The "L" part
-; of the register corresponds to the upper 32-bits, while the "R"
-; part corresponds to the lower 32-bits
-;
-; Note, that when using b6 and b7, the code must save these before
-; using them because they are callee save registers
-;
-;
-; Floating point registers to use to save values that
-; are manipulated. These don't collide with ftemp1-6 and
-; are all caller save registers
-;
-a0 .reg %fr22
-a0L .reg %fr22L
-a0R .reg %fr22R
-
-a1 .reg %fr23
-a1L .reg %fr23L
-a1R .reg %fr23R
-
-a2 .reg %fr24
-a2L .reg %fr24L
-a2R .reg %fr24R
-
-a3 .reg %fr25
-a3L .reg %fr25L
-a3R .reg %fr25R
-
-a4 .reg %fr26
-a4L .reg %fr26L
-a4R .reg %fr26R
-
-a5 .reg %fr27
-a5L .reg %fr27L
-a5R .reg %fr27R
-
-a6 .reg %fr28
-a6L .reg %fr28L
-a6R .reg %fr28R
-
-a7 .reg %fr29
-a7L .reg %fr29L
-a7R .reg %fr29R
-
-b0 .reg %fr30
-b0L .reg %fr30L
-b0R .reg %fr30R
-
-b1 .reg %fr31
-b1L .reg %fr31L
-b1R .reg %fr31R
-
-;
-; Temporary floating point variables, these are all caller save
-; registers
-;
-ftemp1 .reg %fr4
-ftemp2 .reg %fr5
-ftemp3 .reg %fr6
-ftemp4 .reg %fr7
-
-;
-; The B set of registers when used.
-;
-
-b2 .reg %fr8
-b2L .reg %fr8L
-b2R .reg %fr8R
-
-b3 .reg %fr9
-b3L .reg %fr9L
-b3R .reg %fr9R
-
-b4 .reg %fr10
-b4L .reg %fr10L
-b4R .reg %fr10R
-
-b5 .reg %fr11
-b5L .reg %fr11L
-b5R .reg %fr11R
-
-b6 .reg %fr12
-b6L .reg %fr12L
-b6R .reg %fr12R
-
-b7 .reg %fr13
-b7L .reg %fr13L
-b7R .reg %fr13R
-
-c1 .reg %r21 ; only reg
-temp1 .reg %r20 ; only reg
-temp2 .reg %r19 ; only reg
-temp3 .reg %r31 ; only reg
-
-m1 .reg %r28
-c2 .reg %r23
-high_one .reg %r1
-ht .reg %r6
-lt .reg %r5
-m .reg %r4
-c3 .reg %r3
-
-SQR_ADD_C .macro A0L,A0R,C1,C2,C3
- XMPYU A0L,A0R,ftemp1 ; m
- FSTD ftemp1,-24(%sp) ; store m
-
- XMPYU A0R,A0R,ftemp2 ; lt
- FSTD ftemp2,-16(%sp) ; store lt
-
- XMPYU A0L,A0L,ftemp3 ; ht
- FSTD ftemp3,-8(%sp) ; store ht
-
- LDD -24(%sp),m ; load m
- AND m,high_mask,temp2 ; m & Mask
- DEPD,Z m,30,31,temp3 ; m << 32+1
- LDD -16(%sp),lt ; lt
-
- LDD -8(%sp),ht ; ht
- EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
- ADD temp3,lt,lt ; lt = lt+m
- ADD,L ht,temp1,ht ; ht += temp1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C2,ht,C2 ; c2=c2+ht
- ADD,DC C3,%r0,C3 ; c3++
-.endm
-
-SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
- XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
- FSTD ftemp1,-16(%sp) ;
- XMPYU A0R,A1L,ftemp2 ; m = bh*lt
- FSTD ftemp2,-8(%sp) ;
- XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
- FSTD ftemp3,-32(%sp)
- XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
- FSTD ftemp4,-24(%sp) ;
-
- LDD -8(%sp),m ; r21 = m
- LDD -16(%sp),m1 ; r19 = m1
- ADD,L m,m1,m ; m+m1
-
- DEPD,Z m,31,32,temp3 ; (m+m1<<32)
- LDD -24(%sp),ht ; r24 = ht
-
- CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
- ADD,L ht,high_one,ht ; ht+=high_one
-
- EXTRD,U m,31,32,temp1 ; m >> 32
- LDD -32(%sp),lt ; lt
- ADD,L ht,temp1,ht ; ht+= m>>32
- ADD lt,temp3,lt ; lt = lt+m1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD ht,ht,ht ; ht=ht+ht;
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-
- ADD lt,lt,lt ; lt=lt+lt;
- ADD,DC ht,%r0,ht ; add in carry (ht++)
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
- LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
-
- ADD C2,ht,C2 ; c2 = c2 + ht
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-.endm
-
-;
-;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
-; arg0 = r_ptr
-; arg1 = a_ptr
-;
-
-bn_sqr_comba8
- .PROC
- .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .ENTRY
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- SQR_ADD_C a0L,a0R,c1,c2,c3
- STD c1,0(r_ptr) ; r[0] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
- STD c2,8(r_ptr) ; r[1] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
- STD c3,16(r_ptr) ; r[2] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
- SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
- STD c1,24(r_ptr) ; r[3] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
- SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
- STD c2,32(r_ptr) ; r[4] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
- SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
- STD c3,40(r_ptr) ; r[5] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a3L,a3R,c1,c2,c3
- SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
- SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
- SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
- STD c1,48(r_ptr) ; r[6] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
- SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
- SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
- STD c2,56(r_ptr) ; r[7] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a4L,a4R,c3,c1,c2
- SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
- SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
- SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
- STD c3,64(r_ptr) ; r[8] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
- SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
- SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
- STD c1,72(r_ptr) ; r[9] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a5L,a5R,c2,c3,c1
- SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
- SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
- STD c2,80(r_ptr) ; r[10] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
- SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
- STD c3,88(r_ptr) ; r[11] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a6L,a6R,c1,c2,c3
- SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
- STD c1,96(r_ptr) ; r[12] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
- STD c2,104(r_ptr) ; r[13] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a7L,a7R,c3,c1,c2
- STD c3, 112(r_ptr) ; r[14] = c3
- STD c1, 120(r_ptr) ; r[15] = c1
-
- .EXIT
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-;-----------------------------------------------------------------------------
-;
-;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
-; arg0 = r_ptr
-; arg1 = a_ptr
-;
-
-bn_sqr_comba4
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- SQR_ADD_C a0L,a0R,c1,c2,c3
-
- STD c1,0(r_ptr) ; r[0] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
-
- STD c2,8(r_ptr) ; r[1] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
-
- STD c3,16(r_ptr) ; r[2] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
- SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
-
- STD c1,24(r_ptr) ; r[3] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
-
- STD c2,32(r_ptr) ; r[4] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
- STD c3,40(r_ptr) ; r[5] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a3L,a3R,c1,c2,c3
- STD c1,48(r_ptr) ; r[6] = c1;
- STD c2,56(r_ptr) ; r[7] = c2;
-
- .EXIT
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-
-;---------------------------------------------------------------------------
-
-MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
- XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
- FSTD ftemp1,-16(%sp) ;
- XMPYU A0R,B0L,ftemp2 ; m = bh*lt
- FSTD ftemp2,-8(%sp) ;
- XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
- FSTD ftemp3,-32(%sp)
- XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
- FSTD ftemp4,-24(%sp) ;
-
- LDD -8(%sp),m ; r21 = m
- LDD -16(%sp),m1 ; r19 = m1
- ADD,L m,m1,m ; m+m1
-
- DEPD,Z m,31,32,temp3 ; (m+m1<<32)
- LDD -24(%sp),ht ; r24 = ht
-
- CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
- ADD,L ht,high_one,ht ; ht+=high_one
-
- EXTRD,U m,31,32,temp1 ; m >> 32
- LDD -32(%sp),lt ; lt
- ADD,L ht,temp1,ht ; ht+= m>>32
- ADD lt,temp3,lt ; lt = lt+m1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise
-
- ADD C2,ht,C2 ; c2 = c2 + ht
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-.endm
-
-
-;
-;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = b_ptr
-;
-
-bn_mul_comba8
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
- FSTD %fr12,32(%sp) ; save r6
- FSTD %fr13,40(%sp) ; save r7
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- FLDD 0(b_ptr),b0
- FLDD 8(b_ptr),b1
- FLDD 16(b_ptr),b2
- FLDD 24(b_ptr),b3
- FLDD 32(b_ptr),b4
- FLDD 40(b_ptr),b5
- FLDD 48(b_ptr),b6
- FLDD 56(b_ptr),b7
-
- MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
- STD c1,0(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
- STD c2,8(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
- STD c3,16(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
- STD c1,24(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
- STD c2,32(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
- MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
- MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
- STD c3,40(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
- STD c1,48(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
- MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
- STD c2,56(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
- STD c3,64(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
- MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
- MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
- STD c1,72(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
- MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
- MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
- STD c2,80(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
- MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
- STD c3,88(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
- STD c1,96(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
- MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
- STD c2,104(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
- STD c3,112(r_ptr)
- STD c1,120(r_ptr)
-
- .EXIT
- FLDD -88(%sp),%fr13
- FLDD -96(%sp),%fr12
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-;-----------------------------------------------------------------------------
-;
-;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = b_ptr
-;
-
-bn_mul_comba4
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
- FSTD %fr12,32(%sp) ; save r6
- FSTD %fr13,40(%sp) ; save r7
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
-
- FLDD 0(b_ptr),b0
- FLDD 8(b_ptr),b1
- FLDD 16(b_ptr),b2
- FLDD 24(b_ptr),b3
-
- MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
- STD c1,0(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
- STD c2,8(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
- STD c3,16(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
- STD c1,24(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
- STD c2,32(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
- STD c3,40(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
- STD c1,48(r_ptr)
- STD c2,56(r_ptr)
-
- .EXIT
- FLDD -88(%sp),%fr13
- FLDD -96(%sp),%fr12
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-
-;--- not PIC .SPACE $TEXT$
-;--- not PIC .SUBSPA $CODE$
-;--- not PIC .SPACE $PRIVATE$,SORT=16
-;--- not PIC .IMPORT $global$,DATA
-;--- not PIC .SPACE $TEXT$
-;--- not PIC .SUBSPA $CODE$
-;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c
-;--- not PIC C$7
-;--- not PIC .ALIGN 8
-;--- not PIC .STRINGZ "Division would overflow (%d)\n"
- .END
diff --git a/app/openssl/crypto/bn/asm/pa-risc2W.S b/app/openssl/crypto/bn/asm/pa-risc2W.S
deleted file mode 100644
index a9954575..00000000
--- a/app/openssl/crypto/bn/asm/pa-risc2W.S
+++ /dev/null
@@ -1,1605 +0,0 @@
-;
-; PA-RISC 64-bit implementation of bn_asm code
-;
-; This code is approximately 2x faster than the C version
-; for RSA/DSA.
-;
-; See http://devresource.hp.com/ for more details on the PA-RISC
-; architecture. Also see the book "PA-RISC 2.0 Architecture"
-; by Gerry Kane for information on the instruction set architecture.
-;
-; Code written by Chris Ruemmler (with some help from the HP C
-; compiler).
-;
-; The code compiles with HP's assembler
-;
-
- .level 2.0W
- .space $TEXT$
- .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
-
-;
-; Global Register definitions used for the routines.
-;
-; Some information about HP's runtime architecture for 64-bits.
-;
-; "Caller save" means the calling function must save the register
-; if it wants the register to be preserved.
-; "Callee save" means if a function uses the register, it must save
-; the value before using it.
-;
-; For the floating point registers
-;
-; "caller save" registers: fr4-fr11, fr22-fr31
-; "callee save" registers: fr12-fr21
-; "special" registers: fr0-fr3 (status and exception registers)
-;
-; For the integer registers
-; value zero : r0
-; "caller save" registers: r1,r19-r26
-; "callee save" registers: r3-r18
-; return register : r2 (rp)
-; return values ; r28 (ret0,ret1)
-; Stack pointer ; r30 (sp)
-; global data pointer ; r27 (dp)
-; argument pointer ; r29 (ap)
-; millicode return ptr ; r31 (also a caller save register)
-
-
-;
-; Arguments to the routines
-;
-r_ptr .reg %r26
-a_ptr .reg %r25
-b_ptr .reg %r24
-num .reg %r24
-w .reg %r23
-n .reg %r23
-
-
-;
-; Globals used in some routines
-;
-
-top_overflow .reg %r29
-high_mask .reg %r22 ; value 0xffffffff80000000L
-
-
-;------------------------------------------------------------------------------
-;
-; bn_mul_add_words
-;
-;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
-; int num, BN_ULONG w)
-;
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = num
-; arg3 = w
-;
-; Local register definitions
-;
-
-fm1 .reg %fr22
-fm .reg %fr23
-ht_temp .reg %fr24
-ht_temp_1 .reg %fr25
-lt_temp .reg %fr26
-lt_temp_1 .reg %fr27
-fm1_1 .reg %fr28
-fm_1 .reg %fr29
-
-fw_h .reg %fr7L
-fw_l .reg %fr7R
-fw .reg %fr7
-
-fht_0 .reg %fr8L
-flt_0 .reg %fr8R
-t_float_0 .reg %fr8
-
-fht_1 .reg %fr9L
-flt_1 .reg %fr9R
-t_float_1 .reg %fr9
-
-tmp_0 .reg %r31
-tmp_1 .reg %r21
-m_0 .reg %r20
-m_1 .reg %r19
-ht_0 .reg %r1
-ht_1 .reg %r3
-lt_0 .reg %r4
-lt_1 .reg %r5
-m1_0 .reg %r6
-m1_1 .reg %r7
-rp_val .reg %r8
-rp_val_1 .reg %r9
-
-bn_mul_add_words
- .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
- .proc
- .callinfo frame=128
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP ; Needed to make the loop 16-byte aligned
- NOP ; Needed to make the loop 16-byte aligned
-
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
- STD %r7,32(%sp) ; save r7
- STD %r8,40(%sp) ; save r8
-
- STD %r9,48(%sp) ; save r9
- COPY %r0,%ret0 ; return 0 by default
- DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
- STD w,56(%sp) ; store w on stack
-
- CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; The loop is unrolled twice, so if there is only 1 number
- ; then go straight to the cleanup code.
- ;
- CMPIB,= 1,num,bn_mul_add_words_single_top
- FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
- ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
- ; two 32-bit mutiplies can be issued per cycle.
- ;
-bn_mul_add_words_unroll2
-
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- LDD 0(r_ptr),rp_val ; rp[0]
- LDD 8(r_ptr),rp_val_1 ; rp[1]
-
- XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
- XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1[0]
- FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1]
-
- XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h
- XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m[0]
- FSTD fm_1,-40(%sp) ; -40(sp) = m[1]
-
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
- XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp
- FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1
-
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp
- FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1
-
- LDD -8(%sp),m_0 ; m[0]
- LDD -40(%sp),m_1 ; m[1]
- LDD -16(%sp),m1_0 ; m1[0]
- LDD -48(%sp),m1_1 ; m1[1]
-
- LDD -24(%sp),ht_0 ; ht[0]
- LDD -56(%sp),ht_1 ; ht[1]
- ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0];
- ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1];
-
- LDD -32(%sp),lt_0
- LDD -64(%sp),lt_1
- CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0])
- ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32)
-
- CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1])
- ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32)
- EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32
-
- EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32
- DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32
- ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32)
- ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32)
-
- ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0];
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1];
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
-
- ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c;
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0]
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
-
- LDO -2(num),num ; num = num - 2;
- ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c);
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
- STD lt_0,0(r_ptr) ; rp[0] = lt[0]
-
- ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1]
- ADD,DC ht_1,%r0,%ret0 ; ht[1]++
- LDO 16(a_ptr),a_ptr ; a_ptr += 2
-
- STD lt_1,8(r_ptr) ; rp[1] = lt[1]
- CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
- LDO 16(r_ptr),r_ptr ; r_ptr += 2
-
- CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_mul_add_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- LDD 0(r_ptr),rp_val ; rp[0]
- LDO 8(a_ptr),a_ptr ; a_ptr++
- XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
-
- LDD -8(%sp),m_0
- LDD -16(%sp),m1_0 ; m1 = temp1
- ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
- LDD -24(%sp),ht_0
- LDD -32(%sp),lt_0
-
- CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
- ADD %ret0,tmp_0,lt_0 ; lt = lt + c;
- ADD,DC ht_0,%r0,ht_0 ; ht++
- ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0]
- ADD,DC ht_0,%r0,%ret0 ; ht++
- STD lt_0,0(r_ptr) ; rp[0] = lt
-
-bn_mul_add_words_exit
- .EXIT
- LDD -80(%sp),%r9 ; restore r9
- LDD -88(%sp),%r8 ; restore r8
- LDD -96(%sp),%r7 ; restore r7
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3 ; restore r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = num
-; arg3 = w
-
-bn_mul_words
- .proc
- .callinfo frame=128
- .entry
- .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- STD %r7,32(%sp) ; save r7
- COPY %r0,%ret0 ; return 0 by default
- DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
- STD w,56(%sp) ; w on stack
-
- CMPIB,>= 0,num,bn_mul_words_exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; See if only 1 word to do, thus just do cleanup
- ;
- CMPIB,= 1,num,bn_mul_words_single_top
- FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l)
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
- ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
- ; two 32-bit mutiplies can be issued per cycle.
- ;
-bn_mul_words_unroll2
-
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R)
- XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l
- XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l
-
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- FSTD fm1_1,-48(%sp) ; -48(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h
-
- FSTD fm,-8(%sp) ; -8(sp) = m
- FSTD fm_1,-40(%sp) ; -40(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h
- XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h
-
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l
-
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
- FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt
- LDD -8(%sp),m_0
- LDD -40(%sp),m_1
-
- LDD -16(%sp),m1_0
- LDD -48(%sp),m1_1
- LDD -24(%sp),ht_0
- LDD -56(%sp),ht_1
-
- ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1;
- ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1;
- LDD -32(%sp),lt_0
- LDD -64(%sp),lt_1
-
- CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
- CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1)
- ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
- EXTRD,U tmp_1,31,32,m_1 ; m>>32
- DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32)
- ADD lt_0,m1_0,lt_0 ; lt = lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD lt_1,m1_1,lt_1 ; lt = lt+m1;
- ADD,DC ht_1,%r0,ht_1 ; ht++
- ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0);
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0)
- ADD,DC ht_1,%r0,ht_1 ; ht++
- STD lt_0,0(r_ptr) ; rp[0] = lt
- STD lt_1,8(r_ptr) ; rp[1] = lt
-
- COPY ht_1,%ret0 ; carry = ht
- LDO -2(num),num ; num = num - 2;
- LDO 16(a_ptr),a_ptr ; ap += 2
- CMPIB,<= 2,num,bn_mul_words_unroll2
- LDO 16(r_ptr),r_ptr ; rp++
-
- CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_mul_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
-
- XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l
- FSTD fm1,-16(%sp) ; -16(sp) = m1
- XMPYU flt_0,fw_h,fm ; m = lt*fw_h
- FSTD fm,-8(%sp) ; -8(sp) = m
- XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h
- FSTD ht_temp,-24(%sp) ; -24(sp) = ht
- XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l
- FSTD lt_temp,-32(%sp) ; -32(sp) = lt
-
- LDD -8(%sp),m_0
- LDD -16(%sp),m1_0
- ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1;
- LDD -24(%sp),ht_0
- LDD -32(%sp),lt_0
-
- CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1)
- ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32)
-
- EXTRD,U tmp_0,31,32,m_0 ; m>>32
- DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32
-
- ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32)
- ADD lt_0,m1_0,lt_0 ; lt= lt+m1;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- ADD %ret0,lt_0,lt_0 ; lt = lt + c;
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- COPY ht_0,%ret0 ; copy carry
- STD lt_0,0(r_ptr) ; rp[0] = lt
-
-bn_mul_words_exit
- .EXIT
- LDD -96(%sp),%r7 ; restore r7
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3 ; restore r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = num
-;
-
-bn_sqr_words
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- NOP
- STD %r5,16(%sp) ; save r5
-
- CMPIB,>= 0,num,bn_sqr_words_exit
- LDO 128(%sp),%sp ; bump stack
-
- ;
- ; If only 1, the goto straight to cleanup
- ;
- CMPIB,= 1,num,bn_sqr_words_single_top
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-
-bn_sqr_words_unroll2
- FLDD 0(a_ptr),t_float_0 ; a[0]
- FLDD 8(a_ptr),t_float_1 ; a[1]
- XMPYU fht_0,flt_0,fm ; m[0]
- XMPYU fht_1,flt_1,fm_1 ; m[1]
-
- FSTD fm,-24(%sp) ; store m[0]
- FSTD fm_1,-56(%sp) ; store m[1]
- XMPYU flt_0,flt_0,lt_temp ; lt[0]
- XMPYU flt_1,flt_1,lt_temp_1 ; lt[1]
-
- FSTD lt_temp,-16(%sp) ; store lt[0]
- FSTD lt_temp_1,-48(%sp) ; store lt[1]
- XMPYU fht_0,fht_0,ht_temp ; ht[0]
- XMPYU fht_1,fht_1,ht_temp_1 ; ht[1]
-
- FSTD ht_temp,-8(%sp) ; store ht[0]
- FSTD ht_temp_1,-40(%sp) ; store ht[1]
- LDD -24(%sp),m_0
- LDD -56(%sp),m_1
-
- AND m_0,high_mask,tmp_0 ; m[0] & Mask
- AND m_1,high_mask,tmp_1 ; m[1] & Mask
- DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1
- DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1
-
- LDD -16(%sp),lt_0
- LDD -48(%sp),lt_1
- EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1
- EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1
-
- LDD -8(%sp),ht_0
- LDD -40(%sp),ht_1
- ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0
- ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1
-
- ADD lt_0,m_0,lt_0 ; lt = lt+m
- ADD,DC ht_0,%r0,ht_0 ; ht[0]++
- STD lt_0,0(r_ptr) ; rp[0] = lt[0]
- STD ht_0,8(r_ptr) ; rp[1] = ht[1]
-
- ADD lt_1,m_1,lt_1 ; lt = lt+m
- ADD,DC ht_1,%r0,ht_1 ; ht[1]++
- STD lt_1,16(r_ptr) ; rp[2] = lt[1]
- STD ht_1,24(r_ptr) ; rp[3] = ht[1]
-
- LDO -2(num),num ; num = num - 2;
- LDO 16(a_ptr),a_ptr ; ap += 2
- CMPIB,<= 2,num,bn_sqr_words_unroll2
- LDO 32(r_ptr),r_ptr ; rp += 4
-
- CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
-
- ;
- ; Top of loop aligned on 64-byte boundary
- ;
-bn_sqr_words_single_top
- FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R)
-
- XMPYU fht_0,flt_0,fm ; m
- FSTD fm,-24(%sp) ; store m
-
- XMPYU flt_0,flt_0,lt_temp ; lt
- FSTD lt_temp,-16(%sp) ; store lt
-
- XMPYU fht_0,fht_0,ht_temp ; ht
- FSTD ht_temp,-8(%sp) ; store ht
-
- LDD -24(%sp),m_0 ; load m
- AND m_0,high_mask,tmp_0 ; m & Mask
- DEPD,Z m_0,30,31,m_0 ; m << 32+1
- LDD -16(%sp),lt_0 ; lt
-
- LDD -8(%sp),ht_0 ; ht
- EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1
- ADD m_0,lt_0,lt_0 ; lt = lt+m
- ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0
- ADD,DC ht_0,%r0,ht_0 ; ht++
-
- STD lt_0,0(r_ptr) ; rp[0] = lt
- STD ht_0,8(r_ptr) ; rp[1] = ht
-
-bn_sqr_words_exit
- .EXIT
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = bp
-; arg3 = n
-
-t .reg %r22
-b .reg %r21
-l .reg %r20
-
-bn_add_words
- .proc
- .entry
- .callinfo
- .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .align 64
-
- CMPIB,>= 0,n,bn_add_words_exit
- COPY %r0,%ret0 ; return 0 by default
-
- ;
- ; If 2 or more numbers do the loop
- ;
- CMPIB,= 1,n,bn_add_words_single_top
- NOP
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-bn_add_words_unroll2
- LDD 0(a_ptr),t
- LDD 0(b_ptr),b
- ADD t,%ret0,t ; t = t+c;
- ADD,DC %r0,%r0,%ret0 ; set c to carry
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret0,%r0,%ret0 ; c+= carry
- STD l,0(r_ptr)
-
- LDD 8(a_ptr),t
- LDD 8(b_ptr),b
- ADD t,%ret0,t ; t = t+c;
- ADD,DC %r0,%r0,%ret0 ; set c to carry
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret0,%r0,%ret0 ; c+= carry
- STD l,8(r_ptr)
-
- LDO -2(n),n
- LDO 16(a_ptr),a_ptr
- LDO 16(b_ptr),b_ptr
-
- CMPIB,<= 2,n,bn_add_words_unroll2
- LDO 16(r_ptr),r_ptr
-
- CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
-
-bn_add_words_single_top
- LDD 0(a_ptr),t
- LDD 0(b_ptr),b
-
- ADD t,%ret0,t ; t = t+c;
- ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??)
- ADD t,b,l ; l = t + b[0]
- ADD,DC %ret0,%r0,%ret0 ; c+= carry
- STD l,0(r_ptr)
-
-bn_add_words_exit
- .EXIT
- BVE (%rp)
- NOP
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
-;
-; arg0 = rp
-; arg1 = ap
-; arg2 = bp
-; arg3 = n
-
-t1 .reg %r22
-t2 .reg %r21
-sub_tmp1 .reg %r20
-sub_tmp2 .reg %r19
-
-
-bn_sub_words
- .proc
- .callinfo
- .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- CMPIB,>= 0,n,bn_sub_words_exit
- COPY %r0,%ret0 ; return 0 by default
-
- ;
- ; If 2 or more numbers do the loop
- ;
- CMPIB,= 1,n,bn_sub_words_single_top
- NOP
-
- ;
- ; This loop is unrolled 2 times (64-byte aligned as well)
- ;
-bn_sub_words_unroll2
- LDD 0(a_ptr),t1
- LDD 0(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
-
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret0
- STD sub_tmp1,0(r_ptr)
-
- LDD 8(a_ptr),t1
- LDD 8(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret0
- STD sub_tmp1,8(r_ptr)
-
- LDO -2(n),n
- LDO 16(a_ptr),a_ptr
- LDO 16(b_ptr),b_ptr
-
- CMPIB,<= 2,n,bn_sub_words_unroll2
- LDO 16(r_ptr),r_ptr
-
- CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
-
-bn_sub_words_single_top
- LDD 0(a_ptr),t1
- LDD 0(b_ptr),t2
- SUB t1,t2,sub_tmp1 ; t3 = t1-t2;
- SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c;
- CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2
- LDO 1(%r0),sub_tmp2
-
- CMPCLR,*= t1,t2,%r0
- COPY sub_tmp2,%ret0
-
- STD sub_tmp1,0(r_ptr)
-
-bn_sub_words_exit
- .EXIT
- BVE (%rp)
- NOP
- .PROCEND ;in=23,24,25,26,29;out=28;
-
-;------------------------------------------------------------------------------
-;
-; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
-;
-; arg0 = h
-; arg1 = l
-; arg2 = d
-;
-; This is mainly just modified assembly from the compiler, thus the
-; lack of variable names.
-;
-;------------------------------------------------------------------------------
-bn_div_words
- .proc
- .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
- .IMPORT __iob,DATA
- .IMPORT fprintf,CODE,NO_RELOCATION
- .IMPORT abort,CODE,NO_RELOCATION
- .IMPORT $$div2U,MILLICODE
- .entry
- STD %r2,-16(%r30)
- STD,MA %r3,352(%r30)
- STD %r4,-344(%r30)
- STD %r5,-336(%r30)
- STD %r6,-328(%r30)
- STD %r7,-320(%r30)
- STD %r8,-312(%r30)
- STD %r9,-304(%r30)
- STD %r10,-296(%r30)
-
- STD %r27,-288(%r30) ; save gp
-
- COPY %r24,%r3 ; save d
- COPY %r26,%r4 ; save h (high 64-bits)
- LDO -1(%r0),%ret0 ; return -1 by default
-
- CMPB,*= %r0,%arg2,$D3 ; if (d == 0)
- COPY %r25,%r5 ; save l (low 64-bits)
-
- LDO -48(%r30),%r29 ; create ap
- .CALL ;in=26,29;out=28;
- B,L BN_num_bits_word,%r2
- COPY %r3,%r26
- LDD -288(%r30),%r27 ; restore gp
- LDI 64,%r21
-
- CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward)
- COPY %ret0,%r24 ; i
- MTSARCM %r24
- DEPDI,Z -1,%sar,1,%r29
- CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward)
-
-$00000012
- SUBI 64,%r24,%r31 ; i = 64 - i;
- CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d)
- SUB %r4,%r3,%r4 ; h -= d
- CMPB,= %r31,%r0,$0000001A ; if (i)
- COPY %r0,%r10 ; ret = 0
- MTSARCM %r31 ; i to shift
- DEPD,Z %r3,%sar,64,%r3 ; d <<= i;
- SUBI 64,%r31,%r19 ; 64 - i; redundent
- MTSAR %r19 ; (64 -i) to shift
- SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i)
- MTSARCM %r31 ; i to shift
- DEPD,Z %r5,%sar,64,%r5 ; l <<= i;
-
-$0000001A
- DEPDI,Z -1,31,32,%r19
- EXTRD,U %r3,31,32,%r6 ; dh=(d&0xfff)>>32
- EXTRD,U %r3,63,32,%r8 ; dl = d&0xffffff
- LDO 2(%r0),%r9
- STD %r3,-280(%r30) ; "d" to stack
-
-$0000001C
- DEPDI,Z -1,63,32,%r29 ;
- EXTRD,U %r4,31,32,%r31 ; h >> 32
- CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div
- COPY %r4,%r26
- EXTRD,U %r4,31,32,%r25
- COPY %r6,%r24
- .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
- B,L $$div2U,%r2
- EXTRD,U %r6,31,32,%r23
- DEPD %r28,31,32,%r29
-$D2
- STD %r29,-272(%r30) ; q
- AND %r5,%r19,%r24 ; t & 0xffffffff00000000;
- EXTRD,U %r24,31,32,%r24 ; ???
- FLDD -272(%r30),%fr7 ; q
- FLDD -280(%r30),%fr8 ; d
- XMPYU %fr8L,%fr7L,%fr10
- FSTD %fr10,-256(%r30)
- XMPYU %fr8L,%fr7R,%fr22
- FSTD %fr22,-264(%r30)
- XMPYU %fr8R,%fr7L,%fr11
- XMPYU %fr8R,%fr7R,%fr23
- FSTD %fr11,-232(%r30)
- FSTD %fr23,-240(%r30)
- LDD -256(%r30),%r28
- DEPD,Z %r28,31,32,%r2
- LDD -264(%r30),%r20
- ADD,L %r20,%r2,%r31
- LDD -232(%r30),%r22
- DEPD,Z %r22,31,32,%r22
- LDD -240(%r30),%r21
- B $00000024 ; enter loop
- ADD,L %r21,%r22,%r23
-
-$0000002A
- LDO -1(%r29),%r29
- SUB %r23,%r8,%r23
-$00000024
- SUB %r4,%r31,%r25
- AND %r25,%r19,%r26
- CMPB,*<>,N %r0,%r26,$00000046 ; (forward)
- DEPD,Z %r25,31,32,%r20
- OR %r20,%r24,%r21
- CMPB,*<<,N %r21,%r23,$0000002A ;(backward)
- SUB %r31,%r6,%r31
-;-------------Break path---------------------
-
-$00000046
- DEPD,Z %r23,31,32,%r25 ;tl
- EXTRD,U %r23,31,32,%r26 ;t
- AND %r25,%r19,%r24 ;tl = (tl<<32)&0xfffffff0000000L
- ADD,L %r31,%r26,%r31 ;th += t;
- CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl)
- LDO 1(%r31),%r31 ; th++;
- CMPB,*<<=,N %r31,%r4,$00000036 ;if (n < th) (forward)
- LDO -1(%r29),%r29 ;q--;
- ADD,L %r4,%r3,%r4 ;h += d;
-$00000036
- ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward)
- SUB %r5,%r24,%r28 ; l -= tl;
- SUB %r4,%r31,%r24 ; h -= th;
- SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32));
- DEPD,Z %r29,31,32,%r10 ; ret = q<<32
- b $0000001C
- DEPD,Z %r28,31,32,%r5 ; l = l << 32
-
-$D1
- OR %r10,%r29,%r28 ; ret |= q
-$D3
- LDD -368(%r30),%r2
-$D0
- LDD -296(%r30),%r10
- LDD -304(%r30),%r9
- LDD -312(%r30),%r8
- LDD -320(%r30),%r7
- LDD -328(%r30),%r6
- LDD -336(%r30),%r5
- LDD -344(%r30),%r4
- BVE (%r2)
- .EXIT
- LDD,MB -352(%r30),%r3
-
-bn_div_err_case
- MFIA %r6
- ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1
- LDO R'bn_div_words-bn_div_err_case(%r1),%r6
- ADDIL LT'__iob,%r27,%r1
- LDD RT'__iob(%r1),%r26
- ADDIL L'C$4-bn_div_words,%r6,%r1
- LDO R'C$4-bn_div_words(%r1),%r25
- LDO 64(%r26),%r26
- .CALL ;in=24,25,26,29;out=28;
- B,L fprintf,%r2
- LDO -48(%r30),%r29
- LDD -288(%r30),%r27
- .CALL ;in=29;
- B,L abort,%r2
- LDO -48(%r30),%r29
- LDD -288(%r30),%r27
- B $D0
- LDD -368(%r30),%r2
- .PROCEND ;in=24,25,26,29;out=28;
-
-;----------------------------------------------------------------------------
-;
-; Registers to hold 64-bit values to manipulate. The "L" part
-; of the register corresponds to the upper 32-bits, while the "R"
-; part corresponds to the lower 32-bits
-;
-; Note, that when using b6 and b7, the code must save these before
-; using them because they are callee save registers
-;
-;
-; Floating point registers to use to save values that
-; are manipulated. These don't collide with ftemp1-6 and
-; are all caller save registers
-;
-a0 .reg %fr22
-a0L .reg %fr22L
-a0R .reg %fr22R
-
-a1 .reg %fr23
-a1L .reg %fr23L
-a1R .reg %fr23R
-
-a2 .reg %fr24
-a2L .reg %fr24L
-a2R .reg %fr24R
-
-a3 .reg %fr25
-a3L .reg %fr25L
-a3R .reg %fr25R
-
-a4 .reg %fr26
-a4L .reg %fr26L
-a4R .reg %fr26R
-
-a5 .reg %fr27
-a5L .reg %fr27L
-a5R .reg %fr27R
-
-a6 .reg %fr28
-a6L .reg %fr28L
-a6R .reg %fr28R
-
-a7 .reg %fr29
-a7L .reg %fr29L
-a7R .reg %fr29R
-
-b0 .reg %fr30
-b0L .reg %fr30L
-b0R .reg %fr30R
-
-b1 .reg %fr31
-b1L .reg %fr31L
-b1R .reg %fr31R
-
-;
-; Temporary floating point variables, these are all caller save
-; registers
-;
-ftemp1 .reg %fr4
-ftemp2 .reg %fr5
-ftemp3 .reg %fr6
-ftemp4 .reg %fr7
-
-;
-; The B set of registers when used.
-;
-
-b2 .reg %fr8
-b2L .reg %fr8L
-b2R .reg %fr8R
-
-b3 .reg %fr9
-b3L .reg %fr9L
-b3R .reg %fr9R
-
-b4 .reg %fr10
-b4L .reg %fr10L
-b4R .reg %fr10R
-
-b5 .reg %fr11
-b5L .reg %fr11L
-b5R .reg %fr11R
-
-b6 .reg %fr12
-b6L .reg %fr12L
-b6R .reg %fr12R
-
-b7 .reg %fr13
-b7L .reg %fr13L
-b7R .reg %fr13R
-
-c1 .reg %r21 ; only reg
-temp1 .reg %r20 ; only reg
-temp2 .reg %r19 ; only reg
-temp3 .reg %r31 ; only reg
-
-m1 .reg %r28
-c2 .reg %r23
-high_one .reg %r1
-ht .reg %r6
-lt .reg %r5
-m .reg %r4
-c3 .reg %r3
-
-SQR_ADD_C .macro A0L,A0R,C1,C2,C3
- XMPYU A0L,A0R,ftemp1 ; m
- FSTD ftemp1,-24(%sp) ; store m
-
- XMPYU A0R,A0R,ftemp2 ; lt
- FSTD ftemp2,-16(%sp) ; store lt
-
- XMPYU A0L,A0L,ftemp3 ; ht
- FSTD ftemp3,-8(%sp) ; store ht
-
- LDD -24(%sp),m ; load m
- AND m,high_mask,temp2 ; m & Mask
- DEPD,Z m,30,31,temp3 ; m << 32+1
- LDD -16(%sp),lt ; lt
-
- LDD -8(%sp),ht ; ht
- EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1
- ADD temp3,lt,lt ; lt = lt+m
- ADD,L ht,temp1,ht ; ht += temp1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C2,ht,C2 ; c2=c2+ht
- ADD,DC C3,%r0,C3 ; c3++
-.endm
-
-SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
- XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
- FSTD ftemp1,-16(%sp) ;
- XMPYU A0R,A1L,ftemp2 ; m = bh*lt
- FSTD ftemp2,-8(%sp) ;
- XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
- FSTD ftemp3,-32(%sp)
- XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
- FSTD ftemp4,-24(%sp) ;
-
- LDD -8(%sp),m ; r21 = m
- LDD -16(%sp),m1 ; r19 = m1
- ADD,L m,m1,m ; m+m1
-
- DEPD,Z m,31,32,temp3 ; (m+m1<<32)
- LDD -24(%sp),ht ; r24 = ht
-
- CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
- ADD,L ht,high_one,ht ; ht+=high_one
-
- EXTRD,U m,31,32,temp1 ; m >> 32
- LDD -32(%sp),lt ; lt
- ADD,L ht,temp1,ht ; ht+= m>>32
- ADD lt,temp3,lt ; lt = lt+m1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD ht,ht,ht ; ht=ht+ht;
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-
- ADD lt,lt,lt ; lt=lt+lt;
- ADD,DC ht,%r0,ht ; add in carry (ht++)
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++)
- LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
-
- ADD C2,ht,C2 ; c2 = c2 + ht
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-.endm
-
-;
-;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
-; arg0 = r_ptr
-; arg1 = a_ptr
-;
-
-bn_sqr_comba8
- .PROC
- .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .ENTRY
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- SQR_ADD_C a0L,a0R,c1,c2,c3
- STD c1,0(r_ptr) ; r[0] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
- STD c2,8(r_ptr) ; r[1] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
- STD c3,16(r_ptr) ; r[2] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
- SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
- STD c1,24(r_ptr) ; r[3] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
- SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
- STD c2,32(r_ptr) ; r[4] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
- SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
- STD c3,40(r_ptr) ; r[5] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a3L,a3R,c1,c2,c3
- SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
- SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
- SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
- STD c1,48(r_ptr) ; r[6] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
- SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
- SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
- STD c2,56(r_ptr) ; r[7] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a4L,a4R,c3,c1,c2
- SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
- SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
- SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
- STD c3,64(r_ptr) ; r[8] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
- SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
- SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
- STD c1,72(r_ptr) ; r[9] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a5L,a5R,c2,c3,c1
- SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
- SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
- STD c2,80(r_ptr) ; r[10] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
- SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
- STD c3,88(r_ptr) ; r[11] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a6L,a6R,c1,c2,c3
- SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
- STD c1,96(r_ptr) ; r[12] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
- STD c2,104(r_ptr) ; r[13] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a7L,a7R,c3,c1,c2
- STD c3, 112(r_ptr) ; r[14] = c3
- STD c1, 120(r_ptr) ; r[15] = c1
-
- .EXIT
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-;-----------------------------------------------------------------------------
-;
-;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
-; arg0 = r_ptr
-; arg1 = a_ptr
-;
-
-bn_sqr_comba4
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- SQR_ADD_C a0L,a0R,c1,c2,c3
-
- STD c1,0(r_ptr) ; r[0] = c1;
- COPY %r0,c1
-
- SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
-
- STD c2,8(r_ptr) ; r[1] = c2;
- COPY %r0,c2
-
- SQR_ADD_C a1L,a1R,c3,c1,c2
- SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
-
- STD c3,16(r_ptr) ; r[2] = c3;
- COPY %r0,c3
-
- SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
- SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
-
- STD c1,24(r_ptr) ; r[3] = c1;
- COPY %r0,c1
-
- SQR_ADD_C a2L,a2R,c2,c3,c1
- SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
-
- STD c2,32(r_ptr) ; r[4] = c2;
- COPY %r0,c2
-
- SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
- STD c3,40(r_ptr) ; r[5] = c3;
- COPY %r0,c3
-
- SQR_ADD_C a3L,a3R,c1,c2,c3
- STD c1,48(r_ptr) ; r[6] = c1;
- STD c2,56(r_ptr) ; r[7] = c2;
-
- .EXIT
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-
-;---------------------------------------------------------------------------
-
-MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
- XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
- FSTD ftemp1,-16(%sp) ;
- XMPYU A0R,B0L,ftemp2 ; m = bh*lt
- FSTD ftemp2,-8(%sp) ;
- XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
- FSTD ftemp3,-32(%sp)
- XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
- FSTD ftemp4,-24(%sp) ;
-
- LDD -8(%sp),m ; r21 = m
- LDD -16(%sp),m1 ; r19 = m1
- ADD,L m,m1,m ; m+m1
-
- DEPD,Z m,31,32,temp3 ; (m+m1<<32)
- LDD -24(%sp),ht ; r24 = ht
-
- CMPCLR,*>>= m,m1,%r0 ; if (m < m1)
- ADD,L ht,high_one,ht ; ht+=high_one
-
- EXTRD,U m,31,32,temp1 ; m >> 32
- LDD -32(%sp),lt ; lt
- ADD,L ht,temp1,ht ; ht+= m>>32
- ADD lt,temp3,lt ; lt = lt+m1
- ADD,DC ht,%r0,ht ; ht++
-
- ADD C1,lt,C1 ; c1=c1+lt
- ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise
-
- ADD C2,ht,C2 ; c2 = c2 + ht
- ADD,DC C3,%r0,C3 ; add in carry (c3++)
-.endm
-
-
-;
-;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = b_ptr
-;
-
-bn_mul_comba8
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
- FSTD %fr12,32(%sp) ; save r6
- FSTD %fr13,40(%sp) ; save r7
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
- FLDD 32(a_ptr),a4
- FLDD 40(a_ptr),a5
- FLDD 48(a_ptr),a6
- FLDD 56(a_ptr),a7
-
- FLDD 0(b_ptr),b0
- FLDD 8(b_ptr),b1
- FLDD 16(b_ptr),b2
- FLDD 24(b_ptr),b3
- FLDD 32(b_ptr),b4
- FLDD 40(b_ptr),b5
- FLDD 48(b_ptr),b6
- FLDD 56(b_ptr),b7
-
- MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
- STD c1,0(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
- STD c2,8(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
- STD c3,16(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
- STD c1,24(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
- STD c2,32(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
- MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
- MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
- STD c3,40(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
- STD c1,48(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
- MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
- STD c2,56(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
- STD c3,64(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
- MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
- MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
- STD c1,72(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
- MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
- MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
- MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
- MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
- STD c2,80(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
- MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
- MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
- MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
- STD c3,88(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
- MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
- MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
- STD c1,96(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
- MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
- STD c2,104(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
- STD c3,112(r_ptr)
- STD c1,120(r_ptr)
-
- .EXIT
- FLDD -88(%sp),%fr13
- FLDD -96(%sp),%fr12
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-;-----------------------------------------------------------------------------
-;
-;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-; arg0 = r_ptr
-; arg1 = a_ptr
-; arg2 = b_ptr
-;
-
-bn_mul_comba4
- .proc
- .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
- .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
- .entry
- .align 64
-
- STD %r3,0(%sp) ; save r3
- STD %r4,8(%sp) ; save r4
- STD %r5,16(%sp) ; save r5
- STD %r6,24(%sp) ; save r6
- FSTD %fr12,32(%sp) ; save r6
- FSTD %fr13,40(%sp) ; save r7
-
- ;
- ; Zero out carries
- ;
- COPY %r0,c1
- COPY %r0,c2
- COPY %r0,c3
-
- LDO 128(%sp),%sp ; bump stack
- DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
-
- ;
- ; Load up all of the values we are going to use
- ;
- FLDD 0(a_ptr),a0
- FLDD 8(a_ptr),a1
- FLDD 16(a_ptr),a2
- FLDD 24(a_ptr),a3
-
- FLDD 0(b_ptr),b0
- FLDD 8(b_ptr),b1
- FLDD 16(b_ptr),b2
- FLDD 24(b_ptr),b3
-
- MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
- STD c1,0(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
- STD c2,8(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
- MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
- MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
- STD c3,16(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
- MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
- MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
- MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
- STD c1,24(r_ptr)
- COPY %r0,c1
-
- MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
- MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
- MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
- STD c2,32(r_ptr)
- COPY %r0,c2
-
- MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
- MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
- STD c3,40(r_ptr)
- COPY %r0,c3
-
- MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
- STD c1,48(r_ptr)
- STD c2,56(r_ptr)
-
- .EXIT
- FLDD -88(%sp),%fr13
- FLDD -96(%sp),%fr12
- LDD -104(%sp),%r6 ; restore r6
- LDD -112(%sp),%r5 ; restore r5
- LDD -120(%sp),%r4 ; restore r4
- BVE (%rp)
- LDD,MB -128(%sp),%r3
-
- .PROCEND
-
-
- .SPACE $TEXT$
- .SUBSPA $CODE$
- .SPACE $PRIVATE$,SORT=16
- .IMPORT $global$,DATA
- .SPACE $TEXT$
- .SUBSPA $CODE$
- .SUBSPA $LIT$,ACCESS=0x2c
-C$4
- .ALIGN 8
- .STRINGZ "Division would overflow (%d)\n"
- .END