summaryrefslogtreecommitdiff
path: root/openssl/crypto/bn/asm/mips3-mont.pl
blob: 8f9156e02af382d6f619ed9fdc59bc1522d3f88e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys. While 512-bit
# RSA private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, while longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over compiler
# generated code.
#
# The module targets N32 and N64 MIPS ABIs and currently is a bit
# IRIX-centric, i.e. is likely to require adaptation for other OSes.

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

$lo0="a6";
$hi0="a7";
$lo1="v0";
$hi1="v1";
$aj="t0";
$bi="t1";
$nj="t2";
$tp="t3";
$alo="s0";
$ahi="s1";
$nlo="s2";
$nhi="s3";
$tj="s4";
$i="s5";
$j="s6";
$fp="t8";
$m1="t9";

$FRAME=8*(2+8);

$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set	noat
.set	reorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
	.set	noreorder
	PTR_SUB	sp,64
	move	$fp,sp
	.frame	$fp,64,ra
	slt	AT,$num,4
	li	v0,0
	beqzl	AT,.Lproceed
	nop
	jr	ra
	PTR_ADD	sp,$fp,64
	.set	reorder
.align	5
.Lproceed:
	ld	$n0,0($n0)
	ld	$bi,0($bp)	# bp[0]
	ld	$aj,0($ap)	# ap[0]
	ld	$nj,0($np)	# np[0]
	PTR_SUB	sp,16		# place for two extra words
	sll	$num,3
	li	AT,-4096
	PTR_SUB	sp,$num
	and	sp,AT

	sd	s0,0($fp)
	sd	s1,8($fp)
	sd	s2,16($fp)
	sd	s3,24($fp)
	sd	s4,32($fp)
	sd	s5,40($fp)
	sd	s6,48($fp)
	sd	s7,56($fp)

	dmultu	$aj,$bi
	ld	$alo,8($ap)
	ld	$nlo,8($np)
	mflo	$lo0
	mfhi	$hi0
	dmultu	$lo0,$n0
	mflo	$m1

	dmultu	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	dmultu	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	dmultu	$nlo,$m1
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT
	mflo	$nlo
	mfhi	$nhi

	move	$tp,sp
	li	$j,16
.align	4
.L1st:
	.set	noreorder
	PTR_ADD	$aj,$ap,$j
	ld	$aj,($aj)
	PTR_ADD	$nj,$np,$j
	ld	$nj,($nj)

	dmultu	$aj,$bi
	daddu	$lo0,$alo,$hi0
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo0,$hi0
	sltu	s7,$lo1,$hi1
	daddu	$hi0,$ahi,AT
	daddu	$hi1,$nhi,s7
	mflo	$alo
	mfhi	$ahi

	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	dmultu	$nj,$m1
	daddu	$hi1,AT
	addu	$j,8
	sd	$lo1,($tp)
	sltu	s7,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	s7,.L1st
	PTR_ADD	$tp,8
	.set	reorder

	daddu	$lo0,$alo,$hi0
	sltu	AT,$lo0,$hi0
	daddu	$hi0,$ahi,AT

	daddu	$lo1,$nlo,$hi1
	sltu	s7,$lo1,$hi1
	daddu	$hi1,$nhi,s7
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT

	sd	$lo1,($tp)

	daddu	$hi1,$hi0
	sltu	AT,$hi1,$hi0
	sd	$hi1,8($tp)
	sd	AT,16($tp)

	li	$i,8
.align	4
.Louter:
	PTR_ADD	$bi,$bp,$i
	ld	$bi,($bi)
	ld	$aj,($ap)
	ld	$alo,8($ap)
	ld	$tj,(sp)

	dmultu	$aj,$bi
	ld	$nj,($np)
	ld	$nlo,8($np)
	mflo	$lo0
	mfhi	$hi0
	daddu	$lo0,$tj
	dmultu	$lo0,$n0
	sltu	AT,$lo0,$tj
	daddu	$hi0,AT
	mflo	$m1

	dmultu	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	dmultu	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	dmultu	$nlo,$m1
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT
	mflo	$nlo
	mfhi	$nhi

	move	$tp,sp
	li	$j,16
	ld	$tj,8($tp)
.align	4
.Linner:
	.set	noreorder
	PTR_ADD	$aj,$ap,$j
	ld	$aj,($aj)
	PTR_ADD	$nj,$np,$j
	ld	$nj,($nj)

	dmultu	$aj,$bi
	daddu	$lo0,$alo,$hi0
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo0,$hi0
	sltu	s7,$lo1,$hi1
	daddu	$hi0,$ahi,AT
	daddu	$hi1,$nhi,s7
	mflo	$alo
	mfhi	$ahi

	daddu	$lo0,$tj
	addu	$j,8
	dmultu	$nj,$m1
	sltu	AT,$lo0,$tj
	daddu	$lo1,$lo0
	daddu	$hi0,AT
	sltu	s7,$lo1,$lo0
	ld	$tj,16($tp)
	daddu	$hi1,s7
	sltu	AT,$j,$num
	mflo	$nlo
	mfhi	$nhi
	sd	$lo1,($tp)
	bnez	AT,.Linner
	PTR_ADD	$tp,8
	.set	reorder

	daddu	$lo0,$alo,$hi0
	sltu	AT,$lo0,$hi0
	daddu	$hi0,$ahi,AT
	daddu	$lo0,$tj
	sltu	s7,$lo0,$tj
	daddu	$hi0,s7

	ld	$tj,16($tp)
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo1,$hi1
	daddu	$hi1,$nhi,AT
	daddu	$lo1,$lo0
	sltu	s7,$lo1,$lo0
	daddu	$hi1,s7
	sd	$lo1,($tp)

	daddu	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	daddu	$lo1,$tj
	sltu	AT,$lo1,$tj
	daddu	$hi1,AT
	sd	$lo1,8($tp)
	sd	$hi1,16($tp)

	addu	$i,8
	sltu	s7,$i,$num
	bnez	s7,.Louter

	.set	noreorder
	PTR_ADD	$tj,sp,$num	# &tp[num]
	move	$tp,sp
	move	$ap,sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	ld	$lo0,($tp)
	ld	$lo1,($np)
	PTR_ADD	$tp,8
	PTR_ADD	$np,8
	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	AT,$lo1,$lo0
	dsubu	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	sd	$lo0,($rp)
	or	$hi0,AT
	sltu	AT,$tp,$tj
	bnez	AT,.Lsub
	PTR_ADD	$rp,8

	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,sp
	PTR_SUB	$rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	ld	$aj,($ap)
	PTR_ADD	$ap,8
	PTR_ADD	$tp,8
	sd	zero,-8($tp)
	sltu	AT,$tp,$tj
	sd	$aj,($rp)
	bnez	AT,.Lcopy
	PTR_ADD	$rp,8

	ld	s0,0($fp)
	ld	s1,8($fp)
	ld	s2,16($fp)
	ld	s3,24($fp)
	ld	s4,32($fp)
	ld	s5,40($fp)
	ld	s6,48($fp)
	ld	s7,56($fp)
	li	v0,1
	jr	ra
	PTR_ADD	sp,$fp,64
	.set	reorder
END(bn_mul_mont)
.rdata
.asciiz	"Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT;