diff options
Diffstat (limited to 'main/openssl/crypto/bn/asm/x86-mont.S')
-rw-r--r-- | main/openssl/crypto/bn/asm/x86-mont.S | 174 |
1 files changed, 148 insertions, 26 deletions
diff --git a/main/openssl/crypto/bn/asm/x86-mont.S b/main/openssl/crypto/bn/asm/x86-mont.S index 2bbb0e3a..c701e9e3 100644 --- a/main/openssl/crypto/bn/asm/x86-mont.S +++ b/main/openssl/crypto/bn/asm/x86-mont.S @@ -42,6 +42,127 @@ bn_mul_mont: movl %esi,20(%esp) leal -3(%edi),%ebx movl %ebp,24(%esp) + call .L001PIC_me_up +.L001PIC_me_up: + popl %eax + leal _GLOBAL_OFFSET_TABLE_+[.-.L001PIC_me_up](%eax),%eax + movl OPENSSL_ia32cap_P@GOT(%eax),%eax + btl $26,(%eax) + jnc .L002non_sse2 + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 16 +.L0031st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl .L0031st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +.L004outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +.L005inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz .L005inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle .L004outer + emms + jmp .L006common_tail +.align 16 +.L002non_sse2: movl 8(%esp),%esi leal 1(%ebx),%ebp movl 12(%esp),%edi @@ -52,12 +173,12 @@ bn_mul_mont: leal 4(%edi,%ebx,4),%eax orl %edx,%ebp movl (%edi),%edi - jz .L001bn_sqr_mont + jz .L007bn_sqr_mont movl %eax,28(%esp) movl (%esi),%eax xorl %edx,%edx .align 16 -.L002mull: +.L008mull: movl %edx,%ebp mull %edi addl %eax,%ebp @@ -66,7 +187,7 @@ bn_mul_mont: movl (%esi,%ecx,4),%eax cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl .L002mull + jl .L008mull movl %edx,%ebp mull %edi movl 20(%esp),%edi @@ -84,9 +205,9 @@ bn_mul_mont: movl 4(%esi),%eax adcl $0,%edx incl %ecx - jmp .L0032ndmadd + jmp .L0092ndmadd .align 16 -.L0041stmadd: +.L0101stmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -97,7 +218,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl .L0041stmadd + jl .L0101stmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%eax @@ -120,7 +241,7 @@ bn_mul_mont: adcl $0,%edx movl $1,%ecx .align 16 -.L0032ndmadd: +.L0092ndmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -131,7 +252,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl .L0032ndmadd + jl .L0092ndmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -147,16 +268,16 @@ bn_mul_mont: movl %edx,32(%esp,%ebx,4) cmpl 28(%esp),%ecx movl %eax,36(%esp,%ebx,4) - je .L005common_tail + je .L006common_tail movl (%ecx),%edi movl 8(%esp),%esi movl %ecx,12(%esp) xorl %ecx,%ecx xorl %edx,%edx movl (%esi),%eax - jmp .L0041stmadd + jmp .L0101stmadd .align 16 -.L001bn_sqr_mont: +.L007bn_sqr_mont: movl %ebx,(%esp) movl %ecx,12(%esp) movl %edi,%eax @@ -167,7 +288,7 @@ bn_mul_mont: andl $1,%ebx incl %ecx .align 16 -.L006sqr: +.L011sqr: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -179,7 +300,7 @@ bn_mul_mont: cmpl (%esp),%ecx movl %eax,%ebx movl %ebp,28(%esp,%ecx,4) - jl .L006sqr + jl .L011sqr movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -203,7 +324,7 @@ bn_mul_mont: movl 4(%esi),%eax movl $1,%ecx .align 16 -.L0073rdmadd: +.L0123rdmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -222,7 +343,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl .L0073rdmadd + jl .L0123rdmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -238,7 +359,7 @@ bn_mul_mont: movl %edx,32(%esp,%ebx,4) cmpl %ebx,%ecx movl %eax,36(%esp,%ebx,4) - je .L005common_tail + je .L006common_tail movl 4(%esi,%ecx,4),%edi leal 1(%ecx),%ecx movl %edi,%eax @@ -250,12 +371,12 @@ bn_mul_mont: xorl %ebp,%ebp cmpl %ebx,%ecx leal 1(%ecx),%ecx - je .L008sqrlast + je .L013sqrlast movl %edx,%ebx shrl $1,%edx andl $1,%ebx .align 16 -.L009sqradd: +.L014sqradd: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -271,13 +392,13 @@ bn_mul_mont: cmpl (%esp),%ecx movl %ebp,28(%esp,%ecx,4) movl %eax,%ebx - jle .L009sqradd + jle .L014sqradd movl %edx,%ebp addl %edx,%edx shrl $31,%ebp addl %ebx,%edx adcl $0,%ebp -.L008sqrlast: +.L013sqrlast: movl 20(%esp),%edi movl 16(%esp),%esi imull 32(%esp),%edi @@ -292,9 +413,9 @@ bn_mul_mont: adcl $0,%edx movl $1,%ecx movl 4(%esi),%eax - jmp .L0073rdmadd + jmp .L0123rdmadd .align 16 -.L005common_tail: +.L006common_tail: movl 16(%esp),%ebp movl 4(%esp),%edi leal 32(%esp),%esi @@ -302,13 +423,13 @@ bn_mul_mont: movl %ebx,%ecx xorl %edx,%edx .align 16 -.L010sub: +.L015sub: sbbl (%ebp,%edx,4),%eax movl %eax,(%edi,%edx,4) decl %ecx movl 4(%esi,%edx,4),%eax leal 1(%edx),%edx - jge .L010sub + jge .L015sub sbbl $0,%eax andl %eax,%esi notl %eax @@ -316,12 +437,12 @@ bn_mul_mont: andl %eax,%ebp orl %ebp,%esi .align 16 -.L011copy: +.L016copy: movl (%esi,%ebx,4),%eax movl %eax,(%edi,%ebx,4) movl %ecx,32(%esp,%ebx,4) decl %ebx - jge .L011copy + jge .L016copy movl 24(%esp),%esp movl $1,%eax .L000just_leave: @@ -336,3 +457,4 @@ bn_mul_mont: .byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 +.comm OPENSSL_ia32cap_P,8,4 |