diff options
Diffstat (limited to 'app/openssl/crypto/bn/asm/x86-mont.S')
-rw-r--r-- | app/openssl/crypto/bn/asm/x86-mont.S | 460 |
1 files changed, 460 insertions, 0 deletions
diff --git a/app/openssl/crypto/bn/asm/x86-mont.S b/app/openssl/crypto/bn/asm/x86-mont.S new file mode 100644 index 00000000..c701e9e3 --- /dev/null +++ b/app/openssl/crypto/bn/asm/x86-mont.S @@ -0,0 +1,460 @@ +.file "crypto/bn/asm/x86-mont.s" +.text +.globl bn_mul_mont +.type bn_mul_mont,@function +.align 16 +bn_mul_mont: +.L_bn_mul_mont_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + movl 40(%esp),%edi + cmpl $4,%edi + jl .L000just_leave + leal 20(%esp),%esi + leal 24(%esp),%edx + movl %esp,%ebp + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%esp + negl %edi + movl %esp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%esp + xorl %esp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%esp + andl $-64,%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %edx,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %ebp,24(%esp) + call .L001PIC_me_up +.L001PIC_me_up: + popl %eax + leal _GLOBAL_OFFSET_TABLE_+[.-.L001PIC_me_up](%eax),%eax + movl OPENSSL_ia32cap_P@GOT(%eax),%eax + btl $26,(%eax) + jnc .L002non_sse2 + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 16 +.L0031st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl .L0031st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +.L004outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +.L005inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz .L005inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle .L004outer + emms + jmp .L006common_tail +.align 16 +.L002non_sse2: + movl 8(%esp),%esi + leal 1(%ebx),%ebp + movl 12(%esp),%edi + xorl %ecx,%ecx + movl %esi,%edx + andl $1,%ebp + subl %edi,%edx + leal 4(%edi,%ebx,4),%eax + orl %edx,%ebp + movl (%edi),%edi + jz .L007bn_sqr_mont + movl %eax,28(%esp) + movl (%esi),%eax + xorl %edx,%edx +.align 16 +.L008mull: + movl %edx,%ebp + mull %edi + addl %eax,%ebp + leal 1(%ecx),%ecx + adcl $0,%edx + movl (%esi,%ecx,4),%eax + cmpl %ebx,%ecx + movl %ebp,28(%esp,%ecx,4) + jl .L008mull + movl %edx,%ebp + mull %edi + movl 20(%esp),%edi + addl %ebp,%eax + movl 16(%esp),%esi + adcl $0,%edx + imull 32(%esp),%edi + movl %eax,32(%esp,%ebx,4) + xorl %ecx,%ecx + movl %edx,36(%esp,%ebx,4) + movl %ecx,40(%esp,%ebx,4) + movl (%esi),%eax + mull %edi + addl 32(%esp),%eax + movl 4(%esi),%eax + adcl $0,%edx + incl %ecx + jmp .L0092ndmadd +.align 16 +.L0101stmadd: + movl %edx,%ebp + mull %edi + addl 32(%esp,%ecx,4),%ebp + leal 1(%ecx),%ecx + adcl $0,%edx + addl %eax,%ebp + movl (%esi,%ecx,4),%eax + adcl $0,%edx + cmpl %ebx,%ecx + movl %ebp,28(%esp,%ecx,4) + jl .L0101stmadd + movl %edx,%ebp + mull %edi + addl 32(%esp,%ebx,4),%eax + movl 20(%esp),%edi + adcl $0,%edx + movl 16(%esp),%esi + addl %eax,%ebp + adcl $0,%edx + imull 32(%esp),%edi + xorl %ecx,%ecx + addl 36(%esp,%ebx,4),%edx + movl %ebp,32(%esp,%ebx,4) + adcl $0,%ecx + movl (%esi),%eax + movl %edx,36(%esp,%ebx,4) + movl %ecx,40(%esp,%ebx,4) + mull %edi + addl 32(%esp),%eax + movl 4(%esi),%eax + adcl $0,%edx + movl $1,%ecx +.align 16 +.L0092ndmadd: + movl %edx,%ebp + mull %edi + addl 32(%esp,%ecx,4),%ebp + leal 1(%ecx),%ecx + adcl $0,%edx + addl %eax,%ebp + movl (%esi,%ecx,4),%eax + adcl $0,%edx + cmpl %ebx,%ecx + movl %ebp,24(%esp,%ecx,4) + jl .L0092ndmadd + movl %edx,%ebp + mull %edi + addl 32(%esp,%ebx,4),%ebp + adcl $0,%edx + addl %eax,%ebp + adcl $0,%edx + movl %ebp,28(%esp,%ebx,4) + xorl %eax,%eax + movl 12(%esp),%ecx + addl 36(%esp,%ebx,4),%edx + adcl 40(%esp,%ebx,4),%eax + leal 4(%ecx),%ecx + movl %edx,32(%esp,%ebx,4) + cmpl 28(%esp),%ecx + movl %eax,36(%esp,%ebx,4) + je .L006common_tail + movl (%ecx),%edi + movl 8(%esp),%esi + movl %ecx,12(%esp) + xorl %ecx,%ecx + xorl %edx,%edx + movl (%esi),%eax + jmp .L0101stmadd +.align 16 +.L007bn_sqr_mont: + movl %ebx,(%esp) + movl %ecx,12(%esp) + movl %edi,%eax + mull %edi + movl %eax,32(%esp) + movl %edx,%ebx + shrl $1,%edx + andl $1,%ebx + incl %ecx +.align 16 +.L011sqr: + movl (%esi,%ecx,4),%eax + movl %edx,%ebp + mull %edi + addl %ebp,%eax + leal 1(%ecx),%ecx + adcl $0,%edx + leal (%ebx,%eax,2),%ebp + shrl $31,%eax + cmpl (%esp),%ecx + movl %eax,%ebx + movl %ebp,28(%esp,%ecx,4) + jl .L011sqr + movl (%esi,%ecx,4),%eax + movl %edx,%ebp + mull %edi + addl %ebp,%eax + movl 20(%esp),%edi + adcl $0,%edx + movl 16(%esp),%esi + leal (%ebx,%eax,2),%ebp + imull 32(%esp),%edi + shrl $31,%eax + movl %ebp,32(%esp,%ecx,4) + leal (%eax,%edx,2),%ebp + movl (%esi),%eax + shrl $31,%edx + movl %ebp,36(%esp,%ecx,4) + movl %edx,40(%esp,%ecx,4) + mull %edi + addl 32(%esp),%eax + movl %ecx,%ebx + adcl $0,%edx + movl 4(%esi),%eax + movl $1,%ecx +.align 16 +.L0123rdmadd: + movl %edx,%ebp + mull %edi + addl 32(%esp,%ecx,4),%ebp + adcl $0,%edx + addl %eax,%ebp + movl 4(%esi,%ecx,4),%eax + adcl $0,%edx + movl %ebp,28(%esp,%ecx,4) + movl %edx,%ebp + mull %edi + addl 36(%esp,%ecx,4),%ebp + leal 2(%ecx),%ecx + adcl $0,%edx + addl %eax,%ebp + movl (%esi,%ecx,4),%eax + adcl $0,%edx + cmpl %ebx,%ecx + movl %ebp,24(%esp,%ecx,4) + jl .L0123rdmadd + movl %edx,%ebp + mull %edi + addl 32(%esp,%ebx,4),%ebp + adcl $0,%edx + addl %eax,%ebp + adcl $0,%edx + movl %ebp,28(%esp,%ebx,4) + movl 12(%esp),%ecx + xorl %eax,%eax + movl 8(%esp),%esi + addl 36(%esp,%ebx,4),%edx + adcl 40(%esp,%ebx,4),%eax + movl %edx,32(%esp,%ebx,4) + cmpl %ebx,%ecx + movl %eax,36(%esp,%ebx,4) + je .L006common_tail + movl 4(%esi,%ecx,4),%edi + leal 1(%ecx),%ecx + movl %edi,%eax + movl %ecx,12(%esp) + mull %edi + addl 32(%esp,%ecx,4),%eax + adcl $0,%edx + movl %eax,32(%esp,%ecx,4) + xorl %ebp,%ebp + cmpl %ebx,%ecx + leal 1(%ecx),%ecx + je .L013sqrlast + movl %edx,%ebx + shrl $1,%edx + andl $1,%ebx +.align 16 +.L014sqradd: + movl (%esi,%ecx,4),%eax + movl %edx,%ebp + mull %edi + addl %ebp,%eax + leal (%eax,%eax,1),%ebp + adcl $0,%edx + shrl $31,%eax + addl 32(%esp,%ecx,4),%ebp + leal 1(%ecx),%ecx + adcl $0,%eax + addl %ebx,%ebp + adcl $0,%eax + cmpl (%esp),%ecx + movl %ebp,28(%esp,%ecx,4) + movl %eax,%ebx + jle .L014sqradd + movl %edx,%ebp + addl %edx,%edx + shrl $31,%ebp + addl %ebx,%edx + adcl $0,%ebp +.L013sqrlast: + movl 20(%esp),%edi + movl 16(%esp),%esi + imull 32(%esp),%edi + addl 32(%esp,%ecx,4),%edx + movl (%esi),%eax + adcl $0,%ebp + movl %edx,32(%esp,%ecx,4) + movl %ebp,36(%esp,%ecx,4) + mull %edi + addl 32(%esp),%eax + leal -1(%ecx),%ebx + adcl $0,%edx + movl $1,%ecx + movl 4(%esi),%eax + jmp .L0123rdmadd +.align 16 +.L006common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 16 +.L015sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge .L015sub + sbbl $0,%eax + andl %eax,%esi + notl %eax + movl %edi,%ebp + andl %eax,%ebp + orl %ebp,%esi +.align 16 +.L016copy: + movl (%esi,%ebx,4),%eax + movl %eax,(%edi,%ebx,4) + movl %ecx,32(%esp,%ebx,4) + decl %ebx + jge .L016copy + movl 24(%esp),%esp + movl $1,%eax +.L000just_leave: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_mul_mont,.-.L_bn_mul_mont_begin +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +.comm OPENSSL_ia32cap_P,8,4 |