From 3e121542d8b7ab5201c47bbd3ba5611a23c54759 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Parm=C3=A9nides=20GV?=
Date: Wed, 11 Jun 2014 11:56:59 +0200
Subject: Correctly connects to millipede.

The location keyword in android.cfg isn't supported, so the corresponding
EIP code has been commented out. I think we should support it in
ics-openvpn, so that we can show the location instead of the server name.

I've updated the openssl, openvpn, etc. subprojects from rev 813 of
ics-openvpn, and the jni sources too.
---
 app/openssl/crypto/bn/asm/x86_64-mont.S | 1374 +++++++++++++++++++++++++++++++
 1 file changed, 1374 insertions(+)
 create mode 100644 app/openssl/crypto/bn/asm/x86_64-mont.S

diff --git a/app/openssl/crypto/bn/asm/x86_64-mont.S b/app/openssl/crypto/bn/asm/x86_64-mont.S
new file mode 100644
index 00000000..95e29052
--- /dev/null
+++ b/app/openssl/crypto/bn/asm/x86_64-mont.S
@@ -0,0 +1,1374 @@
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ jmp .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 2(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jl .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ movq %r9,%r15
+ orq %rcx,%rsi
+.align 16
+.Lcopy:
+ movq (%rsi,%r14,8),%rax
+ movq %r14,(%rsp,%r14,8)
+ movq %rax,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jl .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jl .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -1(%r9),%r15
+ orq %rcx,%rsi
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.type bn_sqr4x_mont,@function
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ movq %rsp,%r11
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,2),%rsp
+ andq $-1024,%rsp
+
+
+
+
+
+
+
+
+
+
+
+ movq %rdi,32(%rsp)
+ movq %rcx,40(%rsp)
+ movq %r8,48(%rsp)
+ movq %r11,56(%rsp)
+.Lsqr4x_body:
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,16(%rdi,%rcx,1)
+
+
+ movq 24(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 32(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ leaq 16(%rbp),%rbp
+ movq %r12,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ movq -24(%rdi,%rbp,1),%r10
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ xorq %r10,%r10
+ addq -16(%rdi,%rbp,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ leaq -16(%rbp),%rcx
+ xorq %r12,%r12
+
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,8(%rdi,%rcx,1)
+
+ leaq 16(%rcx),%rcx
+ jmp .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r12,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,(%rdi,%rcx,1)
+
+ movq 8(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ leaq 16(%rcx),%rcx
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 64(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-24(%rdi)
+
+ xorq %r10,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ movq %r11,-16(%rdi)
+
+ movq -8(%rsi),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+
+ xorq %r11,%r11
+ addq %r12,%r10
+ movq %rdx,%r13
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq %rdx,%r11
+ movq %r10,-8(%rdi)
+
+ xorq %r12,%r12
+ addq %r11,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq %rdx,%r12
+
+ movq %r13,(%rdi)
+ movq %r12,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 64(%rsp,%r9,2),%rdi
+ xorq %r10,%r10
+ movq -24(%rdi,%rbp,2),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,-40(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ jmp .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi,%rbp,2),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi,%rbp,2)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi,%rbp,2),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi,%rbp,2),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi,%rbp,2)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi,%rbp,2)
+ sbbq %r15,%r15
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+ movq 40(%rsp),%rsi
+ movq 48(%rsp),%r8
+ xorq %rcx,%rcx
+ movq %r9,0(%rsp)
+ subq %r9,%rcx
+ movq 64(%rsp),%r10
+ movq %r8,%r14
+ leaq 64(%rsp,%r9,2),%rax
+ leaq 64(%rsp,%r9,1),%rdi
+ movq %rax,8(%rsp)
+ leaq (%rsi,%r9,1),%rsi
+ xorq %rbp,%rbp
+
+ movq 0(%rsi,%rcx,1),%rax
+ movq 8(%rsi,%rcx,1),%r9
+ imulq %r10,%r14
+ movq %rax,%rbx
+ jmp .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+ xorq %r11,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r8,%r15
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+ imulq %r11,%r15
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ jmp .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+ movq (%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq (%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 8(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 8(%rdi,%rcx,1),%r11
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+
+
+ movq 16(%rsi,%rcx,1),%rbx
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq %rdx,%r13
+ movq %r12,8(%rdi,%rcx,1)
+
+ xorq %r11,%r11
+ addq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r11
+ mulq %r14
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+
+ movq 24(%rsi,%rcx,1),%r9
+ xorq %r12,%r12
+ addq %r10,%r13
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %r9,%rax
+ adcq %rdx,%r12
+ movq %r13,16(%rdi,%rcx,1)
+
+ xorq %r10,%r10
+ addq 24(%rdi,%rcx,1),%r11
+ leaq 32(%rcx),%rcx
+ adcq $0,%r10
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq %rdx,%r10
+ cmpq $0,%rcx
+ jne .Lsqr4x_mont_inner
+
+ subq 0(%rsp),%rcx
+ movq %r8,%r14
+
+ xorq %r13,%r13
+ addq %r11,%r12
+ adcq $0,%r13
+ mulq %r15
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ movq %r12,-8(%rdi)
+
+ xorq %r11,%r11
+ addq (%rdi),%r10
+ adcq $0,%r11
+ movq 0(%rsi,%rcx,1),%rbx
+ addq %rbp,%r10
+ adcq $0,%r11
+
+ imulq 16(%rdi,%rcx,1),%r14
+ xorq %r12,%r12
+ movq 8(%rsi,%rcx,1),%r9
+ addq %r10,%r13
+ movq 16(%rdi,%rcx,1),%r10
+ adcq $0,%r12
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ adcq %rdx,%r12
+ movq %r13,(%rdi)
+
+ xorq %rbp,%rbp
+ addq 8(%rdi),%r12
+ adcq %rbp,%rbp
+ addq %r11,%r12
+ leaq 16(%rdi),%rdi
+ adcq $0,%rbp
+ movq %r12,-8(%rdi)
+ cmpq 8(%rsp),%rdi
+ jb .Lsqr4x_mont_outer
+
+ movq 0(%rsp),%r9
+ movq %rbp,(%rdi)
+ movq 64(%rsp,%r9,1),%rax
+ leaq 64(%rsp,%r9,1),%rbx
+ movq 40(%rsp),%rsi
+ shrq $5,%r9
+ movq 8(%rbx),%rdx
+ xorq %rbp,%rbp
+
+ movq 32(%rsp),%rdi
+ subq 0(%rsi),%rax
+ movq 16(%rbx),%r10
+ movq 24(%rbx),%r11
+ sbbq 8(%rsi),%rdx
+ leaq -1(%r9),%rcx
+ jmp .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+ movq %rax,0(%rdi,%rbp,8)
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq 32(%rbx,%rbp,8),%rax
+ movq 40(%rbx,%rbp,8),%rdx
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+ movq %r11,24(%rdi,%rbp,8)
+ sbbq 32(%rsi,%rbp,8),%rax
+ movq 48(%rbx,%rbp,8),%r10
+ movq 56(%rbx,%rbp,8),%r11
+ sbbq 40(%rsi,%rbp,8),%rdx
+ leaq 4(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %rax,0(%rdi,%rbp,8)
+ movq 32(%rbx,%rbp,8),%rax
+ sbbq 16(%rsi,%rbp,8),%r10
+ movq %rdx,8(%rdi,%rbp,8)
+ sbbq 24(%rsi,%rbp,8),%r11
+ movq %r10,16(%rdi,%rbp,8)
+
+ sbbq $0,%rax
+ movq %r11,24(%rdi,%rbp,8)
+ xorq %rbp,%rbp
+ andq %rax,%rbx
+ notq %rax
+ movq %rdi,%rsi
+ andq %rax,%rsi
+ leaq -1(%r9),%rcx
+ orq %rsi,%rbx
+
+ pxor %xmm0,%xmm0
+ leaq 64(%rsp,%r9,8),%rsi
+ movdqu (%rbx),%xmm1
+ leaq (%rsi,%r9,8),%rsi
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,(%rsi)
+ movdqu %xmm1,(%rdi)
+ jmp .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy:
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqu 32(%rbx,%rbp,1),%xmm1
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,96(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqa %xmm0,32(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movdqu %xmm1,32(%rdi,%rbp,1)
+ leaq 32(%rbp),%rbp
+ decq %rcx
+ jnz .Lsqr4x_copy
+
+ movdqu 16(%rbx,%rbp,1),%xmm2
+ movdqa %xmm0,80(%rsp,%rbp,1)
+ movdqa %xmm0,16(%rsi,%rbp,1)
+ movdqu %xmm2,16(%rdi,%rbp,1)
+ movq 56(%rsp),%rsi
+ movq $1,%rax
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lsqr4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_sqr4x_mont,.-bn_sqr4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
--
cgit v1.2.3
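
For reference, the routines this patch adds (bn_mul_mont, plus the unrolled
bn_mul4x_mont and squaring bn_sqr4x_mont paths that bn_mul_mont's entry
dispatches to) compute a Montgomery product: given num-word operands ap and
bp, an odd modulus np and the precomputed word n0 = -np^-1 mod 2^64, they
store ap*bp*R^-1 mod np into rp, where R = 2^(64*num). Per the System V
AMD64 calling convention those arguments arrive in %rdi, %rsi, %rdx, %rcx,
%r8 and %r9d, which is why the listing indexes through exactly those
registers. The Python sketch below shows the same reduction on
arbitrary-precision integers; it is an illustration only, and the helper
name mont_mul is ours, not OpenSSL's:

    def mont_mul(a, b, n, k):
        # Montgomery product a*b*R^-1 mod n, where R = 2**k > n and n is odd.
        R = 1 << k
        n_neg_inv = pow(-n, -1, R)     # -n^-1 mod R, the role of n0 (Python 3.8+)
        t = a * b
        m = (t * n_neg_inv) % R        # chosen so that t + m*n is divisible by R
        u = (t + m * n) >> k           # division by R becomes an exact shift
        return u - n if u >= n else u  # one conditional subtraction, since u < 2n

    # Round trip: multiplying Montgomery residues a*R and b*R yields (a*b)*R mod n.
    n, k = 101, 8
    R = 1 << k
    a, b = 57, 83
    assert mont_mul(a * R % n, b * R % n, n, k) == a * b * R % n

The point of the representation is that every reduction happens modulo a
power of two, so the assembly needs only word multiplies, add-with-carry
chains and shifts (the mulq/adcq patterns that dominate the listing), never
a division by np.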