From e436c963f0976b885a7db04681344779e26dd3b5 Mon Sep 17 00:00:00 2001
From: Arne Schwabe
Date: Wed, 23 Apr 2014 09:56:37 +0200
Subject: Update OpenSSL to 1.0.1g and statically link OpenVPN with it

---
 main/openssl/crypto/bn/asm/x86_64-mont5.S | 784 ++++++++++++++++++++++++++++++
 1 file changed, 784 insertions(+)
 create mode 100644 main/openssl/crypto/bn/asm/x86_64-mont5.S

diff --git a/main/openssl/crypto/bn/asm/x86_64-mont5.S b/main/openssl/crypto/bn/asm/x86_64-mont5.S
new file mode 100644
index 00000000..49ec6ac6
--- /dev/null
+++ b/main/openssl/crypto/bn/asm/x86_64-mont5.S
@@ -0,0 +1,784 @@
+.text
+
+.globl	bn_mul_mont_gather5
+.type	bn_mul_mont_gather5,@function
+.align	64
+bn_mul_mont_gather5:
+	testl	$3,%r9d
+	jnz	.Lmul_enter
+	cmpl	$8,%r9d
+	jb	.Lmul_enter
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	movl	%r9d,%r9d
+	movl	8(%rsp),%r10d
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%rax
+	leaq	2(%r9),%r11
+	negq	%r11
+	leaq	(%rsp,%r11,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul_body:
+	movq	%rdx,%r12
+	movq	%r10,%r11
+	shrq	$3,%r10
+	andq	$7,%r11
+	notq	%r10
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%r10
+	leaq	96(%r12,%r11,8),%r12
+	movq	0(%rax,%r10,8),%xmm4
+	movq	8(%rax,%r10,8),%xmm5
+	movq	16(%rax,%r10,8),%xmm6
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+.byte	102,72,15,126,195
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+.byte	102,72,15,126,195
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+.byte	102,72,15,126,195
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jl	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	leaq	(%rsp),%rsi
+	movq	%r9,%r15
+	jmp	.Lsub
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsi,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	movq	%r9,%r15
+	orq	%rcx,%rsi
+.align	16
+.Lcopy:
+	movq	(%rsi,%r14,8),%rax
+	movq	%r14,(%rsp,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type	bn_mul4x_mont_gather5,@function
+.align	16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+	movl	%r9d,%r9d
+	movl	8(%rsp),%r10d
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	movq	%rsp,%rax
+	leaq	4(%r9),%r11
+	negq	%r11
+	leaq	(%rsp,%r11,8),%rsp
+	andq	$-1024,%rsp
+
+	movq	%rax,8(%rsp,%r9,8)
+.Lmul4x_body:
+	movq	%rdi,16(%rsp,%r9,8)
+	movq	%rdx,%r12
+	movq	%r10,%r11
+	shrq	$3,%r10
+	andq	$7,%r11
+	notq	%r10
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%r10
+	leaq	96(%r12,%r11,8),%r12
+	movq	0(%rax,%r10,8),%xmm4
+	movq	8(%rax,%r10,8),%xmm5
+	movq	16(%rax,%r10,8),%xmm6
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+.byte	102,72,15,126,195
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.byte	102,72,15,126,195
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	leaq	1(%r14),%r14
+.align	4
+.Louter4x:
+	xorq	%r15,%r15
+	movq	-96(%r12),%xmm0
+	movq	-32(%r12),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%r12),%xmm2
+	pand	%xmm5,%xmm1
+
+	movq	(%rsp),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	movq	96(%r12),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	por	%xmm2,%xmm0
+	leaq	256(%r12),%r12
+	por	%xmm3,%xmm0
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-40(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jl	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	1(%r14),%r14
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.byte	102,72,15,126,195
+	movq	%rdi,-16(%rsp,%r15,8)
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%rsp,%r9,8),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	cmpq	%r9,%r14
+	jl	.Louter4x
+	movq	16(%rsp,%r9,8),%rdi
+	movq	0(%rsp),%rax
+	pxor	%xmm0,%xmm0
+	movq	8(%rsp),%rdx
+	shrq	$2,%r9
+	leaq	(%rsp),%rsi
+	xorq	%r14,%r14
+
+	subq	0(%rcx),%rax
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rbp
+	sbbq	8(%rcx),%rdx
+	leaq	-1(%r9),%r15
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	movq	%rax,0(%rdi,%r14,8)
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	32(%rsi,%r14,8),%rax
+	movq	40(%rsi,%r14,8),%rdx
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+	movq	%rbp,24(%rdi,%r14,8)
+	sbbq	32(%rcx,%r14,8),%rax
+	movq	48(%rsi,%r14,8),%rbx
+	movq	56(%rsi,%r14,8),%rbp
+	sbbq	40(%rcx,%r14,8),%rdx
+	leaq	4(%r14),%r14
+	decq	%r15
+	jnz	.Lsub4x
+
+	movq	%rax,0(%rdi,%r14,8)
+	movq	32(%rsi,%r14,8),%rax
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+
+	sbbq	$0,%rax
+	movq	%rbp,24(%rdi,%r14,8)
+	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	leaq	-1(%r9),%r15
+	orq	%rcx,%rsi
+
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,(%rdi)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
+	leaq	32(%r14),%r14
+	decq	%r15
+	jnz	.Lcopy4x
+
+	shlq	$2,%r9
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movq	8(%rsp,%r9,8),%rsi
+	movq	$1,%rax
+	movq	(%rsi),%r15
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r13
+	movq	24(%rsi),%r12
+	movq	32(%rsi),%rbp
+	movq	40(%rsi),%rbx
+	leaq	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+.globl	bn_scatter5
+.type	bn_scatter5,@function
+.align	16
+bn_scatter5:
+	cmpq	$0,%rsi
+	jz	.Lscatter_epilogue
+	leaq	(%rdx,%rcx,8),%rdx
+.Lscatter:
+	movq	(%rdi),%rax
+	leaq	8(%rdi),%rdi
+	movq	%rax,(%rdx)
+	leaq	256(%rdx),%rdx
+	subq	$1,%rsi
+	jnz	.Lscatter
+.Lscatter_epilogue:
+	.byte	0xf3,0xc3
+.size	bn_scatter5,.-bn_scatter5
+
+.globl	bn_gather5
+.type	bn_gather5,@function
+.align	16
+bn_gather5:
+	movq	%rcx,%r11
+	shrq	$3,%rcx
+	andq	$7,%r11
+	notq	%rcx
+	leaq	.Lmagic_masks(%rip),%rax
+	andq	$3,%rcx
+	leaq	96(%rdx,%r11,8),%rdx
+	movq	0(%rax,%rcx,8),%xmm4
+	movq	8(%rax,%rcx,8),%xmm5
+	movq	16(%rax,%rcx,8),%xmm6
+	movq	24(%rax,%rcx,8),%xmm7
+	jmp	.Lgather
+.align	16
+.Lgather:
+	movq	-96(%rdx),%xmm0
+	movq	-32(%rdx),%xmm1
+	pand	%xmm4,%xmm0
+	movq	32(%rdx),%xmm2
+	pand	%xmm5,%xmm1
+	movq	96(%rdx),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	leaq	256(%rdx),%rdx
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,(%rdi)
+	leaq	8(%rdi),%rdi
+	subq	$1,%rsi
+	jnz	.Lgather
+	.byte	0xf3,0xc3
+.LSEH_end_bn_gather5:
+.size	bn_gather5,.-bn_gather5
+.align	64
+.Lmagic_masks:
+.long	0,0, 0,0, 0,0, -1,-1
+.long	0,0, 0,0, 0,0, 0,0
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
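
Note on the scatter/gather routines added above: the pre-computation table they work on stores 32 bignum "powers" interleaved with a 256-byte stride, which is what the `leaq 256(%rdx),%rdx` steps in bn_scatter5/bn_gather5 implement. The following C fragment is only an illustrative sketch of that layout, not the OpenSSL API; the helper names scatter5/gather5 and the plain indexed loads are hypothetical, and the real bn_gather5 additionally selects the entry through the .Lmagic_masks SSE masks so the load addresses depend less directly on the requested index.

#include <stdint.h>
#include <stddef.h>

/*
 * Hypothetical illustration of the table layout used by bn_scatter5 /
 * bn_gather5 above: 64-bit word i of power p lives at tbl[i*32 + p],
 * i.e. consecutive words of one power are 256 bytes (32 qwords) apart.
 */
static void scatter5(const uint64_t *num, size_t n_words,
                     uint64_t *tbl, size_t power)
{
    for (size_t i = 0; i < n_words; i++)
        tbl[i * 32 + power] = num[i];   /* 32 qwords == 256-byte stride */
}

static void gather5(uint64_t *out, size_t n_words,
                    const uint64_t *tbl, size_t power)
{
    for (size_t i = 0; i < n_words; i++)
        out[i] = tbl[i * 32 + power];   /* plain load; the asm masks 4 candidates */
}

bn_mul_mont_gather5 combines such a gather with Montgomery multiplication, pulling the next multiplicand out of this interleaved table while the outer loop runs.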