summaryrefslogtreecommitdiff
path: root/main/openssl/crypto/modes/asm
diff options
context:
space:
mode:
Diffstat (limited to 'main/openssl/crypto/modes/asm')
-rw-r--r--main/openssl/crypto/modes/asm/ghash-x86.S1327
1 files changed, 934 insertions, 393 deletions
diff --git a/main/openssl/crypto/modes/asm/ghash-x86.S b/main/openssl/crypto/modes/asm/ghash-x86.S
index cb9ae20d..50473201 100644
--- a/main/openssl/crypto/modes/asm/ghash-x86.S
+++ b/main/openssl/crypto/modes/asm/ghash-x86.S
@@ -203,418 +203,94 @@ gcm_ghash_4bit_x86:
popl %ebp
ret
.size gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
-.type _mmx_gmult_4bit_inner,@function
+.globl gcm_gmult_4bit_mmx
+.type gcm_gmult_4bit_mmx,@function
.align 16
-_mmx_gmult_4bit_inner:
+gcm_gmult_4bit_mmx:
+.L_gcm_gmult_4bit_mmx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ call .L005pic_point
+.L005pic_point:
+ popl %eax
+ leal .Lrem_4bit-.L005pic_point(%eax),%eax
+ movzbl 15(%edi),%ebx
xorl %ecx,%ecx
movl %ebx,%edx
movb %dl,%cl
+ movl $14,%ebp
shlb $4,%cl
andl $240,%edx
movq 8(%esi,%ecx,1),%mm0
movq (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 14(%edi),%cl
- psllq $60,%mm2
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
+ jmp .L006mmx_loop
+.align 16
+.L006mmx_loop:
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%edx,1),%mm0
- movb 13(%edi),%cl
+ movb (%edi,%ebp,1),%cl
psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
+ decl %ebp
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 12(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
pxor (%esi,%edx,1),%mm1
movl %ecx,%edx
- movd %mm0,%ebx
pxor %mm2,%mm0
+ js .L007mmx_break
shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 11(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 10(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%ecx,1),%mm0
psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 9(%edi),%cl
- psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 8(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
pxor %mm2,%mm0
+ jmp .L006mmx_loop
+.align 16
+.L007mmx_break:
shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 7(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 6(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%ecx,1),%mm0
psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 5(%edi),%cl
- psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
pxor %mm2,%mm0
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 4(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 3(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 2(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%edx,1),%mm0
- movb 1(%edi),%cl
psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb (%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movd %mm0,%ebx
- pxor %mm2,%mm0
- movl 4(%eax,%ebp,8),%edi
psrlq $32,%mm0
movd %mm1,%edx
psrlq $32,%mm1
movd %mm0,%ecx
movd %mm1,%ebp
- shll $4,%edi
bswap %ebx
bswap %edx
bswap %ecx
- xorl %edi,%ebp
bswap %ebp
- ret
-.size _mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner
-.globl gcm_gmult_4bit_mmx
-.type gcm_gmult_4bit_mmx,@function
-.align 16
-gcm_gmult_4bit_mmx:
-.L_gcm_gmult_4bit_mmx_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- call .L005pic_point
-.L005pic_point:
- popl %eax
- leal .Lrem_4bit-.L005pic_point(%eax),%eax
- movzbl 15(%edi),%ebx
- call _mmx_gmult_4bit_inner
- movl 20(%esp),%edi
emms
movl %ebx,12(%edi)
movl %edx,4(%edi)
@@ -635,61 +311,926 @@ gcm_ghash_4bit_mmx:
pushl %ebx
pushl %esi
pushl %edi
- movl 20(%esp),%ebp
- movl 24(%esp),%esi
- movl 28(%esp),%edi
- movl 32(%esp),%ecx
- call .L006pic_point
-.L006pic_point:
- popl %eax
- leal .Lrem_4bit-.L006pic_point(%eax),%eax
- addl %edi,%ecx
- movl %ecx,32(%esp)
- subl $20,%esp
- movl 12(%ebp),%ebx
- movl 4(%ebp),%edx
- movl 8(%ebp),%ecx
- movl (%ebp),%ebp
- jmp .L007mmx_outer_loop
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ call .L008pic_point
+.L008pic_point:
+ popl %esi
+ leal .Lrem_8bit-.L008pic_point(%esi),%esi
+ subl $544,%esp
+ andl $-64,%esp
+ subl $16,%esp
+ addl %ecx,%edx
+ movl %eax,544(%esp)
+ movl %edx,552(%esp)
+ movl %ebp,556(%esp)
+ addl $128,%ebx
+ leal 144(%esp),%edi
+ leal 400(%esp),%ebp
+ movl -120(%ebx),%edx
+ movq -120(%ebx),%mm0
+ movq -128(%ebx),%mm3
+ shll $4,%edx
+ movb %dl,(%esp)
+ movl -104(%ebx),%edx
+ movq -104(%ebx),%mm2
+ movq -112(%ebx),%mm5
+ movq %mm0,-128(%edi)
+ psrlq $4,%mm0
+ movq %mm3,(%edi)
+ movq %mm3,%mm7
+ psrlq $4,%mm3
+ shll $4,%edx
+ movb %dl,1(%esp)
+ movl -88(%ebx),%edx
+ movq -88(%ebx),%mm1
+ psllq $60,%mm7
+ movq -96(%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-120(%edi)
+ psrlq $4,%mm2
+ movq %mm5,8(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-128(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,(%ebp)
+ shll $4,%edx
+ movb %dl,2(%esp)
+ movl -72(%ebx),%edx
+ movq -72(%ebx),%mm0
+ psllq $60,%mm6
+ movq -80(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-112(%edi)
+ psrlq $4,%mm1
+ movq %mm4,16(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-120(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,8(%ebp)
+ shll $4,%edx
+ movb %dl,3(%esp)
+ movl -56(%ebx),%edx
+ movq -56(%ebx),%mm2
+ psllq $60,%mm7
+ movq -64(%ebx),%mm5
+ por %mm7,%mm1
+ movq %mm0,-104(%edi)
+ psrlq $4,%mm0
+ movq %mm3,24(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-112(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,16(%ebp)
+ shll $4,%edx
+ movb %dl,4(%esp)
+ movl -40(%ebx),%edx
+ movq -40(%ebx),%mm1
+ psllq $60,%mm6
+ movq -48(%ebx),%mm4
+ por %mm6,%mm0
+ movq %mm2,-96(%edi)
+ psrlq $4,%mm2
+ movq %mm5,32(%edi)
+ movq %mm5,%mm7
+ movq %mm0,-104(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,24(%ebp)
+ shll $4,%edx
+ movb %dl,5(%esp)
+ movl -24(%ebx),%edx
+ movq -24(%ebx),%mm0
+ psllq $60,%mm7
+ movq -32(%ebx),%mm3
+ por %mm7,%mm2
+ movq %mm1,-88(%edi)
+ psrlq $4,%mm1
+ movq %mm4,40(%edi)
+ movq %mm4,%mm6
+ movq %mm2,-96(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,32(%ebp)
+ shll $4,%edx
+ movb %dl,6(%esp)
+ movl -8(%ebx),%edx
+ movq -8(%ebx),%mm2
+ psllq $60,%mm6
+ movq -16(%ebx),%mm5
+ por %mm6,%mm1
+ movq %mm0,-80(%edi)
+ psrlq $4,%mm0
+ movq %mm3,48(%edi)
+ movq %mm3,%mm7
+ movq %mm1,-88(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,40(%ebp)
+ shll $4,%edx
+ movb %dl,7(%esp)
+ movl 8(%ebx),%edx
+ movq 8(%ebx),%mm1
+ psllq $60,%mm7
+ movq (%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-72(%edi)
+ psrlq $4,%mm2
+ movq %mm5,56(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-80(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,48(%ebp)
+ shll $4,%edx
+ movb %dl,8(%esp)
+ movl 24(%ebx),%edx
+ movq 24(%ebx),%mm0
+ psllq $60,%mm6
+ movq 16(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-64(%edi)
+ psrlq $4,%mm1
+ movq %mm4,64(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-72(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,56(%ebp)
+ shll $4,%edx
+ movb %dl,9(%esp)
+ movl 40(%ebx),%edx
+ movq 40(%ebx),%mm2
+ psllq $60,%mm7
+ movq 32(%ebx),%mm5
+ por %mm7,%mm1
+ movq %mm0,-56(%edi)
+ psrlq $4,%mm0
+ movq %mm3,72(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-64(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,64(%ebp)
+ shll $4,%edx
+ movb %dl,10(%esp)
+ movl 56(%ebx),%edx
+ movq 56(%ebx),%mm1
+ psllq $60,%mm6
+ movq 48(%ebx),%mm4
+ por %mm6,%mm0
+ movq %mm2,-48(%edi)
+ psrlq $4,%mm2
+ movq %mm5,80(%edi)
+ movq %mm5,%mm7
+ movq %mm0,-56(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,72(%ebp)
+ shll $4,%edx
+ movb %dl,11(%esp)
+ movl 72(%ebx),%edx
+ movq 72(%ebx),%mm0
+ psllq $60,%mm7
+ movq 64(%ebx),%mm3
+ por %mm7,%mm2
+ movq %mm1,-40(%edi)
+ psrlq $4,%mm1
+ movq %mm4,88(%edi)
+ movq %mm4,%mm6
+ movq %mm2,-48(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,80(%ebp)
+ shll $4,%edx
+ movb %dl,12(%esp)
+ movl 88(%ebx),%edx
+ movq 88(%ebx),%mm2
+ psllq $60,%mm6
+ movq 80(%ebx),%mm5
+ por %mm6,%mm1
+ movq %mm0,-32(%edi)
+ psrlq $4,%mm0
+ movq %mm3,96(%edi)
+ movq %mm3,%mm7
+ movq %mm1,-40(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,88(%ebp)
+ shll $4,%edx
+ movb %dl,13(%esp)
+ movl 104(%ebx),%edx
+ movq 104(%ebx),%mm1
+ psllq $60,%mm7
+ movq 96(%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-24(%edi)
+ psrlq $4,%mm2
+ movq %mm5,104(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-32(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,96(%ebp)
+ shll $4,%edx
+ movb %dl,14(%esp)
+ movl 120(%ebx),%edx
+ movq 120(%ebx),%mm0
+ psllq $60,%mm6
+ movq 112(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-16(%edi)
+ psrlq $4,%mm1
+ movq %mm4,112(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-24(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,104(%ebp)
+ shll $4,%edx
+ movb %dl,15(%esp)
+ psllq $60,%mm7
+ por %mm7,%mm1
+ movq %mm0,-8(%edi)
+ psrlq $4,%mm0
+ movq %mm3,120(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-16(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,112(%ebp)
+ psllq $60,%mm6
+ por %mm6,%mm0
+ movq %mm0,-8(%ebp)
+ movq %mm3,120(%ebp)
+ movq (%eax),%mm6
+ movl 8(%eax),%ebx
+ movl 12(%eax),%edx
.align 16
-.L007mmx_outer_loop:
- xorl 12(%edi),%ebx
- xorl 4(%edi),%edx
- xorl 8(%edi),%ecx
- xorl (%edi),%ebp
- movl %edi,48(%esp)
- movl %ebx,12(%esp)
- movl %edx,4(%esp)
- movl %ecx,8(%esp)
- movl %ebp,(%esp)
- movl %esp,%edi
- shrl $24,%ebx
- call _mmx_gmult_4bit_inner
- movl 48(%esp),%edi
- leal 16(%edi),%edi
- cmpl 52(%esp),%edi
- jb .L007mmx_outer_loop
- movl 40(%esp),%edi
+.L009outer:
+ xorl 12(%ecx),%edx
+ xorl 8(%ecx),%ebx
+ pxor (%ecx),%mm6
+ leal 16(%ecx),%ecx
+ movl %ebx,536(%esp)
+ movq %mm6,528(%esp)
+ movl %ecx,548(%esp)
+ xorl %eax,%eax
+ roll $8,%edx
+ movb %dl,%al
+ movl %eax,%ebp
+ andb $15,%al
+ shrl $4,%ebp
+ pxor %mm0,%mm0
+ roll $8,%edx
+ pxor %mm1,%mm1
+ pxor %mm2,%mm2
+ movq 16(%esp,%eax,8),%mm7
+ movq 144(%esp,%eax,8),%mm6
+ movb %dl,%al
+ movd %mm7,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ shrl $4,%edi
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 536(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 532(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 528(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 524(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ pxor 144(%esp,%eax,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ movzbl %bl,%ebx
+ pxor %mm2,%mm2
+ psllq $4,%mm1
+ movd %mm7,%ecx
+ psrlq $4,%mm7
+ movq %mm6,%mm3
+ psrlq $4,%mm6
+ shll $4,%ecx
+ pxor 16(%esp,%edi,8),%mm7
+ psllq $60,%mm3
+ movzbl %cl,%ecx
+ pxor %mm3,%mm7
+ pxor 144(%esp,%edi,8),%mm6
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor %mm1,%mm6
+ movd %mm7,%edx
+ pinsrw $3,(%esi,%ecx,2),%mm2
+ psllq $12,%mm0
+ pxor %mm0,%mm6
+ psrlq $32,%mm7
+ pxor %mm2,%mm6
+ movl 548(%esp),%ecx
+ movd %mm7,%ebx
+ movq %mm6,%mm3
+ psllw $8,%mm6
+ psrlw $8,%mm3
+ por %mm3,%mm6
+ bswap %edx
+ pshufw $27,%mm6,%mm6
+ bswap %ebx
+ cmpl 552(%esp),%ecx
+ jne .L009outer
+ movl 544(%esp),%eax
+ movl %edx,12(%eax)
+ movl %ebx,8(%eax)
+ movq %mm6,(%eax)
+ movl 556(%esp),%esp
emms
- movl %ebx,12(%edi)
- movl %edx,4(%edi)
- movl %ecx,8(%edi)
- movl %ebp,(%edi)
- addl $20,%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
+.globl gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ call .L010pic
+.L010pic:
+ popl %ecx
+ leal .Lbswap-.L010pic(%ecx),%ecx
+ movdqu (%eax),%xmm2
+ pshufd $78,%xmm2,%xmm2
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+ pand 16(%ecx),%xmm5
+ pxor %xmm5,%xmm2
+ movdqa %xmm2,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ movdqu %xmm2,(%edx)
+ movdqu %xmm0,16(%edx)
+ ret
+.size gcm_init_clmul,.-.L_gcm_init_clmul_begin
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.L_gcm_gmult_clmul_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ call .L011pic
+.L011pic:
+ popl %ecx
+ leal .Lbswap-.L011pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movups (%edx),%xmm2
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ ret
+.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
+.globl gcm_ghash_clmul
+.type gcm_ghash_clmul,@function
+.align 16
+gcm_ghash_clmul:
+.L_gcm_ghash_clmul_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%eax
+ movl 24(%esp),%edx
+ movl 28(%esp),%esi
+ movl 32(%esp),%ebx
+ call .L012pic
+.L012pic:
+ popl %ecx
+ leal .Lbswap-.L012pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movdqu (%edx),%xmm2
+.byte 102,15,56,0,197
+ subl $16,%ebx
+ jz .L013odd_tail
+ movdqu (%esi),%xmm3
+ movdqu 16(%esi),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+ pxor %xmm3,%xmm0
+ movdqa %xmm6,%xmm7
+ pshufd $78,%xmm6,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm6,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,242,0
+.byte 102,15,58,68,250,17
+.byte 102,15,58,68,220,0
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm7
+ pxor %xmm4,%xmm6
+ movups 16(%edx),%xmm2
+ leal 32(%esi),%esi
+ subl $32,%ebx
+ jbe .L014even_tail
+.L015mod_loop:
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqu (%esi),%xmm3
+ movups (%edx),%xmm2
+ pxor %xmm6,%xmm0
+ pxor %xmm7,%xmm1
+ movdqu 16(%esi),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,%xmm7
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+.byte 102,15,58,68,242,0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm5,%xmm3
+ pshufd $78,%xmm2,%xmm5
+ pxor %xmm2,%xmm5
+.byte 102,15,58,68,250,17
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+.byte 102,15,58,68,221,0
+ movups 16(%edx),%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm3
+ movdqa %xmm3,%xmm5
+ psrldq $8,%xmm3
+ pslldq $8,%xmm5
+ pxor %xmm3,%xmm7
+ pxor %xmm5,%xmm6
+ movdqa (%ecx),%xmm5
+ leal 32(%esi),%esi
+ subl $32,%ebx
+ ja .L015mod_loop
+.L014even_tail:
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ pxor %xmm6,%xmm0
+ pxor %xmm7,%xmm1
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ testl %ebx,%ebx
+ jnz .L016done
+ movups (%edx),%xmm2
+.L013odd_tail:
+ movdqu (%esi),%xmm3
+.byte 102,15,56,0,221
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm4
+ pslldq $8,%xmm0
+ psrldq $8,%xmm4
+ pxor %xmm3,%xmm0
+ pxor %xmm4,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm0
+.L016done:
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
+.align 64
+.Lbswap:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
.align 64
.Lrem_4bit:
-.long 0,0,0,29491200,0,58982400,0,38141952
-.long 0,117964800,0,113901568,0,76283904,0,88997888
-.long 0,235929600,0,265420800,0,227803136,0,206962688
-.long 0,152567808,0,148504576,0,177995776,0,190709760
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
.align 64
-.L008rem_8bit:
+.Lrem_8bit:
.value 0,450,900,582,1800,1738,1164,1358
.value 3600,4050,3476,3158,2328,2266,2716,2910
.value 7200,7650,8100,7782,6952,6890,6316,6510