Diffstat (limited to 'secure/lib/libcrypto/i386/poly1305-x86.S')
-rw-r--r--   secure/lib/libcrypto/i386/poly1305-x86.S   1110
1 file changed, 1110 insertions, 0 deletions
diff --git a/secure/lib/libcrypto/i386/poly1305-x86.S b/secure/lib/libcrypto/i386/poly1305-x86.S
index b394500278d5..100deee40bf2 100644
--- a/secure/lib/libcrypto/i386/poly1305-x86.S
+++ b/secure/lib/libcrypto/i386/poly1305-x86.S
@@ -36,6 +36,10 @@ poly1305_init:
jne .L002no_sse2
leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
+ movl 8(%edi),%ecx
+ testl $32,%ecx
+ jz .L002no_sse2
+ leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
movl 20(%esp),%edi
movl %eax,(%ebp)
@@ -1344,6 +1348,557 @@ _poly1305_emit_sse2:
popl %ebp
ret
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
+.align 32
+.type _poly1305_init_avx2,@function
+.align 16
+_poly1305_init_avx2:
+ vmovdqu 24(%edi),%xmm4
+ leal 48(%edi),%edi
+ movl %esp,%ebp
+ subl $224,%esp
+ andl $-16,%esp
+ vmovdqa 64(%ebx),%xmm7
+ vpand %xmm7,%xmm4,%xmm0
+ vpsrlq $26,%xmm4,%xmm1
+ vpsrldq $6,%xmm4,%xmm3
+ vpand %xmm7,%xmm1,%xmm1
+ vpsrlq $4,%xmm3,%xmm2
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm7,%xmm2,%xmm2
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrldq $13,%xmm4,%xmm4
+ leal 144(%esp),%edx
+ movl $2,%ecx
+.L018square:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ vmovdqa %xmm4,64(%esp)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqa %xmm6,80(%esp)
+ vmovdqa %xmm5,96(%esp)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqa %xmm6,112(%esp)
+ vmovdqa %xmm5,128(%esp)
+ vpshufd $68,%xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vpshufd $68,%xmm1,%xmm1
+ vpshufd $68,%xmm2,%xmm2
+ vpshufd $68,%xmm3,%xmm3
+ vpshufd $68,%xmm4,%xmm4
+ vmovdqa %xmm5,(%edx)
+ vmovdqa %xmm1,16(%edx)
+ vmovdqa %xmm2,32(%edx)
+ vmovdqa %xmm3,48(%edx)
+ vmovdqa %xmm4,64(%edx)
+ vpmuludq %xmm0,%xmm4,%xmm4
+ vpmuludq %xmm0,%xmm3,%xmm3
+ vpmuludq %xmm0,%xmm2,%xmm2
+ vpmuludq %xmm0,%xmm1,%xmm1
+ vpmuludq %xmm0,%xmm5,%xmm0
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpmuludq 32(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vmovdqa 80(%esp),%xmm7
+ vpmuludq (%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 32(%esp),%xmm5
+ vpmuludq 64(%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm4,%xmm4
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vmovdqa 96(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm1,%xmm1
+ vmovdqa 48(%esp),%xmm5
+ vpmuludq 48(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vmovdqa 112(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm3,%xmm3
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm2,%xmm2
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm1,%xmm1
+ vmovdqa 64(%esp),%xmm7
+ vpmuludq 32(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vmovdqa 128(%esp),%xmm5
+ vpmuludq (%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vpmuludq 64(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm7
+ vpmuludq 48(%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpsrlq $26,%xmm3,%xmm5
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrlq $26,%xmm0,%xmm6
+ vpand %xmm7,%xmm0,%xmm0
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm6,%xmm1,%xmm1
+ vpsrlq $26,%xmm4,%xmm5
+ vpand %xmm7,%xmm4,%xmm4
+ vpsrlq $26,%xmm1,%xmm6
+ vpand %xmm7,%xmm1,%xmm1
+ vpaddq %xmm6,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpsllq $2,%xmm5,%xmm5
+ vpsrlq $26,%xmm2,%xmm6
+ vpand %xmm7,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpaddd %xmm6,%xmm3,%xmm3
+ vpsrlq $26,%xmm3,%xmm6
+ vpsrlq $26,%xmm0,%xmm5
+ vpand %xmm7,%xmm0,%xmm0
+ vpand %xmm7,%xmm3,%xmm3
+ vpaddd %xmm5,%xmm1,%xmm1
+ vpaddd %xmm6,%xmm4,%xmm4
+ decl %ecx
+ jz .L019square_break
+ vpunpcklqdq (%esp),%xmm0,%xmm0
+ vpunpcklqdq 16(%esp),%xmm1,%xmm1
+ vpunpcklqdq 32(%esp),%xmm2,%xmm2
+ vpunpcklqdq 48(%esp),%xmm3,%xmm3
+ vpunpcklqdq 64(%esp),%xmm4,%xmm4
+ jmp .L018square
+.L019square_break:
+ vpsllq $32,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm1
+ vpsllq $32,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm3
+ vpsllq $32,%xmm4,%xmm4
+ vpor (%esp),%xmm0,%xmm0
+ vpor 16(%esp),%xmm1,%xmm1
+ vpor 32(%esp),%xmm2,%xmm2
+ vpor 48(%esp),%xmm3,%xmm3
+ vpor 64(%esp),%xmm4,%xmm4
+ vpshufd $141,%xmm0,%xmm0
+ vpshufd $141,%xmm1,%xmm1
+ vpshufd $141,%xmm2,%xmm2
+ vpshufd $141,%xmm3,%xmm3
+ vpshufd $141,%xmm4,%xmm4
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ vmovdqu %xmm4,64(%edi)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqu %xmm6,80(%edi)
+ vmovdqu %xmm5,96(%edi)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm6,112(%edi)
+ vmovdqu %xmm5,128(%edi)
+ movl %ebp,%esp
+ leal -48(%edi),%edi
+ ret
+.size _poly1305_init_avx2,.-_poly1305_init_avx2
+.align 32
+.type _poly1305_blocks_avx2,@function
+.align 16
+_poly1305_blocks_avx2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 20(%edi),%eax
+ andl $-16,%ecx
+ jz .L020nodata
+ cmpl $64,%ecx
+ jae .L021enter_avx2
+ testl %eax,%eax
+ jz .Lenter_blocks
+.L021enter_avx2:
+ vzeroupper
+ call .L022pic_point
+.L022pic_point:
+ popl %ebx
+ leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
+ testl %eax,%eax
+ jnz .L023base2_26
+ call _poly1305_init_avx2
+ movl (%edi),%eax
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ shrl $2,%ecx
+ andl $67108863,%eax
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movl %eax,(%edi)
+ movl %ecx,4(%edi)
+ movl %edx,8(%edi)
+ movl %esi,12(%edi)
+ movl %ebp,16(%edi)
+ movl $1,20(%edi)
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+.L023base2_26:
+ movl 32(%esp),%eax
+ movl %esp,%ebp
+ subl $448,%esp
+ andl $-512,%esp
+ vmovdqu 48(%edi),%xmm0
+ leal 288(%esp),%edx
+ vmovdqu 64(%edi),%xmm1
+ vmovdqu 80(%edi),%xmm2
+ vmovdqu 96(%edi),%xmm3
+ vmovdqu 112(%edi),%xmm4
+ leal 48(%edi),%edi
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpermq $64,%ymm4,%ymm4
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vpshufd $200,%ymm4,%ymm4
+ vmovdqa %ymm0,-128(%edx)
+ vmovdqu 80(%edi),%xmm0
+ vmovdqa %ymm1,-96(%edx)
+ vmovdqu 96(%edi),%xmm1
+ vmovdqa %ymm2,-64(%edx)
+ vmovdqu 112(%edi),%xmm2
+ vmovdqa %ymm3,-32(%edx)
+ vmovdqu 128(%edi),%xmm3
+ vmovdqa %ymm4,(%edx)
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vmovdqa %ymm0,32(%edx)
+ vmovd -48(%edi),%xmm0
+ vmovdqa %ymm1,64(%edx)
+ vmovd -44(%edi),%xmm1
+ vmovdqa %ymm2,96(%edx)
+ vmovd -40(%edi),%xmm2
+ vmovdqa %ymm3,128(%edx)
+ vmovd -36(%edi),%xmm3
+ vmovd -32(%edi),%xmm4
+ vmovdqa 64(%ebx),%ymm7
+ negl %eax
+ testl $63,%ecx
+ jz .L024even
+ movl %ecx,%edx
+ andl $-64,%ecx
+ andl $63,%edx
+ vmovdqu (%esi),%xmm5
+ cmpl $32,%edx
+ jb .L025one
+ vmovdqu 16(%esi),%xmm6
+ je .L026two
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ leal 48(%esi),%esi
+ leal 8(%ebx),%ebx
+ leal 296(%esp),%edx
+ jmp .L027tail
+.L026two:
+ leal 32(%esi),%esi
+ leal 16(%ebx),%ebx
+ leal 304(%esp),%edx
+ jmp .L027tail
+.L025one:
+ leal 16(%esi),%esi
+ vpxor %ymm6,%ymm6,%ymm6
+ leal 32(%ebx,%eax,8),%ebx
+ leal 312(%esp),%edx
+ jmp .L027tail
+.align 32
+.L024even:
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jz .L027tail
+.L028loop:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -64(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 96(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 128(%edx),%ymm2,%ymm1
+ vpmuludq -128(%edx),%ymm2,%ymm2
+ vpmuludq -32(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq (%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -96(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -64(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -64(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -32(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 128(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -128(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -128(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 64(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 128(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 32(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 64(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 96(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jnz .L028loop
+.L027tail:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ andl $-64,%ebx
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -60(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 100(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 132(%edx),%ymm2,%ymm1
+ vpmuludq -124(%edx),%ymm2,%ymm2
+ vpmuludq -28(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 4(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -92(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -60(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -60(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -28(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 132(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -124(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -124(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -92(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 68(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 100(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 132(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 132(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 36(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 68(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 100(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrldq $8,%ymm4,%ymm5
+ vpsrldq $8,%ymm3,%ymm6
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpsrldq $8,%ymm0,%ymm5
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrldq $8,%ymm1,%ymm6
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsrldq $8,%ymm2,%ymm5
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpermq $2,%ymm4,%ymm6
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpermq $2,%ymm3,%ymm5
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpermq $2,%ymm0,%ymm6
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpermq $2,%ymm1,%ymm5
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpermq $2,%ymm2,%ymm6
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ cmpl $0,%ecx
+ je .L029done
+ vpshufd $252,%xmm0,%xmm0
+ leal 288(%esp),%edx
+ vpshufd $252,%xmm1,%xmm1
+ vpshufd $252,%xmm2,%xmm2
+ vpshufd $252,%xmm3,%xmm3
+ vpshufd $252,%xmm4,%xmm4
+ jmp .L024even
+.align 16
+.L029done:
+ vmovd %xmm0,-48(%edi)
+ vmovd %xmm1,-44(%edi)
+ vmovd %xmm2,-40(%edi)
+ vmovd %xmm3,-36(%edi)
+ vmovd %xmm4,-32(%edi)
+ vzeroupper
+ movl %ebp,%esp
+.L020nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
.align 64
.Lconst_sse2:
.long 16777216,0,16777216,0,16777216,0,16777216,0
@@ -1392,6 +1947,10 @@ poly1305_init:
jne .L002no_sse2
leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
+ movl 8(%edi),%ecx
+ testl $32,%ecx
+ jz .L002no_sse2
+ leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
movl 20(%esp),%edi
movl %eax,(%ebp)
@@ -2700,6 +3259,557 @@ _poly1305_emit_sse2:
popl %ebp
ret
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
+.align 32
+.type _poly1305_init_avx2,@function
+.align 16
+_poly1305_init_avx2:
+ vmovdqu 24(%edi),%xmm4
+ leal 48(%edi),%edi
+ movl %esp,%ebp
+ subl $224,%esp
+ andl $-16,%esp
+ vmovdqa 64(%ebx),%xmm7
+ vpand %xmm7,%xmm4,%xmm0
+ vpsrlq $26,%xmm4,%xmm1
+ vpsrldq $6,%xmm4,%xmm3
+ vpand %xmm7,%xmm1,%xmm1
+ vpsrlq $4,%xmm3,%xmm2
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm7,%xmm2,%xmm2
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrldq $13,%xmm4,%xmm4
+ leal 144(%esp),%edx
+ movl $2,%ecx
+.L018square:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ vmovdqa %xmm4,64(%esp)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqa %xmm6,80(%esp)
+ vmovdqa %xmm5,96(%esp)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqa %xmm6,112(%esp)
+ vmovdqa %xmm5,128(%esp)
+ vpshufd $68,%xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vpshufd $68,%xmm1,%xmm1
+ vpshufd $68,%xmm2,%xmm2
+ vpshufd $68,%xmm3,%xmm3
+ vpshufd $68,%xmm4,%xmm4
+ vmovdqa %xmm5,(%edx)
+ vmovdqa %xmm1,16(%edx)
+ vmovdqa %xmm2,32(%edx)
+ vmovdqa %xmm3,48(%edx)
+ vmovdqa %xmm4,64(%edx)
+ vpmuludq %xmm0,%xmm4,%xmm4
+ vpmuludq %xmm0,%xmm3,%xmm3
+ vpmuludq %xmm0,%xmm2,%xmm2
+ vpmuludq %xmm0,%xmm1,%xmm1
+ vpmuludq %xmm0,%xmm5,%xmm0
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpmuludq 32(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vmovdqa 80(%esp),%xmm7
+ vpmuludq (%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 32(%esp),%xmm5
+ vpmuludq 64(%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm4,%xmm4
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vmovdqa 96(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm1,%xmm1
+ vmovdqa 48(%esp),%xmm5
+ vpmuludq 48(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vmovdqa 112(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm3,%xmm3
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm2,%xmm2
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm1,%xmm1
+ vmovdqa 64(%esp),%xmm7
+ vpmuludq 32(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vmovdqa 128(%esp),%xmm5
+ vpmuludq (%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vpmuludq 64(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm7
+ vpmuludq 48(%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpsrlq $26,%xmm3,%xmm5
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrlq $26,%xmm0,%xmm6
+ vpand %xmm7,%xmm0,%xmm0
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm6,%xmm1,%xmm1
+ vpsrlq $26,%xmm4,%xmm5
+ vpand %xmm7,%xmm4,%xmm4
+ vpsrlq $26,%xmm1,%xmm6
+ vpand %xmm7,%xmm1,%xmm1
+ vpaddq %xmm6,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpsllq $2,%xmm5,%xmm5
+ vpsrlq $26,%xmm2,%xmm6
+ vpand %xmm7,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpaddd %xmm6,%xmm3,%xmm3
+ vpsrlq $26,%xmm3,%xmm6
+ vpsrlq $26,%xmm0,%xmm5
+ vpand %xmm7,%xmm0,%xmm0
+ vpand %xmm7,%xmm3,%xmm3
+ vpaddd %xmm5,%xmm1,%xmm1
+ vpaddd %xmm6,%xmm4,%xmm4
+ decl %ecx
+ jz .L019square_break
+ vpunpcklqdq (%esp),%xmm0,%xmm0
+ vpunpcklqdq 16(%esp),%xmm1,%xmm1
+ vpunpcklqdq 32(%esp),%xmm2,%xmm2
+ vpunpcklqdq 48(%esp),%xmm3,%xmm3
+ vpunpcklqdq 64(%esp),%xmm4,%xmm4
+ jmp .L018square
+.L019square_break:
+ vpsllq $32,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm1
+ vpsllq $32,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm3
+ vpsllq $32,%xmm4,%xmm4
+ vpor (%esp),%xmm0,%xmm0
+ vpor 16(%esp),%xmm1,%xmm1
+ vpor 32(%esp),%xmm2,%xmm2
+ vpor 48(%esp),%xmm3,%xmm3
+ vpor 64(%esp),%xmm4,%xmm4
+ vpshufd $141,%xmm0,%xmm0
+ vpshufd $141,%xmm1,%xmm1
+ vpshufd $141,%xmm2,%xmm2
+ vpshufd $141,%xmm3,%xmm3
+ vpshufd $141,%xmm4,%xmm4
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ vmovdqu %xmm4,64(%edi)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqu %xmm6,80(%edi)
+ vmovdqu %xmm5,96(%edi)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm6,112(%edi)
+ vmovdqu %xmm5,128(%edi)
+ movl %ebp,%esp
+ leal -48(%edi),%edi
+ ret
+.size _poly1305_init_avx2,.-_poly1305_init_avx2
+.align 32
+.type _poly1305_blocks_avx2,@function
+.align 16
+_poly1305_blocks_avx2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 20(%edi),%eax
+ andl $-16,%ecx
+ jz .L020nodata
+ cmpl $64,%ecx
+ jae .L021enter_avx2
+ testl %eax,%eax
+ jz .Lenter_blocks
+.L021enter_avx2:
+ vzeroupper
+ call .L022pic_point
+.L022pic_point:
+ popl %ebx
+ leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
+ testl %eax,%eax
+ jnz .L023base2_26
+ call _poly1305_init_avx2
+ movl (%edi),%eax
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ shrl $2,%ecx
+ andl $67108863,%eax
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movl %eax,(%edi)
+ movl %ecx,4(%edi)
+ movl %edx,8(%edi)
+ movl %esi,12(%edi)
+ movl %ebp,16(%edi)
+ movl $1,20(%edi)
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+.L023base2_26:
+ movl 32(%esp),%eax
+ movl %esp,%ebp
+ subl $448,%esp
+ andl $-512,%esp
+ vmovdqu 48(%edi),%xmm0
+ leal 288(%esp),%edx
+ vmovdqu 64(%edi),%xmm1
+ vmovdqu 80(%edi),%xmm2
+ vmovdqu 96(%edi),%xmm3
+ vmovdqu 112(%edi),%xmm4
+ leal 48(%edi),%edi
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpermq $64,%ymm4,%ymm4
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vpshufd $200,%ymm4,%ymm4
+ vmovdqa %ymm0,-128(%edx)
+ vmovdqu 80(%edi),%xmm0
+ vmovdqa %ymm1,-96(%edx)
+ vmovdqu 96(%edi),%xmm1
+ vmovdqa %ymm2,-64(%edx)
+ vmovdqu 112(%edi),%xmm2
+ vmovdqa %ymm3,-32(%edx)
+ vmovdqu 128(%edi),%xmm3
+ vmovdqa %ymm4,(%edx)
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vmovdqa %ymm0,32(%edx)
+ vmovd -48(%edi),%xmm0
+ vmovdqa %ymm1,64(%edx)
+ vmovd -44(%edi),%xmm1
+ vmovdqa %ymm2,96(%edx)
+ vmovd -40(%edi),%xmm2
+ vmovdqa %ymm3,128(%edx)
+ vmovd -36(%edi),%xmm3
+ vmovd -32(%edi),%xmm4
+ vmovdqa 64(%ebx),%ymm7
+ negl %eax
+ testl $63,%ecx
+ jz .L024even
+ movl %ecx,%edx
+ andl $-64,%ecx
+ andl $63,%edx
+ vmovdqu (%esi),%xmm5
+ cmpl $32,%edx
+ jb .L025one
+ vmovdqu 16(%esi),%xmm6
+ je .L026two
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ leal 48(%esi),%esi
+ leal 8(%ebx),%ebx
+ leal 296(%esp),%edx
+ jmp .L027tail
+.L026two:
+ leal 32(%esi),%esi
+ leal 16(%ebx),%ebx
+ leal 304(%esp),%edx
+ jmp .L027tail
+.L025one:
+ leal 16(%esi),%esi
+ vpxor %ymm6,%ymm6,%ymm6
+ leal 32(%ebx,%eax,8),%ebx
+ leal 312(%esp),%edx
+ jmp .L027tail
+.align 32
+.L024even:
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jz .L027tail
+.L028loop:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -64(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 96(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 128(%edx),%ymm2,%ymm1
+ vpmuludq -128(%edx),%ymm2,%ymm2
+ vpmuludq -32(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq (%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -96(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -64(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -64(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -32(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 128(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -128(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -128(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 64(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 128(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 32(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 64(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 96(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jnz .L028loop
+.L027tail:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ andl $-64,%ebx
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -60(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 100(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 132(%edx),%ymm2,%ymm1
+ vpmuludq -124(%edx),%ymm2,%ymm2
+ vpmuludq -28(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 4(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -92(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -60(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -60(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -28(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 132(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -124(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -124(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -92(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 68(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 100(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 132(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 132(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 36(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 68(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 100(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrldq $8,%ymm4,%ymm5
+ vpsrldq $8,%ymm3,%ymm6
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpsrldq $8,%ymm0,%ymm5
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrldq $8,%ymm1,%ymm6
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsrldq $8,%ymm2,%ymm5
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpermq $2,%ymm4,%ymm6
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpermq $2,%ymm3,%ymm5
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpermq $2,%ymm0,%ymm6
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpermq $2,%ymm1,%ymm5
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpermq $2,%ymm2,%ymm6
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ cmpl $0,%ecx
+ je .L029done
+ vpshufd $252,%xmm0,%xmm0
+ leal 288(%esp),%edx
+ vpshufd $252,%xmm1,%xmm1
+ vpshufd $252,%xmm2,%xmm2
+ vpshufd $252,%xmm3,%xmm3
+ vpshufd $252,%xmm4,%xmm4
+ jmp .L024even
+.align 16
+.L029done:
+ vmovd %xmm0,-48(%edi)
+ vmovd %xmm1,-44(%edi)
+ vmovd %xmm2,-40(%edi)
+ vmovd %xmm3,-36(%edi)
+ vmovd %xmm4,-32(%edi)
+ vzeroupper
+ movl %ebp,%esp
+.L020nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
.align 64
.Lconst_sse2:
.long 16777216,0,16777216,0,16777216,0,16777216,0