diff options
Diffstat (limited to 'secure/lib/libcrypto/i386/poly1305-x86.S')
-rw-r--r-- | secure/lib/libcrypto/i386/poly1305-x86.S | 1110 |
1 files changed, 1110 insertions, 0 deletions
diff --git a/secure/lib/libcrypto/i386/poly1305-x86.S b/secure/lib/libcrypto/i386/poly1305-x86.S index b394500278d5..100deee40bf2 100644 --- a/secure/lib/libcrypto/i386/poly1305-x86.S +++ b/secure/lib/libcrypto/i386/poly1305-x86.S @@ -36,6 +36,10 @@ poly1305_init: jne .L002no_sse2 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx + movl 8(%edi),%ecx + testl $32,%ecx + jz .L002no_sse2 + leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax .L002no_sse2: movl 20(%esp),%edi movl %eax,(%ebp) @@ -1344,6 +1348,557 @@ _poly1305_emit_sse2: popl %ebp ret .size _poly1305_emit_sse2,.-_poly1305_emit_sse2 +.align 32 +.type _poly1305_init_avx2,@function +.align 16 +_poly1305_init_avx2: + vmovdqu 24(%edi),%xmm4 + leal 48(%edi),%edi + movl %esp,%ebp + subl $224,%esp + andl $-16,%esp + vmovdqa 64(%ebx),%xmm7 + vpand %xmm7,%xmm4,%xmm0 + vpsrlq $26,%xmm4,%xmm1 + vpsrldq $6,%xmm4,%xmm3 + vpand %xmm7,%xmm1,%xmm1 + vpsrlq $4,%xmm3,%xmm2 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm7,%xmm2,%xmm2 + vpand %xmm7,%xmm3,%xmm3 + vpsrldq $13,%xmm4,%xmm4 + leal 144(%esp),%edx + movl $2,%ecx +.L018square: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + vmovdqa %xmm4,64(%esp) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqa %xmm6,80(%esp) + vmovdqa %xmm5,96(%esp) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqa %xmm6,112(%esp) + vmovdqa %xmm5,128(%esp) + vpshufd $68,%xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vpshufd $68,%xmm1,%xmm1 + vpshufd $68,%xmm2,%xmm2 + vpshufd $68,%xmm3,%xmm3 + vpshufd $68,%xmm4,%xmm4 + vmovdqa %xmm5,(%edx) + vmovdqa %xmm1,16(%edx) + vmovdqa %xmm2,32(%edx) + vmovdqa %xmm3,48(%edx) + vmovdqa %xmm4,64(%edx) + vpmuludq %xmm0,%xmm4,%xmm4 + vpmuludq %xmm0,%xmm3,%xmm3 + vpmuludq %xmm0,%xmm2,%xmm2 + vpmuludq %xmm0,%xmm1,%xmm1 + vpmuludq %xmm0,%xmm5,%xmm0 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm4,%xmm4 + vpmuludq 32(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vmovdqa 80(%esp),%xmm7 + vpmuludq (%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 32(%esp),%xmm5 + vpmuludq 64(%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm4,%xmm4 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vmovdqa 96(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm1,%xmm1 + vmovdqa 48(%esp),%xmm5 + vpmuludq 48(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vmovdqa 112(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm3,%xmm3 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm2,%xmm2 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm1,%xmm1 + vmovdqa 64(%esp),%xmm7 + vpmuludq 32(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vmovdqa 128(%esp),%xmm5 + vpmuludq (%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vpmuludq 64(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm7 + vpmuludq 48(%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpsrlq $26,%xmm3,%xmm5 + vpand %xmm7,%xmm3,%xmm3 + vpsrlq $26,%xmm0,%xmm6 + vpand %xmm7,%xmm0,%xmm0 + vpaddq %xmm5,%xmm4,%xmm4 + vpaddq %xmm6,%xmm1,%xmm1 + vpsrlq $26,%xmm4,%xmm5 + vpand %xmm7,%xmm4,%xmm4 + vpsrlq $26,%xmm1,%xmm6 + vpand %xmm7,%xmm1,%xmm1 + vpaddq %xmm6,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpsllq $2,%xmm5,%xmm5 + vpsrlq $26,%xmm2,%xmm6 + vpand %xmm7,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpaddd %xmm6,%xmm3,%xmm3 + vpsrlq $26,%xmm3,%xmm6 + vpsrlq $26,%xmm0,%xmm5 + vpand %xmm7,%xmm0,%xmm0 + vpand %xmm7,%xmm3,%xmm3 + vpaddd %xmm5,%xmm1,%xmm1 + vpaddd %xmm6,%xmm4,%xmm4 + decl %ecx + jz .L019square_break + vpunpcklqdq (%esp),%xmm0,%xmm0 + vpunpcklqdq 16(%esp),%xmm1,%xmm1 + vpunpcklqdq 32(%esp),%xmm2,%xmm2 + vpunpcklqdq 48(%esp),%xmm3,%xmm3 + vpunpcklqdq 64(%esp),%xmm4,%xmm4 + jmp .L018square +.L019square_break: + vpsllq $32,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm1 + vpsllq $32,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm3 + vpsllq $32,%xmm4,%xmm4 + vpor (%esp),%xmm0,%xmm0 + vpor 16(%esp),%xmm1,%xmm1 + vpor 32(%esp),%xmm2,%xmm2 + vpor 48(%esp),%xmm3,%xmm3 + vpor 64(%esp),%xmm4,%xmm4 + vpshufd $141,%xmm0,%xmm0 + vpshufd $141,%xmm1,%xmm1 + vpshufd $141,%xmm2,%xmm2 + vpshufd $141,%xmm3,%xmm3 + vpshufd $141,%xmm4,%xmm4 + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + vmovdqu %xmm4,64(%edi) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqu %xmm6,80(%edi) + vmovdqu %xmm5,96(%edi) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqu %xmm6,112(%edi) + vmovdqu %xmm5,128(%edi) + movl %ebp,%esp + leal -48(%edi),%edi + ret +.size _poly1305_init_avx2,.-_poly1305_init_avx2 +.align 32 +.type _poly1305_blocks_avx2,@function +.align 16 +_poly1305_blocks_avx2: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 20(%edi),%eax + andl $-16,%ecx + jz .L020nodata + cmpl $64,%ecx + jae .L021enter_avx2 + testl %eax,%eax + jz .Lenter_blocks +.L021enter_avx2: + vzeroupper + call .L022pic_point +.L022pic_point: + popl %ebx + leal .Lconst_sse2-.L022pic_point(%ebx),%ebx + testl %eax,%eax + jnz .L023base2_26 + call _poly1305_init_avx2 + movl (%edi),%eax + movl 3(%edi),%ecx + movl 6(%edi),%edx + movl 9(%edi),%esi + movl 13(%edi),%ebp + shrl $2,%ecx + andl $67108863,%eax + shrl $4,%edx + andl $67108863,%ecx + shrl $6,%esi + andl $67108863,%edx + movl %eax,(%edi) + movl %ecx,4(%edi) + movl %edx,8(%edi) + movl %esi,12(%edi) + movl %ebp,16(%edi) + movl $1,20(%edi) + movl 24(%esp),%esi + movl 28(%esp),%ecx +.L023base2_26: + movl 32(%esp),%eax + movl %esp,%ebp + subl $448,%esp + andl $-512,%esp + vmovdqu 48(%edi),%xmm0 + leal 288(%esp),%edx + vmovdqu 64(%edi),%xmm1 + vmovdqu 80(%edi),%xmm2 + vmovdqu 96(%edi),%xmm3 + vmovdqu 112(%edi),%xmm4 + leal 48(%edi),%edi + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpermq $64,%ymm4,%ymm4 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vpshufd $200,%ymm4,%ymm4 + vmovdqa %ymm0,-128(%edx) + vmovdqu 80(%edi),%xmm0 + vmovdqa %ymm1,-96(%edx) + vmovdqu 96(%edi),%xmm1 + vmovdqa %ymm2,-64(%edx) + vmovdqu 112(%edi),%xmm2 + vmovdqa %ymm3,-32(%edx) + vmovdqu 128(%edi),%xmm3 + vmovdqa %ymm4,(%edx) + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vmovdqa %ymm0,32(%edx) + vmovd -48(%edi),%xmm0 + vmovdqa %ymm1,64(%edx) + vmovd -44(%edi),%xmm1 + vmovdqa %ymm2,96(%edx) + vmovd -40(%edi),%xmm2 + vmovdqa %ymm3,128(%edx) + vmovd -36(%edi),%xmm3 + vmovd -32(%edi),%xmm4 + vmovdqa 64(%ebx),%ymm7 + negl %eax + testl $63,%ecx + jz .L024even + movl %ecx,%edx + andl $-64,%ecx + andl $63,%edx + vmovdqu (%esi),%xmm5 + cmpl $32,%edx + jb .L025one + vmovdqu 16(%esi),%xmm6 + je .L026two + vinserti128 $1,32(%esi),%ymm5,%ymm5 + leal 48(%esi),%esi + leal 8(%ebx),%ebx + leal 296(%esp),%edx + jmp .L027tail +.L026two: + leal 32(%esi),%esi + leal 16(%ebx),%ebx + leal 304(%esp),%edx + jmp .L027tail +.L025one: + leal 16(%esi),%esi + vpxor %ymm6,%ymm6,%ymm6 + leal 32(%ebx,%eax,8),%ebx + leal 312(%esp),%edx + jmp .L027tail +.align 32 +.L024even: + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jz .L027tail +.L028loop: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -64(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 96(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 128(%edx),%ymm2,%ymm1 + vpmuludq -128(%edx),%ymm2,%ymm2 + vpmuludq -32(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq (%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -96(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -64(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -64(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -32(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 128(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -128(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -128(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 64(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 128(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 32(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 64(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 96(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jnz .L028loop +.L027tail: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + andl $-64,%ebx + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -60(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 100(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 132(%edx),%ymm2,%ymm1 + vpmuludq -124(%edx),%ymm2,%ymm2 + vpmuludq -28(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 4(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -92(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -60(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -60(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -28(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 132(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -124(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -124(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -92(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 68(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 100(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 132(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 132(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 36(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 68(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 100(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrldq $8,%ymm4,%ymm5 + vpsrldq $8,%ymm3,%ymm6 + vpaddq %ymm5,%ymm4,%ymm4 + vpsrldq $8,%ymm0,%ymm5 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrldq $8,%ymm1,%ymm6 + vpaddq %ymm5,%ymm0,%ymm0 + vpsrldq $8,%ymm2,%ymm5 + vpaddq %ymm6,%ymm1,%ymm1 + vpermq $2,%ymm4,%ymm6 + vpaddq %ymm5,%ymm2,%ymm2 + vpermq $2,%ymm3,%ymm5 + vpaddq %ymm6,%ymm4,%ymm4 + vpermq $2,%ymm0,%ymm6 + vpaddq %ymm5,%ymm3,%ymm3 + vpermq $2,%ymm1,%ymm5 + vpaddq %ymm6,%ymm0,%ymm0 + vpermq $2,%ymm2,%ymm6 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + cmpl $0,%ecx + je .L029done + vpshufd $252,%xmm0,%xmm0 + leal 288(%esp),%edx + vpshufd $252,%xmm1,%xmm1 + vpshufd $252,%xmm2,%xmm2 + vpshufd $252,%xmm3,%xmm3 + vpshufd $252,%xmm4,%xmm4 + jmp .L024even +.align 16 +.L029done: + vmovd %xmm0,-48(%edi) + vmovd %xmm1,-44(%edi) + vmovd %xmm2,-40(%edi) + vmovd %xmm3,-36(%edi) + vmovd %xmm4,-32(%edi) + vzeroupper + movl %ebp,%esp +.L020nodata: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 .align 64 .Lconst_sse2: .long 16777216,0,16777216,0,16777216,0,16777216,0 @@ -1392,6 +1947,10 @@ poly1305_init: jne .L002no_sse2 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx + movl 8(%edi),%ecx + testl $32,%ecx + jz .L002no_sse2 + leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax .L002no_sse2: movl 20(%esp),%edi movl %eax,(%ebp) @@ -2700,6 +3259,557 @@ _poly1305_emit_sse2: popl %ebp ret .size _poly1305_emit_sse2,.-_poly1305_emit_sse2 +.align 32 +.type _poly1305_init_avx2,@function +.align 16 +_poly1305_init_avx2: + vmovdqu 24(%edi),%xmm4 + leal 48(%edi),%edi + movl %esp,%ebp + subl $224,%esp + andl $-16,%esp + vmovdqa 64(%ebx),%xmm7 + vpand %xmm7,%xmm4,%xmm0 + vpsrlq $26,%xmm4,%xmm1 + vpsrldq $6,%xmm4,%xmm3 + vpand %xmm7,%xmm1,%xmm1 + vpsrlq $4,%xmm3,%xmm2 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm7,%xmm2,%xmm2 + vpand %xmm7,%xmm3,%xmm3 + vpsrldq $13,%xmm4,%xmm4 + leal 144(%esp),%edx + movl $2,%ecx +.L018square: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + vmovdqa %xmm4,64(%esp) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqa %xmm6,80(%esp) + vmovdqa %xmm5,96(%esp) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqa %xmm6,112(%esp) + vmovdqa %xmm5,128(%esp) + vpshufd $68,%xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vpshufd $68,%xmm1,%xmm1 + vpshufd $68,%xmm2,%xmm2 + vpshufd $68,%xmm3,%xmm3 + vpshufd $68,%xmm4,%xmm4 + vmovdqa %xmm5,(%edx) + vmovdqa %xmm1,16(%edx) + vmovdqa %xmm2,32(%edx) + vmovdqa %xmm3,48(%edx) + vmovdqa %xmm4,64(%edx) + vpmuludq %xmm0,%xmm4,%xmm4 + vpmuludq %xmm0,%xmm3,%xmm3 + vpmuludq %xmm0,%xmm2,%xmm2 + vpmuludq %xmm0,%xmm1,%xmm1 + vpmuludq %xmm0,%xmm5,%xmm0 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm4,%xmm4 + vpmuludq 32(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vmovdqa 80(%esp),%xmm7 + vpmuludq (%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 32(%esp),%xmm5 + vpmuludq 64(%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm4,%xmm4 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vmovdqa 96(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm1,%xmm1 + vmovdqa 48(%esp),%xmm5 + vpmuludq 48(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vmovdqa 112(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm3,%xmm3 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm2,%xmm2 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm1,%xmm1 + vmovdqa 64(%esp),%xmm7 + vpmuludq 32(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vmovdqa 128(%esp),%xmm5 + vpmuludq (%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vpmuludq 64(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm7 + vpmuludq 48(%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpsrlq $26,%xmm3,%xmm5 + vpand %xmm7,%xmm3,%xmm3 + vpsrlq $26,%xmm0,%xmm6 + vpand %xmm7,%xmm0,%xmm0 + vpaddq %xmm5,%xmm4,%xmm4 + vpaddq %xmm6,%xmm1,%xmm1 + vpsrlq $26,%xmm4,%xmm5 + vpand %xmm7,%xmm4,%xmm4 + vpsrlq $26,%xmm1,%xmm6 + vpand %xmm7,%xmm1,%xmm1 + vpaddq %xmm6,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpsllq $2,%xmm5,%xmm5 + vpsrlq $26,%xmm2,%xmm6 + vpand %xmm7,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpaddd %xmm6,%xmm3,%xmm3 + vpsrlq $26,%xmm3,%xmm6 + vpsrlq $26,%xmm0,%xmm5 + vpand %xmm7,%xmm0,%xmm0 + vpand %xmm7,%xmm3,%xmm3 + vpaddd %xmm5,%xmm1,%xmm1 + vpaddd %xmm6,%xmm4,%xmm4 + decl %ecx + jz .L019square_break + vpunpcklqdq (%esp),%xmm0,%xmm0 + vpunpcklqdq 16(%esp),%xmm1,%xmm1 + vpunpcklqdq 32(%esp),%xmm2,%xmm2 + vpunpcklqdq 48(%esp),%xmm3,%xmm3 + vpunpcklqdq 64(%esp),%xmm4,%xmm4 + jmp .L018square +.L019square_break: + vpsllq $32,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm1 + vpsllq $32,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm3 + vpsllq $32,%xmm4,%xmm4 + vpor (%esp),%xmm0,%xmm0 + vpor 16(%esp),%xmm1,%xmm1 + vpor 32(%esp),%xmm2,%xmm2 + vpor 48(%esp),%xmm3,%xmm3 + vpor 64(%esp),%xmm4,%xmm4 + vpshufd $141,%xmm0,%xmm0 + vpshufd $141,%xmm1,%xmm1 + vpshufd $141,%xmm2,%xmm2 + vpshufd $141,%xmm3,%xmm3 + vpshufd $141,%xmm4,%xmm4 + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + vmovdqu %xmm4,64(%edi) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqu %xmm6,80(%edi) + vmovdqu %xmm5,96(%edi) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqu %xmm6,112(%edi) + vmovdqu %xmm5,128(%edi) + movl %ebp,%esp + leal -48(%edi),%edi + ret +.size _poly1305_init_avx2,.-_poly1305_init_avx2 +.align 32 +.type _poly1305_blocks_avx2,@function +.align 16 +_poly1305_blocks_avx2: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 20(%edi),%eax + andl $-16,%ecx + jz .L020nodata + cmpl $64,%ecx + jae .L021enter_avx2 + testl %eax,%eax + jz .Lenter_blocks +.L021enter_avx2: + vzeroupper + call .L022pic_point +.L022pic_point: + popl %ebx + leal .Lconst_sse2-.L022pic_point(%ebx),%ebx + testl %eax,%eax + jnz .L023base2_26 + call _poly1305_init_avx2 + movl (%edi),%eax + movl 3(%edi),%ecx + movl 6(%edi),%edx + movl 9(%edi),%esi + movl 13(%edi),%ebp + shrl $2,%ecx + andl $67108863,%eax + shrl $4,%edx + andl $67108863,%ecx + shrl $6,%esi + andl $67108863,%edx + movl %eax,(%edi) + movl %ecx,4(%edi) + movl %edx,8(%edi) + movl %esi,12(%edi) + movl %ebp,16(%edi) + movl $1,20(%edi) + movl 24(%esp),%esi + movl 28(%esp),%ecx +.L023base2_26: + movl 32(%esp),%eax + movl %esp,%ebp + subl $448,%esp + andl $-512,%esp + vmovdqu 48(%edi),%xmm0 + leal 288(%esp),%edx + vmovdqu 64(%edi),%xmm1 + vmovdqu 80(%edi),%xmm2 + vmovdqu 96(%edi),%xmm3 + vmovdqu 112(%edi),%xmm4 + leal 48(%edi),%edi + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpermq $64,%ymm4,%ymm4 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vpshufd $200,%ymm4,%ymm4 + vmovdqa %ymm0,-128(%edx) + vmovdqu 80(%edi),%xmm0 + vmovdqa %ymm1,-96(%edx) + vmovdqu 96(%edi),%xmm1 + vmovdqa %ymm2,-64(%edx) + vmovdqu 112(%edi),%xmm2 + vmovdqa %ymm3,-32(%edx) + vmovdqu 128(%edi),%xmm3 + vmovdqa %ymm4,(%edx) + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vmovdqa %ymm0,32(%edx) + vmovd -48(%edi),%xmm0 + vmovdqa %ymm1,64(%edx) + vmovd -44(%edi),%xmm1 + vmovdqa %ymm2,96(%edx) + vmovd -40(%edi),%xmm2 + vmovdqa %ymm3,128(%edx) + vmovd -36(%edi),%xmm3 + vmovd -32(%edi),%xmm4 + vmovdqa 64(%ebx),%ymm7 + negl %eax + testl $63,%ecx + jz .L024even + movl %ecx,%edx + andl $-64,%ecx + andl $63,%edx + vmovdqu (%esi),%xmm5 + cmpl $32,%edx + jb .L025one + vmovdqu 16(%esi),%xmm6 + je .L026two + vinserti128 $1,32(%esi),%ymm5,%ymm5 + leal 48(%esi),%esi + leal 8(%ebx),%ebx + leal 296(%esp),%edx + jmp .L027tail +.L026two: + leal 32(%esi),%esi + leal 16(%ebx),%ebx + leal 304(%esp),%edx + jmp .L027tail +.L025one: + leal 16(%esi),%esi + vpxor %ymm6,%ymm6,%ymm6 + leal 32(%ebx,%eax,8),%ebx + leal 312(%esp),%edx + jmp .L027tail +.align 32 +.L024even: + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jz .L027tail +.L028loop: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -64(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 96(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 128(%edx),%ymm2,%ymm1 + vpmuludq -128(%edx),%ymm2,%ymm2 + vpmuludq -32(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq (%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -96(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -64(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -64(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -32(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 128(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -128(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -128(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 64(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 128(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 32(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 64(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 96(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jnz .L028loop +.L027tail: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + andl $-64,%ebx + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -60(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 100(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 132(%edx),%ymm2,%ymm1 + vpmuludq -124(%edx),%ymm2,%ymm2 + vpmuludq -28(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 4(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -92(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -60(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -60(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -28(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 132(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -124(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -124(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -92(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 68(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 100(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 132(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 132(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 36(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 68(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 100(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrldq $8,%ymm4,%ymm5 + vpsrldq $8,%ymm3,%ymm6 + vpaddq %ymm5,%ymm4,%ymm4 + vpsrldq $8,%ymm0,%ymm5 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrldq $8,%ymm1,%ymm6 + vpaddq %ymm5,%ymm0,%ymm0 + vpsrldq $8,%ymm2,%ymm5 + vpaddq %ymm6,%ymm1,%ymm1 + vpermq $2,%ymm4,%ymm6 + vpaddq %ymm5,%ymm2,%ymm2 + vpermq $2,%ymm3,%ymm5 + vpaddq %ymm6,%ymm4,%ymm4 + vpermq $2,%ymm0,%ymm6 + vpaddq %ymm5,%ymm3,%ymm3 + vpermq $2,%ymm1,%ymm5 + vpaddq %ymm6,%ymm0,%ymm0 + vpermq $2,%ymm2,%ymm6 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + cmpl $0,%ecx + je .L029done + vpshufd $252,%xmm0,%xmm0 + leal 288(%esp),%edx + vpshufd $252,%xmm1,%xmm1 + vpshufd $252,%xmm2,%xmm2 + vpshufd $252,%xmm3,%xmm3 + vpshufd $252,%xmm4,%xmm4 + jmp .L024even +.align 16 +.L029done: + vmovd %xmm0,-48(%edi) + vmovd %xmm1,-44(%edi) + vmovd %xmm2,-40(%edi) + vmovd %xmm3,-36(%edi) + vmovd %xmm4,-32(%edi) + vzeroupper + movl %ebp,%esp +.L020nodata: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 .align 64 .Lconst_sse2: .long 16777216,0,16777216,0,16777216,0,16777216,0 |