diff options
Diffstat (limited to 'secure/lib/libcrypto/i386/chacha-x86.S')
-rw-r--r-- | secure/lib/libcrypto/i386/chacha-x86.S | 960 |
1 files changed, 960 insertions, 0 deletions
diff --git a/secure/lib/libcrypto/i386/chacha-x86.S b/secure/lib/libcrypto/i386/chacha-x86.S index 566285310e06..d6b2936a5381 100644 --- a/secure/lib/libcrypto/i386/chacha-x86.S +++ b/secure/lib/libcrypto/i386/chacha-x86.S @@ -385,6 +385,8 @@ ChaCha20_ssse3: pushl %esi pushl %edi .Lssse3_shortcut: + testl $2048,4(%ebp) + jnz .Lxop_shortcut movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx @@ -528,6 +530,484 @@ ChaCha20_ssse3: .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 +.globl ChaCha20_xop +.type ChaCha20_xop,@function +.align 16 +ChaCha20_xop: +.L_ChaCha20_xop_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +.Lxop_shortcut: + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + vzeroupper + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + vmovdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0141x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + vmovdqu (%edx),%xmm7 + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpaddd 48(%eax),%xmm0,%xmm0 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpsubd 64(%eax),%xmm0,%xmm0 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,64(%ebp) + vmovdqa %xmm1,80(%ebp) + vmovdqa %xmm2,96(%ebp) + vmovdqa %xmm3,112(%ebp) + vmovdqu 16(%edx),%xmm3 + vmovdqa %xmm4,-64(%ebp) + vmovdqa %xmm5,-48(%ebp) + vmovdqa %xmm6,-32(%ebp) + vmovdqa %xmm7,-16(%ebp) + vmovdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,(%ebp) + vmovdqa %xmm1,16(%ebp) + vmovdqa %xmm2,32(%ebp) + vmovdqa %xmm3,48(%ebp) + vmovdqa %xmm4,-128(%ebp) + vmovdqa %xmm5,-112(%ebp) + vmovdqa %xmm6,-96(%ebp) + vmovdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L015outer_loop +.align 32 +.L015outer_loop: + vmovdqa -112(%ebp),%xmm1 + vmovdqa -96(%ebp),%xmm2 + vmovdqa -80(%ebp),%xmm3 + vmovdqa -48(%ebp),%xmm5 + vmovdqa -32(%ebp),%xmm6 + vmovdqa -16(%ebp),%xmm7 + vmovdqa %xmm1,-112(%ebx) + vmovdqa %xmm2,-96(%ebx) + vmovdqa %xmm3,-80(%ebx) + vmovdqa %xmm5,-48(%ebx) + vmovdqa %xmm6,-32(%ebx) + vmovdqa %xmm7,-16(%ebx) + vmovdqa 32(%ebp),%xmm2 + vmovdqa 48(%ebp),%xmm3 + vmovdqa 64(%ebp),%xmm4 + vmovdqa 80(%ebp),%xmm5 + vmovdqa 96(%ebp),%xmm6 + vmovdqa 112(%ebp),%xmm7 + vpaddd 64(%eax),%xmm4,%xmm4 + vmovdqa %xmm2,32(%ebx) + vmovdqa %xmm3,48(%ebx) + vmovdqa %xmm4,64(%ebx) + vmovdqa %xmm5,80(%ebx) + vmovdqa %xmm6,96(%ebx) + vmovdqa %xmm7,112(%ebx) + vmovdqa %xmm4,64(%ebp) + vmovdqa -128(%ebp),%xmm0 + vmovdqa %xmm4,%xmm6 + vmovdqa -64(%ebp),%xmm3 + vmovdqa (%ebp),%xmm4 + vmovdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 32 +.L016loop: + vpaddd %xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,246,16 + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -48(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 80(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,64(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-64(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa 32(%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -32(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 96(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,80(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,16(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-48(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 48(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -16(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 112(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,96(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-32(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -48(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm7,%xmm6 +.byte 143,232,120,194,219,7 +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-16(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -32(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 64(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,112(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,32(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-48(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa (%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -16(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 80(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,64(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,48(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-32(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 16(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -64(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 96(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,80(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-16(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,96(%ebx) + vpxor %xmm5,%xmm3,%xmm3 +.byte 143,232,120,194,219,7 + decl %edx + jnz .L016loop + vmovdqa %xmm3,-64(%ebx) + vmovdqa %xmm4,(%ebx) + vmovdqa %xmm5,16(%ebx) + vmovdqa %xmm6,64(%ebx) + vmovdqa %xmm7,96(%ebx) + vmovdqa -112(%ebx),%xmm1 + vmovdqa -96(%ebx),%xmm2 + vmovdqa -80(%ebx),%xmm3 + vpaddd -128(%ebp),%xmm0,%xmm0 + vpaddd -112(%ebp),%xmm1,%xmm1 + vpaddd -96(%ebp),%xmm2,%xmm2 + vpaddd -80(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa -64(%ebx),%xmm0 + vmovdqa -48(%ebx),%xmm1 + vmovdqa -32(%ebx),%xmm2 + vmovdqa -16(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd -64(%ebp),%xmm0,%xmm0 + vpaddd -48(%ebp),%xmm1,%xmm1 + vpaddd -32(%ebp),%xmm2,%xmm2 + vpaddd -16(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa (%ebx),%xmm0 + vmovdqa 16(%ebx),%xmm1 + vmovdqa 32(%ebx),%xmm2 + vmovdqa 48(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd (%ebp),%xmm0,%xmm0 + vpaddd 16(%ebp),%xmm1,%xmm1 + vpaddd 32(%ebp),%xmm2,%xmm2 + vpaddd 48(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa 64(%ebx),%xmm0 + vmovdqa 80(%ebx),%xmm1 + vmovdqa 96(%ebx),%xmm2 + vmovdqa 112(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd 64(%ebp),%xmm0,%xmm0 + vpaddd 80(%ebp),%xmm1,%xmm1 + vpaddd 96(%ebp),%xmm2,%xmm2 + vpaddd 112(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 208(%esi),%esi + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L015outer_loop + addl $256,%ecx + jz .L017done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + vmovd 64(%ebp),%xmm2 + vmovdqu (%ebx),%xmm3 + vpaddd 96(%eax),%xmm2,%xmm2 + vpand 112(%eax),%xmm3,%xmm3 + vpor %xmm2,%xmm3,%xmm3 +.L0141x: + vmovdqa 32(%eax),%xmm0 + vmovdqu (%edx),%xmm1 + vmovdqu 16(%edx),%xmm2 + vmovdqa (%eax),%xmm6 + vmovdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L018loop1x +.align 16 +.L019outer1x: + vmovdqa 80(%eax),%xmm3 + vmovdqa (%esp),%xmm0 + vmovdqa 16(%esp),%xmm1 + vmovdqa 32(%esp),%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + movl $10,%edx + vmovdqa %xmm3,48(%esp) + jmp .L018loop1x +.align 16 +.L018loop1x: + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $57,%xmm1,%xmm1 + vpshufd $147,%xmm3,%xmm3 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $147,%xmm1,%xmm1 + vpshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L018loop1x + vpaddd (%esp),%xmm0,%xmm0 + vpaddd 16(%esp),%xmm1,%xmm1 + vpaddd 32(%esp),%xmm2,%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + cmpl $64,%ecx + jb .L020tail + vpxor (%esi),%xmm0,%xmm0 + vpxor 16(%esi),%xmm1,%xmm1 + vpxor 32(%esi),%xmm2,%xmm2 + vpxor 48(%esi),%xmm3,%xmm3 + leal 64(%esi),%esi + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L019outer1x + jmp .L017done +.L020tail: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L021tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L021tail_loop +.L017done: + vzeroupper + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 #else .text @@ -914,6 +1394,8 @@ ChaCha20_ssse3: pushl %esi pushl %edi .Lssse3_shortcut: + testl $2048,4(%ebp) + jnz .Lxop_shortcut movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx @@ -1057,5 +1539,483 @@ ChaCha20_ssse3: .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 +.globl ChaCha20_xop +.type ChaCha20_xop,@function +.align 16 +ChaCha20_xop: +.L_ChaCha20_xop_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +.Lxop_shortcut: + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + vzeroupper + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + vmovdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0141x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + vmovdqu (%edx),%xmm7 + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpaddd 48(%eax),%xmm0,%xmm0 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpsubd 64(%eax),%xmm0,%xmm0 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,64(%ebp) + vmovdqa %xmm1,80(%ebp) + vmovdqa %xmm2,96(%ebp) + vmovdqa %xmm3,112(%ebp) + vmovdqu 16(%edx),%xmm3 + vmovdqa %xmm4,-64(%ebp) + vmovdqa %xmm5,-48(%ebp) + vmovdqa %xmm6,-32(%ebp) + vmovdqa %xmm7,-16(%ebp) + vmovdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,(%ebp) + vmovdqa %xmm1,16(%ebp) + vmovdqa %xmm2,32(%ebp) + vmovdqa %xmm3,48(%ebp) + vmovdqa %xmm4,-128(%ebp) + vmovdqa %xmm5,-112(%ebp) + vmovdqa %xmm6,-96(%ebp) + vmovdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L015outer_loop +.align 32 +.L015outer_loop: + vmovdqa -112(%ebp),%xmm1 + vmovdqa -96(%ebp),%xmm2 + vmovdqa -80(%ebp),%xmm3 + vmovdqa -48(%ebp),%xmm5 + vmovdqa -32(%ebp),%xmm6 + vmovdqa -16(%ebp),%xmm7 + vmovdqa %xmm1,-112(%ebx) + vmovdqa %xmm2,-96(%ebx) + vmovdqa %xmm3,-80(%ebx) + vmovdqa %xmm5,-48(%ebx) + vmovdqa %xmm6,-32(%ebx) + vmovdqa %xmm7,-16(%ebx) + vmovdqa 32(%ebp),%xmm2 + vmovdqa 48(%ebp),%xmm3 + vmovdqa 64(%ebp),%xmm4 + vmovdqa 80(%ebp),%xmm5 + vmovdqa 96(%ebp),%xmm6 + vmovdqa 112(%ebp),%xmm7 + vpaddd 64(%eax),%xmm4,%xmm4 + vmovdqa %xmm2,32(%ebx) + vmovdqa %xmm3,48(%ebx) + vmovdqa %xmm4,64(%ebx) + vmovdqa %xmm5,80(%ebx) + vmovdqa %xmm6,96(%ebx) + vmovdqa %xmm7,112(%ebx) + vmovdqa %xmm4,64(%ebp) + vmovdqa -128(%ebp),%xmm0 + vmovdqa %xmm4,%xmm6 + vmovdqa -64(%ebp),%xmm3 + vmovdqa (%ebp),%xmm4 + vmovdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 32 +.L016loop: + vpaddd %xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,246,16 + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -48(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 80(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,64(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-64(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa 32(%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -32(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 96(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,80(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,16(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-48(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 48(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -16(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 112(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,96(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-32(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -48(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm7,%xmm6 +.byte 143,232,120,194,219,7 +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-16(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -32(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 64(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,112(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,32(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-48(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa (%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -16(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 80(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,64(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,48(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-32(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 16(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -64(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 96(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,80(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-16(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,96(%ebx) + vpxor %xmm5,%xmm3,%xmm3 +.byte 143,232,120,194,219,7 + decl %edx + jnz .L016loop + vmovdqa %xmm3,-64(%ebx) + vmovdqa %xmm4,(%ebx) + vmovdqa %xmm5,16(%ebx) + vmovdqa %xmm6,64(%ebx) + vmovdqa %xmm7,96(%ebx) + vmovdqa -112(%ebx),%xmm1 + vmovdqa -96(%ebx),%xmm2 + vmovdqa -80(%ebx),%xmm3 + vpaddd -128(%ebp),%xmm0,%xmm0 + vpaddd -112(%ebp),%xmm1,%xmm1 + vpaddd -96(%ebp),%xmm2,%xmm2 + vpaddd -80(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa -64(%ebx),%xmm0 + vmovdqa -48(%ebx),%xmm1 + vmovdqa -32(%ebx),%xmm2 + vmovdqa -16(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd -64(%ebp),%xmm0,%xmm0 + vpaddd -48(%ebp),%xmm1,%xmm1 + vpaddd -32(%ebp),%xmm2,%xmm2 + vpaddd -16(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa (%ebx),%xmm0 + vmovdqa 16(%ebx),%xmm1 + vmovdqa 32(%ebx),%xmm2 + vmovdqa 48(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd (%ebp),%xmm0,%xmm0 + vpaddd 16(%ebp),%xmm1,%xmm1 + vpaddd 32(%ebp),%xmm2,%xmm2 + vpaddd 48(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa 64(%ebx),%xmm0 + vmovdqa 80(%ebx),%xmm1 + vmovdqa 96(%ebx),%xmm2 + vmovdqa 112(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd 64(%ebp),%xmm0,%xmm0 + vpaddd 80(%ebp),%xmm1,%xmm1 + vpaddd 96(%ebp),%xmm2,%xmm2 + vpaddd 112(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 208(%esi),%esi + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L015outer_loop + addl $256,%ecx + jz .L017done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + vmovd 64(%ebp),%xmm2 + vmovdqu (%ebx),%xmm3 + vpaddd 96(%eax),%xmm2,%xmm2 + vpand 112(%eax),%xmm3,%xmm3 + vpor %xmm2,%xmm3,%xmm3 +.L0141x: + vmovdqa 32(%eax),%xmm0 + vmovdqu (%edx),%xmm1 + vmovdqu 16(%edx),%xmm2 + vmovdqa (%eax),%xmm6 + vmovdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L018loop1x +.align 16 +.L019outer1x: + vmovdqa 80(%eax),%xmm3 + vmovdqa (%esp),%xmm0 + vmovdqa 16(%esp),%xmm1 + vmovdqa 32(%esp),%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + movl $10,%edx + vmovdqa %xmm3,48(%esp) + jmp .L018loop1x +.align 16 +.L018loop1x: + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $57,%xmm1,%xmm1 + vpshufd $147,%xmm3,%xmm3 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $147,%xmm1,%xmm1 + vpshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L018loop1x + vpaddd (%esp),%xmm0,%xmm0 + vpaddd 16(%esp),%xmm1,%xmm1 + vpaddd 32(%esp),%xmm2,%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + cmpl $64,%ecx + jb .L020tail + vpxor (%esi),%xmm0,%xmm0 + vpxor 16(%esi),%xmm1,%xmm1 + vpxor 32(%esi),%xmm2,%xmm2 + vpxor 48(%esi),%xmm3,%xmm3 + leal 64(%esi),%esi + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L019outer1x + jmp .L017done +.L020tail: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L021tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L021tail_loop +.L017done: + vzeroupper + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 #endif |