Diffstat (limited to 'secure/lib/libcrypto/i386/chacha-x86.S')
-rw-r--r--	secure/lib/libcrypto/i386/chacha-x86.S	960
1 file changed, 960 insertions, 0 deletions
diff --git a/secure/lib/libcrypto/i386/chacha-x86.S b/secure/lib/libcrypto/i386/chacha-x86.S
index 566285310e06..d6b2936a5381 100644
--- a/secure/lib/libcrypto/i386/chacha-x86.S
+++ b/secure/lib/libcrypto/i386/chacha-x86.S
@@ -385,6 +385,8 @@ ChaCha20_ssse3:
pushl %esi
pushl %edi
.Lssse3_shortcut:
+ testl $2048,4(%ebp)
+ jnz .Lxop_shortcut
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
@@ -528,6 +530,484 @@ ChaCha20_ssse3:
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
+.globl ChaCha20_xop
+.type ChaCha20_xop,@function
+.align 16
+ChaCha20_xop:
+.L_ChaCha20_xop_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lxop_shortcut:
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ vzeroupper
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal .Lssse3_data-.Lpic_point(%eax),%eax
+ vmovdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb .L0141x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ vmovdqu (%edx),%xmm7
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpaddd 48(%eax),%xmm0,%xmm0
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpsubd 64(%eax),%xmm0,%xmm0
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%ebp)
+ vmovdqa %xmm1,80(%ebp)
+ vmovdqa %xmm2,96(%ebp)
+ vmovdqa %xmm3,112(%ebp)
+ vmovdqu 16(%edx),%xmm3
+ vmovdqa %xmm4,-64(%ebp)
+ vmovdqa %xmm5,-48(%ebp)
+ vmovdqa %xmm6,-32(%ebp)
+ vmovdqa %xmm7,-16(%ebp)
+ vmovdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,(%ebp)
+ vmovdqa %xmm1,16(%ebp)
+ vmovdqa %xmm2,32(%ebp)
+ vmovdqa %xmm3,48(%ebp)
+ vmovdqa %xmm4,-128(%ebp)
+ vmovdqa %xmm5,-112(%ebp)
+ vmovdqa %xmm6,-96(%ebp)
+ vmovdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp .L015outer_loop
+.align 32
+.L015outer_loop:
+ vmovdqa -112(%ebp),%xmm1
+ vmovdqa -96(%ebp),%xmm2
+ vmovdqa -80(%ebp),%xmm3
+ vmovdqa -48(%ebp),%xmm5
+ vmovdqa -32(%ebp),%xmm6
+ vmovdqa -16(%ebp),%xmm7
+ vmovdqa %xmm1,-112(%ebx)
+ vmovdqa %xmm2,-96(%ebx)
+ vmovdqa %xmm3,-80(%ebx)
+ vmovdqa %xmm5,-48(%ebx)
+ vmovdqa %xmm6,-32(%ebx)
+ vmovdqa %xmm7,-16(%ebx)
+ vmovdqa 32(%ebp),%xmm2
+ vmovdqa 48(%ebp),%xmm3
+ vmovdqa 64(%ebp),%xmm4
+ vmovdqa 80(%ebp),%xmm5
+ vmovdqa 96(%ebp),%xmm6
+ vmovdqa 112(%ebp),%xmm7
+ vpaddd 64(%eax),%xmm4,%xmm4
+ vmovdqa %xmm2,32(%ebx)
+ vmovdqa %xmm3,48(%ebx)
+ vmovdqa %xmm4,64(%ebx)
+ vmovdqa %xmm5,80(%ebx)
+ vmovdqa %xmm6,96(%ebx)
+ vmovdqa %xmm7,112(%ebx)
+ vmovdqa %xmm4,64(%ebp)
+ vmovdqa -128(%ebp),%xmm0
+ vmovdqa %xmm4,%xmm6
+ vmovdqa -64(%ebp),%xmm3
+ vmovdqa (%ebp),%xmm4
+ vmovdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 32
+.L016loop:
+ vpaddd %xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,246,16
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -48(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 80(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,64(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-64(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa 32(%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -32(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 96(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,80(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,16(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-48(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 48(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -16(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 112(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,96(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-32(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -48(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm7,%xmm6
+.byte 143,232,120,194,219,7
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-16(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -32(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 64(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,112(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,32(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-48(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa (%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -16(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 80(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,64(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,48(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-32(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 16(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -64(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 96(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,80(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-16(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,96(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+.byte 143,232,120,194,219,7
+ decl %edx
+ jnz .L016loop
+ vmovdqa %xmm3,-64(%ebx)
+ vmovdqa %xmm4,(%ebx)
+ vmovdqa %xmm5,16(%ebx)
+ vmovdqa %xmm6,64(%ebx)
+ vmovdqa %xmm7,96(%ebx)
+ vmovdqa -112(%ebx),%xmm1
+ vmovdqa -96(%ebx),%xmm2
+ vmovdqa -80(%ebx),%xmm3
+ vpaddd -128(%ebp),%xmm0,%xmm0
+ vpaddd -112(%ebp),%xmm1,%xmm1
+ vpaddd -96(%ebp),%xmm2,%xmm2
+ vpaddd -80(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa -64(%ebx),%xmm0
+ vmovdqa -48(%ebx),%xmm1
+ vmovdqa -32(%ebx),%xmm2
+ vmovdqa -16(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd -64(%ebp),%xmm0,%xmm0
+ vpaddd -48(%ebp),%xmm1,%xmm1
+ vpaddd -32(%ebp),%xmm2,%xmm2
+ vpaddd -16(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa (%ebx),%xmm0
+ vmovdqa 16(%ebx),%xmm1
+ vmovdqa 32(%ebx),%xmm2
+ vmovdqa 48(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd (%ebp),%xmm0,%xmm0
+ vpaddd 16(%ebp),%xmm1,%xmm1
+ vpaddd 32(%ebp),%xmm2,%xmm2
+ vpaddd 48(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa 64(%ebx),%xmm0
+ vmovdqa 80(%ebx),%xmm1
+ vmovdqa 96(%ebx),%xmm2
+ vmovdqa 112(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd 64(%ebp),%xmm0,%xmm0
+ vpaddd 80(%ebp),%xmm1,%xmm1
+ vpaddd 96(%ebp),%xmm2,%xmm2
+ vpaddd 112(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 208(%esi),%esi
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc .L015outer_loop
+ addl $256,%ecx
+ jz .L017done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ vmovd 64(%ebp),%xmm2
+ vmovdqu (%ebx),%xmm3
+ vpaddd 96(%eax),%xmm2,%xmm2
+ vpand 112(%eax),%xmm3,%xmm3
+ vpor %xmm2,%xmm3,%xmm3
+.L0141x:
+ vmovdqa 32(%eax),%xmm0
+ vmovdqu (%edx),%xmm1
+ vmovdqu 16(%edx),%xmm2
+ vmovdqa (%eax),%xmm6
+ vmovdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp .L018loop1x
+.align 16
+.L019outer1x:
+ vmovdqa 80(%eax),%xmm3
+ vmovdqa (%esp),%xmm0
+ vmovdqa 16(%esp),%xmm1
+ vmovdqa 32(%esp),%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ movl $10,%edx
+ vmovdqa %xmm3,48(%esp)
+ jmp .L018loop1x
+.align 16
+.L018loop1x:
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $57,%xmm1,%xmm1
+ vpshufd $147,%xmm3,%xmm3
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $147,%xmm1,%xmm1
+ vpshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L018loop1x
+ vpaddd (%esp),%xmm0,%xmm0
+ vpaddd 16(%esp),%xmm1,%xmm1
+ vpaddd 32(%esp),%xmm2,%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ cmpl $64,%ecx
+ jb .L020tail
+ vpxor (%esi),%xmm0,%xmm0
+ vpxor 16(%esi),%xmm1,%xmm1
+ vpxor 32(%esi),%xmm2,%xmm2
+ vpxor 48(%esi),%xmm3,%xmm3
+ leal 64(%esi),%esi
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L019outer1x
+ jmp .L017done
+.L020tail:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+.L021tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz .L021tail_loop
+.L017done:
+ vzeroupper
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm OPENSSL_ia32cap_P,16,4
#else
.text
@@ -914,6 +1394,8 @@ ChaCha20_ssse3:
pushl %esi
pushl %edi
.Lssse3_shortcut:
+ testl $2048,4(%ebp)
+ jnz .Lxop_shortcut
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
@@ -1057,5 +1539,483 @@ ChaCha20_ssse3:
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
+.globl ChaCha20_xop
+.type ChaCha20_xop,@function
+.align 16
+ChaCha20_xop:
+.L_ChaCha20_xop_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lxop_shortcut:
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ vzeroupper
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal .Lssse3_data-.Lpic_point(%eax),%eax
+ vmovdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb .L0141x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ vmovdqu (%edx),%xmm7
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpaddd 48(%eax),%xmm0,%xmm0
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpsubd 64(%eax),%xmm0,%xmm0
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%ebp)
+ vmovdqa %xmm1,80(%ebp)
+ vmovdqa %xmm2,96(%ebp)
+ vmovdqa %xmm3,112(%ebp)
+ vmovdqu 16(%edx),%xmm3
+ vmovdqa %xmm4,-64(%ebp)
+ vmovdqa %xmm5,-48(%ebp)
+ vmovdqa %xmm6,-32(%ebp)
+ vmovdqa %xmm7,-16(%ebp)
+ vmovdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,(%ebp)
+ vmovdqa %xmm1,16(%ebp)
+ vmovdqa %xmm2,32(%ebp)
+ vmovdqa %xmm3,48(%ebp)
+ vmovdqa %xmm4,-128(%ebp)
+ vmovdqa %xmm5,-112(%ebp)
+ vmovdqa %xmm6,-96(%ebp)
+ vmovdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp .L015outer_loop
+.align 32
+.L015outer_loop:
+ vmovdqa -112(%ebp),%xmm1
+ vmovdqa -96(%ebp),%xmm2
+ vmovdqa -80(%ebp),%xmm3
+ vmovdqa -48(%ebp),%xmm5
+ vmovdqa -32(%ebp),%xmm6
+ vmovdqa -16(%ebp),%xmm7
+ vmovdqa %xmm1,-112(%ebx)
+ vmovdqa %xmm2,-96(%ebx)
+ vmovdqa %xmm3,-80(%ebx)
+ vmovdqa %xmm5,-48(%ebx)
+ vmovdqa %xmm6,-32(%ebx)
+ vmovdqa %xmm7,-16(%ebx)
+ vmovdqa 32(%ebp),%xmm2
+ vmovdqa 48(%ebp),%xmm3
+ vmovdqa 64(%ebp),%xmm4
+ vmovdqa 80(%ebp),%xmm5
+ vmovdqa 96(%ebp),%xmm6
+ vmovdqa 112(%ebp),%xmm7
+ vpaddd 64(%eax),%xmm4,%xmm4
+ vmovdqa %xmm2,32(%ebx)
+ vmovdqa %xmm3,48(%ebx)
+ vmovdqa %xmm4,64(%ebx)
+ vmovdqa %xmm5,80(%ebx)
+ vmovdqa %xmm6,96(%ebx)
+ vmovdqa %xmm7,112(%ebx)
+ vmovdqa %xmm4,64(%ebp)
+ vmovdqa -128(%ebp),%xmm0
+ vmovdqa %xmm4,%xmm6
+ vmovdqa -64(%ebp),%xmm3
+ vmovdqa (%ebp),%xmm4
+ vmovdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 32
+.L016loop:
+ vpaddd %xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,246,16
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -48(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 80(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,64(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-64(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa 32(%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -32(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 96(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,80(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,16(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-48(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 48(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -16(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 112(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,96(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-32(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -48(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm7,%xmm6
+.byte 143,232,120,194,219,7
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-16(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -32(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 64(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,112(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,32(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-48(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa (%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -16(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 80(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,64(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,48(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-32(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 16(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -64(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 96(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,80(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-16(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,96(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+.byte 143,232,120,194,219,7
+ decl %edx
+ jnz .L016loop
+ vmovdqa %xmm3,-64(%ebx)
+ vmovdqa %xmm4,(%ebx)
+ vmovdqa %xmm5,16(%ebx)
+ vmovdqa %xmm6,64(%ebx)
+ vmovdqa %xmm7,96(%ebx)
+ vmovdqa -112(%ebx),%xmm1
+ vmovdqa -96(%ebx),%xmm2
+ vmovdqa -80(%ebx),%xmm3
+ vpaddd -128(%ebp),%xmm0,%xmm0
+ vpaddd -112(%ebp),%xmm1,%xmm1
+ vpaddd -96(%ebp),%xmm2,%xmm2
+ vpaddd -80(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa -64(%ebx),%xmm0
+ vmovdqa -48(%ebx),%xmm1
+ vmovdqa -32(%ebx),%xmm2
+ vmovdqa -16(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd -64(%ebp),%xmm0,%xmm0
+ vpaddd -48(%ebp),%xmm1,%xmm1
+ vpaddd -32(%ebp),%xmm2,%xmm2
+ vpaddd -16(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa (%ebx),%xmm0
+ vmovdqa 16(%ebx),%xmm1
+ vmovdqa 32(%ebx),%xmm2
+ vmovdqa 48(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd (%ebp),%xmm0,%xmm0
+ vpaddd 16(%ebp),%xmm1,%xmm1
+ vpaddd 32(%ebp),%xmm2,%xmm2
+ vpaddd 48(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa 64(%ebx),%xmm0
+ vmovdqa 80(%ebx),%xmm1
+ vmovdqa 96(%ebx),%xmm2
+ vmovdqa 112(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd 64(%ebp),%xmm0,%xmm0
+ vpaddd 80(%ebp),%xmm1,%xmm1
+ vpaddd 96(%ebp),%xmm2,%xmm2
+ vpaddd 112(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 208(%esi),%esi
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc .L015outer_loop
+ addl $256,%ecx
+ jz .L017done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ vmovd 64(%ebp),%xmm2
+ vmovdqu (%ebx),%xmm3
+ vpaddd 96(%eax),%xmm2,%xmm2
+ vpand 112(%eax),%xmm3,%xmm3
+ vpor %xmm2,%xmm3,%xmm3
+.L0141x:
+ vmovdqa 32(%eax),%xmm0
+ vmovdqu (%edx),%xmm1
+ vmovdqu 16(%edx),%xmm2
+ vmovdqa (%eax),%xmm6
+ vmovdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp .L018loop1x
+.align 16
+.L019outer1x:
+ vmovdqa 80(%eax),%xmm3
+ vmovdqa (%esp),%xmm0
+ vmovdqa 16(%esp),%xmm1
+ vmovdqa 32(%esp),%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ movl $10,%edx
+ vmovdqa %xmm3,48(%esp)
+ jmp .L018loop1x
+.align 16
+.L018loop1x:
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $57,%xmm1,%xmm1
+ vpshufd $147,%xmm3,%xmm3
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $147,%xmm1,%xmm1
+ vpshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L018loop1x
+ vpaddd (%esp),%xmm0,%xmm0
+ vpaddd 16(%esp),%xmm1,%xmm1
+ vpaddd 32(%esp),%xmm2,%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ cmpl $64,%ecx
+ jb .L020tail
+ vpxor (%esi),%xmm0,%xmm0
+ vpxor 16(%esi),%xmm1,%xmm1
+ vpxor 32(%esi),%xmm2,%xmm2
+ vpxor 48(%esi),%xmm3,%xmm3
+ leal 64(%esi),%esi
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L019outer1x
+ jmp .L017done
+.L020tail:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+.L021tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz .L021tail_loop
+.L017done:
+ vzeroupper
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm OPENSSL_ia32cap_P,16,4
#endif