author    Jung-uk Kim <jkim@FreeBSD.org> 2018-09-20 22:48:34 +0000
committer Jung-uk Kim <jkim@FreeBSD.org> 2018-09-20 22:48:34 +0000
commit    63ffbd00fc3a42d07ec56d5ae19a738b9154926d (patch)
tree      60f51c6d9d8839d59d6df8ad584aa3dd121668e6 /secure/lib
parent    4cd58f1ace0d98cedf69a43d4ae6ed0307a889eb (diff)
Diffstat (limited to 'secure/lib')
-rw-r--r--  secure/lib/libcrypto/i386/bn-586.S             867
-rw-r--r--  secure/lib/libcrypto/i386/chacha-x86.S        1332
-rw-r--r--  secure/lib/libcrypto/i386/ecp_nistz256-x86.S   612
-rw-r--r--  secure/lib/libcrypto/i386/ghash-x86.S         2710
-rw-r--r--  secure/lib/libcrypto/i386/poly1305-x86.S      3300
-rw-r--r--  secure/lib/libcrypto/i386/sha1-586.S          5192
-rw-r--r--  secure/lib/libcrypto/i386/sha256-586.S        7672
-rw-r--r--  secure/lib/libcrypto/i386/sha512-586.S        4544
-rw-r--r--  secure/lib/libcrypto/i386/x86-gf2m.S            24
-rw-r--r--  secure/lib/libcrypto/i386/x86-mont.S           341
-rw-r--r--  secure/lib/libcrypto/i386/x86cpuid.S           336
11 files changed, 25243 insertions, 1687 deletions
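
These files are regenerated output of OpenSSL's perlasm scripts. The bn-586.S diff below adds runtime-dispatched SSE2 fast paths to the word-level bignum primitives (bn_mul_add_words, bn_mul_words, bn_sqr_words), and chacha-x86.S gains SSSE3 and XOP ChaCha20 implementations. As orientation for the first diff: bn_mul_add_words multiplies an array of 32-bit words by a single word and accumulates into the destination array, returning the final carry word. The C sketch below illustrates those semantics, assuming OpenSSL's usual argument order; it is not part of the commit, and the _ref name is hypothetical.

/*
 * Hedged sketch of what bn_mul_add_words computes.  The SSE2 path in
 * the diff produces the same result with pmuludq (32x32 -> 64-bit
 * multiply), keeping the running carry in %mm1.
 */
#include <stdint.h>

uint32_t bn_mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                              int num, uint32_t w)
{
	uint64_t carry = 0;          /* lives in %mm1 in the SSE2 path */

	while (num-- > 0) {
		/* rp[i] += ap[i] * w, propagating the carry; the sum
		 * cannot overflow 64 bits even at all-ones inputs */
		carry += (uint64_t)*ap++ * w + *rp;
		*rp++ = (uint32_t)carry;
		carry >>= 32;        /* psrlq $32,%mm1 */
	}
	return (uint32_t)carry;      /* movd %mm1,%eax */
}
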
diff --git a/secure/lib/libcrypto/i386/bn-586.S b/secure/lib/libcrypto/i386/bn-586.S
index 86c7f5efc637..36f3a376b61c 100644
--- a/secure/lib/libcrypto/i386/bn-586.S
+++ b/secure/lib/libcrypto/i386/bn-586.S
@@ -7,6 +7,102 @@
.align 16
bn_mul_add_words:
.L_bn_mul_add_words_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L001maw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+ jmp .L002maw_sse2_entry
+.align 16
+.L003maw_sse2_unrolled:
+ movd (%eax),%mm3
+ paddq %mm3,%mm1
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ movd 4(%edx),%mm4
+ pmuludq %mm0,%mm4
+ movd 8(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd 12(%edx),%mm7
+ pmuludq %mm0,%mm7
+ paddq %mm2,%mm1
+ movd 4(%eax),%mm3
+ paddq %mm4,%mm3
+ movd 8(%eax),%mm5
+ paddq %mm6,%mm5
+ movd 12(%eax),%mm4
+ paddq %mm4,%mm7
+ movd %mm1,(%eax)
+ movd 16(%edx),%mm2
+ pmuludq %mm0,%mm2
+ psrlq $32,%mm1
+ movd 20(%edx),%mm4
+ pmuludq %mm0,%mm4
+ paddq %mm3,%mm1
+ movd 24(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd %mm1,4(%eax)
+ psrlq $32,%mm1
+ movd 28(%edx),%mm3
+ addl $32,%edx
+ pmuludq %mm0,%mm3
+ paddq %mm5,%mm1
+ movd 16(%eax),%mm5
+ paddq %mm5,%mm2
+ movd %mm1,8(%eax)
+ psrlq $32,%mm1
+ paddq %mm7,%mm1
+ movd 20(%eax),%mm5
+ paddq %mm5,%mm4
+ movd %mm1,12(%eax)
+ psrlq $32,%mm1
+ paddq %mm2,%mm1
+ movd 24(%eax),%mm5
+ paddq %mm5,%mm6
+ movd %mm1,16(%eax)
+ psrlq $32,%mm1
+ paddq %mm4,%mm1
+ movd 28(%eax),%mm5
+ paddq %mm5,%mm3
+ movd %mm1,20(%eax)
+ psrlq $32,%mm1
+ paddq %mm6,%mm1
+ movd %mm1,24(%eax)
+ psrlq $32,%mm1
+ paddq %mm3,%mm1
+ movd %mm1,28(%eax)
+ leal 32(%eax),%eax
+ psrlq $32,%mm1
+ subl $8,%ecx
+ jz .L004maw_sse2_exit
+.L002maw_sse2_entry:
+ testl $4294967288,%ecx
+ jnz .L003maw_sse2_unrolled
+.align 4
+.L005maw_sse2_loop:
+ movd (%edx),%mm2
+ movd (%eax),%mm3
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm3,%mm1
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L005maw_sse2_loop
+.L004maw_sse2_exit:
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L001maw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -19,9 +115,9 @@ bn_mul_add_words:
andl $4294967288,%ecx
movl 32(%esp),%ebp
pushl %ecx
- jz .L000maw_finish
+ jz .L006maw_finish
.align 16
-.L001maw_loop:
+.L007maw_loop:
movl (%ebx),%eax
mull %ebp
@@ -98,13 +194,13 @@ bn_mul_add_words:
subl $8,%ecx
leal 32(%ebx),%ebx
leal 32(%edi),%edi
- jnz .L001maw_loop
-.L000maw_finish:
+ jnz .L007maw_loop
+.L006maw_finish:
movl 32(%esp),%ecx
andl $7,%ecx
- jnz .L002maw_finish2
- jmp .L003maw_end
-.L002maw_finish2:
+ jnz .L008maw_finish2
+ jmp .L009maw_end
+.L008maw_finish2:
movl (%ebx),%eax
mull %ebp
@@ -115,7 +211,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 4(%ebx),%eax
mull %ebp
@@ -126,7 +222,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,4(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 8(%ebx),%eax
mull %ebp
@@ -137,7 +233,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,8(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 12(%ebx),%eax
mull %ebp
@@ -148,7 +244,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,12(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 16(%ebx),%eax
mull %ebp
@@ -159,7 +255,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,16(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 20(%ebx),%eax
mull %ebp
@@ -170,7 +266,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,20(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 24(%ebx),%eax
mull %ebp
@@ -180,7 +276,7 @@ bn_mul_add_words:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-.L003maw_end:
+.L009maw_end:
movl %esi,%eax
popl %ecx
popl %edi
@@ -194,6 +290,33 @@ bn_mul_add_words:
.align 16
bn_mul_words:
.L_bn_mul_words_begin:
+ call .L010PIC_me_up
+.L010PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L011mw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+.align 16
+.L012mw_sse2_loop:
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L012mw_sse2_loop
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L011mw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -205,8 +328,8 @@ bn_mul_words:
movl 28(%esp),%ebp
movl 32(%esp),%ecx
andl $4294967288,%ebp
- jz .L004mw_finish
-.L005mw_loop:
+ jz .L013mw_finish
+.L014mw_loop:
movl (%ebx),%eax
mull %ecx
@@ -267,14 +390,14 @@ bn_mul_words:
addl $32,%ebx
addl $32,%edi
subl $8,%ebp
- jz .L004mw_finish
- jmp .L005mw_loop
-.L004mw_finish:
+ jz .L013mw_finish
+ jmp .L014mw_loop
+.L013mw_finish:
movl 28(%esp),%ebp
andl $7,%ebp
- jnz .L006mw_finish2
- jmp .L007mw_end
-.L006mw_finish2:
+ jnz .L015mw_finish2
+ jmp .L016mw_end
+.L015mw_finish2:
movl (%ebx),%eax
mull %ecx
@@ -283,7 +406,7 @@ bn_mul_words:
movl %eax,(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 4(%ebx),%eax
mull %ecx
@@ -292,7 +415,7 @@ bn_mul_words:
movl %eax,4(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 8(%ebx),%eax
mull %ecx
@@ -301,7 +424,7 @@ bn_mul_words:
movl %eax,8(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 12(%ebx),%eax
mull %ecx
@@ -310,7 +433,7 @@ bn_mul_words:
movl %eax,12(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 16(%ebx),%eax
mull %ecx
@@ -319,7 +442,7 @@ bn_mul_words:
movl %eax,16(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 20(%ebx),%eax
mull %ecx
@@ -328,7 +451,7 @@ bn_mul_words:
movl %eax,20(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 24(%ebx),%eax
mull %ecx
@@ -336,7 +459,7 @@ bn_mul_words:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-.L007mw_end:
+.L016mw_end:
movl %esi,%eax
popl %edi
popl %esi
@@ -349,6 +472,28 @@ bn_mul_words:
.align 16
bn_sqr_words:
.L_bn_sqr_words_begin:
+ call .L017PIC_me_up
+.L017PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L018sqr_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+.align 16
+.L019sqr_sse2_loop:
+ movd (%edx),%mm0
+ pmuludq %mm0,%mm0
+ leal 4(%edx),%edx
+ movq %mm0,(%eax)
+ subl $1,%ecx
+ leal 8(%eax),%eax
+ jnz .L019sqr_sse2_loop
+ emms
+ ret
+.align 16
+.L018sqr_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -358,8 +503,8 @@ bn_sqr_words:
movl 24(%esp),%edi
movl 28(%esp),%ebx
andl $4294967288,%ebx
- jz .L008sw_finish
-.L009sw_loop:
+ jz .L020sw_finish
+.L021sw_loop:
movl (%edi),%eax
mull %eax
@@ -404,59 +549,59 @@ bn_sqr_words:
addl $32,%edi
addl $64,%esi
subl $8,%ebx
- jnz .L009sw_loop
-.L008sw_finish:
+ jnz .L021sw_loop
+.L020sw_finish:
movl 28(%esp),%ebx
andl $7,%ebx
- jz .L010sw_end
+ jz .L022sw_end
movl (%edi),%eax
mull %eax
movl %eax,(%esi)
decl %ebx
movl %edx,4(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 4(%edi),%eax
mull %eax
movl %eax,8(%esi)
decl %ebx
movl %edx,12(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 8(%edi),%eax
mull %eax
movl %eax,16(%esi)
decl %ebx
movl %edx,20(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 12(%edi),%eax
mull %eax
movl %eax,24(%esi)
decl %ebx
movl %edx,28(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 16(%edi),%eax
mull %eax
movl %eax,32(%esi)
decl %ebx
movl %edx,36(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 20(%edi),%eax
mull %eax
movl %eax,40(%esi)
decl %ebx
movl %edx,44(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 24(%edi),%eax
mull %eax
movl %eax,48(%esi)
movl %edx,52(%esi)
-.L010sw_end:
+.L022sw_end:
popl %edi
popl %esi
popl %ebx
@@ -490,8 +635,8 @@ bn_add_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L011aw_finish
-.L012aw_loop:
+ jz .L023aw_finish
+.L024aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -569,11 +714,11 @@ bn_add_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L012aw_loop
-.L011aw_finish:
+ jnz .L024aw_loop
+.L023aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L013aw_end
+ jz .L025aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -584,7 +729,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -595,7 +740,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -606,7 +751,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -617,7 +762,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -628,7 +773,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -639,7 +784,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -649,7 +794,7 @@ bn_add_words:
addl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-.L013aw_end:
+.L025aw_end:
popl %edi
popl %esi
popl %ebx
@@ -672,8 +817,8 @@ bn_sub_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L014aw_finish
-.L015aw_loop:
+ jz .L026aw_finish
+.L027aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -751,11 +896,11 @@ bn_sub_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L015aw_loop
-.L014aw_finish:
+ jnz .L027aw_loop
+.L026aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L016aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -766,7 +911,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -777,7 +922,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -788,7 +933,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -799,7 +944,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -810,7 +955,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -821,7 +966,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -831,7 +976,7 @@ bn_sub_words:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-.L016aw_end:
+.L028aw_end:
popl %edi
popl %esi
popl %ebx
@@ -854,8 +999,8 @@ bn_sub_part_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L017aw_finish
-.L018aw_loop:
+ jz .L029aw_finish
+.L030aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -933,11 +1078,11 @@ bn_sub_part_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L018aw_loop
-.L017aw_finish:
+ jnz .L030aw_loop
+.L029aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -951,7 +1096,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -965,7 +1110,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -979,7 +1124,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -993,7 +1138,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -1007,7 +1152,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -1021,7 +1166,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -1034,20 +1179,20 @@ bn_sub_part_words:
addl $4,%esi
addl $4,%edi
addl $4,%ebx
-.L019aw_end:
+.L031aw_end:
cmpl $0,36(%esp)
- je .L020pw_end
+ je .L032pw_end
movl 36(%esp),%ebp
cmpl $0,%ebp
- je .L020pw_end
- jge .L021pw_pos
+ je .L032pw_end
+ jge .L033pw_pos
movl $0,%edx
subl %ebp,%edx
movl %edx,%ebp
andl $4294967288,%ebp
- jz .L022pw_neg_finish
-.L023pw_neg_loop:
+ jz .L034pw_neg_finish
+.L035pw_neg_loop:
movl $0,%ecx
movl (%edi),%edx
@@ -1124,13 +1269,13 @@ bn_sub_part_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L023pw_neg_loop
-.L022pw_neg_finish:
+ jnz .L035pw_neg_loop
+.L034pw_neg_finish:
movl 36(%esp),%edx
movl $0,%ebp
subl %edx,%ebp
andl $7,%ebp
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl (%edi),%edx
@@ -1141,7 +1286,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 4(%edi),%edx
@@ -1152,7 +1297,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 8(%edi),%edx
@@ -1163,7 +1308,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 12(%edi),%edx
@@ -1174,7 +1319,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 16(%edi),%edx
@@ -1185,7 +1330,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 20(%edi),%edx
@@ -1196,7 +1341,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 24(%edi),%edx
@@ -1206,184 +1351,185 @@ bn_sub_part_words:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
- jmp .L020pw_end
-.L021pw_pos:
+ jmp .L032pw_end
+.L033pw_pos:
andl $4294967288,%ebp
- jz .L024pw_pos_finish
-.L025pw_pos_loop:
+ jz .L036pw_pos_finish
+.L037pw_pos_loop:
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc .L026pw_nc0
+ jnc .L038pw_nc0
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc .L027pw_nc1
+ jnc .L039pw_nc1
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc .L028pw_nc2
+ jnc .L040pw_nc2
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc .L029pw_nc3
+ jnc .L041pw_nc3
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc .L030pw_nc4
+ jnc .L042pw_nc4
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc .L031pw_nc5
+ jnc .L043pw_nc5
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc .L032pw_nc6
+ jnc .L044pw_nc6
movl 28(%esi),%ecx
subl %eax,%ecx
movl %ecx,28(%ebx)
- jnc .L033pw_nc7
+ jnc .L045pw_nc7
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz .L025pw_pos_loop
-.L024pw_pos_finish:
+ jnz .L037pw_pos_loop
+.L036pw_pos_finish:
movl 36(%esp),%ebp
andl $7,%ebp
- jz .L020pw_end
+ jz .L032pw_end
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc .L034pw_tail_nc0
+ jnc .L046pw_tail_nc0
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc .L035pw_tail_nc1
+ jnc .L047pw_tail_nc1
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc .L036pw_tail_nc2
+ jnc .L048pw_tail_nc2
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc .L037pw_tail_nc3
+ jnc .L049pw_tail_nc3
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc .L038pw_tail_nc4
+ jnc .L050pw_tail_nc4
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc .L039pw_tail_nc5
+ jnc .L051pw_tail_nc5
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc .L040pw_tail_nc6
+ jnc .L052pw_tail_nc6
movl $1,%eax
- jmp .L020pw_end
-.L041pw_nc_loop:
+ jmp .L032pw_end
+.L053pw_nc_loop:
movl (%esi),%ecx
movl %ecx,(%ebx)
-.L026pw_nc0:
+.L038pw_nc0:
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-.L027pw_nc1:
+.L039pw_nc1:
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-.L028pw_nc2:
+.L040pw_nc2:
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-.L029pw_nc3:
+.L041pw_nc3:
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-.L030pw_nc4:
+.L042pw_nc4:
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-.L031pw_nc5:
+.L043pw_nc5:
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-.L032pw_nc6:
+.L044pw_nc6:
movl 28(%esi),%ecx
movl %ecx,28(%ebx)
-.L033pw_nc7:
+.L045pw_nc7:
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz .L041pw_nc_loop
+ jnz .L053pw_nc_loop
movl 36(%esp),%ebp
andl $7,%ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl (%esi),%ecx
movl %ecx,(%ebx)
-.L034pw_tail_nc0:
+.L046pw_tail_nc0:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-.L035pw_tail_nc1:
+.L047pw_tail_nc1:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-.L036pw_tail_nc2:
+.L048pw_tail_nc2:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-.L037pw_tail_nc3:
+.L049pw_tail_nc3:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-.L038pw_tail_nc4:
+.L050pw_tail_nc4:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-.L039pw_tail_nc5:
+.L051pw_tail_nc5:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-.L040pw_tail_nc6:
-.L042pw_nc_end:
+.L052pw_tail_nc6:
+.L054pw_nc_end:
movl $0,%eax
-.L020pw_end:
+.L032pw_end:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size bn_sub_part_words,.-.L_bn_sub_part_words_begin
+.comm OPENSSL_ia32cap_P,16,4
#else
.text
.globl bn_mul_add_words
@@ -1391,6 +1537,99 @@ bn_sub_part_words:
.align 16
bn_mul_add_words:
.L_bn_mul_add_words_begin:
+ leal OPENSSL_ia32cap_P,%eax
+ btl $26,(%eax)
+ jnc .L000maw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+ jmp .L001maw_sse2_entry
+.align 16
+.L002maw_sse2_unrolled:
+ movd (%eax),%mm3
+ paddq %mm3,%mm1
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ movd 4(%edx),%mm4
+ pmuludq %mm0,%mm4
+ movd 8(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd 12(%edx),%mm7
+ pmuludq %mm0,%mm7
+ paddq %mm2,%mm1
+ movd 4(%eax),%mm3
+ paddq %mm4,%mm3
+ movd 8(%eax),%mm5
+ paddq %mm6,%mm5
+ movd 12(%eax),%mm4
+ paddq %mm4,%mm7
+ movd %mm1,(%eax)
+ movd 16(%edx),%mm2
+ pmuludq %mm0,%mm2
+ psrlq $32,%mm1
+ movd 20(%edx),%mm4
+ pmuludq %mm0,%mm4
+ paddq %mm3,%mm1
+ movd 24(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd %mm1,4(%eax)
+ psrlq $32,%mm1
+ movd 28(%edx),%mm3
+ addl $32,%edx
+ pmuludq %mm0,%mm3
+ paddq %mm5,%mm1
+ movd 16(%eax),%mm5
+ paddq %mm5,%mm2
+ movd %mm1,8(%eax)
+ psrlq $32,%mm1
+ paddq %mm7,%mm1
+ movd 20(%eax),%mm5
+ paddq %mm5,%mm4
+ movd %mm1,12(%eax)
+ psrlq $32,%mm1
+ paddq %mm2,%mm1
+ movd 24(%eax),%mm5
+ paddq %mm5,%mm6
+ movd %mm1,16(%eax)
+ psrlq $32,%mm1
+ paddq %mm4,%mm1
+ movd 28(%eax),%mm5
+ paddq %mm5,%mm3
+ movd %mm1,20(%eax)
+ psrlq $32,%mm1
+ paddq %mm6,%mm1
+ movd %mm1,24(%eax)
+ psrlq $32,%mm1
+ paddq %mm3,%mm1
+ movd %mm1,28(%eax)
+ leal 32(%eax),%eax
+ psrlq $32,%mm1
+ subl $8,%ecx
+ jz .L003maw_sse2_exit
+.L001maw_sse2_entry:
+ testl $4294967288,%ecx
+ jnz .L002maw_sse2_unrolled
+.align 4
+.L004maw_sse2_loop:
+ movd (%edx),%mm2
+ movd (%eax),%mm3
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm3,%mm1
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L004maw_sse2_loop
+.L003maw_sse2_exit:
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L000maw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -1403,9 +1642,9 @@ bn_mul_add_words:
andl $4294967288,%ecx
movl 32(%esp),%ebp
pushl %ecx
- jz .L000maw_finish
+ jz .L005maw_finish
.align 16
-.L001maw_loop:
+.L006maw_loop:
movl (%ebx),%eax
mull %ebp
@@ -1482,13 +1721,13 @@ bn_mul_add_words:
subl $8,%ecx
leal 32(%ebx),%ebx
leal 32(%edi),%edi
- jnz .L001maw_loop
-.L000maw_finish:
+ jnz .L006maw_loop
+.L005maw_finish:
movl 32(%esp),%ecx
andl $7,%ecx
- jnz .L002maw_finish2
- jmp .L003maw_end
-.L002maw_finish2:
+ jnz .L007maw_finish2
+ jmp .L008maw_end
+.L007maw_finish2:
movl (%ebx),%eax
mull %ebp
@@ -1499,7 +1738,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L008maw_end
movl 4(%ebx),%eax
mull %ebp
@@ -1510,7 +1749,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,4(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L008maw_end
movl 8(%ebx),%eax
mull %ebp
@@ -1521,7 +1760,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,8(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L008maw_end
movl 12(%ebx),%eax
mull %ebp
@@ -1532,7 +1771,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,12(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L008maw_end
movl 16(%ebx),%eax
mull %ebp
@@ -1543,7 +1782,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,16(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L008maw_end
movl 20(%ebx),%eax
mull %ebp
@@ -1554,7 +1793,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,20(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L008maw_end
movl 24(%ebx),%eax
mull %ebp
@@ -1564,7 +1803,7 @@ bn_mul_add_words:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-.L003maw_end:
+.L008maw_end:
movl %esi,%eax
popl %ecx
popl %edi
@@ -1578,6 +1817,30 @@ bn_mul_add_words:
.align 16
bn_mul_words:
.L_bn_mul_words_begin:
+ leal OPENSSL_ia32cap_P,%eax
+ btl $26,(%eax)
+ jnc .L009mw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+.align 16
+.L010mw_sse2_loop:
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L010mw_sse2_loop
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L009mw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -1589,8 +1852,8 @@ bn_mul_words:
movl 28(%esp),%ebp
movl 32(%esp),%ecx
andl $4294967288,%ebp
- jz .L004mw_finish
-.L005mw_loop:
+ jz .L011mw_finish
+.L012mw_loop:
movl (%ebx),%eax
mull %ecx
@@ -1651,14 +1914,14 @@ bn_mul_words:
addl $32,%ebx
addl $32,%edi
subl $8,%ebp
- jz .L004mw_finish
- jmp .L005mw_loop
-.L004mw_finish:
+ jz .L011mw_finish
+ jmp .L012mw_loop
+.L011mw_finish:
movl 28(%esp),%ebp
andl $7,%ebp
- jnz .L006mw_finish2
- jmp .L007mw_end
-.L006mw_finish2:
+ jnz .L013mw_finish2
+ jmp .L014mw_end
+.L013mw_finish2:
movl (%ebx),%eax
mull %ecx
@@ -1667,7 +1930,7 @@ bn_mul_words:
movl %eax,(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L014mw_end
movl 4(%ebx),%eax
mull %ecx
@@ -1676,7 +1939,7 @@ bn_mul_words:
movl %eax,4(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L014mw_end
movl 8(%ebx),%eax
mull %ecx
@@ -1685,7 +1948,7 @@ bn_mul_words:
movl %eax,8(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L014mw_end
movl 12(%ebx),%eax
mull %ecx
@@ -1694,7 +1957,7 @@ bn_mul_words:
movl %eax,12(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L014mw_end
movl 16(%ebx),%eax
mull %ecx
@@ -1703,7 +1966,7 @@ bn_mul_words:
movl %eax,16(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L014mw_end
movl 20(%ebx),%eax
mull %ecx
@@ -1712,7 +1975,7 @@ bn_mul_words:
movl %eax,20(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L014mw_end
movl 24(%ebx),%eax
mull %ecx
@@ -1720,7 +1983,7 @@ bn_mul_words:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-.L007mw_end:
+.L014mw_end:
movl %esi,%eax
popl %edi
popl %esi
@@ -1733,6 +1996,25 @@ bn_mul_words:
.align 16
bn_sqr_words:
.L_bn_sqr_words_begin:
+ leal OPENSSL_ia32cap_P,%eax
+ btl $26,(%eax)
+ jnc .L015sqr_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+.align 16
+.L016sqr_sse2_loop:
+ movd (%edx),%mm0
+ pmuludq %mm0,%mm0
+ leal 4(%edx),%edx
+ movq %mm0,(%eax)
+ subl $1,%ecx
+ leal 8(%eax),%eax
+ jnz .L016sqr_sse2_loop
+ emms
+ ret
+.align 16
+.L015sqr_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -1742,8 +2024,8 @@ bn_sqr_words:
movl 24(%esp),%edi
movl 28(%esp),%ebx
andl $4294967288,%ebx
- jz .L008sw_finish
-.L009sw_loop:
+ jz .L017sw_finish
+.L018sw_loop:
movl (%edi),%eax
mull %eax
@@ -1788,59 +2070,59 @@ bn_sqr_words:
addl $32,%edi
addl $64,%esi
subl $8,%ebx
- jnz .L009sw_loop
-.L008sw_finish:
+ jnz .L018sw_loop
+.L017sw_finish:
movl 28(%esp),%ebx
andl $7,%ebx
- jz .L010sw_end
+ jz .L019sw_end
movl (%edi),%eax
mull %eax
movl %eax,(%esi)
decl %ebx
movl %edx,4(%esi)
- jz .L010sw_end
+ jz .L019sw_end
movl 4(%edi),%eax
mull %eax
movl %eax,8(%esi)
decl %ebx
movl %edx,12(%esi)
- jz .L010sw_end
+ jz .L019sw_end
movl 8(%edi),%eax
mull %eax
movl %eax,16(%esi)
decl %ebx
movl %edx,20(%esi)
- jz .L010sw_end
+ jz .L019sw_end
movl 12(%edi),%eax
mull %eax
movl %eax,24(%esi)
decl %ebx
movl %edx,28(%esi)
- jz .L010sw_end
+ jz .L019sw_end
movl 16(%edi),%eax
mull %eax
movl %eax,32(%esi)
decl %ebx
movl %edx,36(%esi)
- jz .L010sw_end
+ jz .L019sw_end
movl 20(%edi),%eax
mull %eax
movl %eax,40(%esi)
decl %ebx
movl %edx,44(%esi)
- jz .L010sw_end
+ jz .L019sw_end
movl 24(%edi),%eax
mull %eax
movl %eax,48(%esi)
movl %edx,52(%esi)
-.L010sw_end:
+.L019sw_end:
popl %edi
popl %esi
popl %ebx
@@ -1874,8 +2156,8 @@ bn_add_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L011aw_finish
-.L012aw_loop:
+ jz .L020aw_finish
+.L021aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -1953,11 +2235,11 @@ bn_add_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L012aw_loop
-.L011aw_finish:
+ jnz .L021aw_loop
+.L020aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L013aw_end
+ jz .L022aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -1968,7 +2250,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L013aw_end
+ jz .L022aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -1979,7 +2261,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L013aw_end
+ jz .L022aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -1990,7 +2272,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L013aw_end
+ jz .L022aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -2001,7 +2283,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L013aw_end
+ jz .L022aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -2012,7 +2294,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L013aw_end
+ jz .L022aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -2023,7 +2305,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L013aw_end
+ jz .L022aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -2033,7 +2315,7 @@ bn_add_words:
addl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-.L013aw_end:
+.L022aw_end:
popl %edi
popl %esi
popl %ebx
@@ -2056,8 +2338,8 @@ bn_sub_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L014aw_finish
-.L015aw_loop:
+ jz .L023aw_finish
+.L024aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -2135,11 +2417,11 @@ bn_sub_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L015aw_loop
-.L014aw_finish:
+ jnz .L024aw_loop
+.L023aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L016aw_end
+ jz .L025aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -2150,7 +2432,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L016aw_end
+ jz .L025aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -2161,7 +2443,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L016aw_end
+ jz .L025aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -2172,7 +2454,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L016aw_end
+ jz .L025aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -2183,7 +2465,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L016aw_end
+ jz .L025aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -2194,7 +2476,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L016aw_end
+ jz .L025aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -2205,7 +2487,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L016aw_end
+ jz .L025aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -2215,7 +2497,7 @@ bn_sub_words:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-.L016aw_end:
+.L025aw_end:
popl %edi
popl %esi
popl %ebx
@@ -2238,8 +2520,8 @@ bn_sub_part_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L017aw_finish
-.L018aw_loop:
+ jz .L026aw_finish
+.L027aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -2317,11 +2599,11 @@ bn_sub_part_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L018aw_loop
-.L017aw_finish:
+ jnz .L027aw_loop
+.L026aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L019aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -2335,7 +2617,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -2349,7 +2631,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -2363,7 +2645,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -2377,7 +2659,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -2391,7 +2673,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -2405,7 +2687,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -2418,20 +2700,20 @@ bn_sub_part_words:
addl $4,%esi
addl $4,%edi
addl $4,%ebx
-.L019aw_end:
+.L028aw_end:
cmpl $0,36(%esp)
- je .L020pw_end
+ je .L029pw_end
movl 36(%esp),%ebp
cmpl $0,%ebp
- je .L020pw_end
- jge .L021pw_pos
+ je .L029pw_end
+ jge .L030pw_pos
movl $0,%edx
subl %ebp,%edx
movl %edx,%ebp
andl $4294967288,%ebp
- jz .L022pw_neg_finish
-.L023pw_neg_loop:
+ jz .L031pw_neg_finish
+.L032pw_neg_loop:
movl $0,%ecx
movl (%edi),%edx
@@ -2508,13 +2790,13 @@ bn_sub_part_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L023pw_neg_loop
-.L022pw_neg_finish:
+ jnz .L032pw_neg_loop
+.L031pw_neg_finish:
movl 36(%esp),%edx
movl $0,%ebp
subl %edx,%ebp
andl $7,%ebp
- jz .L020pw_end
+ jz .L029pw_end
movl $0,%ecx
movl (%edi),%edx
@@ -2525,7 +2807,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L020pw_end
+ jz .L029pw_end
movl $0,%ecx
movl 4(%edi),%edx
@@ -2536,7 +2818,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L020pw_end
+ jz .L029pw_end
movl $0,%ecx
movl 8(%edi),%edx
@@ -2547,7 +2829,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L020pw_end
+ jz .L029pw_end
movl $0,%ecx
movl 12(%edi),%edx
@@ -2558,7 +2840,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L020pw_end
+ jz .L029pw_end
movl $0,%ecx
movl 16(%edi),%edx
@@ -2569,7 +2851,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L020pw_end
+ jz .L029pw_end
movl $0,%ecx
movl 20(%edi),%edx
@@ -2580,7 +2862,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L020pw_end
+ jz .L029pw_end
movl $0,%ecx
movl 24(%edi),%edx
@@ -2590,182 +2872,183 @@ bn_sub_part_words:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
- jmp .L020pw_end
-.L021pw_pos:
+ jmp .L029pw_end
+.L030pw_pos:
andl $4294967288,%ebp
- jz .L024pw_pos_finish
-.L025pw_pos_loop:
+ jz .L033pw_pos_finish
+.L034pw_pos_loop:
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc .L026pw_nc0
+ jnc .L035pw_nc0
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc .L027pw_nc1
+ jnc .L036pw_nc1
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc .L028pw_nc2
+ jnc .L037pw_nc2
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc .L029pw_nc3
+ jnc .L038pw_nc3
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc .L030pw_nc4
+ jnc .L039pw_nc4
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc .L031pw_nc5
+ jnc .L040pw_nc5
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc .L032pw_nc6
+ jnc .L041pw_nc6
movl 28(%esi),%ecx
subl %eax,%ecx
movl %ecx,28(%ebx)
- jnc .L033pw_nc7
+ jnc .L042pw_nc7
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz .L025pw_pos_loop
-.L024pw_pos_finish:
+ jnz .L034pw_pos_loop
+.L033pw_pos_finish:
movl 36(%esp),%ebp
andl $7,%ebp
- jz .L020pw_end
+ jz .L029pw_end
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc .L034pw_tail_nc0
+ jnc .L043pw_tail_nc0
decl %ebp
- jz .L020pw_end
+ jz .L029pw_end
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc .L035pw_tail_nc1
+ jnc .L044pw_tail_nc1
decl %ebp
- jz .L020pw_end
+ jz .L029pw_end
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc .L036pw_tail_nc2
+ jnc .L045pw_tail_nc2
decl %ebp
- jz .L020pw_end
+ jz .L029pw_end
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc .L037pw_tail_nc3
+ jnc .L046pw_tail_nc3
decl %ebp
- jz .L020pw_end
+ jz .L029pw_end
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc .L038pw_tail_nc4
+ jnc .L047pw_tail_nc4
decl %ebp
- jz .L020pw_end
+ jz .L029pw_end
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc .L039pw_tail_nc5
+ jnc .L048pw_tail_nc5
decl %ebp
- jz .L020pw_end
+ jz .L029pw_end
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc .L040pw_tail_nc6
+ jnc .L049pw_tail_nc6
movl $1,%eax
- jmp .L020pw_end
-.L041pw_nc_loop:
+ jmp .L029pw_end
+.L050pw_nc_loop:
movl (%esi),%ecx
movl %ecx,(%ebx)
-.L026pw_nc0:
+.L035pw_nc0:
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-.L027pw_nc1:
+.L036pw_nc1:
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-.L028pw_nc2:
+.L037pw_nc2:
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-.L029pw_nc3:
+.L038pw_nc3:
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-.L030pw_nc4:
+.L039pw_nc4:
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-.L031pw_nc5:
+.L040pw_nc5:
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-.L032pw_nc6:
+.L041pw_nc6:
movl 28(%esi),%ecx
movl %ecx,28(%ebx)
-.L033pw_nc7:
+.L042pw_nc7:
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz .L041pw_nc_loop
+ jnz .L050pw_nc_loop
movl 36(%esp),%ebp
andl $7,%ebp
- jz .L042pw_nc_end
+ jz .L051pw_nc_end
movl (%esi),%ecx
movl %ecx,(%ebx)
-.L034pw_tail_nc0:
+.L043pw_tail_nc0:
decl %ebp
- jz .L042pw_nc_end
+ jz .L051pw_nc_end
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-.L035pw_tail_nc1:
+.L044pw_tail_nc1:
decl %ebp
- jz .L042pw_nc_end
+ jz .L051pw_nc_end
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-.L036pw_tail_nc2:
+.L045pw_tail_nc2:
decl %ebp
- jz .L042pw_nc_end
+ jz .L051pw_nc_end
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-.L037pw_tail_nc3:
+.L046pw_tail_nc3:
decl %ebp
- jz .L042pw_nc_end
+ jz .L051pw_nc_end
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-.L038pw_tail_nc4:
+.L047pw_tail_nc4:
decl %ebp
- jz .L042pw_nc_end
+ jz .L051pw_nc_end
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-.L039pw_tail_nc5:
+.L048pw_tail_nc5:
decl %ebp
- jz .L042pw_nc_end
+ jz .L051pw_nc_end
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-.L040pw_tail_nc6:
-.L042pw_nc_end:
+.L049pw_tail_nc6:
+.L051pw_nc_end:
movl $0,%eax
-.L020pw_end:
+.L029pw_end:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size bn_sub_part_words,.-.L_bn_sub_part_words_begin
+.comm OPENSSL_ia32cap_P,16,4
#endif
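
Two details of the dispatch in the hunks above are worth noting. The PIC half (before #else) materializes EIP with a call/pop pair at each .L*PIC_me_up label and addresses OPENSSL_ia32cap_P relative to it, while the non-PIC half after #else uses a direct leal; both then test bit 26 of the first capability word, the SSE2 feature flag, to choose between the pmuludq and classic mull code paths. The new .comm directive reserves the 16-byte capability vector. Below is a minimal standalone probe of the same bit, assuming GCC/Clang's <cpuid.h>; the library itself fills the vector once in OPENSSL_cpuid_setup (see x86cpuid.S) rather than re-issuing CPUID on every call.

/*
 * Hedged sketch: bit 26 of OPENSSL_ia32cap_P's first word mirrors
 * CPUID.(EAX=1):EDX bit 26 (SSE2), the bit "btl $26,(%eax)" tests.
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;
	/* bit_SSE2 is defined as 1u << 26 in <cpuid.h> */
	printf("SSE2 %s: taking the %s path\n",
	    (edx & bit_SSE2) ? "present" : "absent",
	    (edx & bit_SSE2) ? "pmuludq" : "mull");
	return 0;
}
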
diff --git a/secure/lib/libcrypto/i386/chacha-x86.S b/secure/lib/libcrypto/i386/chacha-x86.S
index 261c6010f89a..d6b2936a5381 100644
--- a/secure/lib/libcrypto/i386/chacha-x86.S
+++ b/secure/lib/libcrypto/i386/chacha-x86.S
@@ -14,6 +14,16 @@ ChaCha20_ctr32:
xorl %eax,%eax
cmpl 28(%esp),%eax
je .L000no_data
+ call .Lpic_point
+.Lpic_point:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp
+ testl $16777216,(%ebp)
+ jz .L001x86
+ testl $512,4(%ebp)
+ jz .L001x86
+ jmp .Lssse3_shortcut
+.L001x86:
movl 32(%esp),%esi
movl 36(%esp),%edi
subl $132,%esp
@@ -42,13 +52,13 @@ ChaCha20_ctr32:
movl %ebx,116(%esp)
movl %ecx,120(%esp)
movl %edx,124(%esp)
- jmp .L001entry
+ jmp .L002entry
.align 16
-.L002outer_loop:
+.L003outer_loop:
movl %ebx,156(%esp)
movl %eax,152(%esp)
movl %ecx,160(%esp)
-.L001entry:
+.L002entry:
movl $1634760805,%eax
movl $857760878,4(%esp)
movl $2036477234,8(%esp)
@@ -76,9 +86,9 @@ ChaCha20_ctr32:
movl %edi,60(%esp)
movl %edx,112(%esp)
movl $10,%ebx
- jmp .L003loop
+ jmp .L004loop
.align 16
-.L003loop:
+.L004loop:
addl %ebp,%eax
movl %ebx,128(%esp)
movl %ebp,%ebx
@@ -232,14 +242,14 @@ ChaCha20_ctr32:
xorl %esi,%ebp
roll $7,%ebp
decl %ebx
- jnz .L003loop
+ jnz .L004loop
movl 160(%esp),%ebx
addl $1634760805,%eax
addl 80(%esp),%ebp
addl 96(%esp),%ecx
addl 100(%esp),%esi
cmpl $64,%ebx
- jb .L004tail
+ jb .L005tail
movl 156(%esp),%ebx
addl 112(%esp),%edx
addl 120(%esp),%edi
@@ -302,9 +312,9 @@ ChaCha20_ctr32:
movl %ebp,(%eax)
leal 64(%eax),%eax
subl $64,%ecx
- jnz .L002outer_loop
- jmp .L005done
-.L004tail:
+ jnz .L003outer_loop
+ jmp .L006done
+.L005tail:
addl 112(%esp),%edx
addl 120(%esp),%edi
movl %eax,(%esp)
@@ -348,15 +358,15 @@ ChaCha20_ctr32:
movl %edi,60(%esp)
xorl %eax,%eax
xorl %edx,%edx
-.L006tail_loop:
+.L007tail_loop:
movb (%esi,%ebp,1),%al
movb (%esp,%esi,1),%dl
leal 1(%esi),%esi
xorb %dl,%al
movb %al,-1(%ecx,%esi,1)
decl %ebx
- jnz .L006tail_loop
-.L005done:
+ jnz .L007tail_loop
+.L006done:
addl $132,%esp
.L000no_data:
popl %edi
@@ -365,10 +375,640 @@ ChaCha20_ctr32:
popl %ebp
ret
.size ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin
+.globl ChaCha20_ssse3
+.type ChaCha20_ssse3,@function
+.align 16
+ChaCha20_ssse3:
+.L_ChaCha20_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lssse3_shortcut:
+ testl $2048,4(%ebp)
+ jnz .Lxop_shortcut
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal .Lssse3_data-.Lpic_point(%eax),%eax
+ movdqu (%ebx),%xmm3
+.L0081x:
+ movdqa 32(%eax),%xmm0
+ movdqu (%edx),%xmm1
+ movdqu 16(%edx),%xmm2
+ movdqa (%eax),%xmm6
+ movdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp .L009loop1x
+.align 16
+.L010outer1x:
+ movdqa 80(%eax),%xmm3
+ movdqa (%esp),%xmm0
+ movdqa 16(%esp),%xmm1
+ movdqa 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ movl $10,%edx
+ movdqa %xmm3,48(%esp)
+ jmp .L009loop1x
+.align 16
+.L009loop1x:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L009loop1x
+ paddd (%esp),%xmm0
+ paddd 16(%esp),%xmm1
+ paddd 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ cmpl $64,%ecx
+ jb .L011tail
+ movdqu (%esi),%xmm4
+ movdqu 16(%esi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%esi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%esi),%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+ leal 64(%esi),%esi
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L010outer1x
+ jmp .L012done
+.L011tail:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+.L013tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz .L013tail_loop
+.L012done:
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin
+.align 64
+.Lssse3_data:
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long 1634760805,857760878,2036477234,1797285236
+.long 0,1,2,3
+.long 4,4,4,4
+.long 1,0,0,0
+.long 4,0,0,0
+.long 0,-1,-1,-1
+.align 64
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
+.globl ChaCha20_xop
+.type ChaCha20_xop,@function
+.align 16
+ChaCha20_xop:
+.L_ChaCha20_xop_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lxop_shortcut:
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ vzeroupper
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal .Lssse3_data-.Lpic_point(%eax),%eax
+ vmovdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb .L0141x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ vmovdqu (%edx),%xmm7
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpaddd 48(%eax),%xmm0,%xmm0
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpsubd 64(%eax),%xmm0,%xmm0
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%ebp)
+ vmovdqa %xmm1,80(%ebp)
+ vmovdqa %xmm2,96(%ebp)
+ vmovdqa %xmm3,112(%ebp)
+ vmovdqu 16(%edx),%xmm3
+ vmovdqa %xmm4,-64(%ebp)
+ vmovdqa %xmm5,-48(%ebp)
+ vmovdqa %xmm6,-32(%ebp)
+ vmovdqa %xmm7,-16(%ebp)
+ vmovdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,(%ebp)
+ vmovdqa %xmm1,16(%ebp)
+ vmovdqa %xmm2,32(%ebp)
+ vmovdqa %xmm3,48(%ebp)
+ vmovdqa %xmm4,-128(%ebp)
+ vmovdqa %xmm5,-112(%ebp)
+ vmovdqa %xmm6,-96(%ebp)
+ vmovdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp .L015outer_loop
+.align 32
+.L015outer_loop:
+ vmovdqa -112(%ebp),%xmm1
+ vmovdqa -96(%ebp),%xmm2
+ vmovdqa -80(%ebp),%xmm3
+ vmovdqa -48(%ebp),%xmm5
+ vmovdqa -32(%ebp),%xmm6
+ vmovdqa -16(%ebp),%xmm7
+ vmovdqa %xmm1,-112(%ebx)
+ vmovdqa %xmm2,-96(%ebx)
+ vmovdqa %xmm3,-80(%ebx)
+ vmovdqa %xmm5,-48(%ebx)
+ vmovdqa %xmm6,-32(%ebx)
+ vmovdqa %xmm7,-16(%ebx)
+ vmovdqa 32(%ebp),%xmm2
+ vmovdqa 48(%ebp),%xmm3
+ vmovdqa 64(%ebp),%xmm4
+ vmovdqa 80(%ebp),%xmm5
+ vmovdqa 96(%ebp),%xmm6
+ vmovdqa 112(%ebp),%xmm7
+ vpaddd 64(%eax),%xmm4,%xmm4
+ vmovdqa %xmm2,32(%ebx)
+ vmovdqa %xmm3,48(%ebx)
+ vmovdqa %xmm4,64(%ebx)
+ vmovdqa %xmm5,80(%ebx)
+ vmovdqa %xmm6,96(%ebx)
+ vmovdqa %xmm7,112(%ebx)
+ vmovdqa %xmm4,64(%ebp)
+ vmovdqa -128(%ebp),%xmm0
+ vmovdqa %xmm4,%xmm6
+ vmovdqa -64(%ebp),%xmm3
+ vmovdqa (%ebp),%xmm4
+ vmovdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 32
+.L016loop:
+ vpaddd %xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,246,16
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -48(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 80(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,64(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-64(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa 32(%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -32(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 96(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,80(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,16(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-48(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 48(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -16(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 112(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,96(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-32(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -48(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm7,%xmm6
+.byte 143,232,120,194,219,7
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-16(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -32(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 64(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,112(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,32(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-48(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa (%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -16(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 80(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,64(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,48(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-32(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 16(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -64(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 96(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,80(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-16(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,96(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+.byte 143,232,120,194,219,7
+ decl %edx
+ jnz .L016loop
+ vmovdqa %xmm3,-64(%ebx)
+ vmovdqa %xmm4,(%ebx)
+ vmovdqa %xmm5,16(%ebx)
+ vmovdqa %xmm6,64(%ebx)
+ vmovdqa %xmm7,96(%ebx)
+ vmovdqa -112(%ebx),%xmm1
+ vmovdqa -96(%ebx),%xmm2
+ vmovdqa -80(%ebx),%xmm3
+ vpaddd -128(%ebp),%xmm0,%xmm0
+ vpaddd -112(%ebp),%xmm1,%xmm1
+ vpaddd -96(%ebp),%xmm2,%xmm2
+ vpaddd -80(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa -64(%ebx),%xmm0
+ vmovdqa -48(%ebx),%xmm1
+ vmovdqa -32(%ebx),%xmm2
+ vmovdqa -16(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd -64(%ebp),%xmm0,%xmm0
+ vpaddd -48(%ebp),%xmm1,%xmm1
+ vpaddd -32(%ebp),%xmm2,%xmm2
+ vpaddd -16(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa (%ebx),%xmm0
+ vmovdqa 16(%ebx),%xmm1
+ vmovdqa 32(%ebx),%xmm2
+ vmovdqa 48(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd (%ebp),%xmm0,%xmm0
+ vpaddd 16(%ebp),%xmm1,%xmm1
+ vpaddd 32(%ebp),%xmm2,%xmm2
+ vpaddd 48(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa 64(%ebx),%xmm0
+ vmovdqa 80(%ebx),%xmm1
+ vmovdqa 96(%ebx),%xmm2
+ vmovdqa 112(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd 64(%ebp),%xmm0,%xmm0
+ vpaddd 80(%ebp),%xmm1,%xmm1
+ vpaddd 96(%ebp),%xmm2,%xmm2
+ vpaddd 112(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 208(%esi),%esi
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc .L015outer_loop
+ addl $256,%ecx
+ jz .L017done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ vmovd 64(%ebp),%xmm2
+ vmovdqu (%ebx),%xmm3
+ vpaddd 96(%eax),%xmm2,%xmm2
+ vpand 112(%eax),%xmm3,%xmm3
+ vpor %xmm2,%xmm3,%xmm3
+.L0141x:
+ vmovdqa 32(%eax),%xmm0
+ vmovdqu (%edx),%xmm1
+ vmovdqu 16(%edx),%xmm2
+ vmovdqa (%eax),%xmm6
+ vmovdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp .L018loop1x
+.align 16
+.L019outer1x:
+ vmovdqa 80(%eax),%xmm3
+ vmovdqa (%esp),%xmm0
+ vmovdqa 16(%esp),%xmm1
+ vmovdqa 32(%esp),%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ movl $10,%edx
+ vmovdqa %xmm3,48(%esp)
+ jmp .L018loop1x
+.align 16
+.L018loop1x:
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $57,%xmm1,%xmm1
+ vpshufd $147,%xmm3,%xmm3
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $147,%xmm1,%xmm1
+ vpshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L018loop1x
+ vpaddd (%esp),%xmm0,%xmm0
+ vpaddd 16(%esp),%xmm1,%xmm1
+ vpaddd 32(%esp),%xmm2,%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ cmpl $64,%ecx
+ jb .L020tail
+ vpxor (%esi),%xmm0,%xmm0
+ vpxor 16(%esi),%xmm1,%xmm1
+ vpxor 32(%esi),%xmm2,%xmm2
+ vpxor 48(%esi),%xmm3,%xmm3
+ leal 64(%esi),%esi
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L019outer1x
+ jmp .L017done
+.L020tail:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+.L021tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz .L021tail_loop
+.L017done:
+ vzeroupper
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
+.comm OPENSSL_ia32cap_P,16,4
#else
.text
.globl ChaCha20_ctr32
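
In the SSSE3 ChaCha20 loops above, the .byte 102,15,56,0,... sequences encode pshufb against the two byte-shuffle masks at .Lssse3_data, so the quarter-round's byte-aligned rotations by 16 and 8 become single shuffles, while the rotations by 12 and 7 remain psrld/pslld/por pairs; the .byte 143,232,120,194,... sequences in the XOP variant encode vprotd, which rotates in one instruction. Each .L009loop1x iteration performs one column round and one diagonal round on all four columns at once, re-lining the state with pshufd in between; ten iterations give the full 20 rounds. For reference, a hedged C sketch of the scalar quarter-round those loops vectorize (not part of the commit):

/* The ChaCha20 quarter-round.  Rotations by 16 and 8 are byte-aligned,
 * which is why the SSSE3 code can do them with one pshufb each. */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rotl32(*d, 16);
	*c += *d; *b ^= *c; *b = rotl32(*b, 12);
	*a += *b; *d ^= *a; *d = rotl32(*d, 8);
	*c += *d; *b ^= *c; *b = rotl32(*b, 7);
}
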
@@ -383,6 +1023,16 @@ ChaCha20_ctr32:
xorl %eax,%eax
cmpl 28(%esp),%eax
je .L000no_data
+ call .Lpic_point
+.Lpic_point:
+ popl %eax
+ leal OPENSSL_ia32cap_P,%ebp
+ testl $16777216,(%ebp)
+ jz .L001x86
+ testl $512,4(%ebp)
+ jz .L001x86
+ jmp .Lssse3_shortcut
+.L001x86:
movl 32(%esp),%esi
movl 36(%esp),%edi
subl $132,%esp
@@ -411,13 +1061,13 @@ ChaCha20_ctr32:
movl %ebx,116(%esp)
movl %ecx,120(%esp)
movl %edx,124(%esp)
- jmp .L001entry
+ jmp .L002entry
.align 16
-.L002outer_loop:
+.L003outer_loop:
movl %ebx,156(%esp)
movl %eax,152(%esp)
movl %ecx,160(%esp)
-.L001entry:
+.L002entry:
movl $1634760805,%eax
movl $857760878,4(%esp)
movl $2036477234,8(%esp)
@@ -445,9 +1095,9 @@ ChaCha20_ctr32:
movl %edi,60(%esp)
movl %edx,112(%esp)
movl $10,%ebx
- jmp .L003loop
+ jmp .L004loop
.align 16
-.L003loop:
+.L004loop:
addl %ebp,%eax
movl %ebx,128(%esp)
movl %ebp,%ebx
@@ -601,14 +1251,14 @@ ChaCha20_ctr32:
xorl %esi,%ebp
roll $7,%ebp
decl %ebx
- jnz .L003loop
+ jnz .L004loop
movl 160(%esp),%ebx
addl $1634760805,%eax
addl 80(%esp),%ebp
addl 96(%esp),%ecx
addl 100(%esp),%esi
cmpl $64,%ebx
- jb .L004tail
+ jb .L005tail
movl 156(%esp),%ebx
addl 112(%esp),%edx
addl 120(%esp),%edi
@@ -671,9 +1321,9 @@ ChaCha20_ctr32:
movl %ebp,(%eax)
leal 64(%eax),%eax
subl $64,%ecx
- jnz .L002outer_loop
- jmp .L005done
-.L004tail:
+ jnz .L003outer_loop
+ jmp .L006done
+.L005tail:
addl 112(%esp),%edx
addl 120(%esp),%edi
movl %eax,(%esp)
@@ -717,15 +1367,15 @@ ChaCha20_ctr32:
movl %edi,60(%esp)
xorl %eax,%eax
xorl %edx,%edx
-.L006tail_loop:
+.L007tail_loop:
movb (%esi,%ebp,1),%al
movb (%esp,%esi,1),%dl
leal 1(%esi),%esi
xorb %dl,%al
movb %al,-1(%ecx,%esi,1)
decl %ebx
- jnz .L006tail_loop
-.L005done:
+ jnz .L007tail_loop
+.L006done:
addl $132,%esp
.L000no_data:
popl %edi
@@ -734,8 +1384,638 @@ ChaCha20_ctr32:
popl %ebp
ret
.size ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin
+.globl ChaCha20_ssse3
+.type ChaCha20_ssse3,@function
+.align 16
+ChaCha20_ssse3:
+.L_ChaCha20_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lssse3_shortcut:
+ testl $2048,4(%ebp)
+ jnz .Lxop_shortcut
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal .Lssse3_data-.Lpic_point(%eax),%eax
+ movdqu (%ebx),%xmm3
+.L0081x:
+ movdqa 32(%eax),%xmm0
+ movdqu (%edx),%xmm1
+ movdqu 16(%edx),%xmm2
+ movdqa (%eax),%xmm6
+ movdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp .L009loop1x
+.align 16
+.L010outer1x:
+ movdqa 80(%eax),%xmm3
+ movdqa (%esp),%xmm0
+ movdqa 16(%esp),%xmm1
+ movdqa 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ movl $10,%edx
+ movdqa %xmm3,48(%esp)
+ jmp .L009loop1x
+.align 16
+.L009loop1x:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L009loop1x
+ paddd (%esp),%xmm0
+ paddd 16(%esp),%xmm1
+ paddd 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ cmpl $64,%ecx
+ jb .L011tail
+ movdqu (%esi),%xmm4
+ movdqu 16(%esi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%esi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%esi),%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+ leal 64(%esi),%esi
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L010outer1x
+ jmp .L012done
+.L011tail:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+.L013tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz .L013tail_loop
+.L012done:
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin
+.align 64
+.Lssse3_data:
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long 1634760805,857760878,2036477234,1797285236
+.long 0,1,2,3
+.long 4,4,4,4
+.long 1,0,0,0
+.long 4,0,0,0
+.long 0,-1,-1,-1
+.align 64
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
+.globl ChaCha20_xop
+.type ChaCha20_xop,@function
+.align 16
+ChaCha20_xop:
+.L_ChaCha20_xop_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lxop_shortcut:
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ vzeroupper
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal .Lssse3_data-.Lpic_point(%eax),%eax
+ vmovdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb .L0141x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ vmovdqu (%edx),%xmm7
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpaddd 48(%eax),%xmm0,%xmm0
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpsubd 64(%eax),%xmm0,%xmm0
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%ebp)
+ vmovdqa %xmm1,80(%ebp)
+ vmovdqa %xmm2,96(%ebp)
+ vmovdqa %xmm3,112(%ebp)
+ vmovdqu 16(%edx),%xmm3
+ vmovdqa %xmm4,-64(%ebp)
+ vmovdqa %xmm5,-48(%ebp)
+ vmovdqa %xmm6,-32(%ebp)
+ vmovdqa %xmm7,-16(%ebp)
+ vmovdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,(%ebp)
+ vmovdqa %xmm1,16(%ebp)
+ vmovdqa %xmm2,32(%ebp)
+ vmovdqa %xmm3,48(%ebp)
+ vmovdqa %xmm4,-128(%ebp)
+ vmovdqa %xmm5,-112(%ebp)
+ vmovdqa %xmm6,-96(%ebp)
+ vmovdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp .L015outer_loop
+.align 32
+.L015outer_loop:
+ vmovdqa -112(%ebp),%xmm1
+ vmovdqa -96(%ebp),%xmm2
+ vmovdqa -80(%ebp),%xmm3
+ vmovdqa -48(%ebp),%xmm5
+ vmovdqa -32(%ebp),%xmm6
+ vmovdqa -16(%ebp),%xmm7
+ vmovdqa %xmm1,-112(%ebx)
+ vmovdqa %xmm2,-96(%ebx)
+ vmovdqa %xmm3,-80(%ebx)
+ vmovdqa %xmm5,-48(%ebx)
+ vmovdqa %xmm6,-32(%ebx)
+ vmovdqa %xmm7,-16(%ebx)
+ vmovdqa 32(%ebp),%xmm2
+ vmovdqa 48(%ebp),%xmm3
+ vmovdqa 64(%ebp),%xmm4
+ vmovdqa 80(%ebp),%xmm5
+ vmovdqa 96(%ebp),%xmm6
+ vmovdqa 112(%ebp),%xmm7
+ vpaddd 64(%eax),%xmm4,%xmm4
+ vmovdqa %xmm2,32(%ebx)
+ vmovdqa %xmm3,48(%ebx)
+ vmovdqa %xmm4,64(%ebx)
+ vmovdqa %xmm5,80(%ebx)
+ vmovdqa %xmm6,96(%ebx)
+ vmovdqa %xmm7,112(%ebx)
+ vmovdqa %xmm4,64(%ebp)
+ vmovdqa -128(%ebp),%xmm0
+ vmovdqa %xmm4,%xmm6
+ vmovdqa -64(%ebp),%xmm3
+ vmovdqa (%ebp),%xmm4
+ vmovdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 32
+.L016loop:
+ vpaddd %xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,246,16
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -48(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 80(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,64(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-64(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa 32(%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -32(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 96(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,80(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,16(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-48(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 48(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -16(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 112(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,96(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-32(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -48(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm7,%xmm6
+.byte 143,232,120,194,219,7
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-16(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -32(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 64(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,112(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+ vmovdqa %xmm4,32(%ebx)
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-48(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa (%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vmovdqa -16(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 80(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,64(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7
+ vmovdqa %xmm5,48(%ebx)
+.byte 143,232,120,194,246,16
+ vmovdqa %xmm3,-32(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 16(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12
+ vmovdqa -64(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 96(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,80(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,255,16
+ vmovdqa %xmm2,-16(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,255,8
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,96(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+.byte 143,232,120,194,219,7
+ decl %edx
+ jnz .L016loop
+ vmovdqa %xmm3,-64(%ebx)
+ vmovdqa %xmm4,(%ebx)
+ vmovdqa %xmm5,16(%ebx)
+ vmovdqa %xmm6,64(%ebx)
+ vmovdqa %xmm7,96(%ebx)
+ vmovdqa -112(%ebx),%xmm1
+ vmovdqa -96(%ebx),%xmm2
+ vmovdqa -80(%ebx),%xmm3
+ vpaddd -128(%ebp),%xmm0,%xmm0
+ vpaddd -112(%ebp),%xmm1,%xmm1
+ vpaddd -96(%ebp),%xmm2,%xmm2
+ vpaddd -80(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa -64(%ebx),%xmm0
+ vmovdqa -48(%ebx),%xmm1
+ vmovdqa -32(%ebx),%xmm2
+ vmovdqa -16(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd -64(%ebp),%xmm0,%xmm0
+ vpaddd -48(%ebp),%xmm1,%xmm1
+ vpaddd -32(%ebp),%xmm2,%xmm2
+ vpaddd -16(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa (%ebx),%xmm0
+ vmovdqa 16(%ebx),%xmm1
+ vmovdqa 32(%ebx),%xmm2
+ vmovdqa 48(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd (%ebp),%xmm0,%xmm0
+ vpaddd 16(%ebp),%xmm1,%xmm1
+ vpaddd 32(%ebp),%xmm2,%xmm2
+ vpaddd 48(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa 64(%ebx),%xmm0
+ vmovdqa 80(%ebx),%xmm1
+ vmovdqa 96(%ebx),%xmm2
+ vmovdqa 112(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd 64(%ebp),%xmm0,%xmm0
+ vpaddd 80(%ebp),%xmm1,%xmm1
+ vpaddd 96(%ebp),%xmm2,%xmm2
+ vpaddd 112(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 208(%esi),%esi
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc .L015outer_loop
+ addl $256,%ecx
+ jz .L017done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ vmovd 64(%ebp),%xmm2
+ vmovdqu (%ebx),%xmm3
+ vpaddd 96(%eax),%xmm2,%xmm2
+ vpand 112(%eax),%xmm3,%xmm3
+ vpor %xmm2,%xmm3,%xmm3
+.L0141x:
+ vmovdqa 32(%eax),%xmm0
+ vmovdqu (%edx),%xmm1
+ vmovdqu 16(%edx),%xmm2
+ vmovdqa (%eax),%xmm6
+ vmovdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp .L018loop1x
+.align 16
+.L019outer1x:
+ vmovdqa 80(%eax),%xmm3
+ vmovdqa (%esp),%xmm0
+ vmovdqa 16(%esp),%xmm1
+ vmovdqa 32(%esp),%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ movl $10,%edx
+ vmovdqa %xmm3,48(%esp)
+ jmp .L018loop1x
+.align 16
+.L018loop1x:
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $57,%xmm1,%xmm1
+ vpshufd $147,%xmm3,%xmm3
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7
+ vpshufd $78,%xmm2,%xmm2
+ vpshufd $147,%xmm1,%xmm1
+ vpshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L018loop1x
+ vpaddd (%esp),%xmm0,%xmm0
+ vpaddd 16(%esp),%xmm1,%xmm1
+ vpaddd 32(%esp),%xmm2,%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ cmpl $64,%ecx
+ jb .L020tail
+ vpxor (%esi),%xmm0,%xmm0
+ vpxor 16(%esi),%xmm1,%xmm1
+ vpxor 32(%esi),%xmm2,%xmm2
+ vpxor 48(%esi),%xmm3,%xmm3
+ leal 64(%esi),%esi
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L019outer1x
+ jmp .L017done
+.L020tail:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+.L021tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz .L021tail_loop
+.L017done:
+ vzeroupper
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
+.comm OPENSSL_ia32cap_P,16,4
#endif
diff --git a/secure/lib/libcrypto/i386/ecp_nistz256-x86.S b/secure/lib/libcrypto/i386/ecp_nistz256-x86.S
index 1cd2d7958e94..7d0c1b9eb9df 100644
--- a/secure/lib/libcrypto/i386/ecp_nistz256-x86.S
+++ b/secure/lib/libcrypto/i386/ecp_nistz256-x86.S
@@ -2746,6 +2746,8 @@ ecp_nistz256_to_mont:
call _picup_eax
.L000pic:
leal .LRR-.L000pic(%eax),%ebp
+ leal OPENSSL_ia32cap_P-.L000pic(%eax),%eax
+ movl (%eax),%eax
movl 20(%esp),%edi
call _ecp_nistz256_mul_mont
popl %edi
@@ -2767,6 +2769,8 @@ ecp_nistz256_from_mont:
call _picup_eax
.L001pic:
leal .LONE-.L001pic(%eax),%ebp
+ leal OPENSSL_ia32cap_P-.L001pic(%eax),%eax
+ movl (%eax),%eax
movl 20(%esp),%edi
call _ecp_nistz256_mul_mont
popl %edi
@@ -2786,6 +2790,10 @@ ecp_nistz256_mul_mont:
pushl %edi
movl 24(%esp),%esi
movl 28(%esp),%ebp
+ call _picup_eax
+.L002pic:
+ leal OPENSSL_ia32cap_P-.L002pic(%eax),%eax
+ movl (%eax),%eax
movl 20(%esp),%edi
call _ecp_nistz256_mul_mont
popl %edi
@@ -2804,6 +2812,10 @@ ecp_nistz256_sqr_mont:
pushl %esi
pushl %edi
movl 24(%esp),%esi
+ call _picup_eax
+.L003pic:
+ leal OPENSSL_ia32cap_P-.L003pic(%eax),%eax
+ movl (%eax),%eax
movl 20(%esp),%edi
movl %esi,%ebp
call _ecp_nistz256_mul_mont
@@ -2816,6 +2828,265 @@ ecp_nistz256_sqr_mont:
.type _ecp_nistz256_mul_mont,@function
.align 16
_ecp_nistz256_mul_mont:
+ andl $83886080,%eax
+ cmpl $83886080,%eax
+ jne .L004mul_mont_ialu
+ movl %esp,%edx
+ subl $256,%esp
+ movd (%ebp),%xmm7
+ leal 4(%ebp),%ebp
+ pcmpeqd %xmm6,%xmm6
+ psrlq $48,%xmm6
+ pshuflw $220,%xmm7,%xmm7
+ andl $-64,%esp
+ pshufd $220,%xmm7,%xmm7
+ leal 128(%esp),%ebx
+ movd (%esi),%xmm0
+ pshufd $204,%xmm0,%xmm0
+ movd 4(%esi),%xmm1
+ movdqa %xmm0,(%ebx)
+ pmuludq %xmm7,%xmm0
+ movd 8(%esi),%xmm2
+ pshufd $204,%xmm1,%xmm1
+ movdqa %xmm1,16(%ebx)
+ pmuludq %xmm7,%xmm1
+ movq %xmm0,%xmm4
+ pslldq $6,%xmm4
+ paddq %xmm0,%xmm4
+ movdqa %xmm4,%xmm5
+ psrldq $10,%xmm4
+ pand %xmm6,%xmm5
+ movd 12(%esi),%xmm3
+ pshufd $204,%xmm2,%xmm2
+ movdqa %xmm2,32(%ebx)
+ pmuludq %xmm7,%xmm2
+ paddq %xmm4,%xmm1
+ movdqa %xmm1,(%esp)
+ movd 16(%esi),%xmm0
+ pshufd $204,%xmm3,%xmm3
+ movdqa %xmm3,48(%ebx)
+ pmuludq %xmm7,%xmm3
+ movdqa %xmm2,16(%esp)
+ movd 20(%esi),%xmm1
+ pshufd $204,%xmm0,%xmm0
+ movdqa %xmm0,64(%ebx)
+ pmuludq %xmm7,%xmm0
+ paddq %xmm5,%xmm3
+ movdqa %xmm3,32(%esp)
+ movd 24(%esi),%xmm2
+ pshufd $204,%xmm1,%xmm1
+ movdqa %xmm1,80(%ebx)
+ pmuludq %xmm7,%xmm1
+ movdqa %xmm0,48(%esp)
+ pshufd $177,%xmm5,%xmm4
+ movd 28(%esi),%xmm3
+ pshufd $204,%xmm2,%xmm2
+ movdqa %xmm2,96(%ebx)
+ pmuludq %xmm7,%xmm2
+ movdqa %xmm1,64(%esp)
+ psubq %xmm5,%xmm4
+ movd (%ebp),%xmm0
+ pshufd $204,%xmm3,%xmm3
+ movdqa %xmm3,112(%ebx)
+ pmuludq %xmm7,%xmm3
+ pshuflw $220,%xmm0,%xmm7
+ movdqa (%ebx),%xmm0
+ pshufd $220,%xmm7,%xmm7
+ movl $6,%ecx
+ leal 4(%ebp),%ebp
+ jmp .L005madd_sse2
+.align 16
+.L005madd_sse2:
+ paddq %xmm5,%xmm2
+ paddq %xmm4,%xmm3
+ movdqa 16(%ebx),%xmm1
+ pmuludq %xmm7,%xmm0
+ movdqa %xmm2,80(%esp)
+ movdqa 32(%ebx),%xmm2
+ pmuludq %xmm7,%xmm1
+ movdqa %xmm3,96(%esp)
+ paddq (%esp),%xmm0
+ movdqa 48(%ebx),%xmm3
+ pmuludq %xmm7,%xmm2
+ movq %xmm0,%xmm4
+ pslldq $6,%xmm4
+ paddq 16(%esp),%xmm1
+ paddq %xmm0,%xmm4
+ movdqa %xmm4,%xmm5
+ psrldq $10,%xmm4
+ movdqa 64(%ebx),%xmm0
+ pmuludq %xmm7,%xmm3
+ paddq %xmm4,%xmm1
+ paddq 32(%esp),%xmm2
+ movdqa %xmm1,(%esp)
+ movdqa 80(%ebx),%xmm1
+ pmuludq %xmm7,%xmm0
+ paddq 48(%esp),%xmm3
+ movdqa %xmm2,16(%esp)
+ pand %xmm6,%xmm5
+ movdqa 96(%ebx),%xmm2
+ pmuludq %xmm7,%xmm1
+ paddq %xmm5,%xmm3
+ paddq 64(%esp),%xmm0
+ movdqa %xmm3,32(%esp)
+ pshufd $177,%xmm5,%xmm4
+ movdqa %xmm7,%xmm3
+ pmuludq %xmm7,%xmm2
+ movd (%ebp),%xmm7
+ leal 4(%ebp),%ebp
+ paddq 80(%esp),%xmm1
+ psubq %xmm5,%xmm4
+ movdqa %xmm0,48(%esp)
+ pshuflw $220,%xmm7,%xmm7
+ pmuludq 112(%ebx),%xmm3
+ pshufd $220,%xmm7,%xmm7
+ movdqa (%ebx),%xmm0
+ movdqa %xmm1,64(%esp)
+ paddq 96(%esp),%xmm2
+ decl %ecx
+ jnz .L005madd_sse2
+ paddq %xmm5,%xmm2
+ paddq %xmm4,%xmm3
+ movdqa 16(%ebx),%xmm1
+ pmuludq %xmm7,%xmm0
+ movdqa %xmm2,80(%esp)
+ movdqa 32(%ebx),%xmm2
+ pmuludq %xmm7,%xmm1
+ movdqa %xmm3,96(%esp)
+ paddq (%esp),%xmm0
+ movdqa 48(%ebx),%xmm3
+ pmuludq %xmm7,%xmm2
+ movq %xmm0,%xmm4
+ pslldq $6,%xmm4
+ paddq 16(%esp),%xmm1
+ paddq %xmm0,%xmm4
+ movdqa %xmm4,%xmm5
+ psrldq $10,%xmm4
+ movdqa 64(%ebx),%xmm0
+ pmuludq %xmm7,%xmm3
+ paddq %xmm4,%xmm1
+ paddq 32(%esp),%xmm2
+ movdqa %xmm1,(%esp)
+ movdqa 80(%ebx),%xmm1
+ pmuludq %xmm7,%xmm0
+ paddq 48(%esp),%xmm3
+ movdqa %xmm2,16(%esp)
+ pand %xmm6,%xmm5
+ movdqa 96(%ebx),%xmm2
+ pmuludq %xmm7,%xmm1
+ paddq %xmm5,%xmm3
+ paddq 64(%esp),%xmm0
+ movdqa %xmm3,32(%esp)
+ pshufd $177,%xmm5,%xmm4
+ movdqa 112(%ebx),%xmm3
+ pmuludq %xmm7,%xmm2
+ paddq 80(%esp),%xmm1
+ psubq %xmm5,%xmm4
+ movdqa %xmm0,48(%esp)
+ pmuludq %xmm7,%xmm3
+ pcmpeqd %xmm7,%xmm7
+ movdqa (%esp),%xmm0
+ pslldq $8,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq 96(%esp),%xmm2
+ paddq %xmm5,%xmm2
+ paddq %xmm4,%xmm3
+ movdqa %xmm2,80(%esp)
+ movdqa %xmm3,96(%esp)
+ movdqa 16(%esp),%xmm1
+ movdqa 32(%esp),%xmm2
+ movdqa 48(%esp),%xmm3
+ movq %xmm0,%xmm4
+ pand %xmm7,%xmm0
+ xorl %ebp,%ebp
+ pslldq $6,%xmm4
+ movq %xmm1,%xmm5
+ paddq %xmm4,%xmm0
+ pand %xmm7,%xmm1
+ psrldq $6,%xmm0
+ movd %xmm0,%eax
+ psrldq $4,%xmm0
+ paddq %xmm0,%xmm5
+ movdqa 64(%esp),%xmm0
+ subl $-1,%eax
+ pslldq $6,%xmm5
+ movq %xmm2,%xmm4
+ paddq %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ psrldq $6,%xmm1
+ movl %eax,(%edi)
+ movd %xmm1,%eax
+ psrldq $4,%xmm1
+ paddq %xmm1,%xmm4
+ movdqa 80(%esp),%xmm1
+ sbbl $-1,%eax
+ pslldq $6,%xmm4
+ movq %xmm3,%xmm5
+ paddq %xmm4,%xmm2
+ pand %xmm7,%xmm3
+ psrldq $6,%xmm2
+ movl %eax,4(%edi)
+ movd %xmm2,%eax
+ psrldq $4,%xmm2
+ paddq %xmm2,%xmm5
+ movdqa 96(%esp),%xmm2
+ sbbl $-1,%eax
+ pslldq $6,%xmm5
+ movq %xmm0,%xmm4
+ paddq %xmm5,%xmm3
+ pand %xmm7,%xmm0
+ psrldq $6,%xmm3
+ movl %eax,8(%edi)
+ movd %xmm3,%eax
+ psrldq $4,%xmm3
+ paddq %xmm3,%xmm4
+ sbbl $0,%eax
+ pslldq $6,%xmm4
+ movq %xmm1,%xmm5
+ paddq %xmm4,%xmm0
+ pand %xmm7,%xmm1
+ psrldq $6,%xmm0
+ movl %eax,12(%edi)
+ movd %xmm0,%eax
+ psrldq $4,%xmm0
+ paddq %xmm0,%xmm5
+ sbbl $0,%eax
+ pslldq $6,%xmm5
+ movq %xmm2,%xmm4
+ paddq %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ psrldq $6,%xmm1
+ movd %xmm1,%ebx
+ psrldq $4,%xmm1
+ movl %edx,%esp
+ paddq %xmm1,%xmm4
+ pslldq $6,%xmm4
+ paddq %xmm4,%xmm2
+ psrldq $6,%xmm2
+ movd %xmm2,%ecx
+ psrldq $4,%xmm2
+ sbbl $0,%ebx
+ movd %xmm2,%edx
+ pextrw $2,%xmm2,%esi
+ sbbl $1,%ecx
+ sbbl $-1,%edx
+ sbbl $0,%esi
+ subl %esi,%ebp
+ addl %esi,(%edi)
+ adcl %esi,4(%edi)
+ adcl %esi,8(%edi)
+ adcl $0,12(%edi)
+ adcl $0,%eax
+ adcl $0,%ebx
+ movl %eax,16(%edi)
+ adcl %ebp,%ecx
+ movl %ebx,20(%edi)
+ adcl %esi,%edx
+ movl %ecx,24(%edi)
+ movl %edx,28(%edi)
+ ret
+.align 16
+.L004mul_mont_ialu:
subl $40,%esp
movl (%esi),%eax
movl (%ebp),%ebx
@@ -3463,7 +3734,7 @@ ecp_nistz256_scatter_w5:
movl 28(%esp),%ebp
leal 124(%edi,%ebp,4),%edi
movl $6,%ebp
-.L002scatter_w5_loop:
+.L006scatter_w5_loop:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -3475,7 +3746,7 @@ ecp_nistz256_scatter_w5:
movl %edx,64(%edi)
leal 256(%edi),%edi
decl %ebp
- jnz .L002scatter_w5_loop
+ jnz .L006scatter_w5_loop
popl %edi
popl %esi
popl %ebx
@@ -3590,7 +3861,7 @@ ecp_nistz256_scatter_w7:
movl 28(%esp),%ebp
leal (%edi,%ebp,1),%edi
movl $16,%ebp
-.L003scatter_w7_loop:
+.L007scatter_w7_loop:
movl (%esi),%eax
leal 4(%esi),%esi
movb %al,(%edi)
@@ -3600,7 +3871,7 @@ ecp_nistz256_scatter_w7:
movb %ah,192(%edi)
leal 256(%edi),%edi
decl %ebp
- jnz .L003scatter_w7_loop
+ jnz .L007scatter_w7_loop
popl %edi
popl %esi
popl %ebx
@@ -3832,6 +4103,10 @@ ecp_nistz256_point_double:
pushl %edi
movl 24(%esp),%esi
subl $164,%esp
+ call _picup_eax
+.L008pic:
+ leal OPENSSL_ia32cap_P-.L008pic(%eax),%edx
+ movl (%edx),%ebp
.Lpoint_double_shortcut:
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -3954,6 +4229,10 @@ ecp_nistz256_point_add:
pushl %edi
movl 28(%esp),%esi
subl $596,%esp
+ call _picup_eax
+.L009pic:
+ leal OPENSSL_ia32cap_P-.L009pic(%eax),%edx
+ movl (%edx),%ebp
leal 192(%esp),%edi
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -4144,26 +4423,26 @@ ecp_nistz256_point_add:
orl 8(%edi),%eax
orl 12(%edi),%eax
.byte 62
- jnz .L004add_proceed
+ jnz .L010add_proceed
movl 576(%esp),%eax
andl 580(%esp),%eax
movl 584(%esp),%ebx
- jz .L004add_proceed
+ jz .L010add_proceed
testl %ebx,%ebx
- jz .L005add_double
+ jz .L011add_double
movl 616(%esp),%edi
xorl %eax,%eax
movl $24,%ecx
.byte 252,243,171
- jmp .L006add_done
+ jmp .L012add_done
.align 16
-.L005add_double:
+.L011add_double:
movl 620(%esp),%esi
movl 588(%esp),%ebp
addl $432,%esp
jmp .Lpoint_double_shortcut
.align 16
-.L004add_proceed:
+.L010add_proceed:
movl 588(%esp),%eax
leal 352(%esp),%esi
leal 352(%esp),%ebp
@@ -4448,7 +4727,7 @@ ecp_nistz256_point_add:
orl %ebx,%eax
orl %ecx,%eax
movl %eax,60(%edi)
-.L006add_done:
+.L012add_done:
addl $596,%esp
popl %edi
popl %esi
@@ -4467,6 +4746,10 @@ ecp_nistz256_point_add_affine:
pushl %edi
movl 24(%esp),%esi
subl $492,%esp
+ call _picup_eax
+.L013pic:
+ leal OPENSSL_ia32cap_P-.L013pic(%eax),%edx
+ movl (%edx),%ebp
leal 96(%esp),%edi
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -4884,6 +5167,7 @@ ecp_nistz256_point_add_affine:
popl %ebp
ret
.size ecp_nistz256_point_add_affine,.-.L_ecp_nistz256_point_add_affine_begin
+.comm OPENSSL_ia32cap_P,16,4
#else
.text
.globl ecp_nistz256_precomputed
@@ -7630,6 +7914,8 @@ ecp_nistz256_to_mont:
call _picup_eax
.L000pic:
leal .LRR-.L000pic(%eax),%ebp
+ leal OPENSSL_ia32cap_P,%eax
+ movl (%eax),%eax
movl 20(%esp),%edi
call _ecp_nistz256_mul_mont
popl %edi
@@ -7651,6 +7937,8 @@ ecp_nistz256_from_mont:
call _picup_eax
.L001pic:
leal .LONE-.L001pic(%eax),%ebp
+ leal OPENSSL_ia32cap_P,%eax
+ movl (%eax),%eax
movl 20(%esp),%edi
call _ecp_nistz256_mul_mont
popl %edi
@@ -7670,6 +7958,10 @@ ecp_nistz256_mul_mont:
pushl %edi
movl 24(%esp),%esi
movl 28(%esp),%ebp
+ call _picup_eax
+.L002pic:
+ leal OPENSSL_ia32cap_P,%eax
+ movl (%eax),%eax
movl 20(%esp),%edi
call _ecp_nistz256_mul_mont
popl %edi
@@ -7688,6 +7980,10 @@ ecp_nistz256_sqr_mont:
pushl %esi
pushl %edi
movl 24(%esp),%esi
+ call _picup_eax
+.L003pic:
+ leal OPENSSL_ia32cap_P,%eax
+ movl (%eax),%eax
movl 20(%esp),%edi
movl %esi,%ebp
call _ecp_nistz256_mul_mont
@@ -7700,6 +7996,265 @@ ecp_nistz256_sqr_mont:
.type _ecp_nistz256_mul_mont,@function
.align 16
_ecp_nistz256_mul_mont:
+ andl $83886080,%eax
+ cmpl $83886080,%eax
+ jne .L004mul_mont_ialu
+ movl %esp,%edx
+ subl $256,%esp
+ movd (%ebp),%xmm7
+ leal 4(%ebp),%ebp
+ pcmpeqd %xmm6,%xmm6
+ psrlq $48,%xmm6
+ pshuflw $220,%xmm7,%xmm7
+ andl $-64,%esp
+ pshufd $220,%xmm7,%xmm7
+ leal 128(%esp),%ebx
+ movd (%esi),%xmm0
+ pshufd $204,%xmm0,%xmm0
+ movd 4(%esi),%xmm1
+ movdqa %xmm0,(%ebx)
+ pmuludq %xmm7,%xmm0
+ movd 8(%esi),%xmm2
+ pshufd $204,%xmm1,%xmm1
+ movdqa %xmm1,16(%ebx)
+ pmuludq %xmm7,%xmm1
+ movq %xmm0,%xmm4
+ pslldq $6,%xmm4
+ paddq %xmm0,%xmm4
+ movdqa %xmm4,%xmm5
+ psrldq $10,%xmm4
+ pand %xmm6,%xmm5
+ movd 12(%esi),%xmm3
+ pshufd $204,%xmm2,%xmm2
+ movdqa %xmm2,32(%ebx)
+ pmuludq %xmm7,%xmm2
+ paddq %xmm4,%xmm1
+ movdqa %xmm1,(%esp)
+ movd 16(%esi),%xmm0
+ pshufd $204,%xmm3,%xmm3
+ movdqa %xmm3,48(%ebx)
+ pmuludq %xmm7,%xmm3
+ movdqa %xmm2,16(%esp)
+ movd 20(%esi),%xmm1
+ pshufd $204,%xmm0,%xmm0
+ movdqa %xmm0,64(%ebx)
+ pmuludq %xmm7,%xmm0
+ paddq %xmm5,%xmm3
+ movdqa %xmm3,32(%esp)
+ movd 24(%esi),%xmm2
+ pshufd $204,%xmm1,%xmm1
+ movdqa %xmm1,80(%ebx)
+ pmuludq %xmm7,%xmm1
+ movdqa %xmm0,48(%esp)
+ pshufd $177,%xmm5,%xmm4
+ movd 28(%esi),%xmm3
+ pshufd $204,%xmm2,%xmm2
+ movdqa %xmm2,96(%ebx)
+ pmuludq %xmm7,%xmm2
+ movdqa %xmm1,64(%esp)
+ psubq %xmm5,%xmm4
+ movd (%ebp),%xmm0
+ pshufd $204,%xmm3,%xmm3
+ movdqa %xmm3,112(%ebx)
+ pmuludq %xmm7,%xmm3
+ pshuflw $220,%xmm0,%xmm7
+ movdqa (%ebx),%xmm0
+ pshufd $220,%xmm7,%xmm7
+ movl $6,%ecx
+ leal 4(%ebp),%ebp
+ jmp .L005madd_sse2
+.align 16
+.L005madd_sse2:
+ paddq %xmm5,%xmm2
+ paddq %xmm4,%xmm3
+ movdqa 16(%ebx),%xmm1
+ pmuludq %xmm7,%xmm0
+ movdqa %xmm2,80(%esp)
+ movdqa 32(%ebx),%xmm2
+ pmuludq %xmm7,%xmm1
+ movdqa %xmm3,96(%esp)
+ paddq (%esp),%xmm0
+ movdqa 48(%ebx),%xmm3
+ pmuludq %xmm7,%xmm2
+ movq %xmm0,%xmm4
+ pslldq $6,%xmm4
+ paddq 16(%esp),%xmm1
+ paddq %xmm0,%xmm4
+ movdqa %xmm4,%xmm5
+ psrldq $10,%xmm4
+ movdqa 64(%ebx),%xmm0
+ pmuludq %xmm7,%xmm3
+ paddq %xmm4,%xmm1
+ paddq 32(%esp),%xmm2
+ movdqa %xmm1,(%esp)
+ movdqa 80(%ebx),%xmm1
+ pmuludq %xmm7,%xmm0
+ paddq 48(%esp),%xmm3
+ movdqa %xmm2,16(%esp)
+ pand %xmm6,%xmm5
+ movdqa 96(%ebx),%xmm2
+ pmuludq %xmm7,%xmm1
+ paddq %xmm5,%xmm3
+ paddq 64(%esp),%xmm0
+ movdqa %xmm3,32(%esp)
+ pshufd $177,%xmm5,%xmm4
+ movdqa %xmm7,%xmm3
+ pmuludq %xmm7,%xmm2
+ movd (%ebp),%xmm7
+ leal 4(%ebp),%ebp
+ paddq 80(%esp),%xmm1
+ psubq %xmm5,%xmm4
+ movdqa %xmm0,48(%esp)
+ pshuflw $220,%xmm7,%xmm7
+ pmuludq 112(%ebx),%xmm3
+ pshufd $220,%xmm7,%xmm7
+ movdqa (%ebx),%xmm0
+ movdqa %xmm1,64(%esp)
+ paddq 96(%esp),%xmm2
+ decl %ecx
+ jnz .L005madd_sse2
+ paddq %xmm5,%xmm2
+ paddq %xmm4,%xmm3
+ movdqa 16(%ebx),%xmm1
+ pmuludq %xmm7,%xmm0
+ movdqa %xmm2,80(%esp)
+ movdqa 32(%ebx),%xmm2
+ pmuludq %xmm7,%xmm1
+ movdqa %xmm3,96(%esp)
+ paddq (%esp),%xmm0
+ movdqa 48(%ebx),%xmm3
+ pmuludq %xmm7,%xmm2
+ movq %xmm0,%xmm4
+ pslldq $6,%xmm4
+ paddq 16(%esp),%xmm1
+ paddq %xmm0,%xmm4
+ movdqa %xmm4,%xmm5
+ psrldq $10,%xmm4
+ movdqa 64(%ebx),%xmm0
+ pmuludq %xmm7,%xmm3
+ paddq %xmm4,%xmm1
+ paddq 32(%esp),%xmm2
+ movdqa %xmm1,(%esp)
+ movdqa 80(%ebx),%xmm1
+ pmuludq %xmm7,%xmm0
+ paddq 48(%esp),%xmm3
+ movdqa %xmm2,16(%esp)
+ pand %xmm6,%xmm5
+ movdqa 96(%ebx),%xmm2
+ pmuludq %xmm7,%xmm1
+ paddq %xmm5,%xmm3
+ paddq 64(%esp),%xmm0
+ movdqa %xmm3,32(%esp)
+ pshufd $177,%xmm5,%xmm4
+ movdqa 112(%ebx),%xmm3
+ pmuludq %xmm7,%xmm2
+ paddq 80(%esp),%xmm1
+ psubq %xmm5,%xmm4
+ movdqa %xmm0,48(%esp)
+ pmuludq %xmm7,%xmm3
+ pcmpeqd %xmm7,%xmm7
+ movdqa (%esp),%xmm0
+ pslldq $8,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq 96(%esp),%xmm2
+ paddq %xmm5,%xmm2
+ paddq %xmm4,%xmm3
+ movdqa %xmm2,80(%esp)
+ movdqa %xmm3,96(%esp)
+ movdqa 16(%esp),%xmm1
+ movdqa 32(%esp),%xmm2
+ movdqa 48(%esp),%xmm3
+ movq %xmm0,%xmm4
+ pand %xmm7,%xmm0
+ xorl %ebp,%ebp
+ pslldq $6,%xmm4
+ movq %xmm1,%xmm5
+ paddq %xmm4,%xmm0
+ pand %xmm7,%xmm1
+ psrldq $6,%xmm0
+ movd %xmm0,%eax
+ psrldq $4,%xmm0
+ paddq %xmm0,%xmm5
+ movdqa 64(%esp),%xmm0
+ subl $-1,%eax
+ pslldq $6,%xmm5
+ movq %xmm2,%xmm4
+ paddq %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ psrldq $6,%xmm1
+ movl %eax,(%edi)
+ movd %xmm1,%eax
+ psrldq $4,%xmm1
+ paddq %xmm1,%xmm4
+ movdqa 80(%esp),%xmm1
+ sbbl $-1,%eax
+ pslldq $6,%xmm4
+ movq %xmm3,%xmm5
+ paddq %xmm4,%xmm2
+ pand %xmm7,%xmm3
+ psrldq $6,%xmm2
+ movl %eax,4(%edi)
+ movd %xmm2,%eax
+ psrldq $4,%xmm2
+ paddq %xmm2,%xmm5
+ movdqa 96(%esp),%xmm2
+ sbbl $-1,%eax
+ pslldq $6,%xmm5
+ movq %xmm0,%xmm4
+ paddq %xmm5,%xmm3
+ pand %xmm7,%xmm0
+ psrldq $6,%xmm3
+ movl %eax,8(%edi)
+ movd %xmm3,%eax
+ psrldq $4,%xmm3
+ paddq %xmm3,%xmm4
+ sbbl $0,%eax
+ pslldq $6,%xmm4
+ movq %xmm1,%xmm5
+ paddq %xmm4,%xmm0
+ pand %xmm7,%xmm1
+ psrldq $6,%xmm0
+ movl %eax,12(%edi)
+ movd %xmm0,%eax
+ psrldq $4,%xmm0
+ paddq %xmm0,%xmm5
+ sbbl $0,%eax
+ pslldq $6,%xmm5
+ movq %xmm2,%xmm4
+ paddq %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ psrldq $6,%xmm1
+ movd %xmm1,%ebx
+ psrldq $4,%xmm1
+ movl %edx,%esp
+ paddq %xmm1,%xmm4
+ pslldq $6,%xmm4
+ paddq %xmm4,%xmm2
+ psrldq $6,%xmm2
+ movd %xmm2,%ecx
+ psrldq $4,%xmm2
+ sbbl $0,%ebx
+ movd %xmm2,%edx
+ pextrw $2,%xmm2,%esi
+ sbbl $1,%ecx
+ sbbl $-1,%edx
+ sbbl $0,%esi
+ subl %esi,%ebp
+ addl %esi,(%edi)
+ adcl %esi,4(%edi)
+ adcl %esi,8(%edi)
+ adcl $0,12(%edi)
+ adcl $0,%eax
+ adcl $0,%ebx
+ movl %eax,16(%edi)
+ adcl %ebp,%ecx
+ movl %ebx,20(%edi)
+ adcl %esi,%edx
+ movl %ecx,24(%edi)
+ movl %edx,28(%edi)
+ ret
+.align 16
+.L004mul_mont_ialu:
subl $40,%esp
movl (%esi),%eax
movl (%ebp),%ebx
@@ -8347,7 +8902,7 @@ ecp_nistz256_scatter_w5:
movl 28(%esp),%ebp
leal 124(%edi,%ebp,4),%edi
movl $6,%ebp
-.L002scatter_w5_loop:
+.L006scatter_w5_loop:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -8359,7 +8914,7 @@ ecp_nistz256_scatter_w5:
movl %edx,64(%edi)
leal 256(%edi),%edi
decl %ebp
- jnz .L002scatter_w5_loop
+ jnz .L006scatter_w5_loop
popl %edi
popl %esi
popl %ebx
@@ -8474,7 +9029,7 @@ ecp_nistz256_scatter_w7:
movl 28(%esp),%ebp
leal (%edi,%ebp,1),%edi
movl $16,%ebp
-.L003scatter_w7_loop:
+.L007scatter_w7_loop:
movl (%esi),%eax
leal 4(%esi),%esi
movb %al,(%edi)
@@ -8484,7 +9039,7 @@ ecp_nistz256_scatter_w7:
movb %ah,192(%edi)
leal 256(%edi),%edi
decl %ebp
- jnz .L003scatter_w7_loop
+ jnz .L007scatter_w7_loop
popl %edi
popl %esi
popl %ebx
@@ -8716,6 +9271,10 @@ ecp_nistz256_point_double:
pushl %edi
movl 24(%esp),%esi
subl $164,%esp
+ call _picup_eax
+.L008pic:
+ leal OPENSSL_ia32cap_P,%edx
+ movl (%edx),%ebp
.Lpoint_double_shortcut:
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -8838,6 +9397,10 @@ ecp_nistz256_point_add:
pushl %edi
movl 28(%esp),%esi
subl $596,%esp
+ call _picup_eax
+.L009pic:
+ leal OPENSSL_ia32cap_P,%edx
+ movl (%edx),%ebp
leal 192(%esp),%edi
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -9028,26 +9591,26 @@ ecp_nistz256_point_add:
orl 8(%edi),%eax
orl 12(%edi),%eax
.byte 62
- jnz .L004add_proceed
+ jnz .L010add_proceed
movl 576(%esp),%eax
andl 580(%esp),%eax
movl 584(%esp),%ebx
- jz .L004add_proceed
+ jz .L010add_proceed
testl %ebx,%ebx
- jz .L005add_double
+ jz .L011add_double
movl 616(%esp),%edi
xorl %eax,%eax
movl $24,%ecx
.byte 252,243,171
- jmp .L006add_done
+ jmp .L012add_done
.align 16
-.L005add_double:
+.L011add_double:
movl 620(%esp),%esi
movl 588(%esp),%ebp
addl $432,%esp
jmp .Lpoint_double_shortcut
.align 16
-.L004add_proceed:
+.L010add_proceed:
movl 588(%esp),%eax
leal 352(%esp),%esi
leal 352(%esp),%ebp
@@ -9332,7 +9895,7 @@ ecp_nistz256_point_add:
orl %ebx,%eax
orl %ecx,%eax
movl %eax,60(%edi)
-.L006add_done:
+.L012add_done:
addl $596,%esp
popl %edi
popl %esi
@@ -9351,6 +9914,10 @@ ecp_nistz256_point_add_affine:
pushl %edi
movl 24(%esp),%esi
subl $492,%esp
+ call _picup_eax
+.L013pic:
+ leal OPENSSL_ia32cap_P,%edx
+ movl (%edx),%ebp
leal 96(%esp),%edi
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -9768,4 +10335,5 @@ ecp_nistz256_point_add_affine:
popl %ebp
ret
.size ecp_nistz256_point_add_affine,.-.L_ecp_nistz256_point_add_affine_begin
+.comm OPENSSL_ia32cap_P,16,4
#endif
diff --git a/secure/lib/libcrypto/i386/ghash-x86.S b/secure/lib/libcrypto/i386/ghash-x86.S
index 41aa0c4a6170..02b5a036f08a 100644
--- a/secure/lib/libcrypto/i386/ghash-x86.S
+++ b/secure/lib/libcrypto/i386/ghash-x86.S
@@ -205,418 +205,94 @@ gcm_ghash_4bit_x86:
popl %ebp
ret
.size gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
-.type _mmx_gmult_4bit_inner,@function
+.globl gcm_gmult_4bit_mmx
+.type gcm_gmult_4bit_mmx,@function
.align 16
-_mmx_gmult_4bit_inner:
+gcm_gmult_4bit_mmx:
+.L_gcm_gmult_4bit_mmx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ call .L005pic_point
+.L005pic_point:
+ popl %eax
+ leal .Lrem_4bit-.L005pic_point(%eax),%eax
+ movzbl 15(%edi),%ebx
xorl %ecx,%ecx
movl %ebx,%edx
movb %dl,%cl
+ movl $14,%ebp
shlb $4,%cl
andl $240,%edx
movq 8(%esi,%ecx,1),%mm0
movq (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 14(%edi),%cl
- psllq $60,%mm2
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 13(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 12(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 11(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
+ jmp .L006mmx_loop
+.align 16
+.L006mmx_loop:
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%edx,1),%mm0
- movb 10(%edi),%cl
+ movb (%edi,%ebp,1),%cl
psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
+ decl %ebp
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 9(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
pxor (%esi,%edx,1),%mm1
movl %ecx,%edx
- movd %mm0,%ebx
pxor %mm2,%mm0
+ js .L007mmx_break
shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 8(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 7(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%ecx,1),%mm0
psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 6(%edi),%cl
- psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 5(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
pxor %mm2,%mm0
+ jmp .L006mmx_loop
+.align 16
+.L007mmx_break:
shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 4(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 3(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%ecx,1),%mm0
psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 2(%edi),%cl
- psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
pxor %mm2,%mm0
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 1(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%edx,1),%mm0
- movb (%edi),%cl
psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
pxor (%esi,%edx,1),%mm1
- movd %mm0,%ebx
pxor %mm2,%mm0
- movl 4(%eax,%ebp,8),%edi
psrlq $32,%mm0
movd %mm1,%edx
psrlq $32,%mm1
movd %mm0,%ecx
movd %mm1,%ebp
- shll $4,%edi
bswap %ebx
bswap %edx
bswap %ecx
- xorl %edi,%ebp
bswap %ebp
- ret
-.size _mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner
-.globl gcm_gmult_4bit_mmx
-.type gcm_gmult_4bit_mmx,@function
-.align 16
-gcm_gmult_4bit_mmx:
-.L_gcm_gmult_4bit_mmx_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- call .L005pic_point
-.L005pic_point:
- popl %eax
- leal .Lrem_4bit-.L005pic_point(%eax),%eax
- movzbl 15(%edi),%ebx
- call _mmx_gmult_4bit_inner
- movl 20(%esp),%edi
emms
movl %ebx,12(%edi)
movl %edx,4(%edi)
@@ -637,59 +313,954 @@ gcm_ghash_4bit_mmx:
pushl %ebx
pushl %esi
pushl %edi
- movl 20(%esp),%ebp
- movl 24(%esp),%esi
- movl 28(%esp),%edi
- movl 32(%esp),%ecx
- call .L006pic_point
-.L006pic_point:
- popl %eax
- leal .Lrem_4bit-.L006pic_point(%eax),%eax
- addl %edi,%ecx
- movl %ecx,32(%esp)
- subl $20,%esp
- movl 12(%ebp),%ebx
- movl 4(%ebp),%edx
- movl 8(%ebp),%ecx
- movl (%ebp),%ebp
- jmp .L007mmx_outer_loop
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ call .L008pic_point
+.L008pic_point:
+ popl %esi
+ leal .Lrem_8bit-.L008pic_point(%esi),%esi
+ subl $544,%esp
+ andl $-64,%esp
+ subl $16,%esp
+ addl %ecx,%edx
+ movl %eax,544(%esp)
+ movl %edx,552(%esp)
+ movl %ebp,556(%esp)
+ addl $128,%ebx
+ leal 144(%esp),%edi
+ leal 400(%esp),%ebp
+ movl -120(%ebx),%edx
+ movq -120(%ebx),%mm0
+ movq -128(%ebx),%mm3
+ shll $4,%edx
+ movb %dl,(%esp)
+ movl -104(%ebx),%edx
+ movq -104(%ebx),%mm2
+ movq -112(%ebx),%mm5
+ movq %mm0,-128(%edi)
+ psrlq $4,%mm0
+ movq %mm3,(%edi)
+ movq %mm3,%mm7
+ psrlq $4,%mm3
+ shll $4,%edx
+ movb %dl,1(%esp)
+ movl -88(%ebx),%edx
+ movq -88(%ebx),%mm1
+ psllq $60,%mm7
+ movq -96(%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-120(%edi)
+ psrlq $4,%mm2
+ movq %mm5,8(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-128(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,(%ebp)
+ shll $4,%edx
+ movb %dl,2(%esp)
+ movl -72(%ebx),%edx
+ movq -72(%ebx),%mm0
+ psllq $60,%mm6
+ movq -80(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-112(%edi)
+ psrlq $4,%mm1
+ movq %mm4,16(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-120(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,8(%ebp)
+ shll $4,%edx
+ movb %dl,3(%esp)
+ movl -56(%ebx),%edx
+ movq -56(%ebx),%mm2
+ psllq $60,%mm7
+ movq -64(%ebx),%mm5
+ por %mm7,%mm1
+ movq %mm0,-104(%edi)
+ psrlq $4,%mm0
+ movq %mm3,24(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-112(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,16(%ebp)
+ shll $4,%edx
+ movb %dl,4(%esp)
+ movl -40(%ebx),%edx
+ movq -40(%ebx),%mm1
+ psllq $60,%mm6
+ movq -48(%ebx),%mm4
+ por %mm6,%mm0
+ movq %mm2,-96(%edi)
+ psrlq $4,%mm2
+ movq %mm5,32(%edi)
+ movq %mm5,%mm7
+ movq %mm0,-104(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,24(%ebp)
+ shll $4,%edx
+ movb %dl,5(%esp)
+ movl -24(%ebx),%edx
+ movq -24(%ebx),%mm0
+ psllq $60,%mm7
+ movq -32(%ebx),%mm3
+ por %mm7,%mm2
+ movq %mm1,-88(%edi)
+ psrlq $4,%mm1
+ movq %mm4,40(%edi)
+ movq %mm4,%mm6
+ movq %mm2,-96(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,32(%ebp)
+ shll $4,%edx
+ movb %dl,6(%esp)
+ movl -8(%ebx),%edx
+ movq -8(%ebx),%mm2
+ psllq $60,%mm6
+ movq -16(%ebx),%mm5
+ por %mm6,%mm1
+ movq %mm0,-80(%edi)
+ psrlq $4,%mm0
+ movq %mm3,48(%edi)
+ movq %mm3,%mm7
+ movq %mm1,-88(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,40(%ebp)
+ shll $4,%edx
+ movb %dl,7(%esp)
+ movl 8(%ebx),%edx
+ movq 8(%ebx),%mm1
+ psllq $60,%mm7
+ movq (%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-72(%edi)
+ psrlq $4,%mm2
+ movq %mm5,56(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-80(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,48(%ebp)
+ shll $4,%edx
+ movb %dl,8(%esp)
+ movl 24(%ebx),%edx
+ movq 24(%ebx),%mm0
+ psllq $60,%mm6
+ movq 16(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-64(%edi)
+ psrlq $4,%mm1
+ movq %mm4,64(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-72(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,56(%ebp)
+ shll $4,%edx
+ movb %dl,9(%esp)
+ movl 40(%ebx),%edx
+ movq 40(%ebx),%mm2
+ psllq $60,%mm7
+ movq 32(%ebx),%mm5
+ por %mm7,%mm1
+ movq %mm0,-56(%edi)
+ psrlq $4,%mm0
+ movq %mm3,72(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-64(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,64(%ebp)
+ shll $4,%edx
+ movb %dl,10(%esp)
+ movl 56(%ebx),%edx
+ movq 56(%ebx),%mm1
+ psllq $60,%mm6
+ movq 48(%ebx),%mm4
+ por %mm6,%mm0
+ movq %mm2,-48(%edi)
+ psrlq $4,%mm2
+ movq %mm5,80(%edi)
+ movq %mm5,%mm7
+ movq %mm0,-56(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,72(%ebp)
+ shll $4,%edx
+ movb %dl,11(%esp)
+ movl 72(%ebx),%edx
+ movq 72(%ebx),%mm0
+ psllq $60,%mm7
+ movq 64(%ebx),%mm3
+ por %mm7,%mm2
+ movq %mm1,-40(%edi)
+ psrlq $4,%mm1
+ movq %mm4,88(%edi)
+ movq %mm4,%mm6
+ movq %mm2,-48(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,80(%ebp)
+ shll $4,%edx
+ movb %dl,12(%esp)
+ movl 88(%ebx),%edx
+ movq 88(%ebx),%mm2
+ psllq $60,%mm6
+ movq 80(%ebx),%mm5
+ por %mm6,%mm1
+ movq %mm0,-32(%edi)
+ psrlq $4,%mm0
+ movq %mm3,96(%edi)
+ movq %mm3,%mm7
+ movq %mm1,-40(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,88(%ebp)
+ shll $4,%edx
+ movb %dl,13(%esp)
+ movl 104(%ebx),%edx
+ movq 104(%ebx),%mm1
+ psllq $60,%mm7
+ movq 96(%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-24(%edi)
+ psrlq $4,%mm2
+ movq %mm5,104(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-32(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,96(%ebp)
+ shll $4,%edx
+ movb %dl,14(%esp)
+ movl 120(%ebx),%edx
+ movq 120(%ebx),%mm0
+ psllq $60,%mm6
+ movq 112(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-16(%edi)
+ psrlq $4,%mm1
+ movq %mm4,112(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-24(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,104(%ebp)
+ shll $4,%edx
+ movb %dl,15(%esp)
+ psllq $60,%mm7
+ por %mm7,%mm1
+ movq %mm0,-8(%edi)
+ psrlq $4,%mm0
+ movq %mm3,120(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-16(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,112(%ebp)
+ psllq $60,%mm6
+ por %mm6,%mm0
+ movq %mm0,-8(%ebp)
+ movq %mm3,120(%ebp)
+ movq (%eax),%mm6
+ movl 8(%eax),%ebx
+ movl 12(%eax),%edx
.align 16
-.L007mmx_outer_loop:
- xorl 12(%edi),%ebx
- xorl 4(%edi),%edx
- xorl 8(%edi),%ecx
- xorl (%edi),%ebp
- movl %edi,48(%esp)
- movl %ebx,12(%esp)
- movl %edx,4(%esp)
- movl %ecx,8(%esp)
- movl %ebp,(%esp)
- movl %esp,%edi
- shrl $24,%ebx
- call _mmx_gmult_4bit_inner
- movl 48(%esp),%edi
- leal 16(%edi),%edi
- cmpl 52(%esp),%edi
- jb .L007mmx_outer_loop
- movl 40(%esp),%edi
+.L009outer:
+ xorl 12(%ecx),%edx
+ xorl 8(%ecx),%ebx
+ pxor (%ecx),%mm6
+ leal 16(%ecx),%ecx
+ movl %ebx,536(%esp)
+ movq %mm6,528(%esp)
+ movl %ecx,548(%esp)
+ xorl %eax,%eax
+ roll $8,%edx
+ movb %dl,%al
+ movl %eax,%ebp
+ andb $15,%al
+ shrl $4,%ebp
+ pxor %mm0,%mm0
+ roll $8,%edx
+ pxor %mm1,%mm1
+ pxor %mm2,%mm2
+ movq 16(%esp,%eax,8),%mm7
+ movq 144(%esp,%eax,8),%mm6
+ movb %dl,%al
+ movd %mm7,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ shrl $4,%edi
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 536(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 532(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 528(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 524(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ pxor 144(%esp,%eax,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ movzbl %bl,%ebx
+ pxor %mm2,%mm2
+ psllq $4,%mm1
+ movd %mm7,%ecx
+ psrlq $4,%mm7
+ movq %mm6,%mm3
+ psrlq $4,%mm6
+ shll $4,%ecx
+ pxor 16(%esp,%edi,8),%mm7
+ psllq $60,%mm3
+ movzbl %cl,%ecx
+ pxor %mm3,%mm7
+ pxor 144(%esp,%edi,8),%mm6
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor %mm1,%mm6
+ movd %mm7,%edx
+ pinsrw $3,(%esi,%ecx,2),%mm2
+ psllq $12,%mm0
+ pxor %mm0,%mm6
+ psrlq $32,%mm7
+ pxor %mm2,%mm6
+ movl 548(%esp),%ecx
+ movd %mm7,%ebx
+ movq %mm6,%mm3
+ psllw $8,%mm6
+ psrlw $8,%mm3
+ por %mm3,%mm6
+ bswap %edx
+ pshufw $27,%mm6,%mm6
+ bswap %ebx
+ cmpl 552(%esp),%ecx
+ jne .L009outer
+ movl 544(%esp),%eax
+ movl %edx,12(%eax)
+ movl %ebx,8(%eax)
+ movq %mm6,(%eax)
+ movl 556(%esp),%esp
emms
- movl %ebx,12(%edi)
- movl %edx,4(%edi)
- movl %ecx,8(%edi)
- movl %ebp,(%edi)
- addl $20,%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
+.globl gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ call .L010pic
+.L010pic:
+ popl %ecx
+ leal .Lbswap-.L010pic(%ecx),%ecx
+ movdqu (%eax),%xmm2
+ pshufd $78,%xmm2,%xmm2
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+ pand 16(%ecx),%xmm5
+ pxor %xmm5,%xmm2
+ movdqa %xmm2,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
@@ -899,418 +1470,94 @@ gcm_ghash_4bit_x86:
popl %ebp
ret
.size gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
-.type _mmx_gmult_4bit_inner,@function
+.globl gcm_gmult_4bit_mmx
+.type gcm_gmult_4bit_mmx,@function
.align 16
-_mmx_gmult_4bit_inner:
+gcm_gmult_4bit_mmx:
+.L_gcm_gmult_4bit_mmx_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ call .L005pic_point
+.L005pic_point:
+ popl %eax
+ leal .Lrem_4bit-.L005pic_point(%eax),%eax
+ movzbl 15(%edi),%ebx
xorl %ecx,%ecx
movl %ebx,%edx
movb %dl,%cl
+ movl $14,%ebp
shlb $4,%cl
andl $240,%edx
movq 8(%esi,%ecx,1),%mm0
movq (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 14(%edi),%cl
- psllq $60,%mm2
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 13(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 12(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 11(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
+ jmp .L006mmx_loop
+.align 16
+.L006mmx_loop:
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%edx,1),%mm0
- movb 10(%edi),%cl
+ movb (%edi,%ebp,1),%cl
psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
+ decl %ebp
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 9(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
pxor (%esi,%edx,1),%mm1
movl %ecx,%edx
- movd %mm0,%ebx
pxor %mm2,%mm0
+ js .L007mmx_break
shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 8(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 7(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%ecx,1),%mm0
psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 6(%edi),%cl
- psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 5(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
pxor %mm2,%mm0
+ jmp .L006mmx_loop
+.align 16
+.L007mmx_break:
shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 4(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 3(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%ecx,1),%mm0
psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 2(%edi),%cl
- psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
pxor %mm2,%mm0
psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- movb 1(%edi),%cl
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
- movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
movq %mm1,%mm2
psrlq $4,%mm1
pxor 8(%esi,%edx,1),%mm0
- movb (%edi),%cl
psllq $60,%mm2
pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
- pxor (%esi,%edx,1),%mm1
- movl %ecx,%edx
movd %mm0,%ebx
- pxor %mm2,%mm0
- shlb $4,%cl
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%ecx,1),%mm0
- psllq $60,%mm2
- andl $240,%edx
- pxor (%eax,%ebp,8),%mm1
- andl $15,%ebx
- pxor (%esi,%ecx,1),%mm1
- movd %mm0,%ebp
- pxor %mm2,%mm0
- psrlq $4,%mm0
- movq %mm1,%mm2
- psrlq $4,%mm1
- pxor 8(%esi,%edx,1),%mm0
- psllq $60,%mm2
- pxor (%eax,%ebx,8),%mm1
- andl $15,%ebp
pxor (%esi,%edx,1),%mm1
- movd %mm0,%ebx
pxor %mm2,%mm0
- movl 4(%eax,%ebp,8),%edi
psrlq $32,%mm0
movd %mm1,%edx
psrlq $32,%mm1
movd %mm0,%ecx
movd %mm1,%ebp
- shll $4,%edi
bswap %ebx
bswap %edx
bswap %ecx
- xorl %edi,%ebp
bswap %ebp
- ret
-.size _mmx_gmult_4bit_inner,.-_mmx_gmult_4bit_inner
-.globl gcm_gmult_4bit_mmx
-.type gcm_gmult_4bit_mmx,@function
-.align 16
-gcm_gmult_4bit_mmx:
-.L_gcm_gmult_4bit_mmx_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%edi
- movl 24(%esp),%esi
- call .L005pic_point
-.L005pic_point:
- popl %eax
- leal .Lrem_4bit-.L005pic_point(%eax),%eax
- movzbl 15(%edi),%ebx
- call _mmx_gmult_4bit_inner
- movl 20(%esp),%edi
emms
movl %ebx,12(%edi)
movl %edx,4(%edi)
@@ -1331,59 +1578,954 @@ gcm_ghash_4bit_mmx:
pushl %ebx
pushl %esi
pushl %edi
- movl 20(%esp),%ebp
- movl 24(%esp),%esi
- movl 28(%esp),%edi
- movl 32(%esp),%ecx
- call .L006pic_point
-.L006pic_point:
- popl %eax
- leal .Lrem_4bit-.L006pic_point(%eax),%eax
- addl %edi,%ecx
- movl %ecx,32(%esp)
- subl $20,%esp
- movl 12(%ebp),%ebx
- movl 4(%ebp),%edx
- movl 8(%ebp),%ecx
- movl (%ebp),%ebp
- jmp .L007mmx_outer_loop
+ movl 20(%esp),%eax
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ call .L008pic_point
+.L008pic_point:
+ popl %esi
+ leal .Lrem_8bit-.L008pic_point(%esi),%esi
+ subl $544,%esp
+ andl $-64,%esp
+ subl $16,%esp
+ addl %ecx,%edx
+ movl %eax,544(%esp)
+ movl %edx,552(%esp)
+ movl %ebp,556(%esp)
+ addl $128,%ebx
+ leal 144(%esp),%edi
+ leal 400(%esp),%ebp
+ movl -120(%ebx),%edx
+ movq -120(%ebx),%mm0
+ movq -128(%ebx),%mm3
+ shll $4,%edx
+ movb %dl,(%esp)
+ movl -104(%ebx),%edx
+ movq -104(%ebx),%mm2
+ movq -112(%ebx),%mm5
+ movq %mm0,-128(%edi)
+ psrlq $4,%mm0
+ movq %mm3,(%edi)
+ movq %mm3,%mm7
+ psrlq $4,%mm3
+ shll $4,%edx
+ movb %dl,1(%esp)
+ movl -88(%ebx),%edx
+ movq -88(%ebx),%mm1
+ psllq $60,%mm7
+ movq -96(%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-120(%edi)
+ psrlq $4,%mm2
+ movq %mm5,8(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-128(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,(%ebp)
+ shll $4,%edx
+ movb %dl,2(%esp)
+ movl -72(%ebx),%edx
+ movq -72(%ebx),%mm0
+ psllq $60,%mm6
+ movq -80(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-112(%edi)
+ psrlq $4,%mm1
+ movq %mm4,16(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-120(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,8(%ebp)
+ shll $4,%edx
+ movb %dl,3(%esp)
+ movl -56(%ebx),%edx
+ movq -56(%ebx),%mm2
+ psllq $60,%mm7
+ movq -64(%ebx),%mm5
+ por %mm7,%mm1
+ movq %mm0,-104(%edi)
+ psrlq $4,%mm0
+ movq %mm3,24(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-112(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,16(%ebp)
+ shll $4,%edx
+ movb %dl,4(%esp)
+ movl -40(%ebx),%edx
+ movq -40(%ebx),%mm1
+ psllq $60,%mm6
+ movq -48(%ebx),%mm4
+ por %mm6,%mm0
+ movq %mm2,-96(%edi)
+ psrlq $4,%mm2
+ movq %mm5,32(%edi)
+ movq %mm5,%mm7
+ movq %mm0,-104(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,24(%ebp)
+ shll $4,%edx
+ movb %dl,5(%esp)
+ movl -24(%ebx),%edx
+ movq -24(%ebx),%mm0
+ psllq $60,%mm7
+ movq -32(%ebx),%mm3
+ por %mm7,%mm2
+ movq %mm1,-88(%edi)
+ psrlq $4,%mm1
+ movq %mm4,40(%edi)
+ movq %mm4,%mm6
+ movq %mm2,-96(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,32(%ebp)
+ shll $4,%edx
+ movb %dl,6(%esp)
+ movl -8(%ebx),%edx
+ movq -8(%ebx),%mm2
+ psllq $60,%mm6
+ movq -16(%ebx),%mm5
+ por %mm6,%mm1
+ movq %mm0,-80(%edi)
+ psrlq $4,%mm0
+ movq %mm3,48(%edi)
+ movq %mm3,%mm7
+ movq %mm1,-88(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,40(%ebp)
+ shll $4,%edx
+ movb %dl,7(%esp)
+ movl 8(%ebx),%edx
+ movq 8(%ebx),%mm1
+ psllq $60,%mm7
+ movq (%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-72(%edi)
+ psrlq $4,%mm2
+ movq %mm5,56(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-80(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,48(%ebp)
+ shll $4,%edx
+ movb %dl,8(%esp)
+ movl 24(%ebx),%edx
+ movq 24(%ebx),%mm0
+ psllq $60,%mm6
+ movq 16(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-64(%edi)
+ psrlq $4,%mm1
+ movq %mm4,64(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-72(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,56(%ebp)
+ shll $4,%edx
+ movb %dl,9(%esp)
+ movl 40(%ebx),%edx
+ movq 40(%ebx),%mm2
+ psllq $60,%mm7
+ movq 32(%ebx),%mm5
+ por %mm7,%mm1
+ movq %mm0,-56(%edi)
+ psrlq $4,%mm0
+ movq %mm3,72(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-64(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,64(%ebp)
+ shll $4,%edx
+ movb %dl,10(%esp)
+ movl 56(%ebx),%edx
+ movq 56(%ebx),%mm1
+ psllq $60,%mm6
+ movq 48(%ebx),%mm4
+ por %mm6,%mm0
+ movq %mm2,-48(%edi)
+ psrlq $4,%mm2
+ movq %mm5,80(%edi)
+ movq %mm5,%mm7
+ movq %mm0,-56(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,72(%ebp)
+ shll $4,%edx
+ movb %dl,11(%esp)
+ movl 72(%ebx),%edx
+ movq 72(%ebx),%mm0
+ psllq $60,%mm7
+ movq 64(%ebx),%mm3
+ por %mm7,%mm2
+ movq %mm1,-40(%edi)
+ psrlq $4,%mm1
+ movq %mm4,88(%edi)
+ movq %mm4,%mm6
+ movq %mm2,-48(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,80(%ebp)
+ shll $4,%edx
+ movb %dl,12(%esp)
+ movl 88(%ebx),%edx
+ movq 88(%ebx),%mm2
+ psllq $60,%mm6
+ movq 80(%ebx),%mm5
+ por %mm6,%mm1
+ movq %mm0,-32(%edi)
+ psrlq $4,%mm0
+ movq %mm3,96(%edi)
+ movq %mm3,%mm7
+ movq %mm1,-40(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,88(%ebp)
+ shll $4,%edx
+ movb %dl,13(%esp)
+ movl 104(%ebx),%edx
+ movq 104(%ebx),%mm1
+ psllq $60,%mm7
+ movq 96(%ebx),%mm4
+ por %mm7,%mm0
+ movq %mm2,-24(%edi)
+ psrlq $4,%mm2
+ movq %mm5,104(%edi)
+ movq %mm5,%mm6
+ movq %mm0,-32(%ebp)
+ psrlq $4,%mm5
+ movq %mm3,96(%ebp)
+ shll $4,%edx
+ movb %dl,14(%esp)
+ movl 120(%ebx),%edx
+ movq 120(%ebx),%mm0
+ psllq $60,%mm6
+ movq 112(%ebx),%mm3
+ por %mm6,%mm2
+ movq %mm1,-16(%edi)
+ psrlq $4,%mm1
+ movq %mm4,112(%edi)
+ movq %mm4,%mm7
+ movq %mm2,-24(%ebp)
+ psrlq $4,%mm4
+ movq %mm5,104(%ebp)
+ shll $4,%edx
+ movb %dl,15(%esp)
+ psllq $60,%mm7
+ por %mm7,%mm1
+ movq %mm0,-8(%edi)
+ psrlq $4,%mm0
+ movq %mm3,120(%edi)
+ movq %mm3,%mm6
+ movq %mm1,-16(%ebp)
+ psrlq $4,%mm3
+ movq %mm4,112(%ebp)
+ psllq $60,%mm6
+ por %mm6,%mm0
+ movq %mm0,-8(%ebp)
+ movq %mm3,120(%ebp)
+ movq (%eax),%mm6
+ movl 8(%eax),%ebx
+ movl 12(%eax),%edx
.align 16
-.L007mmx_outer_loop:
- xorl 12(%edi),%ebx
- xorl 4(%edi),%edx
- xorl 8(%edi),%ecx
- xorl (%edi),%ebp
- movl %edi,48(%esp)
- movl %ebx,12(%esp)
- movl %edx,4(%esp)
- movl %ecx,8(%esp)
- movl %ebp,(%esp)
- movl %esp,%edi
- shrl $24,%ebx
- call _mmx_gmult_4bit_inner
- movl 48(%esp),%edi
- leal 16(%edi),%edi
- cmpl 52(%esp),%edi
- jb .L007mmx_outer_loop
- movl 40(%esp),%edi
+.L009outer:
+ xorl 12(%ecx),%edx
+ xorl 8(%ecx),%ebx
+ pxor (%ecx),%mm6
+ leal 16(%ecx),%ecx
+ movl %ebx,536(%esp)
+ movq %mm6,528(%esp)
+ movl %ecx,548(%esp)
+ xorl %eax,%eax
+ roll $8,%edx
+ movb %dl,%al
+ movl %eax,%ebp
+ andb $15,%al
+ shrl $4,%ebp
+ pxor %mm0,%mm0
+ roll $8,%edx
+ pxor %mm1,%mm1
+ pxor %mm2,%mm2
+ movq 16(%esp,%eax,8),%mm7
+ movq 144(%esp,%eax,8),%mm6
+ movb %dl,%al
+ movd %mm7,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ shrl $4,%edi
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 536(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 532(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 528(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm1,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm0
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ movb %dl,%al
+ movd %mm7,%ecx
+ movzbl %bl,%ebx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%ebp
+ psrlq $8,%mm6
+ pxor 272(%esp,%edi,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm0,%mm6
+ shrl $4,%ebp
+ pinsrw $2,(%esi,%ebx,2),%mm2
+ pxor 16(%esp,%eax,8),%mm7
+ roll $8,%edx
+ pxor 144(%esp,%eax,8),%mm6
+ pxor %mm3,%mm7
+ pxor 400(%esp,%edi,8),%mm6
+ xorb (%esp,%edi,1),%cl
+ movb %dl,%al
+ movl 524(%esp),%edx
+ movd %mm7,%ebx
+ movzbl %cl,%ecx
+ psrlq $8,%mm7
+ movq %mm6,%mm3
+ movl %eax,%edi
+ psrlq $8,%mm6
+ pxor 272(%esp,%ebp,8),%mm7
+ andb $15,%al
+ psllq $56,%mm3
+ pxor %mm2,%mm6
+ shrl $4,%edi
+ pinsrw $2,(%esi,%ecx,2),%mm1
+ pxor 16(%esp,%eax,8),%mm7
+ pxor 144(%esp,%eax,8),%mm6
+ xorb (%esp,%ebp,1),%bl
+ pxor %mm3,%mm7
+ pxor 400(%esp,%ebp,8),%mm6
+ movzbl %bl,%ebx
+ pxor %mm2,%mm2
+ psllq $4,%mm1
+ movd %mm7,%ecx
+ psrlq $4,%mm7
+ movq %mm6,%mm3
+ psrlq $4,%mm6
+ shll $4,%ecx
+ pxor 16(%esp,%edi,8),%mm7
+ psllq $60,%mm3
+ movzbl %cl,%ecx
+ pxor %mm3,%mm7
+ pxor 144(%esp,%edi,8),%mm6
+ pinsrw $2,(%esi,%ebx,2),%mm0
+ pxor %mm1,%mm6
+ movd %mm7,%edx
+ pinsrw $3,(%esi,%ecx,2),%mm2
+ psllq $12,%mm0
+ pxor %mm0,%mm6
+ psrlq $32,%mm7
+ pxor %mm2,%mm6
+ movl 548(%esp),%ecx
+ movd %mm7,%ebx
+ movq %mm6,%mm3
+ psllw $8,%mm6
+ psrlw $8,%mm3
+ por %mm3,%mm6
+ bswap %edx
+ pshufw $27,%mm6,%mm6
+ bswap %ebx
+ cmpl 552(%esp),%ecx
+ jne .L009outer
+ movl 544(%esp),%eax
+ movl %edx,12(%eax)
+ movl %ebx,8(%eax)
+ movq %mm6,(%eax)
+ movl 556(%esp),%esp
emms
- movl %ebx,12(%edi)
- movl %edx,4(%edi)
- movl %ecx,8(%edi)
- movl %ebp,(%edi)
- addl $20,%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size gcm_ghash_4bit_mmx,.-.L_gcm_ghash_4bit_mmx_begin
+.globl gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ call .L010pic
+.L010pic:
+ popl %ecx
+ leal .Lbswap-.L010pic(%ecx),%ecx
+ movdqu (%eax),%xmm2
+ pshufd $78,%xmm2,%xmm2
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+ pand 16(%ecx),%xmm5
+ pxor %xmm5,%xmm2
+ movdqa %xmm2,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,(%edx)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%edx)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%edx)
+ ret
+.size gcm_init_clmul,.-.L_gcm_init_clmul_begin
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.L_gcm_gmult_clmul_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ call .L011pic
+.L011pic:
+ popl %ecx
+ leal .Lbswap-.L011pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movups (%edx),%xmm2
+.byte 102,15,56,0,197
+ movups 32(%edx),%xmm4
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ ret
+.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
+.globl gcm_ghash_clmul
+.type gcm_ghash_clmul,@function
+.align 16
+gcm_ghash_clmul:
+.L_gcm_ghash_clmul_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%eax
+ movl 24(%esp),%edx
+ movl 28(%esp),%esi
+ movl 32(%esp),%ebx
+ call .L012pic
+.L012pic:
+ popl %ecx
+ leal .Lbswap-.L012pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movdqu (%edx),%xmm2
+.byte 102,15,56,0,197
+ subl $16,%ebx
+ jz .L013odd_tail
+ movdqu (%esi),%xmm3
+ movdqu 16(%esi),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+ movdqu 32(%edx),%xmm5
+ pxor %xmm3,%xmm0
+ pshufd $78,%xmm6,%xmm3
+ movdqa %xmm6,%xmm7
+ pxor %xmm6,%xmm3
+ leal 32(%esi),%esi
+.byte 102,15,58,68,242,0
+.byte 102,15,58,68,250,17
+.byte 102,15,58,68,221,0
+ movups 16(%edx),%xmm2
+ nop
+ subl $32,%ebx
+ jbe .L014even_tail
+ jmp .L015mod_loop
+.align 32
+.L015mod_loop:
+ pshufd $78,%xmm0,%xmm4
+ movdqa %xmm0,%xmm1
+ pxor %xmm0,%xmm4
+ nop
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,229,16
+ movups (%edx),%xmm2
+ xorps %xmm6,%xmm0
+ movdqa (%ecx),%xmm5
+ xorps %xmm7,%xmm1
+ movdqu (%esi),%xmm7
+ pxor %xmm0,%xmm3
+ movdqu 16(%esi),%xmm6
+ pxor %xmm1,%xmm3
+.byte 102,15,56,0,253
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm4
+ pslldq $8,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm3,%xmm0
+.byte 102,15,56,0,245
+ pxor %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+.byte 102,15,58,68,242,0
+ movups 32(%edx),%xmm5
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ pshufd $78,%xmm7,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm1
+.byte 102,15,58,68,250,17
+ movups 16(%edx),%xmm2
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,58,68,221,0
+ leal 32(%esi),%esi
+ subl $32,%ebx
+ ja .L015mod_loop
+.L014even_tail:
+ pshufd $78,%xmm0,%xmm4
+ movdqa %xmm0,%xmm1
+ pxor %xmm0,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,229,16
+ movdqa (%ecx),%xmm5
+ xorps %xmm6,%xmm0
+ xorps %xmm7,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm4
+ pslldq $8,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ testl %ebx,%ebx
+ jnz .L016done
+ movups (%edx),%xmm2
+.L013odd_tail:
+ movdqu (%esi),%xmm3
+.byte 102,15,56,0,221
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.L016done:
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
+.align 64
+.Lbswap:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+.align 64
+.Lrem_8bit:
+.value 0,450,900,582,1800,1738,1164,1358
+.value 3600,4050,3476,3158,2328,2266,2716,2910
+.value 7200,7650,8100,7782,6952,6890,6316,6510
+.value 4656,5106,4532,4214,5432,5370,5820,6014
+.value 14400,14722,15300,14854,16200,16010,15564,15630
+.value 13904,14226,13780,13334,12632,12442,13020,13086
+.value 9312,9634,10212,9766,9064,8874,8428,8494
+.value 10864,11186,10740,10294,11640,11450,12028,12094
+.value 28800,28994,29444,29382,30600,30282,29708,30158
+.value 32400,32594,32020,31958,31128,30810,31260,31710
+.value 27808,28002,28452,28390,27560,27242,26668,27118
+.value 25264,25458,24884,24822,26040,25722,26172,26622
+.value 18624,18690,19268,19078,20424,19978,19532,19854
+.value 18128,18194,17748,17558,16856,16410,16988,17310
+.value 21728,21794,22372,22182,21480,21034,20588,20910
+.value 23280,23346,22900,22710,24056,23610,24188,24510
+.value 57600,57538,57988,58182,58888,59338,58764,58446
+.value 61200,61138,60564,60758,59416,59866,60316,59998
+.value 64800,64738,65188,65382,64040,64490,63916,63598
+.value 62256,62194,61620,61814,62520,62970,63420,63102
+.value 55616,55426,56004,56070,56904,57226,56780,56334
+.value 55120,54930,54484,54550,53336,53658,54236,53790
+.value 50528,50338,50916,50982,49768,50090,49644,49198
+.value 52080,51890,51444,51510,52344,52666,53244,52798
+.value 37248,36930,37380,37830,38536,38730,38156,38094
+.value 40848,40530,39956,40406,39064,39258,39708,39646
+.value 36256,35938,36388,36838,35496,35690,35116,35054
+.value 33712,33394,32820,33270,33976,34170,34620,34558
+.value 43456,43010,43588,43910,44744,44810,44364,44174
+.value 42960,42514,42068,42390,41176,41242,41820,41630
+.value 46560,46114,46692,47014,45800,45866,45420,45230
+.value 48112,47666,47220,47542,48376,48442,49020,48830
.align 64
.Lrem_4bit:
-.long 0,0,0,29491200,0,58982400,0,38141952
-.long 0,117964800,0,113901568,0,76283904,0,88997888
-.long 0,235929600,0,265420800,0,227803136,0,206962688
-.long 0,152567808,0,148504576,0,177995776,0,190709760
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
diff --git a/secure/lib/libcrypto/i386/poly1305-x86.S b/secure/lib/libcrypto/i386/poly1305-x86.S
index 1cccfcf9765e..100deee40bf2 100644
--- a/secure/lib/libcrypto/i386/poly1305-x86.S
+++ b/secure/lib/libcrypto/i386/poly1305-x86.S
@@ -24,6 +24,26 @@ poly1305_init:
movl %eax,20(%edi)
cmpl $0,%esi
je .L000nokey
+ call .L001pic_point
+.L001pic_point:
+ popl %ebx
+ leal poly1305_blocks-.L001pic_point(%ebx),%eax
+ leal poly1305_emit-.L001pic_point(%ebx),%edx
+ leal OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
+ movl (%edi),%ecx
+ andl $83886080,%ecx
+ cmpl $83886080,%ecx
+ jne .L002no_sse2
+ leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
+ leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
+ movl 8(%edi),%ecx
+ testl $32,%ecx
+ jz .L002no_sse2
+ leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
+.L002no_sse2:
+ movl 20(%esp),%edi
+ movl %eax,(%ebp)
+ movl %edx,4(%ebp)
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -36,7 +56,7 @@ poly1305_init:
movl %ebx,28(%edi)
movl %ecx,32(%edi)
movl %edx,36(%edi)
- movl $0,%eax
+ movl $1,%eax
.L000nokey:
popl %edi
popl %esi
@@ -56,9 +76,9 @@ poly1305_blocks:
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
-.L001enter_blocks:
+.Lenter_blocks:
andl $-15,%ecx
- jz .L002nodata
+ jz .L003nodata
subl $64,%esp
movl 24(%edi),%eax
movl 28(%edi),%ebx
@@ -88,9 +108,9 @@ poly1305_blocks:
movl 8(%edi),%ecx
movl 12(%edi),%esi
movl 16(%edi),%edi
- jmp .L003loop
+ jmp .L004loop
.align 32
-.L003loop:
+.L004loop:
addl (%ebp),%eax
adcl 4(%ebp),%ebx
adcl 8(%ebp),%ecx
@@ -197,7 +217,7 @@ poly1305_blocks:
adcl $0,%esi
adcl $0,%edi
cmpl 92(%esp),%ebp
- jne .L003loop
+ jne .L004loop
movl 84(%esp),%edx
addl $64,%esp
movl %eax,(%edx)
@@ -205,7 +225,7 @@ poly1305_blocks:
movl %ecx,8(%edx)
movl %esi,12(%edx)
movl %edi,16(%edx)
-.L002nodata:
+.L003nodata:
popl %edi
popl %esi
popl %ebx
@@ -222,7 +242,7 @@ poly1305_emit:
pushl %esi
pushl %edi
movl 20(%esp),%ebp
-.L004enter_emit:
+.Lenter_emit:
movl 24(%esp),%edi
movl (%ebp),%eax
movl 4(%ebp),%ebx
@@ -272,11 +292,1625 @@ poly1305_emit:
popl %ebp
ret
.size poly1305_emit,.-.L_poly1305_emit_begin
+.align 32
+.type _poly1305_init_sse2,@function
+.align 16
+_poly1305_init_sse2:
+ movdqu 24(%edi),%xmm4
+ leal 48(%edi),%edi
+ movl %esp,%ebp
+ subl $224,%esp
+ andl $-16,%esp
+ movq 64(%ebx),%xmm7
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ movdqa %xmm4,%xmm2
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm1
+ psrldq $6,%xmm2
+ pand %xmm7,%xmm1
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ psrldq $13,%xmm4
+ leal 144(%esp),%edx
+ movl $2,%ecx
+.L005square:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm1,%xmm6
+ movdqa %xmm2,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm1,%xmm6
+ paddd %xmm2,%xmm5
+ movdqa %xmm6,80(%esp)
+ movdqa %xmm5,96(%esp)
+ movdqa %xmm3,%xmm6
+ movdqa %xmm4,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm3,%xmm6
+ paddd %xmm4,%xmm5
+ movdqa %xmm6,112(%esp)
+ movdqa %xmm5,128(%esp)
+ pshufd $68,%xmm0,%xmm6
+ movdqa %xmm1,%xmm5
+ pshufd $68,%xmm1,%xmm1
+ pshufd $68,%xmm2,%xmm2
+ pshufd $68,%xmm3,%xmm3
+ pshufd $68,%xmm4,%xmm4
+ movdqa %xmm6,(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa %xmm3,48(%edx)
+ movdqa %xmm4,64(%edx)
+ pmuludq %xmm0,%xmm4
+ pmuludq %xmm0,%xmm3
+ pmuludq %xmm0,%xmm2
+ pmuludq %xmm0,%xmm1
+ pmuludq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%edx),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa 80(%esp),%xmm6
+ pmuludq (%edx),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%edx),%xmm6
+ movdqa 32(%esp),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa 96(%esp),%xmm7
+ pmuludq (%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%edx),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%edx),%xmm5
+ movdqa 48(%esp),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa 112(%esp),%xmm5
+ pmuludq (%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%edx),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%edx),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%edx),%xmm7
+ movdqa 64(%esp),%xmm5
+ paddq %xmm6,%xmm1
+ movdqa 128(%esp),%xmm6
+ pmuludq (%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%edx),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ decl %ecx
+ jz .L006square_break
+ punpcklqdq (%esp),%xmm0
+ punpcklqdq 16(%esp),%xmm1
+ punpcklqdq 32(%esp),%xmm2
+ punpcklqdq 48(%esp),%xmm3
+ punpcklqdq 64(%esp),%xmm4
+ jmp .L005square
+.L006square_break:
+ psllq $32,%xmm0
+ psllq $32,%xmm1
+ psllq $32,%xmm2
+ psllq $32,%xmm3
+ psllq $32,%xmm4
+ por (%esp),%xmm0
+ por 16(%esp),%xmm1
+ por 32(%esp),%xmm2
+ por 48(%esp),%xmm3
+ por 64(%esp),%xmm4
+ pshufd $141,%xmm0,%xmm0
+ pshufd $141,%xmm1,%xmm1
+ pshufd $141,%xmm2,%xmm2
+ pshufd $141,%xmm3,%xmm3
+ pshufd $141,%xmm4,%xmm4
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ movdqu %xmm4,64(%edi)
+ movdqa %xmm1,%xmm6
+ movdqa %xmm2,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm1,%xmm6
+ paddd %xmm2,%xmm5
+ movdqu %xmm6,80(%edi)
+ movdqu %xmm5,96(%edi)
+ movdqa %xmm3,%xmm6
+ movdqa %xmm4,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm3,%xmm6
+ paddd %xmm4,%xmm5
+ movdqu %xmm6,112(%edi)
+ movdqu %xmm5,128(%edi)
+ movl %ebp,%esp
+ leal -48(%edi),%edi
+ ret
+.size _poly1305_init_sse2,.-_poly1305_init_sse2
+.align 32
+.type _poly1305_blocks_sse2,@function
+.align 16
+_poly1305_blocks_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 20(%edi),%eax
+ andl $-16,%ecx
+ jz .L007nodata
+ cmpl $64,%ecx
+ jae .L008enter_sse2
+ testl %eax,%eax
+ jz .Lenter_blocks
+.align 16
+.L008enter_sse2:
+ call .L009pic_point
+.L009pic_point:
+ popl %ebx
+ leal .Lconst_sse2-.L009pic_point(%ebx),%ebx
+ testl %eax,%eax
+ jnz .L010base2_26
+ call _poly1305_init_sse2
+ movl (%edi),%eax
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ movl $1,20(%edi)
+ shrl $2,%ecx
+ andl $67108863,%eax
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movd %eax,%xmm0
+ movd %ecx,%xmm1
+ movd %edx,%xmm2
+ movd %esi,%xmm3
+ movd %ebp,%xmm4
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ jmp .L011base2_32
+.align 16
+.L010base2_26:
+ movd (%edi),%xmm0
+ movd 4(%edi),%xmm1
+ movd 8(%edi),%xmm2
+ movd 12(%edi),%xmm3
+ movd 16(%edi),%xmm4
+ movdqa 64(%ebx),%xmm7
+.L011base2_32:
+ movl 32(%esp),%eax
+ movl %esp,%ebp
+ subl $528,%esp
+ andl $-16,%esp
+ leal 48(%edi),%edi
+ shll $24,%eax
+ testl $31,%ecx
+ jz .L012even
+ movdqu (%esi),%xmm6
+ leal 16(%esi),%esi
+ movdqa %xmm6,%xmm5
+ pand %xmm7,%xmm6
+ paddd %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ psrlq $26,%xmm5
+ psrldq $6,%xmm6
+ pand %xmm7,%xmm5
+ paddd %xmm5,%xmm1
+ movdqa %xmm6,%xmm5
+ psrlq $4,%xmm6
+ pand %xmm7,%xmm6
+ paddd %xmm6,%xmm2
+ movdqa %xmm5,%xmm6
+ psrlq $30,%xmm5
+ pand %xmm7,%xmm5
+ psrldq $7,%xmm6
+ paddd %xmm5,%xmm3
+ movd %eax,%xmm5
+ paddd %xmm6,%xmm4
+ movd 12(%edi),%xmm6
+ paddd %xmm5,%xmm4
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ pmuludq %xmm6,%xmm0
+ pmuludq %xmm6,%xmm1
+ pmuludq %xmm6,%xmm2
+ movd 28(%edi),%xmm5
+ pmuludq %xmm6,%xmm3
+ pmuludq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%esp),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movd 92(%edi),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%esp),%xmm6
+ movd 44(%edi),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%esp),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%esp),%xmm5
+ paddq %xmm7,%xmm4
+ movd 108(%edi),%xmm7
+ pmuludq (%esp),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%esp),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%esp),%xmm5
+ movd 60(%edi),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%esp),%xmm6
+ paddq %xmm5,%xmm0
+ movd 124(%edi),%xmm5
+ pmuludq (%esp),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%esp),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%esp),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%esp),%xmm7
+ movd 76(%edi),%xmm5
+ paddq %xmm6,%xmm1
+ movd 140(%edi),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%esp),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%esp),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ subl $16,%ecx
+ jz .L013done
+.L012even:
+ leal 384(%esp),%edx
+ leal -32(%esi),%eax
+ subl $64,%ecx
+ movdqu (%edi),%xmm5
+ pshufd $68,%xmm5,%xmm6
+ cmovbl %eax,%esi
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,(%edx)
+ leal 160(%esp),%eax
+ movdqu 16(%edi),%xmm6
+ movdqa %xmm5,-144(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,16(%edx)
+ movdqu 32(%edi),%xmm5
+ movdqa %xmm6,-128(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,32(%edx)
+ movdqu 48(%edi),%xmm6
+ movdqa %xmm5,-112(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,48(%edx)
+ movdqu 64(%edi),%xmm5
+ movdqa %xmm6,-96(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,64(%edx)
+ movdqu 80(%edi),%xmm6
+ movdqa %xmm5,-80(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,80(%edx)
+ movdqu 96(%edi),%xmm5
+ movdqa %xmm6,-64(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,96(%edx)
+ movdqu 112(%edi),%xmm6
+ movdqa %xmm5,-48(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,112(%edx)
+ movdqu 128(%edi),%xmm5
+ movdqa %xmm6,-32(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,128(%edx)
+ movdqa %xmm5,-16(%edx)
+ movdqu 32(%esi),%xmm5
+ movdqu 48(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,112(%esp)
+ movdqa %xmm3,128(%esp)
+ movdqa %xmm4,144(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ movdqa %xmm0,80(%esp)
+ movdqa %xmm1,96(%esp)
+ jbe .L014skip_loop
+ jmp .L015loop
+.align 32
+.L015loop:
+ movdqa -144(%edx),%xmm7
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ movdqa %xmm5,%xmm1
+ pmuludq %xmm7,%xmm5
+ movdqa %xmm6,%xmm0
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ pmuludq %xmm7,%xmm3
+ pmuludq %xmm7,%xmm4
+ pmuludq -16(%edx),%xmm0
+ movdqa %xmm1,%xmm7
+ pmuludq -128(%edx),%xmm1
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq -112(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa %xmm5,%xmm6
+ pmuludq -96(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 16(%eax),%xmm7
+ pmuludq -80(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq -128(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq -112(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 32(%eax),%xmm7
+ pmuludq -96(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq -32(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq -16(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq -128(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ movdqa 48(%eax),%xmm5
+ pmuludq -112(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq -48(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa %xmm6,%xmm7
+ pmuludq -32(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq -16(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa 64(%eax),%xmm6
+ pmuludq -128(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa %xmm6,%xmm7
+ pmuludq -16(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq -64(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq -48(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pmuludq -32(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqu -32(%esi),%xmm5
+ movdqu -16(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ leal -32(%esi),%eax
+ subl $64,%ecx
+ paddd 80(%esp),%xmm5
+ paddd 96(%esp),%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+ cmovbl %eax,%esi
+ leal 160(%esp),%eax
+ movdqa (%edx),%xmm7
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ movdqa %xmm5,%xmm1
+ pmuludq %xmm7,%xmm5
+ paddq %xmm0,%xmm5
+ movdqa %xmm6,%xmm0
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ pmuludq %xmm7,%xmm3
+ pmuludq %xmm7,%xmm4
+ paddq 16(%esp),%xmm6
+ paddq 32(%esp),%xmm2
+ paddq 48(%esp),%xmm3
+ paddq 64(%esp),%xmm4
+ pmuludq 128(%edx),%xmm0
+ movdqa %xmm1,%xmm7
+ pmuludq 16(%edx),%xmm1
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 16(%eax),%xmm7
+ pmuludq 64(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 32(%eax),%xmm7
+ pmuludq 48(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 112(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 128(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ movdqa 48(%eax),%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 96(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa %xmm6,%xmm7
+ pmuludq 112(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq 128(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa 64(%eax),%xmm6
+ pmuludq 16(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa %xmm6,%xmm7
+ pmuludq 128(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 80(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 96(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pmuludq 112(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ movdqu 32(%esi),%xmm5
+ movdqu 48(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,112(%esp)
+ movdqa %xmm3,128(%esp)
+ movdqa %xmm4,144(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ movdqa %xmm0,80(%esp)
+ movdqa %xmm1,96(%esp)
+ ja .L015loop
+.L014skip_loop:
+ pshufd $16,-144(%edx),%xmm7
+ addl $32,%ecx
+ jnz .L016long_tail
+ paddd %xmm0,%xmm5
+ paddd %xmm1,%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+.L016long_tail:
+ movdqa %xmm5,(%eax)
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ pmuludq %xmm7,%xmm5
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ movdqa %xmm5,%xmm0
+ pshufd $16,-128(%edx),%xmm5
+ pmuludq %xmm7,%xmm3
+ movdqa %xmm6,%xmm1
+ pmuludq %xmm7,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%eax),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%eax),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%eax),%xmm7
+ paddq %xmm6,%xmm3
+ pshufd $16,-64(%edx),%xmm6
+ pmuludq (%eax),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%eax),%xmm6
+ pshufd $16,-112(%edx),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%eax),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%eax),%xmm5
+ paddq %xmm7,%xmm4
+ pshufd $16,-48(%edx),%xmm7
+ pmuludq (%eax),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%eax),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%eax),%xmm5
+ pshufd $16,-96(%edx),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%eax),%xmm6
+ paddq %xmm5,%xmm0
+ pshufd $16,-32(%edx),%xmm5
+ pmuludq (%eax),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%eax),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%eax),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%eax),%xmm7
+ pshufd $16,-80(%edx),%xmm5
+ paddq %xmm6,%xmm1
+ pshufd $16,-16(%edx),%xmm6
+ pmuludq (%eax),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%eax),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%eax),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%eax),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%eax),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ jz .L017short_tail
+ movdqu -32(%esi),%xmm5
+ movdqu -16(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ pshufd $16,(%edx),%xmm7
+ paddd 80(%esp),%xmm5
+ paddd 96(%esp),%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+ movdqa %xmm5,(%esp)
+ pmuludq %xmm7,%xmm5
+ movdqa %xmm6,16(%esp)
+ pmuludq %xmm7,%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm2,%xmm5
+ pmuludq %xmm7,%xmm2
+ paddq %xmm6,%xmm1
+ movdqa %xmm3,%xmm6
+ pmuludq %xmm7,%xmm3
+ paddq 32(%esp),%xmm2
+ movdqa %xmm5,32(%esp)
+ pshufd $16,16(%edx),%xmm5
+ paddq 48(%esp),%xmm3
+ movdqa %xmm6,48(%esp)
+ movdqa %xmm4,%xmm6
+ pmuludq %xmm7,%xmm4
+ paddq 64(%esp),%xmm4
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%esp),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ pshufd $16,80(%edx),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%esp),%xmm6
+ pshufd $16,32(%edx),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%esp),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%esp),%xmm5
+ paddq %xmm7,%xmm4
+ pshufd $16,96(%edx),%xmm7
+ pmuludq (%esp),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%esp),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%esp),%xmm5
+ pshufd $16,48(%edx),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%esp),%xmm6
+ paddq %xmm5,%xmm0
+ pshufd $16,112(%edx),%xmm5
+ pmuludq (%esp),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%esp),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%esp),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%esp),%xmm7
+ pshufd $16,64(%edx),%xmm5
+ paddq %xmm6,%xmm1
+ pshufd $16,128(%edx),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%esp),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%esp),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+.L017short_tail:
+ pshufd $78,%xmm4,%xmm6
+ pshufd $78,%xmm3,%xmm5
+ paddq %xmm6,%xmm4
+ paddq %xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm6
+ pshufd $78,%xmm1,%xmm5
+ paddq %xmm6,%xmm0
+ paddq %xmm5,%xmm1
+ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm6,%xmm2
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+.L013done:
+ movd %xmm0,-48(%edi)
+ movd %xmm1,-44(%edi)
+ movd %xmm2,-40(%edi)
+ movd %xmm3,-36(%edi)
+ movd %xmm4,-32(%edi)
+ movl %ebp,%esp
+.L007nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
+.align 32
+.type _poly1305_emit_sse2,@function
+.align 16
+_poly1305_emit_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ cmpl $0,20(%ebp)
+ je .Lenter_emit
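+	/* the flag at 20(%ebp) is zero while the state is still in base
+	 * 2^32, in which case the scalar emit path handles it; otherwise
+	 * gather the five 26-bit limbs back into four 32-bit words below */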
+ movl (%ebp),%eax
+ movl 4(%ebp),%edi
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+ movl 16(%ebp),%esi
+ movl %edi,%ebx
+ shll $26,%edi
+ shrl $6,%ebx
+ addl %edi,%eax
+ movl %ecx,%edi
+ adcl $0,%ebx
+ shll $20,%edi
+ shrl $12,%ecx
+ addl %edi,%ebx
+ movl %edx,%edi
+ adcl $0,%ecx
+ shll $14,%edi
+ shrl $18,%edx
+ addl %edi,%ecx
+ movl %esi,%edi
+ adcl $0,%edx
+ shll $8,%edi
+ shrl $24,%esi
+ addl %edi,%edx
+ adcl $0,%esi
+ movl %esi,%edi
+ andl $3,%esi
+ shrl $2,%edi
+ leal (%edi,%edi,4),%ebp
+ movl 24(%esp),%edi
+ addl %ebp,%eax
+ movl 28(%esp),%ebp
+ adcl $0,%ebx
+ adcl $0,%ecx
+ adcl $0,%edx
+ adcl $0,%esi
+ movd %eax,%xmm0
+ addl $5,%eax
+ movd %ebx,%xmm1
+ adcl $0,%ebx
+ movd %ecx,%xmm2
+ adcl $0,%ecx
+ movd %edx,%xmm3
+ adcl $0,%edx
+ adcl $0,%esi
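+	/* %esi now carries bit 130 of h+5; the shr/neg below turns it into
+	 * an all-ones/all-zeros mask that selects, in constant time, either
+	 * h+5 (i.e. h reduced mod 2^130-5) or the original h parked in
+	 * %xmm0-%xmm3 */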
+ shrl $2,%esi
+ negl %esi
+ andl %esi,%eax
+ andl %esi,%ebx
+ andl %esi,%ecx
+ andl %esi,%edx
+ movl %eax,(%edi)
+ movd %xmm0,%eax
+ movl %ebx,4(%edi)
+ movd %xmm1,%ebx
+ movl %ecx,8(%edi)
+ movd %xmm2,%ecx
+ movl %edx,12(%edi)
+ movd %xmm3,%edx
+ notl %esi
+ andl %esi,%eax
+ andl %esi,%ebx
+ orl (%edi),%eax
+ andl %esi,%ecx
+ orl 4(%edi),%ebx
+ andl %esi,%edx
+ orl 8(%edi),%ecx
+ orl 12(%edi),%edx
+ addl (%ebp),%eax
+ adcl 4(%ebp),%ebx
+ movl %eax,(%edi)
+ adcl 8(%ebp),%ecx
+ movl %ebx,4(%edi)
+ adcl 12(%ebp),%edx
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
+.align 32
+.type _poly1305_init_avx2,@function
+.align 16
+_poly1305_init_avx2:
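+	/* split the clamped key r (at 24(%edi)) into five 26-bit limbs and
+	 * square twice (the .L018square loop runs with %ecx=2) to build the
+	 * table of key powers consumed by the 4-way AVX2 block routine */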
+ vmovdqu 24(%edi),%xmm4
+ leal 48(%edi),%edi
+ movl %esp,%ebp
+ subl $224,%esp
+ andl $-16,%esp
+ vmovdqa 64(%ebx),%xmm7
+ vpand %xmm7,%xmm4,%xmm0
+ vpsrlq $26,%xmm4,%xmm1
+ vpsrldq $6,%xmm4,%xmm3
+ vpand %xmm7,%xmm1,%xmm1
+ vpsrlq $4,%xmm3,%xmm2
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm7,%xmm2,%xmm2
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrldq $13,%xmm4,%xmm4
+ leal 144(%esp),%edx
+ movl $2,%ecx
+.L018square:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ vmovdqa %xmm4,64(%esp)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqa %xmm6,80(%esp)
+ vmovdqa %xmm5,96(%esp)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqa %xmm6,112(%esp)
+ vmovdqa %xmm5,128(%esp)
+ vpshufd $68,%xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vpshufd $68,%xmm1,%xmm1
+ vpshufd $68,%xmm2,%xmm2
+ vpshufd $68,%xmm3,%xmm3
+ vpshufd $68,%xmm4,%xmm4
+ vmovdqa %xmm5,(%edx)
+ vmovdqa %xmm1,16(%edx)
+ vmovdqa %xmm2,32(%edx)
+ vmovdqa %xmm3,48(%edx)
+ vmovdqa %xmm4,64(%edx)
+ vpmuludq %xmm0,%xmm4,%xmm4
+ vpmuludq %xmm0,%xmm3,%xmm3
+ vpmuludq %xmm0,%xmm2,%xmm2
+ vpmuludq %xmm0,%xmm1,%xmm1
+ vpmuludq %xmm0,%xmm5,%xmm0
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpmuludq 32(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vmovdqa 80(%esp),%xmm7
+ vpmuludq (%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 32(%esp),%xmm5
+ vpmuludq 64(%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm4,%xmm4
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vmovdqa 96(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm1,%xmm1
+ vmovdqa 48(%esp),%xmm5
+ vpmuludq 48(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vmovdqa 112(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm3,%xmm3
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm2,%xmm2
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm1,%xmm1
+ vmovdqa 64(%esp),%xmm7
+ vpmuludq 32(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vmovdqa 128(%esp),%xmm5
+ vpmuludq (%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vpmuludq 64(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm7
+ vpmuludq 48(%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpsrlq $26,%xmm3,%xmm5
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrlq $26,%xmm0,%xmm6
+ vpand %xmm7,%xmm0,%xmm0
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm6,%xmm1,%xmm1
+ vpsrlq $26,%xmm4,%xmm5
+ vpand %xmm7,%xmm4,%xmm4
+ vpsrlq $26,%xmm1,%xmm6
+ vpand %xmm7,%xmm1,%xmm1
+ vpaddq %xmm6,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpsllq $2,%xmm5,%xmm5
+ vpsrlq $26,%xmm2,%xmm6
+ vpand %xmm7,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpaddd %xmm6,%xmm3,%xmm3
+ vpsrlq $26,%xmm3,%xmm6
+ vpsrlq $26,%xmm0,%xmm5
+ vpand %xmm7,%xmm0,%xmm0
+ vpand %xmm7,%xmm3,%xmm3
+ vpaddd %xmm5,%xmm1,%xmm1
+ vpaddd %xmm6,%xmm4,%xmm4
+ decl %ecx
+ jz .L019square_break
+ vpunpcklqdq (%esp),%xmm0,%xmm0
+ vpunpcklqdq 16(%esp),%xmm1,%xmm1
+ vpunpcklqdq 32(%esp),%xmm2,%xmm2
+ vpunpcklqdq 48(%esp),%xmm3,%xmm3
+ vpunpcklqdq 64(%esp),%xmm4,%xmm4
+ jmp .L018square
+.L019square_break:
+ vpsllq $32,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm1
+ vpsllq $32,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm3
+ vpsllq $32,%xmm4,%xmm4
+ vpor (%esp),%xmm0,%xmm0
+ vpor 16(%esp),%xmm1,%xmm1
+ vpor 32(%esp),%xmm2,%xmm2
+ vpor 48(%esp),%xmm3,%xmm3
+ vpor 64(%esp),%xmm4,%xmm4
+ vpshufd $141,%xmm0,%xmm0
+ vpshufd $141,%xmm1,%xmm1
+ vpshufd $141,%xmm2,%xmm2
+ vpshufd $141,%xmm3,%xmm3
+ vpshufd $141,%xmm4,%xmm4
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ vmovdqu %xmm4,64(%edi)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqu %xmm6,80(%edi)
+ vmovdqu %xmm5,96(%edi)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm6,112(%edi)
+ vmovdqu %xmm5,128(%edi)
+ movl %ebp,%esp
+ leal -48(%edi),%edi
+ ret
+.size _poly1305_init_avx2,.-_poly1305_init_avx2
+.align 32
+.type _poly1305_blocks_avx2,@function
+.align 16
+_poly1305_blocks_avx2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 20(%edi),%eax
+ andl $-16,%ecx
+ jz .L020nodata
+ cmpl $64,%ecx
+ jae .L021enter_avx2
+ testl %eax,%eax
+ jz .Lenter_blocks
+.L021enter_avx2:
+ vzeroupper
+ call .L022pic_point
+.L022pic_point:
+ popl %ebx
+ leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
+ testl %eax,%eax
+ jnz .L023base2_26
+ call _poly1305_init_avx2
+ movl (%edi),%eax
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ shrl $2,%ecx
+ andl $67108863,%eax
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movl %eax,(%edi)
+ movl %ecx,4(%edi)
+ movl %edx,8(%edi)
+ movl %esi,12(%edi)
+ movl %ebp,16(%edi)
+ movl $1,20(%edi)
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+.L023base2_26:
+ movl 32(%esp),%eax
+ movl %esp,%ebp
+ subl $448,%esp
+ andl $-512,%esp
+ vmovdqu 48(%edi),%xmm0
+ leal 288(%esp),%edx
+ vmovdqu 64(%edi),%xmm1
+ vmovdqu 80(%edi),%xmm2
+ vmovdqu 96(%edi),%xmm3
+ vmovdqu 112(%edi),%xmm4
+ leal 48(%edi),%edi
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpermq $64,%ymm4,%ymm4
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vpshufd $200,%ymm4,%ymm4
+ vmovdqa %ymm0,-128(%edx)
+ vmovdqu 80(%edi),%xmm0
+ vmovdqa %ymm1,-96(%edx)
+ vmovdqu 96(%edi),%xmm1
+ vmovdqa %ymm2,-64(%edx)
+ vmovdqu 112(%edi),%xmm2
+ vmovdqa %ymm3,-32(%edx)
+ vmovdqu 128(%edi),%xmm3
+ vmovdqa %ymm4,(%edx)
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vmovdqa %ymm0,32(%edx)
+ vmovd -48(%edi),%xmm0
+ vmovdqa %ymm1,64(%edx)
+ vmovd -44(%edi),%xmm1
+ vmovdqa %ymm2,96(%edx)
+ vmovd -40(%edi),%xmm2
+ vmovdqa %ymm3,128(%edx)
+ vmovd -36(%edi),%xmm3
+ vmovd -32(%edi),%xmm4
+ vmovdqa 64(%ebx),%ymm7
+ negl %eax
+ testl $63,%ecx
+ jz .L024even
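+	/* length is not a multiple of 64: absorb the odd 16/32/48-byte
+	 * group first, adjusting the padbit pointer in %ebx so lanes past
+	 * the end of the message pick up zeros instead of the 2^128 padbit,
+	 * leaving the main loop to run four blocks per iteration */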
+ movl %ecx,%edx
+ andl $-64,%ecx
+ andl $63,%edx
+ vmovdqu (%esi),%xmm5
+ cmpl $32,%edx
+ jb .L025one
+ vmovdqu 16(%esi),%xmm6
+ je .L026two
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ leal 48(%esi),%esi
+ leal 8(%ebx),%ebx
+ leal 296(%esp),%edx
+ jmp .L027tail
+.L026two:
+ leal 32(%esi),%esi
+ leal 16(%ebx),%ebx
+ leal 304(%esp),%edx
+ jmp .L027tail
+.L025one:
+ leal 16(%esi),%esi
+ vpxor %ymm6,%ymm6,%ymm6
+ leal 32(%ebx,%eax,8),%ebx
+ leal 312(%esp),%edx
+ jmp .L027tail
+.align 32
+.L024even:
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jz .L027tail
+.L028loop:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -64(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 96(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 128(%edx),%ymm2,%ymm1
+ vpmuludq -128(%edx),%ymm2,%ymm2
+ vpmuludq -32(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq (%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -96(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -64(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -64(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -32(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 128(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -128(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -128(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 64(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 128(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 32(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 64(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 96(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jnz .L028loop
+.L027tail:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ andl $-64,%ebx
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -60(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 100(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 132(%edx),%ymm2,%ymm1
+ vpmuludq -124(%edx),%ymm2,%ymm2
+ vpmuludq -28(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 4(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -92(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -60(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -60(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -28(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 132(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -124(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -124(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -92(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 68(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 100(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 132(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 132(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 36(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 68(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 100(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrldq $8,%ymm4,%ymm5
+ vpsrldq $8,%ymm3,%ymm6
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpsrldq $8,%ymm0,%ymm5
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrldq $8,%ymm1,%ymm6
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsrldq $8,%ymm2,%ymm5
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpermq $2,%ymm4,%ymm6
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpermq $2,%ymm3,%ymm5
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpermq $2,%ymm0,%ymm6
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpermq $2,%ymm1,%ymm5
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpermq $2,%ymm2,%ymm6
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ cmpl $0,%ecx
+ je .L029done
+ vpshufd $252,%xmm0,%xmm0
+ leal 288(%esp),%edx
+ vpshufd $252,%xmm1,%xmm1
+ vpshufd $252,%xmm2,%xmm2
+ vpshufd $252,%xmm3,%xmm3
+ vpshufd $252,%xmm4,%xmm4
+ jmp .L024even
+.align 16
+.L029done:
+ vmovd %xmm0,-48(%edi)
+ vmovd %xmm1,-44(%edi)
+ vmovd %xmm2,-40(%edi)
+ vmovd %xmm3,-36(%edi)
+ vmovd %xmm4,-32(%edi)
+ vzeroupper
+ movl %ebp,%esp
+.L020nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
+.align 64
+.Lconst_sse2:
+.long 16777216,0,16777216,0,16777216,0,16777216,0
+.long 0,0,0,0,0,0,0,0
+.long 67108863,0,67108863,0,67108863,0,67108863,0
+.long 268435455,268435452,268435452,268435452
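+/* .Lconst_sse2 layout: +0 the padbit constant 2^24 (bit 128 of a block
+ * lands in bit 24 of the top 26-bit limb), +32 zero padbits for final
+ * partial blocks, +64 the 26-bit mask 2^26-1, +96 the Poly1305 key
+ * clamp masks 0x0fffffff/0x0ffffffc */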
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
.align 4
+.comm OPENSSL_ia32cap_P,16,4
#else
.text
.align 64
@@ -301,6 +1935,26 @@ poly1305_init:
movl %eax,20(%edi)
cmpl $0,%esi
je .L000nokey
+ call .L001pic_point
+.L001pic_point:
+ popl %ebx
+ leal poly1305_blocks-.L001pic_point(%ebx),%eax
+ leal poly1305_emit-.L001pic_point(%ebx),%edx
+ leal OPENSSL_ia32cap_P,%edi
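+	/* capability dispatch: OPENSSL_ia32cap_P word 0 bits 24|26
+	 * (FXSR|SSE2, together 0x5000000) gate the SSE2 blocks/emit pair;
+	 * word 2 bit 5 (AVX2) upgrades the blocks routine further */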
+ movl (%edi),%ecx
+ andl $83886080,%ecx
+ cmpl $83886080,%ecx
+ jne .L002no_sse2
+ leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
+ leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
+ movl 8(%edi),%ecx
+ testl $32,%ecx
+ jz .L002no_sse2
+ leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
+.L002no_sse2:
+ movl 20(%esp),%edi
+ movl %eax,(%ebp)
+ movl %edx,4(%ebp)
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -313,7 +1967,7 @@ poly1305_init:
movl %ebx,28(%edi)
movl %ecx,32(%edi)
movl %edx,36(%edi)
- movl $0,%eax
+ movl $1,%eax
.L000nokey:
popl %edi
popl %esi
@@ -333,9 +1987,9 @@ poly1305_blocks:
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
-.L001enter_blocks:
+.Lenter_blocks:
andl $-15,%ecx
- jz .L002nodata
+ jz .L003nodata
subl $64,%esp
movl 24(%edi),%eax
movl 28(%edi),%ebx
@@ -365,9 +2019,9 @@ poly1305_blocks:
movl 8(%edi),%ecx
movl 12(%edi),%esi
movl 16(%edi),%edi
- jmp .L003loop
+ jmp .L004loop
.align 32
-.L003loop:
+.L004loop:
addl (%ebp),%eax
adcl 4(%ebp),%ebx
adcl 8(%ebp),%ecx
@@ -474,7 +2128,7 @@ poly1305_blocks:
adcl $0,%esi
adcl $0,%edi
cmpl 92(%esp),%ebp
- jne .L003loop
+ jne .L004loop
movl 84(%esp),%edx
addl $64,%esp
movl %eax,(%edx)
@@ -482,7 +2136,7 @@ poly1305_blocks:
movl %ecx,8(%edx)
movl %esi,12(%edx)
movl %edi,16(%edx)
-.L002nodata:
+.L003nodata:
popl %edi
popl %esi
popl %ebx
@@ -499,7 +2153,7 @@ poly1305_emit:
pushl %esi
pushl %edi
movl 20(%esp),%ebp
-.L004enter_emit:
+.Lenter_emit:
movl 24(%esp),%edi
movl (%ebp),%eax
movl 4(%ebp),%ebx
@@ -549,9 +2203,1623 @@ poly1305_emit:
popl %ebp
ret
.size poly1305_emit,.-.L_poly1305_emit_begin
+.align 32
+.type _poly1305_init_sse2,@function
+.align 16
+_poly1305_init_sse2:
+ movdqu 24(%edi),%xmm4
+ leal 48(%edi),%edi
+ movl %esp,%ebp
+ subl $224,%esp
+ andl $-16,%esp
+ movq 64(%ebx),%xmm7
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ movdqa %xmm4,%xmm2
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm1
+ psrldq $6,%xmm2
+ pand %xmm7,%xmm1
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ psrldq $13,%xmm4
+ leal 144(%esp),%edx
+ movl $2,%ecx
+.L005square:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm1,%xmm6
+ movdqa %xmm2,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm1,%xmm6
+ paddd %xmm2,%xmm5
+ movdqa %xmm6,80(%esp)
+ movdqa %xmm5,96(%esp)
+ movdqa %xmm3,%xmm6
+ movdqa %xmm4,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm3,%xmm6
+ paddd %xmm4,%xmm5
+ movdqa %xmm6,112(%esp)
+ movdqa %xmm5,128(%esp)
+ pshufd $68,%xmm0,%xmm6
+ movdqa %xmm1,%xmm5
+ pshufd $68,%xmm1,%xmm1
+ pshufd $68,%xmm2,%xmm2
+ pshufd $68,%xmm3,%xmm3
+ pshufd $68,%xmm4,%xmm4
+ movdqa %xmm6,(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa %xmm3,48(%edx)
+ movdqa %xmm4,64(%edx)
+ pmuludq %xmm0,%xmm4
+ pmuludq %xmm0,%xmm3
+ pmuludq %xmm0,%xmm2
+ pmuludq %xmm0,%xmm1
+ pmuludq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%edx),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa 80(%esp),%xmm6
+ pmuludq (%edx),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%edx),%xmm6
+ movdqa 32(%esp),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa 96(%esp),%xmm7
+ pmuludq (%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%edx),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%edx),%xmm5
+ movdqa 48(%esp),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa 112(%esp),%xmm5
+ pmuludq (%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%edx),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%edx),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%edx),%xmm7
+ movdqa 64(%esp),%xmm5
+ paddq %xmm6,%xmm1
+ movdqa 128(%esp),%xmm6
+ pmuludq (%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%edx),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ decl %ecx
+ jz .L006square_break
+ punpcklqdq (%esp),%xmm0
+ punpcklqdq 16(%esp),%xmm1
+ punpcklqdq 32(%esp),%xmm2
+ punpcklqdq 48(%esp),%xmm3
+ punpcklqdq 64(%esp),%xmm4
+ jmp .L005square
+.L006square_break:
+ psllq $32,%xmm0
+ psllq $32,%xmm1
+ psllq $32,%xmm2
+ psllq $32,%xmm3
+ psllq $32,%xmm4
+ por (%esp),%xmm0
+ por 16(%esp),%xmm1
+ por 32(%esp),%xmm2
+ por 48(%esp),%xmm3
+ por 64(%esp),%xmm4
+ pshufd $141,%xmm0,%xmm0
+ pshufd $141,%xmm1,%xmm1
+ pshufd $141,%xmm2,%xmm2
+ pshufd $141,%xmm3,%xmm3
+ pshufd $141,%xmm4,%xmm4
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ movdqu %xmm4,64(%edi)
+ movdqa %xmm1,%xmm6
+ movdqa %xmm2,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm1,%xmm6
+ paddd %xmm2,%xmm5
+ movdqu %xmm6,80(%edi)
+ movdqu %xmm5,96(%edi)
+ movdqa %xmm3,%xmm6
+ movdqa %xmm4,%xmm5
+ pslld $2,%xmm6
+ pslld $2,%xmm5
+ paddd %xmm3,%xmm6
+ paddd %xmm4,%xmm5
+ movdqu %xmm6,112(%edi)
+ movdqu %xmm5,128(%edi)
+ movl %ebp,%esp
+ leal -48(%edi),%edi
+ ret
+.size _poly1305_init_sse2,.-_poly1305_init_sse2
+.align 32
+.type _poly1305_blocks_sse2,@function
+.align 16
+_poly1305_blocks_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 20(%edi),%eax
+ andl $-16,%ecx
+ jz .L007nodata
+ cmpl $64,%ecx
+ jae .L008enter_sse2
+ testl %eax,%eax
+ jz .Lenter_blocks
+.align 16
+.L008enter_sse2:
+ call .L009pic_point
+.L009pic_point:
+ popl %ebx
+ leal .Lconst_sse2-.L009pic_point(%ebx),%ebx
+ testl %eax,%eax
+ jnz .L010base2_26
+ call _poly1305_init_sse2
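+	/* first call with data: convert the accumulator h from four 32-bit
+	 * words to five 26-bit limbs, using overlapping loads at byte
+	 * offsets 0,3,6,9,13 plus small shifts and the 2^26-1 mask, then
+	 * mark the context as base-2^26 via the flag at 20(%edi) */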
+ movl (%edi),%eax
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ movl $1,20(%edi)
+ shrl $2,%ecx
+ andl $67108863,%eax
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movd %eax,%xmm0
+ movd %ecx,%xmm1
+ movd %edx,%xmm2
+ movd %esi,%xmm3
+ movd %ebp,%xmm4
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ jmp .L011base2_32
+.align 16
+.L010base2_26:
+ movd (%edi),%xmm0
+ movd 4(%edi),%xmm1
+ movd 8(%edi),%xmm2
+ movd 12(%edi),%xmm3
+ movd 16(%edi),%xmm4
+ movdqa 64(%ebx),%xmm7
+.L011base2_32:
+ movl 32(%esp),%eax
+ movl %esp,%ebp
+ subl $528,%esp
+ andl $-16,%esp
+ leal 48(%edi),%edi
+ shll $24,%eax
+ testl $31,%ecx
+ jz .L012even
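+	/* odd 16-byte block up front: absorb it on its own (padbit arrives
+	 * in %eax shifted to bit 24) so the loop below can process the rest
+	 * two blocks at a time */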
+ movdqu (%esi),%xmm6
+ leal 16(%esi),%esi
+ movdqa %xmm6,%xmm5
+ pand %xmm7,%xmm6
+ paddd %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ psrlq $26,%xmm5
+ psrldq $6,%xmm6
+ pand %xmm7,%xmm5
+ paddd %xmm5,%xmm1
+ movdqa %xmm6,%xmm5
+ psrlq $4,%xmm6
+ pand %xmm7,%xmm6
+ paddd %xmm6,%xmm2
+ movdqa %xmm5,%xmm6
+ psrlq $30,%xmm5
+ pand %xmm7,%xmm5
+ psrldq $7,%xmm6
+ paddd %xmm5,%xmm3
+ movd %eax,%xmm5
+ paddd %xmm6,%xmm4
+ movd 12(%edi),%xmm6
+ paddd %xmm5,%xmm4
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ pmuludq %xmm6,%xmm0
+ pmuludq %xmm6,%xmm1
+ pmuludq %xmm6,%xmm2
+ movd 28(%edi),%xmm5
+ pmuludq %xmm6,%xmm3
+ pmuludq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%esp),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movd 92(%edi),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%esp),%xmm6
+ movd 44(%edi),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%esp),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%esp),%xmm5
+ paddq %xmm7,%xmm4
+ movd 108(%edi),%xmm7
+ pmuludq (%esp),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%esp),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%esp),%xmm5
+ movd 60(%edi),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%esp),%xmm6
+ paddq %xmm5,%xmm0
+ movd 124(%edi),%xmm5
+ pmuludq (%esp),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%esp),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%esp),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%esp),%xmm7
+ movd 76(%edi),%xmm5
+ paddq %xmm6,%xmm1
+ movd 140(%edi),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%esp),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%esp),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ subl $16,%ecx
+ jz .L013done
+.L012even:
+ leal 384(%esp),%edx
+ leal -32(%esi),%eax
+ subl $64,%ecx
+ movdqu (%edi),%xmm5
+ pshufd $68,%xmm5,%xmm6
+ cmovbl %eax,%esi
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,(%edx)
+ leal 160(%esp),%eax
+ movdqu 16(%edi),%xmm6
+ movdqa %xmm5,-144(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,16(%edx)
+ movdqu 32(%edi),%xmm5
+ movdqa %xmm6,-128(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,32(%edx)
+ movdqu 48(%edi),%xmm6
+ movdqa %xmm5,-112(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,48(%edx)
+ movdqu 64(%edi),%xmm5
+ movdqa %xmm6,-96(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,64(%edx)
+ movdqu 80(%edi),%xmm6
+ movdqa %xmm5,-80(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,80(%edx)
+ movdqu 96(%edi),%xmm5
+ movdqa %xmm6,-64(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,96(%edx)
+ movdqu 112(%edi),%xmm6
+ movdqa %xmm5,-48(%edx)
+ pshufd $68,%xmm6,%xmm5
+ pshufd $238,%xmm6,%xmm6
+ movdqa %xmm5,112(%edx)
+ movdqu 128(%edi),%xmm5
+ movdqa %xmm6,-32(%edx)
+ pshufd $68,%xmm5,%xmm6
+ pshufd $238,%xmm5,%xmm5
+ movdqa %xmm6,128(%edx)
+ movdqa %xmm5,-16(%edx)
+ movdqu 32(%esi),%xmm5
+ movdqu 48(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,112(%esp)
+ movdqa %xmm3,128(%esp)
+ movdqa %xmm4,144(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ movdqa %xmm0,80(%esp)
+ movdqa %xmm1,96(%esp)
+ jbe .L014skip_loop
+ jmp .L015loop
+.align 32
+.L015loop:
+ movdqa -144(%edx),%xmm7
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ movdqa %xmm5,%xmm1
+ pmuludq %xmm7,%xmm5
+ movdqa %xmm6,%xmm0
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ pmuludq %xmm7,%xmm3
+ pmuludq %xmm7,%xmm4
+ pmuludq -16(%edx),%xmm0
+ movdqa %xmm1,%xmm7
+ pmuludq -128(%edx),%xmm1
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq -112(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa %xmm5,%xmm6
+ pmuludq -96(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 16(%eax),%xmm7
+ pmuludq -80(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq -128(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq -112(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 32(%eax),%xmm7
+ pmuludq -96(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq -32(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq -16(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq -128(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ movdqa 48(%eax),%xmm5
+ pmuludq -112(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq -48(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa %xmm6,%xmm7
+ pmuludq -32(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq -16(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa 64(%eax),%xmm6
+ pmuludq -128(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa %xmm6,%xmm7
+ pmuludq -16(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq -64(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq -48(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pmuludq -32(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqu -32(%esi),%xmm5
+ movdqu -16(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ leal -32(%esi),%eax
+ subl $64,%ecx
+ paddd 80(%esp),%xmm5
+ paddd 96(%esp),%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+ cmovbl %eax,%esi
+ leal 160(%esp),%eax
+ movdqa (%edx),%xmm7
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ movdqa %xmm5,%xmm1
+ pmuludq %xmm7,%xmm5
+ paddq %xmm0,%xmm5
+ movdqa %xmm6,%xmm0
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ pmuludq %xmm7,%xmm3
+ pmuludq %xmm7,%xmm4
+ paddq 16(%esp),%xmm6
+ paddq 32(%esp),%xmm2
+ paddq 48(%esp),%xmm3
+ paddq 64(%esp),%xmm4
+ pmuludq 128(%edx),%xmm0
+ movdqa %xmm1,%xmm7
+ pmuludq 16(%edx),%xmm1
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 16(%eax),%xmm7
+ pmuludq 64(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa 32(%eax),%xmm7
+ pmuludq 48(%edx),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 112(%edx),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 128(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ movdqa 48(%eax),%xmm5
+ pmuludq 32(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 96(%edx),%xmm5
+ paddq %xmm7,%xmm4
+ movdqa %xmm6,%xmm7
+ pmuludq 112(%edx),%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm7,%xmm5
+ pmuludq 128(%edx),%xmm7
+ paddq %xmm6,%xmm1
+ movdqa 64(%eax),%xmm6
+ pmuludq 16(%edx),%xmm5
+ paddq %xmm7,%xmm2
+ movdqa %xmm6,%xmm7
+ pmuludq 128(%edx),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 80(%edx),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 96(%edx),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pmuludq 112(%edx),%xmm6
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+ movdqu 32(%esi),%xmm5
+ movdqu 48(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,112(%esp)
+ movdqa %xmm3,128(%esp)
+ movdqa %xmm4,144(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ movdqa %xmm0,80(%esp)
+ movdqa %xmm1,96(%esp)
+ ja .L015loop
+.L014skip_loop:
+ pshufd $16,-144(%edx),%xmm7
+ addl $32,%ecx
+ jnz .L016long_tail
+ paddd %xmm0,%xmm5
+ paddd %xmm1,%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+.L016long_tail:
+ movdqa %xmm5,(%eax)
+ movdqa %xmm6,16(%eax)
+ movdqa %xmm2,32(%eax)
+ movdqa %xmm3,48(%eax)
+ movdqa %xmm4,64(%eax)
+ pmuludq %xmm7,%xmm5
+ pmuludq %xmm7,%xmm6
+ pmuludq %xmm7,%xmm2
+ movdqa %xmm5,%xmm0
+ pshufd $16,-128(%edx),%xmm5
+ pmuludq %xmm7,%xmm3
+ movdqa %xmm6,%xmm1
+ pmuludq %xmm7,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%eax),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%eax),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%eax),%xmm7
+ paddq %xmm6,%xmm3
+ pshufd $16,-64(%edx),%xmm6
+ pmuludq (%eax),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%eax),%xmm6
+ pshufd $16,-112(%edx),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%eax),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%eax),%xmm5
+ paddq %xmm7,%xmm4
+ pshufd $16,-48(%edx),%xmm7
+ pmuludq (%eax),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%eax),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%eax),%xmm5
+ pshufd $16,-96(%edx),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%eax),%xmm6
+ paddq %xmm5,%xmm0
+ pshufd $16,-32(%edx),%xmm5
+ pmuludq (%eax),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%eax),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%eax),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%eax),%xmm7
+ pshufd $16,-80(%edx),%xmm5
+ paddq %xmm6,%xmm1
+ pshufd $16,-16(%edx),%xmm6
+ pmuludq (%eax),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%eax),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%eax),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%eax),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%eax),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+ jz .L017short_tail
+ movdqu -32(%esi),%xmm5
+ movdqu -16(%esi),%xmm6
+ leal 32(%esi),%esi
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movdqa %xmm4,64(%esp)
+ movdqa %xmm5,%xmm2
+ movdqa %xmm6,%xmm3
+ psrldq $6,%xmm2
+ psrldq $6,%xmm3
+ movdqa %xmm5,%xmm4
+ punpcklqdq %xmm3,%xmm2
+ punpckhqdq %xmm6,%xmm4
+ punpcklqdq %xmm6,%xmm5
+ movdqa %xmm2,%xmm3
+ psrlq $4,%xmm2
+ psrlq $30,%xmm3
+ movdqa %xmm5,%xmm6
+ psrlq $40,%xmm4
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm6
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm3
+ por (%ebx),%xmm4
+ pshufd $16,(%edx),%xmm7
+ paddd 80(%esp),%xmm5
+ paddd 96(%esp),%xmm6
+ paddd 112(%esp),%xmm2
+ paddd 128(%esp),%xmm3
+ paddd 144(%esp),%xmm4
+ movdqa %xmm5,(%esp)
+ pmuludq %xmm7,%xmm5
+ movdqa %xmm6,16(%esp)
+ pmuludq %xmm7,%xmm6
+ paddq %xmm5,%xmm0
+ movdqa %xmm2,%xmm5
+ pmuludq %xmm7,%xmm2
+ paddq %xmm6,%xmm1
+ movdqa %xmm3,%xmm6
+ pmuludq %xmm7,%xmm3
+ paddq 32(%esp),%xmm2
+ movdqa %xmm5,32(%esp)
+ pshufd $16,16(%edx),%xmm5
+ paddq 48(%esp),%xmm3
+ movdqa %xmm6,48(%esp)
+ movdqa %xmm4,%xmm6
+ pmuludq %xmm7,%xmm4
+ paddq 64(%esp),%xmm4
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm5,%xmm6
+ pmuludq 48(%esp),%xmm5
+ movdqa %xmm6,%xmm7
+ pmuludq 32(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ pshufd $16,80(%edx),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm2
+ pmuludq 64(%esp),%xmm6
+ pshufd $16,32(%edx),%xmm7
+ paddq %xmm5,%xmm1
+ movdqa %xmm7,%xmm5
+ pmuludq 32(%esp),%xmm7
+ paddq %xmm6,%xmm0
+ movdqa %xmm5,%xmm6
+ pmuludq 16(%esp),%xmm5
+ paddq %xmm7,%xmm4
+ pshufd $16,96(%edx),%xmm7
+ pmuludq (%esp),%xmm6
+ paddq %xmm5,%xmm3
+ movdqa %xmm7,%xmm5
+ pmuludq 64(%esp),%xmm7
+ paddq %xmm6,%xmm2
+ pmuludq 48(%esp),%xmm5
+ pshufd $16,48(%edx),%xmm6
+ paddq %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ pmuludq 16(%esp),%xmm6
+ paddq %xmm5,%xmm0
+ pshufd $16,112(%edx),%xmm5
+ pmuludq (%esp),%xmm7
+ paddq %xmm6,%xmm4
+ movdqa %xmm5,%xmm6
+ pmuludq 64(%esp),%xmm5
+ paddq %xmm7,%xmm3
+ movdqa %xmm6,%xmm7
+ pmuludq 48(%esp),%xmm6
+ paddq %xmm5,%xmm2
+ pmuludq 32(%esp),%xmm7
+ pshufd $16,64(%edx),%xmm5
+ paddq %xmm6,%xmm1
+ pshufd $16,128(%edx),%xmm6
+ pmuludq (%esp),%xmm5
+ paddq %xmm7,%xmm0
+ movdqa %xmm6,%xmm7
+ pmuludq 64(%esp),%xmm6
+ paddq %xmm5,%xmm4
+ movdqa %xmm7,%xmm5
+ pmuludq 16(%esp),%xmm7
+ paddq %xmm6,%xmm3
+ movdqa %xmm5,%xmm6
+ pmuludq 32(%esp),%xmm5
+ paddq %xmm7,%xmm0
+ pmuludq 48(%esp),%xmm6
+ movdqa 64(%ebx),%xmm7
+ paddq %xmm5,%xmm1
+ paddq %xmm6,%xmm2
+.L017short_tail:
+ pshufd $78,%xmm4,%xmm6
+ pshufd $78,%xmm3,%xmm5
+ paddq %xmm6,%xmm4
+ paddq %xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm6
+ pshufd $78,%xmm1,%xmm5
+ paddq %xmm6,%xmm0
+ paddq %xmm5,%xmm1
+ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm3,%xmm5
+ pand %xmm7,%xmm3
+ psrlq $26,%xmm5
+ paddq %xmm6,%xmm2
+ paddq %xmm4,%xmm5
+ movdqa %xmm0,%xmm6
+ pand %xmm7,%xmm0
+ psrlq $26,%xmm6
+ movdqa %xmm5,%xmm4
+ paddq %xmm1,%xmm6
+ psrlq $26,%xmm5
+ pand %xmm7,%xmm4
+ movdqa %xmm6,%xmm1
+ psrlq $26,%xmm6
+ paddd %xmm5,%xmm0
+ psllq $2,%xmm5
+ paddq %xmm2,%xmm6
+ paddq %xmm0,%xmm5
+ pand %xmm7,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm2
+ paddd %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $26,%xmm5
+ movdqa %xmm6,%xmm3
+ psrlq $26,%xmm6
+ pand %xmm7,%xmm0
+ paddd %xmm5,%xmm1
+ pand %xmm7,%xmm3
+ paddd %xmm6,%xmm4
+.L013done:
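+	/* write the five 26-bit accumulator limbs back into the context
+	 * (%edi was advanced by 48, so -48..-32 address offsets 0..16) */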
+ movd %xmm0,-48(%edi)
+ movd %xmm1,-44(%edi)
+ movd %xmm2,-40(%edi)
+ movd %xmm3,-36(%edi)
+ movd %xmm4,-32(%edi)
+ movl %ebp,%esp
+.L007nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
+.align 32
+.type _poly1305_emit_sse2,@function
+.align 16
+_poly1305_emit_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%ebp
+ cmpl $0,20(%ebp)
+ je .Lenter_emit
+ movl (%ebp),%eax
+ movl 4(%ebp),%edi
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+ movl 16(%ebp),%esi
+ movl %edi,%ebx
+ shll $26,%edi
+ shrl $6,%ebx
+ addl %edi,%eax
+ movl %ecx,%edi
+ adcl $0,%ebx
+ shll $20,%edi
+ shrl $12,%ecx
+ addl %edi,%ebx
+ movl %edx,%edi
+ adcl $0,%ecx
+ shll $14,%edi
+ shrl $18,%edx
+ addl %edi,%ecx
+ movl %esi,%edi
+ adcl $0,%edx
+ shll $8,%edi
+ shrl $24,%esi
+ addl %edi,%edx
+ adcl $0,%esi
+ movl %esi,%edi
+ andl $3,%esi
+ shrl $2,%edi
+ leal (%edi,%edi,4),%ebp
+ movl 24(%esp),%edi
+ addl %ebp,%eax
+ movl 28(%esp),%ebp
+ adcl $0,%ebx
+ adcl $0,%ecx
+ adcl $0,%edx
+ adcl $0,%esi
+ movd %eax,%xmm0
+ addl $5,%eax
+ movd %ebx,%xmm1
+ adcl $0,%ebx
+ movd %ecx,%xmm2
+ adcl $0,%ecx
+ movd %edx,%xmm3
+ adcl $0,%edx
+ adcl $0,%esi
+ shrl $2,%esi
+ negl %esi
+ andl %esi,%eax
+ andl %esi,%ebx
+ andl %esi,%ecx
+ andl %esi,%edx
+ movl %eax,(%edi)
+ movd %xmm0,%eax
+ movl %ebx,4(%edi)
+ movd %xmm1,%ebx
+ movl %ecx,8(%edi)
+ movd %xmm2,%ecx
+ movl %edx,12(%edi)
+ movd %xmm3,%edx
+ notl %esi
+ andl %esi,%eax
+ andl %esi,%ebx
+ orl (%edi),%eax
+ andl %esi,%ecx
+ orl 4(%edi),%ebx
+ andl %esi,%edx
+ orl 8(%edi),%ecx
+ orl 12(%edi),%edx
+ addl (%ebp),%eax
+ adcl 4(%ebp),%ebx
+ movl %eax,(%edi)
+ adcl 8(%ebp),%ecx
+ movl %ebx,4(%edi)
+ adcl 12(%ebp),%edx
+ movl %ecx,8(%edi)
+ movl %edx,12(%edi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
+.align 32
+.type _poly1305_init_avx2,@function
+.align 16
+_poly1305_init_avx2:
+ vmovdqu 24(%edi),%xmm4
+ leal 48(%edi),%edi
+ movl %esp,%ebp
+ subl $224,%esp
+ andl $-16,%esp
+ vmovdqa 64(%ebx),%xmm7
+ vpand %xmm7,%xmm4,%xmm0
+ vpsrlq $26,%xmm4,%xmm1
+ vpsrldq $6,%xmm4,%xmm3
+ vpand %xmm7,%xmm1,%xmm1
+ vpsrlq $4,%xmm3,%xmm2
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm7,%xmm2,%xmm2
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrldq $13,%xmm4,%xmm4
+ leal 144(%esp),%edx
+ movl $2,%ecx
+.L018square:
+ vmovdqa %xmm0,(%esp)
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ vmovdqa %xmm4,64(%esp)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqa %xmm6,80(%esp)
+ vmovdqa %xmm5,96(%esp)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqa %xmm6,112(%esp)
+ vmovdqa %xmm5,128(%esp)
+ vpshufd $68,%xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vpshufd $68,%xmm1,%xmm1
+ vpshufd $68,%xmm2,%xmm2
+ vpshufd $68,%xmm3,%xmm3
+ vpshufd $68,%xmm4,%xmm4
+ vmovdqa %xmm5,(%edx)
+ vmovdqa %xmm1,16(%edx)
+ vmovdqa %xmm2,32(%edx)
+ vmovdqa %xmm3,48(%edx)
+ vmovdqa %xmm4,64(%edx)
+ vpmuludq %xmm0,%xmm4,%xmm4
+ vpmuludq %xmm0,%xmm3,%xmm3
+ vpmuludq %xmm0,%xmm2,%xmm2
+ vpmuludq %xmm0,%xmm1,%xmm1
+ vpmuludq %xmm0,%xmm5,%xmm0
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpmuludq 32(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vmovdqa 80(%esp),%xmm7
+ vpmuludq (%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 32(%esp),%xmm5
+ vpmuludq 64(%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm4,%xmm4
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vmovdqa 96(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm1,%xmm1
+ vmovdqa 48(%esp),%xmm5
+ vpmuludq 48(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vmovdqa 112(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm3,%xmm3
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm2,%xmm2
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm1,%xmm1
+ vmovdqa 64(%esp),%xmm7
+ vpmuludq 32(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vmovdqa 128(%esp),%xmm5
+ vpmuludq (%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vpmuludq 64(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm7
+ vpmuludq 48(%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpsrlq $26,%xmm3,%xmm5
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrlq $26,%xmm0,%xmm6
+ vpand %xmm7,%xmm0,%xmm0
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm6,%xmm1,%xmm1
+ vpsrlq $26,%xmm4,%xmm5
+ vpand %xmm7,%xmm4,%xmm4
+ vpsrlq $26,%xmm1,%xmm6
+ vpand %xmm7,%xmm1,%xmm1
+ vpaddq %xmm6,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpsllq $2,%xmm5,%xmm5
+ vpsrlq $26,%xmm2,%xmm6
+ vpand %xmm7,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0
+ vpaddd %xmm6,%xmm3,%xmm3
+ vpsrlq $26,%xmm3,%xmm6
+ vpsrlq $26,%xmm0,%xmm5
+ vpand %xmm7,%xmm0,%xmm0
+ vpand %xmm7,%xmm3,%xmm3
+ vpaddd %xmm5,%xmm1,%xmm1
+ vpaddd %xmm6,%xmm4,%xmm4
+ decl %ecx
+ jz .L019square_break
+ vpunpcklqdq (%esp),%xmm0,%xmm0
+ vpunpcklqdq 16(%esp),%xmm1,%xmm1
+ vpunpcklqdq 32(%esp),%xmm2,%xmm2
+ vpunpcklqdq 48(%esp),%xmm3,%xmm3
+ vpunpcklqdq 64(%esp),%xmm4,%xmm4
+ jmp .L018square
+.L019square_break:
+ vpsllq $32,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm1
+ vpsllq $32,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm3
+ vpsllq $32,%xmm4,%xmm4
+ vpor (%esp),%xmm0,%xmm0
+ vpor 16(%esp),%xmm1,%xmm1
+ vpor 32(%esp),%xmm2,%xmm2
+ vpor 48(%esp),%xmm3,%xmm3
+ vpor 64(%esp),%xmm4,%xmm4
+ vpshufd $141,%xmm0,%xmm0
+ vpshufd $141,%xmm1,%xmm1
+ vpshufd $141,%xmm2,%xmm2
+ vpshufd $141,%xmm3,%xmm3
+ vpshufd $141,%xmm4,%xmm4
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ vmovdqu %xmm4,64(%edi)
+ vpslld $2,%xmm1,%xmm6
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqu %xmm6,80(%edi)
+ vmovdqu %xmm5,96(%edi)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm6,112(%edi)
+ vmovdqu %xmm5,128(%edi)
+ movl %ebp,%esp
+ leal -48(%edi),%edi
+ ret
+.size _poly1305_init_avx2,.-_poly1305_init_avx2
+.align 32
+.type _poly1305_blocks_avx2,@function
+.align 16
+_poly1305_blocks_avx2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 20(%edi),%eax
+ andl $-16,%ecx
+ jz .L020nodata
+ cmpl $64,%ecx
+ jae .L021enter_avx2
+ testl %eax,%eax
+ jz .Lenter_blocks
+.L021enter_avx2:
+ vzeroupper
+ call .L022pic_point
+.L022pic_point:
+ popl %ebx
+ leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
+ testl %eax,%eax
+ jnz .L023base2_26
+ call _poly1305_init_avx2
+ movl (%edi),%eax
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ shrl $2,%ecx
+ andl $67108863,%eax
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movl %eax,(%edi)
+ movl %ecx,4(%edi)
+ movl %edx,8(%edi)
+ movl %esi,12(%edi)
+ movl %ebp,16(%edi)
+ movl $1,20(%edi)
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+.L023base2_26:
+ movl 32(%esp),%eax
+ movl %esp,%ebp
+ subl $448,%esp
+ andl $-512,%esp
+ vmovdqu 48(%edi),%xmm0
+ leal 288(%esp),%edx
+ vmovdqu 64(%edi),%xmm1
+ vmovdqu 80(%edi),%xmm2
+ vmovdqu 96(%edi),%xmm3
+ vmovdqu 112(%edi),%xmm4
+ leal 48(%edi),%edi
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpermq $64,%ymm4,%ymm4
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vpshufd $200,%ymm4,%ymm4
+ vmovdqa %ymm0,-128(%edx)
+ vmovdqu 80(%edi),%xmm0
+ vmovdqa %ymm1,-96(%edx)
+ vmovdqu 96(%edi),%xmm1
+ vmovdqa %ymm2,-64(%edx)
+ vmovdqu 112(%edi),%xmm2
+ vmovdqa %ymm3,-32(%edx)
+ vmovdqu 128(%edi),%xmm3
+ vmovdqa %ymm4,(%edx)
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vmovdqa %ymm0,32(%edx)
+ vmovd -48(%edi),%xmm0
+ vmovdqa %ymm1,64(%edx)
+ vmovd -44(%edi),%xmm1
+ vmovdqa %ymm2,96(%edx)
+ vmovd -40(%edi),%xmm2
+ vmovdqa %ymm3,128(%edx)
+ vmovd -36(%edi),%xmm3
+ vmovd -32(%edi),%xmm4
+ vmovdqa 64(%ebx),%ymm7
+ negl %eax
+ testl $63,%ecx
+ jz .L024even
+ movl %ecx,%edx
+ andl $-64,%ecx
+ andl $63,%edx
+ vmovdqu (%esi),%xmm5
+ cmpl $32,%edx
+ jb .L025one
+ vmovdqu 16(%esi),%xmm6
+ je .L026two
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ leal 48(%esi),%esi
+ leal 8(%ebx),%ebx
+ leal 296(%esp),%edx
+ jmp .L027tail
+.L026two:
+ leal 32(%esi),%esi
+ leal 16(%ebx),%ebx
+ leal 304(%esp),%edx
+ jmp .L027tail
+.L025one:
+ leal 16(%esi),%esi
+ vpxor %ymm6,%ymm6,%ymm6
+ leal 32(%ebx,%eax,8),%ebx
+ leal 312(%esp),%edx
+ jmp .L027tail
+.align 32
+.L024even:
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jz .L027tail
+.L028loop:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -64(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 96(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 128(%edx),%ymm2,%ymm1
+ vpmuludq -128(%edx),%ymm2,%ymm2
+ vpmuludq -32(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq (%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -96(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -64(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -64(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -32(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 128(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -128(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -128(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 64(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 128(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 32(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 64(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 96(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ vmovdqu (%esi),%xmm5
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jnz .L028loop
+.L027tail:
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ andl $-64,%ebx
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm2,%ymm3
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -60(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 100(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 132(%edx),%ymm2,%ymm1
+ vpmuludq -124(%edx),%ymm2,%ymm2
+ vpmuludq -28(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 4(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -92(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -60(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -60(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -28(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 132(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -124(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -124(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -92(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 68(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 100(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 132(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 132(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 36(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7
+ vpmuludq 68(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 100(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrldq $8,%ymm4,%ymm5
+ vpsrldq $8,%ymm3,%ymm6
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpsrldq $8,%ymm0,%ymm5
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrldq $8,%ymm1,%ymm6
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsrldq $8,%ymm2,%ymm5
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpermq $2,%ymm4,%ymm6
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpermq $2,%ymm3,%ymm5
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpermq $2,%ymm0,%ymm6
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpermq $2,%ymm1,%ymm5
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpermq $2,%ymm2,%ymm6
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ cmpl $0,%ecx
+ je .L029done
+ vpshufd $252,%xmm0,%xmm0
+ leal 288(%esp),%edx
+ vpshufd $252,%xmm1,%xmm1
+ vpshufd $252,%xmm2,%xmm2
+ vpshufd $252,%xmm3,%xmm3
+ vpshufd $252,%xmm4,%xmm4
+ jmp .L024even
+.align 16
+.L029done:
+ vmovd %xmm0,-48(%edi)
+ vmovd %xmm1,-44(%edi)
+ vmovd %xmm2,-40(%edi)
+ vmovd %xmm3,-36(%edi)
+ vmovd %xmm4,-32(%edi)
+ vzeroupper
+ movl %ebp,%esp
+.L020nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
+.align 64
+.Lconst_sse2:
+.long 16777216,0,16777216,0,16777216,0,16777216,0
+.long 0,0,0,0,0,0,0,0
+.long 67108863,0,67108863,0,67108863,0,67108863,0
+.long 268435455,268435452,268435452,268435452
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
.align 4
+.comm OPENSSL_ia32cap_P,16,4
#endif
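
Both halves of the poly1305-x86.S addition (the PIC and non-PIC branches) vectorize the same arithmetic: the 130-bit accumulator and key are held as five 26-bit limbs so that each pmuludq/vpmuludq product fits in 64 bits with headroom for lazy carries. As a rough scalar reference for what one block step computes, here is a minimal C sketch, assuming a little-endian host; the function name and limb layout are illustrative, not OpenSSL's internal API.

#include <stdint.h>
#include <string.h>

/* One Poly1305 block step, h = (h + m + padbit*2^128) * r mod 2^130-5,
 * with h and r held as five 26-bit limbs.  Illustrative sketch only. */
static void poly1305_block26(uint32_t h[5], const uint32_t r[5],
                             const uint8_t m[16], uint32_t padbit)
{
    const uint32_t M26 = 0x3ffffff;     /* the 67108863 mask in .Lconst_sse2 */
    uint32_t t[4];
    memcpy(t, m, 16);                   /* assumes a little-endian host */

    /* split the 128-bit block into 26-bit limbs and add into h */
    h[0] += ( t[0]                       ) & M26;
    h[1] += ((t[0] >> 26) | (t[1] <<  6)) & M26;
    h[2] += ((t[1] >> 20) | (t[2] << 12)) & M26;
    h[3] += ((t[2] >> 14) | (t[3] << 18)) & M26;
    h[4] += ( t[3] >>  8) | (padbit << 24);    /* 1<<24 = 16777216 above */

    /* schoolbook multiply mod 2^130-5: terms that wrap past 2^130 pick
     * up a factor of 5, precomputed as s[i] = r[i]*5 */
    uint32_t s1 = r[1]*5, s2 = r[2]*5, s3 = r[3]*5, s4 = r[4]*5;
    uint64_t d0 = (uint64_t)h[0]*r[0] + (uint64_t)h[1]*s4 +
                  (uint64_t)h[2]*s3   + (uint64_t)h[3]*s2 + (uint64_t)h[4]*s1;
    uint64_t d1 = (uint64_t)h[0]*r[1] + (uint64_t)h[1]*r[0] +
                  (uint64_t)h[2]*s4   + (uint64_t)h[3]*s3 + (uint64_t)h[4]*s2;
    uint64_t d2 = (uint64_t)h[0]*r[2] + (uint64_t)h[1]*r[1] +
                  (uint64_t)h[2]*r[0] + (uint64_t)h[3]*s4 + (uint64_t)h[4]*s3;
    uint64_t d3 = (uint64_t)h[0]*r[3] + (uint64_t)h[1]*r[2] +
                  (uint64_t)h[2]*r[1] + (uint64_t)h[3]*r[0] + (uint64_t)h[4]*s4;
    uint64_t d4 = (uint64_t)h[0]*r[4] + (uint64_t)h[1]*r[3] +
                  (uint64_t)h[2]*r[2] + (uint64_t)h[3]*r[1] + (uint64_t)h[4]*r[0];

    /* lazy reduction: one carry pass; the bit-130 overflow re-enters
     * limb 0 multiplied by 5 (the psllq $2 + paddq idiom in the .S) */
    d1 += d0 >> 26; d0 &= M26;
    d2 += d1 >> 26; d1 &= M26;
    d3 += d2 >> 26; d2 &= M26;
    d4 += d3 >> 26; d3 &= M26;
    d0 += (d4 >> 26) * 5; d4 &= M26;
    d1 += d0 >> 26; d0 &= M26;  /* d1 may keep a small excess; it is
                                 * absorbed on the next block */

    h[0] = (uint32_t)d0; h[1] = (uint32_t)d1; h[2] = (uint32_t)d2;
    h[3] = (uint32_t)d3; h[4] = (uint32_t)d4;
}

The SIMD paths keep two (SSE2) or four (AVX2) such accumulators in flight, one per lane, multiplying by precomputed powers of r so the independent lanes can be folded together at the end; that fold is the pshufd $78 / vpermq $2 + vpsrldq $8 sequences in the tail code above.
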
diff --git a/secure/lib/libcrypto/i386/sha1-586.S b/secure/lib/libcrypto/i386/sha1-586.S
index 797f2dd9c908..7e90e2d9b1d2 100644
--- a/secure/lib/libcrypto/i386/sha1-586.S
+++ b/secure/lib/libcrypto/i386/sha1-586.S
@@ -11,6 +11,28 @@ sha1_block_data_order:
pushl %ebx
pushl %esi
pushl %edi
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal OPENSSL_ia32cap_P-.L000pic_point(%ebp),%esi
+ leal .LK_XX_XX-.L000pic_point(%ebp),%ebp
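+	/* capability dispatch: word 1 bit 9 = SSSE3, word 0 bit 24 = FXSR,
+	 * word 2 bit 29 = SHA extensions (shaext shortcut); the final
+	 * 0x50000000 compare pairs the AVX bit from word 1 with what
+	 * appears to be OpenSSL's synthetic Intel-CPU bit in word 0 before
+	 * taking the AVX path, otherwise SSSE3 */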
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ testl $512,%edx
+ jz .L001x86
+ movl 8(%esi),%ecx
+ testl $16777216,%eax
+ jz .L001x86
+ testl $536870912,%ecx
+ jnz .Lshaext_shortcut
+ andl $268435456,%edx
+ andl $1073741824,%eax
+ orl %edx,%eax
+ cmpl $1342177280,%eax
+ je .Lavx_shortcut
+ jmp .Lssse3_shortcut
+.align 16
+.L001x86:
movl 20(%esp),%ebp
movl 24(%esp),%esi
movl 28(%esp),%eax
@@ -19,9 +41,9 @@ sha1_block_data_order:
addl %esi,%eax
movl %eax,104(%esp)
movl 16(%ebp),%edi
- jmp .L000loop
+ jmp .L002loop
.align 16
-.L000loop:
+.L002loop:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -1368,7 +1390,7 @@ sha1_block_data_order:
movl %ebx,12(%ebp)
movl %edx,%esi
movl %ecx,16(%ebp)
- jb .L000loop
+ jb .L002loop
addl $76,%esp
popl %edi
popl %esi
@@ -1376,10 +1398,2578 @@ sha1_block_data_order:
popl %ebp
ret
.size sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.type _sha1_block_data_order_shaext,@function
+.align 16
+_sha1_block_data_order_shaext:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L003pic_point
+.L003pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lshaext_shortcut:
+ movl 20(%esp),%edi
+ movl %esp,%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ subl $32,%esp
+ movdqu (%edi),%xmm0
+ movd 16(%edi),%xmm1
+ andl $-32,%esp
+ movdqa 80(%ebp),%xmm3
+ movdqu (%esi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%esi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%esi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%esi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+.byte 102,15,56,0,251
+ jmp .L004loop_shaext
+.align 16
+.L004loop_shaext:
+ decl %ecx
+ leal 64(%esi),%eax
+ movdqa %xmm1,(%esp)
+ paddd %xmm4,%xmm1
+ cmovnel %eax,%esi
+ movdqa %xmm0,16(%esp)
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%esi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%esi),%xmm5
+.byte 102,15,56,0,227
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%esi),%xmm6
+.byte 102,15,56,0,235
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%esi),%xmm7
+.byte 102,15,56,0,243
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+ movdqa (%esp),%xmm2
+.byte 102,15,56,0,251
+.byte 15,56,200,202
+ paddd 16(%esp),%xmm0
+ jnz .L004loop_shaext
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%edi)
+ movd %xmm1,16(%edi)
+ movl %ebx,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext
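
The .byte runs in _sha1_block_data_order_shaext are hand-encoded instructions, emitted as raw opcode bytes so the file still assembles on toolchains that predate the mnemonics: 15,58,204 is sha1rnds4 (the trailing immediate 0-3 selects the round group), 15,56,200 is sha1nexte, 15,56,201 sha1msg1, 15,56,202 sha1msg2, and 102,15,56,0 is pshufb against the byte-swap mask at 80(%ebp). Each repeated group advances the message schedule and retires four rounds. A loose intrinsics sketch of one such group (sha1_step is a made-up helper; the real loop double-buffers the state in %xmm1/%xmm2, also runs sha1msg2 and pxor, and steps the sha1rnds4 immediate):

    #include <immintrin.h>      /* compile with -msha */

    /* One schedule-advance plus four SHA-1 rounds, approximating the
     * recurring .byte pattern above. */
    static void sha1_step(__m128i *abcd, __m128i *e, __m128i *w0,
                          const __m128i w1)
    {
        __m128i e_w = _mm_sha1nexte_epu32(*e, *w0);  /* 15,56,200 */
        *e = *abcd;                                  /* movdqa save */
        *w0 = _mm_sha1msg1_epu32(*w0, w1);           /* 15,56,201 */
        *abcd = _mm_sha1rnds4_epu32(*abcd, e_w, 0);  /* 15,58,204,..,0 */
    }
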
+.type _sha1_block_data_order_ssse3,@function
+.align 16
+_sha1_block_data_order_ssse3:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L005pic_point
+.L005pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L005pic_point(%ebp),%ebp
+.Lssse3_shortcut:
+ movdqa (%ebp),%xmm7
+ movdqa 16(%ebp),%xmm0
+ movdqa 32(%ebp),%xmm1
+ movdqa 48(%ebp),%xmm2
+ movdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ movdqa %xmm0,112(%esp)
+ movdqa %xmm1,128(%esp)
+ movdqa %xmm2,144(%esp)
+ shll $6,%edx
+ movdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ movdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ movdqu -64(%ebp),%xmm0
+ movdqu -48(%ebp),%xmm1
+ movdqu -32(%ebp),%xmm2
+ movdqu -16(%ebp),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ movdqa %xmm7,96(%esp)
+.byte 102,15,56,0,222
+ paddd %xmm7,%xmm0
+ paddd %xmm7,%xmm1
+ paddd %xmm7,%xmm2
+ movdqa %xmm0,(%esp)
+ psubd %xmm7,%xmm0
+ movdqa %xmm1,16(%esp)
+ psubd %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ xorl %edx,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebp,%esi
+ jmp .L006loop
+.align 16
+.L006loop:
+ rorl $2,%ebx
+ xorl %edx,%esi
+ movl %eax,%ebp
+ punpcklqdq %xmm1,%xmm4
+ movdqa %xmm3,%xmm6
+ addl (%esp),%edi
+ xorl %ecx,%ebx
+ paddd %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+ roll $5,%eax
+ addl %esi,%edi
+ psrldq $4,%xmm6
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm2,%xmm6
+ xorl %ecx,%ebp
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ pxor %xmm6,%xmm4
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ movdqa %xmm4,%xmm6
+ xorl %ebx,%esi
+ pslldq $12,%xmm0
+ paddd %xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ psrld $31,%xmm6
+ xorl %eax,%edi
+ roll $5,%edx
+ movdqa %xmm0,%xmm7
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ psrld $30,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm6,%xmm4
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ pslld $2,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ pxor %xmm0,%xmm4
+ movdqa 96(%esp),%xmm0
+ addl %ebp,%ebx
+ andl %edx,%esi
+ pxor %xmm7,%xmm4
+ pshufd $238,%xmm1,%xmm5
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ movl %ebx,%ebp
+ punpcklqdq %xmm2,%xmm5
+ movdqa %xmm4,%xmm7
+ addl 16(%esp),%eax
+ xorl %edx,%ecx
+ paddd %xmm4,%xmm0
+ movdqa %xmm1,80(%esp)
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm7
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm7
+ xorl %edx,%ebp
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ pxor %xmm7,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm0,(%esp)
+ addl %ebp,%edi
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ movdqa %xmm5,%xmm7
+ xorl %ecx,%esi
+ pslldq $12,%xmm1
+ paddd %xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ psrld $31,%xmm7
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ psrld $30,%xmm1
+ addl %edi,%edx
+ rorl $7,%edi
+ por %xmm7,%xmm5
+ xorl %ebx,%ebp
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ pslld $2,%xmm0
+ xorl %eax,%edi
+ roll $5,%edx
+ pxor %xmm1,%xmm5
+ movdqa 112(%esp),%xmm1
+ addl %ebp,%ecx
+ andl %edi,%esi
+ pxor %xmm0,%xmm5
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ punpcklqdq %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ addl 32(%esp),%ebx
+ xorl %edi,%edx
+ paddd %xmm5,%xmm1
+ movdqa %xmm2,96(%esp)
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm0
+ andl %edx,%ebp
+ xorl %edi,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm0
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ pxor %xmm0,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm1,16(%esp)
+ addl %ebp,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm0
+ xorl %edx,%esi
+ pslldq $12,%xmm2
+ paddd %xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ psrld $31,%xmm0
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm2,%xmm1
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ psrld $30,%xmm2
+ addl %eax,%edi
+ rorl $7,%eax
+ por %xmm0,%xmm6
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm0
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ pslld $2,%xmm1
+ xorl %ebx,%eax
+ roll $5,%edi
+ pxor %xmm2,%xmm6
+ movdqa 112(%esp),%xmm2
+ addl %ebp,%edx
+ andl %eax,%esi
+ pxor %xmm1,%xmm6
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%esi
+ movl %edx,%ebp
+ punpcklqdq %xmm4,%xmm7
+ movdqa %xmm6,%xmm1
+ addl 48(%esp),%ecx
+ xorl %eax,%edi
+ paddd %xmm6,%xmm2
+ movdqa %xmm3,64(%esp)
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm1
+ andl %edi,%ebp
+ xorl %eax,%edi
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm1
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ pxor %xmm1,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ movdqa %xmm2,32(%esp)
+ addl %ebp,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm1
+ xorl %edi,%esi
+ pslldq $12,%xmm3
+ paddd %xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ psrld $31,%xmm1
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm3,%xmm2
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ psrld $30,%xmm3
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm1,%xmm7
+ xorl %edx,%ebp
+ movdqa 80(%esp),%xmm1
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ pslld $2,%xmm2
+ xorl %ecx,%ebx
+ roll $5,%eax
+ pxor %xmm3,%xmm7
+ movdqa 112(%esp),%xmm3
+ addl %ebp,%edi
+ andl %ebx,%esi
+ pxor %xmm2,%xmm7
+ pshufd $238,%xmm6,%xmm2
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm3,%xmm4
+ addl %esi,%edx
+ paddd %xmm7,%xmm3
+ andl %eax,%ebp
+ pxor %xmm2,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ roll $5,%edx
+ pslld $2,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ psrld $30,%xmm2
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ xorl %edi,%edx
+ roll $5,%ecx
+ por %xmm2,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ movdqa 96(%esp),%xmm2
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ pshufd $238,%xmm7,%xmm3
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 16(%esp),%edi
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm4,%xmm5
+ rorl $7,%ebx
+ paddd %xmm0,%xmm4
+ addl %eax,%edi
+ pxor %xmm3,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm3
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm3,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ movdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ pshufd $238,%xmm0,%xmm4
+ addl %ecx,%ebx
+ addl 32(%esp),%eax
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ movdqa 128(%esp),%xmm6
+ rorl $7,%ecx
+ paddd %xmm1,%xmm5
+ addl %ebx,%eax
+ pxor %xmm4,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ pslld $2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ psrld $30,%xmm4
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ por %xmm4,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ movdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ pshufd $238,%xmm1,%xmm5
+ addl %edx,%ecx
+ addl 48(%esp),%ebx
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%edx
+ paddd %xmm2,%xmm6
+ addl %ecx,%ebx
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pslld $2,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ psrld $30,%xmm5
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ por %xmm5,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ movdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ pshufd $238,%xmm2,%xmm6
+ addl %edi,%edx
+ addl (%esp),%ecx
+ pxor %xmm0,%xmm4
+ punpcklqdq %xmm3,%xmm6
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ movdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ movdqa %xmm7,%xmm0
+ rorl $7,%edi
+ paddd %xmm3,%xmm7
+ addl %edx,%ecx
+ pxor %xmm6,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm6
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pslld $2,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ psrld $30,%xmm6
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ por %xmm6,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ pshufd $238,%xmm3,%xmm7
+ addl %eax,%edi
+ addl 16(%esp),%edx
+ pxor %xmm1,%xmm5
+ punpcklqdq %xmm4,%xmm7
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ pxor %xmm6,%xmm5
+ movdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm1
+ rorl $7,%eax
+ paddd %xmm4,%xmm0
+ addl %edi,%edx
+ pxor %xmm7,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm7
+ movdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ pslld $2,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ psrld $30,%xmm7
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ por %xmm7,%xmm5
+ addl 28(%esp),%eax
+ movdqa 80(%esp),%xmm7
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pshufd $238,%xmm4,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 32(%esp),%edi
+ pxor %xmm2,%xmm6
+ punpcklqdq %xmm5,%xmm0
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ pxor %xmm7,%xmm6
+ movdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ roll $5,%eax
+ movdqa %xmm1,%xmm2
+ addl %esi,%edi
+ paddd %xmm5,%xmm1
+ xorl %ebx,%ebp
+ pxor %xmm0,%xmm6
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ andl %ebx,%ebp
+ movdqa %xmm6,%xmm0
+ movdqa %xmm1,16(%esp)
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ pslld $2,%xmm6
+ addl %ebp,%edx
+ xorl %eax,%esi
+ psrld $30,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ por %xmm0,%xmm6
+ movl %edx,%ebp
+ xorl %eax,%esi
+ movdqa 96(%esp),%xmm0
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ pshufd $238,%xmm5,%xmm1
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 48(%esp),%eax
+ pxor %xmm3,%xmm7
+ punpcklqdq %xmm6,%xmm1
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ pxor %xmm0,%xmm7
+ movdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ roll $5,%ebx
+ movdqa 144(%esp),%xmm3
+ addl %esi,%eax
+ paddd %xmm6,%xmm2
+ xorl %ecx,%ebp
+ pxor %xmm1,%xmm7
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ andl %ecx,%ebp
+ movdqa %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ pslld $2,%xmm7
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ psrld $30,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ por %xmm1,%xmm7
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ movdqa 64(%esp),%xmm1
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ pshufd $238,%xmm6,%xmm2
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl (%esp),%ebx
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ roll $5,%ecx
+ movdqa %xmm3,%xmm4
+ addl %esi,%ebx
+ paddd %xmm7,%xmm3
+ xorl %edx,%ebp
+ pxor %xmm2,%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ andl %edx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pslld $2,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ psrld $30,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ por %xmm2,%xmm0
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ movdqa 80(%esp),%xmm2
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ pshufd $238,%xmm7,%xmm3
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 16(%esp),%ecx
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ roll $5,%edx
+ movdqa %xmm4,%xmm5
+ addl %esi,%ecx
+ paddd %xmm0,%xmm4
+ xorl %edi,%ebp
+ pxor %xmm3,%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ andl %edi,%ebp
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ pslld $2,%xmm1
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ psrld $30,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ por %xmm3,%xmm1
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ movdqa 96(%esp),%xmm3
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pshufd $238,%xmm0,%xmm4
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 32(%esp),%edx
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ roll $5,%edi
+ movdqa %xmm5,%xmm6
+ addl %esi,%edx
+ paddd %xmm1,%xmm5
+ xorl %eax,%ebp
+ pxor %xmm4,%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ andl %eax,%ebp
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ pslld $2,%xmm2
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ psrld $30,%xmm4
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ por %xmm4,%xmm2
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ movdqa 64(%esp),%xmm4
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ pshufd $238,%xmm1,%xmm5
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ addl 48(%esp),%edi
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%ebx
+ paddd %xmm2,%xmm6
+ addl %eax,%edi
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm5
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm5,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ paddd %xmm3,%xmm7
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ movdqa %xmm7,48(%esp)
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L007done
+ movdqa 160(%esp),%xmm7
+ movdqa 176(%esp),%xmm6
+ movdqu (%ebp),%xmm0
+ movdqu 16(%ebp),%xmm1
+ movdqu 32(%ebp),%xmm2
+ movdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+.byte 102,15,56,0,198
+ movl %ebp,196(%esp)
+ movdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+.byte 102,15,56,0,206
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ paddd %xmm7,%xmm0
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ movdqa %xmm0,(%esp)
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ psubd %xmm7,%xmm0
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+.byte 102,15,56,0,214
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ paddd %xmm7,%xmm1
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ movdqa %xmm1,16(%esp)
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ psubd %xmm7,%xmm1
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+.byte 102,15,56,0,222
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ paddd %xmm7,%xmm2
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ movdqa %xmm2,32(%esp)
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %ecx,%ebx
+ movl %edx,12(%ebp)
+ xorl %edx,%ebx
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L006loop
+.align 16
+.L007done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
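
Under all the interleaving, the xmm half of _sha1_block_data_order_ssse3 evaluates the SHA-1 message schedule four W values per iteration: punpcklqdq/psrldq gather the W[t-3] and W[t-8] terms, the pxor chain folds in W[t-14] and W[t-16], the pslld/psrld/por triplets implement the rotate-left-by-one, and paddd pre-adds the round constant from the .LK_XX_XX table so each scalar round only needs a single addl from its stack slot. For reference, the scalar recurrence being vectorized (rol32/sha1_expand are ad-hoc helpers):

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, int n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* Standard SHA-1 message expansion; the xmm code above computes
     * W[t], W[t+1], W[t+2], W[t+3] together in one register. */
    static void sha1_expand(uint32_t W[80])
    {
        for (int t = 16; t < 80; t++)
            W[t] = rol32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
    }
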
+.type _sha1_block_data_order_avx,@function
+.align 16
+_sha1_block_data_order_avx:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L008pic_point
+.L008pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L008pic_point(%ebp),%ebp
+.Lavx_shortcut:
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp .L009loop
+.align 16
+.L009loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L010done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L009loop
+.align 16
+.L010done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
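
_sha1_block_data_order_avx has the same round and schedule structure as the SSSE3 body, with two surface differences: the three-operand VEX forms (vpxor, vpalignr, vpaddd) eliminate most of the movdqa register copies, and the scalar rotates are spelled shldl $5/shrdl $7 rather than roll/rorl. With the same register as both operands, shld is just a left rotate; a quick self-check of that equivalence (rol32/shld32 are ad-hoc helpers):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t rol32(uint32_t x, int n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* shld: shift dst left by n, filling from the top bits of src. */
    static uint32_t shld32(uint32_t dst, uint32_t src, int n)
    {
        return (dst << n) | (src >> (32 - n));
    }

    int main(void)
    {
        uint32_t x = 0xdeadbeefu;
        assert(shld32(x, x, 5) == rol32(x, 5));    /* shldl $5,%eax,%eax */
        assert(shld32(x, x, 25) == rol32(x, 25));  /* shrdl $7 == rol 25 */
        return 0;
    }
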
+.align 64
+.LK_XX_XX:
+.long 1518500249,1518500249,1518500249,1518500249
+.long 1859775393,1859775393,1859775393,1859775393
+.long 2400959708,2400959708,2400959708,2400959708
+.long 3395469782,3395469782,3395469782,3395469782
+.long 66051,67438087,134810123,202182159
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.comm OPENSSL_ia32cap_P,16,4
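
The .LK_XX_XX table just emitted holds the four SHA-1 round constants (0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6, each splatted across a row) followed by pshufb byte-swap masks, and everything from the #else below repeats the whole file for non-PIC builds, where OPENSSL_ia32cap_P is addressed absolutely (leal OPENSSL_ia32cap_P,%esi) instead of through the call/popl trick. As a sanity check that the 66051,67438087,... row really is a per-dword byte swap once the assembler stores it little-endian (a standalone throwaway program, assuming a little-endian host as on i386):

    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        /* The fifth .long row of .LK_XX_XX. */
        uint32_t row[4] = { 66051u, 67438087u, 134810123u, 202182159u };
        uint8_t m[16];
        memcpy(m, row, sizeof(m));
        /* Expect 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12: a pshufb
         * control that byte-swaps each 32-bit lane. */
        for (int i = 0; i < 16; i++)
            if (m[i] != (uint8_t)((i & ~3) + 3 - (i & 3)))
                return 1;
        return 0;
    }
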
#else
.text
.globl sha1_block_data_order
@@ -1391,6 +3981,28 @@ sha1_block_data_order:
pushl %ebx
pushl %esi
pushl %edi
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal OPENSSL_ia32cap_P,%esi
+ leal .LK_XX_XX-.L000pic_point(%ebp),%ebp
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ testl $512,%edx
+ jz .L001x86
+ movl 8(%esi),%ecx
+ testl $16777216,%eax
+ jz .L001x86
+ testl $536870912,%ecx
+ jnz .Lshaext_shortcut
+ andl $268435456,%edx
+ andl $1073741824,%eax
+ orl %edx,%eax
+ cmpl $1342177280,%eax
+ je .Lavx_shortcut
+ jmp .Lssse3_shortcut
+.align 16
+.L001x86:
movl 20(%esp),%ebp
movl 24(%esp),%esi
movl 28(%esp),%eax
@@ -1399,9 +4011,9 @@ sha1_block_data_order:
addl %esi,%eax
movl %eax,104(%esp)
movl 16(%ebp),%edi
- jmp .L000loop
+ jmp .L002loop
.align 16
-.L000loop:
+.L002loop:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -2748,7 +5360,7 @@ sha1_block_data_order:
movl %ebx,12(%ebp)
movl %edx,%esi
movl %ecx,16(%ebp)
- jb .L000loop
+ jb .L002loop
addl $76,%esp
popl %edi
popl %esi
@@ -2756,8 +5368,2576 @@ sha1_block_data_order:
popl %ebp
ret
.size sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.type _sha1_block_data_order_shaext,@function
+.align 16
+_sha1_block_data_order_shaext:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L003pic_point
+.L003pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lshaext_shortcut:
+ movl 20(%esp),%edi
+ movl %esp,%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ subl $32,%esp
+ movdqu (%edi),%xmm0
+ movd 16(%edi),%xmm1
+ andl $-32,%esp
+ movdqa 80(%ebp),%xmm3
+ movdqu (%esi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%esi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%esi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%esi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+.byte 102,15,56,0,251
+ jmp .L004loop_shaext
+.align 16
+.L004loop_shaext:
+ decl %ecx
+ leal 64(%esi),%eax
+ movdqa %xmm1,(%esp)
+ paddd %xmm4,%xmm1
+ cmovnel %eax,%esi
+ movdqa %xmm0,16(%esp)
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%esi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%esi),%xmm5
+.byte 102,15,56,0,227
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%esi),%xmm6
+.byte 102,15,56,0,235
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%esi),%xmm7
+.byte 102,15,56,0,243
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+ movdqa (%esp),%xmm2
+.byte 102,15,56,0,251
+.byte 15,56,200,202
+ paddd 16(%esp),%xmm0
+ jnz .L004loop_shaext
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%edi)
+ movd %xmm1,16(%edi)
+ movl %ebx,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext
+.type _sha1_block_data_order_ssse3,@function
+.align 16
+_sha1_block_data_order_ssse3:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L005pic_point
+.L005pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L005pic_point(%ebp),%ebp
+.Lssse3_shortcut:
+ movdqa (%ebp),%xmm7
+ movdqa 16(%ebp),%xmm0
+ movdqa 32(%ebp),%xmm1
+ movdqa 48(%ebp),%xmm2
+ movdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ movdqa %xmm0,112(%esp)
+ movdqa %xmm1,128(%esp)
+ movdqa %xmm2,144(%esp)
+ shll $6,%edx
+ movdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ movdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ movdqu -64(%ebp),%xmm0
+ movdqu -48(%ebp),%xmm1
+ movdqu -32(%ebp),%xmm2
+ movdqu -16(%ebp),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ movdqa %xmm7,96(%esp)
+.byte 102,15,56,0,222
+ paddd %xmm7,%xmm0
+ paddd %xmm7,%xmm1
+ paddd %xmm7,%xmm2
+ movdqa %xmm0,(%esp)
+ psubd %xmm7,%xmm0
+ movdqa %xmm1,16(%esp)
+ psubd %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ xorl %edx,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebp,%esi
+ jmp .L006loop
+.align 16
+.L006loop:
+ rorl $2,%ebx
+ xorl %edx,%esi
+ movl %eax,%ebp
+ punpcklqdq %xmm1,%xmm4
+ movdqa %xmm3,%xmm6
+ addl (%esp),%edi
+ xorl %ecx,%ebx
+ paddd %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+ roll $5,%eax
+ addl %esi,%edi
+ psrldq $4,%xmm6
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm2,%xmm6
+ xorl %ecx,%ebp
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ pxor %xmm6,%xmm4
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ movdqa %xmm4,%xmm6
+ xorl %ebx,%esi
+ pslldq $12,%xmm0
+ paddd %xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ psrld $31,%xmm6
+ xorl %eax,%edi
+ roll $5,%edx
+ movdqa %xmm0,%xmm7
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ psrld $30,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm6,%xmm4
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ pslld $2,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ pxor %xmm0,%xmm4
+ movdqa 96(%esp),%xmm0
+ addl %ebp,%ebx
+ andl %edx,%esi
+ pxor %xmm7,%xmm4
+ pshufd $238,%xmm1,%xmm5
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ movl %ebx,%ebp
+ punpcklqdq %xmm2,%xmm5
+ movdqa %xmm4,%xmm7
+ addl 16(%esp),%eax
+ xorl %edx,%ecx
+ paddd %xmm4,%xmm0
+ movdqa %xmm1,80(%esp)
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm7
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm7
+ xorl %edx,%ebp
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ pxor %xmm7,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm0,(%esp)
+ addl %ebp,%edi
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ movdqa %xmm5,%xmm7
+ xorl %ecx,%esi
+ pslldq $12,%xmm1
+ paddd %xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ psrld $31,%xmm7
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ psrld $30,%xmm1
+ addl %edi,%edx
+ rorl $7,%edi
+ por %xmm7,%xmm5
+ xorl %ebx,%ebp
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ pslld $2,%xmm0
+ xorl %eax,%edi
+ roll $5,%edx
+ pxor %xmm1,%xmm5
+ movdqa 112(%esp),%xmm1
+ addl %ebp,%ecx
+ andl %edi,%esi
+ pxor %xmm0,%xmm5
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ punpcklqdq %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ addl 32(%esp),%ebx
+ xorl %edi,%edx
+ paddd %xmm5,%xmm1
+ movdqa %xmm2,96(%esp)
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm0
+ andl %edx,%ebp
+ xorl %edi,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm0
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ pxor %xmm0,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm1,16(%esp)
+ addl %ebp,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm0
+ xorl %edx,%esi
+ pslldq $12,%xmm2
+ paddd %xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ psrld $31,%xmm0
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm2,%xmm1
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ psrld $30,%xmm2
+ addl %eax,%edi
+ rorl $7,%eax
+ por %xmm0,%xmm6
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm0
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ pslld $2,%xmm1
+ xorl %ebx,%eax
+ roll $5,%edi
+ pxor %xmm2,%xmm6
+ movdqa 112(%esp),%xmm2
+ addl %ebp,%edx
+ andl %eax,%esi
+ pxor %xmm1,%xmm6
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%esi
+ movl %edx,%ebp
+ punpcklqdq %xmm4,%xmm7
+ movdqa %xmm6,%xmm1
+ addl 48(%esp),%ecx
+ xorl %eax,%edi
+ paddd %xmm6,%xmm2
+ movdqa %xmm3,64(%esp)
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm1
+ andl %edi,%ebp
+ xorl %eax,%edi
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm1
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ pxor %xmm1,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ movdqa %xmm2,32(%esp)
+ addl %ebp,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm1
+ xorl %edi,%esi
+ pslldq $12,%xmm3
+ paddd %xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ psrld $31,%xmm1
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm3,%xmm2
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ psrld $30,%xmm3
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm1,%xmm7
+ xorl %edx,%ebp
+ movdqa 80(%esp),%xmm1
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ pslld $2,%xmm2
+ xorl %ecx,%ebx
+ roll $5,%eax
+ pxor %xmm3,%xmm7
+ movdqa 112(%esp),%xmm3
+ addl %ebp,%edi
+ andl %ebx,%esi
+ pxor %xmm2,%xmm7
+ pshufd $238,%xmm6,%xmm2
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm3,%xmm4
+ addl %esi,%edx
+ paddd %xmm7,%xmm3
+ andl %eax,%ebp
+ pxor %xmm2,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ roll $5,%edx
+ pslld $2,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ psrld $30,%xmm2
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ xorl %edi,%edx
+ roll $5,%ecx
+ por %xmm2,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ movdqa 96(%esp),%xmm2
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ pshufd $238,%xmm7,%xmm3
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 16(%esp),%edi
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm4,%xmm5
+ rorl $7,%ebx
+ paddd %xmm0,%xmm4
+ addl %eax,%edi
+ pxor %xmm3,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm3
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm3,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ movdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ pshufd $238,%xmm0,%xmm4
+ addl %ecx,%ebx
+ addl 32(%esp),%eax
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ movdqa 128(%esp),%xmm6
+ rorl $7,%ecx
+ paddd %xmm1,%xmm5
+ addl %ebx,%eax
+ pxor %xmm4,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ pslld $2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ psrld $30,%xmm4
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ por %xmm4,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ movdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ pshufd $238,%xmm1,%xmm5
+ addl %edx,%ecx
+ addl 48(%esp),%ebx
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%edx
+ paddd %xmm2,%xmm6
+ addl %ecx,%ebx
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pslld $2,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ psrld $30,%xmm5
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ por %xmm5,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ movdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ pshufd $238,%xmm2,%xmm6
+ addl %edi,%edx
+ addl (%esp),%ecx
+ pxor %xmm0,%xmm4
+ punpcklqdq %xmm3,%xmm6
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ movdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ movdqa %xmm7,%xmm0
+ rorl $7,%edi
+ paddd %xmm3,%xmm7
+ addl %edx,%ecx
+ pxor %xmm6,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm6
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pslld $2,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ psrld $30,%xmm6
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ por %xmm6,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ pshufd $238,%xmm3,%xmm7
+ addl %eax,%edi
+ addl 16(%esp),%edx
+ pxor %xmm1,%xmm5
+ punpcklqdq %xmm4,%xmm7
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ pxor %xmm6,%xmm5
+ movdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm1
+ rorl $7,%eax
+ paddd %xmm4,%xmm0
+ addl %edi,%edx
+ pxor %xmm7,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm7
+ movdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ pslld $2,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ psrld $30,%xmm7
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ por %xmm7,%xmm5
+ addl 28(%esp),%eax
+ movdqa 80(%esp),%xmm7
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pshufd $238,%xmm4,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 32(%esp),%edi
+ pxor %xmm2,%xmm6
+ punpcklqdq %xmm5,%xmm0
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ pxor %xmm7,%xmm6
+ movdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ roll $5,%eax
+ movdqa %xmm1,%xmm2
+ addl %esi,%edi
+ paddd %xmm5,%xmm1
+ xorl %ebx,%ebp
+ pxor %xmm0,%xmm6
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ andl %ebx,%ebp
+ movdqa %xmm6,%xmm0
+ movdqa %xmm1,16(%esp)
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ pslld $2,%xmm6
+ addl %ebp,%edx
+ xorl %eax,%esi
+ psrld $30,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ por %xmm0,%xmm6
+ movl %edx,%ebp
+ xorl %eax,%esi
+ movdqa 96(%esp),%xmm0
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ pshufd $238,%xmm5,%xmm1
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 48(%esp),%eax
+ pxor %xmm3,%xmm7
+ punpcklqdq %xmm6,%xmm1
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ pxor %xmm0,%xmm7
+ movdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ roll $5,%ebx
+ movdqa 144(%esp),%xmm3
+ addl %esi,%eax
+ paddd %xmm6,%xmm2
+ xorl %ecx,%ebp
+ pxor %xmm1,%xmm7
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ andl %ecx,%ebp
+ movdqa %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ pslld $2,%xmm7
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ psrld $30,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ por %xmm1,%xmm7
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ movdqa 64(%esp),%xmm1
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ pshufd $238,%xmm6,%xmm2
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl (%esp),%ebx
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ roll $5,%ecx
+ movdqa %xmm3,%xmm4
+ addl %esi,%ebx
+ paddd %xmm7,%xmm3
+ xorl %edx,%ebp
+ pxor %xmm2,%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ andl %edx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pslld $2,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ psrld $30,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ por %xmm2,%xmm0
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ movdqa 80(%esp),%xmm2
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ pshufd $238,%xmm7,%xmm3
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 16(%esp),%ecx
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ roll $5,%edx
+ movdqa %xmm4,%xmm5
+ addl %esi,%ecx
+ paddd %xmm0,%xmm4
+ xorl %edi,%ebp
+ pxor %xmm3,%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ andl %edi,%ebp
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ pslld $2,%xmm1
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ psrld $30,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ por %xmm3,%xmm1
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ movdqa 96(%esp),%xmm3
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pshufd $238,%xmm0,%xmm4
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 32(%esp),%edx
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ roll $5,%edi
+ movdqa %xmm5,%xmm6
+ addl %esi,%edx
+ paddd %xmm1,%xmm5
+ xorl %eax,%ebp
+ pxor %xmm4,%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ andl %eax,%ebp
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ pslld $2,%xmm2
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ psrld $30,%xmm4
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ por %xmm4,%xmm2
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ movdqa 64(%esp),%xmm4
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ pshufd $238,%xmm1,%xmm5
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ addl 48(%esp),%edi
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%ebx
+ paddd %xmm2,%xmm6
+ addl %eax,%edi
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm5
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm5,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ paddd %xmm3,%xmm7
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ movdqa %xmm7,48(%esp)
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L007done
+ movdqa 160(%esp),%xmm7
+ movdqa 176(%esp),%xmm6
+ movdqu (%ebp),%xmm0
+ movdqu 16(%ebp),%xmm1
+ movdqu 32(%ebp),%xmm2
+ movdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+.byte 102,15,56,0,198
+ movl %ebp,196(%esp)
+ movdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+.byte 102,15,56,0,206
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ paddd %xmm7,%xmm0
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ movdqa %xmm0,(%esp)
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ psubd %xmm7,%xmm0
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+.byte 102,15,56,0,214
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ paddd %xmm7,%xmm1
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ movdqa %xmm1,16(%esp)
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ psubd %xmm7,%xmm1
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+.byte 102,15,56,0,222
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ paddd %xmm7,%xmm2
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ movdqa %xmm2,32(%esp)
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %ecx,%ebx
+ movl %edx,12(%ebp)
+ xorl %edx,%ebx
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L006loop
+.align 16
+.L007done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
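The pxor/psrld/pslld/por clusters interleaved with the integer rounds above compute four schedule words per step, i.e. W[t..t+3] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) done lane-parallel. Lane 3 needs W[t], which is produced in the same step, so the XOR runs with that term zeroed (the psrldq $4) and the lane is patched afterwards by XORing in a rotate-by-two of the pre-rotation lane-0 value (the pslldq $12 plus pslld $2/psrld $30 pair; rotation distributes over XOR, and rol(W[t],1) = rol2 of the unrotated value). A C sketch under those assumptions, with an illustrative name; the pshufd $238/punpcklqdq pair above is the _mm_alignr_epi8 here, and w0 holds W[t-16..t-13] with the oldest word in the low lane:

#include <tmmintrin.h>

static __m128i sha1_update_w4(__m128i w0, __m128i w1, __m128i w2, __m128i w3)
{
    __m128i x, fix;
    x   = _mm_xor_si128(w0, _mm_alignr_epi8(w1, w0, 8)); /* W[t-16]^W[t-14] */
    x   = _mm_xor_si128(x, w2);                          /* ^ W[t-8]        */
    x   = _mm_xor_si128(x, _mm_srli_si128(w3, 4));       /* ^ W[t-3], lane3=0 */
    fix = _mm_slli_si128(x, 12);                         /* pre-rot lane0 -> lane3 */
    x   = _mm_or_si128(_mm_add_epi32(x, x),              /* rol by 1        */
                       _mm_srli_epi32(x, 31));
    fix = _mm_or_si128(_mm_slli_epi32(fix, 2),           /* rol by 2        */
                       _mm_srli_epi32(fix, 30));
    return _mm_xor_si128(x, fix);                        /* patch W[t+3]    */
}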
+.type _sha1_block_data_order_avx,@function
+.align 16
+_sha1_block_data_order_avx:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L008pic_point
+.L008pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L008pic_point(%ebp),%ebp
+.Lavx_shortcut:
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp .L009loop
+.align 16
+.L009loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L010done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L009loop
+.align 16
+.L010done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
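Between the vector work, all three SIMD variants drive the same scalar round skeleton; the AVX version merely spells the rotates as shldl/shrdl. Note how the rotate of b is split: a register gets roll $5 while it plays a (so rol(a,5) can be added into e) and rorl $7 a few instructions later once it has become b, for a net rol(b,30). One F1-type round in plain C, assuming W[t] + K has already been folded into the stacked value, which is what the paddd-then-store sequences above arrange:

#include <stdint.h>

static inline uint32_t rol32(uint32_t v, int n)
{
    return (v << n) | (v >> (32 - n));
}

/* One SHA-1 round with the Ch function (rounds 0-19); wk = W[t] + K.
 * d ^ (b & (c ^ d)) is the xorl/andl/xorl form used in the assembly. */
static void sha1_round_ch(uint32_t *a, uint32_t *b, uint32_t *c,
                          uint32_t *d, uint32_t *e, uint32_t wk)
{
    *e += rol32(*a, 5) + (*d ^ (*b & (*c ^ *d))) + wk;
    *b  = rol32(*b, 30);
}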
+.align 64
+.LK_XX_XX:
+.long 1518500249,1518500249,1518500249,1518500249
+.long 1859775393,1859775393,1859775393,1859775393
+.long 2400959708,2400959708,2400959708,2400959708
+.long 3395469782,3395469782,3395469782,3395469782
+.long 66051,67438087,134810123,202182159
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.comm OPENSSL_ia32cap_P,16,4
#endif
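For reference, the decimal .long values in the new .LK_XX_XX table are the standard SHA-1 round constants, one per 20-round group, repeated across all four lanes; the row after them (66051, 67438087, ... = 0x00010203, 0x04050607, ...) is the dword-wise byte-swap mask for pshufb/vpshufb, and the descending .byte 15,14,...,0 line is the full 16-byte reversal used by the shaext path. Decoded:

/* .LK_XX_XX round constants in hex */
static const unsigned int sha1_K[4] = {
    0x5a827999,   /* 1518500249, rounds  0-19 */
    0x6ed9eba1,   /* 1859775393, rounds 20-39 */
    0x8f1bbcdc,   /* 2400959708, rounds 40-59 */
    0xca62c1d6,   /* 3395469782, rounds 60-79 */
};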
diff --git a/secure/lib/libcrypto/i386/sha256-586.S b/secure/lib/libcrypto/i386/sha256-586.S
index 2bc29ad737cd..7b4205352bdf 100644
--- a/secure/lib/libcrypto/i386/sha256-586.S
+++ b/secure/lib/libcrypto/i386/sha256-586.S
@@ -27,6 +27,28 @@ sha256_block_data_order:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ leal OPENSSL_ia32cap_P-.L001K256(%ebp),%edx
+ movl (%edx),%ecx
+ movl 4(%edx),%ebx
+ testl $1048576,%ecx
+ jnz .L002loop
+ movl 8(%edx),%edx
+ testl $16777216,%ecx
+ jz .L003no_xmm
+ andl $1073741824,%ecx
+ andl $268435968,%ebx
+ testl $536870912,%edx
+ jnz .L004shaext
+ orl %ebx,%ecx
+ andl $1342177280,%ecx
+ cmpl $1342177280,%ecx
+ je .L005AVX
+ testl $512,%ebx
+ jnz .L006SSSE3
+.L003no_xmm:
+ subl %edi,%eax
+ cmpl $256,%eax
+ jae .L007unrolled
jmp .L002loop
.align 16
.L002loop:
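The added prologue chooses an implementation from the capability words cached in OPENSSL_ia32cap_P (word 0 mirrors CPUID(1).EDX plus OpenSSL's own flag bits, word 1 is CPUID(1).ECX, word 2 is CPUID(7).EBX; see OPENSSL_ia32cap(3)). Roughly, in C, with illustrative path names; bit 20 is one of OpenSSL's internal override bits and bit 30 its synthetic Intel-CPU flag, so treat those labels as an assumption:

extern unsigned int OPENSSL_ia32cap_P[4];

enum sha256_path { PATH_SCALAR, PATH_SHAEXT, PATH_AVX, PATH_SSSE3 };

static enum sha256_path pick_sha256_path(void)
{
    unsigned w0 = OPENSSL_ia32cap_P[0];     /* CPUID(1).EDX + flags */
    unsigned w1 = OPENSSL_ia32cap_P[1];     /* CPUID(1).ECX         */
    unsigned w2 = OPENSSL_ia32cap_P[2];     /* CPUID(7).EBX         */

    if (w0 & (1u << 20))    return PATH_SCALAR;  /* internal override       */
    if (!(w0 & (1u << 24))) return PATH_SCALAR;  /* no FXSR, no XMM state   */
    if (w2 & (1u << 29))    return PATH_SHAEXT;  /* SHA extensions          */
    if ((w0 & (1u << 30)) && (w1 & (1u << 28)))
                            return PATH_AVX;     /* Intel flag + AVX        */
    if (w1 & (1u << 9))     return PATH_SSSE3;   /* SSSE3                   */
    return PATH_SCALAR;   /* scalar, taking .L007unrolled for >= 256 bytes */
}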
@@ -98,7 +120,7 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00300_15:
+.L00800_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -136,11 +158,11 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00300_15
+ jne .L00800_15
movl 156(%esp),%ecx
- jmp .L00416_63
+ jmp .L00916_63
.align 16
-.L00416_63:
+.L00916_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -195,7 +217,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00416_63
+ jne .L00916_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -229,207 +251,6 @@ sha256_block_data_order:
popl %ebx
popl %ebp
ret
-.align 32
-.L005loop_shrd:
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- bswap %eax
- movl 12(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 16(%edi),%eax
- movl 20(%edi),%ebx
- movl 24(%edi),%ecx
- bswap %eax
- movl 28(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 32(%edi),%eax
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- bswap %eax
- movl 44(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 48(%edi),%eax
- movl 52(%edi),%ebx
- movl 56(%edi),%ecx
- bswap %eax
- movl 60(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- addl $64,%edi
- leal -36(%esp),%esp
- movl %edi,104(%esp)
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,8(%esp)
- xorl %ecx,%ebx
- movl %ecx,12(%esp)
- movl %edi,16(%esp)
- movl %ebx,(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edi
- movl %ebx,24(%esp)
- movl %ecx,28(%esp)
- movl %edi,32(%esp)
-.align 16
-.L00600_15_shrd:
- movl %edx,%ecx
- movl 24(%esp),%esi
- shrdl $14,%ecx,%ecx
- movl 28(%esp),%edi
- xorl %edx,%ecx
- xorl %edi,%esi
- movl 96(%esp),%ebx
- shrdl $5,%ecx,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
- xorl %ecx,%edx
- addl 32(%esp),%ebx
- xorl %edi,%esi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %esi,%ebx
- shrdl $9,%ecx,%ecx
- addl %edx,%ebx
- movl 8(%esp),%edi
- xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
- shrdl $11,%ecx,%ecx
- movl (%ebp),%esi
- xorl %eax,%ecx
- movl 20(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %esi,%ebx
- movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
- addl %ecx,%ebx
- xorl %edi,%eax
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3248222580,%esi
- jne .L00600_15_shrd
- movl 156(%esp),%ecx
- jmp .L00716_63_shrd
-.align 16
-.L00716_63_shrd:
- movl %ecx,%ebx
- movl 104(%esp),%esi
- shrdl $11,%ecx,%ecx
- movl %esi,%edi
- shrdl $2,%esi,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- shrdl $17,%esi,%esi
- addl 160(%esp),%ebx
- shrl $10,%edi
- addl 124(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 24(%esp),%esi
- shrdl $14,%ecx,%ecx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %edx,%ecx
- xorl %edi,%esi
- movl %ebx,96(%esp)
- shrdl $5,%ecx,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
- xorl %ecx,%edx
- addl 32(%esp),%ebx
- xorl %edi,%esi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %esi,%ebx
- shrdl $9,%ecx,%ecx
- addl %edx,%ebx
- movl 8(%esp),%edi
- xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
- shrdl $11,%ecx,%ecx
- movl (%ebp),%esi
- xorl %eax,%ecx
- movl 20(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %esi,%ebx
- movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
- addl %ecx,%ebx
- xorl %edi,%eax
- movl 156(%esp),%ecx
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3329325298,%esi
- jne .L00716_63_shrd
- movl 356(%esp),%esi
- movl 8(%esp),%ebx
- movl 16(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl 24(%esp),%eax
- movl 28(%esp),%ebx
- movl 32(%esp),%ecx
- movl 360(%esp),%edi
- addl 16(%esi),%edx
- addl 20(%esi),%eax
- addl 24(%esi),%ebx
- addl 28(%esi),%ecx
- movl %edx,16(%esi)
- movl %eax,20(%esi)
- movl %ebx,24(%esi)
- movl %ecx,28(%esi)
- leal 356(%esp),%esp
- subl $256,%ebp
- cmpl 8(%esp),%edi
- jb .L005loop_shrd
- movl 12(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.align 64
.L001K256:
.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
@@ -440,7 +261,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L008unrolled:
+.L007unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -457,9 +278,9 @@ sha256_block_data_order:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp .L009grand_loop
+ jmp .L010grand_loop
.align 16
-.L009grand_loop:
+.L010grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -3339,213 +3160,2568 @@ sha256_block_data_order:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb .L009grand_loop
+ jb .L010grand_loop
movl 108(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
-.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
-#else
-.text
-.globl sha256_block_data_order
-.type sha256_block_data_order,@function
+.align 32
+.L004shaext:
+ subl $32,%esp
+ movdqu (%esi),%xmm1
+ leal 128(%ebp),%ebp
+ movdqu 16(%esi),%xmm2
+ movdqa 128(%ebp),%xmm7
+ pshufd $27,%xmm1,%xmm0
+ pshufd $177,%xmm1,%xmm1
+ pshufd $27,%xmm2,%xmm2
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp .L011loop_shaext
.align 16
-sha256_block_data_order:
-.L_sha256_block_data_order_begin:
- pushl %ebp
- pushl %ebx
- pushl %esi
- pushl %edi
- movl 20(%esp),%esi
- movl 24(%esp),%edi
- movl 28(%esp),%eax
- movl %esp,%ebx
- call .L000pic_point
-.L000pic_point:
+.L011loop_shaext:
+ movdqu (%edi),%xmm3
+ movdqu 16(%edi),%xmm4
+ movdqu 32(%edi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%edi),%xmm6
+ movdqa %xmm2,16(%esp)
+ movdqa -128(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,(%esp)
+.byte 15,56,203,202
+ movdqa -112(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ leal 64(%edi),%edi
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa -96(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa -80(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa -64(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa -48(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa -32(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa -16(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa (%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 16(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 32(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 48(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 64(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 80(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+ movdqa 96(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa 128(%ebp),%xmm7
+.byte 15,56,203,202
+ movdqa 112(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ cmpl %edi,%eax
+ nop
+.byte 15,56,203,202
+ paddd 16(%esp),%xmm2
+ paddd (%esp),%xmm1
+ jnz .L011loop_shaext
+ pshufd $177,%xmm2,%xmm2
+ pshufd $27,%xmm1,%xmm7
+ pshufd $177,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+ movl 44(%esp),%esp
+ movdqu %xmm1,(%esi)
+ movdqu %xmm2,16(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
popl %ebp
- leal .L001K256-.L000pic_point(%ebp),%ebp
- subl $16,%esp
- andl $-64,%esp
- shll $6,%eax
- addl %edi,%eax
- movl %esi,(%esp)
- movl %edi,4(%esp)
- movl %eax,8(%esp)
- movl %ebx,12(%esp)
- jmp .L002loop
-.align 16
-.L002loop:
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- bswap %eax
- movl 12(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 16(%edi),%eax
- movl 20(%edi),%ebx
- movl 24(%edi),%ecx
- bswap %eax
- movl 28(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 32(%edi),%eax
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- bswap %eax
- movl 44(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 48(%edi),%eax
- movl 52(%edi),%ebx
- movl 56(%edi),%ecx
- bswap %eax
- movl 60(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- addl $64,%edi
- leal -36(%esp),%esp
- movl %edi,104(%esp)
+ ret
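Inside .L011loop_shaext above, every .byte 15,56,203,... is one sha256rnds2 (two rounds), 15,56,204/205 are sha256msg1/sha256msg2, and the pshufd $14 between each rnds2 pair moves the upper two W+K words into position. The recurring four-round unit as a C sketch (names illustrative, not the exact register flow; build with -msha):

#include <immintrin.h>

/* Four SHA-256 rounds with the SHA extensions: state0/state1 hold the
 * eight state words in the lane order sha256rnds2 expects, msg holds
 * W[t..t+3], and k points at K[t..t+3]. */
static void sha256_quad(__m128i *state0, __m128i *state1,
                        __m128i msg, const __m128i *k)
{
    __m128i wk = _mm_add_epi32(msg, _mm_loadu_si128(k));   /* W + K        */
    *state1 = _mm_sha256rnds2_epu32(*state1, *state0, wk); /* rounds t,t+1 */
    wk = _mm_shuffle_epi32(wk, 0x0e);          /* = pshufd $14: upper pair */
    *state0 = _mm_sha256rnds2_epu32(*state0, *state1, wk); /* t+2, t+3     */
}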
+.align 32
+.L006SSSE3:
+ leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edi
- movl %ebx,8(%esp)
+ movl %ebx,4(%esp)
xorl %ecx,%ebx
- movl %ecx,12(%esp)
- movl %edi,16(%esp)
- movl %ebx,(%esp)
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
movl 16(%esi),%edx
- movl 20(%esi),%ebx
+ movl 20(%esi),%edi
movl 24(%esi),%ecx
- movl 28(%esi),%edi
- movl %ebx,24(%esp)
- movl %ecx,28(%esp)
- movl %edi,32(%esp)
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ movdqa 256(%ebp),%xmm7
+ jmp .L012grand_ssse3
.align 16
-.L00300_15:
+.L012grand_ssse3:
+ movdqu (%edi),%xmm0
+ movdqu 16(%edi),%xmm1
+ movdqu 32(%edi),%xmm2
+ movdqu 48(%edi),%xmm3
+ addl $64,%edi
+.byte 102,15,56,0,199
+ movl %edi,100(%esp)
+.byte 102,15,56,0,207
+ movdqa (%ebp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 16(%ebp),%xmm5
+ paddd %xmm0,%xmm4
+.byte 102,15,56,0,223
+ movdqa 32(%ebp),%xmm6
+ paddd %xmm1,%xmm5
+ movdqa 48(%ebp),%xmm7
+ movdqa %xmm4,32(%esp)
+ paddd %xmm2,%xmm6
+ movdqa %xmm5,48(%esp)
+ paddd %xmm3,%xmm7
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm7,80(%esp)
+ jmp .L013ssse3_00_47
+.align 16
+.L013ssse3_00_47:
+ addl $64,%ebp
+ movl %edx,%ecx
+ movdqa %xmm1,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,224,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,250,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm3,%xmm7
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm0
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm0
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm0,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa (%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm0,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,32(%esp)
+ movl %edx,%ecx
+ movdqa %xmm2,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,225,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,251,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm0,%xmm7
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm1
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm1
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm1,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
movl 24(%esp),%esi
- rorl $14,%ecx
+ xorl %ecx,%edx
movl 28(%esp),%edi
- xorl %edx,%ecx
+ pshufd $8,%xmm7,%xmm7
xorl %edi,%esi
- movl 96(%esp),%ebx
- rorl $5,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
+ rorl $5,%edx
+ movdqa 16(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
xorl %ecx,%edx
- addl 32(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm1,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,48(%esp)
+ movl %edx,%ecx
+ movdqa %xmm3,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,226,4
xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,248,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
rorl $6,%edx
movl %eax,%ecx
- addl %esi,%ebx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm1,%xmm7
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm2
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm2
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm2,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 32(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm2,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,64(%esp)
+ movl %edx,%ecx
+ movdqa %xmm0,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm2,%xmm7
+ xorl %ecx,%edx
movl 8(%esp),%edi
+.byte 102,15,58,15,227,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,249,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %eax,16(%esp)
xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
rorl $11,%ecx
- movl (%ebp),%esi
+ andl %eax,%ebx
+ pshufd $250,%xmm2,%xmm7
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm3
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm3
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm3,%xmm7
xorl %eax,%ecx
- movl 20(%esp),%edx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 48(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm3,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
xorl %edi,%eax
rorl $2,%ecx
- addl %esi,%ebx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L013ssse3_00_47
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
xorl %edi,%eax
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3248222580,%esi
- jne .L00300_15
- movl 156(%esp),%ecx
- jmp .L00416_63
-.align 16
-.L00416_63:
- movl %ecx,%ebx
- movl 104(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
rorl $11,%ecx
- movl %esi,%edi
- rorl $2,%esi
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
xorl %ebx,%ecx
- shrl $3,%ebx
- rorl $7,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
xorl %edi,%esi
- xorl %ecx,%ebx
- rorl $17,%esi
- addl 160(%esp),%ebx
- shrl $10,%edi
- addl 124(%esp),%ebx
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
movl 24(%esp),%esi
- rorl $14,%ecx
- addl %edi,%ebx
+ xorl %ecx,%edx
movl 28(%esp),%edi
- xorl %edx,%ecx
xorl %edi,%esi
- movl %ebx,96(%esp)
- rorl $5,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
xorl %ecx,%edx
- addl 32(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
rorl $6,%edx
movl %eax,%ecx
- addl %esi,%ebx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
+ xorl %edi,%eax
+ addl 12(%esp),%edx
rorl $11,%ecx
- movl (%ebp),%esi
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
xorl %eax,%ecx
- movl 20(%esp),%edx
xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
rorl $2,%ecx
- addl %esi,%ebx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ movdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L012grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L005AVX:
+ andl $264,%edx
+ cmpl $264,%edx
+ je .L014AVX_BMI
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L015grand_avx
+.align 32
+.L015grand_avx:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L016avx_00_47
+.align 16
+.L016avx_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 36(%esp),%edx
xorl %edi,%eax
- movl 156(%esp),%ecx
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3329325298,%esi
- jne .L00416_63
- movl 356(%esp),%esi
- movl 8(%esp),%ebx
- movl 16(%esp),%ecx
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L016avx_00_47
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
addl (%esi),%eax
addl 4(%esi),%ebx
addl 8(%esi),%edi
@@ -3554,30 +5730,1111 @@ sha256_block_data_order:
movl %ebx,4(%esi)
movl %edi,8(%esi)
movl %ecx,12(%esi)
- movl 24(%esp),%eax
- movl 28(%esp),%ebx
- movl 32(%esp),%ecx
- movl 360(%esp),%edi
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
addl 16(%esi),%edx
- addl 20(%esi),%eax
- addl 24(%esi),%ebx
- addl 28(%esi),%ecx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
movl %edx,16(%esi)
- movl %eax,20(%esi)
- movl %ebx,24(%esi)
- movl %ecx,28(%esi)
- leal 356(%esp),%esp
- subl $256,%ebp
- cmpl 8(%esp),%edi
- jb .L002loop
- movl 12(%esp),%esp
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L015grand_avx
+ movl 108(%esp),%esp
+ vzeroall
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.align 32
-.L005loop_shrd:
+.L014AVX_BMI:
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L017grand_avx_bmi
+.align 32
+.L017grand_avx_bmi:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L018avx_bmi_00_47
+.align 16
+.L018avx_bmi_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ vpaddd %xmm4,%xmm0,%xmm0
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm0,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm1,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ vpaddd %xmm4,%xmm2,%xmm2
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm2,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm3,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L018avx_bmi_00_47
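+/* 66051 is 0x00010203, the first word of the byte-order mask that
+   follows the K256 table, so the compare above detects that all 64
+   round constants have been consumed.  The 16 rounds below only drain
+   the W[t]+K[t] values already staged at 32..92(%esp); no further
+   message scheduling is needed. */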
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
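+/* Block done: recover the context pointer from 96(%esp), add the
+   working variables into H0..H7, write the digest back, and refresh
+   the stack copies for the next 64-byte block. */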
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L017grand_avx_bmi
+ movl 108(%esp),%esp
+ vzeroall
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
+.comm OPENSSL_ia32cap_P,16,4
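+/* The #else branch below is the non-PIC build of the same routine;
+   note that OPENSSL_ia32cap_P is addressed directly rather than
+   through the PC-relative setup used in the PIC variant. */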
+#else
+.text
+.globl sha256_block_data_order
+.type sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+.L_sha256_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .L001K256-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
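+/* Dispatch on OPENSSL_ia32cap_P: in word 0, bit 20 appears to flag
+   NetBurst/P4 (take the plain integer loop), bit 24 is FXSR (OS XMM
+   support) and bit 30 the "genuine Intel" marker; word 1 carries SSSE3
+   (bit 9) and AVX (bit 28); word 2 bit 29 advertises the SHA
+   extensions. */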
+ leal OPENSSL_ia32cap_P,%edx
+ movl (%edx),%ecx
+ movl 4(%edx),%ebx
+ testl $1048576,%ecx
+ jnz .L002loop
+ movl 8(%edx),%edx
+ testl $16777216,%ecx
+ jz .L003no_xmm
+ andl $1073741824,%ecx
+ andl $268435968,%ebx
+ testl $536870912,%edx
+ jnz .L004shaext
+ orl %ebx,%ecx
+ andl $1342177280,%ecx
+ cmpl $1342177280,%ecx
+ je .L005AVX
+ testl $512,%ebx
+ jnz .L006SSSE3
+.L003no_xmm:
+ subl %edi,%eax
+ cmpl $256,%eax
+ jae .L007unrolled
+ jmp .L002loop
+.align 16
+.L002loop:
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
@@ -3646,35 +6903,35 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00600_15_shrd:
+.L00800_15:
movl %edx,%ecx
movl 24(%esp),%esi
- shrdl $14,%ecx,%ecx
+ rorl $14,%ecx
movl 28(%esp),%edi
xorl %edx,%ecx
xorl %edi,%esi
movl 96(%esp),%ebx
- shrdl $5,%ecx,%ecx
+ rorl $5,%ecx
andl %edx,%esi
movl %edx,20(%esp)
xorl %ecx,%edx
addl 32(%esp),%ebx
xorl %edi,%esi
- shrdl $6,%edx,%edx
+ rorl $6,%edx
movl %eax,%ecx
addl %esi,%ebx
- shrdl $9,%ecx,%ecx
+ rorl $9,%ecx
addl %edx,%ebx
movl 8(%esp),%edi
xorl %eax,%ecx
movl %eax,4(%esp)
leal -4(%esp),%esp
- shrdl $11,%ecx,%ecx
+ rorl $11,%ecx
movl (%ebp),%esi
xorl %eax,%ecx
movl 20(%esp),%edx
xorl %edi,%eax
- shrdl $2,%ecx,%ecx
+ rorl $2,%ecx
addl %esi,%ebx
movl %eax,(%esp)
addl %ebx,%edx
@@ -3684,55 +6941,55 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00600_15_shrd
+ jne .L00800_15
movl 156(%esp),%ecx
- jmp .L00716_63_shrd
+ jmp .L00916_63
.align 16
-.L00716_63_shrd:
+.L00916_63:
movl %ecx,%ebx
movl 104(%esp),%esi
- shrdl $11,%ecx,%ecx
+ rorl $11,%ecx
movl %esi,%edi
- shrdl $2,%esi,%esi
+ rorl $2,%esi
xorl %ebx,%ecx
shrl $3,%ebx
- shrdl $7,%ecx,%ecx
+ rorl $7,%ecx
xorl %edi,%esi
xorl %ecx,%ebx
- shrdl $17,%esi,%esi
+ rorl $17,%esi
addl 160(%esp),%ebx
shrl $10,%edi
addl 124(%esp),%ebx
movl %edx,%ecx
xorl %esi,%edi
movl 24(%esp),%esi
- shrdl $14,%ecx,%ecx
+ rorl $14,%ecx
addl %edi,%ebx
movl 28(%esp),%edi
xorl %edx,%ecx
xorl %edi,%esi
movl %ebx,96(%esp)
- shrdl $5,%ecx,%ecx
+ rorl $5,%ecx
andl %edx,%esi
movl %edx,20(%esp)
xorl %ecx,%edx
addl 32(%esp),%ebx
xorl %edi,%esi
- shrdl $6,%edx,%edx
+ rorl $6,%edx
movl %eax,%ecx
addl %esi,%ebx
- shrdl $9,%ecx,%ecx
+ rorl $9,%ecx
addl %edx,%ebx
movl 8(%esp),%edi
xorl %eax,%ecx
movl %eax,4(%esp)
leal -4(%esp),%esp
- shrdl $11,%ecx,%ecx
+ rorl $11,%ecx
movl (%ebp),%esi
xorl %eax,%ecx
movl 20(%esp),%edx
xorl %edi,%eax
- shrdl $2,%ecx,%ecx
+ rorl $2,%ecx
addl %esi,%ebx
movl %eax,(%esp)
addl %ebx,%edx
@@ -3743,7 +7000,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00716_63_shrd
+ jne .L00916_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -3770,7 +7027,7 @@ sha256_block_data_order:
leal 356(%esp),%esp
subl $256,%ebp
cmpl 8(%esp),%edi
- jb .L005loop_shrd
+ jb .L002loop
movl 12(%esp),%esp
popl %edi
popl %esi
@@ -3787,7 +7044,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L008unrolled:
+.L007unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -3804,9 +7061,9 @@ sha256_block_data_order:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp .L009grand_loop
+ jmp .L010grand_loop
.align 16
-.L009grand_loop:
+.L010grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -6686,12 +9943,3627 @@ sha256_block_data_order:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb .L009grand_loop
+ jb .L010grand_loop
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
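+/* .L004shaext: SHA-NI path.  The .byte sequences encode sha256rnds2
+   (0f 38 cb), sha256msg1 (0f 38 cc), sha256msg2 (0f 38 cd), plus
+   pshufb and palignr, for assemblers that predate these mnemonics. */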
+.align 32
+.L004shaext:
+ subl $32,%esp
+ movdqu (%esi),%xmm1
+ leal 128(%ebp),%ebp
+ movdqu 16(%esi),%xmm2
+ movdqa 128(%ebp),%xmm7
+ pshufd $27,%xmm1,%xmm0
+ pshufd $177,%xmm1,%xmm1
+ pshufd $27,%xmm2,%xmm2
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
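+/* The pshufd/palignr/punpcklqdq sequence above rearranges the eight
+   state words into the ABEF/CDGH register pairing that sha256rnds2
+   expects. */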
+ jmp .L011loop_shaext
+.align 16
+.L011loop_shaext:
+ movdqu (%edi),%xmm3
+ movdqu 16(%edi),%xmm4
+ movdqu 32(%edi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%edi),%xmm6
+ movdqa %xmm2,16(%esp)
+ movdqa -128(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,(%esp)
+.byte 15,56,203,202
+ movdqa -112(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ leal 64(%edi),%edi
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa -96(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa -80(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa -64(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa -48(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa -32(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa -16(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa (%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 16(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 32(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 48(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 64(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 80(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+ movdqa 96(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa 128(%ebp),%xmm7
+.byte 15,56,203,202
+ movdqa 112(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ cmpl %edi,%eax
+ nop
+.byte 15,56,203,202
+ paddd 16(%esp),%xmm2
+ paddd (%esp),%xmm1
+ jnz .L011loop_shaext
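+/* All blocks processed: undo the ABEF/CDGH pairing and store the
+   digest back to the context in natural word order. */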
+ pshufd $177,%xmm2,%xmm2
+ pshufd $27,%xmm1,%xmm7
+ pshufd $177,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+ movl 44(%esp),%esp
+ movdqu %xmm1,(%esi)
+ movdqu %xmm2,16(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
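+/* .L006SSSE3: scalar SHA-256 rounds interleaved with SSSE3 message
+   scheduling; each 16-byte vector expands four W[t] values, and the
+   precomputed W[t]+K[t] words are parked at 32..92(%esp) for the
+   integer rounds to consume. */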
+.align 32
+.L006SSSE3:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ movdqa 256(%ebp),%xmm7
+ jmp .L012grand_ssse3
+.align 16
+.L012grand_ssse3:
+ movdqu (%edi),%xmm0
+ movdqu 16(%edi),%xmm1
+ movdqu 32(%edi),%xmm2
+ movdqu 48(%edi),%xmm3
+ addl $64,%edi
+.byte 102,15,56,0,199
+ movl %edi,100(%esp)
+.byte 102,15,56,0,207
+ movdqa (%ebp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 16(%ebp),%xmm5
+ paddd %xmm0,%xmm4
+.byte 102,15,56,0,223
+ movdqa 32(%ebp),%xmm6
+ paddd %xmm1,%xmm5
+ movdqa 48(%ebp),%xmm7
+ movdqa %xmm4,32(%esp)
+ paddd %xmm2,%xmm6
+ movdqa %xmm5,48(%esp)
+ paddd %xmm3,%xmm7
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm7,80(%esp)
+ jmp .L013ssse3_00_47
+.align 16
+.L013ssse3_00_47:
+ addl $64,%ebp
+ movl %edx,%ecx
+ movdqa %xmm1,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,224,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,250,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm3,%xmm7
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm0
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm0
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm0,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa (%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm0,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,32(%esp)
+ movl %edx,%ecx
+ movdqa %xmm2,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,225,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,251,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm0,%xmm7
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm1
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm1
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm1,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 16(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm1,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,48(%esp)
+ movl %edx,%ecx
+ movdqa %xmm3,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,226,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,248,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm1,%xmm7
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm2
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm2
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm2,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 32(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm2,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,64(%esp)
+ movl %edx,%ecx
+ movdqa %xmm0,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,227,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,249,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm2,%xmm7
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm3
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm3
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm3,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 48(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm3,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L013ssse3_00_47
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ movdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L012grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
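+/* .L005AVX: AVX path.  The immediate 264 (0x108) tests BMI1 (bit 3)
+   and BMI2 (bit 8) of word 2 (CPUID.7:EBX); with both present, the
+   rorx/andn flavour at .L014AVX_BMI is used instead. */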
+.align 32
+.L005AVX:
+ andl $264,%edx
+ cmpl $264,%edx
+ je .L014AVX_BMI
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L015grand_avx
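+/* .L015grand_avx: per-block loop.  Loads 64 bytes, byte-swaps them
+   with the vpshufb mask held in %xmm7 (loaded from 256(%ebp)),
+   pre-adds the first four K256 vectors and stages the sums at
+   32..80(%esp). */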
+.align 32
+.L015grand_avx:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L016avx_00_47
+.align 16
+.L016avx_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L016avx_00_47
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L015grand_avx
+ movl 108(%esp),%esp
+ vzeroall
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
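[editor's note] That `ret` ends the plain-AVX sha256 path. Its scalar rounds never materialize the big sigma functions directly: the `shrdl` chains with counts 14/5/6 (on e) and 9/11/2 (on a) exploit the fact that rotation distributes over XOR, so three nested rotate-then-XOR steps yield the three-way XOR of rotations while keeping only one saved copy of the input. A hedged C sketch of that factoring; the asserts check it against the textbook form:

#include <assert.h>
#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* Sigma1(e) = ror6 ^ ror11 ^ ror25, built the way the shrdl $14/$5/$6
 * chain above builds it (shrdl with equal operands is a rotate right). */
static uint32_t Sigma1(uint32_t e)
{
    uint32_t t = rotr32(rotr32(rotr32(e, 14) ^ e, 5) ^ e, 6);
    assert(t == (rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25)));
    return t;
}

/* Sigma0(a) = ror2 ^ ror13 ^ ror22, matching the shrdl $9/$11/$2 chain. */
static uint32_t Sigma0(uint32_t a)
{
    uint32_t t = rotr32(rotr32(rotr32(a, 9) ^ a, 11) ^ a, 2);
    assert(t == (rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22)));
    return t;
}

The `.L014AVX_BMI` variant that follows drops these chains in favor of BMI2 `rorxl`, which rotates without touching flags or the source register, and uses `andnl` to fold the ~e & g half of the choice function into one instruction.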
+.align 32
+.L014AVX_BMI:
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L017grand_avx_bmi
+.align 32
+.L017grand_avx_bmi:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L018avx_bmi_00_47
+.align 16
+.L018avx_bmi_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ vpaddd %xmm4,%xmm0,%xmm0
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm0,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm1,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ vpaddd %xmm4,%xmm2,%xmm2
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm2,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm3,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L018avx_bmi_00_47
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L017grand_avx_bmi
movl 108(%esp),%esp
+ vzeroall
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
+.comm OPENSSL_ia32cap_P,16,4
#endif
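[editor's note] That `#endif` closes the sha256-586.S hunk; the `.comm` line above it is what lets every new path probe CPU capabilities through OPENSSL_ia32cap_P at run time. With the schedule and sigma factorings covered above, the remaining scalar work per round is just Ch, Maj, and the additions. A self-contained sketch of one round as these paths compute it, illustrative only and not OpenSSL's C source:

#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round; state order a..h, Kt the round constant, Wt the
 * scheduled message word. */
static void sha256_round(uint32_t s[8], uint32_t Kt, uint32_t Wt)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    uint32_t ch  = (e & f) ^ (~e & g);        /* andl + andnl + orl above */
    uint32_t t1  = h + S1 + ch + Kt + Wt;
    uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t maj = ((a ^ b) & (b ^ c)) ^ b;   /* equivalent majority form */
    uint32_t t2  = S0 + maj;

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

The ((a ^ b) & (b ^ c)) ^ b form equals the textbook (a & b) ^ (a & c) ^ (b & c); it is what lets the assembly carry a single XOR of neighbouring state words from round to round instead of recomputing Maj from scratch. The sha512-586.S hunk below applies the same dispatch pattern to the 64-bit hash.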
diff --git a/secure/lib/libcrypto/i386/sha512-586.S b/secure/lib/libcrypto/i386/sha512-586.S
index fb003c3438ee..77a0cd78b150 100644
--- a/secure/lib/libcrypto/i386/sha512-586.S
+++ b/secure/lib/libcrypto/i386/sha512-586.S
@@ -27,6 +27,2269 @@ sha512_block_data_order:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
+ movl (%edx),%ecx
+ testl $67108864,%ecx
+ jz .L002loop_x86
+ movl 4(%edx),%edx
+ movq (%esi),%mm0
+ andl $16777216,%ecx
+ movq 8(%esi),%mm1
+ andl $512,%edx
+ movq 16(%esi),%mm2
+ orl %edx,%ecx
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ cmpl $16777728,%ecx
+ je .L003SSSE3
+ subl $80,%esp
+ jmp .L004loop_sse2
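[editor's note] The prologue just added to sha512_block_data_order is the run-time dispatcher: `testl $67108864` probes bit 26 of the first OPENSSL_ia32cap_P word (SSE2 in CPUID.1 EDX) and falls back to `.L002loop_x86` without it, while `cmpl $16777728` (0x01000200: FXSR in word 0 plus SSSE3 in word 1) routes to `.L003SSSE3`. A sketch of the same decision, assuming the usual OPENSSL_ia32cap_P layout:

/* Sketch of the dispatch above, assuming OPENSSL_ia32cap_P's usual layout:
 * word 0 mirrors CPUID.1 EDX (bit 26 = SSE2, bit 24 = FXSR) and word 1
 * mirrors CPUID.1 ECX (bit 9 = SSSE3); 0x01000200 == 16777728. */
extern unsigned int OPENSSL_ia32cap_P[4];

static int sha512_pick_path(void)
{
    unsigned int edx = OPENSSL_ia32cap_P[0], ecx = OPENSSL_ia32cap_P[1];

    if (!(edx & (1u << 26)))
        return 0;                       /* .L002loop_x86: plain i386 */
    if ((edx & (1u << 24)) && (ecx & (1u << 9)))
        return 2;                       /* .L003SSSE3 */
    return 1;                           /* .L004loop_sse2 */
}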
+.align 16
+.L004loop_sse2:
+ movq %mm1,8(%esp)
+ movq %mm2,16(%esp)
+ movq %mm3,24(%esp)
+ movq %mm5,40(%esp)
+ movq %mm6,48(%esp)
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ movq %mm0,%mm3
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ movl $15,%edx
+ bswap %eax
+ bswap %ebx
+ jmp .L00500_14_sse2
+.align 16
+.L00500_14_sse2:
+ movd %eax,%mm1
+ movl (%edi),%eax
+ movd %ebx,%mm7
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ bswap %eax
+ bswap %ebx
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ movq 48(%esp),%mm6
+ decl %edx
+ jnz .L00500_14_sse2
+ movd %eax,%mm1
+ movd %ebx,%mm7
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ pxor %mm0,%mm0
+ movl $32,%edx
+ jmp .L00616_79_sse2
+.align 16
+.L00616_79_sse2:
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm0
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm2
+ addl $8,%ebp
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm2
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm0
+ addl $8,%ebp
+ decl %edx
+ jnz .L00616_79_sse2
+ paddq %mm3,%mm0
+ movq 8(%esp),%mm1
+ movq 24(%esp),%mm3
+ movq 40(%esp),%mm5
+ movq 48(%esp),%mm6
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movl $640,%eax
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ leal (%esp,%eax,1),%esp
+ subl %eax,%ebp
+ cmpl 88(%esp),%edi
+ jb .L004loop_sse2
+ movl 92(%esp),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
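[editor's note] That `ret` ends the SSE2 sha512 path, which keeps the whole 64-bit working state in MMX registers on a 32-bit CPU. MMX has no rotate instruction, so every rotation above is synthesized as a psrlq/psllq pair XORed together; the shift counts compose to the standard SHA-512 sigma rotations, summarized in this scalar sketch with our own naming:

#include <stdint.h>

/* The psrlq/psllq pairs above synthesize 64-bit rotates; their counts
 * compose to the standard SHA-512 sigma rotations shown here. */
static uint64_t rotr64(uint64_t x, unsigned n)
{
    return (x >> n) | (x << (64 - n));
}

static uint64_t Sigma1(uint64_t e) { return rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41); }
static uint64_t Sigma0(uint64_t a) { return rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39); }
static uint64_t sigma0(uint64_t x) { return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7); }
static uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }

The `.L003SSSE3` variant that follows keeps the rounds in MMX but moves the message schedule into XMM registers; the `.byte 102,15,58,15,...` and `.byte 102,15,56,0,...` sequences are hand-encoded palignr and pshufb, presumably emitted as raw bytes for the benefit of older assemblers.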
+.align 32
+.L003SSSE3:
+ leal -64(%esp),%edx
+ subl $256,%esp
+ movdqa 640(%ebp),%xmm1
+ movdqu (%edi),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%edi),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%edi),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%edi),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%edi),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%edi),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%edi),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%edi),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movdqa %xmm2,-16(%edx)
+ nop
+.align 32
+.L007loop_ssse3:
+ movdqa 16(%edx),%xmm2
+ movdqa %xmm3,48(%edx)
+ leal 128(%ebp),%ebp
+ movq %mm1,8(%esp)
+ movl %edi,%ebx
+ movq %mm2,16(%esp)
+ leal 128(%edi),%edi
+ movq %mm3,24(%esp)
+ cmpl %eax,%edi
+ movq %mm5,40(%esp)
+ cmovbl %edi,%ebx
+ movq %mm6,48(%esp)
+ movl $4,%ecx
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ pxor %mm3,%mm3
+ jmp .L00800_47_ssse3
+.align 32
+.L00800_47_ssse3:
+ movdqa %xmm5,%xmm3
+ movdqa %xmm2,%xmm1
+.byte 102,15,58,15,208,8
+ movdqa %xmm4,(%edx)
+.byte 102,15,58,15,220,8
+ movdqa %xmm2,%xmm4
+ psrlq $7,%xmm2
+ paddq %xmm3,%xmm0
+ movdqa %xmm4,%xmm3
+ psrlq $1,%xmm4
+ psllq $56,%xmm3
+ pxor %xmm4,%xmm2
+ psrlq $7,%xmm4
+ pxor %xmm3,%xmm2
+ psllq $7,%xmm3
+ pxor %xmm4,%xmm2
+ movdqa %xmm7,%xmm4
+ pxor %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ psrlq $6,%xmm4
+ paddq %xmm2,%xmm0
+ movdqa %xmm7,%xmm2
+ psrlq $19,%xmm3
+ psllq $3,%xmm2
+ pxor %xmm3,%xmm4
+ psrlq $42,%xmm3
+ pxor %xmm2,%xmm4
+ psllq $42,%xmm2
+ pxor %xmm3,%xmm4
+ movdqa 32(%edx),%xmm3
+ pxor %xmm2,%xmm4
+ movdqa (%ebp),%xmm2
+ movq %mm4,%mm1
+ paddq %xmm4,%xmm0
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm0,%xmm2
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm2,-128(%edx)
+ movdqa %xmm6,%xmm4
+ movdqa %xmm3,%xmm2
+.byte 102,15,58,15,217,8
+ movdqa %xmm5,16(%edx)
+.byte 102,15,58,15,229,8
+ movdqa %xmm3,%xmm5
+ psrlq $7,%xmm3
+ paddq %xmm4,%xmm1
+ movdqa %xmm5,%xmm4
+ psrlq $1,%xmm5
+ psllq $56,%xmm4
+ pxor %xmm5,%xmm3
+ psrlq $7,%xmm5
+ pxor %xmm4,%xmm3
+ psllq $7,%xmm4
+ pxor %xmm5,%xmm3
+ movdqa %xmm0,%xmm5
+ pxor %xmm4,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $6,%xmm5
+ paddq %xmm3,%xmm1
+ movdqa %xmm0,%xmm3
+ psrlq $19,%xmm4
+ psllq $3,%xmm3
+ pxor %xmm4,%xmm5
+ psrlq $42,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $42,%xmm3
+ pxor %xmm4,%xmm5
+ movdqa 48(%edx),%xmm4
+ pxor %xmm3,%xmm5
+ movdqa 16(%ebp),%xmm3
+ movq %mm4,%mm1
+ paddq %xmm5,%xmm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm1,%xmm3
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm3,-112(%edx)
+ movdqa %xmm7,%xmm5
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,226,8
+ movdqa %xmm6,32(%edx)
+.byte 102,15,58,15,238,8
+ movdqa %xmm4,%xmm6
+ psrlq $7,%xmm4
+ paddq %xmm5,%xmm2
+ movdqa %xmm6,%xmm5
+ psrlq $1,%xmm6
+ psllq $56,%xmm5
+ pxor %xmm6,%xmm4
+ psrlq $7,%xmm6
+ pxor %xmm5,%xmm4
+ psllq $7,%xmm5
+ pxor %xmm6,%xmm4
+ movdqa %xmm1,%xmm6
+ pxor %xmm5,%xmm4
+ movdqa %xmm1,%xmm5
+ psrlq $6,%xmm6
+ paddq %xmm4,%xmm2
+ movdqa %xmm1,%xmm4
+ psrlq $19,%xmm5
+ psllq $3,%xmm4
+ pxor %xmm5,%xmm6
+ psrlq $42,%xmm5
+ pxor %xmm4,%xmm6
+ psllq $42,%xmm4
+ pxor %xmm5,%xmm6
+ movdqa (%edx),%xmm5
+ pxor %xmm4,%xmm6
+ movdqa 32(%ebp),%xmm4
+ movq %mm4,%mm1
+ paddq %xmm6,%xmm2
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm2,%xmm4
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm4,-96(%edx)
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm4
+.byte 102,15,58,15,235,8
+ movdqa %xmm7,48(%edx)
+.byte 102,15,58,15,247,8
+ movdqa %xmm5,%xmm7
+ psrlq $7,%xmm5
+ paddq %xmm6,%xmm3
+ movdqa %xmm7,%xmm6
+ psrlq $1,%xmm7
+ psllq $56,%xmm6
+ pxor %xmm7,%xmm5
+ psrlq $7,%xmm7
+ pxor %xmm6,%xmm5
+ psllq $7,%xmm6
+ pxor %xmm7,%xmm5
+ movdqa %xmm2,%xmm7
+ pxor %xmm6,%xmm5
+ movdqa %xmm2,%xmm6
+ psrlq $6,%xmm7
+ paddq %xmm5,%xmm3
+ movdqa %xmm2,%xmm5
+ psrlq $19,%xmm6
+ psllq $3,%xmm5
+ pxor %xmm6,%xmm7
+ psrlq $42,%xmm6
+ pxor %xmm5,%xmm7
+ psllq $42,%xmm5
+ pxor %xmm6,%xmm7
+ movdqa 16(%edx),%xmm6
+ pxor %xmm5,%xmm7
+ movdqa 48(%ebp),%xmm5
+ movq %mm4,%mm1
+ paddq %xmm7,%xmm3
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm3,%xmm5
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm5,-80(%edx)
+ movdqa %xmm1,%xmm7
+ movdqa %xmm6,%xmm5
+.byte 102,15,58,15,244,8
+ movdqa %xmm0,(%edx)
+.byte 102,15,58,15,248,8
+ movdqa %xmm6,%xmm0
+ psrlq $7,%xmm6
+ paddq %xmm7,%xmm4
+ movdqa %xmm0,%xmm7
+ psrlq $1,%xmm0
+ psllq $56,%xmm7
+ pxor %xmm0,%xmm6
+ psrlq $7,%xmm0
+ pxor %xmm7,%xmm6
+ psllq $7,%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm3,%xmm0
+ pxor %xmm7,%xmm6
+ movdqa %xmm3,%xmm7
+ psrlq $6,%xmm0
+ paddq %xmm6,%xmm4
+ movdqa %xmm3,%xmm6
+ psrlq $19,%xmm7
+ psllq $3,%xmm6
+ pxor %xmm7,%xmm0
+ psrlq $42,%xmm7
+ pxor %xmm6,%xmm0
+ psllq $42,%xmm6
+ pxor %xmm7,%xmm0
+ movdqa 32(%edx),%xmm7
+ pxor %xmm6,%xmm0
+ movdqa 64(%ebp),%xmm6
+ movq %mm4,%mm1
+ paddq %xmm0,%xmm4
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm4,%xmm6
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm6,-64(%edx)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm6
+.byte 102,15,58,15,253,8
+ movdqa %xmm1,16(%edx)
+.byte 102,15,58,15,193,8
+ movdqa %xmm7,%xmm1
+ psrlq $7,%xmm7
+ paddq %xmm0,%xmm5
+ movdqa %xmm1,%xmm0
+ psrlq $1,%xmm1
+ psllq $56,%xmm0
+ pxor %xmm1,%xmm7
+ psrlq $7,%xmm1
+ pxor %xmm0,%xmm7
+ psllq $7,%xmm0
+ pxor %xmm1,%xmm7
+ movdqa %xmm4,%xmm1
+ pxor %xmm0,%xmm7
+ movdqa %xmm4,%xmm0
+ psrlq $6,%xmm1
+ paddq %xmm7,%xmm5
+ movdqa %xmm4,%xmm7
+ psrlq $19,%xmm0
+ psllq $3,%xmm7
+ pxor %xmm0,%xmm1
+ psrlq $42,%xmm0
+ pxor %xmm7,%xmm1
+ psllq $42,%xmm7
+ pxor %xmm0,%xmm1
+ movdqa 48(%edx),%xmm0
+ pxor %xmm7,%xmm1
+ movdqa 80(%ebp),%xmm7
+ movq %mm4,%mm1
+ paddq %xmm1,%xmm5
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm5,%xmm7
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm7,-48(%edx)
+ movdqa %xmm3,%xmm1
+ movdqa %xmm0,%xmm7
+.byte 102,15,58,15,198,8
+ movdqa %xmm2,32(%edx)
+.byte 102,15,58,15,202,8
+ movdqa %xmm0,%xmm2
+ psrlq $7,%xmm0
+ paddq %xmm1,%xmm6
+ movdqa %xmm2,%xmm1
+ psrlq $1,%xmm2
+ psllq $56,%xmm1
+ pxor %xmm2,%xmm0
+ psrlq $7,%xmm2
+ pxor %xmm1,%xmm0
+ psllq $7,%xmm1
+ pxor %xmm2,%xmm0
+ movdqa %xmm5,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm1
+ psrlq $6,%xmm2
+ paddq %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $19,%xmm1
+ psllq $3,%xmm0
+ pxor %xmm1,%xmm2
+ psrlq $42,%xmm1
+ pxor %xmm0,%xmm2
+ psllq $42,%xmm0
+ pxor %xmm1,%xmm2
+ movdqa (%edx),%xmm1
+ pxor %xmm0,%xmm2
+ movdqa 96(%ebp),%xmm0
+ movq %mm4,%mm1
+ paddq %xmm2,%xmm6
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm6,%xmm0
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm0,-32(%edx)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm1,%xmm0
+.byte 102,15,58,15,207,8
+ movdqa %xmm3,48(%edx)
+.byte 102,15,58,15,211,8
+ movdqa %xmm1,%xmm3
+ psrlq $7,%xmm1
+ paddq %xmm2,%xmm7
+ movdqa %xmm3,%xmm2
+ psrlq $1,%xmm3
+ psllq $56,%xmm2
+ pxor %xmm3,%xmm1
+ psrlq $7,%xmm3
+ pxor %xmm2,%xmm1
+ psllq $7,%xmm2
+ pxor %xmm3,%xmm1
+ movdqa %xmm6,%xmm3
+ pxor %xmm2,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $6,%xmm3
+ paddq %xmm1,%xmm7
+ movdqa %xmm6,%xmm1
+ psrlq $19,%xmm2
+ psllq $3,%xmm1
+ pxor %xmm2,%xmm3
+ psrlq $42,%xmm2
+ pxor %xmm1,%xmm3
+ psllq $42,%xmm1
+ pxor %xmm2,%xmm3
+ movdqa 16(%edx),%xmm2
+ pxor %xmm1,%xmm3
+ movdqa 112(%ebp),%xmm1
+ movq %mm4,%mm1
+ paddq %xmm3,%xmm7
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm7,%xmm1
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm1,-16(%edx)
+ leal 128(%ebp),%ebp
+ decl %ecx
+ jnz .L00800_47_ssse3
+ movdqa (%ebp),%xmm1
+ leal -640(%ebp),%ebp
+ movdqu (%ebx),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%ebx),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movq %mm4,%mm1
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%ebx),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movq %mm4,%mm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%ebx),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movq %mm4,%mm1
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%ebx),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movq %mm4,%mm1
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%ebx),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movq %mm4,%mm1
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%ebx),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movq %mm4,%mm1
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%ebx),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movq %mm4,%mm1
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movq %mm4,%mm1
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm2,-16(%edx)
+ movq 8(%esp),%mm1
+ paddq %mm3,%mm0
+ movq 24(%esp),%mm3
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ cmpl %eax,%edi
+ jb .L007loop_ssse3
+ movl 76(%edx),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.align 16
.L002loop_x86:
movl (%edi),%eax
@@ -132,7 +2395,7 @@ sha512_block_data_order:
movl $16,%ecx
.long 2784229001
.align 16
-.L00300_15_x86:
+.L00900_15_x86:
movl 40(%esp),%ecx
movl 44(%esp),%edx
movl %ecx,%esi
@@ -239,9 +2502,9 @@ sha512_block_data_order:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $148,%dl
- jne .L00300_15_x86
+ jne .L00900_15_x86
.align 16
-.L00416_79_x86:
+.L01016_79_x86:
movl 312(%esp),%ecx
movl 316(%esp),%edx
movl %ecx,%esi
@@ -414,7 +2677,7 @@ sha512_block_data_order:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $23,%dl
- jne .L00416_79_x86
+ jne .L01016_79_x86
movl 840(%esp),%esi
movl 844(%esp),%edi
movl (%esi),%eax
@@ -565,6 +2828,7 @@ sha512_block_data_order:
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
+.comm OPENSSL_ia32cap_P,16,4
#else
.text
.globl sha512_block_data_order
@@ -592,6 +2856,2269 @@ sha512_block_data_order:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ leal OPENSSL_ia32cap_P,%edx
+ movl (%edx),%ecx
+ testl $67108864,%ecx
+ jz .L002loop_x86
+ movl 4(%edx),%edx
+ movq (%esi),%mm0
+ andl $16777216,%ecx
+ movq 8(%esi),%mm1
+ andl $512,%edx
+ movq 16(%esi),%mm2
+ orl %edx,%ecx
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ cmpl $16777728,%ecx
+ je .L003SSSE3
+ subl $80,%esp
+ jmp .L004loop_sse2
+.align 16
+.L004loop_sse2:
+ movq %mm1,8(%esp)
+ movq %mm2,16(%esp)
+ movq %mm3,24(%esp)
+ movq %mm5,40(%esp)
+ movq %mm6,48(%esp)
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ movq %mm0,%mm3
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ movl $15,%edx
+ bswap %eax
+ bswap %ebx
+ jmp .L00500_14_sse2
+.align 16
+.L00500_14_sse2:
+ movd %eax,%mm1
+ movl (%edi),%eax
+ movd %ebx,%mm7
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ bswap %eax
+ bswap %ebx
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ movq 48(%esp),%mm6
+ decl %edx
+ jnz .L00500_14_sse2
+ movd %eax,%mm1
+ movd %ebx,%mm7
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ pxor %mm0,%mm0
+ movl $32,%edx
+ jmp .L00616_79_sse2
+.align 16
+.L00616_79_sse2:
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm0
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm2
+ addl $8,%ebp
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm2
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm0
+ addl $8,%ebp
+ decl %edx
+ jnz .L00616_79_sse2
+ paddq %mm3,%mm0
+ movq 8(%esp),%mm1
+ movq 24(%esp),%mm3
+ movq 40(%esp),%mm5
+ movq 48(%esp),%mm6
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movl $640,%eax
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ leal (%esp,%eax,1),%esp
+ subl %eax,%ebp
+ cmpl 88(%esp),%edi
+ jb .L004loop_sse2
+ movl 92(%esp),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L003SSSE3:
+ leal -64(%esp),%edx
+ subl $256,%esp
+ movdqa 640(%ebp),%xmm1
+ movdqu (%edi),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%edi),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%edi),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%edi),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%edi),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%edi),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%edi),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%edi),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movdqa %xmm2,-16(%edx)
+ nop
+.align 32
+.L007loop_ssse3:
+ movdqa 16(%edx),%xmm2
+ movdqa %xmm3,48(%edx)
+ leal 128(%ebp),%ebp
+ movq %mm1,8(%esp)
+ movl %edi,%ebx
+ movq %mm2,16(%esp)
+ leal 128(%edi),%edi
+ movq %mm3,24(%esp)
+ cmpl %eax,%edi
+ movq %mm5,40(%esp)
+ cmovbl %edi,%ebx
+ movq %mm6,48(%esp)
+ movl $4,%ecx
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ pxor %mm3,%mm3
+ jmp .L00800_47_ssse3
+.align 32
+.L00800_47_ssse3:
+ movdqa %xmm5,%xmm3
+ movdqa %xmm2,%xmm1
+.byte 102,15,58,15,208,8
+ movdqa %xmm4,(%edx)
+.byte 102,15,58,15,220,8
+ movdqa %xmm2,%xmm4
+ psrlq $7,%xmm2
+ paddq %xmm3,%xmm0
+ movdqa %xmm4,%xmm3
+ psrlq $1,%xmm4
+ psllq $56,%xmm3
+ pxor %xmm4,%xmm2
+ psrlq $7,%xmm4
+ pxor %xmm3,%xmm2
+ psllq $7,%xmm3
+ pxor %xmm4,%xmm2
+ movdqa %xmm7,%xmm4
+ pxor %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ psrlq $6,%xmm4
+ paddq %xmm2,%xmm0
+ movdqa %xmm7,%xmm2
+ psrlq $19,%xmm3
+ psllq $3,%xmm2
+ pxor %xmm3,%xmm4
+ psrlq $42,%xmm3
+ pxor %xmm2,%xmm4
+ psllq $42,%xmm2
+ pxor %xmm3,%xmm4
+ movdqa 32(%edx),%xmm3
+ pxor %xmm2,%xmm4
+ movdqa (%ebp),%xmm2
+ movq %mm4,%mm1
+ paddq %xmm4,%xmm0
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm0,%xmm2
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm2,-128(%edx)
+ movdqa %xmm6,%xmm4
+ movdqa %xmm3,%xmm2
+.byte 102,15,58,15,217,8
+ movdqa %xmm5,16(%edx)
+.byte 102,15,58,15,229,8
+ movdqa %xmm3,%xmm5
+ psrlq $7,%xmm3
+ paddq %xmm4,%xmm1
+ movdqa %xmm5,%xmm4
+ psrlq $1,%xmm5
+ psllq $56,%xmm4
+ pxor %xmm5,%xmm3
+ psrlq $7,%xmm5
+ pxor %xmm4,%xmm3
+ psllq $7,%xmm4
+ pxor %xmm5,%xmm3
+ movdqa %xmm0,%xmm5
+ pxor %xmm4,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $6,%xmm5
+ paddq %xmm3,%xmm1
+ movdqa %xmm0,%xmm3
+ psrlq $19,%xmm4
+ psllq $3,%xmm3
+ pxor %xmm4,%xmm5
+ psrlq $42,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $42,%xmm3
+ pxor %xmm4,%xmm5
+ movdqa 48(%edx),%xmm4
+ pxor %xmm3,%xmm5
+ movdqa 16(%ebp),%xmm3
+ movq %mm4,%mm1
+ paddq %xmm5,%xmm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm1,%xmm3
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm3,-112(%edx)
+ movdqa %xmm7,%xmm5
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,226,8
+ movdqa %xmm6,32(%edx)
+.byte 102,15,58,15,238,8
+ movdqa %xmm4,%xmm6
+ psrlq $7,%xmm4
+ paddq %xmm5,%xmm2
+ movdqa %xmm6,%xmm5
+ psrlq $1,%xmm6
+ psllq $56,%xmm5
+ pxor %xmm6,%xmm4
+ psrlq $7,%xmm6
+ pxor %xmm5,%xmm4
+ psllq $7,%xmm5
+ pxor %xmm6,%xmm4
+ movdqa %xmm1,%xmm6
+ pxor %xmm5,%xmm4
+ movdqa %xmm1,%xmm5
+ psrlq $6,%xmm6
+ paddq %xmm4,%xmm2
+ movdqa %xmm1,%xmm4
+ psrlq $19,%xmm5
+ psllq $3,%xmm4
+ pxor %xmm5,%xmm6
+ psrlq $42,%xmm5
+ pxor %xmm4,%xmm6
+ psllq $42,%xmm4
+ pxor %xmm5,%xmm6
+ movdqa (%edx),%xmm5
+ pxor %xmm4,%xmm6
+ movdqa 32(%ebp),%xmm4
+ movq %mm4,%mm1
+ paddq %xmm6,%xmm2
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm2,%xmm4
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm4,-96(%edx)
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm4
+.byte 102,15,58,15,235,8
+ movdqa %xmm7,48(%edx)
+.byte 102,15,58,15,247,8
+ movdqa %xmm5,%xmm7
+ psrlq $7,%xmm5
+ paddq %xmm6,%xmm3
+ movdqa %xmm7,%xmm6
+ psrlq $1,%xmm7
+ psllq $56,%xmm6
+ pxor %xmm7,%xmm5
+ psrlq $7,%xmm7
+ pxor %xmm6,%xmm5
+ psllq $7,%xmm6
+ pxor %xmm7,%xmm5
+ movdqa %xmm2,%xmm7
+ pxor %xmm6,%xmm5
+ movdqa %xmm2,%xmm6
+ psrlq $6,%xmm7
+ paddq %xmm5,%xmm3
+ movdqa %xmm2,%xmm5
+ psrlq $19,%xmm6
+ psllq $3,%xmm5
+ pxor %xmm6,%xmm7
+ psrlq $42,%xmm6
+ pxor %xmm5,%xmm7
+ psllq $42,%xmm5
+ pxor %xmm6,%xmm7
+ movdqa 16(%edx),%xmm6
+ pxor %xmm5,%xmm7
+ movdqa 48(%ebp),%xmm5
+ movq %mm4,%mm1
+ paddq %xmm7,%xmm3
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm3,%xmm5
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm5,-80(%edx)
+ movdqa %xmm1,%xmm7
+ movdqa %xmm6,%xmm5
+.byte 102,15,58,15,244,8
+ movdqa %xmm0,(%edx)
+.byte 102,15,58,15,248,8
+ movdqa %xmm6,%xmm0
+ psrlq $7,%xmm6
+ paddq %xmm7,%xmm4
+ movdqa %xmm0,%xmm7
+ psrlq $1,%xmm0
+ psllq $56,%xmm7
+ pxor %xmm0,%xmm6
+ psrlq $7,%xmm0
+ pxor %xmm7,%xmm6
+ psllq $7,%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm3,%xmm0
+ pxor %xmm7,%xmm6
+ movdqa %xmm3,%xmm7
+ psrlq $6,%xmm0
+ paddq %xmm6,%xmm4
+ movdqa %xmm3,%xmm6
+ psrlq $19,%xmm7
+ psllq $3,%xmm6
+ pxor %xmm7,%xmm0
+ psrlq $42,%xmm7
+ pxor %xmm6,%xmm0
+ psllq $42,%xmm6
+ pxor %xmm7,%xmm0
+ movdqa 32(%edx),%xmm7
+ pxor %xmm6,%xmm0
+ movdqa 64(%ebp),%xmm6
+ movq %mm4,%mm1
+ paddq %xmm0,%xmm4
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm4,%xmm6
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm6,-64(%edx)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm6
+.byte 102,15,58,15,253,8
+ movdqa %xmm1,16(%edx)
+.byte 102,15,58,15,193,8
+ movdqa %xmm7,%xmm1
+ psrlq $7,%xmm7
+ paddq %xmm0,%xmm5
+ movdqa %xmm1,%xmm0
+ psrlq $1,%xmm1
+ psllq $56,%xmm0
+ pxor %xmm1,%xmm7
+ psrlq $7,%xmm1
+ pxor %xmm0,%xmm7
+ psllq $7,%xmm0
+ pxor %xmm1,%xmm7
+ movdqa %xmm4,%xmm1
+ pxor %xmm0,%xmm7
+ movdqa %xmm4,%xmm0
+ psrlq $6,%xmm1
+ paddq %xmm7,%xmm5
+ movdqa %xmm4,%xmm7
+ psrlq $19,%xmm0
+ psllq $3,%xmm7
+ pxor %xmm0,%xmm1
+ psrlq $42,%xmm0
+ pxor %xmm7,%xmm1
+ psllq $42,%xmm7
+ pxor %xmm0,%xmm1
+ movdqa 48(%edx),%xmm0
+ pxor %xmm7,%xmm1
+ movdqa 80(%ebp),%xmm7
+ movq %mm4,%mm1
+ paddq %xmm1,%xmm5
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm5,%xmm7
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm7,-48(%edx)
+ movdqa %xmm3,%xmm1
+ movdqa %xmm0,%xmm7
+.byte 102,15,58,15,198,8
+ movdqa %xmm2,32(%edx)
+.byte 102,15,58,15,202,8
+ movdqa %xmm0,%xmm2
+ psrlq $7,%xmm0
+ paddq %xmm1,%xmm6
+ movdqa %xmm2,%xmm1
+ psrlq $1,%xmm2
+ psllq $56,%xmm1
+ pxor %xmm2,%xmm0
+ psrlq $7,%xmm2
+ pxor %xmm1,%xmm0
+ psllq $7,%xmm1
+ pxor %xmm2,%xmm0
+ movdqa %xmm5,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm1
+ psrlq $6,%xmm2
+ paddq %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $19,%xmm1
+ psllq $3,%xmm0
+ pxor %xmm1,%xmm2
+ psrlq $42,%xmm1
+ pxor %xmm0,%xmm2
+ psllq $42,%xmm0
+ pxor %xmm1,%xmm2
+ movdqa (%edx),%xmm1
+ pxor %xmm0,%xmm2
+ movdqa 96(%ebp),%xmm0
+ movq %mm4,%mm1
+ paddq %xmm2,%xmm6
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm6,%xmm0
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm0,-32(%edx)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm1,%xmm0
+.byte 102,15,58,15,207,8
+ movdqa %xmm3,48(%edx)
+.byte 102,15,58,15,211,8
+ movdqa %xmm1,%xmm3
+ psrlq $7,%xmm1
+ paddq %xmm2,%xmm7
+ movdqa %xmm3,%xmm2
+ psrlq $1,%xmm3
+ psllq $56,%xmm2
+ pxor %xmm3,%xmm1
+ psrlq $7,%xmm3
+ pxor %xmm2,%xmm1
+ psllq $7,%xmm2
+ pxor %xmm3,%xmm1
+ movdqa %xmm6,%xmm3
+ pxor %xmm2,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $6,%xmm3
+ paddq %xmm1,%xmm7
+ movdqa %xmm6,%xmm1
+ psrlq $19,%xmm2
+ psllq $3,%xmm1
+ pxor %xmm2,%xmm3
+ psrlq $42,%xmm2
+ pxor %xmm1,%xmm3
+ psllq $42,%xmm1
+ pxor %xmm2,%xmm3
+ movdqa 16(%edx),%xmm2
+ pxor %xmm1,%xmm3
+ movdqa 112(%ebp),%xmm1
+ movq %mm4,%mm1
+ paddq %xmm3,%xmm7
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm7,%xmm1
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm1,-16(%edx)
+ leal 128(%ebp),%ebp
+ decl %ecx
+ jnz .L00800_47_ssse3
+ movdqa (%ebp),%xmm1
+ leal -640(%ebp),%ebp
+ movdqu (%ebx),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%ebx),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movq %mm4,%mm1
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%ebx),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movq %mm4,%mm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%ebx),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movq %mm4,%mm1
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%ebx),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movq %mm4,%mm1
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%ebx),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movq %mm4,%mm1
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%ebx),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movq %mm4,%mm1
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%ebx),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movq %mm4,%mm1
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movq %mm4,%mm1
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm2,-16(%edx)
+ movq 8(%esp),%mm1
+ paddq %mm3,%mm0
+ movq 24(%esp),%mm3
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ cmpl %eax,%edi
+ jb .L007loop_ssse3
+ movl 76(%edx),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.align 16
.L002loop_x86:
movl (%edi),%eax
@@ -697,7 +5224,7 @@ sha512_block_data_order:
movl $16,%ecx
.long 2784229001
.align 16
-.L00300_15_x86:
+.L00900_15_x86:
movl 40(%esp),%ecx
movl 44(%esp),%edx
movl %ecx,%esi
@@ -804,9 +5331,9 @@ sha512_block_data_order:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $148,%dl
- jne .L00300_15_x86
+ jne .L00900_15_x86
.align 16
-.L00416_79_x86:
+.L01016_79_x86:
movl 312(%esp),%ecx
movl 316(%esp),%edx
movl %ecx,%esi
@@ -979,7 +5506,7 @@ sha512_block_data_order:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $23,%dl
- jne .L00416_79_x86
+ jne .L01016_79_x86
movl 840(%esp),%esi
movl 844(%esp),%edi
movl (%esi),%eax
@@ -1130,4 +5657,5 @@ sha512_block_data_order:
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
+.comm OPENSSL_ia32cap_P,16,4
#endif
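The sha512-586.S diff above ends here. What it adds, in both the PIC and non-PIC halves, is run-time dispatch in sha512_block_data_order: the code probes OPENSSL_ia32cap_P (bit 26 of the first word, SSE2, via the testl $67108864 / btl $26 checks) and, judging by the cmpl $16777728 compare, selects the SSSE3 body when bit 24 of the first word and bit 9 of the second word are both set, the MMX/SSE2 body when only SSE2 is available, and the original 32-bit .L002loop_x86 code otherwise. All paths compute the same FIPS 180-4 rounds; the long psrlq/psllq/pxor runs are the rotate-xor sigma functions split into shifts because MMX has no 64-bit rotate, and the .byte 102,15,56,0,... sequences are pshufb byte swaps of the big-endian input. A minimal C sketch of those primitives and one round, for orientation only (names are illustrative, not OpenSSL's):

    #include <stdint.h>

    #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

    /* FIPS 180-4 big/small sigma; the psrlq/psllq/pxor runs above are
     * these rotate-xor combinations expressed as shifts. */
    static uint64_t Sigma0(uint64_t x) { return ROTR64(x,28) ^ ROTR64(x,34) ^ ROTR64(x,39); }
    static uint64_t Sigma1(uint64_t x) { return ROTR64(x,14) ^ ROTR64(x,18) ^ ROTR64(x,41); }
    static uint64_t sigma0(uint64_t x) { return ROTR64(x,1)  ^ ROTR64(x,8)  ^ (x >> 7); }
    static uint64_t sigma1(uint64_t x) { return ROTR64(x,19) ^ ROTR64(x,61) ^ (x >> 6); }

    /* One round; s[0..7] = a..h, Kt a round constant from the table at
     * %ebp, Wt the schedule word (for t >= 16, Wt = sigma1(W[t-2]) +
     * W[t-7] + sigma0(W[t-15]) + W[t-16]). */
    static void sha512_round(uint64_t s[8], uint64_t Kt, uint64_t Wt)
    {
        uint64_t Ch  = (s[4] & s[5]) ^ (~s[4] & s[6]);
        uint64_t Maj = (s[0] & s[1]) ^ (s[0] & s[2]) ^ (s[1] & s[2]);
        uint64_t T1  = s[7] + Sigma1(s[4]) + Ch + Kt + Wt;
        uint64_t T2  = Sigma0(s[0]) + Maj;
        s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + T1;
        s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = T1 + T2;
    }

The SSSE3 body additionally vectorizes the message schedule: the paired .byte 102,15,58,15,...,8 sequences are palignr $8 instructions forming the W[t-15] and W[t-2] windows two 64-bit lanes at a time, which is why the x86 fallback's labels were renumbered (.L003/.L004 to .L009/.L010) by the sequential label counter of the generator.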
diff --git a/secure/lib/libcrypto/i386/x86-gf2m.S b/secure/lib/libcrypto/i386/x86-gf2m.S
index f649387cf175..9a137085bffe 100644
--- a/secure/lib/libcrypto/i386/x86-gf2m.S
+++ b/secure/lib/libcrypto/i386/x86-gf2m.S
@@ -250,6 +250,18 @@ bn_GF2m_mul_2x2:
movl 4(%edx),%edx
testl $8388608,%eax
jz .L001ialu
+ testl $16777216,%eax
+ jz .L002mmx
+ testl $2,%edx
+ jz .L002mmx
+ movups 8(%esp),%xmm0
+ shufps $177,%xmm0,%xmm0
+.byte 102,15,58,68,192,1
+ movl 4(%esp),%eax
+ movups %xmm0,(%eax)
+ ret
+.align 16
+.L002mmx:
pushl %ebp
pushl %ebx
pushl %esi
@@ -581,6 +593,18 @@ bn_GF2m_mul_2x2:
movl 4(%edx),%edx
testl $8388608,%eax
jz .L000ialu
+ testl $16777216,%eax
+ jz .L001mmx
+ testl $2,%edx
+ jz .L001mmx
+ movups 8(%esp),%xmm0
+ shufps $177,%xmm0,%xmm0
+.byte 102,15,58,68,192,1
+ movl 4(%esp),%eax
+ movups %xmm0,(%eax)
+ ret
+.align 16
+.L001mmx:
pushl %ebp
pushl %ebx
pushl %esi
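Both bn_GF2m_mul_2x2 variants in the x86-gf2m.S hunks gain the same early-out: when OPENSSL_ia32cap_P advertises bit 24 of the first word (testl $16777216) and bit 1 of the second word, which is the PCLMULQDQ feature flag, the four 32-bit arguments are loaded into one xmm register, shufps $177 swaps the dwords into two little-endian 64-bit operands, and the .byte 102,15,58,68,192,1 sequence (pclmulqdq $1,%xmm0,%xmm0) produces the full 128-bit carry-less product in a single instruction before the result is stored through the pointer at 4(%esp). If either bit is clear, control falls through to the pre-existing MMX ladder, now relabelled .L002mmx / .L001mmx. A shift-and-xor C sketch of what that one instruction computes (clmul64 is an assumed helper, not OpenSSL code):

    #include <stdint.h>

    /* Carry-less 64x64 -> 128-bit multiply over GF(2)[x]: the
     * shift-and-xor equivalent of the single pclmulqdq issued above. */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
        uint64_t h = 0, l = 0;
        for (int i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                l ^= a << i;
                if (i != 0)
                    h ^= a >> (64 - i);  /* skip the undefined shift by 64 */
            }
        }
        *hi = h;
        *lo = l;
    }

Unlike an integer multiply there are no carries between bit positions, which is what makes the polynomial product over GF(2) a plain xor accumulation.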
diff --git a/secure/lib/libcrypto/i386/x86-mont.S b/secure/lib/libcrypto/i386/x86-mont.S
index 468e390190c7..dbb8531aefaf 100644
--- a/secure/lib/libcrypto/i386/x86-mont.S
+++ b/secure/lib/libcrypto/i386/x86-mont.S
@@ -59,6 +59,126 @@ bn_mul_mont:
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %edx,24(%esp)
+ call .L003PIC_me_up
+.L003PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L004non_sse2
+ movl $-1,%eax
+ movd %eax,%mm7
+ movl 8(%esp),%esi
+ movl 12(%esp),%edi
+ movl 16(%esp),%ebp
+ xorl %edx,%edx
+ xorl %ecx,%ecx
+ movd (%edi),%mm4
+ movd (%esi),%mm5
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ movq %mm5,%mm2
+ movq %mm5,%mm0
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ incl %ecx
+.align 16
+.L0051st:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ leal 1(%ecx),%ecx
+ cmpl %ebx,%ecx
+ jl .L0051st
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm2,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ incl %edx
+.L006outer:
+ xorl %ecx,%ecx
+ movd (%edi,%edx,4),%mm4
+ movd (%esi),%mm5
+ movd 32(%esp),%mm6
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ paddq %mm6,%mm5
+ movq %mm5,%mm0
+ movq %mm5,%mm2
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 36(%esp),%mm6
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ incl %ecx
+ decl %ebx
+.L007inner:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ movd 36(%esp,%ecx,4),%mm6
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ decl %ebx
+ leal 1(%ecx),%ecx
+ jnz .L007inner
+ movl %ecx,%ebx
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ movd 36(%esp,%ebx,4),%mm6
+ paddq %mm2,%mm3
+ paddq %mm6,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ leal 1(%edx),%edx
+ cmpl %ebx,%edx
+ jle .L006outer
+ emms
+ jmp .L008common_tail
+.align 16
+.L004non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
@@ -69,12 +189,12 @@ bn_mul_mont:
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
- jz .L003bn_sqr_mont
+ jz .L009bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
.align 16
-.L004mull:
+.L010mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
@@ -83,7 +203,7 @@ bn_mul_mont:
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl .L004mull
+ jl .L010mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
@@ -101,9 +221,9 @@ bn_mul_mont:
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
- jmp .L0052ndmadd
+ jmp .L0112ndmadd
.align 16
-.L0061stmadd:
+.L0121stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -114,7 +234,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl .L0061stmadd
+ jl .L0121stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
@@ -137,7 +257,7 @@ bn_mul_mont:
adcl $0,%edx
movl $1,%ecx
.align 16
-.L0052ndmadd:
+.L0112ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -148,7 +268,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl .L0052ndmadd
+ jl .L0112ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -164,16 +284,16 @@ bn_mul_mont:
movl %edx,32(%esp,%ebx,4)
cmpl 28(%esp),%ecx
movl %eax,36(%esp,%ebx,4)
- je .L007common_tail
+ je .L008common_tail
movl (%ecx),%edi
movl 8(%esp),%esi
movl %ecx,12(%esp)
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
- jmp .L0061stmadd
+ jmp .L0121stmadd
.align 16
-.L003bn_sqr_mont:
+.L009bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
@@ -184,7 +304,7 @@ bn_mul_mont:
andl $1,%ebx
incl %ecx
.align 16
-.L008sqr:
+.L013sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -196,7 +316,7 @@ bn_mul_mont:
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
- jl .L008sqr
+ jl .L013sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -220,7 +340,7 @@ bn_mul_mont:
movl 4(%esi),%eax
movl $1,%ecx
.align 16
-.L0093rdmadd:
+.L0143rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -239,7 +359,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl .L0093rdmadd
+ jl .L0143rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -255,7 +375,7 @@ bn_mul_mont:
movl %edx,32(%esp,%ebx,4)
cmpl %ebx,%ecx
movl %eax,36(%esp,%ebx,4)
- je .L007common_tail
+ je .L008common_tail
movl 4(%esi,%ecx,4),%edi
leal 1(%ecx),%ecx
movl %edi,%eax
@@ -267,12 +387,12 @@ bn_mul_mont:
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
- je .L010sqrlast
+ je .L015sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 16
-.L011sqradd:
+.L016sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -288,13 +408,13 @@ bn_mul_mont:
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
- jle .L011sqradd
+ jle .L016sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
-.L010sqrlast:
+.L015sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
@@ -309,9 +429,9 @@ bn_mul_mont:
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
- jmp .L0093rdmadd
+ jmp .L0143rdmadd
.align 16
-.L007common_tail:
+.L008common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -319,19 +439,19 @@ bn_mul_mont:
movl %ebx,%ecx
xorl %edx,%edx
.align 16
-.L012sub:
+.L017sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge .L012sub
+ jge .L017sub
sbbl $0,%eax
movl $-1,%edx
xorl %eax,%edx
- jmp .L013copy
+ jmp .L018copy
.align 16
-.L013copy:
+.L018copy:
movl 32(%esp,%ebx,4),%esi
movl (%edi,%ebx,4),%ebp
movl %ecx,32(%esp,%ebx,4)
@@ -340,7 +460,7 @@ bn_mul_mont:
orl %esi,%ebp
movl %ebp,(%edi,%ebx,4)
decl %ebx
- jge .L013copy
+ jge .L018copy
movl 24(%esp),%esp
movl $1,%eax
.L000just_leave:
@@ -355,6 +475,7 @@ bn_mul_mont:
.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte 111,114,103,62,0
+.comm OPENSSL_ia32cap_P,16,4
#else
.text
.globl bn_mul_mont
@@ -414,6 +535,123 @@ bn_mul_mont:
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %edx,24(%esp)
+ leal OPENSSL_ia32cap_P,%eax
+ btl $26,(%eax)
+ jnc .L003non_sse2
+ movl $-1,%eax
+ movd %eax,%mm7
+ movl 8(%esp),%esi
+ movl 12(%esp),%edi
+ movl 16(%esp),%ebp
+ xorl %edx,%edx
+ xorl %ecx,%ecx
+ movd (%edi),%mm4
+ movd (%esi),%mm5
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ movq %mm5,%mm2
+ movq %mm5,%mm0
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ incl %ecx
+.align 16
+.L0041st:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ leal 1(%ecx),%ecx
+ cmpl %ebx,%ecx
+ jl .L0041st
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm2,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ incl %edx
+.L005outer:
+ xorl %ecx,%ecx
+ movd (%edi,%edx,4),%mm4
+ movd (%esi),%mm5
+ movd 32(%esp),%mm6
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ paddq %mm6,%mm5
+ movq %mm5,%mm0
+ movq %mm5,%mm2
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 36(%esp),%mm6
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ incl %ecx
+ decl %ebx
+.L006inner:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ movd 36(%esp,%ecx,4),%mm6
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ decl %ebx
+ leal 1(%ecx),%ecx
+ jnz .L006inner
+ movl %ecx,%ebx
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ movd 36(%esp,%ebx,4),%mm6
+ paddq %mm2,%mm3
+ paddq %mm6,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ leal 1(%edx),%edx
+ cmpl %ebx,%edx
+ jle .L005outer
+ emms
+ jmp .L007common_tail
+.align 16
+.L003non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
@@ -424,12 +662,12 @@ bn_mul_mont:
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
- jz .L003bn_sqr_mont
+ jz .L008bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
.align 16
-.L004mull:
+.L009mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
@@ -438,7 +676,7 @@ bn_mul_mont:
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl .L004mull
+ jl .L009mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
@@ -456,9 +694,9 @@ bn_mul_mont:
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
- jmp .L0052ndmadd
+ jmp .L0102ndmadd
.align 16
-.L0061stmadd:
+.L0111stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -469,7 +707,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl .L0061stmadd
+ jl .L0111stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
@@ -492,7 +730,7 @@ bn_mul_mont:
adcl $0,%edx
movl $1,%ecx
.align 16
-.L0052ndmadd:
+.L0102ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -503,7 +741,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl .L0052ndmadd
+ jl .L0102ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -526,9 +764,9 @@ bn_mul_mont:
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
- jmp .L0061stmadd
+ jmp .L0111stmadd
.align 16
-.L003bn_sqr_mont:
+.L008bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
@@ -539,7 +777,7 @@ bn_mul_mont:
andl $1,%ebx
incl %ecx
.align 16
-.L008sqr:
+.L012sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -551,7 +789,7 @@ bn_mul_mont:
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
- jl .L008sqr
+ jl .L012sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -575,7 +813,7 @@ bn_mul_mont:
movl 4(%esi),%eax
movl $1,%ecx
.align 16
-.L0093rdmadd:
+.L0133rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -594,7 +832,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl .L0093rdmadd
+ jl .L0133rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -622,12 +860,12 @@ bn_mul_mont:
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
- je .L010sqrlast
+ je .L014sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 16
-.L011sqradd:
+.L015sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -643,13 +881,13 @@ bn_mul_mont:
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
- jle .L011sqradd
+ jle .L015sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
-.L010sqrlast:
+.L014sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
@@ -664,7 +902,7 @@ bn_mul_mont:
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
- jmp .L0093rdmadd
+ jmp .L0133rdmadd
.align 16
.L007common_tail:
movl 16(%esp),%ebp
@@ -674,19 +912,19 @@ bn_mul_mont:
movl %ebx,%ecx
xorl %edx,%edx
.align 16
-.L012sub:
+.L016sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge .L012sub
+ jge .L016sub
sbbl $0,%eax
movl $-1,%edx
xorl %eax,%edx
- jmp .L013copy
+ jmp .L017copy
.align 16
-.L013copy:
+.L017copy:
movl 32(%esp,%ebx,4),%esi
movl (%edi,%ebx,4),%ebp
movl %ecx,32(%esp,%ebx,4)
@@ -695,7 +933,7 @@ bn_mul_mont:
orl %esi,%ebp
movl %ebp,(%edi,%ebx,4)
decl %ebx
- jge .L013copy
+ jge .L017copy
movl 24(%esp),%esp
movl $1,%eax
.L000just_leave:
@@ -710,4 +948,5 @@ bn_mul_mont:
.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte 111,114,103,62,0
+.comm OPENSSL_ia32cap_P,16,4
#endif
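
The new fast path in both bn_mul_mont variants is the same word-serial Montgomery loop: %mm7 holds the 0xffffffff limb mask, 20(%esp) caches n0 (the negated inverse of n modulo 2^32), and the outer/inner label pairs (.L006outer/.L007inner in the PIC copy) keep two pmuludq carry chains alive, one for a[j]*b[i] and one for m*n[j]. A C sketch of one outer step under those assumptions, with t[] standing in for the scratch area at 32(%esp):

#include <stdint.h>

/* One outer step of word-serial Montgomery multiplication on 32-bit
 * limbs; two 64-bit carry chains run side by side, as %mm2 and %mm3
 * do in the listing.  Illustrative sketch, not OpenSSL's source. */
static void mont_step(uint32_t *t, const uint32_t *a, uint32_t bi,
    const uint32_t *n, uint32_t n0, int num)
{
	uint64_t ab, mn;
	uint32_t m;

	ab = (uint64_t)a[0] * bi + t[0];
	m = (uint32_t)ab * n0;			/* pmuludq 20(%esp),%mm5 */
	mn = (uint64_t)m * n[0] + (uint32_t)ab;	/* low word cancels to 0 */
	ab >>= 32;
	mn >>= 32;

	for (int j = 1; j < num; j++) {
		ab += (uint64_t)a[j] * bi + t[j];
		mn += (uint64_t)m * n[j] + (uint32_t)ab;
		t[j - 1] = (uint32_t)mn;	/* movd %mm3,28(%esp,...) */
		ab >>= 32;
		mn >>= 32;
	}
	mn += ab + t[num];			/* fold in the saved top word */
	t[num - 1] = (uint32_t)mn;
	t[num] = (uint32_t)(mn >> 32);
}

m is chosen so the low word of t + a*bi + m*n is zero; only the shifted-down carries survive each iteration, which is what the paired psrlq $32 instructions express.
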
diff --git a/secure/lib/libcrypto/i386/x86cpuid.S b/secure/lib/libcrypto/i386/x86cpuid.S
index a8101a077cdb..6029b2df04fe 100644
--- a/secure/lib/libcrypto/i386/x86cpuid.S
+++ b/secure/lib/libcrypto/i386/x86cpuid.S
@@ -236,6 +236,18 @@ OPENSSL_wipe_cpu:
movl (%ecx),%ecx
btl $1,(%ecx)
jnc .L016no_x87
+ andl $83886080,%ecx
+ cmpl $83886080,%ecx
+ jne .L017no_sse2
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+.L017no_sse2:
.long 4007259865,4007259865,4007259865,4007259865,2430851995
.L016no_x87:
leal 4(%esp),%eax
@@ -251,11 +263,11 @@ OPENSSL_atomic_add:
pushl %ebx
nop
movl (%edx),%eax
-.L017spin:
+.L018spin:
leal (%eax,%ecx,1),%ebx
nop
.long 447811568
- jne .L017spin
+ jne .L018spin
movl %ebx,%eax
popl %ebx
ret
@@ -269,32 +281,32 @@ OPENSSL_cleanse:
movl 8(%esp),%ecx
xorl %eax,%eax
cmpl $7,%ecx
- jae .L018lot
+ jae .L019lot
cmpl $0,%ecx
- je .L019ret
-.L020little:
+ je .L020ret
+.L021little:
movb %al,(%edx)
subl $1,%ecx
leal 1(%edx),%edx
- jnz .L020little
-.L019ret:
+ jnz .L021little
+.L020ret:
ret
.align 16
-.L018lot:
+.L019lot:
testl $3,%edx
- jz .L021aligned
+ jz .L022aligned
movb %al,(%edx)
leal -1(%ecx),%ecx
leal 1(%edx),%edx
- jmp .L018lot
-.L021aligned:
+ jmp .L019lot
+.L022aligned:
movl %eax,(%edx)
leal -4(%ecx),%ecx
testl $-4,%ecx
leal 4(%edx),%edx
- jnz .L021aligned
+ jnz .L022aligned
cmpl $0,%ecx
- jne .L020little
+ jne .L021little
ret
.size OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
.globl CRYPTO_memcmp
@@ -310,18 +322,18 @@ CRYPTO_memcmp:
xorl %eax,%eax
xorl %edx,%edx
cmpl $0,%ecx
- je .L022no_data
-.L023loop:
+ je .L023no_data
+.L024loop:
movb (%esi),%dl
leal 1(%esi),%esi
xorb (%edi),%dl
leal 1(%edi),%edi
orb %dl,%al
decl %ecx
- jnz .L023loop
+ jnz .L024loop
negl %eax
shrl $31,%eax
-.L022no_data:
+.L023no_data:
popl %edi
popl %esi
ret
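
CRYPTO_memcmp itself is unchanged here apart from label renumbering; for reference, the loop ORs the XOR of every byte pair into %al, and the closing negl/shrl $31 collapses the accumulator to 0 or 1 without a data-dependent branch. The same logic as a C sketch:

#include <stddef.h>

/* Constant-time comparison: accumulate differences, then reduce the
 * accumulator to exactly 0 or 1 via the sign bit of its negation. */
static int memcmp_ct(const void *a, const void *b, size_t len)
{
	const unsigned char *pa = a, *pb = b;
	unsigned char acc = 0;

	for (size_t i = 0; i < len; i++)
		acc |= pa[i] ^ pb[i];	/* no data-dependent branches */
	return (int)((0U - acc) >> 31);	/* 0 if equal, 1 otherwise */
}
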
@@ -336,6 +348,38 @@ OPENSSL_instrument_bus:
pushl %esi
pushl %edi
movl $0,%eax
+ call .L025PIC_me_up
+.L025PIC_me_up:
+ popl %edx
+ leal OPENSSL_ia32cap_P-.L025PIC_me_up(%edx),%edx
+ btl $4,(%edx)
+ jnc .L026nogo
+ btl $19,(%edx)
+ jnc .L026nogo
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ .byte 0x0f,0x31
+ movl %eax,%esi
+ movl $0,%ebx
+ clflush (%edi)
+.byte 240
+ addl %ebx,(%edi)
+ jmp .L027loop
+.align 16
+.L027loop:
+ .byte 0x0f,0x31
+ movl %eax,%edx
+ subl %esi,%eax
+ movl %edx,%esi
+ movl %eax,%ebx
+ clflush (%edi)
+.byte 240
+ addl %eax,(%edi)
+ leal 4(%edi),%edi
+ subl $1,%ecx
+ jnz .L027loop
+ movl 24(%esp),%eax
+.L026nogo:
popl %edi
popl %esi
popl %ebx
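
The block added to OPENSSL_instrument_bus gates on the TSC and CLFLUSH capability bits (btl $4 and btl $19), then repeatedly reads the time-stamp counter (the raw 0x0f,0x31 bytes), flushes the target cache line, and performs a lock-prefixed add (the 0xf0 byte) of each delta into the output array. A sketch of the same measurement loop using compiler intrinsics, assuming GCC/Clang's x86intrin.h:

#include <x86intrin.h>

/* Time a locked read-modify-write to a flushed cache line and record
 * successive rdtsc deltas.  __rdtsc and _mm_clflush correspond to the
 * raw 0x0f,0x31 and clflush bytes; __sync_fetch_and_add supplies the
 * lock prefix. */
static void instrument_bus(unsigned int *out, int cnt)
{
	unsigned int prev = (unsigned int)__rdtsc();

	while (cnt-- > 0) {
		unsigned int now = (unsigned int)__rdtsc();
		unsigned int delta = now - prev;

		prev = now;
		_mm_clflush(out);		/* push the line out of cache */
		__sync_fetch_and_add(out, delta);
		out++;
	}
}
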
@@ -352,6 +396,51 @@ OPENSSL_instrument_bus2:
pushl %esi
pushl %edi
movl $0,%eax
+ call .L028PIC_me_up
+.L028PIC_me_up:
+ popl %edx
+ leal OPENSSL_ia32cap_P-.L028PIC_me_up(%edx),%edx
+ btl $4,(%edx)
+ jnc .L029nogo
+ btl $19,(%edx)
+ jnc .L029nogo
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ movl 28(%esp),%ebp
+ .byte 0x0f,0x31
+ movl %eax,%esi
+ movl $0,%ebx
+ clflush (%edi)
+.byte 240
+ addl %ebx,(%edi)
+ .byte 0x0f,0x31
+ movl %eax,%edx
+ subl %esi,%eax
+ movl %edx,%esi
+ movl %eax,%ebx
+ jmp .L030loop2
+.align 16
+.L030loop2:
+ clflush (%edi)
+.byte 240
+ addl %eax,(%edi)
+ subl $1,%ebp
+ jz .L031done2
+ .byte 0x0f,0x31
+ movl %eax,%edx
+ subl %esi,%eax
+ movl %edx,%esi
+ cmpl %ebx,%eax
+ movl %eax,%ebx
+ movl $0,%edx
+ setne %dl
+ subl %edx,%ecx
+ leal (%edi,%edx,4),%edi
+ jnz .L030loop2
+.L031done2:
+ movl 24(%esp),%eax
+ subl %ecx,%eax
+.L029nogo:
popl %edi
popl %esi
popl %ebx
@@ -369,33 +458,33 @@ OPENSSL_ia32_rdrand_bytes:
movl 12(%esp),%edi
movl 16(%esp),%ebx
cmpl $0,%ebx
- je .L024done
+ je .L032done
movl $8,%ecx
-.L025loop:
+.L033loop:
.byte 15,199,242
- jc .L026break
- loop .L025loop
- jmp .L024done
+ jc .L034break
+ loop .L033loop
+ jmp .L032done
.align 16
-.L026break:
+.L034break:
cmpl $4,%ebx
- jb .L027tail
+ jb .L035tail
movl %edx,(%edi)
leal 4(%edi),%edi
addl $4,%eax
subl $4,%ebx
- jz .L024done
+ jz .L032done
movl $8,%ecx
- jmp .L025loop
+ jmp .L033loop
.align 16
-.L027tail:
+.L035tail:
movb %dl,(%edi)
leal 1(%edi),%edi
incl %eax
shrl $8,%edx
decl %ebx
- jnz .L027tail
-.L024done:
+ jnz .L035tail
+.L032done:
xorl %edx,%edx
popl %ebx
popl %edi
@@ -412,33 +501,33 @@ OPENSSL_ia32_rdseed_bytes:
movl 12(%esp),%edi
movl 16(%esp),%ebx
cmpl $0,%ebx
- je .L028done
+ je .L036done
movl $8,%ecx
-.L029loop:
+.L037loop:
.byte 15,199,250
- jc .L030break
- loop .L029loop
- jmp .L028done
+ jc .L038break
+ loop .L037loop
+ jmp .L036done
.align 16
-.L030break:
+.L038break:
cmpl $4,%ebx
- jb .L031tail
+ jb .L039tail
movl %edx,(%edi)
leal 4(%edi),%edi
addl $4,%eax
subl $4,%ebx
- jz .L028done
+ jz .L036done
movl $8,%ecx
- jmp .L029loop
+ jmp .L037loop
.align 16
-.L031tail:
+.L039tail:
movb %dl,(%edi)
leal 1(%edi),%edi
incl %eax
shrl $8,%edx
decl %ebx
- jnz .L031tail
-.L028done:
+ jnz .L039tail
+.L036done:
xorl %edx,%edx
popl %ebx
popl %edi
@@ -676,6 +765,18 @@ OPENSSL_wipe_cpu:
movl (%ecx),%ecx
btl $1,(%ecx)
jnc .L013no_x87
+ andl $83886080,%ecx
+ cmpl $83886080,%ecx
+ jne .L014no_sse2
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+.L014no_sse2:
.long 4007259865,4007259865,4007259865,4007259865,2430851995
.L013no_x87:
leal 4(%esp),%eax
@@ -691,11 +792,11 @@ OPENSSL_atomic_add:
pushl %ebx
nop
movl (%edx),%eax
-.L014spin:
+.L015spin:
leal (%eax,%ecx,1),%ebx
nop
.long 447811568
- jne .L014spin
+ jne .L015spin
movl %ebx,%eax
popl %ebx
ret
@@ -709,32 +810,32 @@ OPENSSL_cleanse:
movl 8(%esp),%ecx
xorl %eax,%eax
cmpl $7,%ecx
- jae .L015lot
+ jae .L016lot
cmpl $0,%ecx
- je .L016ret
-.L017little:
+ je .L017ret
+.L018little:
movb %al,(%edx)
subl $1,%ecx
leal 1(%edx),%edx
- jnz .L017little
-.L016ret:
+ jnz .L018little
+.L017ret:
ret
.align 16
-.L015lot:
+.L016lot:
testl $3,%edx
- jz .L018aligned
+ jz .L019aligned
movb %al,(%edx)
leal -1(%ecx),%ecx
leal 1(%edx),%edx
- jmp .L015lot
-.L018aligned:
+ jmp .L016lot
+.L019aligned:
movl %eax,(%edx)
leal -4(%ecx),%ecx
testl $-4,%ecx
leal 4(%edx),%edx
- jnz .L018aligned
+ jnz .L019aligned
cmpl $0,%ecx
- jne .L017little
+ jne .L018little
ret
.size OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
.globl CRYPTO_memcmp
@@ -750,18 +851,18 @@ CRYPTO_memcmp:
xorl %eax,%eax
xorl %edx,%edx
cmpl $0,%ecx
- je .L019no_data
-.L020loop:
+ je .L020no_data
+.L021loop:
movb (%esi),%dl
leal 1(%esi),%esi
xorb (%edi),%dl
leal 1(%edi),%edi
orb %dl,%al
decl %ecx
- jnz .L020loop
+ jnz .L021loop
negl %eax
shrl $31,%eax
-.L019no_data:
+.L020no_data:
popl %edi
popl %esi
ret
@@ -776,6 +877,35 @@ OPENSSL_instrument_bus:
pushl %esi
pushl %edi
movl $0,%eax
+ leal OPENSSL_ia32cap_P,%edx
+ btl $4,(%edx)
+ jnc .L022nogo
+ btl $19,(%edx)
+ jnc .L022nogo
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ .byte 0x0f,0x31
+ movl %eax,%esi
+ movl $0,%ebx
+ clflush (%edi)
+.byte 240
+ addl %ebx,(%edi)
+ jmp .L023loop
+.align 16
+.L023loop:
+ .byte 0x0f,0x31
+ movl %eax,%edx
+ subl %esi,%eax
+ movl %edx,%esi
+ movl %eax,%ebx
+ clflush (%edi)
+.byte 240
+ addl %eax,(%edi)
+ leal 4(%edi),%edi
+ subl $1,%ecx
+ jnz .L023loop
+ movl 24(%esp),%eax
+.L022nogo:
popl %edi
popl %esi
popl %ebx
@@ -792,6 +922,48 @@ OPENSSL_instrument_bus2:
pushl %esi
pushl %edi
movl $0,%eax
+ leal OPENSSL_ia32cap_P,%edx
+ btl $4,(%edx)
+ jnc .L024nogo
+ btl $19,(%edx)
+ jnc .L024nogo
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ movl 28(%esp),%ebp
+ .byte 0x0f,0x31
+ movl %eax,%esi
+ movl $0,%ebx
+ clflush (%edi)
+.byte 240
+ addl %ebx,(%edi)
+ .byte 0x0f,0x31
+ movl %eax,%edx
+ subl %esi,%eax
+ movl %edx,%esi
+ movl %eax,%ebx
+ jmp .L025loop2
+.align 16
+.L025loop2:
+ clflush (%edi)
+.byte 240
+ addl %eax,(%edi)
+ subl $1,%ebp
+ jz .L026done2
+ .byte 0x0f,0x31
+ movl %eax,%edx
+ subl %esi,%eax
+ movl %edx,%esi
+ cmpl %ebx,%eax
+ movl %eax,%ebx
+ movl $0,%edx
+ setne %dl
+ subl %edx,%ecx
+ leal (%edi,%edx,4),%edi
+ jnz .L025loop2
+.L026done2:
+ movl 24(%esp),%eax
+ subl %ecx,%eax
+.L024nogo:
popl %edi
popl %esi
popl %ebx
@@ -809,33 +981,33 @@ OPENSSL_ia32_rdrand_bytes:
movl 12(%esp),%edi
movl 16(%esp),%ebx
cmpl $0,%ebx
- je .L021done
+ je .L027done
movl $8,%ecx
-.L022loop:
+.L028loop:
.byte 15,199,242
- jc .L023break
- loop .L022loop
- jmp .L021done
+ jc .L029break
+ loop .L028loop
+ jmp .L027done
.align 16
-.L023break:
+.L029break:
cmpl $4,%ebx
- jb .L024tail
+ jb .L030tail
movl %edx,(%edi)
leal 4(%edi),%edi
addl $4,%eax
subl $4,%ebx
- jz .L021done
+ jz .L027done
movl $8,%ecx
- jmp .L022loop
+ jmp .L028loop
.align 16
-.L024tail:
+.L030tail:
movb %dl,(%edi)
leal 1(%edi),%edi
incl %eax
shrl $8,%edx
decl %ebx
- jnz .L024tail
-.L021done:
+ jnz .L030tail
+.L027done:
xorl %edx,%edx
popl %ebx
popl %edi
@@ -852,33 +1024,33 @@ OPENSSL_ia32_rdseed_bytes:
movl 12(%esp),%edi
movl 16(%esp),%ebx
cmpl $0,%ebx
- je .L025done
+ je .L031done
movl $8,%ecx
-.L026loop:
+.L032loop:
.byte 15,199,250
- jc .L027break
- loop .L026loop
- jmp .L025done
+ jc .L033break
+ loop .L032loop
+ jmp .L031done
.align 16
-.L027break:
+.L033break:
cmpl $4,%ebx
- jb .L028tail
+ jb .L034tail
movl %edx,(%edi)
leal 4(%edi),%edi
addl $4,%eax
subl $4,%ebx
- jz .L025done
+ jz .L031done
movl $8,%ecx
- jmp .L026loop
+ jmp .L032loop
.align 16
-.L028tail:
+.L034tail:
movb %dl,(%edi)
leal 1(%edi),%edi
incl %eax
shrl $8,%edx
decl %ebx
- jnz .L028tail
-.L025done:
+ jnz .L034tail
+.L031done:
xorl %edx,%edx
popl %ebx
popl %edi
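
The rdrand/rdseed byte fillers above changed only in label numbering; the underlying pattern is worth spelling out: retry the instruction up to 8 times per word (`.byte 15,199,242` is rdrand %edx, `.byte 15,199,250` is rdseed %edx), store full 32-bit words while at least 4 bytes remain, then shift the last word out a byte at a time. A C sketch of that loop; rdrand32 here wraps a compiler builtin and is an assumption, not OpenSSL's helper:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Needs -mrdrnd; returns 1 on success, 0 if the generator was busy,
 * mirroring the carry flag tested after `.byte 15,199,242`. */
static int rdrand32(uint32_t *out)
{
	return __builtin_ia32_rdrand32_step(out);
}

/* Fill buf with len random bytes; returns how many bytes were actually
 * produced, as the assembly returns in %eax. */
static size_t rdrand_bytes(unsigned char *buf, size_t len)
{
	size_t filled = 0;
	uint32_t w;

	while (len > 0) {
		int ok = 0;

		for (int tries = 8; tries > 0; tries--)
			if ((ok = rdrand32(&w)) != 0)
				break;
		if (!ok)
			break;		/* give up; caller sees a short count */
		if (len >= 4) {		/* whole words while they fit */
			memcpy(buf, &w, 4);
			buf += 4; filled += 4; len -= 4;
		} else {		/* tail, one byte at a time */
			for (; len > 0; len--) {
				*buf++ = (unsigned char)w;
				w >>= 8;
				filled++;
			}
		}
	}
	return filled;
}
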