diff options
Diffstat (limited to 'secure/lib/libcrypto/amd64')
-rw-r--r-- | secure/lib/libcrypto/amd64/aes-x86_64.S | 70 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S | 8 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/aesni-x86_64.S | 112 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/bsaes-x86_64.S | 158 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/cmll-x86_64.S | 2 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S | 27 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/ghash-x86_64.S | 82 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/md5-x86_64.S | 34 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/rsaz-x86_64.S | 218 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/sha1-mb-x86_64.S | 16 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/sha1-x86_64.S | 8 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/sha256-mb-x86_64.S | 84 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/sha256-x86_64.S | 44 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/vpaes-x86_64.S | 20 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/x86_64-gf2m.S | 2 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/x86_64-mont.S | 108 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/x86_64-mont5.S | 901 | ||||
-rw-r--r-- | secure/lib/libcrypto/amd64/x86_64cpuid.S | 48 |
18 files changed, 1242 insertions, 700 deletions
diff --git a/secure/lib/libcrypto/amd64/aes-x86_64.S b/secure/lib/libcrypto/amd64/aes-x86_64.S index 3243d6d4ed81..4c39bbaa9ce2 100644 --- a/secure/lib/libcrypto/amd64/aes-x86_64.S +++ b/secure/lib/libcrypto/amd64/aes-x86_64.S @@ -82,8 +82,8 @@ _x86_64_AES_encrypt: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $65280,%edi - andl $65280,%ebp + andl $0x0000ff00,%edi + andl $0x0000ff00,%ebp xorl %edi,%r10d xorl %ebp,%r11d @@ -95,8 +95,8 @@ _x86_64_AES_encrypt: movl 0(%r14,%rsi,8),%esi movl 0(%r14,%rdi,8),%edi - andl $65280,%esi - andl $65280,%edi + andl $0x0000ff00,%esi + andl $0x0000ff00,%edi shrl $16,%ebx xorl %esi,%r12d xorl %edi,%r8d @@ -109,9 +109,9 @@ _x86_64_AES_encrypt: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $16711680,%edi - andl $16711680,%ebp + andl $0x00ff0000,%esi + andl $0x00ff0000,%edi + andl $0x00ff0000,%ebp xorl %esi,%r10d xorl %edi,%r11d @@ -124,9 +124,9 @@ _x86_64_AES_encrypt: movl 2(%r14,%rdi,8),%edi movl 2(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $4278190080,%edi - andl $4278190080,%ebp + andl $0x00ff0000,%esi + andl $0xff000000,%edi + andl $0xff000000,%ebp xorl %esi,%r8d xorl %edi,%r10d @@ -139,8 +139,8 @@ _x86_64_AES_encrypt: movl 2(%r14,%rdi,8),%edi movl 16+0(%r15),%eax - andl $4278190080,%esi - andl $4278190080,%edi + andl $0xff000000,%esi + andl $0xff000000,%edi xorl %esi,%r12d xorl %edi,%r8d @@ -242,8 +242,8 @@ _x86_64_AES_encrypt_compact: xorl %r8d,%edx cmpq 16(%rsp),%r15 je .Lenc_compact_done - movl $2155905152,%r10d - movl $2155905152,%r11d + movl $0x80808080,%r10d + movl $0x80808080,%r11d andl %eax,%r10d andl %ebx,%r11d movl %r10d,%esi @@ -254,10 +254,10 @@ _x86_64_AES_encrypt_compact: leal (%rbx,%rbx,1),%r9d subl %r10d,%esi subl %r11d,%edi - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %eax,%r10d movl %ebx,%r11d xorl %esi,%r8d @@ -265,9 +265,9 @@ _x86_64_AES_encrypt_compact: xorl %r8d,%eax xorl %r9d,%ebx - movl $2155905152,%r12d + movl $0x80808080,%r12d roll $24,%eax - movl $2155905152,%ebp + movl $0x80808080,%ebp roll $24,%ebx andl %ecx,%r12d andl %edx,%ebp @@ -290,10 +290,10 @@ _x86_64_AES_encrypt_compact: xorl %r10d,%eax xorl %r11d,%ebx - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %ecx,%r12d movl %edx,%ebp xorl %esi,%r8d @@ -346,7 +346,7 @@ AES_encrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -371,7 +371,7 @@ AES_encrypt: leaq .LAES_Te+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 call _x86_64_AES_encrypt_compact @@ -793,7 +793,7 @@ AES_decrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -818,7 +818,7 @@ AES_decrypt: leaq .LAES_Td+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 shrq $3,%rbp addq %rbp,%r14 @@ -1334,9 +1334,9 @@ AES_cbc_encrypt: movq %r14,%r10 leaq 2304(%r14),%r11 movq %r15,%r12 - andq $4095,%r10 - andq $4095,%r11 - andq $4095,%r12 + andq $0xFFF,%r10 + andq $0xFFF,%r11 + andq $0xFFF,%r12 cmpq %r11,%r12 jb .Lcbc_te_break_out @@ -1345,7 +1345,7 @@ AES_cbc_encrypt: jmp .Lcbc_te_ok .Lcbc_te_break_out: subq %r10,%r12 - andq $4095,%r12 + andq $0xFFF,%r12 addq $320,%r12 subq %r12,%r15 .align 4 @@ -1371,7 +1371,7 @@ AES_cbc_encrypt: movq %r15,%r10 subq %r14,%r10 - andq $4095,%r10 + andq $0xfff,%r10 cmpq $2304,%r10 jb .Lcbc_do_ecopy cmpq $4096-248,%r10 @@ -1558,7 +1558,7 @@ AES_cbc_encrypt: leaq -88-63(%rcx),%r10 subq %rbp,%r10 negq %r10 - andq $960,%r10 + andq $0x3c0,%r10 subq %r10,%rbp xchgq %rsp,%rbp @@ -1587,7 +1587,7 @@ AES_cbc_encrypt: leaq 2048(%r14),%r14 leaq 768-8(%rsp),%rax subq %r14,%rax - andq $768,%rax + andq $0x300,%rax leaq (%r14,%rax,1),%r14 cmpq $0,%rbx diff --git a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S index fa164341ce8d..45a5e3b74f6d 100644 --- a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S @@ -1393,8 +1393,8 @@ aesni_cbc_sha1_enc_shaext: movups 16(%rcx),%xmm0 leaq 112(%rcx),%rcx - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 jmp .Loop_shaext .align 16 @@ -1673,8 +1673,8 @@ aesni_cbc_sha1_enc_shaext: leaq 64(%rdi),%rdi jnz .Loop_shaext - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 movups %xmm2,(%r8) movdqu %xmm8,(%r9) movd %xmm9,16(%r9) diff --git a/secure/lib/libcrypto/amd64/aesni-x86_64.S b/secure/lib/libcrypto/amd64/aesni-x86_64.S index 082a3060c221..5be861663de5 100644 --- a/secure/lib/libcrypto/amd64/aesni-x86_64.S +++ b/secure/lib/libcrypto/amd64/aesni-x86_64.S @@ -504,7 +504,7 @@ aesni_ecb_encrypt: testl %r8d,%r8d jz .Lecb_decrypt - cmpq $128,%rdx + cmpq $0x80,%rdx jb .Lecb_enc_tail movdqu (%rdi),%xmm2 @@ -516,7 +516,7 @@ aesni_ecb_encrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp .Lecb_enc_loop8_enter .align 16 .Lecb_enc_loop8: @@ -544,7 +544,7 @@ aesni_ecb_encrypt: call _aesni_encrypt8 - subq $128,%rdx + subq $0x80,%rdx jnc .Lecb_enc_loop8 movups %xmm2,(%rsi) @@ -558,22 +558,22 @@ aesni_ecb_encrypt: movups %xmm8,96(%rsi) movups %xmm9,112(%rsi) leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz .Lecb_ret .Lecb_enc_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lecb_enc_one movups 16(%rdi),%xmm3 je .Lecb_enc_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lecb_enc_three movups 48(%rdi),%xmm5 je .Lecb_enc_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb .Lecb_enc_five movups 80(%rdi),%xmm7 je .Lecb_enc_six @@ -647,7 +647,7 @@ aesni_ecb_encrypt: .align 16 .Lecb_decrypt: - cmpq $128,%rdx + cmpq $0x80,%rdx jb .Lecb_dec_tail movdqu (%rdi),%xmm2 @@ -659,7 +659,7 @@ aesni_ecb_encrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp .Lecb_dec_loop8_enter .align 16 .Lecb_dec_loop8: @@ -688,7 +688,7 @@ aesni_ecb_encrypt: call _aesni_decrypt8 movups (%r11),%xmm0 - subq $128,%rdx + subq $0x80,%rdx jnc .Lecb_dec_loop8 movups %xmm2,(%rsi) @@ -710,22 +710,22 @@ aesni_ecb_encrypt: movups %xmm9,112(%rsi) pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz .Lecb_ret .Lecb_dec_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lecb_dec_one movups 16(%rdi),%xmm3 je .Lecb_dec_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lecb_dec_three movups 48(%rdi),%xmm5 je .Lecb_dec_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb .Lecb_dec_five movups 80(%rdi),%xmm7 je .Lecb_dec_six @@ -1599,7 +1599,7 @@ aesni_xts_encrypt: movdqa .Lxts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -1698,7 +1698,7 @@ aesni_xts_encrypt: .byte 102,15,56,220,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_enc_loop6 .align 32 .Lxts_enc_loop6: @@ -1837,13 +1837,13 @@ aesni_xts_encrypt: jz .Lxts_enc_done pxor %xmm0,%xmm11 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lxts_enc_one pxor %xmm0,%xmm12 je .Lxts_enc_two pxor %xmm0,%xmm13 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lxts_enc_three pxor %xmm0,%xmm14 je .Lxts_enc_four @@ -2070,7 +2070,7 @@ aesni_xts_decrypt: movdqa .Lxts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -2169,7 +2169,7 @@ aesni_xts_decrypt: .byte 102,15,56,222,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_dec_loop6 .align 32 .Lxts_dec_loop6: @@ -2309,13 +2309,13 @@ aesni_xts_decrypt: jz .Lxts_dec_done pxor %xmm0,%xmm12 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lxts_dec_one pxor %xmm0,%xmm13 je .Lxts_dec_two pxor %xmm0,%xmm14 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lxts_dec_three je .Lxts_dec_four @@ -2346,7 +2346,7 @@ aesni_xts_decrypt: pcmpgtd %xmm15,%xmm14 movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - pshufd $19,%xmm14,%xmm11 + pshufd $0x13,%xmm14,%xmm11 andq $15,%r9 jz .Lxts_dec_ret @@ -2635,7 +2635,7 @@ aesni_cbc_encrypt: leaq -8(%rax),%rbp movups (%r8),%xmm10 movl %r10d,%eax - cmpq $80,%rdx + cmpq $0x50,%rdx jbe .Lcbc_dec_tail movups (%rcx),%xmm0 @@ -2651,14 +2651,14 @@ aesni_cbc_encrypt: movdqu 80(%rdi),%xmm7 movdqa %xmm6,%xmm15 movl OPENSSL_ia32cap_P+4(%rip),%r9d - cmpq $112,%rdx + cmpq $0x70,%rdx jbe .Lcbc_dec_six_or_seven andl $71303168,%r9d - subq $80,%rdx + subq $0x50,%rdx cmpl $4194304,%r9d je .Lcbc_dec_loop6_enter - subq $32,%rdx + subq $0x20,%rdx leaq 112(%rcx),%rcx jmp .Lcbc_dec_loop8_enter .align 16 @@ -2673,7 +2673,7 @@ aesni_cbc_encrypt: movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 xorq %r11,%r11 - cmpq $112,%rdx + cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 pxor %xmm0,%xmm7 @@ -2858,21 +2858,21 @@ aesni_cbc_encrypt: movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi - subq $128,%rdx + subq $0x80,%rdx ja .Lcbc_dec_loop8 movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx - addq $112,%rdx + addq $0x70,%rdx jle .Lcbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi - cmpq $80,%rdx + cmpq $0x50,%rdx jbe .Lcbc_dec_tail movaps %xmm11,%xmm2 .Lcbc_dec_six_or_seven: - cmpq $96,%rdx + cmpq $0x60,%rdx ja .Lcbc_dec_seven movaps %xmm7,%xmm8 @@ -2965,33 +2965,33 @@ aesni_cbc_encrypt: movl %r10d,%eax movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - subq $96,%rdx + subq $0x60,%rdx ja .Lcbc_dec_loop6 movdqa %xmm7,%xmm2 - addq $80,%rdx + addq $0x50,%rdx jle .Lcbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi .Lcbc_dec_tail: movups (%rdi),%xmm2 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_one movups 16(%rdi),%xmm3 movaps %xmm2,%xmm11 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_two movups 32(%rdi),%xmm4 movaps %xmm3,%xmm12 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_three movups 48(%rdi),%xmm5 movaps %xmm4,%xmm13 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_four movups 64(%rdi),%xmm6 @@ -3016,7 +3016,7 @@ aesni_cbc_encrypt: movdqa %xmm6,%xmm2 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 - subq $16,%rdx + subq $0x10,%rdx jmp .Lcbc_dec_tail_collected .align 16 @@ -3333,7 +3333,7 @@ __aesni_set_encrypt_key: pslldq $4,%xmm0 pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 + pshufd $0xff,%xmm0,%xmm3 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 @@ -3420,7 +3420,7 @@ __aesni_set_encrypt_key: decl %r10d jz .Ldone_key256 - pshufd $255,%xmm0,%xmm2 + pshufd $0xff,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -3463,11 +3463,11 @@ __aesni_set_encrypt_key: movups %xmm0,(%rax) leaq 16(%rax),%rax .Lkey_expansion_128_cold: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 + shufps $0b11111111,%xmm1,%xmm1 xorps %xmm1,%xmm0 .byte 0xf3,0xc3 @@ -3478,25 +3478,25 @@ __aesni_set_encrypt_key: .Lkey_expansion_192a_cold: movaps %xmm2,%xmm5 .Lkey_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 pslldq $4,%xmm3 xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 + pshufd $0b01010101,%xmm1,%xmm1 pxor %xmm3,%xmm2 pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 + pshufd $0b11111111,%xmm0,%xmm3 pxor %xmm3,%xmm2 .byte 0xf3,0xc3 .align 16 .Lkey_expansion_192b: movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 + shufps $0b01000100,%xmm0,%xmm5 movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 + shufps $0b01001110,%xmm2,%xmm3 movups %xmm3,16(%rax) leaq 32(%rax),%rax jmp .Lkey_expansion_192b_warm @@ -3506,11 +3506,11 @@ __aesni_set_encrypt_key: movups %xmm2,(%rax) leaq 16(%rax),%rax .Lkey_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 + shufps $0b11111111,%xmm1,%xmm1 xorps %xmm1,%xmm0 .byte 0xf3,0xc3 @@ -3519,11 +3519,11 @@ __aesni_set_encrypt_key: movups %xmm0,(%rax) leaq 16(%rax),%rax - shufps $16,%xmm2,%xmm4 + shufps $0b00010000,%xmm2,%xmm4 xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 + shufps $0b10001100,%xmm2,%xmm4 xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 + shufps $0b10101010,%xmm1,%xmm1 xorps %xmm1,%xmm2 .byte 0xf3,0xc3 .size aesni_set_encrypt_key,.-aesni_set_encrypt_key diff --git a/secure/lib/libcrypto/amd64/bsaes-x86_64.S b/secure/lib/libcrypto/amd64/bsaes-x86_64.S index be410de2f1d3..d39a81cef10d 100644 --- a/secure/lib/libcrypto/amd64/bsaes-x86_64.S +++ b/secure/lib/libcrypto/amd64/bsaes-x86_64.S @@ -325,45 +325,45 @@ _bsaes_encrypt8_bitslice: pxor %xmm2,%xmm5 decl %r10d jl .Lenc_done - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm3,%xmm9 + pshufd $0x93,%xmm3,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm5,%xmm10 + pshufd $0x93,%xmm5,%xmm10 pxor %xmm9,%xmm3 - pshufd $147,%xmm2,%xmm11 + pshufd $0x93,%xmm2,%xmm11 pxor %xmm10,%xmm5 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm2 - pshufd $147,%xmm1,%xmm13 + pshufd $0x93,%xmm1,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm1 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm2,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm5,%xmm11 - pshufd $78,%xmm2,%xmm7 + pshufd $0x4E,%xmm2,%xmm7 pxor %xmm1,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm3,%xmm10 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm1,%xmm5 + pshufd $0x4E,%xmm1,%xmm5 pxor %xmm11,%xmm7 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm12,%xmm8 pxor %xmm10,%xmm2 pxor %xmm14,%xmm6 @@ -797,24 +797,24 @@ _bsaes_decrypt8: decl %r10d jl .Ldec_done - pshufd $78,%xmm15,%xmm7 - pshufd $78,%xmm2,%xmm13 + pshufd $0x4E,%xmm15,%xmm7 + pshufd $0x4E,%xmm2,%xmm13 pxor %xmm15,%xmm7 - pshufd $78,%xmm4,%xmm14 + pshufd $0x4E,%xmm4,%xmm14 pxor %xmm2,%xmm13 - pshufd $78,%xmm0,%xmm8 + pshufd $0x4E,%xmm0,%xmm8 pxor %xmm4,%xmm14 - pshufd $78,%xmm5,%xmm9 + pshufd $0x4E,%xmm5,%xmm9 pxor %xmm0,%xmm8 - pshufd $78,%xmm3,%xmm10 + pshufd $0x4E,%xmm3,%xmm10 pxor %xmm5,%xmm9 pxor %xmm13,%xmm15 pxor %xmm13,%xmm0 - pshufd $78,%xmm1,%xmm11 + pshufd $0x4E,%xmm1,%xmm11 pxor %xmm3,%xmm10 pxor %xmm7,%xmm5 pxor %xmm8,%xmm3 - pshufd $78,%xmm6,%xmm12 + pshufd $0x4E,%xmm6,%xmm12 pxor %xmm1,%xmm11 pxor %xmm14,%xmm0 pxor %xmm9,%xmm1 @@ -828,45 +828,45 @@ _bsaes_decrypt8: pxor %xmm14,%xmm1 pxor %xmm14,%xmm6 pxor %xmm12,%xmm4 - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm5,%xmm9 + pshufd $0x93,%xmm5,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm3,%xmm10 + pshufd $0x93,%xmm3,%xmm10 pxor %xmm9,%xmm5 - pshufd $147,%xmm1,%xmm11 + pshufd $0x93,%xmm1,%xmm11 pxor %xmm10,%xmm3 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm1 - pshufd $147,%xmm2,%xmm13 + pshufd $0x93,%xmm2,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm2 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm1,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm3,%xmm11 - pshufd $78,%xmm1,%xmm7 + pshufd $0x4E,%xmm1,%xmm7 pxor %xmm2,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm5,%xmm10 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm2,%xmm3 + pshufd $0x4E,%xmm2,%xmm3 pxor %xmm11,%xmm7 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm12,%xmm8 pxor %xmm1,%xmm10 pxor %xmm14,%xmm6 @@ -1553,20 +1553,20 @@ bsaes_xts_encrypt: movdqa %xmm7,(%rax) andq $-16,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc .Lxts_enc_short jmp .Lxts_enc_loop .align 16 .Lxts_enc_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1574,7 +1574,7 @@ bsaes_xts_encrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1583,7 +1583,7 @@ bsaes_xts_encrypt: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1593,7 +1593,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1603,7 +1603,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1613,7 +1613,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1623,7 +1623,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -1667,20 +1667,20 @@ bsaes_xts_encrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc .Lxts_enc_loop .Lxts_enc_short: - addq $128,%r14 + addq $0x80,%r14 jz .Lxts_enc_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1688,7 +1688,7 @@ bsaes_xts_encrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1699,7 +1699,7 @@ bsaes_xts_encrypt: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je .Lxts_enc_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1711,7 +1711,7 @@ bsaes_xts_encrypt: cmpq $32,%r14 je .Lxts_enc_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1723,7 +1723,7 @@ bsaes_xts_encrypt: cmpq $48,%r14 je .Lxts_enc_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1735,7 +1735,7 @@ bsaes_xts_encrypt: cmpq $64,%r14 je .Lxts_enc_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1747,7 +1747,7 @@ bsaes_xts_encrypt: cmpq $80,%r14 je .Lxts_enc_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2012,20 +2012,20 @@ bsaes_xts_decrypt: shlq $4,%rax subq %rax,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc .Lxts_dec_short jmp .Lxts_dec_loop .align 16 .Lxts_dec_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2033,7 +2033,7 @@ bsaes_xts_decrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2042,7 +2042,7 @@ bsaes_xts_decrypt: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2052,7 +2052,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2062,7 +2062,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2072,7 +2072,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2082,7 +2082,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2126,20 +2126,20 @@ bsaes_xts_decrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc .Lxts_dec_loop .Lxts_dec_short: - addq $128,%r14 + addq $0x80,%r14 jz .Lxts_dec_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2147,7 +2147,7 @@ bsaes_xts_decrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2158,7 +2158,7 @@ bsaes_xts_decrypt: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je .Lxts_dec_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2170,7 +2170,7 @@ bsaes_xts_decrypt: cmpq $32,%r14 je .Lxts_dec_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2182,7 +2182,7 @@ bsaes_xts_decrypt: cmpq $48,%r14 je .Lxts_dec_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2194,7 +2194,7 @@ bsaes_xts_decrypt: cmpq $64,%r14 je .Lxts_dec_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2206,7 +2206,7 @@ bsaes_xts_decrypt: cmpq $80,%r14 je .Lxts_dec_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2383,7 +2383,7 @@ bsaes_xts_decrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 movdqa %xmm6,%xmm5 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 diff --git a/secure/lib/libcrypto/amd64/cmll-x86_64.S b/secure/lib/libcrypto/amd64/cmll-x86_64.S index ecd33f135fda..81d244573e96 100644 --- a/secure/lib/libcrypto/amd64/cmll-x86_64.S +++ b/secure/lib/libcrypto/amd64/cmll-x86_64.S @@ -1625,7 +1625,7 @@ Camellia_cbc_encrypt: leaq -64-63(%rcx),%r10 subq %rsp,%r10 negq %r10 - andq $960,%r10 + andq $0x3C0,%r10 subq %r10,%rsp diff --git a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S index c5875d7fc0bc..5d7d5e2be520 100644 --- a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S +++ b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S @@ -1122,6 +1122,7 @@ ecp_nistz256_point_double: pushq %r15 subq $160+8,%rsp +.Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 @@ -1342,7 +1343,7 @@ ecp_nistz256_point_add: por %xmm1,%xmm3 movdqu 0(%rsi),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 @@ -1352,7 +1353,7 @@ ecp_nistz256_point_add: movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -1372,10 +1373,10 @@ ecp_nistz256_point_add: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 @@ -1384,6 +1385,7 @@ ecp_nistz256_point_add: movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi @@ -1475,7 +1477,7 @@ ecp_nistz256_point_add: testq %r8,%r8 jnz .Ladd_proceedq testq %r9,%r9 - jz .Ladd_proceedq + jz .Ladd_doubleq .byte 102,72,15,126,199 pxor %xmm0,%xmm0 @@ -1488,6 +1490,13 @@ ecp_nistz256_point_add: jmp .Ladd_doneq .align 32 +.Ladd_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + jmp .Lpoint_double_shortcutq + +.align 32 .Ladd_proceedq: movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 @@ -1734,13 +1743,13 @@ ecp_nistz256_point_add_affine: por %xmm1,%xmm3 movdqu 0(%rbx),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -1756,13 +1765,13 @@ ecp_nistz256_point_add_affine: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 diff --git a/secure/lib/libcrypto/amd64/ghash-x86_64.S b/secure/lib/libcrypto/amd64/ghash-x86_64.S index aa93c8023cc4..ef024bfca1e4 100644 --- a/secure/lib/libcrypto/amd64/ghash-x86_64.S +++ b/secure/lib/libcrypto/amd64/ghash-x86_64.S @@ -21,14 +21,14 @@ gcm_gmult_4bit: movq $14,%rcx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl movq %r8,%rdx jmp .Loop1 .align 16 .Loop1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 movb (%rdi,%rcx,1),%al shrq $4,%r9 @@ -44,13 +44,13 @@ gcm_gmult_4bit: js .Lbreak1 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 @@ -59,19 +59,19 @@ gcm_gmult_4bit: .align 16 .Lbreak1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 @@ -662,10 +662,10 @@ gcm_ghash_4bit: gcm_init_clmul: .L_init_clmul: movdqu (%rsi),%xmm2 - pshufd $78,%xmm2,%xmm2 + pshufd $0b01001110,%xmm2,%xmm2 - pshufd $255,%xmm2,%xmm4 + pshufd $0b11111111,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 @@ -679,11 +679,11 @@ gcm_init_clmul: pxor %xmm5,%xmm2 - pshufd $78,%xmm2,%xmm6 + pshufd $0b01001110,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -719,8 +719,8 @@ gcm_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm2,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,0(%rdi) pxor %xmm0,%xmm4 @@ -728,7 +728,7 @@ gcm_init_clmul: .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -766,7 +766,7 @@ gcm_init_clmul: pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -802,8 +802,8 @@ gcm_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm5,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm5,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm5,%xmm3 movdqu %xmm5,48(%rdi) pxor %xmm0,%xmm4 @@ -823,7 +823,7 @@ gcm_gmult_clmul: movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -875,20 +875,20 @@ gcm_ghash_clmul: movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 - subq $16,%rcx + subq $0x10,%rcx jz .Lodd_tail movdqu 16(%rsi),%xmm6 movl OPENSSL_ia32cap_P+4(%rip),%eax - cmpq $48,%rcx + cmpq $0x30,%rcx jb .Lskip4x andl $71303168,%eax cmpl $4194304,%eax je .Lskip4x - subq $48,%rcx - movq $11547335547999543296,%rax + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 @@ -900,14 +900,14 @@ gcm_ghash_clmul: .byte 102,65,15,56,0,218 .byte 102,69,15,56,0,218 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 @@ -922,12 +922,12 @@ gcm_ghash_clmul: .byte 102,69,15,56,0,218 .byte 102,69,15,56,0,194 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm8,%xmm0 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 .byte 102,68,15,58,68,231,0 @@ -935,7 +935,7 @@ gcm_ghash_clmul: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jc .Ltail4x jmp .Lmod4_loop @@ -950,14 +950,14 @@ gcm_ghash_clmul: movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 .byte 102,68,15,58,68,199,16 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 @@ -1001,7 +1001,7 @@ gcm_ghash_clmul: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 @@ -1011,14 +1011,14 @@ gcm_ghash_clmul: movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jnc .Lmod4_loop .Ltail4x: @@ -1062,10 +1062,10 @@ gcm_ghash_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - addq $64,%rcx + addq $0x40,%rcx jz .Ldone movdqu 32(%rsi),%xmm7 - subq $16,%rcx + subq $0x10,%rcx jz .Lodd_tail .Lskip4x: @@ -1080,7 +1080,7 @@ gcm_ghash_clmul: pxor %xmm8,%xmm0 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 @@ -1088,7 +1088,7 @@ gcm_ghash_clmul: leaq 32(%rdx),%rdx nop - subq $32,%rcx + subq $0x20,%rcx jbe .Leven_tail nop jmp .Lmod_loop @@ -1097,7 +1097,7 @@ gcm_ghash_clmul: .Lmod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1135,7 +1135,7 @@ gcm_ghash_clmul: pslldq $8,%xmm0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 - pshufd $78,%xmm5,%xmm4 + pshufd $0b01001110,%xmm5,%xmm4 pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 @@ -1151,13 +1151,13 @@ gcm_ghash_clmul: .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 - subq $32,%rcx + subq $0x20,%rcx ja .Lmod_loop .Leven_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1205,7 +1205,7 @@ gcm_ghash_clmul: .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 diff --git a/secure/lib/libcrypto/amd64/md5-x86_64.S b/secure/lib/libcrypto/amd64/md5-x86_64.S index 94fb761b5ac0..8f6e6899aaea 100644 --- a/secure/lib/libcrypto/amd64/md5-x86_64.S +++ b/secure/lib/libcrypto/amd64/md5-x86_64.S @@ -494,14 +494,14 @@ md5_block_asm_data_order: movl %ecx,%r11d addl %ecx,%ebx movl 0(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d xorl %edx,%r11d leal -198630844(%rax,%r10,1),%eax orl %ebx,%r11d xorl %ecx,%r11d addl %r11d,%eax movl 28(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -510,7 +510,7 @@ md5_block_asm_data_order: xorl %ebx,%r11d addl %r11d,%edx movl 56(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -519,7 +519,7 @@ md5_block_asm_data_order: xorl %eax,%r11d addl %r11d,%ecx movl 20(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -528,7 +528,7 @@ md5_block_asm_data_order: xorl %edx,%r11d addl %r11d,%ebx movl 48(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -537,7 +537,7 @@ md5_block_asm_data_order: xorl %ecx,%r11d addl %r11d,%eax movl 12(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -546,7 +546,7 @@ md5_block_asm_data_order: xorl %ebx,%r11d addl %r11d,%edx movl 40(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -555,7 +555,7 @@ md5_block_asm_data_order: xorl %eax,%r11d addl %r11d,%ecx movl 4(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -564,7 +564,7 @@ md5_block_asm_data_order: xorl %edx,%r11d addl %r11d,%ebx movl 32(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -573,7 +573,7 @@ md5_block_asm_data_order: xorl %ecx,%r11d addl %r11d,%eax movl 60(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -582,7 +582,7 @@ md5_block_asm_data_order: xorl %ebx,%r11d addl %r11d,%edx movl 24(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -591,7 +591,7 @@ md5_block_asm_data_order: xorl %eax,%r11d addl %r11d,%ecx movl 52(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -600,7 +600,7 @@ md5_block_asm_data_order: xorl %edx,%r11d addl %r11d,%ebx movl 16(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -609,7 +609,7 @@ md5_block_asm_data_order: xorl %ecx,%r11d addl %r11d,%eax movl 44(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -618,7 +618,7 @@ md5_block_asm_data_order: xorl %ebx,%r11d addl %r11d,%edx movl 8(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -627,7 +627,7 @@ md5_block_asm_data_order: xorl %eax,%r11d addl %r11d,%ecx movl 36(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -636,7 +636,7 @@ md5_block_asm_data_order: xorl %edx,%r11d addl %r11d,%ebx movl 0(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S index efd229a455e4..e2b031343329 100644 --- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S +++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S @@ -462,48 +462,94 @@ rsaz_512_mul_gather4: pushq %r14 pushq %r15 - movl %r9d,%r9d - subq $128+24,%rsp + subq $152,%rsp .Lmul_gather4_body: - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - movl (%rdx,%r9,4),%ebx -.byte 102,72,15,110,201 + movd %r9d,%xmm8 + movdqa .Linc+16(%rip),%xmm1 + movdqa .Linc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 + + movdqa 0(%rdx),%xmm8 + movdqa 16(%rdx),%xmm9 + movdqa 32(%rdx),%xmm10 + movdqa 48(%rdx),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rdx),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rdx),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rdx),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rdx),%xmm15 + leaq 128(%rdx),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) - shlq $32,%rax - orq %rax,%rbx movq (%rsi),%rax movq 8(%rsi),%rcx - leaq 128(%rdx,%r9,4),%rbp mulq %rbx movq %rax,(%rsp) movq %rcx,%rax movq %rdx,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 @@ -516,14 +562,12 @@ rsaz_512_mul_gather4: adcq $0,%r13 mulq %rbx - leaq 128(%rbp),%rbp addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r14 movq (%rsi),%rax movq %rdx,%r15 @@ -535,6 +579,35 @@ rsaz_512_mul_gather4: .align 32 .Loop_mul_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax @@ -543,7 +616,6 @@ rsaz_512_mul_gather4: adcq $0,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx @@ -552,7 +624,6 @@ rsaz_512_mul_gather4: adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx @@ -561,7 +632,6 @@ rsaz_512_mul_gather4: adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -570,7 +640,6 @@ rsaz_512_mul_gather4: adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -595,7 +664,6 @@ rsaz_512_mul_gather4: adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx @@ -603,7 +671,6 @@ rsaz_512_mul_gather4: movq %rdx,%r15 adcq $0,%r15 - leaq 128(%rbp),%rbp leaq 8(%rdi),%rdi decl %ecx @@ -618,8 +685,8 @@ rsaz_512_mul_gather4: movq %r14,48(%rdi) movq %r15,56(%rdi) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -668,7 +735,7 @@ rsaz_512_mul_scatter4: movl %r9d,%r9d subq $128+24,%rsp .Lmul_scatter4_body: - leaq (%r8,%r9,4),%r8 + leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 .byte 102,72,15,110,202 .byte 102,73,15,110,208 @@ -704,30 +771,14 @@ rsaz_512_mul_scatter4: call __rsaz_512_subtract - movl %r8d,0(%rsi) - shrq $32,%r8 - movl %r9d,128(%rsi) - shrq $32,%r9 - movl %r10d,256(%rsi) - shrq $32,%r10 - movl %r11d,384(%rsi) - shrq $32,%r11 - movl %r12d,512(%rsi) - shrq $32,%r12 - movl %r13d,640(%rsi) - shrq $32,%r13 - movl %r14d,768(%rsi) - shrq $32,%r14 - movl %r15d,896(%rsi) - shrq $32,%r15 - movl %r8d,64(%rsi) - movl %r9d,192(%rsi) - movl %r10d,320(%rsi) - movl %r11d,448(%rsi) - movl %r12d,576(%rsi) - movl %r13d,704(%rsi) - movl %r14d,832(%rsi) - movl %r15d,960(%rsi) + movq %r8,0(%rsi) + movq %r9,128(%rsi) + movq %r10,256(%rsi) + movq %r11,384(%rsi) + movq %r12,512(%rsi) + movq %r13,640(%rsi) + movq %r14,768(%rsi) + movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -1080,16 +1131,14 @@ __rsaz_512_mul: .type rsaz_512_scatter4,@function .align 16 rsaz_512_scatter4: - leaq (%rdi,%rdx,4),%rdi + leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq (%rsi),%rax leaq 8(%rsi),%rsi - movl %eax,(%rdi) - shrq $32,%rax - movl %eax,64(%rdi) + movq %rax,(%rdi) leaq 128(%rdi),%rdi decl %r9d jnz .Loop_scatter @@ -1100,19 +1149,72 @@ rsaz_512_scatter4: .type rsaz_512_gather4,@function .align 16 rsaz_512_gather4: - leaq (%rsi,%rdx,4),%rsi + movd %edx,%xmm8 + movdqa .Linc+16(%rip),%xmm1 + movdqa .Linc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 movl $8,%r9d jmp .Loop_gather .align 16 .Loop_gather: - movl (%rsi),%eax - movl 64(%rsi),%r8d + movdqa 0(%rsi),%xmm8 + movdqa 16(%rsi),%xmm9 + movdqa 32(%rsi),%xmm10 + movdqa 48(%rsi),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rsi),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rsi),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rsi),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rsi),%xmm15 leaq 128(%rsi),%rsi - shlq $32,%r8 - orq %r8,%rax - movq %rax,(%rdi) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,(%rdi) leaq 8(%rdi),%rdi decl %r9d jnz .Loop_gather .byte 0xf3,0xc3 +.LSEH_end_rsaz_512_gather4: .size rsaz_512_gather4,.-rsaz_512_gather4 + +.align 64 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 diff --git a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S index 6c7cd2f7c634..6a797616f277 100644 --- a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S @@ -2600,10 +2600,10 @@ _shaext_shortcut: punpcklqdq %xmm5,%xmm0 punpckhqdq %xmm5,%xmm8 - pshufd $63,%xmm7,%xmm1 - pshufd $127,%xmm7,%xmm9 - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00111111,%xmm7,%xmm1 + pshufd $0b01111111,%xmm7,%xmm9 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 jmp .Loop_shaext .align 32 @@ -2858,8 +2858,8 @@ _shaext_shortcut: .byte 69,15,58,204,193,3 .byte 69,15,56,200,214 - pshufd $0,%xmm6,%xmm11 - pshufd $85,%xmm6,%xmm12 + pshufd $0x00,%xmm6,%xmm11 + pshufd $0x55,%xmm6,%xmm12 movdqa %xmm6,%xmm7 pcmpgtd %xmm4,%xmm11 pcmpgtd %xmm4,%xmm12 @@ -2889,8 +2889,8 @@ _shaext_shortcut: movl 280(%rsp),%edx - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 movdqa %xmm0,%xmm6 punpckldq %xmm8,%xmm0 diff --git a/secure/lib/libcrypto/amd64/sha1-x86_64.S b/secure/lib/libcrypto/amd64/sha1-x86_64.S index 25c27e59d0ef..74c9432b9f36 100644 --- a/secure/lib/libcrypto/amd64/sha1-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha1-x86_64.S @@ -1241,9 +1241,9 @@ _shaext_shortcut: movdqa K_XX_XX+160(%rip),%xmm3 movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 + pshufd $0b00011011,%xmm0,%xmm0 movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm1,%xmm1 movdqu 32(%rsi),%xmm6 .byte 102,15,56,0,227 movdqu 48(%rsi),%xmm7 @@ -1393,8 +1393,8 @@ _shaext_shortcut: jnz .Loop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm1,%xmm1 movdqu %xmm0,(%rdi) movd %xmm1,16(%rdi) .byte 0xf3,0xc3 diff --git a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S index 893d42a09a7e..b14c7967cee3 100644 --- a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S @@ -2678,10 +2678,10 @@ _shaext_shortcut: punpckhqdq %xmm8,%xmm14 punpckhqdq %xmm10,%xmm15 - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 jmp .Loop_shaext .align 32 @@ -2713,11 +2713,11 @@ _shaext_shortcut: movdqa %xmm2,%xmm0 movdqa %xmm15,112(%rsp) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pxor %xmm12,%xmm4 movdqa %xmm12,64(%rsp) .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pxor %xmm14,%xmm8 movdqa %xmm14,96(%rsp) movdqa 16-128(%rbp),%xmm1 @@ -2735,11 +2735,11 @@ _shaext_shortcut: .byte 102,68,15,56,0,211 prefetcht0 127(%r9) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,68,15,56,0,219 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 32-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2752,14 +2752,14 @@ _shaext_shortcut: movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm3 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,15,58,15,222,4 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 48-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2776,13 +2776,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 64-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2798,13 +2798,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 80-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2820,13 +2820,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 96-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2842,13 +2842,13 @@ _shaext_shortcut: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 112-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2864,13 +2864,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 128-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2886,13 +2886,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 144-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2908,13 +2908,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 160-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2930,13 +2930,13 @@ _shaext_shortcut: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 176-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2952,13 +2952,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 192-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2974,13 +2974,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 208-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2996,13 +2996,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 224-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -3019,13 +3019,13 @@ _shaext_shortcut: pxor %xmm6,%xmm6 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 movdqa 240-128(%rbp),%xmm1 paddd %xmm7,%xmm1 movq (%rbx),%xmm7 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 240-128(%rbp),%xmm2 paddd %xmm11,%xmm2 .byte 69,15,56,203,247 @@ -3035,17 +3035,17 @@ _shaext_shortcut: cmovgeq %rsp,%r8 cmpl 4(%rbx),%ecx cmovgeq %rsp,%r9 - pshufd $0,%xmm7,%xmm9 + pshufd $0x00,%xmm7,%xmm9 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 - pshufd $85,%xmm7,%xmm10 + pshufd $0x55,%xmm7,%xmm10 movdqa %xmm7,%xmm11 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pcmpgtd %xmm6,%xmm9 pcmpgtd %xmm6,%xmm10 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pcmpgtd %xmm6,%xmm11 movdqa K256_shaext-16(%rip),%xmm3 .byte 69,15,56,203,247 @@ -3067,10 +3067,10 @@ _shaext_shortcut: movl 280(%rsp),%edx - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 movdqa %xmm12,%xmm5 movdqa %xmm13,%xmm6 diff --git a/secure/lib/libcrypto/amd64/sha256-x86_64.S b/secure/lib/libcrypto/amd64/sha256-x86_64.S index a43a6685394f..0cbc56617f58 100644 --- a/secure/lib/libcrypto/amd64/sha256-x86_64.S +++ b/secure/lib/libcrypto/amd64/sha256-x86_64.S @@ -1755,9 +1755,9 @@ _shaext_shortcut: movdqu 16(%rdi),%xmm2 movdqa 512-128(%rcx),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 @@ -1776,7 +1776,7 @@ _shaext_shortcut: .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 @@ -1785,7 +1785,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 @@ -1794,7 +1794,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1806,7 +1806,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1817,7 +1817,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1828,7 +1828,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1839,7 +1839,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1850,7 +1850,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1861,7 +1861,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1872,7 +1872,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1883,7 +1883,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1894,7 +1894,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1905,7 +1905,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1916,7 +1916,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 @@ -1925,7 +1925,7 @@ _shaext_shortcut: movdqa 448-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 @@ -1934,7 +1934,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 nop .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 @@ -1943,9 +1943,9 @@ _shaext_shortcut: paddd %xmm9,%xmm1 jnz .Loop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 diff --git a/secure/lib/libcrypto/amd64/vpaes-x86_64.S b/secure/lib/libcrypto/amd64/vpaes-x86_64.S index 8ec5c40a2697..c990e69f7d69 100644 --- a/secure/lib/libcrypto/amd64/vpaes-x86_64.S +++ b/secure/lib/libcrypto/amd64/vpaes-x86_64.S @@ -61,7 +61,7 @@ _vpaes_encrypt_core: addq $16,%r11 pxor %xmm0,%xmm3 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 @@ -121,10 +121,10 @@ _vpaes_decrypt_core: pand %xmm9,%xmm0 .byte 102,15,56,0,208 movdqa .Lk_dipt+16(%rip),%xmm0 - xorq $48,%r11 + xorq $0x30,%r11 leaq .Lk_dsbd(%rip),%r10 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 pxor %xmm5,%xmm2 movdqa .Lk_mc_forward+48(%rip),%xmm5 pxor %xmm2,%xmm0 @@ -243,7 +243,7 @@ _vpaes_schedule_core: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 movdqu %xmm3,(%rdx) - xorq $48,%r8 + xorq $0x30,%r8 .Lschedule_go: cmpl $192,%esi @@ -333,7 +333,7 @@ _vpaes_schedule_core: call _vpaes_schedule_mangle - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 movdqa %xmm7,%xmm5 movdqa %xmm6,%xmm7 call _vpaes_schedule_low_round @@ -400,8 +400,8 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm1 - pshufd $254,%xmm7,%xmm0 + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 @@ -438,7 +438,7 @@ _vpaes_schedule_round: pxor %xmm1,%xmm7 - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 .byte 102,15,58,15,192,1 @@ -597,7 +597,7 @@ _vpaes_schedule_mangle: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 addq $-16,%r8 - andq $48,%r8 + andq $0x30,%r8 movdqu %xmm3,(%rdx) .byte 0xf3,0xc3 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle @@ -615,7 +615,7 @@ vpaes_set_encrypt_key: movl %eax,240(%rdx) movl $0,%ecx - movl $48,%r8d + movl $0x30,%r8d call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 diff --git a/secure/lib/libcrypto/amd64/x86_64-gf2m.S b/secure/lib/libcrypto/amd64/x86_64-gf2m.S index f86c2531ab2d..a53d5119c274 100644 --- a/secure/lib/libcrypto/amd64/x86_64-gf2m.S +++ b/secure/lib/libcrypto/amd64/x86_64-gf2m.S @@ -243,7 +243,7 @@ bn_GF2m_mul_2x2: movq %rcx,56(%rsp) movq %r8,64(%rsp) - movq $15,%r8 + movq $0xf,%r8 movq %rsi,%rax movq %rcx,%rbp call _mul_1x1 diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S index bff0fb9d7297..3e67383de02b 100644 --- a/secure/lib/libcrypto/amd64/x86_64-mont.S +++ b/secure/lib/libcrypto/amd64/x86_64-mont.S @@ -634,20 +634,20 @@ bn_sqr8x_mont: - leaq -64(%rsp,%r9,4),%r11 + leaq -64(%rsp,%r9,2),%r11 movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lsqr8x_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,4),%rsp + leaq -64(%rsp,%r9,2),%rsp jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: - leaq 4096-64(,%r9,4),%r10 - leaq -64(%rsp,%r9,4),%rsp + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -657,58 +657,80 @@ bn_sqr8x_mont: movq %r9,%r10 negq %r9 - leaq 64(%rsp,%r9,2),%r11 movq %r8,32(%rsp) movq %rax,40(%rsp) .Lsqr8x_body: - movq %r9,%rbp -.byte 102,73,15,110,211 - shrq $3+2,%rbp - movl OPENSSL_ia32cap_P+8(%rip),%eax - jmp .Lsqr8x_copy_n - -.align 32 -.Lsqr8x_copy_n: - movq 0(%rcx),%xmm0 - movq 8(%rcx),%xmm1 - movq 16(%rcx),%xmm3 - movq 24(%rcx),%xmm4 - leaq 32(%rcx),%rcx - movdqa %xmm0,0(%r11) - movdqa %xmm1,16(%r11) - movdqa %xmm3,32(%r11) - movdqa %xmm4,48(%r11) - leaq 64(%r11),%r11 - decq %rbp - jnz .Lsqr8x_copy_n - +.byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 call bn_sqr8x_internal + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz .Lsqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 + pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi - jmp .Lsqr8x_zero + jmp .Lsqr8x_cond_copy .align 32 -.Lsqr8x_zero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - movdqa %xmm0,32(%rax) - movdqa %xmm0,48(%rax) - leaq 64(%rax),%rax - movdqa %xmm0,0(%rdx) - movdqa %xmm0,16(%rdx) - movdqa %xmm0,32(%rdx) - movdqa %xmm0,48(%rdx) - leaq 64(%rdx),%rdx - decq %r9 - jnz .Lsqr8x_zero +.Lsqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz .Lsqr8x_cond_copy movq $1,%rax movq -48(%rsi),%r15 diff --git a/secure/lib/libcrypto/amd64/x86_64-mont5.S b/secure/lib/libcrypto/amd64/x86_64-mont5.S index 19162e880d7f..cff6c72b5863 100644 --- a/secure/lib/libcrypto/amd64/x86_64-mont5.S +++ b/secure/lib/libcrypto/amd64/x86_64-mont5.S @@ -15,46 +15,151 @@ bn_mul_mont_gather5: .Lmul_enter: movl %r9d,%r9d movq %rsp,%rax - movl 8(%rsp),%r10d + movd 8(%rsp),%xmm5 + leaq .Linc(%rip),%r10 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 + leaq 2(%r9),%r11 negq %r11 - leaq (%rsp,%r11,8),%rsp + leaq -264(%rsp,%r11,8),%rsp andq $-1024,%rsp movq %rax,8(%rsp,%r9,8) .Lmul_body: - movq %rdx,%r12 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%r12,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 leaq 256(%r12),%r12 - por %xmm3,%xmm0 - .byte 102,72,15,126,195 movq (%r8),%r8 @@ -63,29 +168,14 @@ bn_mul_mont_gather5: xorq %r14,%r14 xorq %r15,%r15 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -118,14 +208,12 @@ bn_mul_mont_gather5: cmpq %r9,%r15 jne .L1st -.byte 102,72,15,126,195 addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 movq %r10,%r11 @@ -139,33 +227,78 @@ bn_mul_mont_gather5: jmp .Louter .align 16 .Louter: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax +.byte 102,72,15,126,195 + xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -201,15 +334,12 @@ bn_mul_mont_gather5: cmpq %r9,%r15 jne .Linner -.byte 102,72,15,126,195 - addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 - movq (%rsp,%r15,8),%r10 + movq (%rsp,%r9,8),%r10 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 xorq %rdx,%rdx @@ -256,6 +386,7 @@ bn_mul_mont_gather5: movq 8(%rsp,%r9,8),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -278,10 +409,10 @@ bn_mul4x_mont_gather5: pushq %r13 pushq %r14 pushq %r15 + .byte 0x67 - movl %r9d,%r10d shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 @@ -291,19 +422,21 @@ bn_mul4x_mont_gather5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmul4xsp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -319,6 +452,7 @@ bn_mul4x_mont_gather5: movq 40(%rsp),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -334,47 +468,141 @@ bn_mul4x_mont_gather5: .align 32 mul4x_internal: shlq $5,%r9 - movl 8(%rax),%r10d - leaq 256(%rdx,%r9,1),%r13 + movd 8(%rax),%xmm5 + leaq .Linc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%rdx,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 - - movq -96(%r12),%xmm0 - leaq 256(%r12),%r14 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 -.byte 0x67 - por %xmm1,%xmm0 - movq -96(%r14),%xmm1 -.byte 0x67 - pand %xmm7,%xmm3 -.byte 0x67 - por %xmm2,%xmm0 - movq -32(%r14),%xmm2 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq 32(%r14),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 .byte 102,72,15,126,195 - movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) @@ -388,26 +616,10 @@ mul4x_internal: movq %rax,%r10 movq (%rcx),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq %r10,%rbp - - - - - - - - leaq 64+8(%rsp,%r11,8),%r14 + leaq 64+8(%rsp),%r14 movq %rdx,%r11 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - leaq 512(%r12),%r12 - por %xmm1,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax @@ -416,7 +628,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -426,7 +638,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -436,7 +648,7 @@ mul4x_internal: .L1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -452,7 +664,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -482,7 +694,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -491,7 +703,7 @@ mul4x_internal: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -501,7 +713,7 @@ mul4x_internal: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -517,7 +729,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -530,8 +742,7 @@ mul4x_internal: movq %rdi,-16(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -542,6 +753,63 @@ mul4x_internal: .align 32 .Louter4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx @@ -549,25 +817,11 @@ mul4x_internal: movq (%rcx),%rax adcq $0,%rdx - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - imulq %r10,%rbp -.byte 0x67 movq %rdx,%r11 movq %rdi,(%r14) - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 leaq (%r14,%r9,1),%r14 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 mulq %rbp addq %rax,%r10 @@ -577,7 +831,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -589,7 +843,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp .Linner4x @@ -598,7 +852,7 @@ mul4x_internal: .Linner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -616,7 +870,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -650,7 +904,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -661,7 +915,7 @@ mul4x_internal: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 @@ -671,7 +925,7 @@ mul4x_internal: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -690,7 +944,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 movq %rbp,%rax - movq -16(%rcx),%rbp + movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -705,9 +959,8 @@ mul4x_internal: movq %r13,-24(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 movq %rdi,-16(%r14) - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -718,16 +971,23 @@ mul4x_internal: cmpq 16+8(%rsp),%r12 jb .Louter4x + xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi - xorq $1,%rdi + subq %rdi,%rax leaq (%r14,%r9,1),%rbx - leaq (%rcx,%rdi,8),%rbp + movq (%rcx),%r12 + leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi - jmp .Lsqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry .size mul4x_internal,.-mul4x_internal .globl bn_power5 .type bn_power5,@function @@ -740,9 +1000,9 @@ bn_power5: pushq %r13 pushq %r14 pushq %r15 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 @@ -752,19 +1012,20 @@ bn_power5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwr_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -792,10 +1053,15 @@ bn_power5: .byte 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 @@ -1339,9 +1605,9 @@ __bn_sqr8x_internal: movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 -sqr8x_reduction: +__bn_sqr8x_reduction: xorq %rax,%rax - leaq (%rbp,%r9,2),%rcx + leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi @@ -1374,14 +1640,14 @@ sqr8x_reduction: .align 32 .L8x_reduce: mulq %rbx - movq 16(%rbp),%rax + movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) @@ -1390,7 +1656,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi @@ -1399,7 +1665,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 @@ -1408,7 +1674,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1416,7 +1682,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1424,7 +1690,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1442,7 +1708,7 @@ sqr8x_reduction: decl %ecx jnz .L8x_reduce - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp @@ -1468,14 +1734,14 @@ sqr8x_reduction: .L8x_tail: mulq %rbx addq %rax,%r8 - movq 16(%rbp),%rax + movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi @@ -1484,7 +1750,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 @@ -1492,7 +1758,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 @@ -1500,7 +1766,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1508,7 +1774,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1516,7 +1782,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1534,7 +1800,7 @@ sqr8x_reduction: decl %ecx jnz .L8x_tail - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae .L8x_tail_done @@ -1580,7 +1846,7 @@ sqr8x_reduction: adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax - movq -16(%rbp),%rcx + movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 @@ -1598,44 +1864,62 @@ sqr8x_reduction: cmpq %rdx,%rdi jb .L8x_reduction_loop - - subq %r15,%rcx + .byte 0xf3,0xc3 +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.type __bn_post4x_internal,@function +.align 32 +__bn_post4x_internal: + movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx - adcq %rsi,%rsi movq %r9,%rcx - orq %rsi,%rax .byte 102,72,15,126,207 - xorq $1,%rax + negq %rax .byte 102,72,15,126,206 - leaq (%rbp,%rax,8),%rbp sarq $3+2,%rcx - jmp .Lsqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry -.align 32 +.align 16 .Lsqr4x_sub: -.byte 0x66 - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rbx),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 32(%rbp),%r14 + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 movq %r12,0(%rdi) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp + leaq 32(%rbx),%rbx movq %r13,8(%rdi) + sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz .Lsqr4x_sub + movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 -.size bn_sqr8x_internal,.-bn_sqr8x_internal +.size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_from_montgomery .type bn_from_montgomery,@function .align 32 @@ -1657,10 +1941,9 @@ bn_from_mont8x: pushq %r13 pushq %r14 pushq %r15 -.byte 0x67 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 @@ -1670,19 +1953,20 @@ bn_from_mont8x: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lfrom_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -1733,7 +2017,8 @@ bn_from_mont8x: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 leaq 48(%rsp),%rax @@ -1800,45 +2085,169 @@ bn_scatter5: .globl bn_gather5 .type bn_gather5,@function -.align 16 +.align 32 bn_gather5: - movl %ecx,%r11d - shrl $3,%ecx - andq $7,%r11 - notl %ecx - leaq .Lmagic_masks(%rip),%rax - andl $3,%ecx - leaq 128(%rdx,%r11,8),%rdx - movq 0(%rax,%rcx,8),%xmm4 - movq 8(%rax,%rcx,8),%xmm5 - movq 16(%rax,%rcx,8),%xmm6 - movq 24(%rax,%rcx,8),%xmm7 +.LSEH_begin_bn_gather5: + +.byte 0x4c,0x8d,0x14,0x24 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq .Linc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) jmp .Lgather -.align 16 -.Lgather: - movq -128(%rdx),%xmm0 - movq -64(%rdx),%xmm1 - pand %xmm4,%xmm0 - movq 0(%rdx),%xmm2 - pand %xmm5,%xmm1 - movq 64(%rdx),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 -.byte 0x67,0x67 - por %xmm2,%xmm0 - leaq 256(%rdx),%rdx - por %xmm3,%xmm0 +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi subl $1,%esi jnz .Lgather + + leaq (%r10),%rsp .byte 0xf3,0xc3 .LSEH_end_bn_gather5: .size bn_gather5,.-bn_gather5 .align 64 -.Lmagic_masks: -.long 0,0, 0,0, 0,0, -1,-1 -.long 0,0, 0,0, 0,0, 0,0 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/secure/lib/libcrypto/amd64/x86_64cpuid.S b/secure/lib/libcrypto/amd64/x86_64cpuid.S index 93de516c582e..e5b8011b0ccb 100644 --- a/secure/lib/libcrypto/amd64/x86_64cpuid.S +++ b/secure/lib/libcrypto/amd64/x86_64cpuid.S @@ -45,43 +45,43 @@ OPENSSL_ia32_cpuid: movl %eax,%r11d xorl %eax,%eax - cmpl $1970169159,%ebx + cmpl $0x756e6547,%ebx setne %al movl %eax,%r9d - cmpl $1231384169,%edx + cmpl $0x49656e69,%edx setne %al orl %eax,%r9d - cmpl $1818588270,%ecx + cmpl $0x6c65746e,%ecx setne %al orl %eax,%r9d jz .Lintel - cmpl $1752462657,%ebx + cmpl $0x68747541,%ebx setne %al movl %eax,%r10d - cmpl $1769238117,%edx + cmpl $0x69746E65,%edx setne %al orl %eax,%r10d - cmpl $1145913699,%ecx + cmpl $0x444D4163,%ecx setne %al orl %eax,%r10d jnz .Lintel - movl $2147483648,%eax + movl $0x80000000,%eax cpuid - cmpl $2147483649,%eax + cmpl $0x80000001,%eax jb .Lintel movl %eax,%r10d - movl $2147483649,%eax + movl $0x80000001,%eax cpuid orl %ecx,%r9d - andl $2049,%r9d + andl $0x00000801,%r9d - cmpl $2147483656,%r10d + cmpl $0x80000008,%r10d jb .Lintel - movl $2147483656,%eax + movl $0x80000008,%eax cpuid movzbq %cl,%r10 incq %r10 @@ -93,7 +93,7 @@ OPENSSL_ia32_cpuid: shrl $16,%ebx cmpb %r10b,%bl ja .Lgeneric - andl $4026531839,%edx + andl $0xefffffff,%edx jmp .Lgeneric .Lintel: @@ -106,7 +106,7 @@ OPENSSL_ia32_cpuid: cpuid movl %eax,%r10d shrl $14,%r10d - andl $4095,%r10d + andl $0xfff,%r10d cmpl $7,%r11d jb .Lnocacheinfo @@ -119,29 +119,29 @@ OPENSSL_ia32_cpuid: .Lnocacheinfo: movl $1,%eax cpuid - andl $3220176895,%edx + andl $0xbfefffff,%edx cmpl $0,%r9d jne .Lnotintel - orl $1073741824,%edx + orl $0x40000000,%edx andb $15,%ah cmpb $15,%ah jne .Lnotintel - orl $1048576,%edx + orl $0x00100000,%edx .Lnotintel: btl $28,%edx jnc .Lgeneric - andl $4026531839,%edx + andl $0xefffffff,%edx cmpl $0,%r10d je .Lgeneric - orl $268435456,%edx + orl $0x10000000,%edx shrl $16,%ebx cmpb $1,%bl ja .Lgeneric - andl $4026531839,%edx + andl $0xefffffff,%edx .Lgeneric: - andl $2048,%r9d - andl $4294965247,%ecx + andl $0x00000800,%r9d + andl $0xfffff7ff,%ecx orl %ecx,%r9d movl %edx,%r10d @@ -153,9 +153,9 @@ OPENSSL_ia32_cpuid: cmpl $6,%eax je .Ldone .Lclear_avx: - movl $4026525695,%eax + movl $0xefffe7ff,%eax andl %eax,%r9d - andl $4294967263,8(%rdi) + andl $0xffffffdf,8(%rdi) .Ldone: shlq $32,%r9 movl %r10d,%eax |