Diffstat (limited to 'secure/lib/libcrypto/amd64/rsaz-x86_64.S')
-rw-r--r-- | secure/lib/libcrypto/amd64/rsaz-x86_64.S | 863
1 file changed, 113 insertions, 750 deletions
diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S index e4e7b0469a53..ae64f7a73987 100644 --- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S +++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S @@ -31,14 +31,10 @@ rsaz_512_sqr: subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lsqr_body: - movq %rdx,%rbp +.byte 102,72,15,110,202 movq (%rsi),%rdx movq 8(%rsi),%rax movq %rcx,128(%rsp) - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Loop_sqrx jmp .Loop_sqr .align 32 @@ -46,6 +42,7 @@ rsaz_512_sqr: movl %r8d,128+8(%rsp) movq %rdx,%rbx + movq %rax,%rbp mulq %rdx movq %rax,%r8 movq 16(%rsi),%rax @@ -84,31 +81,29 @@ rsaz_512_sqr: mulq %rbx addq %rax,%r14 movq %rbx,%rax - movq %rdx,%r15 - adcq $0,%r15 + adcq $0,%rdx + xorq %rcx,%rcx addq %r8,%r8 - movq %r9,%rcx - adcq %r9,%r9 + movq %rdx,%r15 + adcq $0,%rcx mulq %rax - movq %rax,(%rsp) - addq %rdx,%r8 - adcq $0,%r9 + addq %r8,%rdx + adcq $0,%rcx - movq %r8,8(%rsp) - shrq $63,%rcx + movq %rax,(%rsp) + movq %rdx,8(%rsp) - movq 8(%rsi),%r8 movq 16(%rsi),%rax - mulq %r8 + mulq %rbp addq %rax,%r10 movq 24(%rsi),%rax movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -116,7 +111,7 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -124,7 +119,7 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r13 movq 48(%rsi),%rax adcq $0,%rdx @@ -132,7 +127,7 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r14 movq 56(%rsi),%rax adcq $0,%rdx @@ -140,39 +135,39 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx - mulq %r8 + mulq %rbp addq %rax,%r15 - movq %r8,%rax + movq %rbp,%rax adcq $0,%rdx addq %rbx,%r15 - movq %rdx,%r8 - movq %r10,%rdx - adcq $0,%r8 + adcq $0,%rdx - addq %rdx,%rdx - leaq (%rcx,%r10,2),%r10 - movq %r11,%rbx - adcq %r11,%r11 + xorq %rbx,%rbx + addq %r9,%r9 + movq %rdx,%r8 + adcq %r10,%r10 + adcq $0,%rbx mulq %rax + + addq %rcx,%rax + movq 16(%rsi),%rbp addq %rax,%r9 + movq 24(%rsi),%rax adcq %rdx,%r10 - adcq $0,%r11 + adcq $0,%rbx movq %r9,16(%rsp) movq %r10,24(%rsp) - shrq $63,%rbx - movq 16(%rsi),%r9 - movq 24(%rsi),%rax - mulq %r9 + mulq %rbp addq %rax,%r12 movq 32(%rsi),%rax movq %rdx,%rcx adcq $0,%rcx - mulq %r9 + mulq %rbp addq %rax,%r13 movq 40(%rsi),%rax adcq $0,%rdx @@ -180,7 +175,7 @@ rsaz_512_sqr: movq %rdx,%rcx adcq $0,%rcx - mulq %r9 + mulq %rbp addq %rax,%r14 movq 48(%rsi),%rax adcq $0,%rdx @@ -188,9 +183,7 @@ rsaz_512_sqr: movq %rdx,%rcx adcq $0,%rcx - mulq %r9 - movq %r12,%r10 - leaq (%rbx,%r12,2),%r12 + mulq %rbp addq %rax,%r15 movq 56(%rsi),%rax adcq $0,%rdx @@ -198,36 +191,40 @@ rsaz_512_sqr: movq %rdx,%rcx adcq $0,%rcx - mulq %r9 - shrq $63,%r10 + mulq %rbp addq %rax,%r8 - movq %r9,%rax + movq %rbp,%rax adcq $0,%rdx addq %rcx,%r8 - movq %rdx,%r9 - adcq $0,%r9 + adcq $0,%rdx - movq %r13,%rcx - leaq (%r10,%r13,2),%r13 + xorq %rcx,%rcx + addq %r11,%r11 + movq %rdx,%r9 + adcq %r12,%r12 + adcq $0,%rcx mulq %rax + + addq %rbx,%rax + movq 24(%rsi),%r10 addq %rax,%r11 + movq 32(%rsi),%rax adcq %rdx,%r12 - adcq $0,%r13 + adcq $0,%rcx movq %r11,32(%rsp) movq %r12,40(%rsp) - shrq $63,%rcx - movq 24(%rsi),%r10 - movq 32(%rsi),%rax + movq %rax,%r11 mulq %r10 addq %rax,%r14 movq 40(%rsi),%rax movq %rdx,%rbx adcq $0,%rbx + movq %rax,%r12 mulq %r10 addq %rax,%r15 movq 48(%rsi),%rax @@ -236,9 +233,8 @@ rsaz_512_sqr: movq %rdx,%rbx adcq $0,%rbx + movq %rax,%rbp mulq %r10 - movq %r14,%r12 - leaq 
(%rcx,%r14,2),%r14 addq %rax,%r8 movq 56(%rsi),%rax adcq $0,%rdx @@ -247,32 +243,33 @@ rsaz_512_sqr: adcq $0,%rbx mulq %r10 - shrq $63,%r12 addq %rax,%r9 movq %r10,%rax adcq $0,%rdx addq %rbx,%r9 - movq %rdx,%r10 - adcq $0,%r10 + adcq $0,%rdx - movq %r15,%rbx - leaq (%r12,%r15,2),%r15 + xorq %rbx,%rbx + addq %r13,%r13 + movq %rdx,%r10 + adcq %r14,%r14 + adcq $0,%rbx mulq %rax + + addq %rcx,%rax addq %rax,%r13 + movq %r12,%rax adcq %rdx,%r14 - adcq $0,%r15 + adcq $0,%rbx movq %r13,48(%rsp) movq %r14,56(%rsp) - shrq $63,%rbx - movq 32(%rsi),%r11 - movq 40(%rsi),%rax mulq %r11 addq %rax,%r8 - movq 48(%rsi),%rax + movq %rbp,%rax movq %rdx,%rcx adcq $0,%rcx @@ -280,97 +277,99 @@ rsaz_512_sqr: addq %rax,%r9 movq 56(%rsi),%rax adcq $0,%rdx - movq %r8,%r12 - leaq (%rbx,%r8,2),%r8 addq %rcx,%r9 movq %rdx,%rcx adcq $0,%rcx + movq %rax,%r14 mulq %r11 - shrq $63,%r12 addq %rax,%r10 movq %r11,%rax adcq $0,%rdx addq %rcx,%r10 - movq %rdx,%r11 - adcq $0,%r11 + adcq $0,%rdx - movq %r9,%rcx - leaq (%r12,%r9,2),%r9 + xorq %rcx,%rcx + addq %r15,%r15 + movq %rdx,%r11 + adcq %r8,%r8 + adcq $0,%rcx mulq %rax + + addq %rbx,%rax addq %rax,%r15 + movq %rbp,%rax adcq %rdx,%r8 - adcq $0,%r9 + adcq $0,%rcx movq %r15,64(%rsp) movq %r8,72(%rsp) - shrq $63,%rcx - movq 40(%rsi),%r12 - movq 48(%rsi),%rax mulq %r12 addq %rax,%r10 - movq 56(%rsi),%rax + movq %r14,%rax movq %rdx,%rbx adcq $0,%rbx mulq %r12 addq %rax,%r11 movq %r12,%rax - movq %r10,%r15 - leaq (%rcx,%r10,2),%r10 adcq $0,%rdx - shrq $63,%r15 addq %rbx,%r11 - movq %rdx,%r12 - adcq $0,%r12 + adcq $0,%rdx - movq %r11,%rbx - leaq (%r15,%r11,2),%r11 + xorq %rbx,%rbx + addq %r9,%r9 + movq %rdx,%r12 + adcq %r10,%r10 + adcq $0,%rbx mulq %rax + + addq %rcx,%rax addq %rax,%r9 + movq %r14,%rax adcq %rdx,%r10 - adcq $0,%r11 + adcq $0,%rbx movq %r9,80(%rsp) movq %r10,88(%rsp) - movq 48(%rsi),%r13 - movq 56(%rsi),%rax - mulq %r13 + mulq %rbp addq %rax,%r12 - movq %r13,%rax - movq %rdx,%r13 - adcq $0,%r13 + movq %rbp,%rax + adcq $0,%rdx - xorq %r14,%r14 - shlq $1,%rbx + xorq %rcx,%rcx + addq %r11,%r11 + movq %rdx,%r13 adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 + adcq $0,%rcx mulq %rax + + addq %rbx,%rax addq %rax,%r11 + movq %r14,%rax adcq %rdx,%r12 - adcq $0,%r13 + adcq $0,%rcx movq %r11,96(%rsp) movq %r12,104(%rsp) - movq 56(%rsi),%rax - mulq %rax - addq %rax,%r13 - adcq $0,%rdx + xorq %rbx,%rbx + addq %r13,%r13 + adcq $0,%rbx - addq %rdx,%r14 + mulq %rax - movq %r13,112(%rsp) - movq %r14,120(%rsp) + addq %rcx,%rax + addq %r13,%rax + adcq %rbx,%rdx movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -380,276 +379,12 @@ rsaz_512_sqr: movq 40(%rsp),%r13 movq 48(%rsp),%r14 movq 56(%rsp),%r15 - - call __rsaz_512_reduce - - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,%rdx - movq %r9,%rax - movl 128+8(%rsp),%r8d - movq %rdi,%rsi - - decl %r8d - jnz .Loop_sqr - jmp .Lsqr_tail - -.align 32 -.Loop_sqrx: - movl %r8d,128+8(%rsp) -.byte 102,72,15,110,199 -.byte 102,72,15,110,205 - - mulxq %rax,%r8,%r9 - - mulxq 16(%rsi),%rcx,%r10 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r11 - adcxq %rcx,%r9 - - mulxq 32(%rsi),%rcx,%r12 - adcxq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rcx,%r11 - -.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r12 - adcxq %rcx,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adcxq %rbp,%r15 - - movq %r9,%rcx - shldq $1,%r8,%r9 - shlq $1,%r8 - - 
xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rdx,%r8 - movq 8(%rsi),%rdx - adcxq %rbp,%r9 - - movq %rax,(%rsp) - movq %r8,8(%rsp) - - - mulxq 16(%rsi),%rax,%rbx - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %r8,%r12 - - mulxq 32(%rsi),%rax,%rbx - adoxq %rax,%r12 - adcxq %rbx,%r13 - - mulxq 40(%rsi),%rdi,%r8 - adoxq %rdi,%r13 - adcxq %r8,%r14 - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 - adoxq %rdi,%r15 - adcxq %rbp,%r8 - adoxq %rbp,%r8 - - movq %r11,%rbx - shldq $1,%r10,%r11 - shldq $1,%rcx,%r10 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rcx - movq 16(%rsi),%rdx - adcxq %rax,%r9 - adcxq %rcx,%r10 - adcxq %rbp,%r11 - - movq %r9,16(%rsp) -.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 - - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 - adoxq %rdi,%r12 - adcxq %r9,%r13 - - mulxq 32(%rsi),%rax,%rcx - adoxq %rax,%r13 - adcxq %rcx,%r14 - - mulxq 40(%rsi),%rdi,%r9 - adoxq %rdi,%r14 - adcxq %r9,%r15 - -.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 - adoxq %rax,%r15 - adcxq %rcx,%r8 - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 - adoxq %rdi,%r8 - adcxq %rbp,%r9 - adoxq %rbp,%r9 - - movq %r13,%rcx - shldq $1,%r12,%r13 - shldq $1,%rbx,%r12 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r11 - adcxq %rdx,%r12 - movq 24(%rsi),%rdx - adcxq %rbp,%r13 - - movq %r11,32(%rsp) -.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - - mulxq 40(%rsi),%rdi,%r10 - adoxq %rdi,%r15 - adcxq %r10,%r8 - - mulxq 48(%rsi),%rax,%rbx - adoxq %rax,%r8 - adcxq %rbx,%r9 - - mulxq 56(%rsi),%rdi,%r10 - adoxq %rdi,%r9 - adcxq %rbp,%r10 - adoxq %rbp,%r10 - -.byte 0x66 - movq %r15,%rbx - shldq $1,%r14,%r15 - shldq $1,%rcx,%r14 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r13 - adcxq %rdx,%r14 - movq 32(%rsi),%rdx - adcxq %rbp,%r15 - - movq %r13,48(%rsp) - movq %r14,56(%rsp) - - -.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 - adoxq %rdi,%r8 - adcxq %r11,%r9 - - mulxq 48(%rsi),%rax,%rcx - adoxq %rax,%r9 - adcxq %rcx,%r10 - - mulxq 56(%rsi),%rdi,%r11 - adoxq %rdi,%r10 - adcxq %rbp,%r11 - adoxq %rbp,%r11 - - movq %r9,%rcx - shldq $1,%r8,%r9 - shldq $1,%rbx,%r8 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r15 - adcxq %rdx,%r8 - movq 40(%rsi),%rdx - adcxq %rbp,%r9 - - movq %r15,64(%rsp) - movq %r8,72(%rsp) - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %rbp,%r12 - adoxq %rbp,%r12 - - movq %r11,%rbx - shldq $1,%r10,%r11 - shldq $1,%rcx,%r10 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r9 - adcxq %rdx,%r10 - movq 48(%rsi),%rdx - adcxq %rbp,%r11 - - movq %r9,80(%rsp) - movq %r10,88(%rsp) - - -.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 - adoxq %rax,%r12 - adoxq %rbp,%r13 - - xorq %r14,%r14 - shldq $1,%r13,%r14 - shldq $1,%r12,%r13 - shldq $1,%rbx,%r12 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r11 - adcxq %rdx,%r12 - movq 56(%rsi),%rdx - adcxq %rbp,%r13 - -.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 -.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 - - - mulxq %rdx,%rax,%rdx - adoxq %rax,%r13 - adoxq %rbp,%rdx - -.byte 0x66 - addq %rdx,%r14 - - movq %r13,112(%rsp) - movq %r14,120(%rsp) -.byte 102,72,15,126,199 .byte 102,72,15,126,205 - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 
8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 + movq %rax,112(%rsp) + movq %rdx,120(%rsp) - call __rsaz_512_reducex + call __rsaz_512_reduce addq 64(%rsp),%r8 adcq 72(%rsp),%r9 @@ -669,9 +404,7 @@ rsaz_512_sqr: movq %rdi,%rsi decl %r8d - jnz .Loop_sqrx - -.Lsqr_tail: + jnz .Loop_sqr leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 @@ -723,10 +456,6 @@ rsaz_512_mul: .byte 102,72,15,110,199 .byte 102,72,15,110,201 movq %r8,128(%rsp) - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx movq (%rdx),%rbx movq %rdx,%rbp call __rsaz_512_mul @@ -744,29 +473,6 @@ rsaz_512_mul: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_tail - -.align 32 -.Lmulx: - movq %rdx,%rbp - movq (%rdx),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex -.Lmul_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -880,10 +586,6 @@ rsaz_512_mul_gather4: por %xmm9,%xmm8 pshufd $0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx_gather .byte 102,76,15,126,195 movq %r8,128(%rsp) @@ -1064,142 +766,6 @@ rsaz_512_mul_gather4: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_gather_tail - -.align 32 -.Lmulx_gather: -.byte 102,76,15,126,194 - - movq %r8,128(%rsp) - movq %rdi,128+8(%rsp) - movq %rcx,128+16(%rsp) - - mulxq (%rsi),%rbx,%r8 - movq %rbx,(%rsp) - xorl %edi,%edi - - mulxq 8(%rsi),%rax,%r9 - - mulxq 16(%rsi),%rbx,%r10 - adcxq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - adcxq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - adcxq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 - adcxq %rbx,%r13 - adcxq %rax,%r14 -.byte 0x67 - movq %r8,%rbx - adcxq %rdi,%r15 - - movq $-7,%rcx - jmp .Loop_mulx_gather - -.align 32 -.Loop_mulx_gather: - movdqa 0(%rbp),%xmm8 - movdqa 16(%rbp),%xmm9 - movdqa 32(%rbp),%xmm10 - movdqa 48(%rbp),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rbp),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rbp),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rbp),%xmm14 - pand %xmm3,%xmm11 - movdqa 112(%rbp),%xmm15 - leaq 128(%rbp),%rbp - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand %xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 -.byte 102,76,15,126,194 - -.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - -.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 -.byte 0x67 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 - movq %rbx,64(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - movq %r8,%rbx - adcxq %rdi,%r15 - - incq %rcx - jnz .Loop_mulx_gather - - movq %r8,64(%rsp) - movq %r9,64+8(%rsp) - movq %r10,64+16(%rsp) - movq %r11,64+24(%rsp) - movq 
%r12,64+32(%rsp) - movq %r13,64+40(%rsp) - movq %r14,64+48(%rsp) - movq %r15,64+56(%rsp) - - movq 128(%rsp),%rdx - movq 128+8(%rsp),%rdi - movq 128+16(%rsp),%rbp - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -.Lmul_gather_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1267,10 +833,6 @@ rsaz_512_mul_scatter4: movq %rcx,128(%rsp) movq %rdi,%rbp - movl $0x80100,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $0x80100,%r11d - je .Lmulx_scatter movq (%rdi),%rbx call __rsaz_512_mul @@ -1287,29 +849,6 @@ rsaz_512_mul_scatter4: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_scatter_tail - -.align 32 -.Lmulx_scatter: - movq (%rdi),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -.Lmul_scatter_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1379,7 +918,6 @@ rsaz_512_mul_by_one: subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lmul_by_one_body: - movl OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp movq %rcx,128(%rsp) @@ -1400,16 +938,7 @@ rsaz_512_mul_by_one: movdqa %xmm0,64(%rsp) movdqa %xmm0,80(%rsp) movdqa %xmm0,96(%rsp) - andl $0x80100,%eax - cmpl $0x80100,%eax - je .Lby_one_callx call __rsaz_512_reduce - jmp .Lby_one_tail -.align 32 -.Lby_one_callx: - movq 128(%rsp),%rdx - call __rsaz_512_reducex -.Lby_one_tail: movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -1442,6 +971,7 @@ rsaz_512_mul_by_one: .type __rsaz_512_reduce,@function .align 32 __rsaz_512_reduce: +.cfi_startproc movq %r8,%rbx imulq 128+8(%rsp),%rbx movq 0(%rbp),%rax @@ -1521,66 +1051,12 @@ __rsaz_512_reduce: jne .Lreduction_loop .byte 0xf3,0xc3 +.cfi_endproc .size __rsaz_512_reduce,.-__rsaz_512_reduce -.type __rsaz_512_reducex,@function -.align 32 -__rsaz_512_reducex: - - imulq %r8,%rdx - xorq %rsi,%rsi - movl $8,%ecx - jmp .Lreduction_loopx - -.align 32 -.Lreduction_loopx: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 128+8(%rsp),%rbx,%rdx - movq %rax,%rdx - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - - decl %ecx - jne .Lreduction_loopx - - .byte 0xf3,0xc3 -.size __rsaz_512_reducex,.-__rsaz_512_reducex .type __rsaz_512_subtract,@function .align 32 __rsaz_512_subtract: +.cfi_startproc movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -1634,10 +1110,12 @@ __rsaz_512_subtract: movq %r15,56(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __rsaz_512_subtract,.-__rsaz_512_subtract .type __rsaz_512_mul,@function .align 32 __rsaz_512_mul: +.cfi_startproc leaq 8(%rsp),%rdi movq (%rsi),%rax @@ -1776,131 +1254,13 @@ __rsaz_512_mul: movq %r15,56(%rdi) .byte 0xf3,0xc3 +.cfi_endproc .size __rsaz_512_mul,.-__rsaz_512_mul -.type __rsaz_512_mulx,@function 
-.align 32 -__rsaz_512_mulx: - mulxq (%rsi),%rbx,%r8 - movq $-6,%rcx - - mulxq 8(%rsi),%rax,%r9 - movq %rbx,8(%rsp) - - mulxq 16(%rsi),%rbx,%r10 - adcq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - adcq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - adcq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - adcq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 - movq 8(%rbp),%rdx - adcq %rbx,%r13 - adcq %rax,%r14 - adcq $0,%r15 - - xorq %rdi,%rdi - jmp .Loop_mulx - -.align 32 -.Loop_mulx: - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rsi),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 - movq 64(%rbp,%rcx,8),%rdx - movq %rbx,8+64-8(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - incq %rcx - jnz .Loop_mulx - - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - -.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 - adcxq %rax,%r8 - adoxq %r10,%r9 - -.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - movq %rbx,8+64-8(%rsp) - movq %r8,8+64(%rsp) - movq %r9,8+64+8(%rsp) - movq %r10,8+64+16(%rsp) - movq %r11,8+64+24(%rsp) - movq %r12,8+64+32(%rsp) - movq %r13,8+64+40(%rsp) - movq %r14,8+64+48(%rsp) - movq %r15,8+64+56(%rsp) - - .byte 0xf3,0xc3 -.size __rsaz_512_mulx,.-__rsaz_512_mulx .globl rsaz_512_scatter4 .type rsaz_512_scatter4,@function .align 16 rsaz_512_scatter4: +.cfi_startproc leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp .Loop_scatter @@ -1913,12 +1273,14 @@ rsaz_512_scatter4: decl %r9d jnz .Loop_scatter .byte 0xf3,0xc3 +.cfi_endproc .size rsaz_512_scatter4,.-rsaz_512_scatter4 .globl rsaz_512_gather4 .type rsaz_512_gather4,@function .align 16 rsaz_512_gather4: +.cfi_startproc movd %edx,%xmm8 movdqa .Linc+16(%rip),%xmm1 movdqa .Linc(%rip),%xmm0 @@ -1982,6 +1344,7 @@ rsaz_512_gather4: jnz .Loop_gather .byte 0xf3,0xc3 .LSEH_end_rsaz_512_gather4: +.cfi_endproc .size rsaz_512_gather4,.-rsaz_512_gather4 .align 64 |