diff options
Diffstat (limited to 'crypto/bn/rsaz-2k-avxifma.s')
| -rw-r--r-- | crypto/bn/rsaz-2k-avxifma.s | 1167 |
1 files changed, 1167 insertions, 0 deletions
diff --git a/crypto/bn/rsaz-2k-avxifma.s b/crypto/bn/rsaz-2k-avxifma.s new file mode 100644 index 000000000000..e02f4697561c --- /dev/null +++ b/crypto/bn/rsaz-2k-avxifma.s @@ -0,0 +1,1167 @@ +.text + +.globl ossl_rsaz_avxifma_eligible +.type ossl_rsaz_avxifma_eligible,@function +.align 32 +ossl_rsaz_avxifma_eligible: + movl OPENSSL_ia32cap_P+20(%rip),%ecx + xorl %eax,%eax + andl $8388608,%ecx + cmpl $8388608,%ecx + cmovel %ecx,%eax + .byte 0xf3,0xc3 +.size ossl_rsaz_avxifma_eligible, .-ossl_rsaz_avxifma_eligible +.text + +.globl ossl_rsaz_amm52x20_x1_avxifma256 +.type ossl_rsaz_amm52x20_x1_avxifma256,@function +.align 32 +ossl_rsaz_amm52x20_x1_avxifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lossl_rsaz_amm52x20_x1_avxifma256_body: + + + vpxor %ymm0,%ymm0,%ymm0 + vmovapd %ymm0,%ymm3 + vmovapd %ymm0,%ymm5 + vmovapd %ymm0,%ymm6 + vmovapd %ymm0,%ymm7 + vmovapd %ymm0,%ymm8 + + xorl %r9d,%r9d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + + movl $5,%ebx + +.align 32 +.Lloop5: + movq 0(%r11),%r13 + + vpbroadcastq 0(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -168(%rsp),%rsp +{vex} vpmadd52luq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52luq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52luq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52luq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52luq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52luq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52luq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52luq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52luq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52luq 128(%rcx), %ymm2, %ymm8 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm5,32(%rsp) + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vmovdqu %ymm8,128(%rsp) + movq $0,160(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm5 + vmovdqu 72(%rsp),%ymm6 + vmovdqu 104(%rsp),%ymm7 + vmovdqu 136(%rsp),%ymm8 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52huq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52huq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52huq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52huq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52huq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52huq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52huq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52huq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52huq 128(%rcx), %ymm2, %ymm8 + leaq 168(%rsp),%rsp + movq 8(%r11),%r13 + + vpbroadcastq 8(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -168(%rsp),%rsp +{vex} vpmadd52luq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52luq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52luq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52luq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52luq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52luq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52luq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52luq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52luq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52luq 128(%rcx), %ymm2, %ymm8 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm5,32(%rsp) + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vmovdqu %ymm8,128(%rsp) + movq $0,160(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm5 + vmovdqu 72(%rsp),%ymm6 + vmovdqu 104(%rsp),%ymm7 + vmovdqu 136(%rsp),%ymm8 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52huq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52huq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52huq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52huq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52huq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52huq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52huq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52huq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52huq 128(%rcx), %ymm2, %ymm8 + leaq 168(%rsp),%rsp + movq 16(%r11),%r13 + + vpbroadcastq 16(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -168(%rsp),%rsp +{vex} vpmadd52luq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52luq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52luq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52luq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52luq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52luq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52luq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52luq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52luq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52luq 128(%rcx), %ymm2, %ymm8 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm5,32(%rsp) + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vmovdqu %ymm8,128(%rsp) + movq $0,160(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm5 + vmovdqu 72(%rsp),%ymm6 + vmovdqu 104(%rsp),%ymm7 + vmovdqu 136(%rsp),%ymm8 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52huq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52huq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52huq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52huq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52huq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52huq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52huq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52huq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52huq 128(%rcx), %ymm2, %ymm8 + leaq 168(%rsp),%rsp + movq 24(%r11),%r13 + + vpbroadcastq 24(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -168(%rsp),%rsp +{vex} vpmadd52luq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52luq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52luq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52luq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52luq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52luq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52luq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52luq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52luq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52luq 128(%rcx), %ymm2, %ymm8 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm5,32(%rsp) + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vmovdqu %ymm8,128(%rsp) + movq $0,160(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm5 + vmovdqu 72(%rsp),%ymm6 + vmovdqu 104(%rsp),%ymm7 + vmovdqu 136(%rsp),%ymm8 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52huq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52huq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52huq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52huq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52huq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52huq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52huq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52huq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52huq 128(%rcx), %ymm2, %ymm8 + leaq 168(%rsp),%rsp + leaq 32(%r11),%r11 + decl %ebx + jne .Lloop5 + + vmovq %r9,%xmm0 + vpbroadcastq %xmm0,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm5,%ymm1 + vpsrlq $52,%ymm6,%ymm2 + vpsrlq $52,%ymm7,%ymm13 + vpsrlq $52,%ymm8,%ymm14 + + + vpermq $144,%ymm14,%ymm14 + vpermq $3,%ymm13,%ymm15 + vblendpd $1,%ymm15,%ymm14,%ymm14 + + vpermq $144,%ymm13,%ymm13 + vpermq $3,%ymm2,%ymm15 + vblendpd $1,%ymm15,%ymm13,%ymm13 + + vpermq $144,%ymm2,%ymm2 + vpermq $3,%ymm1,%ymm15 + vblendpd $1,%ymm15,%ymm2,%ymm2 + + vpermq $144,%ymm1,%ymm1 + vpermq $3,%ymm0,%ymm15 + vblendpd $1,%ymm15,%ymm1,%ymm1 + + vpermq $144,%ymm0,%ymm0 + vpand .Lhigh64x3(%rip),%ymm0,%ymm0 + + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm5,%ymm5 + vpaddq %ymm2,%ymm6,%ymm6 + vpaddq %ymm13,%ymm7,%ymm7 + vpaddq %ymm14,%ymm8,%ymm8 + + + + vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm1 + vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm2 + vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13 + vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm14 + vmovmskpd %ymm0,%r14d + vmovmskpd %ymm1,%r13d + vmovmskpd %ymm2,%r12d + vmovmskpd %ymm13,%r11d + vmovmskpd %ymm14,%r10d + + + vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm1 + vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm2 + vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13 + vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm14 + vmovmskpd %ymm0,%r9d + vmovmskpd %ymm1,%r8d + vmovmskpd %ymm2,%ebx + vmovmskpd %ymm13,%ecx + vmovmskpd %ymm14,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + leaq .Lkmasklut(%rip),%rdx + + movb %r14b,%r13b + andq $0xf,%r14 + vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 + shlq $5,%r14 + vmovapd (%rdx,%r14), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 + + shrb $4,%r13b + andq $0xf,%r13 + vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 + shlq $5,%r13 + vmovapd (%rdx,%r13), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 + + movb %r12b,%r11b + andq $0xf,%r12 + vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 + shlq $5,%r12 + vmovapd (%rdx,%r12), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 + + shrb $4,%r11b + andq $0xf,%r11 + vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 + shlq $5,%r11 + vmovapd (%rdx,%r11), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 + + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 + + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm5,32(%rdi) + vmovdqu %ymm6,64(%rdi) + vmovdqu %ymm7,96(%rdi) + vmovdqu %ymm8,128(%rdi) + + vzeroupper + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lossl_rsaz_amm52x20_x1_avxifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x20_x1_avxifma256, .-ossl_rsaz_amm52x20_x1_avxifma256 +.section .rodata +.align 32 +.Lmask52x4: +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.Lhigh64x3: +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.Lkmasklut: + +.quad 0x0 +.quad 0x0 +.quad 0x0 +.quad 0x0 + +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0x0 +.quad 0x0 + +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0x0 + +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0x0 + +.quad 0x0 +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0x0 + +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0x0 + +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0x0 + +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0x0 + +.quad 0x0 +.quad 0x0 +.quad 0x0 +.quad 0xffffffffffffffff + +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0x0 +.quad 0xffffffffffffffff + +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0xffffffffffffffff + +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0xffffffffffffffff + +.quad 0x0 +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff + +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff + +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff + +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.text + +.globl ossl_rsaz_amm52x20_x2_avxifma256 +.type ossl_rsaz_amm52x20_x2_avxifma256,@function +.align 32 +ossl_rsaz_amm52x20_x2_avxifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lossl_rsaz_amm52x20_x2_avxifma256_body: + + + vpxor %ymm0,%ymm0,%ymm0 + vmovapd %ymm0,%ymm3 + vmovapd %ymm0,%ymm5 + vmovapd %ymm0,%ymm6 + vmovapd %ymm0,%ymm7 + vmovapd %ymm0,%ymm8 + vmovapd %ymm0,%ymm4 + vmovapd %ymm0,%ymm9 + vmovapd %ymm0,%ymm10 + vmovapd %ymm0,%ymm11 + vmovapd %ymm0,%ymm12 + + xorl %r9d,%r9d + xorl %r15d,%r15d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + movl $20,%ebx + +.align 32 +.Lloop20: + movq 0(%r11),%r13 + + vpbroadcastq 0(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq (%r8),%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -168(%rsp),%rsp +{vex} vpmadd52luq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52luq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52luq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52luq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52luq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52luq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52luq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52luq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52luq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52luq 128(%rcx), %ymm2, %ymm8 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm5,32(%rsp) + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vmovdqu %ymm8,128(%rsp) + movq $0,160(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm5 + vmovdqu 72(%rsp),%ymm6 + vmovdqu 104(%rsp),%ymm7 + vmovdqu 136(%rsp),%ymm8 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi), %ymm1, %ymm3 +{vex} vpmadd52huq 32(%rsi), %ymm1, %ymm5 +{vex} vpmadd52huq 64(%rsi), %ymm1, %ymm6 +{vex} vpmadd52huq 96(%rsi), %ymm1, %ymm7 +{vex} vpmadd52huq 128(%rsi), %ymm1, %ymm8 + +{vex} vpmadd52huq 0(%rcx), %ymm2, %ymm3 +{vex} vpmadd52huq 32(%rcx), %ymm2, %ymm5 +{vex} vpmadd52huq 64(%rcx), %ymm2, %ymm6 +{vex} vpmadd52huq 96(%rcx), %ymm2, %ymm7 +{vex} vpmadd52huq 128(%rcx), %ymm2, %ymm8 + leaq 168(%rsp),%rsp + movq 160(%r11),%r13 + + vpbroadcastq 160(%r11),%ymm1 + movq 160(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + movq %r12,%r10 + adcq $0,%r10 + + movq 8(%r8),%r13 + imulq %r15,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 160(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + adcq %r12,%r10 + + shrq $52,%r15 + salq $12,%r10 + orq %r10,%r15 + + leaq -168(%rsp),%rsp +{vex} vpmadd52luq 160(%rsi), %ymm1, %ymm4 +{vex} vpmadd52luq 192(%rsi), %ymm1, %ymm9 +{vex} vpmadd52luq 224(%rsi), %ymm1, %ymm10 +{vex} vpmadd52luq 256(%rsi), %ymm1, %ymm11 +{vex} vpmadd52luq 288(%rsi), %ymm1, %ymm12 + +{vex} vpmadd52luq 160(%rcx), %ymm2, %ymm4 +{vex} vpmadd52luq 192(%rcx), %ymm2, %ymm9 +{vex} vpmadd52luq 224(%rcx), %ymm2, %ymm10 +{vex} vpmadd52luq 256(%rcx), %ymm2, %ymm11 +{vex} vpmadd52luq 288(%rcx), %ymm2, %ymm12 + + + vmovdqu %ymm4,0(%rsp) + vmovdqu %ymm9,32(%rsp) + vmovdqu %ymm10,64(%rsp) + vmovdqu %ymm11,96(%rsp) + vmovdqu %ymm12,128(%rsp) + movq $0,160(%rsp) + + vmovdqu 8(%rsp),%ymm4 + vmovdqu 40(%rsp),%ymm9 + vmovdqu 72(%rsp),%ymm10 + vmovdqu 104(%rsp),%ymm11 + vmovdqu 136(%rsp),%ymm12 + + addq 8(%rsp),%r15 + +{vex} vpmadd52huq 160(%rsi), %ymm1, %ymm4 +{vex} vpmadd52huq 192(%rsi), %ymm1, %ymm9 +{vex} vpmadd52huq 224(%rsi), %ymm1, %ymm10 +{vex} vpmadd52huq 256(%rsi), %ymm1, %ymm11 +{vex} vpmadd52huq 288(%rsi), %ymm1, %ymm12 + +{vex} vpmadd52huq 160(%rcx), %ymm2, %ymm4 +{vex} vpmadd52huq 192(%rcx), %ymm2, %ymm9 +{vex} vpmadd52huq 224(%rcx), %ymm2, %ymm10 +{vex} vpmadd52huq 256(%rcx), %ymm2, %ymm11 +{vex} vpmadd52huq 288(%rcx), %ymm2, %ymm12 + leaq 168(%rsp),%rsp + leaq 8(%r11),%r11 + decl %ebx + jne .Lloop20 + + vmovq %r9,%xmm0 + vpbroadcastq %xmm0,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm5,%ymm1 + vpsrlq $52,%ymm6,%ymm2 + vpsrlq $52,%ymm7,%ymm13 + vpsrlq $52,%ymm8,%ymm14 + + + vpermq $144,%ymm14,%ymm14 + vpermq $3,%ymm13,%ymm15 + vblendpd $1,%ymm15,%ymm14,%ymm14 + + vpermq $144,%ymm13,%ymm13 + vpermq $3,%ymm2,%ymm15 + vblendpd $1,%ymm15,%ymm13,%ymm13 + + vpermq $144,%ymm2,%ymm2 + vpermq $3,%ymm1,%ymm15 + vblendpd $1,%ymm15,%ymm2,%ymm2 + + vpermq $144,%ymm1,%ymm1 + vpermq $3,%ymm0,%ymm15 + vblendpd $1,%ymm15,%ymm1,%ymm1 + + vpermq $144,%ymm0,%ymm0 + vpand .Lhigh64x3(%rip),%ymm0,%ymm0 + + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm5,%ymm5 + vpaddq %ymm2,%ymm6,%ymm6 + vpaddq %ymm13,%ymm7,%ymm7 + vpaddq %ymm14,%ymm8,%ymm8 + + + + vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm1 + vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm2 + vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13 + vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm14 + vmovmskpd %ymm0,%r14d + vmovmskpd %ymm1,%r13d + vmovmskpd %ymm2,%r12d + vmovmskpd %ymm13,%r11d + vmovmskpd %ymm14,%r10d + + + vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm1 + vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm2 + vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13 + vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm14 + vmovmskpd %ymm0,%r9d + vmovmskpd %ymm1,%r8d + vmovmskpd %ymm2,%ebx + vmovmskpd %ymm13,%ecx + vmovmskpd %ymm14,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + leaq .Lkmasklut(%rip),%rdx + + movb %r14b,%r13b + andq $0xf,%r14 + vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 + shlq $5,%r14 + vmovapd (%rdx,%r14), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 + + shrb $4,%r13b + andq $0xf,%r13 + vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 + shlq $5,%r13 + vmovapd (%rdx,%r13), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 + + movb %r12b,%r11b + andq $0xf,%r12 + vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 + shlq $5,%r12 + vmovapd (%rdx,%r12), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 + + shrb $4,%r11b + andq $0xf,%r11 + vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 + shlq $5,%r11 + vmovapd (%rdx,%r11), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 + + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 + + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + + vmovq %r15,%xmm0 + vpbroadcastq %xmm0,%ymm0 + vpblendd $3,%ymm0,%ymm4,%ymm4 + + + + vpsrlq $52,%ymm4,%ymm0 + vpsrlq $52,%ymm9,%ymm1 + vpsrlq $52,%ymm10,%ymm2 + vpsrlq $52,%ymm11,%ymm13 + vpsrlq $52,%ymm12,%ymm14 + + + vpermq $144,%ymm14,%ymm14 + vpermq $3,%ymm13,%ymm15 + vblendpd $1,%ymm15,%ymm14,%ymm14 + + vpermq $144,%ymm13,%ymm13 + vpermq $3,%ymm2,%ymm15 + vblendpd $1,%ymm15,%ymm13,%ymm13 + + vpermq $144,%ymm2,%ymm2 + vpermq $3,%ymm1,%ymm15 + vblendpd $1,%ymm15,%ymm2,%ymm2 + + vpermq $144,%ymm1,%ymm1 + vpermq $3,%ymm0,%ymm15 + vblendpd $1,%ymm15,%ymm1,%ymm1 + + vpermq $144,%ymm0,%ymm0 + vpand .Lhigh64x3(%rip),%ymm0,%ymm0 + + + vpand .Lmask52x4(%rip),%ymm4,%ymm4 + vpand .Lmask52x4(%rip),%ymm9,%ymm9 + vpand .Lmask52x4(%rip),%ymm10,%ymm10 + vpand .Lmask52x4(%rip),%ymm11,%ymm11 + vpand .Lmask52x4(%rip),%ymm12,%ymm12 + + + vpaddq %ymm0,%ymm4,%ymm4 + vpaddq %ymm1,%ymm9,%ymm9 + vpaddq %ymm2,%ymm10,%ymm10 + vpaddq %ymm13,%ymm11,%ymm11 + vpaddq %ymm14,%ymm12,%ymm12 + + + + vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm0 + vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm1 + vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm2 + vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13 + vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm14 + vmovmskpd %ymm0,%r14d + vmovmskpd %ymm1,%r13d + vmovmskpd %ymm2,%r12d + vmovmskpd %ymm13,%r11d + vmovmskpd %ymm14,%r10d + + + vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm0 + vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm1 + vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm2 + vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13 + vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm14 + vmovmskpd %ymm0,%r9d + vmovmskpd %ymm1,%r8d + vmovmskpd %ymm2,%ebx + vmovmskpd %ymm13,%ecx + vmovmskpd %ymm14,%edx + + + + shlb $4,%r13b + orb %r13b,%r14b + shlb $4,%r11b + orb %r11b,%r12b + + addb %r14b,%r14b + adcb %r12b,%r12b + adcb %r10b,%r10b + + shlb $4,%r8b + orb %r8b,%r9b + shlb $4,%cl + orb %cl,%bl + + addb %r9b,%r14b + adcb %bl,%r12b + adcb %dl,%r10b + + xorb %r9b,%r14b + xorb %bl,%r12b + xorb %dl,%r10b + + leaq .Lkmasklut(%rip),%rdx + + movb %r14b,%r13b + andq $0xf,%r14 + vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 + shlq $5,%r14 + vmovapd (%rdx,%r14), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm4,%ymm4 + + shrb $4,%r13b + andq $0xf,%r13 + vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 + shlq $5,%r13 + vmovapd (%rdx,%r13), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 + + movb %r12b,%r11b + andq $0xf,%r12 + vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 + shlq $5,%r12 + vmovapd (%rdx,%r12), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 + + shrb $4,%r11b + andq $0xf,%r11 + vpsubq .Lmask52x4(%rip),%ymm11,%ymm0 + shlq $5,%r11 + vmovapd (%rdx,%r11), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm11,%ymm11 + + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm12,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10), %ymm2 + vblendvpd %ymm2,%ymm0,%ymm12,%ymm12 + + + vpand .Lmask52x4(%rip),%ymm4,%ymm4 + vpand .Lmask52x4(%rip),%ymm9,%ymm9 + vpand .Lmask52x4(%rip),%ymm10,%ymm10 + vpand .Lmask52x4(%rip),%ymm11,%ymm11 + vpand .Lmask52x4(%rip),%ymm12,%ymm12 + + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm5,32(%rdi) + vmovdqu %ymm6,64(%rdi) + vmovdqu %ymm7,96(%rdi) + vmovdqu %ymm8,128(%rdi) + + vmovdqu %ymm4,160(%rdi) + vmovdqu %ymm9,192(%rdi) + vmovdqu %ymm10,224(%rdi) + vmovdqu %ymm11,256(%rdi) + vmovdqu %ymm12,288(%rdi) + + vzeroupper + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lossl_rsaz_amm52x20_x2_avxifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x20_x2_avxifma256, .-ossl_rsaz_amm52x20_x2_avxifma256 +.text + +.align 32 +.globl ossl_extract_multiplier_2x20_win5_avx +.type ossl_extract_multiplier_2x20_win5_avx,@function +ossl_extract_multiplier_2x20_win5_avx: +.cfi_startproc +.byte 243,15,30,250 + vmovapd .Lones(%rip),%ymm14 + vmovq %rdx,%xmm10 + vpbroadcastq %xmm10,%ymm12 + vmovq %rcx,%xmm10 + vpbroadcastq %xmm10,%ymm13 + leaq 10240(%rsi),%rax + + + vpxor %xmm0,%xmm0,%xmm0 + vmovapd %ymm0,%ymm11 + vmovapd %ymm0,%ymm1 + vmovapd %ymm0,%ymm2 + vmovapd %ymm0,%ymm3 + vmovapd %ymm0,%ymm4 + vmovapd %ymm0,%ymm5 + vmovapd %ymm0,%ymm6 + vmovapd %ymm0,%ymm7 + vmovapd %ymm0,%ymm8 + vmovapd %ymm0,%ymm9 + +.align 32 +.Lloop: + vpcmpeqq %ymm11,%ymm12,%ymm15 + vmovdqu 0(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm0,%ymm0 + vmovdqu 32(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm2,%ymm2 + vmovdqu 96(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm3,%ymm3 + vmovdqu 128(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm4,%ymm4 + vpcmpeqq %ymm11,%ymm13,%ymm15 + vmovdqu 160(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm5,%ymm5 + vmovdqu 192(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm6,%ymm6 + vmovdqu 224(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm7,%ymm7 + vmovdqu 256(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm8,%ymm8 + vmovdqu 288(%rsi),%ymm10 + vblendvpd %ymm15,%ymm10,%ymm9,%ymm9 + vpaddq %ymm14,%ymm11,%ymm11 + addq $320,%rsi + cmpq %rsi,%rax + jne .Lloop + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm1,32(%rdi) + vmovdqu %ymm2,64(%rdi) + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,128(%rdi) + vmovdqu %ymm5,160(%rdi) + vmovdqu %ymm6,192(%rdi) + vmovdqu %ymm7,224(%rdi) + vmovdqu %ymm8,256(%rdi) + vmovdqu %ymm9,288(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_extract_multiplier_2x20_win5_avx, .-ossl_extract_multiplier_2x20_win5_avx +.section .rodata +.align 32 +.Lones: +.quad 1,1,1,1 +.Lzeros: +.quad 0,0,0,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: |
