Diffstat (limited to 'crypto/bn/rsaz-3k-avx512.s')
-rw-r--r--  crypto/bn/rsaz-3k-avx512.s  1331
1 file changed, 1331 insertions, 0 deletions
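The new file is pre-generated x86_64 assembly (OpenSSL builds it from the rsaz-3k-avx512.pl perlasm module) for the RSA-3072 code path: ossl_rsaz_amm52x30_x1_ifma256 performs one almost-Montgomery multiplication on operands held as 30 limbs of 52 bits, ossl_rsaz_amm52x30_x2_ifma256 runs two such multiplications back to back, and ossl_extract_multiplier_2x30_win5 does a constant-time table lookup for window-5 exponentiation. The inner loops are built on the AVX-512 IFMA instructions vpmadd52luq/vpmadd52huq. As a minimal hedged sketch (not OpenSSL code; madd52lo/madd52hi are illustrative names, and unsigned __int128 is a GCC/Clang extension), the per-lane effect of those two instructions on 52-bit limbs is:

#include <stdint.h>

#define MASK52 ((1ULL << 52) - 1)

/* vpmadd52luq: dst += low 52 bits of a*b, using only the low 52 bits of
 * each source qword.  vpmadd52huq: dst += high 52 bits of the same
 * 104-bit product.  Each call models one 64-bit lane of the ymm op. */
static inline uint64_t madd52lo(uint64_t dst, uint64_t a, uint64_t b)
{
    unsigned __int128 p = (unsigned __int128)(a & MASK52) * (b & MASK52);
    return dst + ((uint64_t)p & MASK52);
}

static inline uint64_t madd52hi(uint64_t dst, uint64_t a, uint64_t b)
{
    unsigned __int128 p = (unsigned __int128)(a & MASK52) * (b & MASK52);
    return dst + (uint64_t)(p >> 52);
}

Keeping limbs at 52 bits inside 64-bit lanes leaves headroom to accumulate several partial products before carrying; the vpsrlq/valignq/vpandq blocks near the end of each routine are the deferred carry-propagation pass.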
diff --git a/crypto/bn/rsaz-3k-avx512.s b/crypto/bn/rsaz-3k-avx512.s new file mode 100644 index 000000000000..022574148e8f --- /dev/null +++ b/crypto/bn/rsaz-3k-avx512.s @@ -0,0 +1,1331 @@ +.text + +.globl ossl_rsaz_amm52x30_x1_ifma256 +.type ossl_rsaz_amm52x30_x1_ifma256,@function +.align 32 +ossl_rsaz_amm52x30_x1_ifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm6 + vmovdqa64 %ymm0,%ymm7 + vmovdqa64 %ymm0,%ymm8 + vmovdqa64 %ymm0,%ymm9 + vmovdqa64 %ymm0,%ymm10 + + xorl %r9d,%r9d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + + movl $7,%ebx + +.align 32 +.Lloop7: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 8(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 
64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 16(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 24(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + 
vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + leaq 32(%r11),%r11 + decl %ebx + jne .Lloop7 + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 8(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 
0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + vpbroadcastq %r9,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm19 + vpsrlq $52,%ymm7,%ymm20 + vpsrlq $52,%ymm8,%ymm21 + vpsrlq $52,%ymm9,%ymm22 + vpsrlq $52,%ymm10,%ymm23 + + + valignq $3,%ymm22,%ymm23,%ymm23 + valignq $3,%ymm21,%ymm22,%ymm22 + valignq $3,%ymm20,%ymm21,%ymm21 + valignq $3,%ymm19,%ymm20,%ymm20 + valignq $3,%ymm2,%ymm19,%ymm19 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm19,%ymm6,%ymm6 + vpaddq %ymm20,%ymm7,%ymm7 + vpaddq %ymm21,%ymm8,%ymm8 + vpaddq %ymm22,%ymm9,%ymm9 + vpaddq %ymm23,%ymm10,%ymm10 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb 
%cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1} + vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2} + vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3} + vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4} + vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5} + vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6} + vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7} + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + + shrb $4,%r11b + kmovb %r11d,%k1 + + vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1} + + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + + vmovdqu64 %ymm3,0(%rdi) + vmovdqu64 %ymm4,32(%rdi) + vmovdqu64 %ymm5,64(%rdi) + vmovdqu64 %ymm6,96(%rdi) + vmovdqu64 %ymm7,128(%rdi) + vmovdqu64 %ymm8,160(%rdi) + vmovdqu64 %ymm9,192(%rdi) + vmovdqu64 %ymm10,224(%rdi) + + vzeroupper + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + movq 0(%rax),%r15 +.cfi_restore %r15 + movq 8(%rax),%r14 +.cfi_restore %r14 + movq 16(%rax),%r13 +.cfi_restore %r13 + movq 24(%rax),%r12 +.cfi_restore %r12 + movq 32(%rax),%rbp +.cfi_restore %rbp + movq 40(%rax),%rbx +.cfi_restore %rbx + leaq 48(%rax),%rsp +.cfi_def_cfa %rsp,8 +.Lossl_rsaz_amm52x30_x1_ifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256 +.section .rodata +.align 32 +.Lmask52x4: +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.text + +.globl ossl_rsaz_amm52x30_x2_ifma256 +.type ossl_rsaz_amm52x30_x2_ifma256,@function +.align 32 +ossl_rsaz_amm52x30_x2_ifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + vpxord %ymm0,%ymm0,%ymm0 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm6 + vmovdqa64 %ymm0,%ymm7 + vmovdqa64 %ymm0,%ymm8 + vmovdqa64 %ymm0,%ymm9 + vmovdqa64 %ymm0,%ymm10 + + vmovdqa64 %ymm0,%ymm11 + vmovdqa64 %ymm0,%ymm12 + vmovdqa64 %ymm0,%ymm13 + vmovdqa64 %ymm0,%ymm14 + vmovdqa64 %ymm0,%ymm15 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + + + xorl %r9d,%r9d + xorl %r15d,%r15d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + movl $30,%ebx + +.align 32 +.Lloop30: + movq 0(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq (%r8),%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + vpmadd52luq 0(%rsi),%ymm1,%ymm3 + vpmadd52luq 32(%rsi),%ymm1,%ymm4 + vpmadd52luq 64(%rsi),%ymm1,%ymm5 + vpmadd52luq 96(%rsi),%ymm1,%ymm6 + vpmadd52luq 128(%rsi),%ymm1,%ymm7 + vpmadd52luq 160(%rsi),%ymm1,%ymm8 + vpmadd52luq 192(%rsi),%ymm1,%ymm9 + vpmadd52luq 224(%rsi),%ymm1,%ymm10 + + vpmadd52luq 
0(%rcx),%ymm2,%ymm3 + vpmadd52luq 32(%rcx),%ymm2,%ymm4 + vpmadd52luq 64(%rcx),%ymm2,%ymm5 + vpmadd52luq 96(%rcx),%ymm2,%ymm6 + vpmadd52luq 128(%rcx),%ymm2,%ymm7 + vpmadd52luq 160(%rcx),%ymm2,%ymm8 + vpmadd52luq 192(%rcx),%ymm2,%ymm9 + vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + valignq $1,%ymm3,%ymm4,%ymm3 + valignq $1,%ymm4,%ymm5,%ymm4 + valignq $1,%ymm5,%ymm6,%ymm5 + valignq $1,%ymm6,%ymm7,%ymm6 + valignq $1,%ymm7,%ymm8,%ymm7 + valignq $1,%ymm8,%ymm9,%ymm8 + valignq $1,%ymm9,%ymm10,%ymm9 + valignq $1,%ymm10,%ymm0,%ymm10 + + vmovq %xmm3,%r13 + addq %r13,%r9 + + vpmadd52huq 0(%rsi),%ymm1,%ymm3 + vpmadd52huq 32(%rsi),%ymm1,%ymm4 + vpmadd52huq 64(%rsi),%ymm1,%ymm5 + vpmadd52huq 96(%rsi),%ymm1,%ymm6 + vpmadd52huq 128(%rsi),%ymm1,%ymm7 + vpmadd52huq 160(%rsi),%ymm1,%ymm8 + vpmadd52huq 192(%rsi),%ymm1,%ymm9 + vpmadd52huq 224(%rsi),%ymm1,%ymm10 + + vpmadd52huq 0(%rcx),%ymm2,%ymm3 + vpmadd52huq 32(%rcx),%ymm2,%ymm4 + vpmadd52huq 64(%rcx),%ymm2,%ymm5 + vpmadd52huq 96(%rcx),%ymm2,%ymm6 + vpmadd52huq 128(%rcx),%ymm2,%ymm7 + vpmadd52huq 160(%rcx),%ymm2,%ymm8 + vpmadd52huq 192(%rcx),%ymm2,%ymm9 + vpmadd52huq 224(%rcx),%ymm2,%ymm10 + movq 256(%r11),%r13 + + vpbroadcastq %r13,%ymm1 + movq 256(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + movq %r12,%r10 + adcq $0,%r10 + + movq 8(%r8),%r13 + imulq %r15,%r13 + andq %rax,%r13 + + vpbroadcastq %r13,%ymm2 + movq 256(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r15 + adcq %r12,%r10 + + shrq $52,%r15 + salq $12,%r10 + orq %r10,%r15 + + vpmadd52luq 256(%rsi),%ymm1,%ymm11 + vpmadd52luq 288(%rsi),%ymm1,%ymm12 + vpmadd52luq 320(%rsi),%ymm1,%ymm13 + vpmadd52luq 352(%rsi),%ymm1,%ymm14 + vpmadd52luq 384(%rsi),%ymm1,%ymm15 + vpmadd52luq 416(%rsi),%ymm1,%ymm16 + vpmadd52luq 448(%rsi),%ymm1,%ymm17 + vpmadd52luq 480(%rsi),%ymm1,%ymm18 + + vpmadd52luq 256(%rcx),%ymm2,%ymm11 + vpmadd52luq 288(%rcx),%ymm2,%ymm12 + vpmadd52luq 320(%rcx),%ymm2,%ymm13 + vpmadd52luq 352(%rcx),%ymm2,%ymm14 + vpmadd52luq 384(%rcx),%ymm2,%ymm15 + vpmadd52luq 416(%rcx),%ymm2,%ymm16 + vpmadd52luq 448(%rcx),%ymm2,%ymm17 + vpmadd52luq 480(%rcx),%ymm2,%ymm18 + + + valignq $1,%ymm11,%ymm12,%ymm11 + valignq $1,%ymm12,%ymm13,%ymm12 + valignq $1,%ymm13,%ymm14,%ymm13 + valignq $1,%ymm14,%ymm15,%ymm14 + valignq $1,%ymm15,%ymm16,%ymm15 + valignq $1,%ymm16,%ymm17,%ymm16 + valignq $1,%ymm17,%ymm18,%ymm17 + valignq $1,%ymm18,%ymm0,%ymm18 + + vmovq %xmm11,%r13 + addq %r13,%r15 + + vpmadd52huq 256(%rsi),%ymm1,%ymm11 + vpmadd52huq 288(%rsi),%ymm1,%ymm12 + vpmadd52huq 320(%rsi),%ymm1,%ymm13 + vpmadd52huq 352(%rsi),%ymm1,%ymm14 + vpmadd52huq 384(%rsi),%ymm1,%ymm15 + vpmadd52huq 416(%rsi),%ymm1,%ymm16 + vpmadd52huq 448(%rsi),%ymm1,%ymm17 + vpmadd52huq 480(%rsi),%ymm1,%ymm18 + + vpmadd52huq 256(%rcx),%ymm2,%ymm11 + vpmadd52huq 288(%rcx),%ymm2,%ymm12 + vpmadd52huq 320(%rcx),%ymm2,%ymm13 + vpmadd52huq 352(%rcx),%ymm2,%ymm14 + vpmadd52huq 384(%rcx),%ymm2,%ymm15 + vpmadd52huq 416(%rcx),%ymm2,%ymm16 + vpmadd52huq 448(%rcx),%ymm2,%ymm17 + vpmadd52huq 480(%rcx),%ymm2,%ymm18 + leaq 8(%r11),%r11 + decl %ebx + jne .Lloop30 + + vpbroadcastq %r9,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm19 + vpsrlq $52,%ymm7,%ymm20 + vpsrlq $52,%ymm8,%ymm21 + vpsrlq $52,%ymm9,%ymm22 + vpsrlq $52,%ymm10,%ymm23 + + + valignq $3,%ymm22,%ymm23,%ymm23 + valignq $3,%ymm21,%ymm22,%ymm22 + valignq $3,%ymm20,%ymm21,%ymm21 + valignq $3,%ymm19,%ymm20,%ymm20 + valignq $3,%ymm2,%ymm19,%ymm19 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + 
valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm19,%ymm6,%ymm6 + vpaddq %ymm20,%ymm7,%ymm7 + vpaddq %ymm21,%ymm8,%ymm8 + vpaddq %ymm22,%ymm9,%ymm9 + vpaddq %ymm23,%ymm10,%ymm10 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1} + vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2} + vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3} + vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4} + vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5} + vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6} + vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7} + + vpandq .Lmask52x4(%rip),%ymm3,%ymm3 + vpandq .Lmask52x4(%rip),%ymm4,%ymm4 + vpandq .Lmask52x4(%rip),%ymm5,%ymm5 + vpandq .Lmask52x4(%rip),%ymm6,%ymm6 + vpandq .Lmask52x4(%rip),%ymm7,%ymm7 + vpandq .Lmask52x4(%rip),%ymm8,%ymm8 + vpandq .Lmask52x4(%rip),%ymm9,%ymm9 + + shrb $4,%r11b + kmovb %r11d,%k1 + + vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1} + + vpandq .Lmask52x4(%rip),%ymm10,%ymm10 + + vpbroadcastq %r15,%ymm0 + vpblendd $3,%ymm0,%ymm11,%ymm11 + + + + vpsrlq $52,%ymm11,%ymm0 + vpsrlq $52,%ymm12,%ymm1 + vpsrlq $52,%ymm13,%ymm2 + vpsrlq $52,%ymm14,%ymm19 + vpsrlq $52,%ymm15,%ymm20 + vpsrlq $52,%ymm16,%ymm21 + vpsrlq $52,%ymm17,%ymm22 + vpsrlq $52,%ymm18,%ymm23 + + + valignq $3,%ymm22,%ymm23,%ymm23 + valignq $3,%ymm21,%ymm22,%ymm22 + valignq $3,%ymm20,%ymm21,%ymm21 + valignq $3,%ymm19,%ymm20,%ymm20 + valignq $3,%ymm2,%ymm19,%ymm19 + valignq $3,%ymm1,%ymm2,%ymm2 + valignq $3,%ymm0,%ymm1,%ymm1 + valignq $3,.Lzeros(%rip),%ymm0,%ymm0 + + + vpandq .Lmask52x4(%rip),%ymm11,%ymm11 + vpandq .Lmask52x4(%rip),%ymm12,%ymm12 + vpandq .Lmask52x4(%rip),%ymm13,%ymm13 + vpandq 
.Lmask52x4(%rip),%ymm14,%ymm14 + vpandq .Lmask52x4(%rip),%ymm15,%ymm15 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + + + vpaddq %ymm0,%ymm11,%ymm11 + vpaddq %ymm1,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpaddq %ymm19,%ymm14,%ymm14 + vpaddq %ymm20,%ymm15,%ymm15 + vpaddq %ymm21,%ymm16,%ymm16 + vpaddq %ymm22,%ymm17,%ymm17 + vpaddq %ymm23,%ymm18,%ymm18 + + + + vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2 + kmovb %k1,%r14d + kmovb %k2,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm13,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm14,%k2 + kmovb %k1,%r13d + kmovb %k2,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm15,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2 + kmovb %k1,%r12d + kmovb %k2,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k1 + vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k2 + kmovb %k1,%r11d + kmovb %k2,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2 + kmovb %k1,%r9d + kmovb %k2,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm13,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm14,%k2 + kmovb %k1,%r8d + kmovb %k2,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpuq $0,.Lmask52x4(%rip),%ymm15,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2 + kmovb %k1,%edx + kmovb %k2,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k1 + vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k2 + kmovb %k1,%ecx + kmovb %k2,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + kmovb %r14d,%k1 + shrb $4,%r14b + kmovb %r14d,%k2 + kmovb %r13d,%k3 + shrb $4,%r13b + kmovb %r13d,%k4 + kmovb %r12d,%k5 + shrb $4,%r12b + kmovb %r12d,%k6 + kmovb %r11d,%k7 + + vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k1} + vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k2} + vpsubq .Lmask52x4(%rip),%ymm13,%ymm13{%k3} + vpsubq .Lmask52x4(%rip),%ymm14,%ymm14{%k4} + vpsubq .Lmask52x4(%rip),%ymm15,%ymm15{%k5} + vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k6} + vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k7} + + vpandq .Lmask52x4(%rip),%ymm11,%ymm11 + vpandq .Lmask52x4(%rip),%ymm12,%ymm12 + vpandq .Lmask52x4(%rip),%ymm13,%ymm13 + vpandq .Lmask52x4(%rip),%ymm14,%ymm14 + vpandq .Lmask52x4(%rip),%ymm15,%ymm15 + vpandq .Lmask52x4(%rip),%ymm16,%ymm16 + vpandq .Lmask52x4(%rip),%ymm17,%ymm17 + + shrb $4,%r11b + kmovb %r11d,%k1 + + vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k1} + + vpandq .Lmask52x4(%rip),%ymm18,%ymm18 + + vmovdqu64 %ymm3,0(%rdi) + vmovdqu64 %ymm4,32(%rdi) + vmovdqu64 %ymm5,64(%rdi) + vmovdqu64 %ymm6,96(%rdi) + vmovdqu64 %ymm7,128(%rdi) + vmovdqu64 %ymm8,160(%rdi) + vmovdqu64 %ymm9,192(%rdi) + vmovdqu64 %ymm10,224(%rdi) + + vmovdqu64 %ymm11,256(%rdi) + vmovdqu64 %ymm12,288(%rdi) + vmovdqu64 %ymm13,320(%rdi) + vmovdqu64 %ymm14,352(%rdi) + vmovdqu64 %ymm15,384(%rdi) + vmovdqu64 %ymm16,416(%rdi) + vmovdqu64 %ymm17,448(%rdi) + vmovdqu64 %ymm18,480(%rdi) + + vzeroupper + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + movq 0(%rax),%r15 +.cfi_restore %r15 + movq 8(%rax),%r14 +.cfi_restore %r14 + movq 16(%rax),%r13 +.cfi_restore %r13 + movq 24(%rax),%r12 +.cfi_restore %r12 + movq 32(%rax),%rbp +.cfi_restore %rbp + movq 40(%rax),%rbx +.cfi_restore %rbx + leaq 48(%rax),%rsp +.cfi_def_cfa 
%rsp,8 +.Lossl_rsaz_amm52x30_x2_ifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256 +.text + +.align 32 +.globl ossl_extract_multiplier_2x30_win5 +.type ossl_extract_multiplier_2x30_win5,@function +ossl_extract_multiplier_2x30_win5: +.cfi_startproc +.byte 243,15,30,250 + vmovdqa64 .Lones(%rip),%ymm30 + vpbroadcastq %rdx,%ymm28 + vpbroadcastq %rcx,%ymm29 + leaq 16384(%rsi),%rax + + + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa64 %ymm0,%ymm27 + vmovdqa64 %ymm0,%ymm1 + vmovdqa64 %ymm0,%ymm2 + vmovdqa64 %ymm0,%ymm3 + vmovdqa64 %ymm0,%ymm4 + vmovdqa64 %ymm0,%ymm5 + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm0,%ymm17 + vmovdqa64 %ymm0,%ymm18 + vmovdqa64 %ymm0,%ymm19 + vmovdqa64 %ymm0,%ymm20 + vmovdqa64 %ymm0,%ymm21 + vmovdqa64 %ymm0,%ymm22 + vmovdqa64 %ymm0,%ymm23 + vmovdqa64 %ymm0,%ymm24 + vmovdqa64 %ymm0,%ymm25 + +.align 32 +.Lloop: + vpcmpq $0,%ymm27,%ymm28,%k1 + vpcmpq $0,%ymm27,%ymm29,%k2 + vmovdqu64 0(%rsi),%ymm26 + vpblendmq %ymm26,%ymm0,%ymm0{%k1} + vmovdqu64 32(%rsi),%ymm26 + vpblendmq %ymm26,%ymm1,%ymm1{%k1} + vmovdqu64 64(%rsi),%ymm26 + vpblendmq %ymm26,%ymm2,%ymm2{%k1} + vmovdqu64 96(%rsi),%ymm26 + vpblendmq %ymm26,%ymm3,%ymm3{%k1} + vmovdqu64 128(%rsi),%ymm26 + vpblendmq %ymm26,%ymm4,%ymm4{%k1} + vmovdqu64 160(%rsi),%ymm26 + vpblendmq %ymm26,%ymm5,%ymm5{%k1} + vmovdqu64 192(%rsi),%ymm26 + vpblendmq %ymm26,%ymm16,%ymm16{%k1} + vmovdqu64 224(%rsi),%ymm26 + vpblendmq %ymm26,%ymm17,%ymm17{%k1} + vmovdqu64 256(%rsi),%ymm26 + vpblendmq %ymm26,%ymm18,%ymm18{%k2} + vmovdqu64 288(%rsi),%ymm26 + vpblendmq %ymm26,%ymm19,%ymm19{%k2} + vmovdqu64 320(%rsi),%ymm26 + vpblendmq %ymm26,%ymm20,%ymm20{%k2} + vmovdqu64 352(%rsi),%ymm26 + vpblendmq %ymm26,%ymm21,%ymm21{%k2} + vmovdqu64 384(%rsi),%ymm26 + vpblendmq %ymm26,%ymm22,%ymm22{%k2} + vmovdqu64 416(%rsi),%ymm26 + vpblendmq %ymm26,%ymm23,%ymm23{%k2} + vmovdqu64 448(%rsi),%ymm26 + vpblendmq %ymm26,%ymm24,%ymm24{%k2} + vmovdqu64 480(%rsi),%ymm26 + vpblendmq %ymm26,%ymm25,%ymm25{%k2} + vpaddq %ymm30,%ymm27,%ymm27 + addq $512,%rsi + cmpq %rsi,%rax + jne .Lloop + vmovdqu64 %ymm0,0(%rdi) + vmovdqu64 %ymm1,32(%rdi) + vmovdqu64 %ymm2,64(%rdi) + vmovdqu64 %ymm3,96(%rdi) + vmovdqu64 %ymm4,128(%rdi) + vmovdqu64 %ymm5,160(%rdi) + vmovdqu64 %ymm16,192(%rdi) + vmovdqu64 %ymm17,224(%rdi) + vmovdqu64 %ymm18,256(%rdi) + vmovdqu64 %ymm19,288(%rdi) + vmovdqu64 %ymm20,320(%rdi) + vmovdqu64 %ymm21,352(%rdi) + vmovdqu64 %ymm22,384(%rdi) + vmovdqu64 %ymm23,416(%rdi) + vmovdqu64 %ymm24,448(%rdi) + vmovdqu64 %ymm25,480(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5 +.section .rodata +.align 32 +.Lones: +.quad 1,1,1,1 +.Lzeros: +.quad 0,0,0,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: |
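For the lookup routine at the end of the file: the table holds 32 entries (one per possible 5-bit window value), each 512 bytes, and ossl_extract_multiplier_2x30_win5 walks the whole 16 KiB table, blending in the matching entry with masked vpblendmq so the memory access pattern does not depend on the secret window index (the two indices in %rdx and %rcx select the first and second half of each entry via k1 and k2). A scalar equivalent, sketched under assumed names and a simplified one-index layout, looks roughly like this:

#include <stdint.h>

#define ENTRY_QWORDS 32   /* 30 x 52-bit limbs, padded to 32 qwords   */
#define TABLE_ENTRIES 32  /* 2^5 precomputed powers for a 5-bit window */

/* Constant-time selection: every table entry is read regardless of idx,
 * and the matching one is folded in with a branch-free mask. */
static void extract_win5(uint64_t out[ENTRY_QWORDS],
                         const uint64_t table[TABLE_ENTRIES][ENTRY_QWORDS],
                         uint64_t idx)
{
    for (int j = 0; j < ENTRY_QWORDS; j++)
        out[j] = 0;

    for (uint64_t i = 0; i < TABLE_ENTRIES; i++) {
        uint64_t diff = i ^ idx;
        /* mask is all-ones iff i == idx, all-zeros otherwise */
        uint64_t mask = ((diff | (0 - diff)) >> 63) - 1;

        for (int j = 0; j < ENTRY_QWORDS; j++)
            out[j] |= table[i][j] & mask;
    }
}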
