Diffstat (limited to 'crypto/bn/rsaz-3k-avx512.s')
-rw-r--r--  crypto/bn/rsaz-3k-avx512.s  1331
1 files changed, 1331 insertions, 0 deletions
diff --git a/crypto/bn/rsaz-3k-avx512.s b/crypto/bn/rsaz-3k-avx512.s
new file mode 100644
index 000000000000..022574148e8f
--- /dev/null
+++ b/crypto/bn/rsaz-3k-avx512.s
@@ -0,0 +1,1331 @@
+.text
+
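+# ossl_rsaz_amm52x30_x1_ifma256: one "almost" Montgomery multiplication (AMM)
+# on operands held as 30 x 52-bit digits, one digit per 64-bit lane, using the
+# AVX-512 IFMA instructions vpmadd52luq/vpmadd52huq on 256-bit vectors.
+# SysV arguments: rdi = result, rsi = a, rdx = b, rcx = modulus m,
+# r8 = k0 (only its low 52 bits are used by the masked imulq below).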
+.globl ossl_rsaz_amm52x30_x1_ifma256
+.type ossl_rsaz_amm52x30_x1_ifma256,@function
+.align 32
+ossl_rsaz_amm52x30_x1_ifma256:
+.cfi_startproc
+.byte 243,15,30,250
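+	# 0xf3,0x0f,0x1e,0xfa = endbr64 (Intel CET landing pad)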
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+ vpxord %ymm0,%ymm0,%ymm0
+ vmovdqa64 %ymm0,%ymm3
+ vmovdqa64 %ymm0,%ymm4
+ vmovdqa64 %ymm0,%ymm5
+ vmovdqa64 %ymm0,%ymm6
+ vmovdqa64 %ymm0,%ymm7
+ vmovdqa64 %ymm0,%ymm8
+ vmovdqa64 %ymm0,%ymm9
+ vmovdqa64 %ymm0,%ymm10
+
+ xorl %r9d,%r9d
+
+ movq %rdx,%r11
+ movq $0xfffffffffffff,%rax
+
+
+ movl $7,%ebx
+
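+# Each repeated block below folds one word of b, plus the matching multiple of
+# m derived via k0, into the accumulator ymm3-ymm10, then shifts the
+# accumulator down one digit with valignq; r9/r10 carry the scalar digit
+# between blocks.  The body is unrolled 4x, so 7 iterations of .Lloop7 consume
+# 28 of the 30 words of b.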
+.align 32
+.Lloop7:
+ movq 0(%r11),%r13
+
+ vpbroadcastq %r13,%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vpbroadcastq %r13,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ vpmadd52luq 0(%rsi),%ymm1,%ymm3
+ vpmadd52luq 32(%rsi),%ymm1,%ymm4
+ vpmadd52luq 64(%rsi),%ymm1,%ymm5
+ vpmadd52luq 96(%rsi),%ymm1,%ymm6
+ vpmadd52luq 128(%rsi),%ymm1,%ymm7
+ vpmadd52luq 160(%rsi),%ymm1,%ymm8
+ vpmadd52luq 192(%rsi),%ymm1,%ymm9
+ vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52luq 0(%rcx),%ymm2,%ymm3
+ vpmadd52luq 32(%rcx),%ymm2,%ymm4
+ vpmadd52luq 64(%rcx),%ymm2,%ymm5
+ vpmadd52luq 96(%rcx),%ymm2,%ymm6
+ vpmadd52luq 128(%rcx),%ymm2,%ymm7
+ vpmadd52luq 160(%rcx),%ymm2,%ymm8
+ vpmadd52luq 192(%rcx),%ymm2,%ymm9
+ vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ valignq $1,%ymm3,%ymm4,%ymm3
+ valignq $1,%ymm4,%ymm5,%ymm4
+ valignq $1,%ymm5,%ymm6,%ymm5
+ valignq $1,%ymm6,%ymm7,%ymm6
+ valignq $1,%ymm7,%ymm8,%ymm7
+ valignq $1,%ymm8,%ymm9,%ymm8
+ valignq $1,%ymm9,%ymm10,%ymm9
+ valignq $1,%ymm10,%ymm0,%ymm10
+
+ vmovq %xmm3,%r13
+ addq %r13,%r9
+
+ vpmadd52huq 0(%rsi),%ymm1,%ymm3
+ vpmadd52huq 32(%rsi),%ymm1,%ymm4
+ vpmadd52huq 64(%rsi),%ymm1,%ymm5
+ vpmadd52huq 96(%rsi),%ymm1,%ymm6
+ vpmadd52huq 128(%rsi),%ymm1,%ymm7
+ vpmadd52huq 160(%rsi),%ymm1,%ymm8
+ vpmadd52huq 192(%rsi),%ymm1,%ymm9
+ vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52huq 0(%rcx),%ymm2,%ymm3
+ vpmadd52huq 32(%rcx),%ymm2,%ymm4
+ vpmadd52huq 64(%rcx),%ymm2,%ymm5
+ vpmadd52huq 96(%rcx),%ymm2,%ymm6
+ vpmadd52huq 128(%rcx),%ymm2,%ymm7
+ vpmadd52huq 160(%rcx),%ymm2,%ymm8
+ vpmadd52huq 192(%rcx),%ymm2,%ymm9
+ vpmadd52huq 224(%rcx),%ymm2,%ymm10
+ movq 8(%r11),%r13
+
+ vpbroadcastq %r13,%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vpbroadcastq %r13,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ vpmadd52luq 0(%rsi),%ymm1,%ymm3
+ vpmadd52luq 32(%rsi),%ymm1,%ymm4
+ vpmadd52luq 64(%rsi),%ymm1,%ymm5
+ vpmadd52luq 96(%rsi),%ymm1,%ymm6
+ vpmadd52luq 128(%rsi),%ymm1,%ymm7
+ vpmadd52luq 160(%rsi),%ymm1,%ymm8
+ vpmadd52luq 192(%rsi),%ymm1,%ymm9
+ vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52luq 0(%rcx),%ymm2,%ymm3
+ vpmadd52luq 32(%rcx),%ymm2,%ymm4
+ vpmadd52luq 64(%rcx),%ymm2,%ymm5
+ vpmadd52luq 96(%rcx),%ymm2,%ymm6
+ vpmadd52luq 128(%rcx),%ymm2,%ymm7
+ vpmadd52luq 160(%rcx),%ymm2,%ymm8
+ vpmadd52luq 192(%rcx),%ymm2,%ymm9
+ vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ valignq $1,%ymm3,%ymm4,%ymm3
+ valignq $1,%ymm4,%ymm5,%ymm4
+ valignq $1,%ymm5,%ymm6,%ymm5
+ valignq $1,%ymm6,%ymm7,%ymm6
+ valignq $1,%ymm7,%ymm8,%ymm7
+ valignq $1,%ymm8,%ymm9,%ymm8
+ valignq $1,%ymm9,%ymm10,%ymm9
+ valignq $1,%ymm10,%ymm0,%ymm10
+
+ vmovq %xmm3,%r13
+ addq %r13,%r9
+
+ vpmadd52huq 0(%rsi),%ymm1,%ymm3
+ vpmadd52huq 32(%rsi),%ymm1,%ymm4
+ vpmadd52huq 64(%rsi),%ymm1,%ymm5
+ vpmadd52huq 96(%rsi),%ymm1,%ymm6
+ vpmadd52huq 128(%rsi),%ymm1,%ymm7
+ vpmadd52huq 160(%rsi),%ymm1,%ymm8
+ vpmadd52huq 192(%rsi),%ymm1,%ymm9
+ vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52huq 0(%rcx),%ymm2,%ymm3
+ vpmadd52huq 32(%rcx),%ymm2,%ymm4
+ vpmadd52huq 64(%rcx),%ymm2,%ymm5
+ vpmadd52huq 96(%rcx),%ymm2,%ymm6
+ vpmadd52huq 128(%rcx),%ymm2,%ymm7
+ vpmadd52huq 160(%rcx),%ymm2,%ymm8
+ vpmadd52huq 192(%rcx),%ymm2,%ymm9
+ vpmadd52huq 224(%rcx),%ymm2,%ymm10
+ movq 16(%r11),%r13
+
+ vpbroadcastq %r13,%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vpbroadcastq %r13,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ vpmadd52luq 0(%rsi),%ymm1,%ymm3
+ vpmadd52luq 32(%rsi),%ymm1,%ymm4
+ vpmadd52luq 64(%rsi),%ymm1,%ymm5
+ vpmadd52luq 96(%rsi),%ymm1,%ymm6
+ vpmadd52luq 128(%rsi),%ymm1,%ymm7
+ vpmadd52luq 160(%rsi),%ymm1,%ymm8
+ vpmadd52luq 192(%rsi),%ymm1,%ymm9
+ vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52luq 0(%rcx),%ymm2,%ymm3
+ vpmadd52luq 32(%rcx),%ymm2,%ymm4
+ vpmadd52luq 64(%rcx),%ymm2,%ymm5
+ vpmadd52luq 96(%rcx),%ymm2,%ymm6
+ vpmadd52luq 128(%rcx),%ymm2,%ymm7
+ vpmadd52luq 160(%rcx),%ymm2,%ymm8
+ vpmadd52luq 192(%rcx),%ymm2,%ymm9
+ vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ valignq $1,%ymm3,%ymm4,%ymm3
+ valignq $1,%ymm4,%ymm5,%ymm4
+ valignq $1,%ymm5,%ymm6,%ymm5
+ valignq $1,%ymm6,%ymm7,%ymm6
+ valignq $1,%ymm7,%ymm8,%ymm7
+ valignq $1,%ymm8,%ymm9,%ymm8
+ valignq $1,%ymm9,%ymm10,%ymm9
+ valignq $1,%ymm10,%ymm0,%ymm10
+
+ vmovq %xmm3,%r13
+ addq %r13,%r9
+
+ vpmadd52huq 0(%rsi),%ymm1,%ymm3
+ vpmadd52huq 32(%rsi),%ymm1,%ymm4
+ vpmadd52huq 64(%rsi),%ymm1,%ymm5
+ vpmadd52huq 96(%rsi),%ymm1,%ymm6
+ vpmadd52huq 128(%rsi),%ymm1,%ymm7
+ vpmadd52huq 160(%rsi),%ymm1,%ymm8
+ vpmadd52huq 192(%rsi),%ymm1,%ymm9
+ vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52huq 0(%rcx),%ymm2,%ymm3
+ vpmadd52huq 32(%rcx),%ymm2,%ymm4
+ vpmadd52huq 64(%rcx),%ymm2,%ymm5
+ vpmadd52huq 96(%rcx),%ymm2,%ymm6
+ vpmadd52huq 128(%rcx),%ymm2,%ymm7
+ vpmadd52huq 160(%rcx),%ymm2,%ymm8
+ vpmadd52huq 192(%rcx),%ymm2,%ymm9
+ vpmadd52huq 224(%rcx),%ymm2,%ymm10
+ movq 24(%r11),%r13
+
+ vpbroadcastq %r13,%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vpbroadcastq %r13,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ vpmadd52luq 0(%rsi),%ymm1,%ymm3
+ vpmadd52luq 32(%rsi),%ymm1,%ymm4
+ vpmadd52luq 64(%rsi),%ymm1,%ymm5
+ vpmadd52luq 96(%rsi),%ymm1,%ymm6
+ vpmadd52luq 128(%rsi),%ymm1,%ymm7
+ vpmadd52luq 160(%rsi),%ymm1,%ymm8
+ vpmadd52luq 192(%rsi),%ymm1,%ymm9
+ vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52luq 0(%rcx),%ymm2,%ymm3
+ vpmadd52luq 32(%rcx),%ymm2,%ymm4
+ vpmadd52luq 64(%rcx),%ymm2,%ymm5
+ vpmadd52luq 96(%rcx),%ymm2,%ymm6
+ vpmadd52luq 128(%rcx),%ymm2,%ymm7
+ vpmadd52luq 160(%rcx),%ymm2,%ymm8
+ vpmadd52luq 192(%rcx),%ymm2,%ymm9
+ vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ valignq $1,%ymm3,%ymm4,%ymm3
+ valignq $1,%ymm4,%ymm5,%ymm4
+ valignq $1,%ymm5,%ymm6,%ymm5
+ valignq $1,%ymm6,%ymm7,%ymm6
+ valignq $1,%ymm7,%ymm8,%ymm7
+ valignq $1,%ymm8,%ymm9,%ymm8
+ valignq $1,%ymm9,%ymm10,%ymm9
+ valignq $1,%ymm10,%ymm0,%ymm10
+
+ vmovq %xmm3,%r13
+ addq %r13,%r9
+
+ vpmadd52huq 0(%rsi),%ymm1,%ymm3
+ vpmadd52huq 32(%rsi),%ymm1,%ymm4
+ vpmadd52huq 64(%rsi),%ymm1,%ymm5
+ vpmadd52huq 96(%rsi),%ymm1,%ymm6
+ vpmadd52huq 128(%rsi),%ymm1,%ymm7
+ vpmadd52huq 160(%rsi),%ymm1,%ymm8
+ vpmadd52huq 192(%rsi),%ymm1,%ymm9
+ vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52huq 0(%rcx),%ymm2,%ymm3
+ vpmadd52huq 32(%rcx),%ymm2,%ymm4
+ vpmadd52huq 64(%rcx),%ymm2,%ymm5
+ vpmadd52huq 96(%rcx),%ymm2,%ymm6
+ vpmadd52huq 128(%rcx),%ymm2,%ymm7
+ vpmadd52huq 160(%rcx),%ymm2,%ymm8
+ vpmadd52huq 192(%rcx),%ymm2,%ymm9
+ vpmadd52huq 224(%rcx),%ymm2,%ymm10
+ leaq 32(%r11),%r11
+ decl %ebx
+ jne .Lloop7
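+# Fold in the remaining two words of b (words 28 and 29) outside the loop.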
+ movq 0(%r11),%r13
+
+ vpbroadcastq %r13,%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vpbroadcastq %r13,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ vpmadd52luq 0(%rsi),%ymm1,%ymm3
+ vpmadd52luq 32(%rsi),%ymm1,%ymm4
+ vpmadd52luq 64(%rsi),%ymm1,%ymm5
+ vpmadd52luq 96(%rsi),%ymm1,%ymm6
+ vpmadd52luq 128(%rsi),%ymm1,%ymm7
+ vpmadd52luq 160(%rsi),%ymm1,%ymm8
+ vpmadd52luq 192(%rsi),%ymm1,%ymm9
+ vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52luq 0(%rcx),%ymm2,%ymm3
+ vpmadd52luq 32(%rcx),%ymm2,%ymm4
+ vpmadd52luq 64(%rcx),%ymm2,%ymm5
+ vpmadd52luq 96(%rcx),%ymm2,%ymm6
+ vpmadd52luq 128(%rcx),%ymm2,%ymm7
+ vpmadd52luq 160(%rcx),%ymm2,%ymm8
+ vpmadd52luq 192(%rcx),%ymm2,%ymm9
+ vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ valignq $1,%ymm3,%ymm4,%ymm3
+ valignq $1,%ymm4,%ymm5,%ymm4
+ valignq $1,%ymm5,%ymm6,%ymm5
+ valignq $1,%ymm6,%ymm7,%ymm6
+ valignq $1,%ymm7,%ymm8,%ymm7
+ valignq $1,%ymm8,%ymm9,%ymm8
+ valignq $1,%ymm9,%ymm10,%ymm9
+ valignq $1,%ymm10,%ymm0,%ymm10
+
+ vmovq %xmm3,%r13
+ addq %r13,%r9
+
+ vpmadd52huq 0(%rsi),%ymm1,%ymm3
+ vpmadd52huq 32(%rsi),%ymm1,%ymm4
+ vpmadd52huq 64(%rsi),%ymm1,%ymm5
+ vpmadd52huq 96(%rsi),%ymm1,%ymm6
+ vpmadd52huq 128(%rsi),%ymm1,%ymm7
+ vpmadd52huq 160(%rsi),%ymm1,%ymm8
+ vpmadd52huq 192(%rsi),%ymm1,%ymm9
+ vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52huq 0(%rcx),%ymm2,%ymm3
+ vpmadd52huq 32(%rcx),%ymm2,%ymm4
+ vpmadd52huq 64(%rcx),%ymm2,%ymm5
+ vpmadd52huq 96(%rcx),%ymm2,%ymm6
+ vpmadd52huq 128(%rcx),%ymm2,%ymm7
+ vpmadd52huq 160(%rcx),%ymm2,%ymm8
+ vpmadd52huq 192(%rcx),%ymm2,%ymm9
+ vpmadd52huq 224(%rcx),%ymm2,%ymm10
+ movq 8(%r11),%r13
+
+ vpbroadcastq %r13,%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vpbroadcastq %r13,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ vpmadd52luq 0(%rsi),%ymm1,%ymm3
+ vpmadd52luq 32(%rsi),%ymm1,%ymm4
+ vpmadd52luq 64(%rsi),%ymm1,%ymm5
+ vpmadd52luq 96(%rsi),%ymm1,%ymm6
+ vpmadd52luq 128(%rsi),%ymm1,%ymm7
+ vpmadd52luq 160(%rsi),%ymm1,%ymm8
+ vpmadd52luq 192(%rsi),%ymm1,%ymm9
+ vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52luq 0(%rcx),%ymm2,%ymm3
+ vpmadd52luq 32(%rcx),%ymm2,%ymm4
+ vpmadd52luq 64(%rcx),%ymm2,%ymm5
+ vpmadd52luq 96(%rcx),%ymm2,%ymm6
+ vpmadd52luq 128(%rcx),%ymm2,%ymm7
+ vpmadd52luq 160(%rcx),%ymm2,%ymm8
+ vpmadd52luq 192(%rcx),%ymm2,%ymm9
+ vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ valignq $1,%ymm3,%ymm4,%ymm3
+ valignq $1,%ymm4,%ymm5,%ymm4
+ valignq $1,%ymm5,%ymm6,%ymm5
+ valignq $1,%ymm6,%ymm7,%ymm6
+ valignq $1,%ymm7,%ymm8,%ymm7
+ valignq $1,%ymm8,%ymm9,%ymm8
+ valignq $1,%ymm9,%ymm10,%ymm9
+ valignq $1,%ymm10,%ymm0,%ymm10
+
+ vmovq %xmm3,%r13
+ addq %r13,%r9
+
+ vpmadd52huq 0(%rsi),%ymm1,%ymm3
+ vpmadd52huq 32(%rsi),%ymm1,%ymm4
+ vpmadd52huq 64(%rsi),%ymm1,%ymm5
+ vpmadd52huq 96(%rsi),%ymm1,%ymm6
+ vpmadd52huq 128(%rsi),%ymm1,%ymm7
+ vpmadd52huq 160(%rsi),%ymm1,%ymm8
+ vpmadd52huq 192(%rsi),%ymm1,%ymm9
+ vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52huq 0(%rcx),%ymm2,%ymm3
+ vpmadd52huq 32(%rcx),%ymm2,%ymm4
+ vpmadd52huq 64(%rcx),%ymm2,%ymm5
+ vpmadd52huq 96(%rcx),%ymm2,%ymm6
+ vpmadd52huq 128(%rcx),%ymm2,%ymm7
+ vpmadd52huq 160(%rcx),%ymm2,%ymm8
+ vpmadd52huq 192(%rcx),%ymm2,%ymm9
+ vpmadd52huq 224(%rcx),%ymm2,%ymm10
+
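+# Normalization: merge the scalar accumulator r9 into the lowest lane, then
+# propagate carries so every lane again holds a value below 2^52 (shift out
+# bits 52 and above, add them one digit up, and resolve the remaining carry
+# chain with the mask-register compare/add/sub sequence that follows).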
+ vpbroadcastq %r9,%ymm0
+ vpblendd $3,%ymm0,%ymm3,%ymm3
+
+
+
+ vpsrlq $52,%ymm3,%ymm0
+ vpsrlq $52,%ymm4,%ymm1
+ vpsrlq $52,%ymm5,%ymm2
+ vpsrlq $52,%ymm6,%ymm19
+ vpsrlq $52,%ymm7,%ymm20
+ vpsrlq $52,%ymm8,%ymm21
+ vpsrlq $52,%ymm9,%ymm22
+ vpsrlq $52,%ymm10,%ymm23
+
+
+ valignq $3,%ymm22,%ymm23,%ymm23
+ valignq $3,%ymm21,%ymm22,%ymm22
+ valignq $3,%ymm20,%ymm21,%ymm21
+ valignq $3,%ymm19,%ymm20,%ymm20
+ valignq $3,%ymm2,%ymm19,%ymm19
+ valignq $3,%ymm1,%ymm2,%ymm2
+ valignq $3,%ymm0,%ymm1,%ymm1
+ valignq $3,.Lzeros(%rip),%ymm0,%ymm0
+
+
+ vpandq .Lmask52x4(%rip),%ymm3,%ymm3
+ vpandq .Lmask52x4(%rip),%ymm4,%ymm4
+ vpandq .Lmask52x4(%rip),%ymm5,%ymm5
+ vpandq .Lmask52x4(%rip),%ymm6,%ymm6
+ vpandq .Lmask52x4(%rip),%ymm7,%ymm7
+ vpandq .Lmask52x4(%rip),%ymm8,%ymm8
+ vpandq .Lmask52x4(%rip),%ymm9,%ymm9
+ vpandq .Lmask52x4(%rip),%ymm10,%ymm10
+
+
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpaddq %ymm1,%ymm4,%ymm4
+ vpaddq %ymm2,%ymm5,%ymm5
+ vpaddq %ymm19,%ymm6,%ymm6
+ vpaddq %ymm20,%ymm7,%ymm7
+ vpaddq %ymm21,%ymm8,%ymm8
+ vpaddq %ymm22,%ymm9,%ymm9
+ vpaddq %ymm23,%ymm10,%ymm10
+
+
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
+ kmovb %k1,%r14d
+ kmovb %k2,%r13d
+ shlb $4,%r13b
+ orb %r13b,%r14b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
+ kmovb %k1,%r13d
+ kmovb %k2,%r12d
+ shlb $4,%r12b
+ orb %r12b,%r13b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
+ kmovb %k1,%r12d
+ kmovb %k2,%r11d
+ shlb $4,%r11b
+ orb %r11b,%r12b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
+ kmovb %k1,%r11d
+ kmovb %k2,%r10d
+ shlb $4,%r10b
+ orb %r10b,%r11b
+
+ addb %r14b,%r14b
+ adcb %r13b,%r13b
+ adcb %r12b,%r12b
+ adcb %r11b,%r11b
+
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
+ kmovb %k1,%r9d
+ kmovb %k2,%r8d
+ shlb $4,%r8b
+ orb %r8b,%r9b
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
+ kmovb %k1,%r8d
+ kmovb %k2,%edx
+ shlb $4,%dl
+ orb %dl,%r8b
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
+ kmovb %k1,%edx
+ kmovb %k2,%ecx
+ shlb $4,%cl
+ orb %cl,%dl
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
+ kmovb %k1,%ecx
+ kmovb %k2,%ebx
+ shlb $4,%bl
+ orb %bl,%cl
+
+ addb %r9b,%r14b
+ adcb %r8b,%r13b
+ adcb %dl,%r12b
+ adcb %cl,%r11b
+
+ xorb %r9b,%r14b
+ xorb %r8b,%r13b
+ xorb %dl,%r12b
+ xorb %cl,%r11b
+
+ kmovb %r14d,%k1
+ shrb $4,%r14b
+ kmovb %r14d,%k2
+ kmovb %r13d,%k3
+ shrb $4,%r13b
+ kmovb %r13d,%k4
+ kmovb %r12d,%k5
+ shrb $4,%r12b
+ kmovb %r12d,%k6
+ kmovb %r11d,%k7
+
+ vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
+ vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
+ vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
+ vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
+ vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
+ vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
+ vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
+
+ vpandq .Lmask52x4(%rip),%ymm3,%ymm3
+ vpandq .Lmask52x4(%rip),%ymm4,%ymm4
+ vpandq .Lmask52x4(%rip),%ymm5,%ymm5
+ vpandq .Lmask52x4(%rip),%ymm6,%ymm6
+ vpandq .Lmask52x4(%rip),%ymm7,%ymm7
+ vpandq .Lmask52x4(%rip),%ymm8,%ymm8
+ vpandq .Lmask52x4(%rip),%ymm9,%ymm9
+
+ shrb $4,%r11b
+ kmovb %r11d,%k1
+
+ vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
+
+ vpandq .Lmask52x4(%rip),%ymm10,%ymm10
+
+ vmovdqu64 %ymm3,0(%rdi)
+ vmovdqu64 %ymm4,32(%rdi)
+ vmovdqu64 %ymm5,64(%rdi)
+ vmovdqu64 %ymm6,96(%rdi)
+ vmovdqu64 %ymm7,128(%rdi)
+ vmovdqu64 %ymm8,160(%rdi)
+ vmovdqu64 %ymm9,192(%rdi)
+ vmovdqu64 %ymm10,224(%rdi)
+
+ vzeroupper
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ movq 0(%rax),%r15
+.cfi_restore %r15
+ movq 8(%rax),%r14
+.cfi_restore %r14
+ movq 16(%rax),%r13
+.cfi_restore %r13
+ movq 24(%rax),%r12
+.cfi_restore %r12
+ movq 32(%rax),%rbp
+.cfi_restore %rbp
+ movq 40(%rax),%rbx
+.cfi_restore %rbx
+ leaq 48(%rax),%rsp
+.cfi_def_cfa %rsp,8
+.Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
+.section .rodata
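+# 2^52 - 1 replicated across four qwords: the per-digit mask of the 52-bit
+# radix representation.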
+.align 32
+.Lmask52x4:
+.quad 0xfffffffffffff
+.quad 0xfffffffffffff
+.quad 0xfffffffffffff
+.quad 0xfffffffffffff
+.text
+
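+# ossl_rsaz_amm52x30_x2_ifma256: two independent AMMs computed in one pass.
+# Same argument layout as the x1 variant, except that each of res/a/b/m holds
+# two 30-digit numbers back to back (the second at byte offset 256) and
+# r8 points to a pair of k0 values, one per instance.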
+.globl ossl_rsaz_amm52x30_x2_ifma256
+.type ossl_rsaz_amm52x30_x2_ifma256,@function
+.align 32
+ossl_rsaz_amm52x30_x2_ifma256:
+.cfi_startproc
+.byte 243,15,30,250
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+ vpxord %ymm0,%ymm0,%ymm0
+ vmovdqa64 %ymm0,%ymm3
+ vmovdqa64 %ymm0,%ymm4
+ vmovdqa64 %ymm0,%ymm5
+ vmovdqa64 %ymm0,%ymm6
+ vmovdqa64 %ymm0,%ymm7
+ vmovdqa64 %ymm0,%ymm8
+ vmovdqa64 %ymm0,%ymm9
+ vmovdqa64 %ymm0,%ymm10
+
+ vmovdqa64 %ymm0,%ymm11
+ vmovdqa64 %ymm0,%ymm12
+ vmovdqa64 %ymm0,%ymm13
+ vmovdqa64 %ymm0,%ymm14
+ vmovdqa64 %ymm0,%ymm15
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm0,%ymm17
+ vmovdqa64 %ymm0,%ymm18
+
+
+ xorl %r9d,%r9d
+ xorl %r15d,%r15d
+
+ movq %rdx,%r11
+ movq $0xfffffffffffff,%rax
+
+ movl $30,%ebx
+
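+# 30 iterations, one word of b per instance per iteration: the first instance
+# accumulates into ymm3-ymm10 (inputs at offset 0), the second into
+# ymm11-ymm18 (inputs at byte offset 256).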
+.align 32
+.Lloop30:
+ movq 0(%r11),%r13
+
+ vpbroadcastq %r13,%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq (%r8),%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vpbroadcastq %r13,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ vpmadd52luq 0(%rsi),%ymm1,%ymm3
+ vpmadd52luq 32(%rsi),%ymm1,%ymm4
+ vpmadd52luq 64(%rsi),%ymm1,%ymm5
+ vpmadd52luq 96(%rsi),%ymm1,%ymm6
+ vpmadd52luq 128(%rsi),%ymm1,%ymm7
+ vpmadd52luq 160(%rsi),%ymm1,%ymm8
+ vpmadd52luq 192(%rsi),%ymm1,%ymm9
+ vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52luq 0(%rcx),%ymm2,%ymm3
+ vpmadd52luq 32(%rcx),%ymm2,%ymm4
+ vpmadd52luq 64(%rcx),%ymm2,%ymm5
+ vpmadd52luq 96(%rcx),%ymm2,%ymm6
+ vpmadd52luq 128(%rcx),%ymm2,%ymm7
+ vpmadd52luq 160(%rcx),%ymm2,%ymm8
+ vpmadd52luq 192(%rcx),%ymm2,%ymm9
+ vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ valignq $1,%ymm3,%ymm4,%ymm3
+ valignq $1,%ymm4,%ymm5,%ymm4
+ valignq $1,%ymm5,%ymm6,%ymm5
+ valignq $1,%ymm6,%ymm7,%ymm6
+ valignq $1,%ymm7,%ymm8,%ymm7
+ valignq $1,%ymm8,%ymm9,%ymm8
+ valignq $1,%ymm9,%ymm10,%ymm9
+ valignq $1,%ymm10,%ymm0,%ymm10
+
+ vmovq %xmm3,%r13
+ addq %r13,%r9
+
+ vpmadd52huq 0(%rsi),%ymm1,%ymm3
+ vpmadd52huq 32(%rsi),%ymm1,%ymm4
+ vpmadd52huq 64(%rsi),%ymm1,%ymm5
+ vpmadd52huq 96(%rsi),%ymm1,%ymm6
+ vpmadd52huq 128(%rsi),%ymm1,%ymm7
+ vpmadd52huq 160(%rsi),%ymm1,%ymm8
+ vpmadd52huq 192(%rsi),%ymm1,%ymm9
+ vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+ vpmadd52huq 0(%rcx),%ymm2,%ymm3
+ vpmadd52huq 32(%rcx),%ymm2,%ymm4
+ vpmadd52huq 64(%rcx),%ymm2,%ymm5
+ vpmadd52huq 96(%rcx),%ymm2,%ymm6
+ vpmadd52huq 128(%rcx),%ymm2,%ymm7
+ vpmadd52huq 160(%rcx),%ymm2,%ymm8
+ vpmadd52huq 192(%rcx),%ymm2,%ymm9
+ vpmadd52huq 224(%rcx),%ymm2,%ymm10
+ movq 256(%r11),%r13
+
+ vpbroadcastq %r13,%ymm1
+ movq 256(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r15
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq 8(%r8),%r13
+ imulq %r15,%r13
+ andq %rax,%r13
+
+ vpbroadcastq %r13,%ymm2
+ movq 256(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r15
+ adcq %r12,%r10
+
+ shrq $52,%r15
+ salq $12,%r10
+ orq %r10,%r15
+
+ vpmadd52luq 256(%rsi),%ymm1,%ymm11
+ vpmadd52luq 288(%rsi),%ymm1,%ymm12
+ vpmadd52luq 320(%rsi),%ymm1,%ymm13
+ vpmadd52luq 352(%rsi),%ymm1,%ymm14
+ vpmadd52luq 384(%rsi),%ymm1,%ymm15
+ vpmadd52luq 416(%rsi),%ymm1,%ymm16
+ vpmadd52luq 448(%rsi),%ymm1,%ymm17
+ vpmadd52luq 480(%rsi),%ymm1,%ymm18
+
+ vpmadd52luq 256(%rcx),%ymm2,%ymm11
+ vpmadd52luq 288(%rcx),%ymm2,%ymm12
+ vpmadd52luq 320(%rcx),%ymm2,%ymm13
+ vpmadd52luq 352(%rcx),%ymm2,%ymm14
+ vpmadd52luq 384(%rcx),%ymm2,%ymm15
+ vpmadd52luq 416(%rcx),%ymm2,%ymm16
+ vpmadd52luq 448(%rcx),%ymm2,%ymm17
+ vpmadd52luq 480(%rcx),%ymm2,%ymm18
+
+
+ valignq $1,%ymm11,%ymm12,%ymm11
+ valignq $1,%ymm12,%ymm13,%ymm12
+ valignq $1,%ymm13,%ymm14,%ymm13
+ valignq $1,%ymm14,%ymm15,%ymm14
+ valignq $1,%ymm15,%ymm16,%ymm15
+ valignq $1,%ymm16,%ymm17,%ymm16
+ valignq $1,%ymm17,%ymm18,%ymm17
+ valignq $1,%ymm18,%ymm0,%ymm18
+
+ vmovq %xmm11,%r13
+ addq %r13,%r15
+
+ vpmadd52huq 256(%rsi),%ymm1,%ymm11
+ vpmadd52huq 288(%rsi),%ymm1,%ymm12
+ vpmadd52huq 320(%rsi),%ymm1,%ymm13
+ vpmadd52huq 352(%rsi),%ymm1,%ymm14
+ vpmadd52huq 384(%rsi),%ymm1,%ymm15
+ vpmadd52huq 416(%rsi),%ymm1,%ymm16
+ vpmadd52huq 448(%rsi),%ymm1,%ymm17
+ vpmadd52huq 480(%rsi),%ymm1,%ymm18
+
+ vpmadd52huq 256(%rcx),%ymm2,%ymm11
+ vpmadd52huq 288(%rcx),%ymm2,%ymm12
+ vpmadd52huq 320(%rcx),%ymm2,%ymm13
+ vpmadd52huq 352(%rcx),%ymm2,%ymm14
+ vpmadd52huq 384(%rcx),%ymm2,%ymm15
+ vpmadd52huq 416(%rcx),%ymm2,%ymm16
+ vpmadd52huq 448(%rcx),%ymm2,%ymm17
+ vpmadd52huq 480(%rcx),%ymm2,%ymm18
+ leaq 8(%r11),%r11
+ decl %ebx
+ jne .Lloop30
+
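+# Normalize both results with the same carry-propagation pattern as the x1
+# routine: first the low instance in ymm3-ymm10, then the high instance in
+# ymm11-ymm18.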
+ vpbroadcastq %r9,%ymm0
+ vpblendd $3,%ymm0,%ymm3,%ymm3
+
+
+
+ vpsrlq $52,%ymm3,%ymm0
+ vpsrlq $52,%ymm4,%ymm1
+ vpsrlq $52,%ymm5,%ymm2
+ vpsrlq $52,%ymm6,%ymm19
+ vpsrlq $52,%ymm7,%ymm20
+ vpsrlq $52,%ymm8,%ymm21
+ vpsrlq $52,%ymm9,%ymm22
+ vpsrlq $52,%ymm10,%ymm23
+
+
+ valignq $3,%ymm22,%ymm23,%ymm23
+ valignq $3,%ymm21,%ymm22,%ymm22
+ valignq $3,%ymm20,%ymm21,%ymm21
+ valignq $3,%ymm19,%ymm20,%ymm20
+ valignq $3,%ymm2,%ymm19,%ymm19
+ valignq $3,%ymm1,%ymm2,%ymm2
+ valignq $3,%ymm0,%ymm1,%ymm1
+ valignq $3,.Lzeros(%rip),%ymm0,%ymm0
+
+
+ vpandq .Lmask52x4(%rip),%ymm3,%ymm3
+ vpandq .Lmask52x4(%rip),%ymm4,%ymm4
+ vpandq .Lmask52x4(%rip),%ymm5,%ymm5
+ vpandq .Lmask52x4(%rip),%ymm6,%ymm6
+ vpandq .Lmask52x4(%rip),%ymm7,%ymm7
+ vpandq .Lmask52x4(%rip),%ymm8,%ymm8
+ vpandq .Lmask52x4(%rip),%ymm9,%ymm9
+ vpandq .Lmask52x4(%rip),%ymm10,%ymm10
+
+
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpaddq %ymm1,%ymm4,%ymm4
+ vpaddq %ymm2,%ymm5,%ymm5
+ vpaddq %ymm19,%ymm6,%ymm6
+ vpaddq %ymm20,%ymm7,%ymm7
+ vpaddq %ymm21,%ymm8,%ymm8
+ vpaddq %ymm22,%ymm9,%ymm9
+ vpaddq %ymm23,%ymm10,%ymm10
+
+
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
+ kmovb %k1,%r14d
+ kmovb %k2,%r13d
+ shlb $4,%r13b
+ orb %r13b,%r14b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
+ kmovb %k1,%r13d
+ kmovb %k2,%r12d
+ shlb $4,%r12b
+ orb %r12b,%r13b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
+ kmovb %k1,%r12d
+ kmovb %k2,%r11d
+ shlb $4,%r11b
+ orb %r11b,%r12b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
+ kmovb %k1,%r11d
+ kmovb %k2,%r10d
+ shlb $4,%r10b
+ orb %r10b,%r11b
+
+ addb %r14b,%r14b
+ adcb %r13b,%r13b
+ adcb %r12b,%r12b
+ adcb %r11b,%r11b
+
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
+ kmovb %k1,%r9d
+ kmovb %k2,%r8d
+ shlb $4,%r8b
+ orb %r8b,%r9b
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
+ kmovb %k1,%r8d
+ kmovb %k2,%edx
+ shlb $4,%dl
+ orb %dl,%r8b
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
+ kmovb %k1,%edx
+ kmovb %k2,%ecx
+ shlb $4,%cl
+ orb %cl,%dl
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
+ kmovb %k1,%ecx
+ kmovb %k2,%ebx
+ shlb $4,%bl
+ orb %bl,%cl
+
+ addb %r9b,%r14b
+ adcb %r8b,%r13b
+ adcb %dl,%r12b
+ adcb %cl,%r11b
+
+ xorb %r9b,%r14b
+ xorb %r8b,%r13b
+ xorb %dl,%r12b
+ xorb %cl,%r11b
+
+ kmovb %r14d,%k1
+ shrb $4,%r14b
+ kmovb %r14d,%k2
+ kmovb %r13d,%k3
+ shrb $4,%r13b
+ kmovb %r13d,%k4
+ kmovb %r12d,%k5
+ shrb $4,%r12b
+ kmovb %r12d,%k6
+ kmovb %r11d,%k7
+
+ vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
+ vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
+ vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
+ vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
+ vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
+ vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
+ vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
+
+ vpandq .Lmask52x4(%rip),%ymm3,%ymm3
+ vpandq .Lmask52x4(%rip),%ymm4,%ymm4
+ vpandq .Lmask52x4(%rip),%ymm5,%ymm5
+ vpandq .Lmask52x4(%rip),%ymm6,%ymm6
+ vpandq .Lmask52x4(%rip),%ymm7,%ymm7
+ vpandq .Lmask52x4(%rip),%ymm8,%ymm8
+ vpandq .Lmask52x4(%rip),%ymm9,%ymm9
+
+ shrb $4,%r11b
+ kmovb %r11d,%k1
+
+ vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
+
+ vpandq .Lmask52x4(%rip),%ymm10,%ymm10
+
+ vpbroadcastq %r15,%ymm0
+ vpblendd $3,%ymm0,%ymm11,%ymm11
+
+
+
+ vpsrlq $52,%ymm11,%ymm0
+ vpsrlq $52,%ymm12,%ymm1
+ vpsrlq $52,%ymm13,%ymm2
+ vpsrlq $52,%ymm14,%ymm19
+ vpsrlq $52,%ymm15,%ymm20
+ vpsrlq $52,%ymm16,%ymm21
+ vpsrlq $52,%ymm17,%ymm22
+ vpsrlq $52,%ymm18,%ymm23
+
+
+ valignq $3,%ymm22,%ymm23,%ymm23
+ valignq $3,%ymm21,%ymm22,%ymm22
+ valignq $3,%ymm20,%ymm21,%ymm21
+ valignq $3,%ymm19,%ymm20,%ymm20
+ valignq $3,%ymm2,%ymm19,%ymm19
+ valignq $3,%ymm1,%ymm2,%ymm2
+ valignq $3,%ymm0,%ymm1,%ymm1
+ valignq $3,.Lzeros(%rip),%ymm0,%ymm0
+
+
+ vpandq .Lmask52x4(%rip),%ymm11,%ymm11
+ vpandq .Lmask52x4(%rip),%ymm12,%ymm12
+ vpandq .Lmask52x4(%rip),%ymm13,%ymm13
+ vpandq .Lmask52x4(%rip),%ymm14,%ymm14
+ vpandq .Lmask52x4(%rip),%ymm15,%ymm15
+ vpandq .Lmask52x4(%rip),%ymm16,%ymm16
+ vpandq .Lmask52x4(%rip),%ymm17,%ymm17
+ vpandq .Lmask52x4(%rip),%ymm18,%ymm18
+
+
+ vpaddq %ymm0,%ymm11,%ymm11
+ vpaddq %ymm1,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpaddq %ymm19,%ymm14,%ymm14
+ vpaddq %ymm20,%ymm15,%ymm15
+ vpaddq %ymm21,%ymm16,%ymm16
+ vpaddq %ymm22,%ymm17,%ymm17
+ vpaddq %ymm23,%ymm18,%ymm18
+
+
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2
+ kmovb %k1,%r14d
+ kmovb %k2,%r13d
+ shlb $4,%r13b
+ orb %r13b,%r14b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm13,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm14,%k2
+ kmovb %k1,%r13d
+ kmovb %k2,%r12d
+ shlb $4,%r12b
+ orb %r12b,%r13b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm15,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
+ kmovb %k1,%r12d
+ kmovb %k2,%r11d
+ shlb $4,%r11b
+ orb %r11b,%r12b
+
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k1
+ vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k2
+ kmovb %k1,%r11d
+ kmovb %k2,%r10d
+ shlb $4,%r10b
+ orb %r10b,%r11b
+
+ addb %r14b,%r14b
+ adcb %r13b,%r13b
+ adcb %r12b,%r12b
+ adcb %r11b,%r11b
+
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2
+ kmovb %k1,%r9d
+ kmovb %k2,%r8d
+ shlb $4,%r8b
+ orb %r8b,%r9b
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm13,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm14,%k2
+ kmovb %k1,%r8d
+ kmovb %k2,%edx
+ shlb $4,%dl
+ orb %dl,%r8b
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm15,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
+ kmovb %k1,%edx
+ kmovb %k2,%ecx
+ shlb $4,%cl
+ orb %cl,%dl
+
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k1
+ vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k2
+ kmovb %k1,%ecx
+ kmovb %k2,%ebx
+ shlb $4,%bl
+ orb %bl,%cl
+
+ addb %r9b,%r14b
+ adcb %r8b,%r13b
+ adcb %dl,%r12b
+ adcb %cl,%r11b
+
+ xorb %r9b,%r14b
+ xorb %r8b,%r13b
+ xorb %dl,%r12b
+ xorb %cl,%r11b
+
+ kmovb %r14d,%k1
+ shrb $4,%r14b
+ kmovb %r14d,%k2
+ kmovb %r13d,%k3
+ shrb $4,%r13b
+ kmovb %r13d,%k4
+ kmovb %r12d,%k5
+ shrb $4,%r12b
+ kmovb %r12d,%k6
+ kmovb %r11d,%k7
+
+ vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k1}
+ vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k2}
+ vpsubq .Lmask52x4(%rip),%ymm13,%ymm13{%k3}
+ vpsubq .Lmask52x4(%rip),%ymm14,%ymm14{%k4}
+ vpsubq .Lmask52x4(%rip),%ymm15,%ymm15{%k5}
+ vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k6}
+ vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k7}
+
+ vpandq .Lmask52x4(%rip),%ymm11,%ymm11
+ vpandq .Lmask52x4(%rip),%ymm12,%ymm12
+ vpandq .Lmask52x4(%rip),%ymm13,%ymm13
+ vpandq .Lmask52x4(%rip),%ymm14,%ymm14
+ vpandq .Lmask52x4(%rip),%ymm15,%ymm15
+ vpandq .Lmask52x4(%rip),%ymm16,%ymm16
+ vpandq .Lmask52x4(%rip),%ymm17,%ymm17
+
+ shrb $4,%r11b
+ kmovb %r11d,%k1
+
+ vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k1}
+
+ vpandq .Lmask52x4(%rip),%ymm18,%ymm18
+
+ vmovdqu64 %ymm3,0(%rdi)
+ vmovdqu64 %ymm4,32(%rdi)
+ vmovdqu64 %ymm5,64(%rdi)
+ vmovdqu64 %ymm6,96(%rdi)
+ vmovdqu64 %ymm7,128(%rdi)
+ vmovdqu64 %ymm8,160(%rdi)
+ vmovdqu64 %ymm9,192(%rdi)
+ vmovdqu64 %ymm10,224(%rdi)
+
+ vmovdqu64 %ymm11,256(%rdi)
+ vmovdqu64 %ymm12,288(%rdi)
+ vmovdqu64 %ymm13,320(%rdi)
+ vmovdqu64 %ymm14,352(%rdi)
+ vmovdqu64 %ymm15,384(%rdi)
+ vmovdqu64 %ymm16,416(%rdi)
+ vmovdqu64 %ymm17,448(%rdi)
+ vmovdqu64 %ymm18,480(%rdi)
+
+ vzeroupper
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ movq 0(%rax),%r15
+.cfi_restore %r15
+ movq 8(%rax),%r14
+.cfi_restore %r14
+ movq 16(%rax),%r13
+.cfi_restore %r13
+ movq 24(%rax),%r12
+.cfi_restore %r12
+ movq 32(%rax),%rbp
+.cfi_restore %rbp
+ movq 40(%rax),%rbx
+.cfi_restore %rbx
+ leaq 48(%rax),%rsp
+.cfi_def_cfa %rsp,8
+.Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
+.text
+
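+# ossl_extract_multiplier_2x30_win5: constant-time table lookup for a 5-bit
+# window.  The whole 16384-byte table (32 entries of 512 bytes, i.e. two
+# padded 30-digit numbers per entry) is read, and masked blends keyed on the
+# indices in rdx/rcx pick the requested entry, so the memory access pattern is
+# independent of the secret indices.  rdi = output, rsi = table base.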
+.align 32
+.globl ossl_extract_multiplier_2x30_win5
+.type ossl_extract_multiplier_2x30_win5,@function
+ossl_extract_multiplier_2x30_win5:
+.cfi_startproc
+.byte 243,15,30,250
+ vmovdqa64 .Lones(%rip),%ymm30
+ vpbroadcastq %rdx,%ymm28
+ vpbroadcastq %rcx,%ymm29
+ leaq 16384(%rsi),%rax
+
+
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovdqa64 %ymm0,%ymm27
+ vmovdqa64 %ymm0,%ymm1
+ vmovdqa64 %ymm0,%ymm2
+ vmovdqa64 %ymm0,%ymm3
+ vmovdqa64 %ymm0,%ymm4
+ vmovdqa64 %ymm0,%ymm5
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm0,%ymm17
+ vmovdqa64 %ymm0,%ymm18
+ vmovdqa64 %ymm0,%ymm19
+ vmovdqa64 %ymm0,%ymm20
+ vmovdqa64 %ymm0,%ymm21
+ vmovdqa64 %ymm0,%ymm22
+ vmovdqa64 %ymm0,%ymm23
+ vmovdqa64 %ymm0,%ymm24
+ vmovdqa64 %ymm0,%ymm25
+
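+# Scan all 32 entries, 512 bytes apart; whenever the running counter in ymm27
+# matches the index in ymm28 (first half) or ymm29 (second half), blend that
+# entry into the output registers.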
+.align 32
+.Lloop:
+ vpcmpq $0,%ymm27,%ymm28,%k1
+ vpcmpq $0,%ymm27,%ymm29,%k2
+ vmovdqu64 0(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm0,%ymm0{%k1}
+ vmovdqu64 32(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm1,%ymm1{%k1}
+ vmovdqu64 64(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm2,%ymm2{%k1}
+ vmovdqu64 96(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm3,%ymm3{%k1}
+ vmovdqu64 128(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm4,%ymm4{%k1}
+ vmovdqu64 160(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm5,%ymm5{%k1}
+ vmovdqu64 192(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm16,%ymm16{%k1}
+ vmovdqu64 224(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm17,%ymm17{%k1}
+ vmovdqu64 256(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm18,%ymm18{%k2}
+ vmovdqu64 288(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm19,%ymm19{%k2}
+ vmovdqu64 320(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm20,%ymm20{%k2}
+ vmovdqu64 352(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm21,%ymm21{%k2}
+ vmovdqu64 384(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm22,%ymm22{%k2}
+ vmovdqu64 416(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm23,%ymm23{%k2}
+ vmovdqu64 448(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm24,%ymm24{%k2}
+ vmovdqu64 480(%rsi),%ymm26
+ vpblendmq %ymm26,%ymm25,%ymm25{%k2}
+ vpaddq %ymm30,%ymm27,%ymm27
+ addq $512,%rsi
+ cmpq %rsi,%rax
+ jne .Lloop
+ vmovdqu64 %ymm0,0(%rdi)
+ vmovdqu64 %ymm1,32(%rdi)
+ vmovdqu64 %ymm2,64(%rdi)
+ vmovdqu64 %ymm3,96(%rdi)
+ vmovdqu64 %ymm4,128(%rdi)
+ vmovdqu64 %ymm5,160(%rdi)
+ vmovdqu64 %ymm16,192(%rdi)
+ vmovdqu64 %ymm17,224(%rdi)
+ vmovdqu64 %ymm18,256(%rdi)
+ vmovdqu64 %ymm19,288(%rdi)
+ vmovdqu64 %ymm20,320(%rdi)
+ vmovdqu64 %ymm21,352(%rdi)
+ vmovdqu64 %ymm22,384(%rdi)
+ vmovdqu64 %ymm23,416(%rdi)
+ vmovdqu64 %ymm24,448(%rdi)
+ vmovdqu64 %ymm25,480(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
+.section .rodata
+.align 32
+.Lones:
+.quad 1,1,1,1
+.Lzeros:
+.quad 0,0,0,0
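+# GNU property note marking the object as CET-compatible
+# (0xc0000002 = GNU_PROPERTY_X86_FEATURE_1_AND, value 3 = IBT | SHSTK).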
+	.section ".note.gnu.property", "a"
+	.p2align 3
+	.long 1f - 0f
+	.long 4f - 1f
+	.long 5
+0:
+	# "GNU" encoded with .byte, since .asciz isn't supported
+	# on Solaris.
+	.byte 0x47
+	.byte 0x4e
+	.byte 0x55
+	.byte 0
+1:
+	.p2align 3
+	.long 0xc0000002
+	.long 3f - 2f
+2:
+	.long 3
+3:
+	.p2align 3
+4: