Diffstat (limited to 'secure/lib/libcrypto/amd64/rsaz-x86_64.S')
-rw-r--r--  secure/lib/libcrypto/amd64/rsaz-x86_64.S  863
1 file changed, 113 insertions(+), 750 deletions(-)
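This revision drops the MULX/ADCX code paths (.Loop_sqrx, .Lmulx, .Lmulx_gather, .Lmulx_scatter, __rsaz_512_reducex, __rsaz_512_mulx) together with their OPENSSL_ia32cap_P feature dispatch, and rewrites the carry handling in the scalar squaring loop; this is consistent with OpenSSL's fix for the rsaz_512_sqr carry-propagation overflow (CVE-2019-1551). Where the old code doubled the cross products with a shrq $63 / leaq (%rcx,%r10,2) idiom that reconstructs the shifted-out bit by hand, the new code uses a plain add/adc chain that keeps the outgoing carry in its own register (adcq $0,%rcx). A minimal C model of the new doubling idiom, assuming GCC/Clang uint64_t semantics (names are illustrative, not from the source):

#include <stdint.h>
#include <stdio.h>

/* Double a two-limb value {hi,lo} and return the bit shifted out of hi.
 * Mirrors the patched sequence  xorq %c,%c / addq %lo,%lo /
 * adcq %hi,%hi / adcq $0,%c : every carry propagates through the
 * chain instead of being rebuilt afterwards with shrq/leaq. */
static uint64_t double_with_carry(uint64_t *lo, uint64_t *hi)
{
    uint64_t carry_out = *hi >> 63;   /* bit that adcq $0,%c captures  */
    uint64_t carry_mid = *lo >> 63;   /* carry from low into high limb */
    *lo = *lo << 1;                   /* addq %lo,%lo                  */
    *hi = (*hi << 1) | carry_mid;     /* adcq %hi,%hi                  */
    return carry_out;                 /* adcq $0,%c                    */
}

int main(void)
{
    uint64_t lo = 0x8000000000000001ULL, hi = 0xffffffffffffffffULL;
    uint64_t c = double_with_carry(&lo, &hi);
    /* 2 * 0xffffffffffffffff_8000000000000001
     *   = 0x1_ffffffffffffffff_0000000000000002 */
    printf("carry=%llu hi=%016llx lo=%016llx\n",
           (unsigned long long)c, (unsigned long long)hi,
           (unsigned long long)lo);
    return 0;
}

In the full 512-bit squaring the same pattern runs across eight limbs, so a carry dropped at any doubling step corrupts every limb above it; the add/adc form makes the carry flow explicit.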
diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
index e4e7b0469a53..ae64f7a73987 100644
--- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S
+++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
@@ -31,14 +31,10 @@ rsaz_512_sqr:
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lsqr_body:
- movq %rdx,%rbp
+.byte 102,72,15,110,202
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
- movl $0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $0x80100,%r11d
- je .Loop_sqrx
jmp .Loop_sqr
.align 32
@@ -46,6 +42,7 @@ rsaz_512_sqr:
movl %r8d,128+8(%rsp)
movq %rdx,%rbx
+ movq %rax,%rbp
mulq %rdx
movq %rax,%r8
movq 16(%rsi),%rax
@@ -84,31 +81,29 @@ rsaz_512_sqr:
mulq %rbx
addq %rax,%r14
movq %rbx,%rax
- movq %rdx,%r15
- adcq $0,%r15
+ adcq $0,%rdx
+ xorq %rcx,%rcx
addq %r8,%r8
- movq %r9,%rcx
- adcq %r9,%r9
+ movq %rdx,%r15
+ adcq $0,%rcx
mulq %rax
- movq %rax,(%rsp)
- addq %rdx,%r8
- adcq $0,%r9
+ addq %r8,%rdx
+ adcq $0,%rcx
- movq %r8,8(%rsp)
- shrq $63,%rcx
+ movq %rax,(%rsp)
+ movq %rdx,8(%rsp)
- movq 8(%rsi),%r8
movq 16(%rsi),%rax
- mulq %r8
+ mulq %rbp
addq %rax,%r10
movq 24(%rsi),%rax
movq %rdx,%rbx
adcq $0,%rbx
- mulq %r8
+ mulq %rbp
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
@@ -116,7 +111,7 @@ rsaz_512_sqr:
movq %rdx,%rbx
adcq $0,%rbx
- mulq %r8
+ mulq %rbp
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -124,7 +119,7 @@ rsaz_512_sqr:
movq %rdx,%rbx
adcq $0,%rbx
- mulq %r8
+ mulq %rbp
addq %rax,%r13
movq 48(%rsi),%rax
adcq $0,%rdx
@@ -132,7 +127,7 @@ rsaz_512_sqr:
movq %rdx,%rbx
adcq $0,%rbx
- mulq %r8
+ mulq %rbp
addq %rax,%r14
movq 56(%rsi),%rax
adcq $0,%rdx
@@ -140,39 +135,39 @@ rsaz_512_sqr:
movq %rdx,%rbx
adcq $0,%rbx
- mulq %r8
+ mulq %rbp
addq %rax,%r15
- movq %r8,%rax
+ movq %rbp,%rax
adcq $0,%rdx
addq %rbx,%r15
- movq %rdx,%r8
- movq %r10,%rdx
- adcq $0,%r8
+ adcq $0,%rdx
- addq %rdx,%rdx
- leaq (%rcx,%r10,2),%r10
- movq %r11,%rbx
- adcq %r11,%r11
+ xorq %rbx,%rbx
+ addq %r9,%r9
+ movq %rdx,%r8
+ adcq %r10,%r10
+ adcq $0,%rbx
mulq %rax
+
+ addq %rcx,%rax
+ movq 16(%rsi),%rbp
addq %rax,%r9
+ movq 24(%rsi),%rax
adcq %rdx,%r10
- adcq $0,%r11
+ adcq $0,%rbx
movq %r9,16(%rsp)
movq %r10,24(%rsp)
- shrq $63,%rbx
- movq 16(%rsi),%r9
- movq 24(%rsi),%rax
- mulq %r9
+ mulq %rbp
addq %rax,%r12
movq 32(%rsi),%rax
movq %rdx,%rcx
adcq $0,%rcx
- mulq %r9
+ mulq %rbp
addq %rax,%r13
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -180,7 +175,7 @@ rsaz_512_sqr:
movq %rdx,%rcx
adcq $0,%rcx
- mulq %r9
+ mulq %rbp
addq %rax,%r14
movq 48(%rsi),%rax
adcq $0,%rdx
@@ -188,9 +183,7 @@ rsaz_512_sqr:
movq %rdx,%rcx
adcq $0,%rcx
- mulq %r9
- movq %r12,%r10
- leaq (%rbx,%r12,2),%r12
+ mulq %rbp
addq %rax,%r15
movq 56(%rsi),%rax
adcq $0,%rdx
@@ -198,36 +191,40 @@ rsaz_512_sqr:
movq %rdx,%rcx
adcq $0,%rcx
- mulq %r9
- shrq $63,%r10
+ mulq %rbp
addq %rax,%r8
- movq %r9,%rax
+ movq %rbp,%rax
adcq $0,%rdx
addq %rcx,%r8
- movq %rdx,%r9
- adcq $0,%r9
+ adcq $0,%rdx
- movq %r13,%rcx
- leaq (%r10,%r13,2),%r13
+ xorq %rcx,%rcx
+ addq %r11,%r11
+ movq %rdx,%r9
+ adcq %r12,%r12
+ adcq $0,%rcx
mulq %rax
+
+ addq %rbx,%rax
+ movq 24(%rsi),%r10
addq %rax,%r11
+ movq 32(%rsi),%rax
adcq %rdx,%r12
- adcq $0,%r13
+ adcq $0,%rcx
movq %r11,32(%rsp)
movq %r12,40(%rsp)
- shrq $63,%rcx
- movq 24(%rsi),%r10
- movq 32(%rsi),%rax
+ movq %rax,%r11
mulq %r10
addq %rax,%r14
movq 40(%rsi),%rax
movq %rdx,%rbx
adcq $0,%rbx
+ movq %rax,%r12
mulq %r10
addq %rax,%r15
movq 48(%rsi),%rax
@@ -236,9 +233,8 @@ rsaz_512_sqr:
movq %rdx,%rbx
adcq $0,%rbx
+ movq %rax,%rbp
mulq %r10
- movq %r14,%r12
- leaq (%rcx,%r14,2),%r14
addq %rax,%r8
movq 56(%rsi),%rax
adcq $0,%rdx
@@ -247,32 +243,33 @@ rsaz_512_sqr:
adcq $0,%rbx
mulq %r10
- shrq $63,%r12
addq %rax,%r9
movq %r10,%rax
adcq $0,%rdx
addq %rbx,%r9
- movq %rdx,%r10
- adcq $0,%r10
+ adcq $0,%rdx
- movq %r15,%rbx
- leaq (%r12,%r15,2),%r15
+ xorq %rbx,%rbx
+ addq %r13,%r13
+ movq %rdx,%r10
+ adcq %r14,%r14
+ adcq $0,%rbx
mulq %rax
+
+ addq %rcx,%rax
addq %rax,%r13
+ movq %r12,%rax
adcq %rdx,%r14
- adcq $0,%r15
+ adcq $0,%rbx
movq %r13,48(%rsp)
movq %r14,56(%rsp)
- shrq $63,%rbx
- movq 32(%rsi),%r11
- movq 40(%rsi),%rax
mulq %r11
addq %rax,%r8
- movq 48(%rsi),%rax
+ movq %rbp,%rax
movq %rdx,%rcx
adcq $0,%rcx
@@ -280,97 +277,99 @@ rsaz_512_sqr:
addq %rax,%r9
movq 56(%rsi),%rax
adcq $0,%rdx
- movq %r8,%r12
- leaq (%rbx,%r8,2),%r8
addq %rcx,%r9
movq %rdx,%rcx
adcq $0,%rcx
+ movq %rax,%r14
mulq %r11
- shrq $63,%r12
addq %rax,%r10
movq %r11,%rax
adcq $0,%rdx
addq %rcx,%r10
- movq %rdx,%r11
- adcq $0,%r11
+ adcq $0,%rdx
- movq %r9,%rcx
- leaq (%r12,%r9,2),%r9
+ xorq %rcx,%rcx
+ addq %r15,%r15
+ movq %rdx,%r11
+ adcq %r8,%r8
+ adcq $0,%rcx
mulq %rax
+
+ addq %rbx,%rax
addq %rax,%r15
+ movq %rbp,%rax
adcq %rdx,%r8
- adcq $0,%r9
+ adcq $0,%rcx
movq %r15,64(%rsp)
movq %r8,72(%rsp)
- shrq $63,%rcx
- movq 40(%rsi),%r12
- movq 48(%rsi),%rax
mulq %r12
addq %rax,%r10
- movq 56(%rsi),%rax
+ movq %r14,%rax
movq %rdx,%rbx
adcq $0,%rbx
mulq %r12
addq %rax,%r11
movq %r12,%rax
- movq %r10,%r15
- leaq (%rcx,%r10,2),%r10
adcq $0,%rdx
- shrq $63,%r15
addq %rbx,%r11
- movq %rdx,%r12
- adcq $0,%r12
+ adcq $0,%rdx
- movq %r11,%rbx
- leaq (%r15,%r11,2),%r11
+ xorq %rbx,%rbx
+ addq %r9,%r9
+ movq %rdx,%r12
+ adcq %r10,%r10
+ adcq $0,%rbx
mulq %rax
+
+ addq %rcx,%rax
addq %rax,%r9
+ movq %r14,%rax
adcq %rdx,%r10
- adcq $0,%r11
+ adcq $0,%rbx
movq %r9,80(%rsp)
movq %r10,88(%rsp)
- movq 48(%rsi),%r13
- movq 56(%rsi),%rax
- mulq %r13
+ mulq %rbp
addq %rax,%r12
- movq %r13,%rax
- movq %rdx,%r13
- adcq $0,%r13
+ movq %rbp,%rax
+ adcq $0,%rdx
- xorq %r14,%r14
- shlq $1,%rbx
+ xorq %rcx,%rcx
+ addq %r11,%r11
+ movq %rdx,%r13
adcq %r12,%r12
- adcq %r13,%r13
- adcq %r14,%r14
+ adcq $0,%rcx
mulq %rax
+
+ addq %rbx,%rax
addq %rax,%r11
+ movq %r14,%rax
adcq %rdx,%r12
- adcq $0,%r13
+ adcq $0,%rcx
movq %r11,96(%rsp)
movq %r12,104(%rsp)
- movq 56(%rsi),%rax
- mulq %rax
- addq %rax,%r13
- adcq $0,%rdx
+ xorq %rbx,%rbx
+ addq %r13,%r13
+ adcq $0,%rbx
- addq %rdx,%r14
+ mulq %rax
- movq %r13,112(%rsp)
- movq %r14,120(%rsp)
+ addq %rcx,%rax
+ addq %r13,%rax
+ adcq %rbx,%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
@@ -380,276 +379,12 @@ rsaz_512_sqr:
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
-
- call __rsaz_512_reduce
-
- addq 64(%rsp),%r8
- adcq 72(%rsp),%r9
- adcq 80(%rsp),%r10
- adcq 88(%rsp),%r11
- adcq 96(%rsp),%r12
- adcq 104(%rsp),%r13
- adcq 112(%rsp),%r14
- adcq 120(%rsp),%r15
- sbbq %rcx,%rcx
-
- call __rsaz_512_subtract
-
- movq %r8,%rdx
- movq %r9,%rax
- movl 128+8(%rsp),%r8d
- movq %rdi,%rsi
-
- decl %r8d
- jnz .Loop_sqr
- jmp .Lsqr_tail
-
-.align 32
-.Loop_sqrx:
- movl %r8d,128+8(%rsp)
-.byte 102,72,15,110,199
-.byte 102,72,15,110,205
-
- mulxq %rax,%r8,%r9
-
- mulxq 16(%rsi),%rcx,%r10
- xorq %rbp,%rbp
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rcx,%r9
-
- mulxq 32(%rsi),%rcx,%r12
- adcxq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rcx,%r11
-
-.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r12
- adcxq %rcx,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
- adcxq %rax,%r14
- adcxq %rbp,%r15
-
- movq %r9,%rcx
- shldq $1,%r8,%r9
- shlq $1,%r8
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rdx,%r8
- movq 8(%rsi),%rdx
- adcxq %rbp,%r9
-
- movq %rax,(%rsp)
- movq %r8,8(%rsp)
-
-
- mulxq 16(%rsi),%rax,%rbx
- adoxq %rax,%r10
- adcxq %rbx,%r11
-
-.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00
- adoxq %rdi,%r11
- adcxq %r8,%r12
-
- mulxq 32(%rsi),%rax,%rbx
- adoxq %rax,%r12
- adcxq %rbx,%r13
-
- mulxq 40(%rsi),%rdi,%r8
- adoxq %rdi,%r13
- adcxq %r8,%r14
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adoxq %rax,%r14
- adcxq %rbx,%r15
-
-.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
- adoxq %rdi,%r15
- adcxq %rbp,%r8
- adoxq %rbp,%r8
-
- movq %r11,%rbx
- shldq $1,%r10,%r11
- shldq $1,%rcx,%r10
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rcx
- movq 16(%rsi),%rdx
- adcxq %rax,%r9
- adcxq %rcx,%r10
- adcxq %rbp,%r11
-
- movq %r9,16(%rsp)
-.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
-
-
-.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00
- adoxq %rdi,%r12
- adcxq %r9,%r13
-
- mulxq 32(%rsi),%rax,%rcx
- adoxq %rax,%r13
- adcxq %rcx,%r14
-
- mulxq 40(%rsi),%rdi,%r9
- adoxq %rdi,%r14
- adcxq %r9,%r15
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
- adoxq %rax,%r15
- adcxq %rcx,%r8
-
-.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00
- adoxq %rdi,%r8
- adcxq %rbp,%r9
- adoxq %rbp,%r9
-
- movq %r13,%rcx
- shldq $1,%r12,%r13
- shldq $1,%rbx,%r12
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r11
- adcxq %rdx,%r12
- movq 24(%rsi),%rdx
- adcxq %rbp,%r13
-
- movq %r11,32(%rsp)
-.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00
-
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00
- adoxq %rax,%r14
- adcxq %rbx,%r15
-
- mulxq 40(%rsi),%rdi,%r10
- adoxq %rdi,%r15
- adcxq %r10,%r8
-
- mulxq 48(%rsi),%rax,%rbx
- adoxq %rax,%r8
- adcxq %rbx,%r9
-
- mulxq 56(%rsi),%rdi,%r10
- adoxq %rdi,%r9
- adcxq %rbp,%r10
- adoxq %rbp,%r10
-
-.byte 0x66
- movq %r15,%rbx
- shldq $1,%r14,%r15
- shldq $1,%rcx,%r14
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r13
- adcxq %rdx,%r14
- movq 32(%rsi),%rdx
- adcxq %rbp,%r15
-
- movq %r13,48(%rsp)
- movq %r14,56(%rsp)
-
-
-.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00
- adoxq %rdi,%r8
- adcxq %r11,%r9
-
- mulxq 48(%rsi),%rax,%rcx
- adoxq %rax,%r9
- adcxq %rcx,%r10
-
- mulxq 56(%rsi),%rdi,%r11
- adoxq %rdi,%r10
- adcxq %rbp,%r11
- adoxq %rbp,%r11
-
- movq %r9,%rcx
- shldq $1,%r8,%r9
- shldq $1,%rbx,%r8
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r15
- adcxq %rdx,%r8
- movq 40(%rsi),%rdx
- adcxq %rbp,%r9
-
- movq %r15,64(%rsp)
- movq %r8,72(%rsp)
-
-
-.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
- adoxq %rax,%r10
- adcxq %rbx,%r11
-
-.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
- adoxq %rdi,%r11
- adcxq %rbp,%r12
- adoxq %rbp,%r12
-
- movq %r11,%rbx
- shldq $1,%r10,%r11
- shldq $1,%rcx,%r10
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r9
- adcxq %rdx,%r10
- movq 48(%rsi),%rdx
- adcxq %rbp,%r11
-
- movq %r9,80(%rsp)
- movq %r10,88(%rsp)
-
-
-.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
- adoxq %rax,%r12
- adoxq %rbp,%r13
-
- xorq %r14,%r14
- shldq $1,%r13,%r14
- shldq $1,%r12,%r13
- shldq $1,%rbx,%r12
-
- xorl %ebp,%ebp
- mulxq %rdx,%rax,%rdx
- adcxq %rax,%r11
- adcxq %rdx,%r12
- movq 56(%rsi),%rdx
- adcxq %rbp,%r13
-
-.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
-.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
-
-
- mulxq %rdx,%rax,%rdx
- adoxq %rax,%r13
- adoxq %rbp,%rdx
-
-.byte 0x66
- addq %rdx,%r14
-
- movq %r13,112(%rsp)
- movq %r14,120(%rsp)
-.byte 102,72,15,126,199
.byte 102,72,15,126,205
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
+ movq %rax,112(%rsp)
+ movq %rdx,120(%rsp)
- call __rsaz_512_reducex
+ call __rsaz_512_reduce
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
@@ -669,9 +404,7 @@ rsaz_512_sqr:
movq %rdi,%rsi
decl %r8d
- jnz .Loop_sqrx
-
-.Lsqr_tail:
+ jnz .Loop_sqr
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
@@ -723,10 +456,6 @@ rsaz_512_mul:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
- movl $0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $0x80100,%r11d
- je .Lmulx
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul
@@ -744,29 +473,6 @@ rsaz_512_mul:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_tail
-
-.align 32
-.Lmulx:
- movq %rdx,%rbp
- movq (%rdx),%rdx
- call __rsaz_512_mulx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-.Lmul_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -880,10 +586,6 @@ rsaz_512_mul_gather4:
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
- movl $0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $0x80100,%r11d
- je .Lmulx_gather
.byte 102,76,15,126,195
movq %r8,128(%rsp)
@@ -1064,142 +766,6 @@ rsaz_512_mul_gather4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_gather_tail
-
-.align 32
-.Lmulx_gather:
-.byte 102,76,15,126,194
-
- movq %r8,128(%rsp)
- movq %rdi,128+8(%rsp)
- movq %rcx,128+16(%rsp)
-
- mulxq (%rsi),%rbx,%r8
- movq %rbx,(%rsp)
- xorl %edi,%edi
-
- mulxq 8(%rsi),%rax,%r9
-
- mulxq 16(%rsi),%rbx,%r10
- adcxq %rax,%r8
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rbx,%r9
-
- mulxq 32(%rsi),%rbx,%r12
- adcxq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rbx,%r11
-
- mulxq 48(%rsi),%rbx,%r14
- adcxq %rax,%r12
-
- mulxq 56(%rsi),%rax,%r15
- adcxq %rbx,%r13
- adcxq %rax,%r14
-.byte 0x67
- movq %r8,%rbx
- adcxq %rdi,%r15
-
- movq $-7,%rcx
- jmp .Loop_mulx_gather
-
-.align 32
-.Loop_mulx_gather:
- movdqa 0(%rbp),%xmm8
- movdqa 16(%rbp),%xmm9
- movdqa 32(%rbp),%xmm10
- movdqa 48(%rbp),%xmm11
- pand %xmm0,%xmm8
- movdqa 64(%rbp),%xmm12
- pand %xmm1,%xmm9
- movdqa 80(%rbp),%xmm13
- pand %xmm2,%xmm10
- movdqa 96(%rbp),%xmm14
- pand %xmm3,%xmm11
- movdqa 112(%rbp),%xmm15
- leaq 128(%rbp),%rbp
- pand %xmm4,%xmm12
- pand %xmm5,%xmm13
- pand %xmm6,%xmm14
- pand %xmm7,%xmm15
- por %xmm10,%xmm8
- por %xmm11,%xmm9
- por %xmm12,%xmm8
- por %xmm13,%xmm9
- por %xmm14,%xmm8
- por %xmm15,%xmm9
-
- por %xmm9,%xmm8
- pshufd $0x4e,%xmm8,%xmm9
- por %xmm9,%xmm8
-.byte 102,76,15,126,194
-
-.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rsi),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rsi),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
-.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
- adcxq %rax,%r10
- adoxq %r12,%r11
-
- mulxq 32(%rsi),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
-.byte 0x67
- adoxq %r15,%r14
-
- mulxq 56(%rsi),%rax,%r15
- movq %rbx,64(%rsp,%rcx,8)
- adcxq %rax,%r14
- adoxq %rdi,%r15
- movq %r8,%rbx
- adcxq %rdi,%r15
-
- incq %rcx
- jnz .Loop_mulx_gather
-
- movq %r8,64(%rsp)
- movq %r9,64+8(%rsp)
- movq %r10,64+16(%rsp)
- movq %r11,64+24(%rsp)
- movq %r12,64+32(%rsp)
- movq %r13,64+40(%rsp)
- movq %r14,64+48(%rsp)
- movq %r15,64+56(%rsp)
-
- movq 128(%rsp),%rdx
- movq 128+8(%rsp),%rdi
- movq 128+16(%rsp),%rbp
-
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
-.Lmul_gather_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -1267,10 +833,6 @@ rsaz_512_mul_scatter4:
movq %rcx,128(%rsp)
movq %rdi,%rbp
- movl $0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl $0x80100,%r11d
- je .Lmulx_scatter
movq (%rdi),%rbx
call __rsaz_512_mul
@@ -1287,29 +849,6 @@ rsaz_512_mul_scatter4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
- jmp .Lmul_scatter_tail
-
-.align 32
-.Lmulx_scatter:
- movq (%rdi),%rdx
- call __rsaz_512_mulx
-
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
-
- movq 128(%rsp),%rdx
- movq (%rsp),%r8
- movq 8(%rsp),%r9
- movq 16(%rsp),%r10
- movq 24(%rsp),%r11
- movq 32(%rsp),%r12
- movq 40(%rsp),%r13
- movq 48(%rsp),%r14
- movq 56(%rsp),%r15
-
- call __rsaz_512_reducex
-
-.Lmul_scatter_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -1379,7 +918,6 @@ rsaz_512_mul_by_one:
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_by_one_body:
- movl OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
movq %rcx,128(%rsp)
@@ -1400,16 +938,7 @@ rsaz_512_mul_by_one:
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
- andl $0x80100,%eax
- cmpl $0x80100,%eax
- je .Lby_one_callx
call __rsaz_512_reduce
- jmp .Lby_one_tail
-.align 32
-.Lby_one_callx:
- movq 128(%rsp),%rdx
- call __rsaz_512_reducex
-.Lby_one_tail:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
@@ -1442,6 +971,7 @@ rsaz_512_mul_by_one:
.type __rsaz_512_reduce,@function
.align 32
__rsaz_512_reduce:
+.cfi_startproc
movq %r8,%rbx
imulq 128+8(%rsp),%rbx
movq 0(%rbp),%rax
@@ -1521,66 +1051,12 @@ __rsaz_512_reduce:
jne .Lreduction_loop
.byte 0xf3,0xc3
+.cfi_endproc
.size __rsaz_512_reduce,.-__rsaz_512_reduce
-.type __rsaz_512_reducex,@function
-.align 32
-__rsaz_512_reducex:
-
- imulq %r8,%rdx
- xorq %rsi,%rsi
- movl $8,%ecx
- jmp .Lreduction_loopx
-
-.align 32
-.Lreduction_loopx:
- movq %r8,%rbx
- mulxq 0(%rbp),%rax,%r8
- adcxq %rbx,%rax
- adoxq %r9,%r8
-
- mulxq 8(%rbp),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rbp),%rbx,%r10
- adcxq %rbx,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rbp),%rbx,%r11
- adcxq %rbx,%r10
- adoxq %r12,%r11
-
-.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
- movq %rdx,%rax
- movq %r8,%rdx
- adcxq %rbx,%r11
- adoxq %r13,%r12
-
- mulxq 128+8(%rsp),%rbx,%rdx
- movq %rax,%rdx
-
- mulxq 40(%rbp),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rbp),%rax,%r15
- movq %rbx,%rdx
- adcxq %rax,%r14
- adoxq %rsi,%r15
- adcxq %rsi,%r15
-
- decl %ecx
- jne .Lreduction_loopx
-
- .byte 0xf3,0xc3
-.size __rsaz_512_reducex,.-__rsaz_512_reducex
.type __rsaz_512_subtract,@function
.align 32
__rsaz_512_subtract:
+.cfi_startproc
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
@@ -1634,10 +1110,12 @@ __rsaz_512_subtract:
movq %r15,56(%rdi)
.byte 0xf3,0xc3
+.cfi_endproc
.size __rsaz_512_subtract,.-__rsaz_512_subtract
.type __rsaz_512_mul,@function
.align 32
__rsaz_512_mul:
+.cfi_startproc
leaq 8(%rsp),%rdi
movq (%rsi),%rax
@@ -1776,131 +1254,13 @@ __rsaz_512_mul:
movq %r15,56(%rdi)
.byte 0xf3,0xc3
+.cfi_endproc
.size __rsaz_512_mul,.-__rsaz_512_mul
-.type __rsaz_512_mulx,@function
-.align 32
-__rsaz_512_mulx:
- mulxq (%rsi),%rbx,%r8
- movq $-6,%rcx
-
- mulxq 8(%rsi),%rax,%r9
- movq %rbx,8(%rsp)
-
- mulxq 16(%rsi),%rbx,%r10
- adcq %rax,%r8
-
- mulxq 24(%rsi),%rax,%r11
- adcq %rbx,%r9
-
- mulxq 32(%rsi),%rbx,%r12
- adcq %rax,%r10
-
- mulxq 40(%rsi),%rax,%r13
- adcq %rbx,%r11
-
- mulxq 48(%rsi),%rbx,%r14
- adcq %rax,%r12
-
- mulxq 56(%rsi),%rax,%r15
- movq 8(%rbp),%rdx
- adcq %rbx,%r13
- adcq %rax,%r14
- adcq $0,%r15
-
- xorq %rdi,%rdi
- jmp .Loop_mulx
-
-.align 32
-.Loop_mulx:
- movq %r8,%rbx
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
- mulxq 8(%rsi),%rax,%r9
- adcxq %rax,%r8
- adoxq %r10,%r9
-
- mulxq 16(%rsi),%rax,%r10
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
-.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
- mulxq 48(%rsi),%rax,%r14
- adcxq %rax,%r13
- adoxq %r15,%r14
-
- mulxq 56(%rsi),%rax,%r15
- movq 64(%rbp,%rcx,8),%rdx
- movq %rbx,8+64-8(%rsp,%rcx,8)
- adcxq %rax,%r14
- adoxq %rdi,%r15
- adcxq %rdi,%r15
-
- incq %rcx
- jnz .Loop_mulx
-
- movq %r8,%rbx
- mulxq (%rsi),%rax,%r8
- adcxq %rax,%rbx
- adoxq %r9,%r8
-
-.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
- adcxq %rax,%r8
- adoxq %r10,%r9
-
-.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
- adcxq %rax,%r9
- adoxq %r11,%r10
-
- mulxq 24(%rsi),%rax,%r11
- adcxq %rax,%r10
- adoxq %r12,%r11
-
- mulxq 32(%rsi),%rax,%r12
- adcxq %rax,%r11
- adoxq %r13,%r12
-
- mulxq 40(%rsi),%rax,%r13
- adcxq %rax,%r12
- adoxq %r14,%r13
-
-.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
- adcxq %rax,%r13
- adoxq %r15,%r14
-
-.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
- adcxq %rax,%r14
- adoxq %rdi,%r15
- adcxq %rdi,%r15
-
- movq %rbx,8+64-8(%rsp)
- movq %r8,8+64(%rsp)
- movq %r9,8+64+8(%rsp)
- movq %r10,8+64+16(%rsp)
- movq %r11,8+64+24(%rsp)
- movq %r12,8+64+32(%rsp)
- movq %r13,8+64+40(%rsp)
- movq %r14,8+64+48(%rsp)
- movq %r15,8+64+56(%rsp)
-
- .byte 0xf3,0xc3
-.size __rsaz_512_mulx,.-__rsaz_512_mulx
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,@function
.align 16
rsaz_512_scatter4:
+.cfi_startproc
leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp .Loop_scatter
@@ -1913,12 +1273,14 @@ rsaz_512_scatter4:
decl %r9d
jnz .Loop_scatter
.byte 0xf3,0xc3
+.cfi_endproc
.size rsaz_512_scatter4,.-rsaz_512_scatter4
.globl rsaz_512_gather4
.type rsaz_512_gather4,@function
.align 16
rsaz_512_gather4:
+.cfi_startproc
movd %edx,%xmm8
movdqa .Linc+16(%rip),%xmm1
movdqa .Linc(%rip),%xmm0
@@ -1982,6 +1344,7 @@ rsaz_512_gather4:
jnz .Loop_gather
.byte 0xf3,0xc3
.LSEH_end_rsaz_512_gather4:
+.cfi_endproc
.size rsaz_512_gather4,.-rsaz_512_gather4
.align 64
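With the reducex variant gone, __rsaz_512_reduce is the single reduction path. As the retained code above shows, each of its eight passes multiplies the low limb of the running value by n0 (the precomputed -N^-1 mod 2^64 that callers store at 128(%rsp), addressed as 128+8(%rsp) from inside the call), adds that multiple of the modulus at (%rbp), and shifts everything down one limb; the callers then add the product's upper half from 64(%rsp)..120(%rsp) and finish with __rsaz_512_subtract. A minimal C sketch of that word-by-word Montgomery reduction, assuming a compiler with unsigned __int128 (names are illustrative, not from the source):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Word-by-word Montgomery reduction over eight 64-bit limbs,
 * modelled on __rsaz_512_reduce.  r[] is the low half of the
 * 1024-bit product, n[] the modulus, n0 = -N^-1 mod 2^64. */
static void mont_reduce_512(uint64_t r[8], const uint64_t n[8],
                            uint64_t n0)
{
    for (int i = 0; i < 8; i++) {
        uint64_t m = r[0] * n0;              /* imulq 128+8(%rsp),%rbx */
        /* r[0] + m*n[0] == 0 mod 2^64 by construction of n0;
         * only the carry out of that limb survives. */
        u128 acc = (u128)m * n[0] + r[0];
        uint64_t carry = (uint64_t)(acc >> 64);
        for (int j = 1; j < 8; j++) {        /* mulq 8(%rbp)..56(%rbp) */
            acc = (u128)m * n[j] + r[j] + carry;
            r[j - 1] = (uint64_t)acc;
            carry = (uint64_t)(acc >> 64);
        }
        r[7] = carry;                        /* value shifts down a limb */
    }
    /* Callers add the product's high half and conditionally subtract N
     * (__rsaz_512_subtract) to bring the result below the modulus. */
}

Since r and N are both below 2^512, r + m*N never exceeds (2^512 - 1) * 2^64 on any pass, so the value shifted down by one limb always fits back into eight limbs and no carry escapes the loop.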