Diffstat (limited to 'crypto/rc4')
-rw-r--r--  crypto/rc4/Makefile            4
-rw-r--r--  crypto/rc4/asm/rc4-586.pl      4
-rw-r--r--  crypto/rc4/asm/rc4-ia64.S      5
-rwxr-xr-x  crypto/rc4/asm/rc4-x86_64.pl   130
-rw-r--r--  crypto/rc4/rc4_skey.c          9
5 files changed, 138 insertions, 14 deletions
diff --git a/crypto/rc4/Makefile b/crypto/rc4/Makefile
index 7857c95fbfcdf..187ed5c6684a7 100644
--- a/crypto/rc4/Makefile
+++ b/crypto/rc4/Makefile
@@ -10,7 +10,7 @@ INCLUDES=
CFLAG=-g
AR= ar r
-RC4_ENC=rc4_enc.o
+RC4_ENC=rc4_enc.o rc4_skey.o
CFLAGS= $(INCLUDES) $(CFLAG)
ASFLAGS= $(INCLUDES) $(ASFLAG)
@@ -22,7 +22,7 @@ APPS=
LIB=$(TOP)/libcrypto.a
LIBSRC=rc4_skey.c rc4_enc.c
-LIBOBJ=rc4_skey.o $(RC4_ENC)
+LIBOBJ=$(RC4_ENC)
SRC= $(LIBSRC)
diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl
index 22bda4b451e99..ef7eee766cb96 100644
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@@ -212,11 +212,11 @@ sub RC4
&movz ($ty,&BP(0,$d,$ty));
&add (&LB($x),1);
&xorb (&LB($ty),&BP(0,$in));
- &lea ($in,&BP(1,$in));
+ &lea ($in,&DWP(1,$in));
&movz ($tx,&BP(0,$d,$x));
&cmp ($in,&swtmp(2));
&movb (&BP(0,$out),&LB($ty));
- &lea ($out,&BP(1,$out));
+ &lea ($out,&DWP(1,$out));
&jb (&label("RC4_CHAR_loop"));
&set_label("finished");
diff --git a/crypto/rc4/asm/rc4-ia64.S b/crypto/rc4/asm/rc4-ia64.S
index a322d0c718e50..8210c47d049d3 100644
--- a/crypto/rc4/asm/rc4-ia64.S
+++ b/crypto/rc4/asm/rc4-ia64.S
@@ -75,14 +75,13 @@ yy=r31;
.skip 16
RC4:
.prologue
- .fframe 0
.save ar.pfs,r2
- .save ar.lc,r3
- .save pr,prsave
{ .mii; alloc r2=ar.pfs,4,12,0,16
+ .save pr,prsave
mov prsave=pr
ADDP key=0,in0 };;
{ .mib; cmp.eq p6,p0=0,in1 // len==0?
+ .save ar.lc,r3
mov r3=ar.lc
(p6) br.ret.spnt.many b0 };; // emergency exit
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 4b990cba077e2..2d473204854c2 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -2,8 +2,9 @@
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
@@ -49,8 +50,22 @@
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
+
$output=shift;
-open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
@@ -152,6 +167,8 @@ $code.=<<___;
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
+ cmp \$0,260($dat)
+ jnz .Lcloop1
push %rbx
jmp .Lcloop8
.align 16
@@ -221,6 +238,8 @@ $code.=<<___;
movb $TY#b,($dat,$XX[0])
add $TX[0]#b,$TY#b
add \$1,$XX[0]#b
+ movzb $TY#b,$TY#d
+ movzb $XX[0]#b,$XX[0]#d
movzb ($dat,$TY),$TY#d
movzb ($dat,$XX[0]),$TX[0]#d
xorb ($inp),$TY#b
@@ -233,6 +252,111 @@ $code.=<<___;
.size RC4,.-RC4
___
+$idx="%r8";
+$ido="%r9";
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl RC4_set_key
+.type RC4_set_key,\@function,3
+.align 16
+RC4_set_key:
+ lea 8($dat),$dat
+ lea ($inp,$len),$inp
+ neg $len
+ mov $len,%rcx
+ xor %eax,%eax
+ xor $ido,$ido
+ xor %r10,%r10
+ xor %r11,%r11
+
+ mov OPENSSL_ia32cap_P(%rip),$idx#d
+ bt \$20,$idx#d
+ jnc .Lw1stloop
+ bt \$30,$idx#d
+ setc $ido#b
+ mov $ido#d,260($dat)
+ jmp .Lc1stloop
+
+.align 16
+.Lw1stloop:
+ mov %eax,($dat,%rax,4)
+ add \$1,%al
+ jnc .Lw1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lw2ndloop:
+ mov ($dat,$ido,4),%r10d
+ add ($inp,$len,1),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx,4),%r11d
+ cmovz %rcx,$len
+ mov %r10d,($dat,$idx,4)
+ mov %r11d,($dat,$ido,4)
+ add \$1,$ido#b
+ jnc .Lw2ndloop
+ jmp .Lexit_key
+
+.align 16
+.Lc1stloop:
+ mov %al,($dat,%rax)
+ add \$1,%al
+ jnc .Lc1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lc2ndloop:
+ mov ($dat,$ido),%r10b
+ add ($inp,$len),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx),%r11b
+ jnz .Lcnowrap
+ mov %rcx,$len
+.Lcnowrap:
+ mov %r10b,($dat,$idx)
+ mov %r11b,($dat,$ido)
+ add \$1,$ido#b
+ jnc .Lc2ndloop
+ movl \$-1,256($dat)
+
+.align 16
+.Lexit_key:
+ xor %eax,%eax
+ mov %eax,-8($dat)
+ mov %eax,-4($dat)
+ ret
+.size RC4_set_key,.-RC4_set_key
+
+.globl RC4_options
+.type RC4_options,\@function,0
+.align 16
+RC4_options:
+ .picmeup %rax
+ lea .Lopts-.(%rax),%rax
+ mov OPENSSL_ia32cap_P(%rip),%edx
+ bt \$20,%edx
+ jnc .Ldone
+ add \$12,%rax
+ bt \$30,%edx
+ jnc .Ldone
+ add \$13,%rax
+.Ldone:
+ ret
+.align 64
+.Lopts:
+.asciz "rc4(8x,int)"
+.asciz "rc4(8x,char)"
+.asciz "rc4(1x,char)"
+.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+.size RC4_options,.-RC4_options
+___
+
$code =~ s/#([bwd])/$1/gm;
print $code;
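For reference, the .Lc1stloop/.Lc2ndloop pair in the RC4_set_key routine added above is the standard RC4 key-scheduling algorithm run over a byte-per-entry state array. Below is a minimal C sketch of the same computation; the function name is ours, not OpenSSL's, and it assumes len > 0.

#include <stddef.h>

static void rc4_ksa_sketch(unsigned char S[256],
                           const unsigned char *key, size_t len)
{
    unsigned int i, j = 0, k = 0;

    for (i = 0; i < 256; i++)        /* .Lc1stloop: identity permutation */
        S[i] = (unsigned char)i;

    for (i = 0; i < 256; i++) {      /* .Lc2ndloop: key-dependent swaps */
        unsigned char tmp = S[i];

        j = (j + tmp + key[k]) & 0xff;
        if (++k == len)              /* wrap the key index, cf. .Lcnowrap */
            k = 0;
        S[i] = S[j];
        S[j] = tmp;
    }
}

The .Lw1stloop/.Lw2ndloop pair performs the identical permutation but keeps each state byte in a 32-bit slot, matching the default RC4_INT layout. The two movzb instructions added to cloop1 simply zero-extend $TY and $XX[0] into full registers before they are used as table indices, which is what sidesteps the Core2 partial-register stall described in the header comment.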
diff --git a/crypto/rc4/rc4_skey.c b/crypto/rc4/rc4_skey.c
index b22c40b0bd064..46b77ec32169e 100644
--- a/crypto/rc4/rc4_skey.c
+++ b/crypto/rc4/rc4_skey.c
@@ -119,14 +119,15 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
* implementations suffer from significant performance
* losses then, e.g. PIII exhibits >2x deterioration,
* and so does Opteron. In order to assure optimal
- * all-round performance, let us [try to] detect P4 at
- * run-time by checking upon HTT bit in CPU capability
+ * all-round performance, we detect P4 at run-time by
+ * checking upon reserved bit 20 in CPU capability
* vector and set up compressed key schedule, which is
* recognized by correspondingly updated assembler
- * module...
+ * module... Bit 20 is set up by OPENSSL_ia32_cpuid.
+ *
* <appro@fy.chalmers.se>
*/
- if (OPENSSL_ia32cap_P & (1<<28)) {
+ if (OPENSSL_ia32cap_P & (1<<20)) {
unsigned char *cp=(unsigned char *)d;
for (i=0;i<256;i++) cp[i]=i;
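For context, the run-time layout selection that this last hunk adjusts has the following general shape. This is a paraphrased sketch rather than the verbatim rc4_skey.c body: the capability word is passed in as a parameter so we do not have to assume the exact extern declaration of OPENSSL_ia32cap_P, and the helper name is ours. Bit 20 is set by OPENSSL_ia32_cpuid on P4-class CPUs, per the comment above.

#include <openssl/rc4.h>        /* RC4_KEY, RC4_INT */

static void rc4_key_layout_sketch(RC4_KEY *key, unsigned long ia32cap)
{
    RC4_INT *d = key->data;
    unsigned int i;

    key->x = 0;
    key->y = 0;

    if (ia32cap & (1 << 20)) {
        /* P4 detected: lay the state down as 256 plain bytes (the
         * "compressed" schedule the updated assembler module expects). */
        unsigned char *cp = (unsigned char *)d;

        for (i = 0; i < 256; i++)
            cp[i] = (unsigned char)i;
        /* ...followed by the key-dependent swaps of the KSA sketch
         * above, performed over cp[] */
    } else {
        /* Default layout: one RC4_INT per state byte. */
        for (i = 0; i < 256; i++)
            d[i] = (RC4_INT)i;
        /* ...same swaps, performed over d[] */
    }
}

The assembler RC4_set_key added above makes the same decision inside the x86_64 module itself, which also appears to be the motivation for the Makefile hunk: rc4_skey.o moves out of the unconditional LIBOBJ list into the platform-selected RC4_ENC list, so a platform whose assembler module provides RC4_set_key can drop the C fallback.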