Diffstat (limited to 'crypto/rc4')
-rw-r--r--   crypto/rc4/Makefile           |   4
-rw-r--r--   crypto/rc4/asm/rc4-586.pl     |   4
-rw-r--r--   crypto/rc4/asm/rc4-ia64.S     |   5
-rwxr-xr-x   crypto/rc4/asm/rc4-x86_64.pl  | 130
-rw-r--r--   crypto/rc4/rc4_skey.c         |   9
5 files changed, 138 insertions, 14 deletions
diff --git a/crypto/rc4/Makefile b/crypto/rc4/Makefile
index 7857c95fbfcdf..187ed5c6684a7 100644
--- a/crypto/rc4/Makefile
+++ b/crypto/rc4/Makefile
@@ -10,7 +10,7 @@ INCLUDES=
 CFLAG=-g
 AR=		ar r
 
-RC4_ENC=rc4_enc.o
+RC4_ENC=rc4_enc.o rc4_skey.o
 
 CFLAGS= $(INCLUDES) $(CFLAG)
 ASFLAGS= $(INCLUDES) $(ASFLAG)
@@ -22,7 +22,7 @@ APPS=
 
 LIB=$(TOP)/libcrypto.a
 LIBSRC=rc4_skey.c rc4_enc.c
-LIBOBJ=rc4_skey.o $(RC4_ENC)
+LIBOBJ=$(RC4_ENC)
 
 SRC= $(LIBSRC)
 
diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl
index 22bda4b451e99..ef7eee766cb96 100644
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@@ -212,11 +212,11 @@ sub RC4
 	&movz	($ty,&BP(0,$d,$ty));
 	&add	(&LB($x),1);
 	&xorb	(&LB($ty),&BP(0,$in));
-	&lea	($in,&BP(1,$in));
+	&lea	($in,&DWP(1,$in));
 	&movz	($tx,&BP(0,$d,$x));
 	&cmp	($in,&swtmp(2));
 	&movb	(&BP(0,$out),&LB($ty));
-	&lea	($out,&BP(1,$out));
+	&lea	($out,&DWP(1,$out));
 	&jb	(&label("RC4_CHAR_loop"));
 
 &set_label("finished");
diff --git a/crypto/rc4/asm/rc4-ia64.S b/crypto/rc4/asm/rc4-ia64.S
index a322d0c718e50..8210c47d049d3 100644
--- a/crypto/rc4/asm/rc4-ia64.S
+++ b/crypto/rc4/asm/rc4-ia64.S
@@ -75,14 +75,13 @@ yy=r31;
 	.skip	16
 RC4:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
-	.save	ar.lc,r3
-	.save	pr,prsave
 { .mii;	alloc	r2=ar.pfs,4,12,0,16
+	.save	pr,prsave
 	mov	prsave=pr
 	ADDP	key=0,in0		};;
 { .mib;	cmp.eq	p6,p0=0,in1		// len==0?
+	.save	ar.lc,r3
 	mov	r3=ar.lc
 (p6)	br.ret.spnt.many	b0	};;	// emergency exit
 
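The two &lea operand fixes above sit inside rc4-586.pl's RC4_CHAR loop, which keeps the 256-entry state as one byte per element. For reference, a minimal C sketch of that byte-per-entry ("char") RC4 loop follows; the function and variable names (rc4_char_model, state) are illustrative and do not come from this commit.

#include <stddef.h>

/*
 * Minimal C model of the byte-per-entry ("char") RC4 loop that the
 * RC4_CHAR assembly paths implement: x and y are the usual RC4
 * indices, state[] is the 256-byte key schedule.  Illustrative names
 * only, not code from this commit.
 */
static void rc4_char_model(unsigned char state[256],
                           unsigned char *xp, unsigned char *yp,
                           const unsigned char *in, unsigned char *out,
                           size_t len)
{
    unsigned char x = *xp, y = *yp;

    while (len--) {
        unsigned char tx, ty;

        x = (unsigned char)(x + 1);     /* i := i + 1 (mod 256) */
        tx = state[x];
        y = (unsigned char)(y + tx);    /* j := j + S[i] (mod 256) */
        ty = state[y];
        state[y] = tx;                  /* swap S[i] and S[j] */
        state[x] = ty;
        *out++ = *in++ ^ state[(unsigned char)(tx + ty)];
    }
    *xp = x;
    *yp = y;
}
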
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 4b990cba077e2..2d473204854c2 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -2,8 +2,9 @@
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
@@ -49,8 +50,22 @@
 # is not implemented, then this final RC4_CHAR code-path should be
 # preferred, as it provides better *all-round* performance].
 
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
+
 $output=shift;
-open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
 
 $dat="%rdi";	# arg1
 $len="%rsi";	# arg2
@@ -152,6 +167,8 @@ $code.=<<___;
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
+	cmp	\$0,260($dat)
+	jnz	.Lcloop1
 	push	%rbx
 	jmp	.Lcloop8
 .align	16
@@ -221,6 +238,8 @@ $code.=<<___;
 	movb	$TY#b,($dat,$XX[0])
 	add	$TX[0]#b,$TY#b
 	add	\$1,$XX[0]#b
+	movzb	$TY#b,$TY#d
+	movzb	$XX[0]#b,$XX[0]#d
 	movzb	($dat,$TY),$TY#d
 	movzb	($dat,$XX[0]),$TX[0]#d
 	xorb	($inp),$TY#b
@@ -233,6 +252,111 @@ $code.=<<___;
 .size	RC4,.-RC4
 ___
 
+$idx="%r8";
+$ido="%r9";
+
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat
+	lea	($inp,$len),$inp
+	neg	$len
+	mov	$len,%rcx
+	xor	%eax,%eax
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
+
+	mov	OPENSSL_ia32cap_P(%rip),$idx#d
+	bt	\$20,$idx#d
+	jnc	.Lw1stloop
+	bt	\$30,$idx#d
+	setc	$ido#b
+	mov	$ido#d,260($dat)
+	jmp	.Lc1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)
+	add	\$1,%al
+	jnc	.Lw1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d
+	add	($inp,$len,1),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx,4),%r11d
+	cmovz	%rcx,$len
+	mov	%r10d,($dat,$idx,4)
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop
+	jmp	.Lexit_key
+
+.align	16
+.Lc1stloop:
+	mov	%al,($dat,%rax)
+	add	\$1,%al
+	jnc	.Lc1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lc2ndloop:
+	mov	($dat,$ido),%r10b
+	add	($inp,$len),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx),%r11b
+	jnz	.Lcnowrap
+	mov	%rcx,$len
+.Lcnowrap:
+	mov	%r10b,($dat,$idx)
+	mov	%r11b,($dat,$ido)
+	add	\$1,$ido#b
+	jnc	.Lc2ndloop
+	movl	\$-1,256($dat)
+
+.align	16
+.Lexit_key:
+	xor	%eax,%eax
+	mov	%eax,-8($dat)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@function,0
+.align	16
+RC4_options:
+	.picmeup %rax
+	lea	.Lopts-.(%rax),%rax
+	mov	OPENSSL_ia32cap_P(%rip),%edx
+	bt	\$20,%edx
+	jnc	.Ldone
+	add	\$12,%rax
+	bt	\$30,%edx
+	jnc	.Ldone
+	add	\$13,%rax
+.Ldone:
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(8x,int)"
+.asciz	"rc4(8x,char)"
+.asciz	"rc4(1x,char)"
+.asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+
 $code =~ s/#([bwd])/$1/gm;
 
 print $code;
diff --git a/crypto/rc4/rc4_skey.c b/crypto/rc4/rc4_skey.c
index b22c40b0bd064..46b77ec32169e 100644
--- a/crypto/rc4/rc4_skey.c
+++ b/crypto/rc4/rc4_skey.c
@@ -119,14 +119,15 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
 	 * implementations suffer from significant performance
 	 * losses then, e.g. PIII exhibits >2x deterioration,
 	 * and so does Opteron. In order to assure optimal
-	 * all-round performance, let us [try to] detect P4 at
-	 * run-time by checking upon HTT bit in CPU capability
+	 * all-round performance, we detect P4 at run-time by
+	 * checking upon reserved bit 20 in CPU capability
 	 * vector and set up compressed key schedule, which is
 	 * recognized by correspondingly updated assembler
-	 * module...
+	 * module... Bit 20 is set up by OPENSSL_ia32_cpuid.
+	 *
 	 * <appro@fy.chalmers.se>
 	 */
-	if (OPENSSL_ia32cap_P & (1<<28)) {
+	if (OPENSSL_ia32cap_P & (1<<20)) {
 		unsigned char *cp=(unsigned char *)d;
 		for (i=0;i<256;i++)
 			cp[i]=i;
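The rc4_skey.c hunk above switches the run-time P4 test to reserved bit 20 of OPENSSL_ia32cap_P and, when that bit is set, lays the key schedule out as one byte per entry so the updated assembler modules recognize it. As a rough illustration of that compressed set-up, here is the standard RC4 key-scheduling over a byte array; the function name rc4_char_set_key is invented for this sketch and is not the code added by the commit.

#include <stddef.h>

/*
 * Illustrative sketch of building a "compressed" (byte-per-entry)
 * RC4 key schedule, the layout selected when bit 20 of
 * OPENSSL_ia32cap_P is set.  Hypothetical name and signature; the
 * real code lives in rc4_skey.c and rc4-x86_64.pl.  keylen must be
 * non-zero.
 */
static void rc4_char_set_key(unsigned char state[256],
                             const unsigned char *key, size_t keylen)
{
    unsigned int i, j = 0;

    for (i = 0; i < 256; i++)           /* identity permutation, one byte per slot */
        state[i] = (unsigned char)i;

    for (i = 0; i < 256; i++) {         /* standard RC4 key-scheduling swap */
        unsigned char tmp = state[i];

        j = (j + tmp + key[i % keylen]) & 0xff;
        state[i] = state[j];
        state[j] = tmp;
    }
}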