Diffstat (limited to 'crypto/rc4')
-rw-r--r--   crypto/rc4/Makefile           |   4
-rw-r--r--   crypto/rc4/asm/rc4-586.pl     |   4
-rw-r--r--   crypto/rc4/asm/rc4-ia64.S     |   5
-rwxr-xr-x   crypto/rc4/asm/rc4-x86_64.pl  | 130
-rw-r--r--   crypto/rc4/rc4_skey.c         |   9
5 files changed, 138 insertions, 14 deletions
diff --git a/crypto/rc4/Makefile b/crypto/rc4/Makefile
index 7857c95fbfcdf..187ed5c6684a7 100644
--- a/crypto/rc4/Makefile
+++ b/crypto/rc4/Makefile
@@ -10,7 +10,7 @@ INCLUDES=
 CFLAG=-g
 AR=		ar r
 
-RC4_ENC=rc4_enc.o
+RC4_ENC=rc4_enc.o rc4_skey.o
 
 CFLAGS= $(INCLUDES) $(CFLAG)
 ASFLAGS= $(INCLUDES) $(ASFLAG)
@@ -22,7 +22,7 @@ APPS=
 
 LIB=$(TOP)/libcrypto.a
 LIBSRC=rc4_skey.c rc4_enc.c
-LIBOBJ=rc4_skey.o $(RC4_ENC)
+LIBOBJ=$(RC4_ENC)
 
 SRC= $(LIBSRC)
 
diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl
index 22bda4b451e99..ef7eee766cb96 100644
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@@ -212,11 +212,11 @@ sub RC4
 	&movz	($ty,&BP(0,$d,$ty));
 	&add	(&LB($x),1);
 	&xorb	(&LB($ty),&BP(0,$in));
-	&lea	($in,&BP(1,$in));
+	&lea	($in,&DWP(1,$in));
 	&movz	($tx,&BP(0,$d,$x));
 	&cmp	($in,&swtmp(2));
 	&movb	(&BP(0,$out),&LB($ty));
-	&lea	($out,&BP(1,$out));
+	&lea	($out,&DWP(1,$out));
 	&jb	(&label("RC4_CHAR_loop"));
 
 &set_label("finished");
diff --git a/crypto/rc4/asm/rc4-ia64.S b/crypto/rc4/asm/rc4-ia64.S
index a322d0c718e50..8210c47d049d3 100644
--- a/crypto/rc4/asm/rc4-ia64.S
+++ b/crypto/rc4/asm/rc4-ia64.S
@@ -75,14 +75,13 @@ yy=r31;
 	.skip	16
 RC4:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
-	.save	ar.lc,r3
-	.save	pr,prsave
 { .mii;	alloc	r2=ar.pfs,4,12,0,16
+	.save	pr,prsave
 	mov	prsave=pr
 	ADDP	key=0,in0		};;
 { .mib;	cmp.eq	p6,p0=0,in1		// len==0?
+	.save	ar.lc,r3
 	mov	r3=ar.lc
 (p6)	br.ret.spnt.many	b0	};;	// emergency exit
 
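The two &lea operand fixes above sit inside rc4-586.pl's RC4_CHAR loop, which keeps the 256-entry state as one byte per element. For reference, a minimal C sketch of that byte-per-entry ("char") RC4 loop follows; the function and variable names (rc4_char_model, state) are illustrative and do not come from this commit.

#include <stddef.h>

/*
 * Minimal C model of the byte-per-entry ("char") RC4 loop that the
 * RC4_CHAR assembly paths implement: x and y are the usual RC4
 * indices, state[] is the 256-byte key schedule.  Illustrative names
 * only, not code from this commit.
 */
static void rc4_char_model(unsigned char state[256],
                           unsigned char *xp, unsigned char *yp,
                           const unsigned char *in, unsigned char *out,
                           size_t len)
{
    unsigned char x = *xp, y = *yp;

    while (len--) {
        unsigned char tx, ty;

        x = (unsigned char)(x + 1);     /* i := i + 1 (mod 256) */
        tx = state[x];
        y = (unsigned char)(y + tx);    /* j := j + S[i] (mod 256) */
        ty = state[y];
        state[y] = tx;                  /* swap S[i] and S[j] */
        state[x] = ty;
        *out++ = *in++ ^ state[(unsigned char)(tx + ty)];
    }
    *xp = x;
    *yp = y;
}
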
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 4b990cba077e2..2d473204854c2 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -2,8 +2,9 @@
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
@@ -49,8 +50,22 @@
 # is not implemented, then this final RC4_CHAR code-path should be
 # preferred, as it provides better *all-round* performance].
 
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
+
 $output=shift;
-open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
 
 $dat="%rdi";	# arg1
 $len="%rsi";	# arg2
@@ -152,6 +167,8 @@ $code.=<<___;
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
+	cmp	\$0,260($dat)
+	jnz	.Lcloop1
 	push	%rbx
 	jmp	.Lcloop8
 .align	16
@@ -221,6 +238,8 @@ $code.=<<___;
 	movb	$TY#b,($dat,$XX[0])
 	add	$TX[0]#b,$TY#b
 	add	\$1,$XX[0]#b
+	movzb	$TY#b,$TY#d
+	movzb	$XX[0]#b,$XX[0]#d
 	movzb	($dat,$TY),$TY#d
 	movzb	($dat,$XX[0]),$TX[0]#d
 	xorb	($inp),$TY#b
@@ -233,6 +252,111 @@ $code.=<<___;
 .size	RC4,.-RC4
 ___
 
+$idx="%r8";
+$ido="%r9";
+
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat
+	lea	($inp,$len),$inp
+	neg	$len
+	mov	$len,%rcx
+	xor	%eax,%eax
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
+
+	mov	OPENSSL_ia32cap_P(%rip),$idx#d
+	bt	\$20,$idx#d
+	jnc	.Lw1stloop
+	bt	\$30,$idx#d
+	setc	$ido#b
+	mov	$ido#d,260($dat)
+	jmp	.Lc1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)
+	add	\$1,%al
+	jnc	.Lw1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d
+	add	($inp,$len,1),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx,4),%r11d
+	cmovz	%rcx,$len
+	mov	%r10d,($dat,$idx,4)
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop
+	jmp	.Lexit_key
+
+.align	16
+.Lc1stloop:
+	mov	%al,($dat,%rax)
+	add	\$1,%al
+	jnc	.Lc1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lc2ndloop:
+	mov	($dat,$ido),%r10b
+	add	($inp,$len),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx),%r11b
+	jnz	.Lcnowrap
+	mov	%rcx,$len
+.Lcnowrap:
+	mov	%r10b,($dat,$idx)
+	mov	%r11b,($dat,$ido)
+	add	\$1,$ido#b
+	jnc	.Lc2ndloop
+	movl	\$-1,256($dat)
+
+.align	16
+.Lexit_key:
+	xor	%eax,%eax
+	mov	%eax,-8($dat)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@function,0
+.align	16
+RC4_options:
+	.picmeup %rax
+	lea	.Lopts-.(%rax),%rax
+	mov	OPENSSL_ia32cap_P(%rip),%edx
+	bt	\$20,%edx
+	jnc	.Ldone
+	add	\$12,%rax
+	bt	\$30,%edx
+	jnc	.Ldone
+	add	\$13,%rax
+.Ldone:
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(8x,int)"
+.asciz	"rc4(8x,char)"
+.asciz	"rc4(1x,char)"
+.asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+
 $code =~ s/#([bwd])/$1/gm;
 
 print $code;
diff --git a/crypto/rc4/rc4_skey.c b/crypto/rc4/rc4_skey.c
index b22c40b0bd064..46b77ec32169e 100644
--- a/crypto/rc4/rc4_skey.c
+++ b/crypto/rc4/rc4_skey.c
@@ -119,14 +119,15 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
 	 * implementations suffer from significant performance
 	 * losses then, e.g. PIII exhibits >2x deterioration,
 	 * and so does Opteron. In order to assure optimal
-	 * all-round performance, let us [try to] detect P4 at
-	 * run-time by checking upon HTT bit in CPU capability
+	 * all-round performance, we detect P4 at run-time by
+	 * checking upon reserved bit 20 in CPU capability
 	 * vector and set up compressed key schedule, which is
 	 * recognized by correspondingly updated assembler
-	 * module...
+	 * module... Bit 20 is set up by OPENSSL_ia32_cpuid.
+	 *
 	 * <appro@fy.chalmers.se>
 	 */
-	if (OPENSSL_ia32cap_P & (1<<28)) {
+	if (OPENSSL_ia32cap_P & (1<<20)) {
 		unsigned char *cp=(unsigned char *)d;
 		for (i=0;i<256;i++)
 			cp[i]=i;
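The rc4_skey.c hunk above switches the run-time P4 test to reserved bit 20 of OPENSSL_ia32cap_P and, when that bit is set, lays the key schedule out as one byte per entry so the updated assembler modules recognize it. As a rough illustration of that compressed set-up, here is the standard RC4 key-scheduling over a byte array; the function name rc4_char_set_key is invented for this sketch and is not the code added by the commit.

#include <stddef.h>

/*
 * Illustrative sketch of building a "compressed" (byte-per-entry)
 * RC4 key schedule, the layout selected when bit 20 of
 * OPENSSL_ia32cap_P is set.  Hypothetical name and signature; the
 * real code lives in rc4_skey.c and rc4-x86_64.pl.  keylen must be
 * non-zero.
 */
static void rc4_char_set_key(unsigned char state[256],
                             const unsigned char *key, size_t keylen)
{
    unsigned int i, j = 0;

    for (i = 0; i < 256; i++)           /* identity permutation, one byte per slot */
        state[i] = (unsigned char)i;

    for (i = 0; i < 256; i++) {         /* standard RC4 key-scheduling swap */
        unsigned char tmp = state[i];

        j = (j + tmp + key[i % keylen]) & 0xff;
        state[i] = state[j];
        state[j] = tmp;
    }
}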