diff options
Diffstat (limited to 'crypto/sha/asm/sha1-586.pl')
-rw-r--r-- | crypto/sha/asm/sha1-586.pl | 37 |
1 files changed, 26 insertions, 11 deletions
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl index e0b5d83b6201..9d4ff7f39a52 100644 --- a/crypto/sha/asm/sha1-586.pl +++ b/crypto/sha/asm/sha1-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 1998-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -28,10 +35,9 @@ # P4 +85%(!) +45% # # As you can see Pentium came out as looser:-( Yet I reckoned that -# improvement on P4 outweights the loss and incorporate this +# improvement on P4 outweighs the loss and incorporate this # re-tuned code to 0.9.7 and later. # ---------------------------------------------------------------- -# <appro@fy.chalmers.se> # August 2009. # @@ -97,10 +103,12 @@ # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% # Haswell 6.5 4.3/+51% 4.1(**)/+58% +# Skylake 6.4 4.1/+55% 4.1(**)/+55% # Bulldozer 11.6 6.0/+92% # VIA Nano 10.6 7.5/+41% # Atom 12.5 9.3(*)/+35% # Silvermont 14.5 9.9(*)/+46% +# Goldmont 8.8 6.7/+30% 1.7(***)/+415% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # The discrepancy is because of front-end limitations, so @@ -108,12 +116,17 @@ # limited parallelism. # # (**) As per above comment, the result is for AVX *plus* sh[rl]d. +# +# (***) SHAEXT result $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$ymm=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -123,7 +136,7 @@ $ymm=1 if ($xmm && =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19); # first version supporting AVX -$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && +$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.03); # first version supporting AVX @@ -131,7 +144,7 @@ $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); # first version supporting AVX -$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && +$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0); # first version supporting AVX $shaext=$xmm; ### set to zero if compiling for 1.0.1 @@ -536,7 +549,7 @@ for($i=0;$i<20-4;$i+=2) { # being implemented in SSSE3). Once 8 quadruples or 32 elements are # collected, it switches to routine proposed by Max Locktyukhin. # -# Calculations inevitably require temporary reqisters, and there are +# Calculations inevitably require temporary registers, and there are # no %xmm registers left to spare. For this reason part of the ring # buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring # buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - @@ -647,7 +660,7 @@ my $_ror=sub { &ror(@_) }; &jmp (&label("loop")); ###################################################################### -# SSE instruction sequence is first broken to groups of indepentent +# SSE instruction sequence is first broken to groups of independent # instructions, independent in respect to their inputs and shifter # (not all architectures have more than one). Then IALU instructions # are "knitted in" between the SSE groups. Distance is maintained for @@ -656,14 +669,14 @@ my $_ror=sub { &ror(@_) }; # # Temporary registers usage. X[2] is volatile at the entry and at the # end is restored from backtrace ring buffer. X[3] is expected to -# contain current K_XX_XX constant and is used to caclulate X[-1]+K +# contain current K_XX_XX constant and is used to calculate X[-1]+K # from previous round, it becomes volatile the moment the value is # saved to stack for transfer to IALU. X[4] becomes volatile whenever # X[-4] is accumulated and offloaded to backtrace ring buffer, at the # end it is loaded with next K_XX_XX [which becomes X[3] in next # round]... # -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1186,7 +1199,7 @@ my $_ror=sub { &shrd(@_[0],@_) }; &and (@T[0],@T[1]); &jmp (&label("loop")); -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1474,3 +1487,5 @@ sub Xtail_avx() &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; |