Diffstat (limited to 'crypto/sha')
59 files changed, 12237 insertions, 2312 deletions
diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile deleted file mode 100644 index 8b8f8b285f1c..000000000000 --- a/crypto/sha/Makefile +++ /dev/null @@ -1,177 +0,0 @@ -# -# OpenSSL/crypto/sha/Makefile -# - -DIR= sha -TOP= ../.. -CC= cc -CPP= $(CC) -E -INCLUDES= -CFLAG=-g -MAKEFILE= Makefile -AR= ar r - -SHA1_ASM_OBJ= - -CFLAGS= $(INCLUDES) $(CFLAG) -ASFLAGS= $(INCLUDES) $(ASFLAG) -AFLAGS= $(ASFLAGS) - -GENERAL=Makefile -TEST=shatest.c sha1test.c sha256t.c sha512t.c -APPS= - -LIB=$(TOP)/libcrypto.a -LIBSRC=sha_dgst.c sha1dgst.c sha_one.c sha1_one.c sha256.c sha512.c -LIBOBJ=sha_dgst.o sha1dgst.o sha_one.o sha1_one.o sha256.o sha512.o $(SHA1_ASM_OBJ) - -SRC= $(LIBSRC) - -EXHEADER= sha.h -HEADER= sha_locl.h $(EXHEADER) - -ALL= $(GENERAL) $(SRC) $(HEADER) - -top: - (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all) - -all: lib - -lib: $(LIBOBJ) - $(AR) $(LIB) $(LIBOBJ) - $(RANLIB) $(LIB) || echo Never mind. - @touch lib - -sha1-586.s: asm/sha1-586.pl ../perlasm/x86asm.pl - $(PERL) asm/sha1-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ -sha256-586.s: asm/sha256-586.pl ../perlasm/x86asm.pl - $(PERL) asm/sha256-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ -sha512-586.s: asm/sha512-586.pl ../perlasm/x86asm.pl - $(PERL) asm/sha512-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@ - -sha1-ia64.s: asm/sha1-ia64.pl - (cd asm; $(PERL) sha1-ia64.pl ../$@ $(CFLAGS)) -sha256-ia64.s: asm/sha512-ia64.pl - (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS)) -sha512-ia64.s: asm/sha512-ia64.pl - (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS)) - -sha256-armv4.S: asm/sha256-armv4.pl - $(PERL) $< $(PERLASM_SCHEME) $@ - -sha1-alpha.s: asm/sha1-alpha.pl - (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \ - $(PERL) asm/sha1-alpha.pl > $$preproc && \ - $(CC) -E -P $$preproc > $@ && rm $$preproc) - -# Solaris make has to be explicitly told -sha1-x86_64.s: asm/sha1-x86_64.pl; $(PERL) asm/sha1-x86_64.pl $(PERLASM_SCHEME) > $@ -sha1-mb-x86_64.s: asm/sha1-mb-x86_64.pl; $(PERL) asm/sha1-mb-x86_64.pl $(PERLASM_SCHEME) > $@ -sha256-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@ -sha256-mb-x86_64.s: asm/sha256-mb-x86_64.pl; $(PERL) asm/sha256-mb-x86_64.pl $(PERLASM_SCHEME) > $@ -sha512-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@ -sha1-sparcv9.S: asm/sha1-sparcv9.pl; $(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS) -sha256-sparcv9.S:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS) -sha512-sparcv9.S:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS) - -sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@ -sha256-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@ -sha512-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@ -sha256p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@ -sha512p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@ - -sha1-parisc.s: asm/sha1-parisc.pl; $(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@ -sha256-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@ -sha512-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@ - -sha1-mips.S: asm/sha1-mips.pl; $(PERL) asm/sha1-mips.pl $(PERLASM_SCHEME) $@ -sha256-mips.S: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@ -sha512-mips.S: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@ - -# GNU make "catch all" -sha1-%.S: asm/sha1-%.pl; $(PERL) 
$< $(PERLASM_SCHEME) $@ -sha256-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ -sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ - -sha1-armv4-large.o: sha1-armv4-large.S -sha256-armv4.o: sha256-armv4.S -sha512-armv4.o: sha512-armv4.S -sha1-armv8.o: sha1-armv8.S -sha256-armv8.o: sha256-armv8.S -sha512-armv8.o: sha512-armv8.S - -files: - $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO - -links: - @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER) - @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST) - @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) - -install: - @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile... - @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ - do \ - (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ - chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ - done; - -tags: - ctags $(SRC) - -tests: - -lint: - lint -DLINT $(INCLUDES) $(SRC)>fluff - -update: depend - -depend: - @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile... - $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC) - -dclean: - $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new - mv -f Makefile.new $(MAKEFILE) - -clean: - rm -f *.s *.S *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff - -# DO NOT DELETE THIS LINE -- make depend depends on it. - -sha1_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h -sha1_one.o: ../../include/openssl/opensslconf.h -sha1_one.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -sha1_one.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h -sha1_one.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h -sha1_one.o: sha1_one.c -sha1dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h -sha1dgst.o: ../../include/openssl/opensslconf.h -sha1dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -sha1dgst.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h -sha1dgst.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h -sha1dgst.o: ../md32_common.h sha1dgst.c sha_locl.h -sha256.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h -sha256.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h -sha256.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h -sha256.o: ../../include/openssl/sha.h ../../include/openssl/stack.h -sha256.o: ../../include/openssl/symhacks.h ../md32_common.h sha256.c -sha512.o: ../../e_os.h ../../include/openssl/bio.h -sha512.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h -sha512.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h -sha512.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h -sha512.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -sha512.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h -sha512.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h -sha512.o: ../cryptlib.h sha512.c -sha_dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h -sha_dgst.o: ../../include/openssl/opensslconf.h -sha_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h -sha_dgst.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h -sha_dgst.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h -sha_dgst.o: ../md32_common.h sha_dgst.c sha_locl.h -sha_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h 
-sha_one.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
-sha_one.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
-sha_one.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
-sha_one.o: ../../include/openssl/symhacks.h sha_one.c
diff --git a/crypto/sha/asm/README b/crypto/sha/asm/README
deleted file mode 100644
index b7e755765fcc..000000000000
--- a/crypto/sha/asm/README
+++ /dev/null
@@ -1 +0,0 @@
-C2.pl works
diff --git a/crypto/sha/asm/keccak1600-armv4.pl b/crypto/sha/asm/keccak1600-armv4.pl
new file mode 100755
index 000000000000..8bf665c8b38d
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-armv4.pl
@@ -0,0 +1,1606 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for ARMv4.
+#
+# June 2017.
+#
+# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
+# interleaving. How does it compare to Keccak Code Package? It's as
+# fast, but several times smaller, and is endian- and ISA-neutral. ISA
+# neutrality means that the minimum ISA requirement is ARMv4, yet it
+# can be assembled even as Thumb-2. The NEON code path is KECCAK_1X_ALT
+# with register layout taken from Keccak Code Package. It's also as
+# fast, in fact faster by 10-15% on some processors, and endian-neutral.
+#
+# August 2017.
+#
+# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
+# of rotate instructions with logical ones. This resulted in ~10%
+# improvement on most processors. The switch to KECCAK_2X effectively
+# minimizes re-loads from temporary storage, and merged rotates just
+# eliminate the corresponding instructions. As for the latter: when
+# examining the code you'll notice commented-out ror instructions.
+# These are the eliminated ones, and you should trace the destination
+# register below to see what's going on. In case you wonder why not
+# all rotates are eliminated: the trouble is that some operations
+# require both inputs to be rotated, e.g. 'eor a,b>>>x,c>>>y'. This
+# conundrum is resolved by using 'eor a,b,c>>>(x-y)' and then
+# merge-rotating 'a' in the next operation that takes 'a' as input;
+# that next operation can even be in the next round. It's entirely
+# possible to "carry" rotate "factors" to the next round, but it makes
+# the code more complex, and "almost 1/2" is a kind of complexity cap
+# [for the time being]...
+#
+# Reduce per-round instruction count in Thumb-2 case by 16%. This is
+# achieved by folding ldr/str pairs to their double-word counterparts.
+# Theoretically this should have improved performance on single-issue
+# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
+# usual...
+#
+########################################################################
+# Numbers are cycles per processed byte. Non-NEON results account even
+# for input bit interleaving.
+#
+#		r=1088(*)   Thumb-2(**) NEON
+#
+# ARM11xx	82/+150%
+# Cortex-A5	88/+160%,   86,         36
+# Cortex-A7	78/+160%,   68,         34
+# Cortex-A8	51/+230%,   57,         30
+# Cortex-A9	53/+210%,   51,         26
+# Cortex-A15	42/+160%,   38,         18
+# Snapdragon S4	43/+210%,   38,         24
+#
+# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
+#	over compiler-generated KECCAK_2X reference code.
+# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
+#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
+#	processors are presented mostly for reference purposes.
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+my @C = map("r$_",(0..9));
+my @E = map("r$_",(10..12,14));
+
+########################################################################
+# Stack layout
+# ----->+-----------------------+
+#       | uint64_t A[5][5]      |
+#       | ...                   |
+# +200->+-----------------------+
+#       | uint64_t D[5]         |
+#       | ...                   |
+# +240->+-----------------------+
+#       | uint64_t T[5][5]      |
+#       | ...                   |
+# +440->+-----------------------+
+#       | saved lr              |
+# +444->+-----------------------+
+#       | loop counter          |
+# +448->+-----------------------+
+#       | ...
+
+my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
+my @D = map(8*$_, (25..29));
+my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.type	iotas32, %object
+.align	5
+iotas32:
+	.long	0x00000001, 0x00000000
+	.long	0x00000000, 0x00000089
+	.long	0x00000000, 0x8000008b
+	.long	0x00000000, 0x80008080
+	.long	0x00000001, 0x0000008b
+	.long	0x00000001, 0x00008000
+	.long	0x00000001, 0x80008088
+	.long	0x00000001, 0x80000082
+	.long	0x00000000, 0x0000000b
+	.long	0x00000000, 0x0000000a
+	.long	0x00000001, 0x00008082
+	.long	0x00000000, 0x00008003
+	.long	0x00000001, 0x0000808b
+	.long	0x00000001, 0x8000000b
+	.long	0x00000001, 0x8000008a
+	.long	0x00000001, 0x80000081
+	.long	0x00000000, 0x80000081
+	.long	0x00000000, 0x80000008
+	.long	0x00000000, 0x00000083
+	.long	0x00000000, 0x80008003
+	.long	0x00000001, 0x80008088
+	.long	0x00000000, 0x80000088
+	.long	0x00000001, 0x00008000
+	.long	0x00000000, 0x80008082
+.size	iotas32,.-iotas32
+
+.type	KeccakF1600_int, %function
+.align	5
+KeccakF1600_int:
+	add	@C[9],sp,#$A[4][2]
+	add	@E[2],sp,#$A[0][0]
+	add	@E[0],sp,#$A[1][0]
+	ldmia	@C[9],{@C[4]-@C[9]}	@ A[4][2..4]
+KeccakF1600_enter:
+	str	lr,[sp,#440]
+	eor	@E[1],@E[1],@E[1]
+	str	@E[1],[sp,#444]
+	b	.Lround2x
+
+.align	4
+.Lround2x:
+___
+sub Round {
+my (@A,@R); (@A[0..4],@R) = @_;
+
+$code.=<<___;
+	ldmia	@E[2],{@C[0]-@C[3]}	@ A[0][0..1]
+	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
+#ifdef	__thumb2__
+	eor	@C[0],@C[0],@E[0]
+	eor	@C[1],@C[1],@E[1]
+	eor	@C[2],@C[2],@E[2]
+	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
+	eor	@C[3],@C[3],@E[3]
+	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
+	eor	@C[4],@C[4],@E[0]
+	eor	@C[5],@C[5],@E[1]
+	eor	@C[6],@C[6],@E[2]
+	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
+	eor	@C[7],@C[7],@E[3]
+	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
+	eor	@C[8],@C[8],@E[0]
+	eor	@C[9],@C[9],@E[1]
+	eor
@C[0],@C[0],@E[2] + ldrd @E[0],@E[1],[sp,#$A[2][1]] + eor @C[1],@C[1],@E[3] + ldrd @E[2],@E[3],[sp,#$A[2][2]] + eor @C[2],@C[2],@E[0] + eor @C[3],@C[3],@E[1] + eor @C[4],@C[4],@E[2] + ldrd @E[0],@E[1],[sp,#$A[2][3]] + eor @C[5],@C[5],@E[3] + ldrd @E[2],@E[3],[sp,#$A[2][4]] + eor @C[6],@C[6],@E[0] + eor @C[7],@C[7],@E[1] + eor @C[8],@C[8],@E[2] + ldrd @E[0],@E[1],[sp,#$A[3][0]] + eor @C[9],@C[9],@E[3] + ldrd @E[2],@E[3],[sp,#$A[3][1]] + eor @C[0],@C[0],@E[0] + eor @C[1],@C[1],@E[1] + eor @C[2],@C[2],@E[2] + ldrd @E[0],@E[1],[sp,#$A[3][2]] + eor @C[3],@C[3],@E[3] + ldrd @E[2],@E[3],[sp,#$A[3][3]] + eor @C[4],@C[4],@E[0] + eor @C[5],@C[5],@E[1] + eor @C[6],@C[6],@E[2] + ldrd @E[0],@E[1],[sp,#$A[3][4]] + eor @C[7],@C[7],@E[3] + ldrd @E[2],@E[3],[sp,#$A[4][0]] + eor @C[8],@C[8],@E[0] + eor @C[9],@C[9],@E[1] + eor @C[0],@C[0],@E[2] + ldrd @E[0],@E[1],[sp,#$A[4][1]] + eor @C[1],@C[1],@E[3] + ldrd @E[2],@E[3],[sp,#$A[0][2]] + eor @C[2],@C[2],@E[0] + eor @C[3],@C[3],@E[1] + eor @C[4],@C[4],@E[2] + ldrd @E[0],@E[1],[sp,#$A[0][3]] + eor @C[5],@C[5],@E[3] + ldrd @E[2],@E[3],[sp,#$A[0][4]] +#else + eor @C[0],@C[0],@E[0] + add @E[0],sp,#$A[1][2] + eor @C[1],@C[1],@E[1] + eor @C[2],@C[2],@E[2] + eor @C[3],@C[3],@E[3] + ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3] + eor @C[4],@C[4],@E[0] + add @E[0],sp,#$A[1][4] + eor @C[5],@C[5],@E[1] + eor @C[6],@C[6],@E[2] + eor @C[7],@C[7],@E[3] + ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0] + eor @C[8],@C[8],@E[0] + add @E[0],sp,#$A[2][1] + eor @C[9],@C[9],@E[1] + eor @C[0],@C[0],@E[2] + eor @C[1],@C[1],@E[3] + ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2] + eor @C[2],@C[2],@E[0] + add @E[0],sp,#$A[2][3] + eor @C[3],@C[3],@E[1] + eor @C[4],@C[4],@E[2] + eor @C[5],@C[5],@E[3] + ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4] + eor @C[6],@C[6],@E[0] + add @E[0],sp,#$A[3][0] + eor @C[7],@C[7],@E[1] + eor @C[8],@C[8],@E[2] + eor @C[9],@C[9],@E[3] + ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1] + eor @C[0],@C[0],@E[0] + add @E[0],sp,#$A[3][2] + eor @C[1],@C[1],@E[1] + eor @C[2],@C[2],@E[2] + eor @C[3],@C[3],@E[3] + ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3] + eor @C[4],@C[4],@E[0] + add @E[0],sp,#$A[3][4] + eor @C[5],@C[5],@E[1] + eor @C[6],@C[6],@E[2] + eor @C[7],@C[7],@E[3] + ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0] + eor @C[8],@C[8],@E[0] + ldr @E[0],[sp,#$A[4][1]] @ A[4][1] + eor @C[9],@C[9],@E[1] + ldr @E[1],[sp,#$A[4][1]+4] + eor @C[0],@C[0],@E[2] + ldr @E[2],[sp,#$A[0][2]] @ A[0][2] + eor @C[1],@C[1],@E[3] + ldr @E[3],[sp,#$A[0][2]+4] + eor @C[2],@C[2],@E[0] + add @E[0],sp,#$A[0][3] + eor @C[3],@C[3],@E[1] + eor @C[4],@C[4],@E[2] + eor @C[5],@C[5],@E[3] + ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4] +#endif + eor @C[6],@C[6],@E[0] + eor @C[7],@C[7],@E[1] + eor @C[8],@C[8],@E[2] + eor @C[9],@C[9],@E[3] + + eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0]; + str.l @E[0],[sp,#$D[1]] @ D[1] = E[0] + eor @E[1],@C[1],@C[4] + str.h @E[1],[sp,#$D[1]+4] + eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3]; + eor @E[3],@C[7],@C[0] + str.l @E[2],[sp,#$D[4]] @ D[4] = E[1] + eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4]; + str.h @E[3],[sp,#$D[4]+4] + eor @C[1],@C[9],@C[2] + str.l @C[0],[sp,#$D[0]] @ D[0] = C[0] + eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1]; + ldr.l @C[7],[sp,#$A[3][3]] + eor @C[3],@C[3],@C[6] + str.h @C[1],[sp,#$D[0]+4] + ldr.h @C[6],[sp,#$A[3][3]+4] + str.l @C[2],[sp,#$D[2]] @ D[2] = C[1] + eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2]; + str.h @C[3],[sp,#$D[2]+4] + eor 
@C[5],@C[5],@C[8] + + ldr.l @C[8],[sp,#$A[4][4]] + ldr.h @C[9],[sp,#$A[4][4]+4] + str.l @C[4],[sp,#$D[3]] @ D[3] = C[2] + eor @C[7],@C[7],@C[4] + str.h @C[5],[sp,#$D[3]+4] + eor @C[6],@C[6],@C[5] + ldr.l @C[4],[sp,#$A[0][0]] + @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ + @ ror @C[6],@C[6],#32-11 + ldr.h @C[5],[sp,#$A[0][0]+4] + eor @C[8],@C[8],@E[2] + eor @C[9],@C[9],@E[3] + ldr.l @E[2],[sp,#$A[2][2]] + eor @C[0],@C[0],@C[4] + ldr.h @E[3],[sp,#$A[2][2]+4] + @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ + @ ror @C[9],@C[9],#32-7 + eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */ + eor @E[2],@E[2],@C[2] + ldr.l @C[2],[sp,#$A[1][1]] + eor @E[3],@E[3],@C[3] + ldr.h @C[3],[sp,#$A[1][1]+4] + ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ + ldr @E[2],[sp,#444] @ load counter + eor @C[2],@C[2],@E[0] + adr @E[0],iotas32 + ror @C[4],@E[3],#32-22 + add @E[3],@E[0],@E[2] + eor @C[3],@C[3],@E[1] +___ +$code.=<<___ if ($A[0][0] != $T[0][0]); + ldmia @E[3],{@E[0],@E[1]} @ iotas[i] +___ +$code.=<<___ if ($A[0][0] == $T[0][0]); + ldr.l @E[0],[@E[3],#8] @ iotas[i].lo + add @E[2],@E[2],#16 + ldr.h @E[1],[@E[3],#12] @ iotas[i].hi + cmp @E[2],#192 + str @E[2],[sp,#444] @ store counter +___ +$code.=<<___; + bic @E[2],@C[4],@C[2],ror#32-22 + bic @E[3],@C[5],@C[3],ror#32-22 + ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */ + ror @C[3],@C[3],#32-22 + eor @E[2],@E[2],@C[0] + eor @E[3],@E[3],@C[1] + eor @E[0],@E[0],@E[2] + eor @E[1],@E[1],@E[3] + str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; + bic @E[2],@C[6],@C[4],ror#11 + str.h @E[1],[sp,#$R[0][0]+4] + bic @E[3],@C[7],@C[5],ror#10 + bic @E[0],@C[8],@C[6],ror#32-(11-7) + bic @E[1],@C[9],@C[7],ror#32-(10-7) + eor @E[2],@C[2],@E[2],ror#32-11 + str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]); + eor @E[3],@C[3],@E[3],ror#32-10 + str.h @E[3],[sp,#$R[0][1]+4] + eor @E[0],@C[4],@E[0],ror#32-7 + eor @E[1],@C[5],@E[1],ror#32-7 + str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]); + bic @E[2],@C[0],@C[8],ror#32-7 + str.h @E[1],[sp,#$R[0][2]+4] + bic @E[3],@C[1],@C[9],ror#32-7 + eor @E[2],@E[2],@C[6],ror#32-11 + str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]); + eor @E[3],@E[3],@C[7],ror#32-10 + str.h @E[3],[sp,#$R[0][3]+4] + bic @E[0],@C[2],@C[0] + add @E[3],sp,#$D[3] + ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3] + bic @E[1],@C[3],@C[1] + ldr.h @C[1],[sp,#$A[0][3]+4] + eor @E[0],@E[0],@C[8],ror#32-7 + eor @E[1],@E[1],@C[9],ror#32-7 + str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]); + add @C[9],sp,#$D[0] + str.h @E[1],[sp,#$R[0][4]+4] + + ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4] + ldmia @C[9],{@C[6]-@C[9]} @ D[0..1] + + ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4] + eor @C[0],@C[0],@E[0] + ldr.h @C[3],[sp,#$A[1][4]+4] + eor @C[1],@C[1],@E[1] + @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); + ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1] + @ ror @C[1],@C[1],#32-14 + ldr.h @E[1],[sp,#$A[3][1]+4] + + eor @C[2],@C[2],@E[2] + ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0] + eor @C[3],@C[3],@E[3] + ldr.h @C[5],[sp,#$A[2][0]+4] + @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + @ ror @C[3],@C[3],#32-10 + + eor @C[6],@C[6],@C[4] + ldr.l @E[2],[sp,#$D[2]] @ D[2] + eor @C[7],@C[7],@C[5] + ldr.h @E[3],[sp,#$D[2]+4] + ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); + ror @C[4],@C[7],#32-2 + + eor 
@E[0],@E[0],@C[8] + ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2] + eor @E[1],@E[1],@C[9] + ldr.h @C[9],[sp,#$A[4][2]+4] + ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); + ror @C[6],@E[1],#32-23 + + bic @E[0],@C[4],@C[2],ror#32-10 + bic @E[1],@C[5],@C[3],ror#32-10 + eor @E[2],@E[2],@C[8] + eor @E[3],@E[3],@C[9] + ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); + ror @C[8],@E[3],#32-31 + eor @E[0],@E[0],@C[0],ror#32-14 + eor @E[1],@E[1],@C[1],ror#32-14 + str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2]) + bic @E[2],@C[6],@C[4] + str.h @E[1],[sp,#$R[1][0]+4] + bic @E[3],@C[7],@C[5] + eor @E[2],@E[2],@C[2],ror#32-10 + str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]); + eor @E[3],@E[3],@C[3],ror#32-10 + str.h @E[3],[sp,#$R[1][1]+4] + bic @E[0],@C[8],@C[6] + bic @E[1],@C[9],@C[7] + bic @E[2],@C[0],@C[8],ror#14 + bic @E[3],@C[1],@C[9],ror#14 + eor @E[0],@E[0],@C[4] + eor @E[1],@E[1],@C[5] + str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]); + bic @C[2],@C[2],@C[0],ror#32-(14-10) + str.h @E[1],[sp,#$R[1][2]+4] + eor @E[2],@C[6],@E[2],ror#32-14 + bic @E[1],@C[3],@C[1],ror#32-(14-10) + str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]); + eor @E[3],@C[7],@E[3],ror#32-14 + str.h @E[3],[sp,#$R[1][3]+4] + add @E[2],sp,#$D[1] + ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1] + eor @E[0],@C[8],@C[2],ror#32-10 + ldr.h @C[0],[sp,#$A[0][1]+4] + eor @E[1],@C[9],@E[1],ror#32-10 + str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]); + str.h @E[1],[sp,#$R[1][4]+4] + + add @C[9],sp,#$D[3] + ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2] + ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2] + ldr.h @C[3],[sp,#$A[1][2]+4] + ldmia @C[9],{@C[6]-@C[9]} @ D[3..4] + + eor @C[1],@C[1],@E[0] + ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3] + eor @C[0],@C[0],@E[1] + ldr.h @C[5],[sp,#$A[2][3]+4] + ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); + + eor @C[2],@C[2],@E[2] + ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4] + eor @C[3],@C[3],@E[3] + ldr.h @E[1],[sp,#$A[3][4]+4] + @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); + ldr.l @E[2],[sp,#$D[0]] @ D[0] + @ ror @C[3],@C[3],#32-3 + ldr.h @E[3],[sp,#$D[0]+4] + + eor @C[4],@C[4],@C[6] + eor @C[5],@C[5],@C[7] + @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + @ ror @C[4],@C[7],#32-13 @ [track reverse order below] + + eor @E[0],@E[0],@C[8] + ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0] + eor @E[1],@E[1],@C[9] + ldr.h @C[9],[sp,#$A[4][0]+4] + ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); + ror @C[7],@E[1],#32-4 + + eor @E[2],@E[2],@C[8] + eor @E[3],@E[3],@C[9] + ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); + ror @C[9],@E[3],#32-9 + + bic @E[0],@C[5],@C[2],ror#13-3 + bic @E[1],@C[4],@C[3],ror#12-3 + bic @E[2],@C[6],@C[5],ror#32-13 + bic @E[3],@C[7],@C[4],ror#32-12 + eor @E[0],@C[0],@E[0],ror#32-13 + eor @E[1],@C[1],@E[1],ror#32-12 + str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2]) + eor @E[2],@E[2],@C[2],ror#32-3 + str.h @E[1],[sp,#$R[2][0]+4] + eor @E[3],@E[3],@C[3],ror#32-3 + str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]); + bic @E[0],@C[8],@C[6] + bic @E[1],@C[9],@C[7] + str.h @E[3],[sp,#$R[2][1]+4] + eor @E[0],@E[0],@C[5],ror#32-13 + eor @E[1],@E[1],@C[4],ror#32-12 + str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]); + bic @E[2],@C[0],@C[8] + str.h @E[1],[sp,#$R[2][2]+4] + bic @E[3],@C[1],@C[9] + eor @E[2],@E[2],@C[6] + eor @E[3],@E[3],@C[7] + str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ 
(~C[4] & C[0]); + bic @E[0],@C[2],@C[0],ror#3 + str.h @E[3],[sp,#$R[2][3]+4] + bic @E[1],@C[3],@C[1],ror#3 + ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order] + eor @E[0],@C[8],@E[0],ror#32-3 + ldr.h @C[0],[sp,#$A[0][4]+4] + eor @E[1],@C[9],@E[1],ror#32-3 + str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]); + add @C[9],sp,#$D[1] + str.h @E[1],[sp,#$R[2][4]+4] + + ldr.l @E[0],[sp,#$D[4]] @ D[4] + ldr.h @E[1],[sp,#$D[4]+4] + ldr.l @E[2],[sp,#$D[0]] @ D[0] + ldr.h @E[3],[sp,#$D[0]+4] + + ldmia @C[9],{@C[6]-@C[9]} @ D[1..2] + + eor @C[1],@C[1],@E[0] + ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0] + eor @C[0],@C[0],@E[1] + ldr.h @C[3],[sp,#$A[1][0]+4] + @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); + ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1] + @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order] + ldr.h @C[5],[sp,#$A[2][1]+4] + + eor @C[2],@C[2],@E[2] + ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2] + eor @C[3],@C[3],@E[3] + ldr.h @E[1],[sp,#$A[3][2]+4] + @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); + ldr.l @E[2],[sp,#$D[3]] @ D[3] + @ ror @C[3],@C[3],#32-18 + ldr.h @E[3],[sp,#$D[3]+4] + + eor @C[6],@C[6],@C[4] + eor @C[7],@C[7],@C[5] + ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); + ror @C[5],@C[7],#32-5 + + eor @E[0],@E[0],@C[8] + ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3] + eor @E[1],@E[1],@C[9] + ldr.h @C[9],[sp,#$A[4][3]+4] + ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); + ror @C[6],@E[1],#32-8 + + eor @E[2],@E[2],@C[8] + eor @E[3],@E[3],@C[9] + ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); + ror @C[9],@E[3],#32-28 + + bic @E[0],@C[4],@C[2],ror#32-18 + bic @E[1],@C[5],@C[3],ror#32-18 + eor @E[0],@E[0],@C[0],ror#32-14 + eor @E[1],@E[1],@C[1],ror#32-13 + str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2]) + bic @E[2],@C[6],@C[4] + str.h @E[1],[sp,#$R[3][0]+4] + bic @E[3],@C[7],@C[5] + eor @E[2],@E[2],@C[2],ror#32-18 + str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]); + eor @E[3],@E[3],@C[3],ror#32-18 + str.h @E[3],[sp,#$R[3][1]+4] + bic @E[0],@C[8],@C[6] + bic @E[1],@C[9],@C[7] + bic @E[2],@C[0],@C[8],ror#14 + bic @E[3],@C[1],@C[9],ror#13 + eor @E[0],@E[0],@C[4] + eor @E[1],@E[1],@C[5] + str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]); + bic @C[2],@C[2],@C[0],ror#18-14 + str.h @E[1],[sp,#$R[3][2]+4] + eor @E[2],@C[6],@E[2],ror#32-14 + bic @E[1],@C[3],@C[1],ror#18-13 + eor @E[3],@C[7],@E[3],ror#32-13 + str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]); + str.h @E[3],[sp,#$R[3][3]+4] + add @E[3],sp,#$D[2] + ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2] + eor @E[0],@C[8],@C[2],ror#32-18 + ldr.h @C[1],[sp,#$A[0][2]+4] + eor @E[1],@C[9],@E[1],ror#32-18 + str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]); + str.h @E[1],[sp,#$R[3][4]+4] + + ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3] + ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3] + ldr.h @C[3],[sp,#$A[1][3]+4] + ldr.l @C[6],[sp,#$D[4]] @ D[4] + ldr.h @C[7],[sp,#$D[4]+4] + + eor @C[0],@C[0],@E[0] + ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4] + eor @C[1],@C[1],@E[1] + ldr.h @C[5],[sp,#$A[2][4]+4] + @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); + ldr.l @C[8],[sp,#$D[0]] @ D[0] + @ ror @C[1],@C[1],#32-31 + ldr.h @C[9],[sp,#$D[0]+4] + + eor @E[2],@E[2],@C[2] + ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0] + eor @E[3],@E[3],@C[3] + ldr.h @E[1],[sp,#$A[3][0]+4] + ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); + ldr.l @E[2],[sp,#$D[1]] @ D[1] + ror @C[2],@E[3],#32-28 + 
ldr.h @E[3],[sp,#$D[1]+4] + + eor @C[6],@C[6],@C[4] + eor @C[7],@C[7],@C[5] + ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); + ror @C[4],@C[7],#32-20 + + eor @E[0],@E[0],@C[8] + ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1] + eor @E[1],@E[1],@C[9] + ldr.h @C[9],[sp,#$A[4][1]+4] + ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); + ror @C[6],@E[1],#32-21 + + eor @C[8],@C[8],@E[2] + eor @C[9],@C[9],@E[3] + @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); + @ ror @C[9],@C[3],#32-1 + + bic @E[0],@C[4],@C[2] + bic @E[1],@C[5],@C[3] + eor @E[0],@E[0],@C[0],ror#32-31 + str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2]) + eor @E[1],@E[1],@C[1],ror#32-31 + str.h @E[1],[sp,#$R[4][0]+4] + bic @E[2],@C[6],@C[4] + bic @E[3],@C[7],@C[5] + eor @E[2],@E[2],@C[2] + eor @E[3],@E[3],@C[3] + str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]); + bic @E[0],@C[8],@C[6],ror#1 + str.h @E[3],[sp,#$R[4][1]+4] + bic @E[1],@C[9],@C[7],ror#1 + bic @E[2],@C[0],@C[8],ror#31-1 + bic @E[3],@C[1],@C[9],ror#31-1 + eor @C[4],@C[4],@E[0],ror#32-1 + str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]); + eor @C[5],@C[5],@E[1],ror#32-1 + str.h @C[5],[sp,#$R[4][2]+4] + eor @C[6],@C[6],@E[2],ror#32-31 + eor @C[7],@C[7],@E[3],ror#32-31 + str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]); + bic @E[0],@C[2],@C[0],ror#32-31 + str.h @C[7],[sp,#$R[4][3]+4] + bic @E[1],@C[3],@C[1],ror#32-31 + add @E[2],sp,#$R[0][0] + eor @C[8],@E[0],@C[8],ror#32-1 + add @E[0],sp,#$R[1][0] + eor @C[9],@E[1],@C[9],ror#32-1 + str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]); + str.h @C[9],[sp,#$R[4][4]+4] +___ +} + Round(@A,@T); + Round(@T,@A); +$code.=<<___; + blo .Lround2x + + ldr pc,[sp,#440] +.size KeccakF1600_int,.-KeccakF1600_int + +.type KeccakF1600, %function +.align 5 +KeccakF1600: + stmdb sp!,{r0,r4-r11,lr} + sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],... + + add @E[0],r0,#$A[1][0] + add @E[1],sp,#$A[1][0] + ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack + stmia sp, {@C[0]-@C[9]} + ldmia @E[0]!,{@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0]!,{@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0]!,{@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0], {@C[0]-@C[9]} + add @E[2],sp,#$A[0][0] + add @E[0],sp,#$A[1][0] + stmia @E[1], {@C[0]-@C[9]} + + bl KeccakF1600_enter + + ldr @E[1], [sp,#440+16] @ restore pointer to A + ldmia sp, {@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5] + ldmia @E[0]!,{@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0]!,{@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0]!,{@C[0]-@C[9]} + stmia @E[1]!,{@C[0]-@C[9]} + ldmia @E[0], {@C[0]-@C[9]} + stmia @E[1], {@C[0]-@C[9]} + + add sp,sp,#440+20 + ldmia sp!,{r4-r11,pc} +.size KeccakF1600,.-KeccakF1600 +___ +{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14)); + +######################################################################## +# Stack layout +# ----->+-----------------------+ +# | uint64_t A[5][5] | +# | ... | +# | ... | +# +456->+-----------------------+ +# | 0x55555555 | +# +460->+-----------------------+ +# | 0x33333333 | +# +464->+-----------------------+ +# | 0x0f0f0f0f | +# +468->+-----------------------+ +# | 0x00ff00ff | +# +472->+-----------------------+ +# | uint64_t *A | +# +476->+-----------------------+ +# | const void *inp | +# +480->+-----------------------+ +# | size_t len | +# +484->+-----------------------+ +# | size_t bs | +# +488->+-----------------------+ +# | .... 
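The absorb path that follows stores every 64-bit lane of the state in bit-interleaved form: one 32-bit word collects the even-numbered bits of the lane, the other the odd-numbered bits, which lets a 64-bit rotation be performed as two 32-bit rotations. The .Loop_block code implements the transform branch-free with the mask cascade composed above (0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff). As a reading aid, here is a minimal C sketch of the same permutation written as a plain loop; the function name and the even/odd naming are illustrative, and which half ends up in which word of A_flat is a detail of the assembly, not of this sketch:

#include <stdint.h>

/* Split a 64-bit lane into two 32-bit words: one holding its
 * even-numbered bits, the other its odd-numbered bits. */
static void bit_interleave(uint64_t w, uint32_t *even, uint32_t *odd)
{
    uint32_t e = 0, o = 0;
    for (int i = 0; i < 32; i++) {
        e |= (uint32_t)((w >> (2 * i)) & 1) << i;      /* bit 2i   -> even word */
        o |= (uint32_t)((w >> (2 * i + 1)) & 1) << i;  /* bit 2i+1 -> odd word  */
    }
    *even = e;
    *odd = o;
}

With lanes in this form, ROL64(w,2k) is just a 32-bit rotation of each word by k, while ROL64(w,2k+1) rotates the odd word by k+1 and the even word by k and swaps the two; SHA3_squeeze applies the inverse permutation before emitting output bytes.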
+ +$code.=<<___; +.global SHA3_absorb +.type SHA3_absorb,%function +.align 5 +SHA3_absorb: + stmdb sp!,{r0-r12,lr} + sub sp,sp,#456+16 + + add $A_flat,r0,#$A[1][0] + @ mov $inp,r1 + mov $len,r2 + mov $bsz,r3 + cmp r2,r3 + blo .Labsorb_abort + + add $inp,sp,#0 + ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack + stmia $inp!, {@C[0]-@C[9]} + ldmia $A_flat!,{@C[0]-@C[9]} + stmia $inp!, {@C[0]-@C[9]} + ldmia $A_flat!,{@C[0]-@C[9]} + stmia $inp!, {@C[0]-@C[9]} + ldmia $A_flat!,{@C[0]-@C[9]} + stmia $inp!, {@C[0]-@C[9]} + ldmia $A_flat!,{@C[0]-@C[9]} + stmia $inp, {@C[0]-@C[9]} + + ldr $inp,[sp,#476] @ restore $inp +#ifdef __thumb2__ + mov r9,#0x00ff00ff + mov r8,#0x0f0f0f0f + mov r7,#0x33333333 + mov r6,#0x55555555 +#else + mov r6,#0x11 @ compose constants + mov r8,#0x0f + mov r9,#0xff + orr r6,r6,r6,lsl#8 + orr r8,r8,r8,lsl#8 + orr r6,r6,r6,lsl#16 @ 0x11111111 + orr r9,r9,r9,lsl#16 @ 0x00ff00ff + orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f + orr r7,r6,r6,lsl#1 @ 0x33333333 + orr r6,r6,r6,lsl#2 @ 0x55555555 +#endif + str r9,[sp,#468] + str r8,[sp,#464] + str r7,[sp,#460] + str r6,[sp,#456] + b .Loop_absorb + +.align 4 +.Loop_absorb: + subs r0,$len,$bsz + blo .Labsorbed + add $A_flat,sp,#0 + str r0,[sp,#480] @ save len - bsz + +.align 4 +.Loop_block: + ldrb r0,[$inp],#1 + ldrb r1,[$inp],#1 + ldrb r2,[$inp],#1 + ldrb r3,[$inp],#1 + ldrb r4,[$inp],#1 + orr r0,r0,r1,lsl#8 + ldrb r1,[$inp],#1 + orr r0,r0,r2,lsl#16 + ldrb r2,[$inp],#1 + orr r0,r0,r3,lsl#24 @ lo + ldrb r3,[$inp],#1 + orr r1,r4,r1,lsl#8 + orr r1,r1,r2,lsl#16 + orr r1,r1,r3,lsl#24 @ hi + + and r2,r0,r6 @ &=0x55555555 + and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa + and r3,r1,r6 @ &=0x55555555 + and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa + orr r2,r2,r2,lsr#1 + orr r0,r0,r0,lsl#1 + orr r3,r3,r3,lsr#1 + orr r1,r1,r1,lsl#1 + and r2,r2,r7 @ &=0x33333333 + and r0,r0,r7,lsl#2 @ &=0xcccccccc + and r3,r3,r7 @ &=0x33333333 + and r1,r1,r7,lsl#2 @ &=0xcccccccc + orr r2,r2,r2,lsr#2 + orr r0,r0,r0,lsl#2 + orr r3,r3,r3,lsr#2 + orr r1,r1,r1,lsl#2 + and r2,r2,r8 @ &=0x0f0f0f0f + and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0 + and r3,r3,r8 @ &=0x0f0f0f0f + and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 + ldmia $A_flat,{r4-r5} @ A_flat[i] + orr r2,r2,r2,lsr#4 + orr r0,r0,r0,lsl#4 + orr r3,r3,r3,lsr#4 + orr r1,r1,r1,lsl#4 + and r2,r2,r9 @ &=0x00ff00ff + and r0,r0,r9,lsl#8 @ &=0xff00ff00 + and r3,r3,r9 @ &=0x00ff00ff + and r1,r1,r9,lsl#8 @ &=0xff00ff00 + orr r2,r2,r2,lsr#8 + orr r0,r0,r0,lsl#8 + orr r3,r3,r3,lsr#8 + orr r1,r1,r1,lsl#8 + + lsl r2,r2,#16 + lsr r1,r1,#16 + eor r4,r4,r3,lsl#16 + eor r5,r5,r0,lsr#16 + eor r4,r4,r2,lsr#16 + eor r5,r5,r1,lsl#16 + stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7]) + + subs $bsz,$bsz,#8 + bhi .Loop_block + + str $inp,[sp,#476] + + bl KeccakF1600_int + + add r14,sp,#456 + ldmia r14,{r6-r12,r14} @ restore constants and variables + b .Loop_absorb + +.align 4 +.Labsorbed: + add $inp,sp,#$A[1][0] + ldmia sp, {@C[0]-@C[9]} + stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5] + ldmia $inp!, {@C[0]-@C[9]} + stmia $A_flat!,{@C[0]-@C[9]} + ldmia $inp!, {@C[0]-@C[9]} + stmia $A_flat!,{@C[0]-@C[9]} + ldmia $inp!, {@C[0]-@C[9]} + stmia $A_flat!,{@C[0]-@C[9]} + ldmia $inp, {@C[0]-@C[9]} + stmia $A_flat, {@C[0]-@C[9]} + +.Labsorb_abort: + add sp,sp,#456+32 + mov r0,$len @ return value + ldmia sp!,{r4-r12,pc} +.size SHA3_absorb,.-SHA3_absorb +___ +} +{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12)); + +$code.=<<___; +.global SHA3_squeeze +.type SHA3_squeeze,%function +.align 5 +SHA3_squeeze: + stmdb sp!,{r0,r3-r10,lr} + + mov $A_flat,r0 + mov $out,r1 + mov $len,r2 + mov 
$bsz,r3 + +#ifdef __thumb2__ + mov r9,#0x00ff00ff + mov r8,#0x0f0f0f0f + mov r7,#0x33333333 + mov r6,#0x55555555 +#else + mov r6,#0x11 @ compose constants + mov r8,#0x0f + mov r9,#0xff + orr r6,r6,r6,lsl#8 + orr r8,r8,r8,lsl#8 + orr r6,r6,r6,lsl#16 @ 0x11111111 + orr r9,r9,r9,lsl#16 @ 0x00ff00ff + orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f + orr r7,r6,r6,lsl#1 @ 0x33333333 + orr r6,r6,r6,lsl#2 @ 0x55555555 +#endif + stmdb sp!,{r6-r9} + + mov r14,$A_flat + b .Loop_squeeze + +.align 4 +.Loop_squeeze: + ldmia $A_flat!,{r0,r1} @ A_flat[i++] + + lsl r2,r0,#16 + lsl r3,r1,#16 @ r3 = r1 << 16 + lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff + lsr r1,r1,#16 + lsr r0,r0,#16 @ r0 = r0 >> 16 + lsl r1,r1,#16 @ r1 = r1 & 0xffff0000 + + orr r2,r2,r2,lsl#8 + orr r3,r3,r3,lsr#8 + orr r0,r0,r0,lsl#8 + orr r1,r1,r1,lsr#8 + and r2,r2,r9 @ &=0x00ff00ff + and r3,r3,r9,lsl#8 @ &=0xff00ff00 + and r0,r0,r9 @ &=0x00ff00ff + and r1,r1,r9,lsl#8 @ &=0xff00ff00 + orr r2,r2,r2,lsl#4 + orr r3,r3,r3,lsr#4 + orr r0,r0,r0,lsl#4 + orr r1,r1,r1,lsr#4 + and r2,r2,r8 @ &=0x0f0f0f0f + and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0 + and r0,r0,r8 @ &=0x0f0f0f0f + and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 + orr r2,r2,r2,lsl#2 + orr r3,r3,r3,lsr#2 + orr r0,r0,r0,lsl#2 + orr r1,r1,r1,lsr#2 + and r2,r2,r7 @ &=0x33333333 + and r3,r3,r7,lsl#2 @ &=0xcccccccc + and r0,r0,r7 @ &=0x33333333 + and r1,r1,r7,lsl#2 @ &=0xcccccccc + orr r2,r2,r2,lsl#1 + orr r3,r3,r3,lsr#1 + orr r0,r0,r0,lsl#1 + orr r1,r1,r1,lsr#1 + and r2,r2,r6 @ &=0x55555555 + and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa + and r0,r0,r6 @ &=0x55555555 + and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa + + orr r2,r2,r3 + orr r0,r0,r1 + + cmp $len,#8 + blo .Lsqueeze_tail + lsr r1,r2,#8 + strb r2,[$out],#1 + lsr r3,r2,#16 + strb r1,[$out],#1 + lsr r2,r2,#24 + strb r3,[$out],#1 + strb r2,[$out],#1 + + lsr r1,r0,#8 + strb r0,[$out],#1 + lsr r3,r0,#16 + strb r1,[$out],#1 + lsr r0,r0,#24 + strb r3,[$out],#1 + strb r0,[$out],#1 + subs $len,$len,#8 + beq .Lsqueeze_done + + subs $bsz,$bsz,#8 @ bsz -= 8 + bhi .Loop_squeeze + + mov r0,r14 @ original $A_flat + + bl KeccakF1600 + + ldmia sp,{r6-r10,r12} @ restore constants and variables + mov r14,$A_flat + b .Loop_squeeze + +.align 4 +.Lsqueeze_tail: + strb r2,[$out],#1 + lsr r2,r2,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb r2,[$out],#1 + lsr r2,r2,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb r2,[$out],#1 + lsr r2,r2,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb r2,[$out],#1 + subs $len,$len,#1 + beq .Lsqueeze_done + + strb r0,[$out],#1 + lsr r0,r0,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb r0,[$out],#1 + lsr r0,r0,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb r0,[$out] + b .Lsqueeze_done + +.align 4 +.Lsqueeze_done: + add sp,sp,#24 + ldmia sp!,{r4-r10,pc} +.size SHA3_squeeze,.-SHA3_squeeze +___ +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type iotas64, %object +.align 5 +iotas64: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.size iotas64,.-iotas64 + +.type 
KeccakF1600_neon, %function +.align 5 +KeccakF1600_neon: + add r1, r0, #16 + adr r2, iotas64 + mov r3, #24 @ loop counter + b .Loop_neon + +.align 4 +.Loop_neon: + @ Theta + vst1.64 {q4}, [r0:64] @ offload A[0..1][4] + veor q13, q0, q5 @ A[0..1][0]^A[2..3][0] + vst1.64 {d18}, [r1:64] @ offload A[2][4] + veor q14, q1, q6 @ A[0..1][1]^A[2..3][1] + veor q15, q2, q7 @ A[0..1][2]^A[2..3][2] + veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0] + veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1] + veor q14, q3, q8 @ A[0..1][3]^A[2..3][3] + veor q4, q4, q9 @ A[0..1][4]^A[2..3][4] + veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2] + veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3] + veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4] + veor q13, q13, q10 @ C[0..1]^=A[4][0..1] + veor q14, q15, q11 @ C[2..3]^=A[4][2..3] + veor d25, d25, d24 @ C[4]^=A[4][4] + + vadd.u64 q4, q13, q13 @ C[0..1]<<1 + vadd.u64 q15, q14, q14 @ C[2..3]<<1 + vadd.u64 d18, d25, d25 @ C[4]<<1 + vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1) + vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1) + vsri.u64 d18, d25, #63 @ ROL64(C[4],1) + veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1) + veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1) + veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1) + veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1) + + veor d0, d0, d25 @ A[0][0] ^= C[4] + veor d1, d1, d25 @ A[1][0] ^= C[4] + veor d10, d10, d25 @ A[2][0] ^= C[4] + veor d11, d11, d25 @ A[3][0] ^= C[4] + veor d20, d20, d25 @ A[4][0] ^= C[4] + + veor d2, d2, d26 @ A[0][1] ^= D[1] + veor d3, d3, d26 @ A[1][1] ^= D[1] + veor d12, d12, d26 @ A[2][1] ^= D[1] + veor d13, d13, d26 @ A[3][1] ^= D[1] + veor d21, d21, d26 @ A[4][1] ^= D[1] + vmov d26, d27 + + veor d6, d6, d28 @ A[0][3] ^= C[2] + veor d7, d7, d28 @ A[1][3] ^= C[2] + veor d16, d16, d28 @ A[2][3] ^= C[2] + veor d17, d17, d28 @ A[3][3] ^= C[2] + veor d23, d23, d28 @ A[4][3] ^= C[2] + vld1.64 {q4}, [r0:64] @ restore A[0..1][4] + vmov d28, d29 + + vld1.64 {d18}, [r1:64] @ restore A[2][4] + veor q2, q2, q13 @ A[0..1][2] ^= D[2] + veor q7, q7, q13 @ A[2..3][2] ^= D[2] + veor d22, d22, d27 @ A[4][2] ^= D[2] + + veor q4, q4, q14 @ A[0..1][4] ^= C[3] + veor q9, q9, q14 @ A[2..3][4] ^= C[3] + veor d24, d24, d29 @ A[4][4] ^= C[3] + + @ Rho + Pi + vmov d26, d2 @ C[1] = A[0][1] + vshl.u64 d2, d3, #44 + vmov d27, d4 @ C[2] = A[0][2] + vshl.u64 d4, d14, #43 + vmov d28, d6 @ C[3] = A[0][3] + vshl.u64 d6, d17, #21 + vmov d29, d8 @ C[4] = A[0][4] + vshl.u64 d8, d24, #14 + vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1]) + vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2]) + vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3]) + vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4]) + + vshl.u64 d3, d9, #20 + vshl.u64 d14, d16, #25 + vshl.u64 d17, d15, #15 + vshl.u64 d24, d21, #2 + vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4]) + vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3]) + vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2]) + vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1]) + + vshl.u64 d9, d22, #61 + @ vshl.u64 d16, d19, #8 + vshl.u64 d15, d12, #10 + vshl.u64 d21, d7, #55 + vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2]) + vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4]) + vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1]) + vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], 
rhotates[1][3]) + + vshl.u64 d22, d18, #39 + @ vshl.u64 d19, d23, #56 + vshl.u64 d12, d5, #6 + vshl.u64 d7, d13, #45 + vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4]) + vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3]) + vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2]) + vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1]) + + vshl.u64 d18, d20, #18 + vshl.u64 d23, d11, #41 + vshl.u64 d5, d10, #3 + vshl.u64 d13, d1, #36 + vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0]) + vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0]) + vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0]) + vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0]) + + vshl.u64 d1, d28, #28 + vshl.u64 d10, d26, #1 + vshl.u64 d11, d29, #27 + vshl.u64 d20, d27, #62 + vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3]) + vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1]) + vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4]) + vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2]) + + @ Chi + Iota + vbic q13, q2, q1 + vbic q14, q3, q2 + vbic q15, q4, q3 + veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2]) + veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3]) + veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4]) + vst1.64 {q13}, [r0:64] @ offload A[0..1][0] + vbic q13, q0, q4 + vbic q15, q1, q0 + vmov q1, q14 @ A[0..1][1] + veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0]) + veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1]) + + vbic q13, q7, q6 + vmov q0, q5 @ A[2..3][0] + vbic q14, q8, q7 + vmov q15, q6 @ A[2..3][1] + veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2]) + vbic q13, q9, q8 + veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3]) + vbic q14, q0, q9 + veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4]) + vbic q13, q15, q0 + veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0]) + vmov q14, q10 @ A[4][0..1] + veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1]) + + vld1.64 d25, [r2:64]! @ Iota[i++] + vbic d26, d22, d21 + vbic d27, d23, d22 + vld1.64 {q0}, [r0:64] @ restore A[0..1][0] + veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2]) + vbic d26, d24, d23 + veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3]) + vbic d27, d28, d24 + veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4]) + vbic d26, d29, d28 + veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0]) + veor d0, d0, d25 @ A[0][0] ^= Iota[i] + veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1]) + + subs r3, r3, #1 + bne .Loop_neon + + bx lr +.size KeccakF1600_neon,.-KeccakF1600_neon + +.global SHA3_absorb_neon +.type SHA3_absorb_neon, %function +.align 5 +SHA3_absorb_neon: + stmdb sp!, {r4-r6,lr} + vstmdb sp!, {d8-d15} + + mov r4, r1 @ inp + mov r5, r2 @ len + mov r6, r3 @ bsz + + vld1.32 {d0}, [r0:64]! @ A[0][0] + vld1.32 {d2}, [r0:64]! @ A[0][1] + vld1.32 {d4}, [r0:64]! @ A[0][2] + vld1.32 {d6}, [r0:64]! @ A[0][3] + vld1.32 {d8}, [r0:64]! @ A[0][4] + + vld1.32 {d1}, [r0:64]! @ A[1][0] + vld1.32 {d3}, [r0:64]! @ A[1][1] + vld1.32 {d5}, [r0:64]! @ A[1][2] + vld1.32 {d7}, [r0:64]! @ A[1][3] + vld1.32 {d9}, [r0:64]! @ A[1][4] + + vld1.32 {d10}, [r0:64]! @ A[2][0] + vld1.32 {d12}, [r0:64]! @ A[2][1] + vld1.32 {d14}, [r0:64]! @ A[2][2] + vld1.32 {d16}, [r0:64]! @ A[2][3] + vld1.32 {d18}, [r0:64]! @ A[2][4] + + vld1.32 {d11}, [r0:64]! @ A[3][0] + vld1.32 {d13}, [r0:64]! 
@ A[3][1] + vld1.32 {d15}, [r0:64]! @ A[3][2] + vld1.32 {d17}, [r0:64]! @ A[3][3] + vld1.32 {d19}, [r0:64]! @ A[3][4] + + vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3] + vld1.32 {d24}, [r0:64] @ A[4][4] + sub r0, r0, #24*8 @ rewind + b .Loop_absorb_neon + +.align 4 +.Loop_absorb_neon: + subs r12, r5, r6 @ len - bsz + blo .Labsorbed_neon + mov r5, r12 + + vld1.8 {d31}, [r4]! @ endian-neutral loads... + cmp r6, #8*2 + veor d0, d0, d31 @ A[0][0] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d2, d2, d31 @ A[0][1] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*4 + veor d4, d4, d31 @ A[0][2] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d6, d6, d31 @ A[0][3] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31},[r4]! + cmp r6, #8*6 + veor d8, d8, d31 @ A[0][4] ^= *inp++ + blo .Lprocess_neon + + vld1.8 {d31}, [r4]! + veor d1, d1, d31 @ A[1][0] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*8 + veor d3, d3, d31 @ A[1][1] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d5, d5, d31 @ A[1][2] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*10 + veor d7, d7, d31 @ A[1][3] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d9, d9, d31 @ A[1][4] ^= *inp++ + beq .Lprocess_neon + + vld1.8 {d31}, [r4]! + cmp r6, #8*12 + veor d10, d10, d31 @ A[2][0] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d12, d12, d31 @ A[2][1] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*14 + veor d14, d14, d31 @ A[2][2] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d16, d16, d31 @ A[2][3] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*16 + veor d18, d18, d31 @ A[2][4] ^= *inp++ + blo .Lprocess_neon + + vld1.8 {d31}, [r4]! + veor d11, d11, d31 @ A[3][0] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*18 + veor d13, d13, d31 @ A[3][1] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d15, d15, d31 @ A[3][2] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*20 + veor d17, d17, d31 @ A[3][3] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d19, d19, d31 @ A[3][4] ^= *inp++ + beq .Lprocess_neon + + vld1.8 {d31}, [r4]! + cmp r6, #8*22 + veor d20, d20, d31 @ A[4][0] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d21, d21, d31 @ A[4][1] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + cmp r6, #8*24 + veor d22, d22, d31 @ A[4][2] ^= *inp++ + blo .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d23, d23, d31 @ A[4][3] ^= *inp++ + beq .Lprocess_neon + vld1.8 {d31}, [r4]! + veor d24, d24, d31 @ A[4][4] ^= *inp++ + +.Lprocess_neon: + bl KeccakF1600_neon + b .Loop_absorb_neon + +.align 4 +.Labsorbed_neon: + vst1.32 {d0}, [r0:64]! @ A[0][0..4] + vst1.32 {d2}, [r0:64]! + vst1.32 {d4}, [r0:64]! + vst1.32 {d6}, [r0:64]! + vst1.32 {d8}, [r0:64]! + + vst1.32 {d1}, [r0:64]! @ A[1][0..4] + vst1.32 {d3}, [r0:64]! + vst1.32 {d5}, [r0:64]! + vst1.32 {d7}, [r0:64]! + vst1.32 {d9}, [r0:64]! + + vst1.32 {d10}, [r0:64]! @ A[2][0..4] + vst1.32 {d12}, [r0:64]! + vst1.32 {d14}, [r0:64]! + vst1.32 {d16}, [r0:64]! + vst1.32 {d18}, [r0:64]! + + vst1.32 {d11}, [r0:64]! @ A[3][0..4] + vst1.32 {d13}, [r0:64]! + vst1.32 {d15}, [r0:64]! + vst1.32 {d17}, [r0:64]! + vst1.32 {d19}, [r0:64]! + + vst1.32 {d20-d23}, [r0:64]! 
@ A[4][0..4] + vst1.32 {d24}, [r0:64] + + mov r0, r5 @ return value + vldmia sp!, {d8-d15} + ldmia sp!, {r4-r6,pc} +.size SHA3_absorb_neon,.-SHA3_absorb_neon + +.global SHA3_squeeze_neon +.type SHA3_squeeze_neon, %function +.align 5 +SHA3_squeeze_neon: + stmdb sp!, {r4-r6,lr} + + mov r4, r1 @ out + mov r5, r2 @ len + mov r6, r3 @ bsz + mov r12, r0 @ A_flat + mov r14, r3 @ bsz + b .Loop_squeeze_neon + +.align 4 +.Loop_squeeze_neon: + cmp r5, #8 + blo .Lsqueeze_neon_tail + vld1.32 {d0}, [r12]! + vst1.8 {d0}, [r4]! @ endian-neutral store + + subs r5, r5, #8 @ len -= 8 + beq .Lsqueeze_neon_done + + subs r14, r14, #8 @ bsz -= 8 + bhi .Loop_squeeze_neon + + vstmdb sp!, {d8-d15} + + vld1.32 {d0}, [r0:64]! @ A[0][0..4] + vld1.32 {d2}, [r0:64]! + vld1.32 {d4}, [r0:64]! + vld1.32 {d6}, [r0:64]! + vld1.32 {d8}, [r0:64]! + + vld1.32 {d1}, [r0:64]! @ A[1][0..4] + vld1.32 {d3}, [r0:64]! + vld1.32 {d5}, [r0:64]! + vld1.32 {d7}, [r0:64]! + vld1.32 {d9}, [r0:64]! + + vld1.32 {d10}, [r0:64]! @ A[2][0..4] + vld1.32 {d12}, [r0:64]! + vld1.32 {d14}, [r0:64]! + vld1.32 {d16}, [r0:64]! + vld1.32 {d18}, [r0:64]! + + vld1.32 {d11}, [r0:64]! @ A[3][0..4] + vld1.32 {d13}, [r0:64]! + vld1.32 {d15}, [r0:64]! + vld1.32 {d17}, [r0:64]! + vld1.32 {d19}, [r0:64]! + + vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4] + vld1.32 {d24}, [r0:64] + sub r0, r0, #24*8 @ rewind + + bl KeccakF1600_neon + + mov r12, r0 @ A_flat + vst1.32 {d0}, [r0:64]! @ A[0][0..4] + vst1.32 {d2}, [r0:64]! + vst1.32 {d4}, [r0:64]! + vst1.32 {d6}, [r0:64]! + vst1.32 {d8}, [r0:64]! + + vst1.32 {d1}, [r0:64]! @ A[1][0..4] + vst1.32 {d3}, [r0:64]! + vst1.32 {d5}, [r0:64]! + vst1.32 {d7}, [r0:64]! + vst1.32 {d9}, [r0:64]! + + vst1.32 {d10}, [r0:64]! @ A[2][0..4] + vst1.32 {d12}, [r0:64]! + vst1.32 {d14}, [r0:64]! + vst1.32 {d16}, [r0:64]! + vst1.32 {d18}, [r0:64]! + + vst1.32 {d11}, [r0:64]! @ A[3][0..4] + vst1.32 {d13}, [r0:64]! + vst1.32 {d15}, [r0:64]! + vst1.32 {d17}, [r0:64]! + vst1.32 {d19}, [r0:64]! + + vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4] + mov r14, r6 @ bsz + vst1.32 {d24}, [r0:64] + mov r0, r12 @ rewind + + vldmia sp!, {d8-d15} + b .Loop_squeeze_neon + +.align 4 +.Lsqueeze_neon_tail: + ldmia r12, {r2,r3} + cmp r5, #2 + strb r2, [r4],#1 @ endian-neutral store + lsr r2, r2, #8 + blo .Lsqueeze_neon_done + strb r2, [r4], #1 + lsr r2, r2, #8 + beq .Lsqueeze_neon_done + strb r2, [r4], #1 + lsr r2, r2, #8 + cmp r5, #4 + blo .Lsqueeze_neon_done + strb r2, [r4], #1 + beq .Lsqueeze_neon_done + + strb r3, [r4], #1 + lsr r3, r3, #8 + cmp r5, #6 + blo .Lsqueeze_neon_done + strb r3, [r4], #1 + lsr r3, r3, #8 + beq .Lsqueeze_neon_done + strb r3, [r4], #1 + +.Lsqueeze_neon_done: + ldmia sp!, {r4-r6,pc} +.size SHA3_squeeze_neon,.-SHA3_squeeze_neon +#endif +.asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +{ + my %ldr, %str; + + sub ldrd { + my ($mnemonic,$half,$reg,$ea) = @_; + my $op = $mnemonic eq "ldr" ? \%ldr : \%str; + + if ($half eq "l") { + $$op{reg} = $reg; + $$op{ea} = $ea; + sprintf "#ifndef __thumb2__\n" . + " %s\t%s,%s\n" . + "#endif", $mnemonic,$reg,$ea; + } else { + sprintf "#ifndef __thumb2__\n" . + " %s\t%s,%s\n" . + "#else\n" . + " %sd\t%s,%s,%s\n" . 
+ "#endif", $mnemonic,$reg,$ea, + $mnemonic,$$op{reg},$reg,$$op{ea}; + } + } +} + +foreach (split($/,$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or + s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or + s/\bret\b/bx lr/g or + s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl new file mode 100755 index 000000000000..704ab4a7e45a --- /dev/null +++ b/crypto/sha/asm/keccak1600-armv8.pl @@ -0,0 +1,866 @@ +#!/usr/bin/env perl +# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Keccak-1600 for ARMv8. +# +# June 2017. +# +# This is straightforward KECCAK_1X_ALT implementation. It makes no +# sense to attempt SIMD/NEON implementation for following reason. +# 64-bit lanes of vector registers can't be addressed as easily as in +# 32-bit mode. This means that 64-bit NEON is bound to be slower than +# 32-bit NEON, and this implementation is faster than 32-bit NEON on +# same processor. Even though it takes more scalar xor's and andn's, +# it gets compensated by availability of rotate. Not to forget that +# most processors achieve higher issue rate with scalar instructions. +# +# February 2018. +# +# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT +# variant with register permutation/rotation twist that allows to +# eliminate copies to temporary registers. If you look closely you'll +# notice that it uses only one lane of vector registers. The new +# instructions effectively facilitate parallel hashing, which we don't +# support [yet?]. But lowest-level core procedure is prepared for it. +# The inner round is 67 [vector] instructions, so it's not actually +# obvious that it will provide performance improvement [in serial +# hash] as long as vector instructions issue rate is limited to 1 per +# cycle... +# +###################################################################### +# Numbers are cycles per processed byte. +# +# r=1088(*) +# +# Cortex-A53 13 +# Cortex-A57 12 +# X-Gene 14 +# Mongoose 10 +# Kryo 12 +# Denver 7.8 +# Apple A7 7.2 +# +# (*) Corresponds to SHA3-256. No improvement coefficients are listed +# because they vary too much from compiler to compiler. Newer +# compiler does much better and improvement varies from 5% on +# Cortex-A57 to 25% on Cortex-A53. While in comparison to older +# compiler this code is at least 2x faster... 
+ +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my @rhotates = ([ 0, 1, 62, 28, 27 ], + [ 36, 44, 6, 55, 20 ], + [ 3, 10, 43, 25, 39 ], + [ 41, 45, 15, 21, 8 ], + [ 18, 2, 61, 56, 14 ]); + +$code.=<<___; +.text + +.align 8 // strategic alignment and padding that allows to use + // address value as loop termination condition... + .quad 0,0,0,0,0,0,0,0 +.type iotas,%object +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.size iotas,.-iotas +___ + {{{ +my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ], + (0, 5, 10, 15, 20)); + $A[3][3] = "x25"; # x18 is reserved + +my @C = map("x$_", (26,27,28,30)); + +$code.=<<___; +.type KeccakF1600_int,%function +.align 5 +KeccakF1600_int: + adr $C[2],iotas + stp $C[2],x30,[sp,#16] // 32 bytes on top are mine + b .Loop +.align 4 +.Loop: + ////////////////////////////////////////// Theta + eor $C[0],$A[0][0],$A[1][0] + stp $A[0][4],$A[1][4],[sp,#0] // offload pair... 
+ eor $C[1],$A[0][1],$A[1][1] + eor $C[2],$A[0][2],$A[1][2] + eor $C[3],$A[0][3],$A[1][3] +___ + $C[4]=$A[0][4]; + $C[5]=$A[1][4]; +$code.=<<___; + eor $C[4],$A[0][4],$A[1][4] + eor $C[0],$C[0],$A[2][0] + eor $C[1],$C[1],$A[2][1] + eor $C[2],$C[2],$A[2][2] + eor $C[3],$C[3],$A[2][3] + eor $C[4],$C[4],$A[2][4] + eor $C[0],$C[0],$A[3][0] + eor $C[1],$C[1],$A[3][1] + eor $C[2],$C[2],$A[3][2] + eor $C[3],$C[3],$A[3][3] + eor $C[4],$C[4],$A[3][4] + eor $C[0],$C[0],$A[4][0] + eor $C[2],$C[2],$A[4][2] + eor $C[1],$C[1],$A[4][1] + eor $C[3],$C[3],$A[4][3] + eor $C[4],$C[4],$A[4][4] + + eor $C[5],$C[0],$C[2],ror#63 + + eor $A[0][1],$A[0][1],$C[5] + eor $A[1][1],$A[1][1],$C[5] + eor $A[2][1],$A[2][1],$C[5] + eor $A[3][1],$A[3][1],$C[5] + eor $A[4][1],$A[4][1],$C[5] + + eor $C[5],$C[1],$C[3],ror#63 + eor $C[2],$C[2],$C[4],ror#63 + eor $C[3],$C[3],$C[0],ror#63 + eor $C[4],$C[4],$C[1],ror#63 + + eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2] + eor $A[1][2],$A[1][2],$C[5] + eor $A[2][2],$A[2][2],$C[5] + eor $A[3][2],$A[3][2],$C[5] + eor $A[4][2],$A[4][2],$C[5] + + eor $A[0][0],$A[0][0],$C[4] + eor $A[1][0],$A[1][0],$C[4] + eor $A[2][0],$A[2][0],$C[4] + eor $A[3][0],$A[3][0],$C[4] + eor $A[4][0],$A[4][0],$C[4] +___ + $C[4]=undef; + $C[5]=undef; +$code.=<<___; + ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data + eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3] + eor $A[1][3],$A[1][3],$C[2] + eor $A[2][3],$A[2][3],$C[2] + eor $A[3][3],$A[3][3],$C[2] + eor $A[4][3],$A[4][3],$C[2] + + eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4] + eor $A[1][4],$A[1][4],$C[3] + eor $A[2][4],$A[2][4],$C[3] + eor $A[3][4],$A[3][4],$C[3] + eor $A[4][4],$A[4][4],$C[3] + + ////////////////////////////////////////// Rho+Pi + mov $C[3],$A[0][1] + ror $A[0][1],$A[1][1],#64-$rhotates[1][1] + //mov $C[1],$A[0][2] + ror $A[0][2],$A[2][2],#64-$rhotates[2][2] + //mov $C[0],$A[0][3] + ror $A[0][3],$A[3][3],#64-$rhotates[3][3] + //mov $C[2],$A[0][4] + ror $A[0][4],$A[4][4],#64-$rhotates[4][4] + + ror $A[1][1],$A[1][4],#64-$rhotates[1][4] + ror $A[2][2],$A[2][3],#64-$rhotates[2][3] + ror $A[3][3],$A[3][2],#64-$rhotates[3][2] + ror $A[4][4],$A[4][1],#64-$rhotates[4][1] + + ror $A[1][4],$A[4][2],#64-$rhotates[4][2] + ror $A[2][3],$A[3][4],#64-$rhotates[3][4] + ror $A[3][2],$A[2][1],#64-$rhotates[2][1] + ror $A[4][1],$A[1][3],#64-$rhotates[1][3] + + ror $A[4][2],$A[2][4],#64-$rhotates[2][4] + ror $A[3][4],$A[4][3],#64-$rhotates[4][3] + ror $A[2][1],$A[1][2],#64-$rhotates[1][2] + ror $A[1][3],$A[3][1],#64-$rhotates[3][1] + + ror $A[2][4],$A[4][0],#64-$rhotates[4][0] + ror $A[4][3],$A[3][0],#64-$rhotates[3][0] + ror $A[1][2],$A[2][0],#64-$rhotates[2][0] + ror $A[3][1],$A[1][0],#64-$rhotates[1][0] + + ror $A[1][0],$C[0],#64-$rhotates[0][3] + ror $A[2][0],$C[3],#64-$rhotates[0][1] + ror $A[3][0],$C[2],#64-$rhotates[0][4] + ror $A[4][0],$C[1],#64-$rhotates[0][2] + + ////////////////////////////////////////// Chi+Iota + bic $C[0],$A[0][2],$A[0][1] + bic $C[1],$A[0][3],$A[0][2] + bic $C[2],$A[0][0],$A[0][4] + bic $C[3],$A[0][1],$A[0][0] + eor $A[0][0],$A[0][0],$C[0] + bic $C[0],$A[0][4],$A[0][3] + eor $A[0][1],$A[0][1],$C[1] + ldr $C[1],[sp,#16] + eor $A[0][3],$A[0][3],$C[2] + eor $A[0][4],$A[0][4],$C[3] + eor $A[0][2],$A[0][2],$C[0] + ldr $C[3],[$C[1]],#8 // Iota[i++] + + bic $C[0],$A[1][2],$A[1][1] + tst $C[1],#255 // are we done? 
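+	// iotas sits 64 bytes past a 256-byte boundary (.align 8 plus the
+	// zero padding above), so after the 24th round constant the pointer
+	// reaches the next boundary and its low eight bits become zero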
+ str $C[1],[sp,#16] + bic $C[1],$A[1][3],$A[1][2] + bic $C[2],$A[1][0],$A[1][4] + eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota + bic $C[3],$A[1][1],$A[1][0] + eor $A[1][0],$A[1][0],$C[0] + bic $C[0],$A[1][4],$A[1][3] + eor $A[1][1],$A[1][1],$C[1] + eor $A[1][3],$A[1][3],$C[2] + eor $A[1][4],$A[1][4],$C[3] + eor $A[1][2],$A[1][2],$C[0] + + bic $C[0],$A[2][2],$A[2][1] + bic $C[1],$A[2][3],$A[2][2] + bic $C[2],$A[2][0],$A[2][4] + bic $C[3],$A[2][1],$A[2][0] + eor $A[2][0],$A[2][0],$C[0] + bic $C[0],$A[2][4],$A[2][3] + eor $A[2][1],$A[2][1],$C[1] + eor $A[2][3],$A[2][3],$C[2] + eor $A[2][4],$A[2][4],$C[3] + eor $A[2][2],$A[2][2],$C[0] + + bic $C[0],$A[3][2],$A[3][1] + bic $C[1],$A[3][3],$A[3][2] + bic $C[2],$A[3][0],$A[3][4] + bic $C[3],$A[3][1],$A[3][0] + eor $A[3][0],$A[3][0],$C[0] + bic $C[0],$A[3][4],$A[3][3] + eor $A[3][1],$A[3][1],$C[1] + eor $A[3][3],$A[3][3],$C[2] + eor $A[3][4],$A[3][4],$C[3] + eor $A[3][2],$A[3][2],$C[0] + + bic $C[0],$A[4][2],$A[4][1] + bic $C[1],$A[4][3],$A[4][2] + bic $C[2],$A[4][0],$A[4][4] + bic $C[3],$A[4][1],$A[4][0] + eor $A[4][0],$A[4][0],$C[0] + bic $C[0],$A[4][4],$A[4][3] + eor $A[4][1],$A[4][1],$C[1] + eor $A[4][3],$A[4][3],$C[2] + eor $A[4][4],$A[4][4],$C[3] + eor $A[4][2],$A[4][2],$C[0] + + bne .Loop + + ldr x30,[sp,#24] + ret +.size KeccakF1600_int,.-KeccakF1600_int + +.type KeccakF1600,%function +.align 5 +KeccakF1600: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#48 + + str x0,[sp,#32] // offload argument + mov $C[0],x0 + ldp $A[0][0],$A[0][1],[x0,#16*0] + ldp $A[0][2],$A[0][3],[$C[0],#16*1] + ldp $A[0][4],$A[1][0],[$C[0],#16*2] + ldp $A[1][1],$A[1][2],[$C[0],#16*3] + ldp $A[1][3],$A[1][4],[$C[0],#16*4] + ldp $A[2][0],$A[2][1],[$C[0],#16*5] + ldp $A[2][2],$A[2][3],[$C[0],#16*6] + ldp $A[2][4],$A[3][0],[$C[0],#16*7] + ldp $A[3][1],$A[3][2],[$C[0],#16*8] + ldp $A[3][3],$A[3][4],[$C[0],#16*9] + ldp $A[4][0],$A[4][1],[$C[0],#16*10] + ldp $A[4][2],$A[4][3],[$C[0],#16*11] + ldr $A[4][4],[$C[0],#16*12] + + bl KeccakF1600_int + + ldr $C[0],[sp,#32] + stp $A[0][0],$A[0][1],[$C[0],#16*0] + stp $A[0][2],$A[0][3],[$C[0],#16*1] + stp $A[0][4],$A[1][0],[$C[0],#16*2] + stp $A[1][1],$A[1][2],[$C[0],#16*3] + stp $A[1][3],$A[1][4],[$C[0],#16*4] + stp $A[2][0],$A[2][1],[$C[0],#16*5] + stp $A[2][2],$A[2][3],[$C[0],#16*6] + stp $A[2][4],$A[3][0],[$C[0],#16*7] + stp $A[3][1],$A[3][2],[$C[0],#16*8] + stp $A[3][3],$A[3][4],[$C[0],#16*9] + stp $A[4][0],$A[4][1],[$C[0],#16*10] + stp $A[4][2],$A[4][3],[$C[0],#16*11] + str $A[4][4],[$C[0],#16*12] + + ldp x19,x20,[x29,#16] + add sp,sp,#48 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size KeccakF1600,.-KeccakF1600 + +.globl SHA3_absorb +.type SHA3_absorb,%function +.align 5 +SHA3_absorb: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + stp x0,x1,[sp,#32] // offload arguments + stp x2,x3,[sp,#48] + + mov $C[0],x0 // uint64_t A[5][5] + mov $C[1],x1 // const void *inp + mov $C[2],x2 // size_t len + mov $C[3],x3 // size_t bsz + ldp $A[0][0],$A[0][1],[$C[0],#16*0] + ldp $A[0][2],$A[0][3],[$C[0],#16*1] + ldp $A[0][4],$A[1][0],[$C[0],#16*2] + ldp $A[1][1],$A[1][2],[$C[0],#16*3] + ldp $A[1][3],$A[1][4],[$C[0],#16*4] + ldp $A[2][0],$A[2][1],[$C[0],#16*5] + ldp $A[2][2],$A[2][3],[$C[0],#16*6] + ldp $A[2][4],$A[3][0],[$C[0],#16*7] + ldp $A[3][1],$A[3][2],[$C[0],#16*8] + ldp $A[3][3],$A[3][4],[$C[0],#16*9] + ldp $A[4][0],$A[4][1],[$C[0],#16*10] + ldp $A[4][2],$A[4][3],[$C[0],#16*11] + ldr $A[4][4],[$C[0],#16*12] + b .Loop_absorb + +.align 4 +.Loop_absorb: + subs $C[0],$C[2],$C[3] // len - bsz + blo .Labsorbed + + str $C[0],[sp,#48] // save len - bsz +___ +for (my $i=0; $i<24; $i+=2) { +my $j = $i+1; +$code.=<<___; + ldr $C[0],[$C[1]],#8 // *inp++ +#ifdef __AARCH64EB__ + rev $C[0],$C[0] +#endif + eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0] + cmp $C[3],#8*($i+2) + blo .Lprocess_block + ldr $C[0],[$C[1]],#8 // *inp++ +#ifdef __AARCH64EB__ + rev $C[0],$C[0] +#endif + eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0] + beq .Lprocess_block +___ +} +$code.=<<___; + ldr $C[0],[$C[1]],#8 // *inp++ +#ifdef __AARCH64EB__ + rev $C[0],$C[0] +#endif + eor $A[4][4],$A[4][4],$C[0] + +.Lprocess_block: + str $C[1],[sp,#40] // save inp + + bl KeccakF1600_int + + ldr $C[1],[sp,#40] // restore arguments + ldp $C[2],$C[3],[sp,#48] + b .Loop_absorb + +.align 4 +.Labsorbed: + ldr $C[1],[sp,#32] + stp $A[0][0],$A[0][1],[$C[1],#16*0] + stp $A[0][2],$A[0][3],[$C[1],#16*1] + stp $A[0][4],$A[1][0],[$C[1],#16*2] + stp $A[1][1],$A[1][2],[$C[1],#16*3] + stp $A[1][3],$A[1][4],[$C[1],#16*4] + stp $A[2][0],$A[2][1],[$C[1],#16*5] + stp $A[2][2],$A[2][3],[$C[1],#16*6] + stp $A[2][4],$A[3][0],[$C[1],#16*7] + stp $A[3][1],$A[3][2],[$C[1],#16*8] + stp $A[3][3],$A[3][4],[$C[1],#16*9] + stp $A[4][0],$A[4][1],[$C[1],#16*10] + stp $A[4][2],$A[4][3],[$C[1],#16*11] + str $A[4][4],[$C[1],#16*12] + + mov x0,$C[2] // return value + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + ret +.size SHA3_absorb,.-SHA3_absorb +___ +{ +my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22)); +$code.=<<___; +.globl SHA3_squeeze +.type SHA3_squeeze,%function +.align 5 +SHA3_squeeze: + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + mov $A_flat,x0 // put aside arguments + mov $out,x1 + mov $len,x2 + mov $bsz,x3 + +.Loop_squeeze: + ldr x4,[x0],#8 + cmp $len,#8 + blo .Lsqueeze_tail +#ifdef __AARCH64EB__ + rev x4,x4 +#endif + str x4,[$out],#8 + subs $len,$len,#8 + beq .Lsqueeze_done + + subs x3,x3,#8 + bhi .Loop_squeeze + + mov x0,$A_flat + bl KeccakF1600 + mov x0,$A_flat + mov x3,$bsz + b .Loop_squeeze + +.align 4 +.Lsqueeze_tail: + strb w4,[$out],#1 + lsr x4,x4,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb w4,[$out],#1 + lsr x4,x4,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb w4,[$out],#1 + lsr x4,x4,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb w4,[$out],#1 + lsr x4,x4,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb w4,[$out],#1 + lsr x4,x4,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb w4,[$out],#1 + lsr x4,x4,#8 + subs $len,$len,#1 + beq .Lsqueeze_done + strb w4,[$out],#1 + +.Lsqueeze_done: + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x29,x30,[sp],#48 + ret +.size SHA3_squeeze,.-SHA3_squeeze +___ +} }}} + {{{ +my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b", + "v".($_+3).".16b", "v".($_+4).".16b" ], + (0, 5, 10, 15, 20)); + +my @C = map("v$_.16b", (25..31)); + +$code.=<<___; +.type KeccakF1600_ce,%function +.align 5 +KeccakF1600_ce: + mov x9,#12 + adr x10,iotas + b .Loop_ce +.align 4 +.Loop_ce: +___ +for($i=0; $i<2; $i++) { +$code.=<<___; + ////////////////////////////////////////////////// Theta + eor3 $C[0],$A[0][0],$A[1][0],$A[2][0] + eor3 $C[1],$A[0][1],$A[1][1],$A[2][1] + eor3 $C[2],$A[0][2],$A[1][2],$A[2][2] + eor3 $C[3],$A[0][3],$A[1][3],$A[2][3] + eor3 $C[4],$A[0][4],$A[1][4],$A[2][4] + eor3 $C[0],$C[0], $A[3][0],$A[4][0] + eor3 $C[1],$C[1], $A[3][1],$A[4][1] + eor3 $C[2],$C[2], $A[3][2],$A[4][2] + eor3 $C[3],$C[3], $A[3][3],$A[4][3] + eor3 $C[4],$C[4], $A[3][4],$A[4][4] + + rax1 $C[5],$C[0],$C[2] // D[1] + rax1 $C[6],$C[1],$C[3] // D[2] + rax1 $C[2],$C[2],$C[4] // D[3] + rax1 $C[3],$C[3],$C[0] // D[4] + rax1 $C[4],$C[4],$C[1] // D[0] + + ////////////////////////////////////////////////// Theta+Rho+Pi + xar $C[0], $A[1][1],$C[5],#64-$rhotates[1][1] // C[0]=A[0][1] + xar $A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4] + xar $A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2] + xar $A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4] + xar $A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0] + + xar $A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2] + + xar $A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2] + xar $A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3] + xar $A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4] + xar $A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3] + xar $A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0] + + xar $A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4] + + eor $A[0][0],$A[0][0],$C[4] + ldr x11,[x10],#8 + + xar $C[1], $A[3][3],$C[2],#64-$rhotates[3][3] // C[1]=A[0][3] + xar $A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2] + xar $A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1] + xar $A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2] + xar $A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0] + + xar $A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1] // * + + xar $A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4] + xar $A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1] + xar $A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3] + xar $A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1] + xar $A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0] + + xar $C[2], $A[0][3],$C[2],#64-$rhotates[0][3] // C[2]=A[1][0] + + ////////////////////////////////////////////////// Chi+Iota + dup $C[6],x11 // borrow C[6] + bcax $C[3], 
$A[0][0],$A[0][2],$C[0] // * + bcax $A[0][1],$C[0], $C[1], $A[0][2] // * + bcax $A[0][2],$A[0][2],$A[0][4],$C[1] + bcax $A[0][3],$C[1], $A[0][0],$A[0][4] + bcax $A[0][4],$A[0][4],$C[0], $A[0][0] + + bcax $A[1][0],$C[2], $A[1][2],$A[1][1] // * + bcax $C[0], $A[1][1],$A[1][3],$A[1][2] // * + bcax $A[1][2],$A[1][2],$A[1][4],$A[1][3] + bcax $A[1][3],$A[1][3],$C[2], $A[1][4] + bcax $A[1][4],$A[1][4],$A[1][1],$C[2] + + eor $A[0][0],$C[3],$C[6] // Iota + + bcax $C[1], $A[2][0],$A[2][2],$A[2][1] // * + bcax $C[2], $A[2][1],$A[2][3],$A[2][2] // * + bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3] + bcax $A[2][3],$A[2][3],$A[2][0],$A[2][4] + bcax $A[2][4],$A[2][4],$A[2][1],$A[2][0] + + bcax $C[3], $A[3][0],$A[3][2],$A[3][1] // * + bcax $C[4], $A[3][1],$A[3][3],$A[3][2] // * + bcax $A[3][2],$A[3][2],$A[3][4],$A[3][3] + bcax $A[3][3],$A[3][3],$A[3][0],$A[3][4] + bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0] + + bcax $C[5], $A[4][0],$A[4][2],$A[4][1] // * + bcax $C[6], $A[4][1],$A[4][3],$A[4][2] // * + bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3] + bcax $A[4][3],$A[4][3],$A[4][0],$A[4][4] + bcax $A[4][4],$A[4][4],$A[4][1],$A[4][0] +___ + ( $A[1][1], $C[0]) = ( $C[0], $A[1][1]); + ($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]); + ($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]); + ($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]); +} +$code.=<<___; + subs x9,x9,#1 + bne .Loop_ce + + ret +.size KeccakF1600_ce,.-KeccakF1600_ce + +.type KeccakF1600_cext,%function +.align 5 +KeccakF1600_cext: + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement + stp d10,d11,[sp,#32] + stp d12,d13,[sp,#48] + stp d14,d15,[sp,#64] +___ +for($i=0; $i<24; $i+=2) { # load A[5][5] +my $j=$i+1; +$code.=<<___; + ldp d$i,d$j,[x0,#8*$i] +___ +} +$code.=<<___; + ldr d24,[x0,#8*$i] + bl KeccakF1600_ce + ldr x30,[sp,#8] +___ +for($i=0; $i<24; $i+=2) { # store A[5][5] +my $j=$i+1; +$code.=<<___; + stp d$i,d$j,[x0,#8*$i] +___ +} +$code.=<<___; + str d24,[x0,#8*$i] + + ldp d8,d9,[sp,#16] + ldp d10,d11,[sp,#32] + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 + ret +.size KeccakF1600_cext,.-KeccakF1600_cext +___ + +{ +my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3)); + +$code.=<<___; +.globl SHA3_absorb_cext +.type SHA3_absorb_cext,%function +.align 5 +SHA3_absorb_cext: + stp x29,x30,[sp,#-80]! 
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#16]		// per ABI requirement
+	stp	d10,d11,[sp,#32]
+	stp	d12,d13,[sp,#48]
+	stp	d14,d15,[sp,#64]
+___
+for($i=0; $i<24; $i+=2) {		# load A[5][5]
+my $j=$i+1;
+$code.=<<___;
+	ldp	d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+	ldr	d24,[x0,#8*$i]
+	b	.Loop_absorb_ce
+
+.align	4
+.Loop_absorb_ce:
+	subs	$len,$len,$bsz		// len - bsz
+	blo	.Labsorbed_ce
+___
+for (my $i=0; $i<24; $i+=2) {
+my $j = $i+1;
+$code.=<<___;
+	ldr	d31,[$inp],#8		// *inp++
+#ifdef	__AARCH64EB__
+	rev64	v31.16b,v31.16b
+#endif
+	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
+	cmp	$bsz,#8*($i+2)
+	blo	.Lprocess_block_ce
+	ldr	d31,[$inp],#8		// *inp++
+#ifdef	__AARCH64EB__
+	rev64	v31.16b,v31.16b
+#endif
+	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
+	beq	.Lprocess_block_ce
+___
+}
+$code.=<<___;
+	ldr	d31,[$inp],#8		// *inp++
+#ifdef	__AARCH64EB__
+	rev64	v31.16b,v31.16b
+#endif
+	eor	$A[4][4],$A[4][4],v31.16b
+
+.Lprocess_block_ce:
+
+	bl	KeccakF1600_ce
+
+	b	.Loop_absorb_ce
+
+.align	4
+.Labsorbed_ce:
+___
+for($i=0; $i<24; $i+=2) {		# store A[5][5]
+my $j=$i+1;
+$code.=<<___;
+	stp	d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+	str	d24,[x0,#8*$i]
+	add	x0,$len,$bsz		// return value
+
+	ldp	d8,d9,[sp,#16]
+	ldp	d10,d11,[sp,#32]
+	ldp	d12,d13,[sp,#48]
+	ldp	d14,d15,[sp,#64]
+	ldp	x29,x30,[sp],#80
+	ret
+.size	SHA3_absorb_cext,.-SHA3_absorb_cext
+___
+}
+{
+my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
+$code.=<<___;
+.globl	SHA3_squeeze_cext
+.type	SHA3_squeeze_cext,%function
+.align	5
+SHA3_squeeze_cext:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x9,$ctx
+	mov	x10,$bsz
+
+.Loop_squeeze_ce:
+	ldr	x4,[x9],#8
+	cmp	$len,#8
+	blo	.Lsqueeze_tail_ce
+#ifdef	__AARCH64EB__
+	rev	x4,x4
+#endif
+	str	x4,[$out],#8
+	beq	.Lsqueeze_done_ce
+
+	sub	$len,$len,#8
+	subs	x10,x10,#8
+	bhi	.Loop_squeeze_ce
+
+	bl	KeccakF1600_cext
+	ldr	x30,[sp,#8]
+	mov	x9,$ctx
+	mov	x10,$bsz
+	b	.Loop_squeeze_ce
+
+.align	4
+.Lsqueeze_tail_ce:
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+	lsr	x4,x4,#8
+	subs	$len,$len,#1
+	beq	.Lsqueeze_done_ce
+	strb	w4,[$out],#1
+
+.Lsqueeze_done_ce:
+	ldr	x29,[sp],#16
+	ret
+.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
+___
+}						}}}
+$code.=<<___;
+.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+{   my %opcode = (
+	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
+	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);
+
+    sub unsha3 {
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+		$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
+		$mnemonic,$arg;
+    }
+}
+
+foreach(split("\n",$code)) {
+
+	s/\`([^\`]*)\`/eval($1)/ge;
+
+	m/\bdup\b/ and s/\.16b/.2d/g	or
+	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/keccak1600-avx2.pl b/crypto/sha/asm/keccak1600-avx2.pl
new file mode 100755
index 000000000000..d9fc1c59ec29
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-avx2.pl
@@ -0,0 +1,482 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for AVX2.
+#
+# July 2017.
+#
+# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
+# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
+# other than A[0][0] in magic order into 6 [256-bit] registers, *each
+# dedicated to one axis*, the Pi permutation is reduced to
+# intra-register shuffles...
+#
+# It makes other steps more intricate, but overall, is it a win? To be
+# more specific, the index permutations organized by quadruples are:
+#
+#	[4][4] [3][3] [2][2] [1][1]<-+
+#	[0][4] [0][3] [0][2] [0][1]<-+
+#	[3][0] [1][0] [4][0] [2][0]  |
+#	[4][3] [3][1] [2][4] [1][2]  |
+#	[3][4] [1][3] [4][2] [2][1]  |
+#	[2][3] [4][1] [1][4] [3][2]  |
+#	[2][2] [4][4] [1][1] [3][3] -+
+#
+# This, however, is highly impractical for Theta and Chi. What would
+# help Theta is if x indices were aligned column-wise, or in other
+# words:
+#
+#	[0][4] [0][3] [0][2] [0][1]
+#	[3][0] [1][0] [4][0] [2][0]
+#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
+#	[2][4] [4][3] [1][2] [3][1]
+#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
+#	[3][4] [1][3] [4][2] [2][1]
+#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
+#	[1][4] [2][3] [3][2] [4][1]
+#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
+#	[4][4] [3][3] [2][2] [1][1]
+#
+# So here we have it: lines not marked with vpermq() represent the
+# magic order in which data is to be loaded and maintained. [And lines
+# marked with vpermq() represent the Pi circular permutation in the
+# chosen layout. Note that the first step is permutation-free.]
+# A[0][0] is loaded to a register of its own, to all lanes. [A[0][0] is
+# not part of the Pi permutation or Rho.] Digits in variables' names
+# denote right-most coordinates:
+
+my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
+    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
+    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
+    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
+    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
+    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
+    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
+    map("%ymm$_",(0..6));
+
+# We also need to map the magic order into offsets within structure:
+
+my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
+		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
+		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
+		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
+		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
+   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear
+
+# But on the other hand, Chi is much better off if y indices were
+# aligned column-wise, not x. For this reason we have to shuffle data
+# prior to Chi and revert it afterwards. (Rho brings its own wrinkle:
+# AVX2 has no 64-bit rotate, as sketched below.)
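+#
+# Each Rho rotation below is therefore emitted as a pair of variable
+# shifts plus an OR, driven by the rhotates_left/rhotates_right tables
+# (the right-hand counts are 64-n). A minimal intrinsics sketch of one
+# such rotation (the helper name is illustrative, not part of this
+# module):
+#
+#	#include <immintrin.h>
+#
+#	/* per-lane ROL64: (x << n) | (x >> (64 - n)),
+#	   i.e. vpsllvq + vpsrlvq + vpor; n64 holds 64-n,
+#	   as in rhotates_right */
+#	static __m256i rol64x4(__m256i x, __m256i n, __m256i n64)
+#	{
+#	    return _mm256_or_si256(_mm256_sllv_epi64(x, n),
+#	                           _mm256_srlv_epi64(x, n64));
+#	}
+#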
Prior shuffle is naturally merged with +# Pi itself: +# +# [0][4] [0][3] [0][2] [0][1] +# [3][0] [1][0] [4][0] [2][0] +#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010) +#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101 +# [3][1] [1][2] [4][3] [2][4] +#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101) +#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101 +# [3][4] [1][3] [4][2] [2][1] +#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010) +#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011 +# [3][2] [1][4] [4][1] [2][3] +#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011) +#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010 +# [3][3] [1][1] [4][4] [2][2] +# +# And reverse post-Chi permutation: +# +# [0][4] [0][3] [0][2] [0][1] +# [3][0] [1][0] [4][0] [2][0] +#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011) +# [2][4] [4][3] [1][2] [3][1] +#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-) +# [3][4] [1][3] [4][2] [2][1] +#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101) +# [1][4] [2][3] [3][2] [4][1] +#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010) +# [4][4] [3][3] [2][2] [1][1] +# +######################################################################## +# Numbers are cycles per processed byte out of large message. +# +# r=1088(*) +# +# Haswell 8.7/+10% +# Skylake 7.8/+20% +# Ryzen 17(**) +# +# (*) Corresponds to SHA3-256. Percentage after slash is improvement +# coefficient in comparison to scalar keccak1600-x86_64.pl. +# (**) It's expected that Ryzen performs poorly, because instruction +# issue rate is limited to two AVX2 instructions per cycle and +# in addition vpblendd is reportedly bound to specific port. +# Obviously this code path should not be executed on Ryzen. + +my @T = map("%ymm$_",(7..15)); +my ($C14,$C00,$D00,$D14) = @T[5..8]; + +$code.=<<___; +.text + +.type __KeccakF1600,\@function +.align 32 +__KeccakF1600: + lea rhotates_left+96(%rip),%r8 + lea rhotates_right+96(%rip),%r9 + lea iotas(%rip),%r10 + mov \$24,%eax + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + ######################################### Theta + vpshufd \$0b01001110,$A20,$C00 + vpxor $A31,$A41,$C14 + vpxor $A11,$A21,@T[2] + vpxor $A01,$C14,$C14 + vpxor @T[2],$C14,$C14 # C[1..4] + + vpermq \$0b10010011,$C14,@T[4] + vpxor $A20,$C00,$C00 + vpermq \$0b01001110,$C00,@T[0] + + vpsrlq \$63,$C14,@T[1] + vpaddq $C14,$C14,@T[2] + vpor @T[2],@T[1],@T[1] # ROL64(C[1..4],1) + + vpermq \$0b00111001,@T[1],$D14 + vpxor @T[4],@T[1],$D00 + vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4] + + vpxor $A00,$C00,$C00 + vpxor @T[0],$C00,$C00 # C[0..0] + + vpsrlq \$63,$C00,@T[0] + vpaddq $C00,$C00,@T[1] + vpor @T[0],@T[1],@T[1] # ROL64(C[0..0],1) + + vpxor $D00,$A20,$A20 # ^= D[0..0] + vpxor $D00,$A00,$A00 # ^= D[0..0] + + vpblendd \$0b11000000,@T[1],$D14,$D14 + vpblendd \$0b00000011,$C00,@T[4],@T[4] + vpxor @T[4],$D14,$D14 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3] + + ######################################### Rho + Pi + pre-Chi shuffle + vpsllvq 0*32-96(%r8),$A20,@T[3] + vpsrlvq 0*32-96(%r9),$A20,$A20 + vpor @T[3],$A20,$A20 + + vpxor $D14,$A31,$A31 # ^= D[1..4] from Theta + vpsllvq 2*32-96(%r8),$A31,@T[4] + vpsrlvq 2*32-96(%r9),$A31,$A31 + vpor @T[4],$A31,$A31 + + vpxor $D14,$A21,$A21 # ^= D[1..4] from Theta + vpsllvq 3*32-96(%r8),$A21,@T[5] + vpsrlvq 3*32-96(%r9),$A21,$A21 + vpor @T[5],$A21,$A21 + + vpxor $D14,$A41,$A41 # ^= D[1..4] from Theta + vpsllvq 4*32-96(%r8),$A41,@T[6] + vpsrlvq 4*32-96(%r9),$A41,$A41 + vpor @T[6],$A41,$A41 + + vpxor $D14,$A11,$A11 # ^= D[1..4] from Theta + 
vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31 + vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21 + vpsllvq 5*32-96(%r8),$A11,@T[7] + vpsrlvq 5*32-96(%r9),$A11,@T[1] + vpor @T[7],@T[1],@T[1] # $A11 -> future $A01 + + vpxor $D14,$A01,$A01 # ^= D[1..4] from Theta + vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41 + vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11 + vpsllvq 1*32-96(%r8),$A01,@T[8] + vpsrlvq 1*32-96(%r9),$A01,@T[2] + vpor @T[8],@T[2],@T[2] # $A01 -> future $A20 + + ######################################### Chi + vpsrldq \$8,@T[1],@T[7] + vpandn @T[7],@T[1],@T[0] # tgting [0][0] [0][0] [0][0] [0][0] + + vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0] + vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1] + vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4] + vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0] + vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0] + vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1] + vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4] + vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0] + vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0] + vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1] + vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4] + vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0] + vpandn @T[8],$A31,$A31 # tgting [3][1] [1][2] [4][3] [2][4] + vpandn @T[7],$A41,$A41 # tgting [3][2] [1][4] [4][1] [2][3] + + vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3] + vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4] + vpxor @T[3],$A31,$A31 + vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3] + vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4] + vpxor @T[5],$A41,$A41 + vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3] + vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4] + vpandn @T[8],$A11,$A11 # tgting [3][3] [1][1] [4][4] [2][2] + vpxor @T[6],$A11,$A11 + + vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3] + vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3] + vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2] + vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2] + vpandn @T[8],$A01,$A01 # tgting [0][4] [0][3] [0][2] [0][1] + + vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1] + vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2] + vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1] + vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2] + vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1] + vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2] + vpandn @T[7],$A20,$A20 # tgting [3][0] [1][0] [4][0] [2][0] + vpxor @T[2],$A20,$A20 + + vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0] + vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle + vpermq \$0b10001101,$A41,$A41 + vpermq \$0b01110010,$A11,$A11 + + vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2] + vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3] + vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2] + vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3] + vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2] + vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3] + vpandn @T[7],$A21,$A21 # tgting [3][4] [1][3] [4][2] [2][1] + + vpxor @T[0],$A00,$A00 + vpxor @T[1],$A01,$A01 + vpxor 
@T[4],$A21,$A21 + + ######################################### Iota + vpxor (%r10),$A00,$A00 + lea 32(%r10),%r10 + + dec %eax + jnz .Loop_avx2 + + ret +.size __KeccakF1600,.-__KeccakF1600 +___ +my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx"); +my $out = $inp; # in squeeze + +$code.=<<___; +.globl SHA3_absorb +.type SHA3_absorb,\@function +.align 32 +SHA3_absorb: + mov %rsp,%r11 + + lea -240(%rsp),%rsp + and \$-32,%rsp + + lea 96($A_flat),$A_flat + lea 96($inp),$inp + lea 96(%rsp),%r10 + + vzeroupper + + vpbroadcastq -96($A_flat),$A00 # load A[5][5] + vmovdqu 8+32*0-96($A_flat),$A01 + vmovdqu 8+32*1-96($A_flat),$A20 + vmovdqu 8+32*2-96($A_flat),$A31 + vmovdqu 8+32*3-96($A_flat),$A21 + vmovdqu 8+32*4-96($A_flat),$A41 + vmovdqu 8+32*5-96($A_flat),$A11 + + vpxor @T[0],@T[0],@T[0] + vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack + vmovdqa @T[0],32*3-96(%r10) + vmovdqa @T[0],32*4-96(%r10) + vmovdqa @T[0],32*5-96(%r10) + vmovdqa @T[0],32*6-96(%r10) + +.Loop_absorb_avx2: + mov $bsz,%rax + sub $bsz,$len + jc .Ldone_absorb_avx2 + + shr \$3,%eax + vpbroadcastq 0-96($inp),@T[0] + vmovdqu 8-96($inp),@T[1] + sub \$4,%eax +___ +for(my $i=5; $i<25; $i++) { +$code.=<<___ + dec %eax + jz .Labsorved_avx2 + mov 8*$i-96($inp),%r8 + mov %r8,$A_jagged[$i]-96(%r10) +___ +} +$code.=<<___; +.Labsorved_avx2: + lea ($inp,$bsz),$inp + + vpxor @T[0],$A00,$A00 + vpxor @T[1],$A01,$A01 + vpxor 32*2-96(%r10),$A20,$A20 + vpxor 32*3-96(%r10),$A31,$A31 + vpxor 32*4-96(%r10),$A21,$A21 + vpxor 32*5-96(%r10),$A41,$A41 + vpxor 32*6-96(%r10),$A11,$A11 + + call __KeccakF1600 + + lea 96(%rsp),%r10 + jmp .Loop_absorb_avx2 + +.Ldone_absorb_avx2: + vmovq %xmm0,-96($A_flat) + vmovdqu $A01,8+32*0-96($A_flat) + vmovdqu $A20,8+32*1-96($A_flat) + vmovdqu $A31,8+32*2-96($A_flat) + vmovdqu $A21,8+32*3-96($A_flat) + vmovdqu $A41,8+32*4-96($A_flat) + vmovdqu $A11,8+32*5-96($A_flat) + + vzeroupper + + lea (%r11),%rsp + lea ($len,$bsz),%rax # return value + ret +.size SHA3_absorb,.-SHA3_absorb + +.globl SHA3_squeeze +.type SHA3_squeeze,\@function +.align 32 +SHA3_squeeze: + mov %rsp,%r11 + + lea 96($A_flat),$A_flat + shr \$3,$bsz + + vzeroupper + + vpbroadcastq -96($A_flat),$A00 + vpxor @T[0],@T[0],@T[0] + vmovdqu 8+32*0-96($A_flat),$A01 + vmovdqu 8+32*1-96($A_flat),$A20 + vmovdqu 8+32*2-96($A_flat),$A31 + vmovdqu 8+32*3-96($A_flat),$A21 + vmovdqu 8+32*4-96($A_flat),$A41 + vmovdqu 8+32*5-96($A_flat),$A11 + + mov $bsz,%rax + +.Loop_squeeze_avx2: + mov @A_jagged[$i]-96($A_flat),%r8 +___ +for (my $i=0; $i<25; $i++) { +$code.=<<___; + sub \$8,$len + jc .Ltail_squeeze_avx2 + mov %r8,($out) + lea 8($out),$out + je .Ldone_squeeze_avx2 + dec %eax + je .Lextend_output_avx2 + mov @A_jagged[$i+1]-120($A_flat),%r8 +___ +} +$code.=<<___; +.Lextend_output_avx2: + call __KeccakF1600 + + vmovq %xmm0,-96($A_flat) + vmovdqu $A01,8+32*0-96($A_flat) + vmovdqu $A20,8+32*1-96($A_flat) + vmovdqu $A31,8+32*2-96($A_flat) + vmovdqu $A21,8+32*3-96($A_flat) + vmovdqu $A41,8+32*4-96($A_flat) + vmovdqu $A11,8+32*5-96($A_flat) + + mov $bsz,%rax + jmp .Loop_squeeze_avx2 + + +.Ltail_squeeze_avx2: + add \$8,$len +.Loop_tail_avx2: + mov %r8b,($out) + lea 1($out),$out + shr \$8,%r8 + dec $len + jnz .Loop_tail_avx2 + +.Ldone_squeeze_avx2: + vzeroupper + + lea (%r11),%rsp + ret +.size SHA3_squeeze,.-SHA3_squeeze + +.align 64 +rhotates_left: + .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0] + .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4] + .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4] + .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4] + .quad 2, 15, 
25, 20 # [4][1] [3][2] [2][3] [1][4] + .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4] +rhotates_right: + .quad 64-3, 64-18, 64-36, 64-41 + .quad 64-1, 64-62, 64-28, 64-27 + .quad 64-45, 64-6, 64-56, 64-39 + .quad 64-10, 64-61, 64-55, 64-8 + .quad 64-2, 64-15, 64-25, 64-20 + .quad 64-44, 64-43, 64-21, 64-14 +iotas: + .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001 + .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082 + .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a + .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000 + .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b + .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009 + .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a + .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088 + .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009 + .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a + .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b + .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b + .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089 + .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003 + .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002 + .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080 + .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a + .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080 + .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 + +.asciz "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$output=pop; +open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/crypto/sha/asm/keccak1600-avx512.pl b/crypto/sha/asm/keccak1600-avx512.pl new file mode 100755 index 000000000000..9074ff02dec3 --- /dev/null +++ b/crypto/sha/asm/keccak1600-avx512.pl @@ -0,0 +1,551 @@ +#!/usr/bin/env perl +# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. 
+# ====================================================================
+#
+# Keccak-1600 for AVX-512F.
+#
+# July 2017.
+#
+# The code below is a KECCAK_1X_ALT implementation (see
+# sha/keccak1600.c). Pretty straightforward, the only "magic" is the
+# data layout in registers. It's impossible to have one that is optimal
+# for every step, hence it changes as the algorithm progresses. Data is
+# saved in linear order, but the in-register order morphs between
+# rounds. Even rounds take input in the linear layout, and odd rounds
+# in the transposed, or "vertically-shaped", one...
+#
+########################################################################
+# Numbers are cycles per processed byte out of a large message.
+#
+#			r=1088(*)
+#
+# Knights Landing	7.6
+# Skylake-X		5.7
+#
+# (*)	Corresponds to SHA3-256.
+
+########################################################################
+# The code below is a combination of two ideas. One is taken from the
+# Keccak Code Package, hereafter KCP, and the other from the initial
+# version of this module. What they have in common is the observation
+# that Pi's input and output are "mostly transposed", i.e. if the input
+# is aligned by x coordinate, then the output is [mostly] aligned by y.
+# Both versions, KCP and the predecessor, were trying to use one of
+# them from round to round, which resulted in some kind of
+# transposition in each round. This version still does transpose data,
+# but only every second round. Another essential factor is that the KCP
+# transposition has to be performed with instructions that turned out
+# to be rather expensive on Knights Landing, both latency- and
+# throughput-wise. Not to mention that some of them have to depend on
+# each other. On the other hand, the initial version of this module
+# relied heavily on blend instructions. There were lots of them,
+# resulting in a higher instruction count, yet it performed better on
+# Knights Landing, because the processor can execute a pair of them
+# each cycle and they have minimal latency. This module is an attempt
+# to bring the best parts together:-)
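+#
+# The rounds below lean on vpternlogq, whose immediate is the truth
+# table of an arbitrary three-input bit function. A scalar C model of
+# the two tables this module uses (helper names are illustrative):
+#
+#	#include <stdint.h>
+#
+#	/* 0x96: a ^ b ^ c -- the three-way XOR used in Theta */
+#	static uint64_t ternlog_96(uint64_t a, uint64_t b, uint64_t c)
+#	{ return a ^ b ^ c; }
+#
+#	/* 0xD2: a ^ (~b & c) -- the Chi non-linearity, so a whole
+#	   group of lanes costs one instruction */
+#	static uint64_t ternlog_d2(uint64_t a, uint64_t b, uint64_t c)
+#	{ return a ^ (~b & c); }
+#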
+# Coordinates below correspond to those in sha/keccak1600.c. The input
+# layout is straight linear:
+#
+# [0][4] [0][3] [0][2] [0][1] [0][0]
+# [1][4] [1][3] [1][2] [1][1] [1][0]
+# [2][4] [2][3] [2][2] [2][1] [2][0]
+# [3][4] [3][3] [3][2] [3][1] [3][0]
+# [4][4] [4][3] [4][2] [4][1] [4][0]
+#
+# It's perfect for Theta, while Pi is reduced to intra-register
+# permutations which yield a layout perfect for Chi:
+#
+# [4][0] [3][0] [2][0] [1][0] [0][0]
+# [4][1] [3][1] [2][1] [1][1] [0][1]
+# [4][2] [3][2] [2][2] [1][2] [0][2]
+# [4][3] [3][3] [2][3] [1][3] [0][3]
+# [4][4] [3][4] [2][4] [1][4] [0][4]
+#
+# Now, instead of performing a full transposition and feeding it to the
+# next identical round, we perform a kind of diagonal transposition to
+# the layout from the initial version of this module, and make it
+# suitable for Theta:
+#
+# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
+# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
+# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
+# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
+# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
+#
+# Now intra-register permutations yield the initial [almost] straight
+# linear layout:
+#
+# [4][4] [3][3] [2][2] [1][1] [0][0]
+##[0][4] [0][3] [0][2] [0][1] [0][0]
+# [3][4] [2][3] [1][2] [0][1] [4][0]
+##[2][3] [2][2] [2][1] [2][0] [2][4]
+# [2][4] [1][3] [0][2] [4][1] [3][0]
+##[4][2] [4][1] [4][0] [4][4] [4][3]
+# [1][4] [0][3] [4][2] [3][1] [2][0]
+##[1][1] [1][0] [1][4] [1][3] [1][2]
+# [0][4] [4][3] [3][2] [2][1] [1][0]
+##[3][0] [3][4] [3][3] [3][2] [3][1]
+#
+# This means that the odd-round Chi is performed in a less suitable
+# layout, with a number of additional permutations. But overall it
+# turned out to be a win. The permutations are the fastest possible on
+# Knights Landing and they are laid down to be independent of each
+# other. In essence, I traded 20 blend instructions for 3 permutations.
+# The result is 13% faster than KCP on Skylake-X, and >40% on Knights
+# Landing.
+#
+# As implied, data is loaded in straight linear order. Digits in
+# variables' names represent the coordinates of the right-most element
+# of the loaded data chunk:
+
+my ($A00,	# [0][4] [0][3] [0][2] [0][1] [0][0]
+    $A10,	# [1][4] [1][3] [1][2] [1][1] [1][0]
+    $A20,	# [2][4] [2][3] [2][2] [2][1] [2][0]
+    $A30,	# [3][4] [3][3] [3][2] [3][1] [3][0]
+    $A40) =	# [4][4] [4][3] [4][2] [4][1] [4][0]
+    map("%zmm$_",(0..4));
+
+# We also need to map the magic order into offsets within structure:
+
+my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
+		[1,0], [1,1], [1,2], [1,3], [1,4],
+		[2,0], [2,1], [2,2], [2,3], [2,4],
+		[3,0], [3,1], [3,2], [3,3], [3,4],
+		[4,0], [4,1], [4,2], [4,3], [4,4]);
+   @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ...
and now linear + +my @T = map("%zmm$_",(5..12)); +my @Theta = map("%zmm$_",(33,13..16)); # invalid @Theta[0] is not typo +my @Pi0 = map("%zmm$_",(17..21)); +my @Rhotate0 = map("%zmm$_",(22..26)); +my @Rhotate1 = map("%zmm$_",(27..31)); + +my ($C00,$D00) = @T[0..1]; +my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6)); + +$code.=<<___; +.text + +.type __KeccakF1600,\@function +.align 32 +__KeccakF1600: + lea iotas(%rip),%r10 + mov \$12,%eax + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ######################################### Theta, even round + vmovdqa64 $A00,@T[0] # put aside original A00 + vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00" + vpternlogq \$0x96,$A40,$A30,$A00 + + vprolq \$1,$A00,$D00 + vpermq $A00,@Theta[1],$A00 + vpermq $D00,@Theta[4],$D00 + + vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00 + vpternlogq \$0x96,$A00,$D00,$A10 + vpternlogq \$0x96,$A00,$D00,$A20 + vpternlogq \$0x96,$A00,$D00,$A30 + vpternlogq \$0x96,$A00,$D00,$A40 + + ######################################### Rho + vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00 + vprolvq @Rhotate0[1],$A10,$A10 + vprolvq @Rhotate0[2],$A20,$A20 + vprolvq @Rhotate0[3],$A30,$A30 + vprolvq @Rhotate0[4],$A40,$A40 + + ######################################### Pi + vpermq $A00,@Pi0[0],$A00 + vpermq $A10,@Pi0[1],$A10 + vpermq $A20,@Pi0[2],$A20 + vpermq $A30,@Pi0[3],$A30 + vpermq $A40,@Pi0[4],$A40 + + ######################################### Chi + vmovdqa64 $A00,@T[0] + vmovdqa64 $A10,@T[1] + vpternlogq \$0xD2,$A20,$A10,$A00 + vpternlogq \$0xD2,$A30,$A20,$A10 + vpternlogq \$0xD2,$A40,$A30,$A20 + vpternlogq \$0xD2,@T[0],$A40,$A30 + vpternlogq \$0xD2,@T[1],@T[0],$A40 + + ######################################### Iota + vpxorq (%r10),$A00,${A00}{$k00001} + lea 16(%r10),%r10 + + ######################################### Harmonize rounds + vpblendmq $A20,$A10,@{T[1]}{$k00010} + vpblendmq $A30,$A20,@{T[2]}{$k00010} + vpblendmq $A40,$A30,@{T[3]}{$k00010} + vpblendmq $A10,$A00,@{T[0]}{$k00010} + vpblendmq $A00,$A40,@{T[4]}{$k00010} + + vpblendmq $A30,@T[1],@{T[1]}{$k00100} + vpblendmq $A40,@T[2],@{T[2]}{$k00100} + vpblendmq $A20,@T[0],@{T[0]}{$k00100} + vpblendmq $A00,@T[3],@{T[3]}{$k00100} + vpblendmq $A10,@T[4],@{T[4]}{$k00100} + + vpblendmq $A40,@T[1],@{T[1]}{$k01000} + vpblendmq $A30,@T[0],@{T[0]}{$k01000} + vpblendmq $A00,@T[2],@{T[2]}{$k01000} + vpblendmq $A10,@T[3],@{T[3]}{$k01000} + vpblendmq $A20,@T[4],@{T[4]}{$k01000} + + vpblendmq $A40,@T[0],@{T[0]}{$k10000} + vpblendmq $A00,@T[1],@{T[1]}{$k10000} + vpblendmq $A10,@T[2],@{T[2]}{$k10000} + vpblendmq $A20,@T[3],@{T[3]}{$k10000} + vpblendmq $A30,@T[4],@{T[4]}{$k10000} + + #vpermq @T[0],@Theta[0],$A00 # doesn't actually change order + vpermq @T[1],@Theta[1],$A10 + vpermq @T[2],@Theta[2],$A20 + vpermq @T[3],@Theta[3],$A30 + vpermq @T[4],@Theta[4],$A40 + + ######################################### Theta, odd round + vmovdqa64 $T[0],$A00 # real A00 + vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias + vpternlogq \$0x96,$A40,$A30,$C00 + + vprolq \$1,$C00,$D00 + vpermq $C00,@Theta[1],$C00 + vpermq $D00,@Theta[4],$D00 + + vpternlogq \$0x96,$C00,$D00,$A00 + vpternlogq \$0x96,$C00,$D00,$A30 + vpternlogq \$0x96,$C00,$D00,$A10 + vpternlogq \$0x96,$C00,$D00,$A40 + vpternlogq \$0x96,$C00,$D00,$A20 + + ######################################### Rho + vprolvq @Rhotate1[0],$A00,$A00 + vprolvq @Rhotate1[3],$A30,@T[1] + vprolvq @Rhotate1[1],$A10,@T[2] + vprolvq @Rhotate1[4],$A40,@T[3] + vprolvq @Rhotate1[2],$A20,@T[4] + + vpermq $A00,@Theta[4],@T[5] 
+ vpermq $A00,@Theta[3],@T[6] + + ######################################### Iota + vpxorq -8(%r10),$A00,${A00}{$k00001} + + ######################################### Pi + vpermq @T[1],@Theta[2],$A10 + vpermq @T[2],@Theta[4],$A20 + vpermq @T[3],@Theta[1],$A30 + vpermq @T[4],@Theta[3],$A40 + + ######################################### Chi + vpternlogq \$0xD2,@T[6],@T[5],$A00 + + vpermq @T[1],@Theta[1],@T[7] + #vpermq @T[1],@Theta[0],@T[1] + vpternlogq \$0xD2,@T[1],@T[7],$A10 + + vpermq @T[2],@Theta[3],@T[0] + vpermq @T[2],@Theta[2],@T[2] + vpternlogq \$0xD2,@T[2],@T[0],$A20 + + #vpermq @T[3],@Theta[0],@T[3] + vpermq @T[3],@Theta[4],@T[1] + vpternlogq \$0xD2,@T[1],@T[3],$A30 + + vpermq @T[4],@Theta[2],@T[0] + vpermq @T[4],@Theta[1],@T[4] + vpternlogq \$0xD2,@T[4],@T[0],$A40 + + dec %eax + jnz .Loop_avx512 + + ret +.size __KeccakF1600,.-__KeccakF1600 +___ + +my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx"); +my $out = $inp; # in squeeze + +$code.=<<___; +.globl SHA3_absorb +.type SHA3_absorb,\@function +.align 32 +SHA3_absorb: + mov %rsp,%r11 + + lea -320(%rsp),%rsp + and \$-64,%rsp + + lea 96($A_flat),$A_flat + lea 96($inp),$inp + lea 128(%rsp),%r9 + + lea theta_perm(%rip),%r8 + + kxnorw $k11111,$k11111,$k11111 + kshiftrw \$15,$k11111,$k00001 + kshiftrw \$11,$k11111,$k11111 + kshiftlw \$1,$k00001,$k00010 + kshiftlw \$2,$k00001,$k00100 + kshiftlw \$3,$k00001,$k01000 + kshiftlw \$4,$k00001,$k10000 + + #vmovdqa64 64*0(%r8),@Theta[0] + vmovdqa64 64*1(%r8),@Theta[1] + vmovdqa64 64*2(%r8),@Theta[2] + vmovdqa64 64*3(%r8),@Theta[3] + vmovdqa64 64*4(%r8),@Theta[4] + + vmovdqa64 64*5(%r8),@Rhotate1[0] + vmovdqa64 64*6(%r8),@Rhotate1[1] + vmovdqa64 64*7(%r8),@Rhotate1[2] + vmovdqa64 64*8(%r8),@Rhotate1[3] + vmovdqa64 64*9(%r8),@Rhotate1[4] + + vmovdqa64 64*10(%r8),@Rhotate0[0] + vmovdqa64 64*11(%r8),@Rhotate0[1] + vmovdqa64 64*12(%r8),@Rhotate0[2] + vmovdqa64 64*13(%r8),@Rhotate0[3] + vmovdqa64 64*14(%r8),@Rhotate0[4] + + vmovdqa64 64*15(%r8),@Pi0[0] + vmovdqa64 64*16(%r8),@Pi0[1] + vmovdqa64 64*17(%r8),@Pi0[2] + vmovdqa64 64*18(%r8),@Pi0[3] + vmovdqa64 64*19(%r8),@Pi0[4] + + vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z} + vpxorq @T[0],@T[0],@T[0] + vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z} + vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z} + vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z} + vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z} + + vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack + vmovdqa64 @T[0],1*64-128(%r9) + vmovdqa64 @T[0],2*64-128(%r9) + vmovdqa64 @T[0],3*64-128(%r9) + vmovdqa64 @T[0],4*64-128(%r9) + jmp .Loop_absorb_avx512 + +.align 32 +.Loop_absorb_avx512: + mov $bsz,%rax + sub $bsz,$len + jc .Ldone_absorb_avx512 + + shr \$3,%eax +___ +for(my $i=0; $i<25; $i++) { +$code.=<<___ + mov 8*$i-96($inp),%r8 + mov %r8,$A_jagged[$i]-128(%r9) + dec %eax + jz .Labsorved_avx512 +___ +} +$code.=<<___; +.Labsorved_avx512: + lea ($inp,$bsz),$inp + + vpxorq 64*0-128(%r9),$A00,$A00 + vpxorq 64*1-128(%r9),$A10,$A10 + vpxorq 64*2-128(%r9),$A20,$A20 + vpxorq 64*3-128(%r9),$A30,$A30 + vpxorq 64*4-128(%r9),$A40,$A40 + + call __KeccakF1600 + + jmp .Loop_absorb_avx512 + +.align 32 +.Ldone_absorb_avx512: + vmovdqu64 $A00,40*0-96($A_flat){$k11111} + vmovdqu64 $A10,40*1-96($A_flat){$k11111} + vmovdqu64 $A20,40*2-96($A_flat){$k11111} + vmovdqu64 $A30,40*3-96($A_flat){$k11111} + vmovdqu64 $A40,40*4-96($A_flat){$k11111} + + vzeroupper + + lea (%r11),%rsp + lea ($len,$bsz),%rax # return value + ret +.size SHA3_absorb,.-SHA3_absorb + +.globl SHA3_squeeze +.type SHA3_squeeze,\@function +.align 32 
+SHA3_squeeze: + mov %rsp,%r11 + + lea 96($A_flat),$A_flat + cmp $bsz,$len + jbe .Lno_output_extension_avx512 + + lea theta_perm(%rip),%r8 + + kxnorw $k11111,$k11111,$k11111 + kshiftrw \$15,$k11111,$k00001 + kshiftrw \$11,$k11111,$k11111 + kshiftlw \$1,$k00001,$k00010 + kshiftlw \$2,$k00001,$k00100 + kshiftlw \$3,$k00001,$k01000 + kshiftlw \$4,$k00001,$k10000 + + #vmovdqa64 64*0(%r8),@Theta[0] + vmovdqa64 64*1(%r8),@Theta[1] + vmovdqa64 64*2(%r8),@Theta[2] + vmovdqa64 64*3(%r8),@Theta[3] + vmovdqa64 64*4(%r8),@Theta[4] + + vmovdqa64 64*5(%r8),@Rhotate1[0] + vmovdqa64 64*6(%r8),@Rhotate1[1] + vmovdqa64 64*7(%r8),@Rhotate1[2] + vmovdqa64 64*8(%r8),@Rhotate1[3] + vmovdqa64 64*9(%r8),@Rhotate1[4] + + vmovdqa64 64*10(%r8),@Rhotate0[0] + vmovdqa64 64*11(%r8),@Rhotate0[1] + vmovdqa64 64*12(%r8),@Rhotate0[2] + vmovdqa64 64*13(%r8),@Rhotate0[3] + vmovdqa64 64*14(%r8),@Rhotate0[4] + + vmovdqa64 64*15(%r8),@Pi0[0] + vmovdqa64 64*16(%r8),@Pi0[1] + vmovdqa64 64*17(%r8),@Pi0[2] + vmovdqa64 64*18(%r8),@Pi0[3] + vmovdqa64 64*19(%r8),@Pi0[4] + + vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z} + vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z} + vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z} + vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z} + vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z} + +.Lno_output_extension_avx512: + shr \$3,$bsz + lea -96($A_flat),%r9 + mov $bsz,%rax + jmp .Loop_squeeze_avx512 + +.align 32 +.Loop_squeeze_avx512: + cmp \$8,$len + jb .Ltail_squeeze_avx512 + + mov (%r9),%r8 + lea 8(%r9),%r9 + mov %r8,($out) + lea 8($out),$out + sub \$8,$len # len -= 8 + jz .Ldone_squeeze_avx512 + + sub \$1,%rax # bsz-- + jnz .Loop_squeeze_avx512 + + #vpermq @Theta[4],@Theta[4],@Theta[3] + #vpermq @Theta[3],@Theta[4],@Theta[2] + #vpermq @Theta[3],@Theta[3],@Theta[1] + + call __KeccakF1600 + + vmovdqu64 $A00,40*0-96($A_flat){$k11111} + vmovdqu64 $A10,40*1-96($A_flat){$k11111} + vmovdqu64 $A20,40*2-96($A_flat){$k11111} + vmovdqu64 $A30,40*3-96($A_flat){$k11111} + vmovdqu64 $A40,40*4-96($A_flat){$k11111} + + lea -96($A_flat),%r9 + mov $bsz,%rax + jmp .Loop_squeeze_avx512 + +.Ltail_squeeze_avx512: + mov $out,%rdi + mov %r9,%rsi + mov $len,%rcx + .byte 0xf3,0xa4 # rep movsb + +.Ldone_squeeze_avx512: + vzeroupper + + lea (%r11),%rsp + ret +.size SHA3_squeeze,.-SHA3_squeeze + +.align 64 +theta_perm: + .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used] + .quad 4, 0, 1, 2, 3, 5, 6, 7 + .quad 3, 4, 0, 1, 2, 5, 6, 7 + .quad 2, 3, 4, 0, 1, 5, 6, 7 + .quad 1, 2, 3, 4, 0, 5, 6, 7 + +rhotates1: + .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] + .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] + .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] + .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] + .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] + +rhotates0: + .quad 0, 1, 62, 28, 27, 0, 0, 0 + .quad 36, 44, 6, 55, 20, 0, 0, 0 + .quad 3, 10, 43, 25, 39, 0, 0, 0 + .quad 41, 45, 15, 21, 8, 0, 0, 0 + .quad 18, 2, 61, 56, 14, 0, 0, 0 + +pi0_perm: + .quad 0, 3, 1, 4, 2, 5, 6, 7 + .quad 1, 4, 2, 0, 3, 5, 6, 7 + .quad 2, 0, 3, 1, 4, 5, 6, 7 + .quad 3, 1, 4, 2, 0, 5, 6, 7 + .quad 4, 2, 0, 3, 1, 5, 6, 7 + + +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 
0x000000008000808b
+	.quad	0x800000000000008b
+	.quad	0x8000000000008089
+	.quad	0x8000000000008003
+	.quad	0x8000000000008002
+	.quad	0x8000000000000080
+	.quad	0x000000000000800a
+	.quad	0x800000008000000a
+	.quad	0x8000000080008081
+	.quad	0x8000000000008080
+	.quad	0x0000000080000001
+	.quad	0x8000000080008008
+
+.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=pop;
+open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/keccak1600-avx512vl.pl b/crypto/sha/asm/keccak1600-avx512vl.pl
new file mode 100755
index 000000000000..a21bb8615a7c
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-avx512vl.pl
@@ -0,0 +1,392 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for AVX512VL.
+#
+# December 2017.
+#
+# This is an adaptation of the AVX2 module that reuses the register
+# data layout but utilizes the new 256-bit AVX512VL instructions. See
+# the AVX2 module for further information on the layout.
+#
+########################################################################
+# Numbers are cycles per processed byte out of a large message.
+#
+#			r=1088(*)
+#
+# Skylake-X		6.4/+47%
+#
+# (*)	Corresponds to SHA3-256. The percentage after the slash is the
+#	improvement coefficient in comparison to the scalar
+#	keccak1600-x86_64.pl.
+
+# Digits in variables' names denote right-most coordinates:
+
+my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
+    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
+    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
+    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
+    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
+    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
+    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
+    map("%ymm$_",(0..6));
+
+# We also need to map the magic order into offsets within structure:
+
+my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
+		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
+		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
+		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
+		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
+   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ...
and now linear + +my @T = map("%ymm$_",(7..15)); +my ($C14,$C00,$D00,$D14) = @T[5..8]; +my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21)); + +$code.=<<___; +.text + +.type __KeccakF1600,\@function +.align 32 +__KeccakF1600: + lea iotas(%rip),%r10 + mov \$24,%eax + jmp .Loop_avx512vl + +.align 32 +.Loop_avx512vl: + ######################################### Theta + vpshufd \$0b01001110,$A20,$C00 + vpxor $A31,$A41,$C14 + vpxor $A11,$A21,@T[2] + vpternlogq \$0x96,$A01,$T[2],$C14 # C[1..4] + + vpxor $A20,$C00,$C00 + vpermq \$0b01001110,$C00,@T[0] + + vpermq \$0b10010011,$C14,@T[4] + vprolq \$1,$C14,@T[1] # ROL64(C[1..4],1) + + vpermq \$0b00111001,@T[1],$D14 + vpxor @T[4],@T[1],$D00 + vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4] + + vpternlogq \$0x96,@T[0],$A00,$C00 # C[0..0] + vprolq \$1,$C00,@T[1] # ROL64(C[0..0],1) + + vpxor $D00,$A00,$A00 # ^= D[0..0] + + vpblendd \$0b11000000,@T[1],$D14,$D14 + vpblendd \$0b00000011,$C00,@T[4],@T[0] + + ######################################### Rho + Pi + pre-Chi shuffle + vpxor $D00,$A20,$A20 # ^= D[0..0] from Theta + vprolvq $R20,$A20,$A20 + + vpternlogq \$0x96,@T[0],$D14,$A31 # ^= D[1..4] from Theta + vprolvq $R31,$A31,$A31 + + vpternlogq \$0x96,@T[0],$D14,$A21 # ^= D[1..4] from Theta + vprolvq $R21,$A21,$A21 + + vpternlogq \$0x96,@T[0],$D14,$A41 # ^= D[1..4] from Theta + vprolvq $R41,$A41,$A41 + + vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31 + vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21 + vpternlogq \$0x96,@T[0],$D14,$A11 # ^= D[1..4] from Theta + vprolvq $R11,$A11,@T[1] # $A11 -> future $A01 + + vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41 + vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11 + vpternlogq \$0x96,@T[0],$D14,$A01 # ^= D[1..4] from Theta + vprolvq $R01,$A01,@T[2] # $A01 -> future $A20 + + ######################################### Chi + vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0] + vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1] + vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4] + vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0] + vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0] + vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1] + vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4] + vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0] + vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0] + vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1] + vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4] + vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0] + vpternlogq \$0xC6,@T[8],@T[3],$A31 # [3][1] [1][2] [4][3] [2][4] + vpternlogq \$0xC6,@T[7],@T[5],$A41 # [3][2] [1][4] [4][1] [2][3] + + vpsrldq \$8,@T[1],@T[0] + vpandn @T[0],@T[1],@T[0] # tgting [0][0] [0][0] [0][0] [0][0] + + vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3] + vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4] + vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3] + vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4] + vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3] + vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4] + vpternlogq \$0xC6,@T[8],@T[6],$A11 # [3][3] [1][1] [4][4] [2][2] + + vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3] + vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3] + vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2] + vpblendd 
\$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2] + + vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1] + vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2] + vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1] + vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2] + vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1] + vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2] + vpternlogq \$0xC6,@T[7],@T[2],$A20 # [3][0] [1][0] [4][0] [2][0] + + vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0] + vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle + vpermq \$0b10001101,$A41,$A41 + vpermq \$0b01110010,$A11,$A11 + + vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2] + vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3] + vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2] + vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3] + vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2] + vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3] + + vpternlogq \$0xC6,@T[8],@T[1],$A01 # [0][4] [0][3] [0][2] [0][1] + vpternlogq \$0xC6,@T[7],@T[4],$A21 # [3][4] [1][3] [4][2] [2][1] + + ######################################### Iota + vpternlogq \$0x96,(%r10),@T[0],$A00 + lea 32(%r10),%r10 + + dec %eax + jnz .Loop_avx512vl + + ret +.size __KeccakF1600,.-__KeccakF1600 +___ +my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx"); +my $out = $inp; # in squeeze + +$code.=<<___; +.globl SHA3_absorb +.type SHA3_absorb,\@function +.align 32 +SHA3_absorb: + mov %rsp,%r11 + + lea -240(%rsp),%rsp + and \$-32,%rsp + + lea 96($A_flat),$A_flat + lea 96($inp),$inp + lea 96(%rsp),%r10 + lea rhotates_left(%rip),%r8 + + vzeroupper + + vpbroadcastq -96($A_flat),$A00 # load A[5][5] + vmovdqu 8+32*0-96($A_flat),$A01 + vmovdqu 8+32*1-96($A_flat),$A20 + vmovdqu 8+32*2-96($A_flat),$A31 + vmovdqu 8+32*3-96($A_flat),$A21 + vmovdqu 8+32*4-96($A_flat),$A41 + vmovdqu 8+32*5-96($A_flat),$A11 + + vmovdqa64 0*32(%r8),$R20 # load "rhotate" indices + vmovdqa64 1*32(%r8),$R01 + vmovdqa64 2*32(%r8),$R31 + vmovdqa64 3*32(%r8),$R21 + vmovdqa64 4*32(%r8),$R41 + vmovdqa64 5*32(%r8),$R11 + + vpxor @T[0],@T[0],@T[0] + vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack + vmovdqa @T[0],32*3-96(%r10) + vmovdqa @T[0],32*4-96(%r10) + vmovdqa @T[0],32*5-96(%r10) + vmovdqa @T[0],32*6-96(%r10) + +.Loop_absorb_avx512vl: + mov $bsz,%rax + sub $bsz,$len + jc .Ldone_absorb_avx512vl + + shr \$3,%eax + vpbroadcastq 0-96($inp),@T[0] + vmovdqu 8-96($inp),@T[1] + sub \$4,%eax +___ +for(my $i=5; $i<25; $i++) { +$code.=<<___ + dec %eax + jz .Labsorved_avx512vl + mov 8*$i-96($inp),%r8 + mov %r8,$A_jagged[$i]-96(%r10) +___ +} +$code.=<<___; +.Labsorved_avx512vl: + lea ($inp,$bsz),$inp + + vpxor @T[0],$A00,$A00 + vpxor @T[1],$A01,$A01 + vpxor 32*2-96(%r10),$A20,$A20 + vpxor 32*3-96(%r10),$A31,$A31 + vpxor 32*4-96(%r10),$A21,$A21 + vpxor 32*5-96(%r10),$A41,$A41 + vpxor 32*6-96(%r10),$A11,$A11 + + call __KeccakF1600 + + lea 96(%rsp),%r10 + jmp .Loop_absorb_avx512vl + +.Ldone_absorb_avx512vl: + vmovq %xmm0,-96($A_flat) + vmovdqu $A01,8+32*0-96($A_flat) + vmovdqu $A20,8+32*1-96($A_flat) + vmovdqu $A31,8+32*2-96($A_flat) + vmovdqu $A21,8+32*3-96($A_flat) + vmovdqu $A41,8+32*4-96($A_flat) + vmovdqu $A11,8+32*5-96($A_flat) + + vzeroupper + + lea (%r11),%rsp + lea ($len,$bsz),%rax # return value + ret +.size SHA3_absorb,.-SHA3_absorb + +.globl SHA3_squeeze +.type SHA3_squeeze,\@function +.align 32 +SHA3_squeeze: + mov %rsp,%r11 + + 
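+	# Squeeze: the state is reloaded into %ymm registers once below,
+	# lanes are then copied out through %r8 in @A_jagged order eight
+	# bytes at a time, __KeccakF1600 is re-invoked whenever a full
+	# block of $bsz bytes has been emitted, and a trailing partial
+	# lane is stored byte by byte in .Loop_tail_avx512vl.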
lea 96($A_flat),$A_flat + lea rhotates_left(%rip),%r8 + shr \$3,$bsz + + vzeroupper + + vpbroadcastq -96($A_flat),$A00 + vpxor @T[0],@T[0],@T[0] + vmovdqu 8+32*0-96($A_flat),$A01 + vmovdqu 8+32*1-96($A_flat),$A20 + vmovdqu 8+32*2-96($A_flat),$A31 + vmovdqu 8+32*3-96($A_flat),$A21 + vmovdqu 8+32*4-96($A_flat),$A41 + vmovdqu 8+32*5-96($A_flat),$A11 + + vmovdqa64 0*32(%r8),$R20 # load "rhotate" indices + vmovdqa64 1*32(%r8),$R01 + vmovdqa64 2*32(%r8),$R31 + vmovdqa64 3*32(%r8),$R21 + vmovdqa64 4*32(%r8),$R41 + vmovdqa64 5*32(%r8),$R11 + + mov $bsz,%rax + +.Loop_squeeze_avx512vl: + mov @A_jagged[$i]-96($A_flat),%r8 +___ +for (my $i=0; $i<25; $i++) { +$code.=<<___; + sub \$8,$len + jc .Ltail_squeeze_avx512vl + mov %r8,($out) + lea 8($out),$out + je .Ldone_squeeze_avx512vl + dec %eax + je .Lextend_output_avx512vl + mov @A_jagged[$i+1]-120($A_flat),%r8 +___ +} +$code.=<<___; +.Lextend_output_avx512vl: + call __KeccakF1600 + + vmovq %xmm0,-96($A_flat) + vmovdqu $A01,8+32*0-96($A_flat) + vmovdqu $A20,8+32*1-96($A_flat) + vmovdqu $A31,8+32*2-96($A_flat) + vmovdqu $A21,8+32*3-96($A_flat) + vmovdqu $A41,8+32*4-96($A_flat) + vmovdqu $A11,8+32*5-96($A_flat) + + mov $bsz,%rax + jmp .Loop_squeeze_avx512vl + + +.Ltail_squeeze_avx512vl: + add \$8,$len +.Loop_tail_avx512vl: + mov %r8b,($out) + lea 1($out),$out + shr \$8,%r8 + dec $len + jnz .Loop_tail_avx512vl + +.Ldone_squeeze_avx512vl: + vzeroupper + + lea (%r11),%rsp + ret +.size SHA3_squeeze,.-SHA3_squeeze + +.align 64 +rhotates_left: + .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0] + .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4] + .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4] + .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4] + .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4] + .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4] +iotas: + .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001 + .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082 + .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a + .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000 + .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b + .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009 + .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a + .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088 + .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009 + .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a + .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b + .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b + .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089 + .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003 + .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002 + .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080 + .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a + .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 
0x800000008000000a + .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 + .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080 + .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 + .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 + +.asciz "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$output=pop; +open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/crypto/sha/asm/keccak1600-c64x.pl b/crypto/sha/asm/keccak1600-c64x.pl new file mode 100755 index 000000000000..b00af9af91d8 --- /dev/null +++ b/crypto/sha/asm/keccak1600-c64x.pl @@ -0,0 +1,885 @@ +#!/usr/bin/env perl +# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# [ABI- and endian-neutral] Keccak-1600 for C64x. +# +# June 2017. +# +# This is straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c) +# with bit interleaving. 64-bit values are simply split between A- and +# B-files, with A-file holding least significant halves. This works +# out perfectly, because all operations including cross-communications +# [in rotate operations] are always complementary. Performance is +# [incredible for a 32-bit processor] 10.9 cycles per processed byte +# for r=1088, which corresponds to SHA3-256. This is >15x faster than +# compiler-generated KECCAK_1X_ALT code, and >10x than other variants. +# On average processor ends up issuing ~4.5 instructions per cycle... 
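+#
+# For illustration, a minimal C model of the bit interleaving (a
+# sketch, not the module's code, assuming the A-file half holds the
+# even-numbered bits -- the "least significant halves" mentioned
+# above): each 64-bit lane is split into a 32-bit word of even bits
+# and one of odd bits, so a 64-bit rotation becomes two 32-bit
+# rotations, with the halves swapped for odd amounts. That is exactly
+# what the ROL64 helper below emits as a pair of ROTL instructions.
+#
+#	#include <stdint.h>
+#
+#	static uint32_t rol32(uint32_t x, unsigned r)
+#	{   return r ? (x << r) | (x >> (32 - r)) : x;   }
+#
+#	static void interleave(uint64_t x, uint32_t *lo, uint32_t *hi)
+#	{
+#	    uint32_t l = 0, h = 0;
+#	    for (int i = 0; i < 32; i++) {
+#	        l |= (uint32_t)((x >> (2 * i)) & 1) << i;     /* even bits */
+#	        h |= (uint32_t)((x >> (2 * i + 1)) & 1) << i; /* odd bits  */
+#	    }
+#	    *lo = l; *hi = h;
+#	}
+#
+#	static void rol64_interleaved(uint32_t *lo, uint32_t *hi, unsigned r)
+#	{
+#	    uint32_t l = *lo, h = *hi;
+#	    if (r & 1) { *lo = rol32(h, (r + 1) / 2); *hi = rol32(l, r / 2); }
+#	    else       { *lo = rol32(l, r / 2);       *hi = rol32(h, r / 2); }
+#	}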
+ +my @A = map([ $_, ($_+1), ($_+2), ($_+3), ($_+4) ], (5,10,16,21,26)); + $A[1][4] = 31; # B14 is reserved, A14 is used as iota[] + ($A[3][0],$A[4][1]) = ($A[4][1],$A[3][0]); +my @C = (0..4,$A[3][0],$A[4][0]); +my $iotas = "A14"; + +my @rhotates = ([ 0, 1, 62, 28, 27 ], + [ 36, 44, 6, 55, 20 ], + [ 3, 10, 43, 25, 39 ], + [ 41, 45, 15, 21, 8 ], + [ 18, 2, 61, 56, 14 ]); + +sub ROL64 { + my ($src,$rot,$dst,$p) = @_; + + if ($rot&1) { +$code.=<<___; +$p ROTL B$src,$rot/2+1,A$dst +|| ROTL A$src,$rot/2, B$dst +___ + } else { +$code.=<<___; +$p ROTL A$src,$rot/2,A$dst +|| ROTL B$src,$rot/2,B$dst +___ + } +} + +######################################################################## +# Stack frame layout +# +# SP--->+------+------+ +# | | | +# +1--->+------+------+<- -9 below 4 slots are used by KeccakF1600_int +# | | | +# +2--->+------+------+<- -8 +# | | | +# +3--->+------+------+<- -7 +# | A2 | A3 | A3:A2 are preserved by KeccakF1600_int +# +4--->+------+------+<- -6 +# | B2 | B3 | B3:B2 are preserved by KeccakF1600_int +# +5--->+------+------+<- -5 below is ABI-compliant layout +# | A10 | A11 | +# +6--->+------+------+<- -4 +# | A12 | A13 | +# +7--->+------+------+<- -3 +# | A14 | B3 | +# +8--->+------+------+<- -2 +# | B10 | B11 | +# +9--->+------+------+<- -1 +# | B12 | B13 | +# +------+------+<---FP +# | A15 | +# +------+-- + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg KeccakF1600,_KeccakF1600 + .asg SHA3_absorb,_SHA3_absorb + .asg SHA3_squeeze,_SHA3_squeeze + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .align 32 +_KeccakF1600_int: + .asmfunc + STDW A3:A2,*FP[-7] +|| STDW B3:B2,*SP[4] +_KeccakF1600_cheat: + .if __TI_EABI__ + ADDKPC _KeccakF1600_int,B0 +|| MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas + MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas + .else + ADDKPC _KeccakF1600_int,B0 +|| MVKL (iotas-_KeccakF1600_int),$iotas + MVKH (iotas-_KeccakF1600_int),$iotas + .endif + ADD B0,$iotas,$iotas +loop?: + XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta +|| XOR B$A[0][2],B$A[1][2],B$C[2] +|| XOR A$A[0][3],A$A[1][3],A$C[3] +|| XOR B$A[0][3],B$A[1][3],B$C[3] +|| XOR A$A[0][0],A$A[1][0],A$C[0] +|| XOR B$A[0][0],B$A[1][0],B$C[0] + XOR A$A[2][2],A$C[2],A$C[2] +|| XOR B$A[2][2],B$C[2],B$C[2] +|| XOR A$A[2][3],A$C[3],A$C[3] +|| XOR B$A[2][3],B$C[3],B$C[3] +|| XOR A$A[2][0],A$C[0],A$C[0] +|| XOR B$A[2][0],B$C[0],B$C[0] + XOR A$A[3][2],A$C[2],A$C[2] +|| XOR B$A[3][2],B$C[2],B$C[2] +|| XOR A$A[3][3],A$C[3],A$C[3] +|| XOR B$A[3][3],B$C[3],B$C[3] +|| XOR A$A[3][0],A$C[0],A$C[0] +|| XOR B$A[3][0],B$C[0],B$C[0] + XOR A$A[4][2],A$C[2],A$C[2] +|| XOR B$A[4][2],B$C[2],B$C[2] +|| XOR A$A[4][3],A$C[3],A$C[3] +|| XOR B$A[4][3],B$C[3],B$C[3] +|| XOR A$A[4][0],A$C[0],A$C[0] +|| XOR B$A[4][0],B$C[0],B$C[0] + XOR A$A[0][4],A$A[1][4],A$C[4] +|| XOR B$A[0][4],B$A[1][4],B$C[4] +|| XOR A$A[0][1],A$A[1][1],A$C[1] +|| XOR B$A[0][1],B$A[1][1],B$C[1] +|| STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data + STDW B$A[3][0]:B$A[4][0],*SP[2] +|| XOR A$A[2][4],A$C[4],A$C[4] +|| XOR B$A[2][4],B$C[4],B$C[4] +|| XOR A$A[2][1],A$C[1],A$C[1] +|| XOR B$A[2][1],B$C[1],B$C[1] +|| ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1) +|| ROTL A$C[2],0,B$C[5] + XOR A$A[3][4],A$C[4],A$C[4] +|| XOR B$A[3][4],B$C[4],B$C[4] +|| XOR A$A[3][1],A$C[1],A$C[1] +|| XOR B$A[3][1],B$C[1],B$C[1] +|| ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1) +|| ROTL A$C[3],0,B$C[6] + XOR A$A[4][4],A$C[4],A$C[4] +|| XOR B$A[4][4],B$C[4],B$C[4] +|| XOR A$A[4][1],A$C[1],A$C[1] +|| XOR 
B$A[4][1],B$C[1],B$C[1] +|| XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1) +|| XOR B$C[0],B$C[5],B$C[5] + XOR A$C[5],A$A[0][1],A$A[0][1] +|| XOR B$C[5],B$A[0][1],B$A[0][1] +|| XOR A$C[5],A$A[1][1],A$A[1][1] +|| XOR B$C[5],B$A[1][1],B$A[1][1] +|| XOR A$C[5],A$A[2][1],A$A[2][1] +|| XOR B$C[5],B$A[2][1],B$A[2][1] + XOR A$C[5],A$A[3][1],A$A[3][1] +|| XOR B$C[5],B$A[3][1],B$A[3][1] +|| XOR A$C[5],A$A[4][1],A$A[4][1] +|| XOR B$C[5],B$A[4][1],B$A[4][1] +|| ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1) +|| ROTL A$C[4],0,B$C[5] +|| XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1) +|| XOR B$C[1],B$C[6],B$C[6] + XOR A$C[6],A$A[0][2],A$A[0][2] +|| XOR B$C[6],B$A[0][2],B$A[0][2] +|| XOR A$C[6],A$A[1][2],A$A[1][2] +|| XOR B$C[6],B$A[1][2],B$A[1][2] +|| XOR A$C[6],A$A[2][2],A$A[2][2] +|| XOR B$C[6],B$A[2][2],B$A[2][2] +|| ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1) +|| ROTL A$C[1],0,B$C[1] + XOR A$C[6],A$A[3][2],A$A[3][2] +|| XOR B$C[6],B$A[3][2],B$A[3][2] +|| XOR A$C[6],A$A[4][2],A$A[4][2] +|| XOR B$C[6],B$A[4][2],B$A[4][2] +|| ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1) +|| ROTL A$C[0],0,B$C[6] +|| XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1) +|| XOR B$C[5],B$C[2],B$C[2] + XOR A$C[2],A$A[0][3],A$A[0][3] +|| XOR B$C[2],B$A[0][3],B$A[0][3] +|| XOR A$C[2],A$A[1][3],A$A[1][3] +|| XOR B$C[2],B$A[1][3],B$A[1][3] +|| XOR A$C[2],A$A[2][3],A$A[2][3] +|| XOR B$C[2],B$A[2][3],B$A[2][3] + XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1) +|| XOR B$C[6],B$C[3],B$C[3] +|| LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data +|| LDDW *SP[2],B$A[3][0]:B$A[4][0] +|| XOR A$C[2],A$A[3][3],A$A[3][3] +|| XOR B$C[2],B$A[3][3],B$A[3][3] + XOR A$C[2],A$A[4][3],A$A[4][3] +|| XOR B$C[2],B$A[4][3],B$A[4][3] +|| XOR A$C[3],A$A[0][4],A$A[0][4] +|| XOR B$C[3],B$A[0][4],B$A[0][4] +|| XOR A$C[3],A$A[1][4],A$A[1][4] +|| XOR B$C[3],B$A[1][4],B$A[1][4] + XOR A$C[3],A$A[2][4],A$A[2][4] +|| XOR B$C[3],B$A[2][4],B$A[2][4] +|| XOR A$C[3],A$A[3][4],A$A[3][4] +|| XOR B$C[3],B$A[3][4],B$A[3][4] +|| XOR A$C[3],A$A[4][4],A$A[4][4] +|| XOR B$C[3],B$A[4][4],B$A[4][4] + XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1) +|| XOR B$C[1],B$C[4],B$C[4] +|| MV A$A[0][1],A$C[1] ; Rho+Pi, "early start" +|| MV B$A[0][1],B$C[1] +___ + &ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||"); +$code.=<<___; + XOR A$C[4],A$A[0][0],A$A[0][0] +|| XOR B$C[4],B$A[0][0],B$A[0][0] +|| XOR A$C[4],A$A[1][0],A$A[1][0] +|| XOR B$C[4],B$A[1][0],B$A[1][0] +|| MV A$A[0][3],A$C[3] +|| MV B$A[0][3],B$C[3] +___ + &ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||"); +$code.=<<___; + XOR A$C[4],A$A[2][0],A$A[2][0] +|| XOR B$C[4],B$A[2][0],B$A[2][0] +|| XOR A$C[4],A$A[3][0],A$A[3][0] +|| XOR B$C[4],B$A[3][0],B$A[3][0] +|| MV A$A[0][2],A$C[2] +|| MV B$A[0][2],B$C[2] +___ + &ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||"); +$code.=<<___; + XOR A$C[4],A$A[4][0],A$A[4][0] +|| XOR B$C[4],B$A[4][0],B$A[4][0] +|| MV A$A[0][4],A$C[4] +|| MV B$A[0][4],B$C[4] +___ + &ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||"); + + &ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]); +$code.=<<___; +|| LDW *${iotas}++[2],A$C[0] +___ + &ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]); +$code.=<<___; +|| LDW *${iotas}[-1],B$C[0] +___ + &ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]); + &ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]); + + &ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]); + &ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]); + &ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]); + &ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]); + + &ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]); + &ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]); + &ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]); + 
&ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]); + + &ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]); + &ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]); + &ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]); + &ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]); + + #&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below + &ROL64 ($C[1], $rhotates[0][1],$A[2][0]); + &ROL64 ($C[4], $rhotates[0][4],$A[3][0]); + &ROL64 ($C[2], $rhotates[0][2],$A[4][0]); +$code.=<<___; +|| ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota +|| ANDN B$A[0][2],B$A[0][1],B$C[4] +|| ANDN A$A[0][3],A$A[0][2],A$C[1] +|| ANDN B$A[0][3],B$A[0][2],B$C[1] +|| ANDN A$A[0][4],A$A[0][3],A$C[2] +|| ANDN B$A[0][4],B$A[0][3],B$C[2] +___ + &ROL64 ($C[3], $rhotates[0][3],$A[1][0]); +$code.=<<___; +|| ANDN A$A[0][0],A$A[0][4],A$C[3] +|| ANDN B$A[0][0],B$A[0][4],B$C[3] +|| XOR A$C[4],A$A[0][0],A$A[0][0] +|| XOR B$C[4],B$A[0][0],B$A[0][0] +|| ANDN A$A[0][1],A$A[0][0],A$C[4] +|| ANDN B$A[0][1],B$A[0][0],B$C[4] + XOR A$C[1],A$A[0][1],A$A[0][1] +|| XOR B$C[1],B$A[0][1],B$A[0][1] +|| XOR A$C[2],A$A[0][2],A$A[0][2] +|| XOR B$C[2],B$A[0][2],B$A[0][2] +|| XOR A$C[3],A$A[0][3],A$A[0][3] +|| XOR B$C[3],B$A[0][3],B$A[0][3] + XOR A$C[4],A$A[0][4],A$A[0][4] +|| XOR B$C[4],B$A[0][4],B$A[0][4] +|| XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++]; +|| XOR B$C[0],B$A[0][0],B$A[0][0] +|| EXTU $iotas,24,24,A0 ; A0 is A$C[0], as we done? + + ANDN A$A[1][2],A$A[1][1],A$C[4] +|| ANDN B$A[1][2],B$A[1][1],B$C[4] +|| ANDN A$A[1][3],A$A[1][2],A$C[1] +|| ANDN B$A[1][3],B$A[1][2],B$C[1] +|| ANDN A$A[1][4],A$A[1][3],A$C[2] +|| ANDN B$A[1][4],B$A[1][3],B$C[2] + ANDN A$A[1][0],A$A[1][4],A$C[3] +|| ANDN B$A[1][0],B$A[1][4],B$C[3] +|| XOR A$C[4],A$A[1][0],A$A[1][0] +|| XOR B$C[4],B$A[1][0],B$A[1][0] +|| ANDN A$A[1][1],A$A[1][0],A$C[4] +|| ANDN B$A[1][1],B$A[1][0],B$C[4] + XOR A$C[1],A$A[1][1],A$A[1][1] +|| XOR B$C[1],B$A[1][1],B$A[1][1] +|| XOR A$C[2],A$A[1][2],A$A[1][2] +|| XOR B$C[2],B$A[1][2],B$A[1][2] +|| XOR A$C[3],A$A[1][3],A$A[1][3] +|| XOR B$C[3],B$A[1][3],B$A[1][3] + XOR A$C[4],A$A[1][4],A$A[1][4] +|| XOR B$C[4],B$A[1][4],B$A[1][4] + +|| ANDN A$A[2][2],A$A[2][1],A$C[4] +|| ANDN B$A[2][2],B$A[2][1],B$C[4] +|| ANDN A$A[2][3],A$A[2][2],A$C[1] +|| ANDN B$A[2][3],B$A[2][2],B$C[1] + ANDN A$A[2][4],A$A[2][3],A$C[2] +|| ANDN B$A[2][4],B$A[2][3],B$C[2] +|| ANDN A$A[2][0],A$A[2][4],A$C[3] +|| ANDN B$A[2][0],B$A[2][4],B$C[3] +|| XOR A$C[4],A$A[2][0],A$A[2][0] +|| XOR B$C[4],B$A[2][0],B$A[2][0] + ANDN A$A[2][1],A$A[2][0],A$C[4] +|| ANDN B$A[2][1],B$A[2][0],B$C[4] +|| XOR A$C[1],A$A[2][1],A$A[2][1] +|| XOR B$C[1],B$A[2][1],B$A[2][1] +|| XOR A$C[2],A$A[2][2],A$A[2][2] +|| XOR B$C[2],B$A[2][2],B$A[2][2] + XOR A$C[3],A$A[2][3],A$A[2][3] +|| XOR B$C[3],B$A[2][3],B$A[2][3] +|| XOR A$C[4],A$A[2][4],A$A[2][4] +|| XOR B$C[4],B$A[2][4],B$A[2][4] + + ANDN A$A[3][2],A$A[3][1],A$C[4] +|| ANDN B$A[3][2],B$A[3][1],B$C[4] +|| ANDN A$A[3][3],A$A[3][2],A$C[1] +|| ANDN B$A[3][3],B$A[3][2],B$C[1] +|| ANDN A$A[3][4],A$A[3][3],A$C[2] +|| ANDN B$A[3][4],B$A[3][3],B$C[2] + ANDN A$A[3][0],A$A[3][4],A$C[3] +|| ANDN B$A[3][0],B$A[3][4],B$C[3] +|| XOR A$C[4],A$A[3][0],A$A[3][0] +|| XOR B$C[4],B$A[3][0],B$A[3][0] +|| ANDN A$A[3][1],A$A[3][0],A$C[4] +|| ANDN B$A[3][1],B$A[3][0],B$C[4] + XOR A$C[1],A$A[3][1],A$A[3][1] +|| XOR B$C[1],B$A[3][1],B$A[3][1] +|| XOR A$C[2],A$A[3][2],A$A[3][2] +|| XOR B$C[2],B$A[3][2],B$A[3][2] +|| XOR A$C[3],A$A[3][3],A$A[3][3] +||[A0] BNOP loop? 
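+	; Note the [A0]-predicated BNOP just above: a C64x branch has five
+	; delay slots, so it is issued early and the Chi steps for the last
+	; two rows execute in its shadow; the [!A0] LDDW pair below reloads
+	; the preserved A3:A2 and RA:B2 only on the final iteration.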
+ XOR B$C[3],B$A[3][3],B$A[3][3] +|| XOR A$C[4],A$A[3][4],A$A[3][4] +|| XOR B$C[4],B$A[3][4],B$A[3][4] +||[!A0] LDDW *FP[-7],A3:A2 +||[!A0] LDDW *SP[4], RA:B2 + + ANDN A$A[4][2],A$A[4][1],A$C[4] +|| ANDN B$A[4][2],B$A[4][1],B$C[4] +|| ANDN A$A[4][3],A$A[4][2],A$C[1] +|| ANDN B$A[4][3],B$A[4][2],B$C[1] +|| ANDN A$A[4][4],A$A[4][3],A$C[2] +|| ANDN B$A[4][4],B$A[4][3],B$C[2] + ANDN A$A[4][0],A$A[4][4],A$C[3] +|| ANDN B$A[4][0],B$A[4][4],B$C[3] +|| XOR A$C[4],A$A[4][0],A$A[4][0] +|| XOR B$C[4],B$A[4][0],B$A[4][0] +|| ANDN A$A[4][1],A$A[4][0],A$C[4] +|| ANDN B$A[4][1],B$A[4][0],B$C[4] + XOR A$C[1],A$A[4][1],A$A[4][1] +|| XOR B$C[1],B$A[4][1],B$A[4][1] +|| XOR A$C[2],A$A[4][2],A$A[4][2] +|| XOR B$C[2],B$A[4][2],B$A[4][2] +|| XOR A$C[3],A$A[4][3],A$A[4][3] +|| XOR B$C[3],B$A[4][3],B$A[4][3] + XOR A$C[4],A$A[4][4],A$A[4][4] +|| XOR B$C[4],B$A[4][4],B$A[4][4] +;;===== branch to loop? is taken here + + BNOP RA,5 + .endasmfunc + + .newblock + .global _KeccakF1600 + .align 32 +_KeccakF1600: + .asmfunc stack_usage(80) + STW FP,*SP--(80) ; save frame pointer +|| MV SP,FP + STDW B13:B12,*SP[9] +|| STDW A13:A12,*FP[-4] + STDW B11:B10,*SP[8] +|| STDW A11:A10,*FP[-5] + STW RA, *SP[15] +|| STW A14,*FP[-6] +|| MV A4,A2 +|| ADD 4,A4,B2 + + LDW *A2++[2],A$A[0][0] ; load A[5][5] +|| LDW *B2++[2],B$A[0][0] + LDW *A2++[2],A$A[0][1] +|| LDW *B2++[2],B$A[0][1] + LDW *A2++[2],A$A[0][2] +|| LDW *B2++[2],B$A[0][2] + LDW *A2++[2],A$A[0][3] +|| LDW *B2++[2],B$A[0][3] + LDW *A2++[2],A$A[0][4] +|| LDW *B2++[2],B$A[0][4] + + LDW *A2++[2],A$A[1][0] +|| LDW *B2++[2],B$A[1][0] + LDW *A2++[2],A$A[1][1] +|| LDW *B2++[2],B$A[1][1] + LDW *A2++[2],A$A[1][2] +|| LDW *B2++[2],B$A[1][2] + LDW *A2++[2],A$A[1][3] +|| LDW *B2++[2],B$A[1][3] + LDW *A2++[2],A$A[1][4] +|| LDW *B2++[2],B$A[1][4] + + LDW *A2++[2],A$A[2][0] +|| LDW *B2++[2],B$A[2][0] + LDW *A2++[2],A$A[2][1] +|| LDW *B2++[2],B$A[2][1] + LDW *A2++[2],A$A[2][2] +|| LDW *B2++[2],B$A[2][2] + LDW *A2++[2],A$A[2][3] +|| LDW *B2++[2],B$A[2][3] + LDW *A2++[2],A$A[2][4] +|| LDW *B2++[2],B$A[2][4] + + LDW *A2++[2],A$A[3][0] +|| LDW *B2++[2],B$A[3][0] + LDW *A2++[2],A$A[3][1] +|| LDW *B2++[2],B$A[3][1] + LDW *A2++[2],A$A[3][2] +|| LDW *B2++[2],B$A[3][2] + LDW *A2++[2],A$A[3][3] +|| LDW *B2++[2],B$A[3][3] + LDW *A2++[2],A$A[3][4] +|| LDW *B2++[2],B$A[3][4] +|| BNOP _KeccakF1600_int + + ADDKPC ret?,RA +|| LDW *A2++[2],A$A[4][0] +|| LDW *B2++[2],B$A[4][0] + LDW *A2++[2],A$A[4][1] +|| LDW *B2++[2],B$A[4][1] + LDW *A2++[2],A$A[4][2] +|| LDW *B2++[2],B$A[4][2] + LDW *A2++[2],A$A[4][3] +|| LDW *B2++[2],B$A[4][3] + LDW *A2,A$A[4][4] +|| LDW *B2,B$A[4][4] +|| ADDK -192,A2 ; rewind +|| ADDK -192,B2 + + .align 16 +ret?: + STW A$A[0][0],*A2++[2] ; store A[5][5] +|| STW B$A[0][0],*B2++[2] + STW A$A[0][1],*A2++[2] +|| STW B$A[0][1],*B2++[2] + STW A$A[0][2],*A2++[2] +|| STW B$A[0][2],*B2++[2] + STW A$A[0][3],*A2++[2] +|| STW B$A[0][3],*B2++[2] + STW A$A[0][4],*A2++[2] +|| STW B$A[0][4],*B2++[2] + + STW A$A[1][0],*A2++[2] +|| STW B$A[1][0],*B2++[2] + STW A$A[1][1],*A2++[2] +|| STW B$A[1][1],*B2++[2] + STW A$A[1][2],*A2++[2] +|| STW B$A[1][2],*B2++[2] + STW A$A[1][3],*A2++[2] +|| STW B$A[1][3],*B2++[2] + STW A$A[1][4],*A2++[2] +|| STW B$A[1][4],*B2++[2] + + STW A$A[2][0],*A2++[2] +|| STW B$A[2][0],*B2++[2] + STW A$A[2][1],*A2++[2] +|| STW B$A[2][1],*B2++[2] + STW A$A[2][2],*A2++[2] +|| STW B$A[2][2],*B2++[2] + STW A$A[2][3],*A2++[2] +|| STW B$A[2][3],*B2++[2] + STW A$A[2][4],*A2++[2] +|| STW B$A[2][4],*B2++[2] + + STW A$A[3][0],*A2++[2] +|| STW B$A[3][0],*B2++[2] + STW A$A[3][1],*A2++[2] +|| STW 
B$A[3][1],*B2++[2] + STW A$A[3][2],*A2++[2] +|| STW B$A[3][2],*B2++[2] + STW A$A[3][3],*A2++[2] +|| STW B$A[3][3],*B2++[2] + STW A$A[3][4],*A2++[2] +|| STW B$A[3][4],*B2++[2] + + LDW *SP[15],RA +|| LDW *FP[-6],A14 + + STW A$A[4][0],*A2++[2] +|| STW B$A[4][0],*B2++[2] + STW A$A[4][1],*A2++[2] +|| STW B$A[4][1],*B2++[2] + STW A$A[4][2],*A2++[2] +|| STW B$A[4][2],*B2++[2] + STW A$A[4][3],*A2++[2] +|| STW B$A[4][3],*B2++[2] + STW A$A[4][4],*A2 +|| STW B$A[4][4],*B2 +|| ADDK -192,A2 ; rewind + + MV A2,A4 ; return original A4 +|| LDDW *SP[8], B11:B10 +|| LDDW *FP[-5],A11:A10 + LDDW *SP[9], B13:B12 +|| LDDW *FP[-4],A13:A12 +|| BNOP RA + LDW *++SP(80),FP ; restore frame pointer + NOP 4 ; wait till FP is committed + .endasmfunc + + .newblock + .asg B2,BSZ + .asg A2,INP + .asg A3,LEN + .global _SHA3_absorb + .align 32 +_SHA3_absorb: + .asmfunc stack_usage(80) + STW FP,*SP--(80) ; save frame pointer +|| MV SP,FP + STDW B13:B12,*SP[9] +|| STDW A13:A12,*FP[-4] + STDW B11:B10,*SP[8] +|| STDW A11:A10,*FP[-5] + STW RA, *SP[15] +|| STW A14,*FP[-6] + + STW A4,*SP[1] ; save A[][] +|| MV B4,INP ; reassign arguments +|| MV A6,LEN +|| MV B6,BSZ +|| ADD 4,A4,B4 + + LDW *A4++[2],A$A[0][0] ; load A[5][5] +|| LDW *B4++[2],B$A[0][0] + LDW *A4++[2],A$A[0][1] +|| LDW *B4++[2],B$A[0][1] + LDW *A4++[2],A$A[0][2] +|| LDW *B4++[2],B$A[0][2] + LDW *A4++[2],A$A[0][3] +|| LDW *B4++[2],B$A[0][3] + LDW *A4++[2],A$A[0][4] +|| LDW *B4++[2],B$A[0][4] + + LDW *A4++[2],A$A[1][0] +|| LDW *B4++[2],B$A[1][0] + LDW *A4++[2],A$A[1][1] +|| LDW *B4++[2],B$A[1][1] + LDW *A4++[2],A$A[1][2] +|| LDW *B4++[2],B$A[1][2] + LDW *A4++[2],A$A[1][3] +|| LDW *B4++[2],B$A[1][3] + LDW *A4++[2],A$A[1][4] +|| LDW *B4++[2],B$A[1][4] + + LDW *A4++[2],A$A[2][0] +|| LDW *B4++[2],B$A[2][0] + LDW *A4++[2],A$A[2][1] +|| LDW *B4++[2],B$A[2][1] + LDW *A4++[2],A$A[2][2] +|| LDW *B4++[2],B$A[2][2] + LDW *A4++[2],A$A[2][3] +|| LDW *B4++[2],B$A[2][3] + LDW *A4++[2],A$A[2][4] +|| LDW *B4++[2],B$A[2][4] + + LDW *A4++[2],A$A[3][0] +|| LDW *B4++[2],B$A[3][0] + LDW *A4++[2],A$A[3][1] +|| LDW *B4++[2],B$A[3][1] + LDW *A4++[2],A$A[3][2] +|| LDW *B4++[2],B$A[3][2] + LDW *A4++[2],A$A[3][3] +|| LDW *B4++[2],B$A[3][3] + LDW *A4++[2],A$A[3][4] +|| LDW *B4++[2],B$A[3][4] + + LDW *A4++[2],A$A[4][0] +|| LDW *B4++[2],B$A[4][0] + LDW *A4++[2],A$A[4][1] +|| LDW *B4++[2],B$A[4][1] + LDW *A4++[2],A$A[4][2] +|| LDW *B4++[2],B$A[4][2] + LDW *A4++[2],A$A[4][3] +|| LDW *B4++[2],B$A[4][3] + LDW *A4,A$A[4][4] +|| LDW *B4,B$A[4][4] +|| ADDKPC loop?,RA + STDW RA:BSZ,*SP[4] + +loop?: + CMPLTU LEN,BSZ,A0 ; len < bsz? +|| SHRU BSZ,3,BSZ + [A0] BNOP ret? +||[A0] ZERO BSZ +||[A0] LDW *SP[1],A2 ; pull A[][] + [BSZ] LDNDW *INP++,A1:A0 +||[BSZ] SUB LEN,8,LEN +||[BSZ] SUB BSZ,1,BSZ + NOP 4 +___ +for ($y = 0; $y < 5; $y++) { + for ($x = 0; $x < ($y<4 ? 
5 : 4); $x++) { +$code.=<<___; + .if .BIG_ENDIAN + SWAP2 A0,A1 +|| SWAP2 A1,A0 + SWAP4 A0,A0 + SWAP4 A1,A1 +||[!BSZ]BNOP _KeccakF1600_cheat +||[!BSZ]STDW LEN:INP,*SP[3] +|| DEAL A0,A0 + .else + [!BSZ]BNOP _KeccakF1600_cheat +||[!BSZ]STDW LEN:INP,*SP[3] +|| DEAL A0,A0 + .endif + [BSZ] LDNDW *INP++,A1:A0 +|| DEAL A1,A1 + [BSZ] SUB LEN,8,LEN +||[BSZ] SUB BSZ,1,BSZ + PACK2 A1,A0,A0 +|| PACKH2 A1,A0,A1 + XOR A0,A$A[$y][$x],A$A[$y][$x] + XOR A1,B$A[$y][$x],B$A[$y][$x] +___ + } +} +$code.=<<___; + .if .BIG_ENDIAN + SWAP2 A0,A1 +|| SWAP2 A1,A0 + SWAP4 A0,A0 + SWAP4 A1,A1 + .endif + BNOP _KeccakF1600_cheat +|| STDW LEN:INP,*SP[3] +|| DEAL A0,A0 + DEAL A1,A1 + NOP + PACK2 A1,A0,A0 +|| PACKH2 A1,A0,A1 + XOR A0,A$A[4][4],A$A[4][4] + XOR A1,B$A[4][4],B$A[4][4] + + .align 16 +ret?: + MV LEN,A4 ; return value +|| ADD 4,A2,B2 + + STW A$A[0][0],*A2++[2] ; store A[5][5] +|| STW B$A[0][0],*B2++[2] + STW A$A[0][1],*A2++[2] +|| STW B$A[0][1],*B2++[2] + STW A$A[0][2],*A2++[2] +|| STW B$A[0][2],*B2++[2] + STW A$A[0][3],*A2++[2] +|| STW B$A[0][3],*B2++[2] + STW A$A[0][4],*A2++[2] +|| STW B$A[0][4],*B2++[2] + + STW A$A[1][0],*A2++[2] +|| STW B$A[1][0],*B2++[2] + STW A$A[1][1],*A2++[2] +|| STW B$A[1][1],*B2++[2] + STW A$A[1][2],*A2++[2] +|| STW B$A[1][2],*B2++[2] + STW A$A[1][3],*A2++[2] +|| STW B$A[1][3],*B2++[2] + STW A$A[1][4],*A2++[2] +|| STW B$A[1][4],*B2++[2] + + STW A$A[2][0],*A2++[2] +|| STW B$A[2][0],*B2++[2] + STW A$A[2][1],*A2++[2] +|| STW B$A[2][1],*B2++[2] + STW A$A[2][2],*A2++[2] +|| STW B$A[2][2],*B2++[2] + STW A$A[2][3],*A2++[2] +|| STW B$A[2][3],*B2++[2] + STW A$A[2][4],*A2++[2] +|| STW B$A[2][4],*B2++[2] + + LDW *SP[15],RA +|| LDW *FP[-6],A14 + + STW A$A[3][0],*A2++[2] +|| STW B$A[3][0],*B2++[2] + STW A$A[3][1],*A2++[2] +|| STW B$A[3][1],*B2++[2] + STW A$A[3][2],*A2++[2] +|| STW B$A[3][2],*B2++[2] + STW A$A[3][3],*A2++[2] +|| STW B$A[3][3],*B2++[2] + STW A$A[3][4],*A2++[2] +|| STW B$A[3][4],*B2++[2] + + LDDW *SP[8], B11:B10 +|| LDDW *FP[-5],A11:A10 + LDDW *SP[9], B13:B12 +|| LDDW *FP[-4],A13:A12 + BNOP RA +|| LDW *++SP(80),FP ; restore frame pointer + + STW A$A[4][0],*A2++[2] +|| STW B$A[4][0],*B2++[2] + STW A$A[4][1],*A2++[2] +|| STW B$A[4][1],*B2++[2] + STW A$A[4][2],*A2++[2] +|| STW B$A[4][2],*B2++[2] + STW A$A[4][3],*A2++[2] +|| STW B$A[4][3],*B2++[2] + STW A$A[4][4],*A2++[2] +|| STW B$A[4][4],*B2++[2] + .endasmfunc + + .newblock + .global _SHA3_squeeze + .asg A12,OUT + .asg A13,LEN + .asg A14,BSZ + .align 32 +_SHA3_squeeze: + .asmfunc stack_usage(24) + STW FP,*SP--(24) ; save frame pointer +|| MV SP,FP + STW RA, *SP[5] +|| STW A14,*FP[-2] + STDW A13:A12,*FP[-2] +|| MV B4,OUT ; reassign arguments + MV A6,LEN +|| MV B6,BSZ + +loop?: + LDW *SP[5],RA ; reload RA +|| SHRU BSZ,3,A1 +|| MV A4,A8 +|| ADD 4,A4,B8 +block?: + CMPLTU LEN,8,A0 ; len < 8? + [A0] BNOP tail? + LDW *A8++[2],A9 +|| LDW *B8++[2],B9 +|| SUB LEN,8,LEN ; len -= 8 + MV LEN,A0 +|| SUB A1,1,A1 ; bsz-- +|| NOP 4 + .if .BIG_ENDIAN + SWAP4 A9,A9 +|| SWAP4 B9,B9 + SWAP2 A9,A9 +|| SWAP2 B9,B9 + .endif + [!A0] BNOP ret? +||[!A0] ZERO A1 + PACK2 B9,A9,B7 +||[A1] BNOP block? 
+ PACKH2 B9,A9,B9 +|| SHFL B7,B7 + SHFL B9,B9 + STNW B7,*OUT++ + STNW B9,*OUT++ + NOP + + BNOP _KeccakF1600,4 + ADDKPC loop?,RA + + .align 16 +tail?: + .if .BIG_ENDIAN + SWAP4 A9,A9 +|| SWAP4 B9,B9 + SWAP2 A9,A9 +|| SWAP2 B9,B9 + .endif + PACK2 B9,A9,B7 + PACKH2 B9,A9,B9 +|| SHFL B7,B7 + SHFL B9,B9 + + STB B7,*OUT++ +|| SHRU B7,8,B7 +|| ADD LEN,7,A0 + [A0] STB B7,*OUT++ +||[A0] SHRU B7,8,B7 +||[A0] SUB A0,1,A0 + [A0] STB B7,*OUT++ +||[A0] SHRU B7,8,B7 +||[A0] SUB A0,1,A0 + [A0] STB B7,*OUT++ +||[A0] SUB A0,1,A0 + [A0] STB B9,*OUT++ +||[A0] SHRU B9,8,B9 +||[A0] SUB A0,1,A0 + [A0] STB B9,*OUT++ +||[A0] SHRU B9,8,B9 +||[A0] SUB A0,1,A0 + [A0] STB B9,*OUT++ + +ret?: + LDDW *FP[-2],A13:A12 + BNOP RA +|| LDW *FP[-2],A14 + LDW *++SP(24),FP ; restore frame pointer + NOP 4 ; wait till FP is committed + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 256 + .uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +iotas: + .uword 0x00000001, 0x00000000 + .uword 0x00000000, 0x00000089 + .uword 0x00000000, 0x8000008b + .uword 0x00000000, 0x80008080 + .uword 0x00000001, 0x0000008b + .uword 0x00000001, 0x00008000 + .uword 0x00000001, 0x80008088 + .uword 0x00000001, 0x80000082 + .uword 0x00000000, 0x0000000b + .uword 0x00000000, 0x0000000a + .uword 0x00000001, 0x00008082 + .uword 0x00000000, 0x00008003 + .uword 0x00000001, 0x0000808b + .uword 0x00000001, 0x8000000b + .uword 0x00000001, 0x8000008a + .uword 0x00000001, 0x80000081 + .uword 0x00000000, 0x80000081 + .uword 0x00000000, 0x80000008 + .uword 0x00000000, 0x00000083 + .uword 0x00000000, 0x80008003 + .uword 0x00000001, 0x80008088 + .uword 0x00000000, 0x80000088 + .uword 0x00000001, 0x00008000 + .uword 0x00000000, 0x80008082 + + .cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 +___ + +$output=pop; +open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/crypto/sha/asm/keccak1600-mmx.pl b/crypto/sha/asm/keccak1600-mmx.pl new file mode 100755 index 000000000000..c7685add79dd --- /dev/null +++ b/crypto/sha/asm/keccak1600-mmx.pl @@ -0,0 +1,440 @@ +#!/usr/bin/env perl +# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Keccak-1600 for x86 MMX. +# +# June 2017. +# +# Below code is KECCAK_2X implementation (see sha/keccak1600.c) with +# C[5] held in register bank and D[5] offloaded to memory. Though +# instead of actually unrolling the loop pair-wise I simply flip +# pointers to T[][] and A[][] and the end of round. Since number of +# rounds is even, last round writes to A[][] and everything works out. +# It's argued that MMX is the only code path meaningful to implement +# for x86. This is because non-MMX-capable processors is an extinct +# breed, and they as well can lurk executing compiler-generated code. 
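+#
+# In C terms the pointer flip amounts to the following (a sketch along
+# the lines of the KECCAK_2X code in sha/keccak1600.c; Round() here
+# stands for one Keccak-f[1600] round computing its output into a
+# separate buffer):
+#
+#	uint64_t A[5][5], T[5][5];
+#	uint64_t (*src)[5] = A, (*dst)[5] = T, (*swp)[5];
+#
+#	for (int i = 0; i < 24; i++) {
+#	    Round(dst, src, i);         /* dst = round i applied to src */
+#	    swp = src; src = dst; dst = swp;
+#	}
+#	/* 24 is even, so the last round has written back into A */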
+# For reference gcc-5.x-generated KECCAK_2X code takes 89 cycles per +# processed byte on Pentium. Which is fair result. But older compilers +# produce worse code. On the other hand one can wonder why not 128-bit +# SSE2? Well, SSE2 won't provide double improvement, rather far from +# that, if any at all on some processors, because it will take extra +# permutations and inter-bank data trasfers. Besides, contemporary +# CPUs are better off executing 64-bit code, and it makes lesser sense +# to invest into fancy 32-bit code. And the decision doesn't seem to +# be inadequate, if one compares below results to "64-bit platforms in +# 32-bit mode" SIMD data points available at +# http://keccak.noekeon.org/sw_performance.html. +# +######################################################################## +# Numbers are cycles per processed byte out of large message. +# +# r=1088(i) +# +# PIII 30/+150% +# Pentium M 27/+150% +# P4 40/+85% +# Core 2 19/+170% +# Sandy Bridge(ii) 18/+140% +# Atom 33/+180% +# Silvermont(ii) 30/+180% +# VIA Nano(ii) 43/+60% +# Sledgehammer(ii)(iii) 24/+130% +# +# (i) Corresponds to SHA3-256. Numbers after slash are improvement +# coefficients over KECCAK_2X [with bit interleave and lane +# complementing] position-independent *scalar* code generated +# by gcc-5.x. It's not exactly fair comparison, but it's a +# datapoint... +# (ii) 64-bit processor executing 32-bit code. +# (iii) Result is considered to be representative even for older AMD +# processors. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); + +my @C = map("mm$_",(0..4)); +my @T = map("mm$_",(5..7)); +my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100, + 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20)); +my @D = map(8*$_+4, (0..4)); +my @rhotates = ([ 0, 1, 62, 28, 27 ], + [ 36, 44, 6, 55, 20 ], + [ 3, 10, 43, 25, 39 ], + [ 41, 45, 15, 21, 8 ], + [ 18, 2, 61, 56, 14 ]); + +&static_label("iotas"); + +&function_begin_B("_KeccakF1600"); + &movq (@C[0],&QWP($A[4][0],"esi")); + &movq (@C[1],&QWP($A[4][1],"esi")); + &movq (@C[2],&QWP($A[4][2],"esi")); + &movq (@C[3],&QWP($A[4][3],"esi")); + &movq (@C[4],&QWP($A[4][4],"esi")); + + &mov ("ecx",24); # loop counter + &jmp (&label("loop")); + + &set_label("loop",16); + ######################################### Theta + &pxor (@C[0],&QWP($A[0][0],"esi")); + &pxor (@C[1],&QWP($A[0][1],"esi")); + &pxor (@C[2],&QWP($A[0][2],"esi")); + &pxor (@C[3],&QWP($A[0][3],"esi")); + &pxor (@C[4],&QWP($A[0][4],"esi")); + + &pxor (@C[0],&QWP($A[1][0],"esi")); + &pxor (@C[1],&QWP($A[1][1],"esi")); + &pxor (@C[2],&QWP($A[1][2],"esi")); + &pxor (@C[3],&QWP($A[1][3],"esi")); + &pxor (@C[4],&QWP($A[1][4],"esi")); + + &pxor (@C[0],&QWP($A[2][0],"esi")); + &pxor (@C[1],&QWP($A[2][1],"esi")); + &pxor (@C[2],&QWP($A[2][2],"esi")); + &pxor (@C[3],&QWP($A[2][3],"esi")); + &pxor (@C[4],&QWP($A[2][4],"esi")); + + &pxor (@C[2],&QWP($A[3][2],"esi")); + &pxor (@C[0],&QWP($A[3][0],"esi")); + &pxor (@C[1],&QWP($A[3][1],"esi")); + &pxor (@C[3],&QWP($A[3][3],"esi")); + &movq (@T[0],@C[2]); + &pxor (@C[4],&QWP($A[3][4],"esi")); + + &movq (@T[2],@C[2]); + &psrlq (@T[0],63); + &movq (@T[1],@C[0]); + &psllq (@T[2],1); + &pxor (@T[0],@C[0]); + &psrlq (@C[0],63); + &pxor (@T[0],@T[2]); + &psllq (@T[1],1); + &movq (@T[2],@C[1]); + &movq (&QWP(@D[1],"esp"),@T[0]); # D[1] = E[0] = ROL64(C[2], 1) ^ C[0]; + + &pxor (@T[1],@C[0]); + &psrlq (@T[2],63); + &pxor (@T[1],@C[3]); + 
&movq (@C[0],@C[1]); + &movq (&QWP(@D[4],"esp"),@T[1]); # D[4] = E[1] = ROL64(C[0], 1) ^ C[3]; + + &psllq (@C[0],1); + &pxor (@T[2],@C[4]); + &pxor (@C[0],@T[2]); + + &movq (@T[2],@C[3]); + &psrlq (@C[3],63); + &movq (&QWP(@D[0],"esp"),@C[0]); # D[0] = C[0] = ROL64(C[1], 1) ^ C[4]; + &psllq (@T[2],1); + &movq (@T[0],@C[4]); + &psrlq (@C[4],63); + &pxor (@C[1],@C[3]); + &psllq (@T[0],1); + &pxor (@C[1],@T[2]); + &pxor (@C[2],@C[4]); + &movq (&QWP(@D[2],"esp"),@C[1]); # D[2] = C[1] = ROL64(C[3], 1) ^ C[1]; + &pxor (@C[2],@T[0]); + + ######################################### first Rho(0) is special + &movq (@C[3],&QWP($A[3][3],"esi")); + &movq (&QWP(@D[3],"esp"),@C[2]); # D[3] = C[2] = ROL64(C[4], 1) ^ C[2]; + &pxor (@C[3],@C[2]); + &movq (@C[4],&QWP($A[4][4],"esi")); + &movq (@T[2],@C[3]); + &psrlq (@C[3],64-$rhotates[3][3]); + &pxor (@C[4],@T[1]); + &psllq (@T[2],$rhotates[3][3]); + &movq (@T[1],@C[4]); + &psrlq (@C[4],64-$rhotates[4][4]); + &por (@C[3],@T[2]); # C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ + &psllq (@T[1],$rhotates[4][4]); + + &movq (@C[2],&QWP($A[2][2],"esi")); + &por (@C[4],@T[1]); # C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ + &pxor (@C[2],@C[1]); + &movq (@C[1],&QWP($A[1][1],"esi")); + &movq (@T[1],@C[2]); + &psrlq (@C[2],64-$rhotates[2][2]); + &pxor (@C[1],&QWP(@D[1],"esp")); + &psllq (@T[1],$rhotates[2][2]); + + &movq (@T[2],@C[1]); + &psrlq (@C[1],64-$rhotates[1][1]); + &por (@C[2],@T[1]); # C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ + &psllq (@T[2],$rhotates[1][1]); + &pxor (@C[0],&QWP($A[0][0],"esi")); # /* rotate by 0 */ /* D[0] */ + &por (@C[1],@T[2]); # C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); + +sub Chi() { ######### regular Chi step + my ($y,$xrho) = @_; + + &movq (@T[0],@C[1]); + &movq (@T[1],@C[2]); + &pandn (@T[0],@C[2]); + &pandn (@C[2],@C[3]); + &pxor (@T[0],@C[0]); + &pxor (@C[2],@C[1]); + &pxor (@T[0],&QWP(0,"ebx")) if ($y == 0); + &lea ("ebx",&DWP(8,"ebx")) if ($y == 0); + + &movq (@T[2],@C[3]); + &movq (&QWP($A[$y][0],"edi"),@T[0]); # R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; + &movq (@T[0],@C[4]); + &pandn (@C[3],@C[4]); + &pandn (@C[4],@C[0]); + &pxor (@C[3],@T[1]); + &movq (&QWP($A[$y][1],"edi"),@C[2]); # R[0][1] = C[1] ^ (~C[2] & C[3]); + &pxor (@C[4],@T[2]); + &movq (@T[2],&QWP($A[0][$xrho],"esi")) if (defined($xrho)); + + &movq (&QWP($A[$y][2],"edi"),@C[3]); # R[0][2] = C[2] ^ (~C[3] & C[4]); + &pandn (@C[0],@C[1]); + &movq (&QWP($A[$y][3],"edi"),@C[4]); # R[0][3] = C[3] ^ (~C[4] & C[0]); + &pxor (@C[0],@T[0]); + &pxor (@T[2],&QWP(@D[$xrho],"esp")) if (defined($xrho)); + &movq (&QWP($A[$y][4],"edi"),@C[0]); # R[0][4] = C[4] ^ (~C[0] & C[1]); +} + &Chi (0, 3); + +sub Rho() { ######### regular Rho step + my $x = shift; + + #&movq (@T[2],&QWP($A[0][$x],"esi")); # moved to Chi + #&pxor (@T[2],&QWP(@D[$x],"esp")); # moved to Chi + &movq (@C[0],@T[2]); + &psrlq (@T[2],64-$rhotates[0][$x]); + &movq (@C[1],&QWP($A[1][($x+1)%5],"esi")); + &psllq (@C[0],$rhotates[0][$x]); + &pxor (@C[1],&QWP(@D[($x+1)%5],"esp")); + &por (@C[0],@T[2]); # C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); + + &movq (@T[1],@C[1]); + &psrlq (@C[1],64-$rhotates[1][($x+1)%5]); + &movq (@C[2],&QWP($A[2][($x+2)%5],"esi")); + &psllq (@T[1],$rhotates[1][($x+1)%5]); + &pxor (@C[2],&QWP(@D[($x+2)%5],"esp")); + &por (@C[1],@T[1]); # C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + + &movq (@T[2],@C[2]); + &psrlq (@C[2],64-$rhotates[2][($x+2)%5]); + &movq (@C[3],&QWP($A[3][($x+3)%5],"esi")); + &psllq (@T[2],$rhotates[2][($x+2)%5]); + &pxor 
(@C[3],&QWP(@D[($x+3)%5],"esp")); + &por (@C[2],@T[2]); # C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); + + &movq (@T[0],@C[3]); + &psrlq (@C[3],64-$rhotates[3][($x+3)%5]); + &movq (@C[4],&QWP($A[4][($x+4)%5],"esi")); + &psllq (@T[0],$rhotates[3][($x+3)%5]); + &pxor (@C[4],&QWP(@D[($x+4)%5],"esp")); + &por (@C[3],@T[0]); # C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); + + &movq (@T[1],@C[4]); + &psrlq (@C[4],64-$rhotates[4][($x+4)%5]); + &psllq (@T[1],$rhotates[4][($x+4)%5]); + &por (@C[4],@T[1]); # C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); +} + &Rho (3); &Chi (1, 1); + &Rho (1); &Chi (2, 4); + &Rho (4); &Chi (3, 2); + &Rho (2); ###&Chi (4); + + &movq (@T[0],@C[0]); ######### last Chi(4) is special + &xor ("edi","esi"); # &xchg ("esi","edi"); + &movq (&QWP(@D[1],"esp"),@C[1]); + &xor ("esi","edi"); + &xor ("edi","esi"); + + &movq (@T[1],@C[1]); + &movq (@T[2],@C[2]); + &pandn (@T[1],@C[2]); + &pandn (@T[2],@C[3]); + &pxor (@C[0],@T[1]); + &pxor (@C[1],@T[2]); + + &movq (@T[1],@C[3]); + &movq (&QWP($A[4][0],"esi"),@C[0]); # R[4][0] = C[0] ^= (~C[1] & C[2]); + &pandn (@T[1],@C[4]); + &movq (&QWP($A[4][1],"esi"),@C[1]); # R[4][1] = C[1] ^= (~C[2] & C[3]); + &pxor (@C[2],@T[1]); + &movq (@T[2],@C[4]); + &movq (&QWP($A[4][2],"esi"),@C[2]); # R[4][2] = C[2] ^= (~C[3] & C[4]); + + &pandn (@T[2],@T[0]); + &pandn (@T[0],&QWP(@D[1],"esp")); + &pxor (@C[3],@T[2]); + &pxor (@C[4],@T[0]); + &movq (&QWP($A[4][3],"esi"),@C[3]); # R[4][3] = C[3] ^= (~C[4] & D[0]); + &sub ("ecx",1); + &movq (&QWP($A[4][4],"esi"),@C[4]); # R[4][4] = C[4] ^= (~D[0] & D[1]); + &jnz (&label("loop")); + + &lea ("ebx",&DWP(-192,"ebx")); # rewind iotas + &ret (); +&function_end_B("_KeccakF1600"); + +&function_begin("KeccakF1600"); + &mov ("esi",&wparam(0)); + &mov ("ebp","esp"); + &sub ("esp",240); + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx")); + &and ("esp",-8); + &lea ("esi",&DWP(100,"esi")); # size optimization + &lea ("edi",&DWP(8*5+100,"esp")); # size optimization + + &call ("_KeccakF1600"); + + &mov ("esp","ebp"); + &emms (); +&function_end("KeccakF1600"); + +&function_begin("SHA3_absorb"); + &mov ("esi",&wparam(0)); # A[][] + &mov ("eax",&wparam(1)); # inp + &mov ("ecx",&wparam(2)); # len + &mov ("edx",&wparam(3)); # bsz + &mov ("ebp","esp"); + &sub ("esp",240+8); + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx")); + &and ("esp",-8); + + &mov ("edi","esi"); + &lea ("esi",&DWP(100,"esi")); # size optimization + &mov (&DWP(-4,"ebp"),"edx"); # save bsz + &jmp (&label("loop")); + +&set_label("loop",16); + &cmp ("ecx","edx"); # len < bsz? 
+ &jc (&label("absorbed")); + + &shr ("edx",3); # bsz /= 8 +&set_label("block"); + &movq ("mm0",&QWP(0,"eax")); + &lea ("eax",&DWP(8,"eax")); + &pxor ("mm0",&QWP(0,"edi")); + &lea ("edi",&DWP(8,"edi")); + &sub ("ecx",8); # len -= 8 + &movq (&QWP(-8,"edi"),"mm0"); + &dec ("edx"); # bsz-- + &jnz (&label("block")); + + &lea ("edi",&DWP(8*5+100,"esp")); # size optimization + &mov (&DWP(-8,"ebp"),"ecx"); # save len + &call ("_KeccakF1600"); + &mov ("ecx",&DWP(-8,"ebp")); # pull len + &mov ("edx",&DWP(-4,"ebp")); # pull bsz + &lea ("edi",&DWP(-100,"esi")); + &jmp (&label("loop")); + +&set_label("absorbed",16); + &mov ("eax","ecx"); # return value + &mov ("esp","ebp"); + &emms (); +&function_end("SHA3_absorb"); + +&function_begin("SHA3_squeeze"); + &mov ("esi",&wparam(0)); # A[][] + &mov ("eax",&wparam(1)); # out + &mov ("ecx",&wparam(2)); # len + &mov ("edx",&wparam(3)); # bsz + &mov ("ebp","esp"); + &sub ("esp",240+8); + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("ebx"); + &lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx")); + &and ("esp",-8); + + &shr ("edx",3); # bsz /= 8 + &mov ("edi","esi"); + &lea ("esi",&DWP(100,"esi")); # size optimization + &mov (&DWP(-4,"ebp"),"edx"); # save bsz + &jmp (&label("loop")); + +&set_label("loop",16); + &cmp ("ecx",8); # len < 8? + &jc (&label("tail")); + + &movq ("mm0",&QWP(0,"edi")); + &lea ("edi",&DWP(8,"edi")); + &movq (&QWP(0,"eax"),"mm0"); + &lea ("eax",&DWP(8,"eax")); + &sub ("ecx",8); # len -= 8 + &jz (&label("done")); + + &dec ("edx"); # bsz-- + &jnz (&label("loop")); + + &lea ("edi",&DWP(8*5+100,"esp")); # size optimization + &mov (&DWP(-8,"ebp"),"ecx"); # save len + &call ("_KeccakF1600"); + &mov ("ecx",&DWP(-8,"ebp")); # pull len + &mov ("edx",&DWP(-4,"ebp")); # pull bsz + &lea ("edi",&DWP(-100,"esi")); + &jmp (&label("loop")); + +&set_label("tail",16); + &mov ("esi","edi"); + &mov ("edi","eax"); + &data_word("0xA4F39066"); # rep movsb + +&set_label("done"); + &mov ("esp","ebp"); + &emms (); +&function_end("SHA3_squeeze"); + +&set_label("iotas",32); + &data_word(0x00000001,0x00000000); + &data_word(0x00008082,0x00000000); + &data_word(0x0000808a,0x80000000); + &data_word(0x80008000,0x80000000); + &data_word(0x0000808b,0x00000000); + &data_word(0x80000001,0x00000000); + &data_word(0x80008081,0x80000000); + &data_word(0x00008009,0x80000000); + &data_word(0x0000008a,0x00000000); + &data_word(0x00000088,0x00000000); + &data_word(0x80008009,0x00000000); + &data_word(0x8000000a,0x00000000); + &data_word(0x8000808b,0x00000000); + &data_word(0x0000008b,0x80000000); + &data_word(0x00008089,0x80000000); + &data_word(0x00008003,0x80000000); + &data_word(0x00008002,0x80000000); + &data_word(0x00000080,0x80000000); + &data_word(0x0000800a,0x00000000); + &data_word(0x8000000a,0x80000000); + &data_word(0x80008081,0x80000000); + &data_word(0x00008080,0x80000000); + &data_word(0x80000001,0x00000000); + &data_word(0x80008008,0x80000000); +&asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>"); + +&asm_finish(); + +close STDOUT; diff --git a/crypto/sha/asm/keccak1600-ppc64.pl b/crypto/sha/asm/keccak1600-ppc64.pl new file mode 100755 index 000000000000..30e70c5d6d7b --- /dev/null +++ b/crypto/sha/asm/keccak1600-ppc64.pl @@ -0,0 +1,758 @@ +#!/usr/bin/env perl +# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Keccak-1600 for PPC64. +# +# June 2017. +# +# This is straightforward KECCAK_1X_ALT implementation that works on +# *any* PPC64. Then PowerISA 2.07 adds 2x64-bit vector rotate, and +# it's possible to achieve performance better than below, but that is +# naturally option only for POWER8 and successors... +# +###################################################################### +# Numbers are cycles per processed byte. +# +# r=1088(*) +# +# PPC970/G5 14.6/+120% +# POWER7 10.3/+100% +# POWER8 11.5/+85% +# POWER9 9.4/+45% +# +# (*) Corresponds to SHA3-256. Percentage after slash is improvement +# over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do +# much better (but watch out for them generating code specific +# to processor they execute on). + +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $UCMP ="cmpld"; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=24*$SIZE_T+6*$SIZE_T+32; +$LOCALS=6*$SIZE_T; +$TEMP=$LOCALS+6*$SIZE_T; + +my $sp ="r1"; + +my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ], + (7, 12, 17, 22, 27)); + $A[1][1] = "r6"; # r13 is reserved + +my @C = map("r$_", (0,3,4,5)); + +my @rhotates = ([ 0, 1, 62, 28, 27 ], + [ 36, 44, 6, 55, 20 ], + [ 3, 10, 43, 25, 39 ], + [ 41, 45, 15, 21, 8 ], + [ 18, 2, 61, 56, 14 ]); + +$code.=<<___; +.text + +.type KeccakF1600_int,\@function +.align 5 +KeccakF1600_int: + li r0,24 + mtctr r0 + b .Loop +.align 4 +.Loop: + xor $C[0],$A[0][0],$A[1][0] ; Theta + std $A[0][4],`$TEMP+0`($sp) + xor $C[1],$A[0][1],$A[1][1] + std $A[1][4],`$TEMP+8`($sp) + xor $C[2],$A[0][2],$A[1][2] + std $A[2][4],`$TEMP+16`($sp) + xor $C[3],$A[0][3],$A[1][3] + std $A[3][4],`$TEMP+24`($sp) +___ + $C[4]=$A[0][4]; + $C[5]=$A[1][4]; + $C[6]=$A[2][4]; + $C[7]=$A[3][4]; +$code.=<<___; + xor $C[4],$A[0][4],$A[1][4] + xor $C[0],$C[0],$A[2][0] + xor $C[1],$C[1],$A[2][1] + xor $C[2],$C[2],$A[2][2] + xor $C[3],$C[3],$A[2][3] + xor $C[4],$C[4],$A[2][4] + xor $C[0],$C[0],$A[3][0] + xor $C[1],$C[1],$A[3][1] + xor $C[2],$C[2],$A[3][2] + xor $C[3],$C[3],$A[3][3] + xor $C[4],$C[4],$A[3][4] + xor $C[0],$C[0],$A[4][0] + xor $C[2],$C[2],$A[4][2] + xor $C[1],$C[1],$A[4][1] + xor $C[3],$C[3],$A[4][3] + rotldi $C[5],$C[2],1 + xor $C[4],$C[4],$A[4][4] + rotldi $C[6],$C[3],1 + xor $C[5],$C[5],$C[0] + rotldi $C[7],$C[4],1 + + xor $A[0][1],$A[0][1],$C[5] + xor $A[1][1],$A[1][1],$C[5] + xor $A[2][1],$A[2][1],$C[5] + xor $A[3][1],$A[3][1],$C[5] + xor $A[4][1],$A[4][1],$C[5] + + rotldi $C[5],$C[0],1 + xor $C[6],$C[6],$C[1] + xor $C[2],$C[2],$C[7] + rotldi $C[7],$C[1],1 + xor $C[3],$C[3],$C[5] + xor $C[4],$C[4],$C[7] + + xor $C[1], $A[0][2],$C[6] ;mr $C[1],$A[0][2] + xor $A[1][2],$A[1][2],$C[6] + xor 
$A[2][2],$A[2][2],$C[6] + xor $A[3][2],$A[3][2],$C[6] + xor $A[4][2],$A[4][2],$C[6] + + xor $A[0][0],$A[0][0],$C[4] + xor $A[1][0],$A[1][0],$C[4] + xor $A[2][0],$A[2][0],$C[4] + xor $A[3][0],$A[3][0],$C[4] + xor $A[4][0],$A[4][0],$C[4] +___ + $C[4]=undef; + $C[5]=undef; + $C[6]=undef; + $C[7]=undef; +$code.=<<___; + ld $A[0][4],`$TEMP+0`($sp) + xor $C[0], $A[0][3],$C[2] ;mr $C[0],$A[0][3] + ld $A[1][4],`$TEMP+8`($sp) + xor $A[1][3],$A[1][3],$C[2] + ld $A[2][4],`$TEMP+16`($sp) + xor $A[2][3],$A[2][3],$C[2] + ld $A[3][4],`$TEMP+24`($sp) + xor $A[3][3],$A[3][3],$C[2] + xor $A[4][3],$A[4][3],$C[2] + + xor $C[2], $A[0][4],$C[3] ;mr $C[2],$A[0][4] + xor $A[1][4],$A[1][4],$C[3] + xor $A[2][4],$A[2][4],$C[3] + xor $A[3][4],$A[3][4],$C[3] + xor $A[4][4],$A[4][4],$C[3] + + mr $C[3],$A[0][1] ; Rho+Pi + rotldi $A[0][1],$A[1][1],$rhotates[1][1] + ;mr $C[1],$A[0][2] + rotldi $A[0][2],$A[2][2],$rhotates[2][2] + ;mr $C[0],$A[0][3] + rotldi $A[0][3],$A[3][3],$rhotates[3][3] + ;mr $C[2],$A[0][4] + rotldi $A[0][4],$A[4][4],$rhotates[4][4] + + rotldi $A[1][1],$A[1][4],$rhotates[1][4] + rotldi $A[2][2],$A[2][3],$rhotates[2][3] + rotldi $A[3][3],$A[3][2],$rhotates[3][2] + rotldi $A[4][4],$A[4][1],$rhotates[4][1] + + rotldi $A[1][4],$A[4][2],$rhotates[4][2] + rotldi $A[2][3],$A[3][4],$rhotates[3][4] + rotldi $A[3][2],$A[2][1],$rhotates[2][1] + rotldi $A[4][1],$A[1][3],$rhotates[1][3] + + rotldi $A[4][2],$A[2][4],$rhotates[2][4] + rotldi $A[3][4],$A[4][3],$rhotates[4][3] + rotldi $A[2][1],$A[1][2],$rhotates[1][2] + rotldi $A[1][3],$A[3][1],$rhotates[3][1] + + rotldi $A[2][4],$A[4][0],$rhotates[4][0] + rotldi $A[4][3],$A[3][0],$rhotates[3][0] + rotldi $A[1][2],$A[2][0],$rhotates[2][0] + rotldi $A[3][1],$A[1][0],$rhotates[1][0] + + rotldi $A[1][0],$C[0],$rhotates[0][3] + rotldi $A[2][0],$C[3],$rhotates[0][1] + rotldi $A[3][0],$C[2],$rhotates[0][4] + rotldi $A[4][0],$C[1],$rhotates[0][2] + + andc $C[0],$A[0][2],$A[0][1] ; Chi+Iota + andc $C[1],$A[0][3],$A[0][2] + andc $C[2],$A[0][0],$A[0][4] + andc $C[3],$A[0][1],$A[0][0] + xor $A[0][0],$A[0][0],$C[0] + andc $C[0],$A[0][4],$A[0][3] + xor $A[0][1],$A[0][1],$C[1] + ld $C[1],`$LOCALS+4*$SIZE_T`($sp) + xor $A[0][3],$A[0][3],$C[2] + xor $A[0][4],$A[0][4],$C[3] + xor $A[0][2],$A[0][2],$C[0] + ldu $C[3],8($C[1]) ; Iota[i++] + + andc $C[0],$A[1][2],$A[1][1] + std $C[1],`$LOCALS+4*$SIZE_T`($sp) + andc $C[1],$A[1][3],$A[1][2] + andc $C[2],$A[1][0],$A[1][4] + xor $A[0][0],$A[0][0],$C[3] ; A[0][0] ^= Iota + andc $C[3],$A[1][1],$A[1][0] + xor $A[1][0],$A[1][0],$C[0] + andc $C[0],$A[1][4],$A[1][3] + xor $A[1][1],$A[1][1],$C[1] + xor $A[1][3],$A[1][3],$C[2] + xor $A[1][4],$A[1][4],$C[3] + xor $A[1][2],$A[1][2],$C[0] + + andc $C[0],$A[2][2],$A[2][1] + andc $C[1],$A[2][3],$A[2][2] + andc $C[2],$A[2][0],$A[2][4] + andc $C[3],$A[2][1],$A[2][0] + xor $A[2][0],$A[2][0],$C[0] + andc $C[0],$A[2][4],$A[2][3] + xor $A[2][1],$A[2][1],$C[1] + xor $A[2][3],$A[2][3],$C[2] + xor $A[2][4],$A[2][4],$C[3] + xor $A[2][2],$A[2][2],$C[0] + + andc $C[0],$A[3][2],$A[3][1] + andc $C[1],$A[3][3],$A[3][2] + andc $C[2],$A[3][0],$A[3][4] + andc $C[3],$A[3][1],$A[3][0] + xor $A[3][0],$A[3][0],$C[0] + andc $C[0],$A[3][4],$A[3][3] + xor $A[3][1],$A[3][1],$C[1] + xor $A[3][3],$A[3][3],$C[2] + xor $A[3][4],$A[3][4],$C[3] + xor $A[3][2],$A[3][2],$C[0] + + andc $C[0],$A[4][2],$A[4][1] + andc $C[1],$A[4][3],$A[4][2] + andc $C[2],$A[4][0],$A[4][4] + andc $C[3],$A[4][1],$A[4][0] + xor $A[4][0],$A[4][0],$C[0] + andc $C[0],$A[4][4],$A[4][3] + xor $A[4][1],$A[4][1],$C[1] + xor $A[4][3],$A[4][3],$C[2] + xor 
$A[4][4],$A[4][4],$C[3] + xor $A[4][2],$A[4][2],$C[0] + + bdnz .Loop + + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.size KeccakF1600_int,.-KeccakF1600_int + +.type KeccakF1600,\@function +.align 5 +KeccakF1600: + $STU $sp,-$FRAME($sp) + mflr r0 + $PUSH r14,`$FRAME-$SIZE_T*18`($sp) + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + bl PICmeup + subi r12,r12,8 ; prepare for ldu + + $PUSH r3,`$LOCALS+0*$SIZE_T`($sp) + ;$PUSH r4,`$LOCALS+1*$SIZE_T`($sp) + ;$PUSH r5,`$LOCALS+2*$SIZE_T`($sp) + ;$PUSH r6,`$LOCALS+3*$SIZE_T`($sp) + $PUSH r12,`$LOCALS+4*$SIZE_T`($sp) + + ld $A[0][0],`8*0`(r3) ; load A[5][5] + ld $A[0][1],`8*1`(r3) + ld $A[0][2],`8*2`(r3) + ld $A[0][3],`8*3`(r3) + ld $A[0][4],`8*4`(r3) + ld $A[1][0],`8*5`(r3) + ld $A[1][1],`8*6`(r3) + ld $A[1][2],`8*7`(r3) + ld $A[1][3],`8*8`(r3) + ld $A[1][4],`8*9`(r3) + ld $A[2][0],`8*10`(r3) + ld $A[2][1],`8*11`(r3) + ld $A[2][2],`8*12`(r3) + ld $A[2][3],`8*13`(r3) + ld $A[2][4],`8*14`(r3) + ld $A[3][0],`8*15`(r3) + ld $A[3][1],`8*16`(r3) + ld $A[3][2],`8*17`(r3) + ld $A[3][3],`8*18`(r3) + ld $A[3][4],`8*19`(r3) + ld $A[4][0],`8*20`(r3) + ld $A[4][1],`8*21`(r3) + ld $A[4][2],`8*22`(r3) + ld $A[4][3],`8*23`(r3) + ld $A[4][4],`8*24`(r3) + + bl KeccakF1600_int + + $POP r3,`$LOCALS+0*$SIZE_T`($sp) + std $A[0][0],`8*0`(r3) ; return A[5][5] + std $A[0][1],`8*1`(r3) + std $A[0][2],`8*2`(r3) + std $A[0][3],`8*3`(r3) + std $A[0][4],`8*4`(r3) + std $A[1][0],`8*5`(r3) + std $A[1][1],`8*6`(r3) + std $A[1][2],`8*7`(r3) + std $A[1][3],`8*8`(r3) + std $A[1][4],`8*9`(r3) + std $A[2][0],`8*10`(r3) + std $A[2][1],`8*11`(r3) + std $A[2][2],`8*12`(r3) + std $A[2][3],`8*13`(r3) + std $A[2][4],`8*14`(r3) + std $A[3][0],`8*15`(r3) + std $A[3][1],`8*16`(r3) + std $A[3][2],`8*17`(r3) + std $A[3][3],`8*18`(r3) + std $A[3][4],`8*19`(r3) + std $A[4][0],`8*20`(r3) + std $A[4][1],`8*21`(r3) + std $A[4][2],`8*22`(r3) + std $A[4][3],`8*23`(r3) + std $A[4][4],`8*24`(r3) + + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,1,0 + .long 0 +.size KeccakF1600,.-KeccakF1600 + +.type dword_le_load,\@function +.align 5 +dword_le_load: + lbzu r0,1(r3) + lbzu r4,1(r3) + lbzu r5,1(r3) + insrdi r0,r4,8,48 + lbzu r4,1(r3) + insrdi r0,r5,8,40 + lbzu r5,1(r3) + insrdi r0,r4,8,32 + lbzu r4,1(r3) + insrdi r0,r5,8,24 + 
lbzu r5,1(r3) + insrdi r0,r4,8,16 + lbzu r4,1(r3) + insrdi r0,r5,8,8 + insrdi r0,r4,8,0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,1,0 + .long 0 +.size dword_le_load,.-dword_le_load + +.globl SHA3_absorb +.type SHA3_absorb,\@function +.align 5 +SHA3_absorb: + $STU $sp,-$FRAME($sp) + mflr r0 + $PUSH r14,`$FRAME-$SIZE_T*18`($sp) + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + bl PICmeup + subi r4,r4,1 ; prepare for lbzu + subi r12,r12,8 ; prepare for ldu + + $PUSH r3,`$LOCALS+0*$SIZE_T`($sp) ; save A[][] + $PUSH r4,`$LOCALS+1*$SIZE_T`($sp) ; save inp + $PUSH r5,`$LOCALS+2*$SIZE_T`($sp) ; save len + $PUSH r6,`$LOCALS+3*$SIZE_T`($sp) ; save bsz + mr r0,r6 + $PUSH r12,`$LOCALS+4*$SIZE_T`($sp) + + ld $A[0][0],`8*0`(r3) ; load A[5][5] + ld $A[0][1],`8*1`(r3) + ld $A[0][2],`8*2`(r3) + ld $A[0][3],`8*3`(r3) + ld $A[0][4],`8*4`(r3) + ld $A[1][0],`8*5`(r3) + ld $A[1][1],`8*6`(r3) + ld $A[1][2],`8*7`(r3) + ld $A[1][3],`8*8`(r3) + ld $A[1][4],`8*9`(r3) + ld $A[2][0],`8*10`(r3) + ld $A[2][1],`8*11`(r3) + ld $A[2][2],`8*12`(r3) + ld $A[2][3],`8*13`(r3) + ld $A[2][4],`8*14`(r3) + ld $A[3][0],`8*15`(r3) + ld $A[3][1],`8*16`(r3) + ld $A[3][2],`8*17`(r3) + ld $A[3][3],`8*18`(r3) + ld $A[3][4],`8*19`(r3) + ld $A[4][0],`8*20`(r3) + ld $A[4][1],`8*21`(r3) + ld $A[4][2],`8*22`(r3) + ld $A[4][3],`8*23`(r3) + ld $A[4][4],`8*24`(r3) + + mr r3,r4 + mr r4,r5 + mr r5,r0 + + b .Loop_absorb + +.align 4 +.Loop_absorb: + $UCMP r4,r5 ; len < bsz? 
+ blt .Labsorbed + + sub r4,r4,r5 ; len -= bsz + srwi r5,r5,3 + $PUSH r4,`$LOCALS+2*$SIZE_T`($sp) ; save len + mtctr r5 + bl dword_le_load ; *inp++ + xor $A[0][0],$A[0][0],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[0][1],$A[0][1],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[0][2],$A[0][2],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[0][3],$A[0][3],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[0][4],$A[0][4],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[1][0],$A[1][0],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[1][1],$A[1][1],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[1][2],$A[1][2],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[1][3],$A[1][3],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[1][4],$A[1][4],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[2][0],$A[2][0],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[2][1],$A[2][1],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[2][2],$A[2][2],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[2][3],$A[2][3],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[2][4],$A[2][4],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[3][0],$A[3][0],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[3][1],$A[3][1],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[3][2],$A[3][2],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[3][3],$A[3][3],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[3][4],$A[3][4],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[4][0],$A[4][0],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[4][1],$A[4][1],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[4][2],$A[4][2],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[4][3],$A[4][3],r0 + bdz .Lprocess_block + bl dword_le_load ; *inp++ + xor $A[4][4],$A[4][4],r0 + +.Lprocess_block: + $PUSH r3,`$LOCALS+1*$SIZE_T`($sp) ; save inp + + bl KeccakF1600_int + + $POP r0,`$LOCALS+4*$SIZE_T`($sp) ; pull iotas[24] + $POP r5,`$LOCALS+3*$SIZE_T`($sp) ; restore bsz + $POP r4,`$LOCALS+2*$SIZE_T`($sp) ; restore len + $POP r3,`$LOCALS+1*$SIZE_T`($sp) ; restore inp + addic r0,r0,`-8*24` ; rewind iotas + $PUSH r0,`$LOCALS+4*$SIZE_T`($sp) + + b .Loop_absorb + +.align 4 +.Labsorbed: + $POP r3,`$LOCALS+0*$SIZE_T`($sp) + std $A[0][0],`8*0`(r3) ; return A[5][5] + std $A[0][1],`8*1`(r3) + std $A[0][2],`8*2`(r3) + std $A[0][3],`8*3`(r3) + std $A[0][4],`8*4`(r3) + std $A[1][0],`8*5`(r3) + std $A[1][1],`8*6`(r3) + std $A[1][2],`8*7`(r3) + std $A[1][3],`8*8`(r3) + std $A[1][4],`8*9`(r3) + std $A[2][0],`8*10`(r3) + std $A[2][1],`8*11`(r3) + std $A[2][2],`8*12`(r3) + std $A[2][3],`8*13`(r3) + std $A[2][4],`8*14`(r3) + std $A[3][0],`8*15`(r3) + std $A[3][1],`8*16`(r3) + std $A[3][2],`8*17`(r3) + std $A[3][3],`8*18`(r3) + std $A[3][4],`8*19`(r3) + std $A[4][0],`8*20`(r3) + std $A[4][1],`8*21`(r3) + std $A[4][2],`8*22`(r3) + std $A[4][3],`8*23`(r3) + std $A[4][4],`8*24`(r3) + + mr r3,r4 ; return value + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP 
r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,4,0 + .long 0 +.size SHA3_absorb,.-SHA3_absorb +___ +{ +my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31)); +$code.=<<___; +.globl SHA3_squeeze +.type SHA3_squeeze,\@function +.align 5 +SHA3_squeeze: + $STU $sp,`-10*$SIZE_T`($sp) + mflr r0 + $PUSH r28,`6*$SIZE_T`($sp) + $PUSH r29,`7*$SIZE_T`($sp) + $PUSH r30,`8*$SIZE_T`($sp) + $PUSH r31,`9*$SIZE_T`($sp) + $PUSH r0,`10*$SIZE_T+$LRSAVE`($sp) + + mr $A_flat,r3 + subi r3,r3,8 ; prepare for ldu + subi $out,r4,1 ; prepare for stbu + mr $len,r5 + mr $bsz,r6 + b .Loop_squeeze + +.align 4 +.Loop_squeeze: + ldu r0,8(r3) + ${UCMP}i $len,8 + blt .Lsqueeze_tail + + stbu r0,1($out) + srdi r0,r0,8 + stbu r0,1($out) + srdi r0,r0,8 + stbu r0,1($out) + srdi r0,r0,8 + stbu r0,1($out) + srdi r0,r0,8 + stbu r0,1($out) + srdi r0,r0,8 + stbu r0,1($out) + srdi r0,r0,8 + stbu r0,1($out) + srdi r0,r0,8 + stbu r0,1($out) + + subic. $len,$len,8 + beq .Lsqueeze_done + + subic. r6,r6,8 + bgt .Loop_squeeze + + mr r3,$A_flat + bl KeccakF1600 + subi r3,$A_flat,8 ; prepare for ldu + mr r6,$bsz + b .Loop_squeeze + +.align 4 +.Lsqueeze_tail: + mtctr $len +.Loop_tail: + stbu r0,1($out) + srdi r0,r0,8 + bdnz .Loop_tail + +.Lsqueeze_done: + $POP r0,`10*$SIZE_T+$LRSAVE`($sp) + $POP r28,`6*$SIZE_T`($sp) + $POP r29,`7*$SIZE_T`($sp) + $POP r30,`8*$SIZE_T`($sp) + $POP r31,`9*$SIZE_T`($sp) + mtlr r0 + addi $sp,$sp,`10*$SIZE_T` + blr + .long 0 + .byte 0,12,4,1,0x80,4,4,0 + .long 0 +.size SHA3_squeeze,.-SHA3_squeeze +___ +} + +# Ugly hack here, because PPC assembler syntax seems to vary too +# much from platform to platform... +$code.=<<___; +.align 6 +PICmeup: + mflr r0 + bcl 20,31,\$+4 + mflr r12 ; vvvvvv "distance" between . and 1st data entry + addi r12,r12,`64-8` + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` +.type iotas,\@object +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.size iotas,.-iotas +.asciz "Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/crypto/sha/asm/keccak1600-s390x.pl b/crypto/sha/asm/keccak1600-s390x.pl new file mode 100755 index 000000000000..3bce19be9ea4 --- /dev/null +++ b/crypto/sha/asm/keccak1600-s390x.pl @@ -0,0 +1,560 @@ +#!/usr/bin/env perl +# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License.
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Keccak-1600 for s390x. +# +# June 2017. +# +# The code below is a [lane-complementing] KECCAK_2X implementation (see +# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though +# instead of actually unrolling the loop pair-wise I simply flip +# pointers to T[][] and A[][] at the end of each round. Since the number +# of rounds is even, the last round writes to A[][] and everything works +# out. In a nutshell it's a transliteration of the x86_64 module, because +# both architectures have similar capabilities/limitations. Performance +# measurement is problematic as I don't have access to an idle system. +# It looks like z13 processes one byte [out of a long message] in ~14 +# cycles. At least the result is consistent with an estimate based on +# instruction count and assumed instruction issue rate. It's ~2.5x +# faster than compiler-generated code. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20)); + +my @C = map("%r$_",(0,1,5..7)); +my @D = map("%r$_",(8..12)); +my @T = map("%r$_",(13..14)); +my ($src,$dst,$iotas) = map("%r$_",(2..4)); +my $sp = "%r15"; + +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+25*8; + +my @rhotates = ([ 0, 1, 62, 28, 27 ], + [ 36, 44, 6, 55, 20 ], + [ 3, 10, 43, 25, 39 ], + [ 41, 45, 15, 21, 8 ], + [ 18, 2, 61, 56, 14 ]); + +{ my @C = @C; # copy, because we mess them up...
+ my @D = @D; + +$code.=<<___; +.text + +.type __KeccakF1600,\@function +.align 32 +__KeccakF1600: + st${g} %r14,$SIZE_T*14($sp) + lg @C[0],$A[4][0]($src) + lg @C[1],$A[4][1]($src) + lg @C[2],$A[4][2]($src) + lg @C[3],$A[4][3]($src) + lg @C[4],$A[4][4]($src) + larl $iotas,iotas + j .Loop + +.align 16 +.Loop: + lg @D[0],$A[0][0]($src) + lg @D[1],$A[1][1]($src) + lg @D[2],$A[2][2]($src) + lg @D[3],$A[3][3]($src) + + xgr @C[0],@D[0] + xg @C[1],$A[0][1]($src) + xg @C[2],$A[0][2]($src) + xg @C[3],$A[0][3]($src) + lgr @D[4],@C[4] + xg @C[4],$A[0][4]($src) + + xg @C[0],$A[1][0]($src) + xgr @C[1],@D[1] + xg @C[2],$A[1][2]($src) + xg @C[3],$A[1][3]($src) + xg @C[4],$A[1][4]($src) + + xg @C[0],$A[2][0]($src) + xg @C[1],$A[2][1]($src) + xgr @C[2],@D[2] + xg @C[3],$A[2][3]($src) + xg @C[4],$A[2][4]($src) + + xg @C[0],$A[3][0]($src) + xg @C[1],$A[3][1]($src) + xg @C[2],$A[3][2]($src) + xgr @C[3],@D[3] + xg @C[4],$A[3][4]($src) + + lgr @T[0],@C[2] + rllg @C[2],@C[2],1 + xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0] + + rllg @C[0],@C[0],1 + xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3] + + rllg @C[3],@C[3],1 + xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1] + + rllg @C[1],@C[1],1 + xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4] + + rllg @C[4],@C[4],1 + xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2] +___ + (@D[0..4], @C) = (@C[1..4,0], @D); +$code.=<<___; + xgr @C[1],@D[1] + xgr @C[2],@D[2] + xgr @C[3],@D[3] + rllg @C[1],@C[1],$rhotates[1][1] + xgr @C[4],@D[4] + rllg @C[2],@C[2],$rhotates[2][2] + xgr @C[0],@D[0] + + lgr @T[0],@C[1] + ogr @C[1],@C[2] + rllg @C[3],@C[3],$rhotates[3][3] + xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2]) + rllg @C[4],@C[4],$rhotates[4][4] + xg @C[1],0($iotas) + la $iotas,8($iotas) + stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i] + + lgr @T[1],@C[4] + ngr @C[4],@C[3] + lghi @C[1],-1 # no 'not' instruction :-( + xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3]) + xgr @C[2],@C[1] # not @C[2] + stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3]) + ogr @C[2],@C[3] + xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3]) + + ngr @T[0],@C[0] + stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3]) + xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0]) + ogr @T[1],@C[0] + stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0]) + xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0]) + stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0]) + + + lg @C[0],$A[0][3]($src) + lg @C[4],$A[4][2]($src) + lg @C[3],$A[3][1]($src) + lg @C[1],$A[1][4]($src) + lg @C[2],$A[2][0]($src) + + xgr @C[0],@D[3] + xgr @C[4],@D[2] + rllg @C[0],@C[0],$rhotates[0][3] + xgr @C[3],@D[1] + rllg @C[4],@C[4],$rhotates[4][2] + xgr @C[1],@D[4] + rllg @C[3],@C[3],$rhotates[3][1] + xgr @C[2],@D[0] + + lgr @T[0],@C[0] + ogr @C[0],@C[4] + rllg @C[1],@C[1],$rhotates[1][4] + xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4]) + rllg @C[2],@C[2],$rhotates[2][0] + stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4]) + + lgr @T[1],@C[1] + ngr @C[1],@T[0] + lghi @C[0],-1 # no 'not' instruction :-( + xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0]) + xgr @C[4],@C[0] # not @C[4] + stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0]) + + ogr @C[4],@C[3] + xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3]) + + ngr @C[3],@C[2] + stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3]) + xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2]) + ogr @T[1],@C[2] + stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2]) + xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2]) + stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2]) + + + lg @C[2],$A[2][3]($src) + lg @C[3],$A[3][4]($src) + lg 
@C[1],$A[1][2]($src) + lg @C[4],$A[4][0]($src) + lg @C[0],$A[0][1]($src) + + xgr @C[2],@D[3] + xgr @C[3],@D[4] + rllg @C[2],@C[2],$rhotates[2][3] + xgr @C[1],@D[2] + rllg @C[3],@C[3],$rhotates[3][4] + xgr @C[4],@D[0] + rllg @C[1],@C[1],$rhotates[1][2] + xgr @C[0],@D[1] + + lgr @T[0],@C[2] + ngr @C[2],@C[3] + rllg @C[4],@C[4],$rhotates[4][0] + xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3]) + lghi @T[1],-1 # no 'not' instruction :-( + stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3]) + + xgr @C[3],@T[1] # not @C[3] + lgr @T[1],@C[4] + ngr @C[4],@C[3] + rllg @C[0],@C[0],$rhotates[0][1] + xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3]) + ogr @T[0],@C[1] + stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3]) + xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1]) + + ngr @C[1],@C[0] + stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1]) + xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0]) + ogr @C[0],@T[1] + stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0]) + xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4]) + stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4]) + + + lg @C[2],$A[2][1]($src) + lg @C[3],$A[3][2]($src) + lg @C[1],$A[1][0]($src) + lg @C[4],$A[4][3]($src) + lg @C[0],$A[0][4]($src) + + xgr @C[2],@D[1] + xgr @C[3],@D[2] + rllg @C[2],@C[2],$rhotates[2][1] + xgr @C[1],@D[0] + rllg @C[3],@C[3],$rhotates[3][2] + xgr @C[4],@D[3] + rllg @C[1],@C[1],$rhotates[1][0] + xgr @C[0],@D[4] + rllg @C[4],@C[4],$rhotates[4][3] + + lgr @T[0],@C[2] + ogr @C[2],@C[3] + lghi @T[1],-1 # no 'not' instruction :-( + xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3]) + xgr @C[3],@T[1] # not @C[3] + stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3]) + + lgr @T[1],@C[4] + ogr @C[4],@C[3] + rllg @C[0],@C[0],$rhotates[0][4] + xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3]) + ngr @T[0],@C[1] + stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3]) + xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1]) + + ogr @C[1],@C[0] + stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1]) + xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0]) + ngr @C[0],@T[1] + stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0]) + xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4]) + stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4]) + + + xg @D[2],$A[0][2]($src) + xg @D[3],$A[1][3]($src) + xg @D[1],$A[4][1]($src) + xg @D[4],$A[2][4]($src) + xgr $dst,$src # xchg $dst,$src + rllg @D[2],@D[2],$rhotates[0][2] + xg @D[0],$A[3][0]($src) + rllg @D[3],@D[3],$rhotates[1][3] + xgr $src,$dst + rllg @D[1],@D[1],$rhotates[4][1] + xgr $dst,$src + rllg @D[4],@D[4],$rhotates[2][4] +___ + @C = @D[2..4,0,1]; +$code.=<<___; + lgr @T[0],@C[0] + ngr @C[0],@C[1] + lghi @T[1],-1 # no 'not' instruction :-( + xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1]) + xgr @C[1],@T[1] # not @C[1] + stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1]) + + lgr @T[1],@C[2] + ngr @C[2],@C[1] + rllg @D[0],@D[0],$rhotates[3][0] + xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1]) + ogr @T[0],@C[4] + stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1]) + xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4]) + + ngr @C[4],@C[3] + stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4]) + xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3]) + ogr @C[3],@T[1] + stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3]) + xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3]) + + lgr @C[1],@C[0] # harmonize with the loop top + lgr @C[0],@T[0] + stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3]) + + tmll $iotas,255 + jnz .Loop + + l${g} %r14,$SIZE_T*14($sp) + br %r14 +.size __KeccakF1600,.-__KeccakF1600 +___ +} +{ +$code.=<<___; +.type 
KeccakF1600,\@function +.align 32 +KeccakF1600: +.LKeccakF1600: + lghi %r1,-$frame + stm${g} %r6,%r15,$SIZE_T*6($sp) + lgr %r0,$sp + la $sp,0(%r1,$sp) + st${g} %r0,0($sp) + + lghi @D[0],-1 # no 'not' instruction :-( + lghi @D[1],-1 + lghi @D[2],-1 + lghi @D[3],-1 + lghi @D[4],-1 + lghi @T[0],-1 + xg @D[0],$A[0][1]($src) + xg @D[1],$A[0][2]($src) + xg @D[2],$A[1][3]($src) + xg @D[3],$A[2][2]($src) + xg @D[4],$A[3][2]($src) + xg @T[0],$A[4][0]($src) + stmg @D[0],@D[1],$A[0][1]($src) + stg @D[2],$A[1][3]($src) + stg @D[3],$A[2][2]($src) + stg @D[4],$A[3][2]($src) + stg @T[0],$A[4][0]($src) + + la $dst,$stdframe($sp) + + bras %r14,__KeccakF1600 + + lghi @D[0],-1 # no 'not' instruction :-( + lghi @D[1],-1 + lghi @D[2],-1 + lghi @D[3],-1 + lghi @D[4],-1 + lghi @T[0],-1 + xg @D[0],$A[0][1]($src) + xg @D[1],$A[0][2]($src) + xg @D[2],$A[1][3]($src) + xg @D[3],$A[2][2]($src) + xg @D[4],$A[3][2]($src) + xg @T[0],$A[4][0]($src) + stmg @D[0],@D[1],$A[0][1]($src) + stg @D[2],$A[1][3]($src) + stg @D[3],$A[2][2]($src) + stg @D[4],$A[3][2]($src) + stg @T[0],$A[4][0]($src) + + lm${g} %r6,%r15,$frame+6*$SIZE_T($sp) + br %r14 +.size KeccakF1600,.-KeccakF1600 +___ +} +{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5)); + +$code.=<<___; +.globl SHA3_absorb +.type SHA3_absorb,\@function +.align 32 +SHA3_absorb: + lghi %r1,-$frame + stm${g} %r5,%r15,$SIZE_T*5($sp) + lgr %r0,$sp + la $sp,0(%r1,$sp) + st${g} %r0,0($sp) + + lghi @D[0],-1 # no 'not' instruction :-( + lghi @D[1],-1 + lghi @D[2],-1 + lghi @D[3],-1 + lghi @D[4],-1 + lghi @T[0],-1 + xg @D[0],$A[0][1]($src) + xg @D[1],$A[0][2]($src) + xg @D[2],$A[1][3]($src) + xg @D[3],$A[2][2]($src) + xg @D[4],$A[3][2]($src) + xg @T[0],$A[4][0]($src) + stmg @D[0],@D[1],$A[0][1]($src) + stg @D[2],$A[1][3]($src) + stg @D[3],$A[2][2]($src) + stg @D[4],$A[3][2]($src) + stg @T[0],$A[4][0]($src) + +.Loop_absorb: + cl${g}r $len,$bsz + jl .Ldone_absorb + + srl${g} $bsz,3 + la %r1,0($A_flat) + +.Lblock_absorb: + lrvg %r0,0($inp) + la $inp,8($inp) + xg %r0,0(%r1) + la %r1,8(%r1) + a${g}hi $len,-8 + stg %r0,-8(%r1) + brct $bsz,.Lblock_absorb + + stm${g} $inp,$len,$frame+3*$SIZE_T($sp) + la $dst,$stdframe($sp) + bras %r14,__KeccakF1600 + lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp) + j .Loop_absorb + +.align 16 +.Ldone_absorb: + lghi @D[0],-1 # no 'not' instruction :-( + lghi @D[1],-1 + lghi @D[2],-1 + lghi @D[3],-1 + lghi @D[4],-1 + lghi @T[0],-1 + xg @D[0],$A[0][1]($src) + xg @D[1],$A[0][2]($src) + xg @D[2],$A[1][3]($src) + xg @D[3],$A[2][2]($src) + xg @D[4],$A[3][2]($src) + xg @T[0],$A[4][0]($src) + stmg @D[0],@D[1],$A[0][1]($src) + stg @D[2],$A[1][3]($src) + stg @D[3],$A[2][2]($src) + stg @D[4],$A[3][2]($src) + stg @T[0],$A[4][0]($src) + + lgr %r2,$len # return value + + lm${g} %r6,%r15,$frame+6*$SIZE_T($sp) + br %r14 +.size SHA3_absorb,.-SHA3_absorb +___ +} +{ my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5)); + +$code.=<<___; +.globl SHA3_squeeze +.type SHA3_squeeze,\@function +.align 32 +SHA3_squeeze: + srl${g} $bsz,3 + st${g} %r14,2*$SIZE_T($sp) + lghi %r14,8 + st${g} $bsz,5*$SIZE_T($sp) + la %r1,0($A_flat) + + j .Loop_squeeze + +.align 16 +.Loop_squeeze: + cl${g}r $len,%r14 + jl .Ltail_squeeze + + lrvg %r0,0(%r1) + la %r1,8(%r1) + stg %r0,0($out) + la $out,8($out) + a${g}hi $len,-8 # len -= 8 + jz .Ldone_squeeze + + brct $bsz,.Loop_squeeze # bsz-- + + stm${g} $out,$len,3*$SIZE_T($sp) + bras %r14,.LKeccakF1600 + lm${g} $out,$bsz,3*$SIZE_T($sp) + lghi %r14,8 + la %r1,0($A_flat) + j .Loop_squeeze + +.Ltail_squeeze: + lg %r0,0(%r1) +.Loop_tail_squeeze: + stc %r0,0($out) + la 
$out,1($out) + srlg %r0,8 + brct $len,.Loop_tail_squeeze + +.Ldone_squeeze: + l${g} %r14,2*$SIZE_T($sp) + br %r14 +.size SHA3_squeeze,.-SHA3_squeeze +___ +} +$code.=<<___; +.align 256 + .quad 0,0,0,0,0,0,0,0 +.type iotas,\@object +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.size iotas,.-iotas +.asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>" +___ + +# unlike the 32-bit shift, the 64-bit one takes three arguments +$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm; + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/keccak1600-x86_64.pl b/crypto/sha/asm/keccak1600-x86_64.pl new file mode 100755 index 000000000000..42de5bf12344 --- /dev/null +++ b/crypto/sha/asm/keccak1600-x86_64.pl @@ -0,0 +1,607 @@ +#!/usr/bin/env perl +# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Keccak-1600 for x86_64. +# +# June 2017. +# +# The code below is a [lane-complementing] KECCAK_2X implementation (see +# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though +# instead of actually unrolling the loop pair-wise I simply flip +# pointers to T[][] and A[][] at the end of each round. Since the number +# of rounds is even, the last round writes to A[][] and everything works +# out. How does it compare to the x86_64 assembly module in the Keccak +# Code Package? Depending on processor it's either as fast or faster by +# up to 15%... +# +######################################################################## +# Numbers are cycles per processed byte out of a large message. +# +# r=1088(*) +# +# P4 25.8 +# Core 2 12.9 +# Westmere 13.7 +# Sandy Bridge 12.9(**) +# Haswell 9.6 +# Skylake 9.4 +# Silvermont 22.8 +# Goldmont 15.8 +# VIA Nano 17.3 +# Sledgehammer 13.3 +# Bulldozer 16.5 +# Ryzen 8.8 +# +# (*) Corresponds to SHA3-256. Improvement over compiler-generated +# code varies a lot; the most common coefficient is 15% in comparison +# to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x. +# (**) Sandy Bridge has a broken rotate instruction. Performance can be +# improved by 14% by replacing rotates with a double-precision +# shift with the same register as source and destination.
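For orientation, here is a minimal C sketch of the KECCAK_2X scheme that this header and the s390x one describe: each round reads one 25-lane buffer and writes the other, and the two pointers are flipped instead of unrolling the loop pair-wise; because the 24-round count is even, the final round lands back in A[][]. This is the plain formulation only — the modules additionally keep selected lanes complemented between rounds so that most of Chi's NOTs cancel, which is why their stores mix ORs and ANDs. rhotates[][] and iotas[] are the tables shown in the modules; KeccakRound and KeccakF1600 here are illustrative stand-ins, not OpenSSL's actual C code.

#include <stdint.h>

#define ROL64(v, n) (((v) << (n)) | ((v) >> ((64 - (n)) & 63)))

static const unsigned rhotates[5][5] = {
    {  0,  1, 62, 28, 27 },
    { 36, 44,  6, 55, 20 },
    {  3, 10, 43, 25, 39 },
    { 41, 45, 15, 21,  8 },
    { 18,  2, 61, 56, 14 }
};

static const uint64_t iotas[24] = {
    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
    0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
    0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
    0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
    0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
    0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};

/* One round: read A[][], write R[][] (Theta, Rho, Pi, Chi, Iota). */
static void KeccakRound(uint64_t R[5][5], uint64_t A[5][5], uint64_t iota)
{
    uint64_t C[5], D[5], B[5][5];
    int x, y;

    for (x = 0; x < 5; x++)     /* Theta: column parities */
        C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x];
    for (x = 0; x < 5; x++)     /* D[x] = ROL64(C[x+1],1) ^ C[x-1] */
        D[x] = ROL64(C[(x + 1) % 5], 1) ^ C[(x + 4) % 5];
    for (y = 0; y < 5; y++)     /* Theta applied, then Rho */
        for (x = 0; x < 5; x++)
            B[y][x] = ROL64(A[y][x] ^ D[x], rhotates[y][x]);
    for (y = 0; y < 5; y++)     /* Pi (R[y][x] pulls B[x][(x+3y)%5]) and Chi */
        for (x = 0; x < 5; x++)
            R[y][x] = B[x][(x + 3*y) % 5]
                    ^ (~B[(x + 1) % 5][(x + 1 + 3*y) % 5]
                       &  B[(x + 2) % 5][(x + 2 + 3*y) % 5]);
    R[0][0] ^= iota;            /* Iota */
}

/* 24 rounds, flipping src/dst instead of unrolling pair-wise;
 * the round count is even, so the result ends up back in A[][]. */
static void KeccakF1600(uint64_t A[5][5])
{
    uint64_t T[5][5];
    uint64_t (*src)[5] = A, (*dst)[5] = T, (*t)[5];
    int i;

    for (i = 0; i < 24; i++) {
        KeccakRound(dst, src, iotas[i]);
        t = src; src = dst; dst = t;
    }
}

The D[x] formula and the rhotates table match the per-lane comments in the modules (e.g. the s390x comment "D[1] = ROL64(C[2], 1) ^ C[0]").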
+ +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100, + 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20)); + +my @C = ("%rax","%rbx","%rcx","%rdx","%rbp"); +my @D = map("%r$_",(8..12)); +my @T = map("%r$_",(13..14)); +my $iotas = "%r15"; + +my @rhotates = ([ 0, 1, 62, 28, 27 ], + [ 36, 44, 6, 55, 20 ], + [ 3, 10, 43, 25, 39 ], + [ 41, 45, 15, 21, 8 ], + [ 18, 2, 61, 56, 14 ]); + +$code.=<<___; +.text + +.type __KeccakF1600,\@abi-omnipotent +.align 32 +__KeccakF1600: + mov $A[4][0](%rdi),@C[0] + mov $A[4][1](%rdi),@C[1] + mov $A[4][2](%rdi),@C[2] + mov $A[4][3](%rdi),@C[3] + mov $A[4][4](%rdi),@C[4] + jmp .Loop + +.align 32 +.Loop: + mov $A[0][0](%rdi),@D[0] + mov $A[1][1](%rdi),@D[1] + mov $A[2][2](%rdi),@D[2] + mov $A[3][3](%rdi),@D[3] + + xor $A[0][2](%rdi),@C[2] + xor $A[0][3](%rdi),@C[3] + xor @D[0], @C[0] + xor $A[0][1](%rdi),@C[1] + xor $A[1][2](%rdi),@C[2] + xor $A[1][0](%rdi),@C[0] + mov @C[4],@D[4] + xor $A[0][4](%rdi),@C[4] + + xor @D[2], @C[2] + xor $A[2][0](%rdi),@C[0] + xor $A[1][3](%rdi),@C[3] + xor @D[1], @C[1] + xor $A[1][4](%rdi),@C[4] + + xor $A[3][2](%rdi),@C[2] + xor $A[3][0](%rdi),@C[0] + xor $A[2][3](%rdi),@C[3] + xor $A[2][1](%rdi),@C[1] + xor $A[2][4](%rdi),@C[4] + + mov @C[2],@T[0] + rol \$1,@C[2] + xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0] + xor @D[3], @C[3] + + rol \$1,@C[0] + xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3] + xor $A[3][1](%rdi),@C[1] + + rol \$1,@C[3] + xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1] + xor $A[3][4](%rdi),@C[4] + + rol \$1,@C[1] + xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4] + + rol \$1,@C[4] + xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2] +___ + (@D[0..4], @C) = (@C[1..4,0], @D); +$code.=<<___; + xor @D[1],@C[1] + xor @D[2],@C[2] + rol \$$rhotates[1][1],@C[1] + xor @D[3],@C[3] + xor @D[4],@C[4] + rol \$$rhotates[2][2],@C[2] + xor @D[0],@C[0] + mov @C[1],@T[0] + rol \$$rhotates[3][3],@C[3] + or @C[2],@C[1] + xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2]) + rol \$$rhotates[4][4],@C[4] + + xor ($iotas),@C[1] + lea 8($iotas),$iotas + + mov @C[4],@T[1] + and @C[3],@C[4] + mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i] + xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3]) + not @C[2] + mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3]) + + or @C[3],@C[2] + mov $A[4][2](%rdi),@C[4] + xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3]) + mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3]) + + and @C[0],@T[0] + mov $A[1][4](%rdi),@C[1] + xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0]) + mov $A[2][0](%rdi),@C[2] + mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0]) + + or @C[0],@T[1] + mov $A[0][3](%rdi),@C[0] + xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0]) + mov $A[3][1](%rdi),@C[3] + mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0]) + + + xor @D[3],@C[0] + xor @D[2],@C[4] + rol \$$rhotates[0][3],@C[0] + xor @D[1],@C[3] + xor @D[4],@C[1] + rol \$$rhotates[4][2],@C[4] + rol \$$rhotates[3][1],@C[3] + xor @D[0],@C[2] + rol \$$rhotates[1][4],@C[1] + mov @C[0],@T[0] + or @C[4],@C[0] + rol \$$rhotates[2][0],@C[2] + + xor @C[3],@C[0] # C[3] ^ (C[0] | C[4]) + mov @C[0],$A[1][3](%rsi) # 
R[1][3] = C[3] ^ (C[0] | C[4]) + + mov @C[1],@T[1] + and @T[0],@C[1] + mov $A[0][1](%rdi),@C[0] + xor @C[4],@C[1] # C[4] ^ (C[1] & C[0]) + not @C[4] + mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0]) + + or @C[3],@C[4] + mov $A[1][2](%rdi),@C[1] + xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3]) + mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3]) + + and @C[2],@C[3] + mov $A[4][0](%rdi),@C[4] + xor @T[1],@C[3] # C[1] ^ (C[3] & C[2]) + mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2]) + + or @C[2],@T[1] + mov $A[2][3](%rdi),@C[2] + xor @T[0],@T[1] # C[0] ^ (C[1] | C[2]) + mov $A[3][4](%rdi),@C[3] + mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2]) + + + xor @D[3],@C[2] + xor @D[4],@C[3] + rol \$$rhotates[2][3],@C[2] + xor @D[2],@C[1] + rol \$$rhotates[3][4],@C[3] + xor @D[0],@C[4] + rol \$$rhotates[1][2],@C[1] + xor @D[1],@C[0] + rol \$$rhotates[4][0],@C[4] + mov @C[2],@T[0] + and @C[3],@C[2] + rol \$$rhotates[0][1],@C[0] + + not @C[3] + xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3]) + mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3]) + + mov @C[4],@T[1] + and @C[3],@C[4] + mov $A[2][1](%rdi),@C[2] + xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3]) + mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3]) + + or @C[1],@T[0] + mov $A[4][3](%rdi),@C[4] + xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1]) + mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1]) + + and @C[0],@C[1] + xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0]) + mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0]) + + or @C[0],@T[1] + mov $A[1][0](%rdi),@C[1] + xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4]) + mov $A[3][2](%rdi),@C[3] + mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4]) + + + mov $A[0][4](%rdi),@C[0] + + xor @D[1],@C[2] + xor @D[2],@C[3] + rol \$$rhotates[2][1],@C[2] + xor @D[0],@C[1] + rol \$$rhotates[3][2],@C[3] + xor @D[3],@C[4] + rol \$$rhotates[1][0],@C[1] + xor @D[4],@C[0] + rol \$$rhotates[4][3],@C[4] + mov @C[2],@T[0] + or @C[3],@C[2] + rol \$$rhotates[0][4],@C[0] + + not @C[3] + xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3]) + mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3]) + + mov @C[4],@T[1] + or @C[3],@C[4] + xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3]) + mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3]) + + and @C[1],@T[0] + xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1]) + mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1]) + + or @C[0],@C[1] + xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0]) + mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0]) + + and @T[1],@C[0] + xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4]) + mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4]) + + + xor $A[0][2](%rdi),@D[2] + xor $A[1][3](%rdi),@D[3] + rol \$$rhotates[0][2],@D[2] + xor $A[4][1](%rdi),@D[1] + rol \$$rhotates[1][3],@D[3] + xor $A[2][4](%rdi),@D[4] + rol \$$rhotates[4][1],@D[1] + xor $A[3][0](%rdi),@D[0] + xchg %rsi,%rdi + rol \$$rhotates[2][4],@D[4] + rol \$$rhotates[3][0],@D[0] +___ + @C = @D[2..4,0,1]; +$code.=<<___; + mov @C[0],@T[0] + and @C[1],@C[0] + not @C[1] + xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1]) + mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1]) + + mov @C[2],@T[1] + and @C[1],@C[2] + xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1]) + mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1]) + + or @C[4],@T[0] + xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4]) + mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4]) + + and @C[3],@C[4] + xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3]) + mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3]) + + or @T[1],@C[3] + xor 
@C[1],@C[3] # ~C[1] ^ ( C[2] | C[3]) + mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3]) + + mov @C[0],@C[1] # harmonize with the loop top + mov @T[0],@C[0] + + test \$255,$iotas + jnz .Loop + + lea -192($iotas),$iotas # rewind iotas + ret +.size __KeccakF1600,.-__KeccakF1600 + +.type KeccakF1600,\@abi-omnipotent +.align 32 +KeccakF1600: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + + lea 100(%rdi),%rdi # size optimization + sub \$200,%rsp +.cfi_adjust_cfa_offset 200 + + notq $A[0][1](%rdi) + notq $A[0][2](%rdi) + notq $A[1][3](%rdi) + notq $A[2][2](%rdi) + notq $A[3][2](%rdi) + notq $A[4][0](%rdi) + + lea iotas(%rip),$iotas + lea 100(%rsp),%rsi # size optimization + + call __KeccakF1600 + + notq $A[0][1](%rdi) + notq $A[0][2](%rdi) + notq $A[1][3](%rdi) + notq $A[2][2](%rdi) + notq $A[3][2](%rdi) + notq $A[4][0](%rdi) + lea -100(%rdi),%rdi # preserve A[][] + + add \$200,%rsp +.cfi_adjust_cfa_offset -200 + + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbp +.cfi_pop %rbp + pop %rbx +.cfi_pop %rbx + ret +.cfi_endproc +.size KeccakF1600,.-KeccakF1600 +___ + +{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx"); + ($A_flat,$inp) = ("%r8","%r9"); +$code.=<<___; +.globl SHA3_absorb +.type SHA3_absorb,\@function,4 +.align 32 +SHA3_absorb: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + + lea 100(%rdi),%rdi # size optimization + sub \$232,%rsp +.cfi_adjust_cfa_offset 232 + + mov %rsi,$inp + lea 100(%rsp),%rsi # size optimization + + notq $A[0][1](%rdi) + notq $A[0][2](%rdi) + notq $A[1][3](%rdi) + notq $A[2][2](%rdi) + notq $A[3][2](%rdi) + notq $A[4][0](%rdi) + lea iotas(%rip),$iotas + + mov $bsz,216-100(%rsi) # save bsz + +.Loop_absorb: + cmp $bsz,$len + jc .Ldone_absorb + + shr \$3,$bsz + lea -100(%rdi),$A_flat + +.Lblock_absorb: + mov ($inp),%rax + lea 8($inp),$inp + xor ($A_flat),%rax + lea 8($A_flat),$A_flat + sub \$8,$len + mov %rax,-8($A_flat) + sub \$1,$bsz + jnz .Lblock_absorb + + mov $inp,200-100(%rsi) # save inp + mov $len,208-100(%rsi) # save len + call __KeccakF1600 + mov 200-100(%rsi),$inp # pull inp + mov 208-100(%rsi),$len # pull len + mov 216-100(%rsi),$bsz # pull bsz + jmp .Loop_absorb + +.align 32 +.Ldone_absorb: + mov $len,%rax # return value + + notq $A[0][1](%rdi) + notq $A[0][2](%rdi) + notq $A[1][3](%rdi) + notq $A[2][2](%rdi) + notq $A[3][2](%rdi) + notq $A[4][0](%rdi) + + add \$232,%rsp +.cfi_adjust_cfa_offset -232 + + pop %r15 +.cfi_pop %r15 + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + pop %rbp +.cfi_pop %rbp + pop %rbx +.cfi_pop %rbx + ret +.cfi_endproc +.size SHA3_absorb,.-SHA3_absorb +___ +} +{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx"); + ($out,$len,$bsz) = ("%r12","%r13","%r14"); + +$code.=<<___; +.globl SHA3_squeeze +.type SHA3_squeeze,\@function,4 +.align 32 +SHA3_squeeze: +.cfi_startproc + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + + shr \$3,%rcx + mov $A_flat,%r8 + mov %rsi,$out + mov %rdx,$len + mov %rcx,$bsz + jmp .Loop_squeeze + +.align 32 +.Loop_squeeze: + cmp \$8,$len + jb .Ltail_squeeze + + mov (%r8),%rax + lea 8(%r8),%r8 + mov %rax,($out) + lea 8($out),$out + sub \$8,$len # len -= 8 + jz .Ldone_squeeze + + 
sub \$1,%rcx # bsz-- + jnz .Loop_squeeze + + call KeccakF1600 + mov $A_flat,%r8 + mov $bsz,%rcx + jmp .Loop_squeeze + +.Ltail_squeeze: + mov %r8, %rsi + mov $out,%rdi + mov $len,%rcx + .byte 0xf3,0xa4 # rep movsb + +.Ldone_squeeze: + pop %r14 +.cfi_pop %r14 + pop %r13 +.cfi_pop %r13 + pop %r12 +.cfi_pop %r12 + ret +.cfi_endproc +.size SHA3_squeeze,.-SHA3_squeeze +___ +} +$code.=<<___; +.align 256 + .quad 0,0,0,0,0,0,0,0 +.type iotas,\@object +iotas: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +.size iotas,.-iotas +.asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +___ + +foreach (split("\n",$code)) { + # Below replacement results in 11.2 on Sandy Bridge, 9.4 on + # Haswell, but it hurts other processors by up to 2-3-4x... + #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/; + # Below replacement results in 9.3 on Haswell [as well as + # on Ryzen, i.e. it *hurts* Ryzen]... + #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/; + + print $_, "\n"; +} + +close STDOUT; diff --git a/crypto/sha/asm/keccak1600p8-ppc.pl b/crypto/sha/asm/keccak1600p8-ppc.pl new file mode 100755 index 000000000000..de2bcd660a09 --- /dev/null +++ b/crypto/sha/asm/keccak1600p8-ppc.pl @@ -0,0 +1,850 @@ +#!/usr/bin/env perl +# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# Keccak-1600 for PowerISA 2.07. +# +# June 2017. +# +# This is a straightforward KECCAK_1X_ALT SIMD implementation, but with +# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral. +# A POWER8 processor spends 9.8 cycles to process one byte out of a +# large buffer for r=1088, which matches SHA3-256. This is 17% better +# than scalar PPC64 code. It should probably be noted that if POWER8's +# successor can achieve a higher scalar instruction issue rate, then +# this module will lose... And it does on POWER9 with 12.0 vs. 9.4.
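A small C sketch of the lane pairing behind this KECCAK_1X_ALT layout, as documented in the register-layout comment further below: each 128-bit vector carries the same column of two adjacent rows, row 4's lanes pair up with each other, and the odd 25th lane A[4][4] is broadcast into both halves of v12. The vec struct and pack_state are hypothetical illustrations of the index math, not a real VSX type or part of the module.

#include <stdint.h>

/* Hypothetical 2x64-bit container standing in for one 128-bit VSX
 * register; lo = lane from the even row, hi = lane from the odd row. */
typedef struct { uint64_t lo, hi; } vec;

/* Index math of the v0..v12 layout documented below:
 * v0..v4 <- A[0..1][0..4], v5..v9 <- A[2..3][0..4],
 * v10/v11 <- A[4][0..3] pairwise, v12 <- A[4][4] broadcast. */
static void pack_state(vec v[13], const uint64_t A[5][5])
{
    int x;

    for (x = 0; x < 5; x++) {
        v[x].lo     = A[0][x];  v[x].hi     = A[1][x];
        v[5 + x].lo = A[2][x];  v[5 + x].hi = A[3][x];
    }
    v[10].lo = A[4][0];  v[10].hi = A[4][1];
    v[11].lo = A[4][2];  v[11].hi = A[4][3];
    v[12].lo = A[4][4];  v[12].hi = A[4][4];  /* both halves */
}

With two rows packed per register, Theta's column XORs and Chi's vandc/vxor sequences in KeccakF1600_int below operate on two rows at once, which is where the SIMD advantage over the scalar PPC64 module comes from.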
+ +$flavour = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $UCMP ="cmpld"; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $LRSAVE =$SIZE_T; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; + $UCMP ="cmplw"; +} else { die "nonsense $flavour"; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; + +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload + +my $sp ="r1"; + +my $iotas = "r12"; + +######################################################################## +# Register layout: +# +# v0 A[0][0] A[1][0] +# v1 A[0][1] A[1][1] +# v2 A[0][2] A[1][2] +# v3 A[0][3] A[1][3] +# v4 A[0][4] A[1][4] +# +# v5 A[2][0] A[3][0] +# v6 A[2][1] A[3][1] +# v7 A[2][2] A[3][2] +# v8 A[2][3] A[3][3] +# v9 A[2][4] A[3][4] +# +# v10 A[4][0] A[4][1] +# v11 A[4][2] A[4][3] +# v12 A[4][4] A[4][4] +# +# v13..25 rhotates[][] +# v26..31 volatile +# +$code.=<<___; +.machine "any" +.text + +.type KeccakF1600_int,\@function +.align 5 +KeccakF1600_int: + li r0,24 + mtctr r0 + li r0,0 + b .Loop + +.align 4 +.Loop: + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta + vxor v26,v0, v5 ; A[0..1][0]^A[2..3][0] + vxor v27,v1, v6 ; A[0..1][1]^A[2..3][1] + vxor v28,v2, v7 ; A[0..1][2]^A[2..3][2] + vxor v29,v3, v8 ; A[0..1][3]^A[2..3][3] + vxor v30,v4, v9 ; A[0..1][4]^A[2..3][4] + vpermdi v31,v26,v27,0b00 ; A[0][0..1]^A[2][0..1] + vpermdi v26,v26,v27,0b11 ; A[1][0..1]^A[3][0..1] + vpermdi v27,v28,v29,0b00 ; A[0][2..3]^A[2][2..3] + vpermdi v28,v28,v29,0b11 ; A[1][2..3]^A[3][2..3] + vpermdi v29,v30,v30,0b10 ; A[1..0][4]^A[3..2][4] + vxor v26,v26,v31 ; C[0..1] + vxor v27,v27,v28 ; C[2..3] + vxor v28,v29,v30 ; C[4..4] + vspltisb v31,1 + vxor v26,v26,v10 ; C[0..1] ^= A[4][0..1] + vxor v27,v27,v11 ; C[2..3] ^= A[4][2..3] + vxor v28,v28,v12 ; C[4..4] ^= A[4][4..4], low! + + vrld v29,v26,v31 ; ROL64(C[0..1],1) + vrld v30,v27,v31 ; ROL64(C[2..3],1) + vrld v31,v28,v31 ; ROL64(C[4..4],1) + vpermdi v31,v31,v29,0b10 + vxor v26,v26,v30 ; C[0..1] ^= ROL64(C[2..3],1) + vxor v27,v27,v31 ; C[2..3] ^= ROL64(C[4..0],1) + vxor v28,v28,v29 ; C[4..4] ^= ROL64(C[0..1],1), low! 
+ + vpermdi v29,v26,v26,0b00 ; C[0..0] + vpermdi v30,v28,v26,0b10 ; C[4..0] + vpermdi v31,v28,v28,0b11 ; C[4..4] + vxor v1, v1, v29 ; A[0..1][1] ^= C[0..0] + vxor v6, v6, v29 ; A[2..3][1] ^= C[0..0] + vxor v10,v10,v30 ; A[4][0..1] ^= C[4..0] + vxor v0, v0, v31 ; A[0..1][0] ^= C[4..4] + vxor v5, v5, v31 ; A[2..3][0] ^= C[4..4] + + vpermdi v29,v27,v27,0b00 ; C[2..2] + vpermdi v30,v26,v26,0b11 ; C[1..1] + vpermdi v31,v26,v27,0b10 ; C[1..2] + vxor v3, v3, v29 ; A[0..1][3] ^= C[2..2] + vxor v8, v8, v29 ; A[2..3][3] ^= C[2..2] + vxor v2, v2, v30 ; A[0..1][2] ^= C[1..1] + vxor v7, v7, v30 ; A[2..3][2] ^= C[1..1] + vxor v11,v11,v31 ; A[4][2..3] ^= C[1..2] + + vpermdi v29,v27,v27,0b11 ; C[3..3] + vxor v4, v4, v29 ; A[0..1][4] ^= C[3..3] + vxor v9, v9, v29 ; A[2..3][4] ^= C[3..3] + vxor v12,v12,v29 ; A[4..4][4] ^= C[3..3] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho + vrld v26,v0, v13 ; v0 + vrld v1, v1, v14 + vrld v27,v2, v15 ; v2 + vrld v28,v3, v16 ; v3 + vrld v4, v4, v17 + vrld v5, v5, v18 + vrld v6, v6, v19 + vrld v29,v7, v20 ; v7 + vrld v8, v8, v21 + vrld v9, v9, v22 + vrld v10,v10,v23 + vrld v30,v11,v24 ; v11 + vrld v12,v12,v25 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi + vpermdi v0, v26,v28,0b00 ; [0][0] [1][0] < [0][0] [0][3] + vpermdi v2, v29,v5, 0b00 ; [0][2] [1][2] < [2][2] [2][0] + vpermdi v11,v9, v5, 0b01 ; [4][2] [4][3] < [2][4] [3][0] + vpermdi v5, v1, v4, 0b00 ; [2][0] [3][0] < [0][1] [0][4] + vpermdi v1, v1, v4, 0b11 ; [0][1] [1][1] < [1][1] [1][4] + vpermdi v3, v8, v6, 0b11 ; [0][3] [1][3] < [3][3] [3][1] + vpermdi v4, v12,v30,0b10 ; [0][4] [1][4] < [4][4] [4][2] + vpermdi v7, v8, v6, 0b00 ; [2][2] [3][2] < [2][3] [2][1] + vpermdi v6, v27,v26,0b11 ; [2][1] [3][1] < [1][2] [1][0] + vpermdi v8, v9, v29,0b11 ; [2][3] [3][3] < [3][4] [3][2] + vpermdi v12,v10,v10,0b11 ; [4][4] [4][4] < [4][1] [4][1] + vpermdi v9, v10,v30,0b01 ; [2][4] [3][4] < [4][0] [4][3] + vpermdi v10,v27,v28,0b01 ; [4][0] [4][1] < [0][2] [1][3] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota + lvx_u v31,$iotas,r0 ; iotas[index] + addic r0,r0,16 ; index++ + + vandc v26,v2, v1 ; (~A[0..1][1] & A[0..1][2]) + vandc v27,v3, v2 ; (~A[0..1][2] & A[0..1][3]) + vandc v28,v4, v3 ; (~A[0..1][3] & A[0..1][4]) + vandc v29,v0, v4 ; (~A[0..1][4] & A[0..1][0]) + vandc v30,v1, v0 ; (~A[0..1][0] & A[0..1][1]) + vxor v0, v0, v26 ; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2]) + vxor v1, v1, v27 ; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3]) + vxor v2, v2, v28 ; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4]) + vxor v3, v3, v29 ; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0]) + vxor v4, v4, v30 ; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1]) + + vandc v26,v7, v6 ; (~A[2..3][1] & A[2..3][2]) + vandc v27,v8, v7 ; (~A[2..3][2] & A[2..3][3]) + vandc v28,v9, v8 ; (~A[2..3][3] & A[2..3][4]) + vandc v29,v5, v9 ; (~A[2..3][4] & A[2..3][0]) + vandc v30,v6, v5 ; (~A[2..3][0] & A[2..3][1]) + vxor v5, v5, v26 ; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2]) + vxor v6, v6, v27 ; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3]) + vxor v7, v7, v28 ; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4]) + vxor v8, v8, v29 ; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0]) + vxor v9, v9, v30 ; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1]) + + vxor v0, v0, v31 ; A[0][0] ^= iotas[index++] + + vpermdi v26,v10,v11,0b10 ; A[4][1..2] + vpermdi v27,v12,v10,0b00 ; A[4][4..0] + vpermdi v28,v11,v12,0b10 ; A[4][3..4] + vpermdi v29,v10,v10,0b10 ; A[4][1..0] + vandc v26,v11,v26 ; (~A[4][1..2] & A[4][2..3]) + vandc v27,v27,v28 ; (~A[4][3..4] & A[4][4..0]) + vandc v28,v10,v29 ; (~A[4][1..0] & A[4][0..1]) + vxor v10,v10,v26 ; A[4][0..1] 
^= (~A[4][1..2] & A[4][2..3]) + vxor v11,v11,v27 ; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0]) + vxor v12,v12,v28 ; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0]) + + bdnz .Loop + + vpermdi v12,v12,v12,0b11 ; broadcast A[4][4] + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.size KeccakF1600_int,.-KeccakF1600_int + +.type KeccakF1600,\@function +.align 5 +KeccakF1600: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mflr r8 + mfspr r7, 256 ; save vrsave + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r7,`$FRAME-4`($sp) ; save vrsave + li r0, -1 + $PUSH r8,`$FRAME+$LRSAVE`($sp) + mtspr 256, r0 ; preserve all AltiVec registers + + li r11,16 + lvx_4w v0,0,r3 ; load A[5][5] + li r10,32 + lvx_4w v1,r11,r3 + addi r11,r11,32 + lvx_4w v2,r10,r3 + addi r10,r10,32 + lvx_4w v3,r11,r3 + addi r11,r11,32 + lvx_4w v4,r10,r3 + addi r10,r10,32 + lvx_4w v5,r11,r3 + addi r11,r11,32 + lvx_4w v6,r10,r3 + addi r10,r10,32 + lvx_4w v7,r11,r3 + addi r11,r11,32 + lvx_4w v8,r10,r3 + addi r10,r10,32 + lvx_4w v9,r11,r3 + addi r11,r11,32 + lvx_4w v10,r10,r3 + addi r10,r10,32 + lvx_4w v11,r11,r3 + lvx_splt v12,r10,r3 + + bl PICmeup + + li r11,16 + lvx_u v13,0,r12 ; load rhotates + li r10,32 + lvx_u v14,r11,r12 + addi r11,r11,32 + lvx_u v15,r10,r12 + addi r10,r10,32 + lvx_u v16,r11,r12 + addi r11,r11,32 + lvx_u v17,r10,r12 + addi r10,r10,32 + lvx_u v18,r11,r12 + addi r11,r11,32 + lvx_u v19,r10,r12 + addi r10,r10,32 + lvx_u v20,r11,r12 + addi r11,r11,32 + lvx_u v21,r10,r12 + addi r10,r10,32 + lvx_u v22,r11,r12 + addi r11,r11,32 + lvx_u v23,r10,r12 + addi r10,r10,32 + lvx_u v24,r11,r12 + lvx_u v25,r10,r12 + addi r12,r12,`16*16` ; points at iotas + + bl KeccakF1600_int + + li r11,16 + stvx_4w v0,0,r3 ; return A[5][5] + li r10,32 + stvx_4w v1,r11,r3 + addi r11,r11,32 + stvx_4w v2,r10,r3 + addi r10,r10,32 + stvx_4w v3,r11,r3 + addi r11,r11,32 + stvx_4w v4,r10,r3 + addi r10,r10,32 + stvx_4w v5,r11,r3 + addi r11,r11,32 + stvx_4w v6,r10,r3 + addi r10,r10,32 + stvx_4w v7,r11,r3 + addi r11,r11,32 + stvx_4w v8,r10,r3 + addi r10,r10,32 + stvx_4w v9,r11,r3 + addi r11,r11,32 + stvx_4w v10,r10,r3 + addi r10,r10,32 + stvx_4w v11,r11,r3 + stvdx_u v12,r10,r3 + + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtlr r8 + mtspr 256, r7 ; restore vrsave + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,1,0x80,0,1,0 + .long 0 +.size KeccakF1600,.-KeccakF1600 +___ +{ +my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6)); + +$code.=<<___; +.globl SHA3_absorb +.type SHA3_absorb,\@function +.align 5 +SHA3_absorb: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mflr r8 + mfspr r7, 256 ; save vrsave + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx 
v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r7,`$FRAME-4`($sp) ; save vrsave + li r0, -1 + $PUSH r8,`$FRAME+$LRSAVE`($sp) + mtspr 256, r0 ; preserve all AltiVec registers + + li r11,16 + lvx_4w v0,0,$A_jagged ; load A[5][5] + li r10,32 + lvx_4w v1,r11,$A_jagged + addi r11,r11,32 + lvx_4w v2,r10,$A_jagged + addi r10,r10,32 + lvx_4w v3,r11,$A_jagged + addi r11,r11,32 + lvx_4w v4,r10,$A_jagged + addi r10,r10,32 + lvx_4w v5,r11,$A_jagged + addi r11,r11,32 + lvx_4w v6,r10,$A_jagged + addi r10,r10,32 + lvx_4w v7,r11,$A_jagged + addi r11,r11,32 + lvx_4w v8,r10,$A_jagged + addi r10,r10,32 + lvx_4w v9,r11,$A_jagged + addi r11,r11,32 + lvx_4w v10,r10,$A_jagged + addi r10,r10,32 + lvx_4w v11,r11,$A_jagged + lvx_splt v12,r10,$A_jagged + + bl PICmeup + + li r11,16 + lvx_u v13,0,r12 ; load rhotates + li r10,32 + lvx_u v14,r11,r12 + addi r11,r11,32 + lvx_u v15,r10,r12 + addi r10,r10,32 + lvx_u v16,r11,r12 + addi r11,r11,32 + lvx_u v17,r10,r12 + addi r10,r10,32 + lvx_u v18,r11,r12 + addi r11,r11,32 + lvx_u v19,r10,r12 + addi r10,r10,32 + lvx_u v20,r11,r12 + addi r11,r11,32 + lvx_u v21,r10,r12 + addi r10,r10,32 + lvx_u v22,r11,r12 + addi r11,r11,32 + lvx_u v23,r10,r12 + addi r10,r10,32 + lvx_u v24,r11,r12 + lvx_u v25,r10,r12 + li r10,-32 + li r11,-16 + addi r12,r12,`16*16` ; points at iotas + b .Loop_absorb + +.align 4 +.Loop_absorb: + $UCMP $len,$bsz ; len < bsz? + blt .Labsorbed + + sub $len,$len,$bsz ; len -= bsz + srwi r0,$bsz,3 + mtctr r0 + + lvx_u v30,r10,r12 ; permutation masks + lvx_u v31,r11,r12 + ?vspltisb v27,7 ; prepare masks for byte swap + ?vxor v30,v30,v27 ; on big-endian + ?vxor v31,v31,v27 + + vxor v27,v27,v27 ; zero + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v0, v0, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v1, v1, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v2, v2, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v3, v3, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v4, v4, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v0, v0, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v1, v1, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v2, v2, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v3, v3, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v4, v4, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v5, v5, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v6, v6, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v7, v7, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v8, v8, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v9, v9, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v5, v5, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi 
$inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v6, v6, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v7, v7, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v8, v8, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v9, v9, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v10, v10, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v10, v10, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v30 + vxor v11, v11, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v11, v11, v26 + bdz .Lprocess_block + lvdx_u v26,0,$inp + addi $inp,$inp,8 + vperm v26,v26,v27,v31 + vxor v12, v12, v26 + +.Lprocess_block: + bl KeccakF1600_int + + b .Loop_absorb + +.align 4 +.Labsorbed: + li r11,16 + stvx_4w v0,0,$A_jagged ; return A[5][5] + li r10,32 + stvx_4w v1,r11,$A_jagged + addi r11,r11,32 + stvx_4w v2,r10,$A_jagged + addi r10,r10,32 + stvx_4w v3,r11,$A_jagged + addi r11,r11,32 + stvx_4w v4,r10,$A_jagged + addi r10,r10,32 + stvx_4w v5,r11,$A_jagged + addi r11,r11,32 + stvx_4w v6,r10,$A_jagged + addi r10,r10,32 + stvx_4w v7,r11,$A_jagged + addi r11,r11,32 + stvx_4w v8,r10,$A_jagged + addi r10,r10,32 + stvx_4w v9,r11,$A_jagged + addi r11,r11,32 + stvx_4w v10,r10,$A_jagged + addi r10,r10,32 + stvx_4w v11,r11,$A_jagged + stvdx_u v12,r10,$A_jagged + + mr r3,$len ; return value + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtlr r8 + mtspr 256, r7 ; restore vrsave + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,1,0x80,0,4,0 + .long 0 +.size SHA3_absorb,.-SHA3_absorb +___ +} +{ +my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6)); + +$code.=<<___; +.globl SHA3_squeeze +.type SHA3_squeeze,\@function +.align 5 +SHA3_squeeze: + mflr r9 ; r9 is not touched by KeccakF1600 + subi $out,$out,1 ; prepare for stbu + addi r8,$A_jagged,4 ; prepare volatiles + mr r10,$bsz + li r11,0 + b .Loop_squeeze +.align 4 +.Loop_squeeze: + lwzx r7,r11,r8 ; lo + lwzx r0,r11,$A_jagged ; hi + ${UCMP}i $len,8 + blt .Lsqueeze_tail + + stbu r7,1($out) ; write lo + srwi r7,r7,8 + stbu r7,1($out) + srwi r7,r7,8 + stbu r7,1($out) + srwi r7,r7,8 + stbu r7,1($out) + stbu r0,1($out) ; write hi + srwi r0,r0,8 + stbu r0,1($out) + srwi r0,r0,8 + stbu r0,1($out) + srwi r0,r0,8 + stbu r0,1($out) + + subic. $len,$len,8 + beqlr ; return if done + + subic. r10,r10,8 + ble .Loutput_expand + + addi r11,r11,16 ; calculate jagged index + cmplwi r11,`16*5` + blt .Loop_squeeze + subi r11,r11,72 + beq .Loop_squeeze + addi r11,r11,72 + cmplwi r11,`16*5+8` + subi r11,r11,8 + beq .Loop_squeeze + addi r11,r11,8 + cmplwi r11,`16*10` + subi r11,r11,72 + beq .Loop_squeeze + addi r11,r11,72 + blt .Loop_squeeze + subi r11,r11,8 + b .Loop_squeeze + +.align 4 +.Loutput_expand: + bl KeccakF1600 + mtlr r9 + + addi r8,$A_jagged,4 ; restore volatiles + mr r10,$bsz + li r11,0 + b .Loop_squeeze + +.align 4 +.Lsqueeze_tail: + mtctr $len + subic. 
$len,$len,4 + ble .Loop_tail_lo + li r8,4 + mtctr r8 +.Loop_tail_lo: + stbu r7,1($out) + srdi r7,r7,8 + bdnz .Loop_tail_lo + ble .Lsqueeze_done + mtctr $len +.Loop_tail_hi: + stbu r0,1($out) + srdi r0,r0,8 + bdnz .Loop_tail_hi + +.Lsqueeze_done: + blr + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 +.size SHA3_squeeze,.-SHA3_squeeze +___ +} +$code.=<<___; +.align 6 +PICmeup: + mflr r0 + bcl 20,31,\$+4 + mflr r12 ; vvvvvv "distance" between . and 1st data entry + addi r12,r12,`64-8` + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` +.type rhotates,\@object +.align 6 +rhotates: + .quad 0, 36 + .quad 1, 44 + .quad 62, 6 + .quad 28, 55 + .quad 27, 20 + .quad 3, 41 + .quad 10, 45 + .quad 43, 15 + .quad 25, 21 + .quad 39, 8 + .quad 18, 2 + .quad 61, 56 + .quad 14, 14 +.size rhotates,.-rhotates + .quad 0,0 + .quad 0x0001020304050607,0x1011121314151617 + .quad 0x1011121314151617,0x0001020304050607 +.type iotas,\@object +iotas: + .quad 0x0000000000000001,0 + .quad 0x0000000000008082,0 + .quad 0x800000000000808a,0 + .quad 0x8000000080008000,0 + .quad 0x000000000000808b,0 + .quad 0x0000000080000001,0 + .quad 0x8000000080008081,0 + .quad 0x8000000000008009,0 + .quad 0x000000000000008a,0 + .quad 0x0000000000000088,0 + .quad 0x0000000080008009,0 + .quad 0x000000008000000a,0 + .quad 0x000000008000808b,0 + .quad 0x800000000000008b,0 + .quad 0x8000000000008089,0 + .quad 0x8000000000008003,0 + .quad 0x8000000000008002,0 + .quad 0x8000000000000080,0 + .quad 0x000000000000800a,0 + .quad 0x800000008000000a,0 + .quad 0x8000000080008081,0 + .quad 0x8000000000008080,0 + .quad 0x0000000080000001,0 + .quad 0x8000000080008008,0 +.size iotas,.-iotas +.asciz "Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>" +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + if ($flavour =~ /le$/) { # little-endian + s/\?([a-z]+)/;$1/; + } else { # big-endian + s/\?([a-z]+)/$1/; + } + + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl index e0b5d83b6201..9d4ff7f39a52 100644 --- a/crypto/sha/asm/sha1-586.pl +++ b/crypto/sha/asm/sha1-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 1998-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -28,10 +35,9 @@ # P4 +85%(!) +45% # # As you can see Pentium came out as looser:-( Yet I reckoned that -# improvement on P4 outweights the loss and incorporate this +# improvement on P4 outweighs the loss and incorporate this # re-tuned code to 0.9.7 and later. # ---------------------------------------------------------------- -# <appro@fy.chalmers.se> # August 2009. # @@ -97,10 +103,12 @@ # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% # Haswell 6.5 4.3/+51% 4.1(**)/+58% +# Skylake 6.4 4.1/+55% 4.1(**)/+55% # Bulldozer 11.6 6.0/+92% # VIA Nano 10.6 7.5/+41% # Atom 12.5 9.3(*)/+35% # Silvermont 14.5 9.9(*)/+46% +# Goldmont 8.8 6.7/+30% 1.7(***)/+415% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # The discrepancy is because of front-end limitations, so @@ -108,12 +116,17 @@ # limited parallelism. 
# # (**) As per above comment, the result is for AVX *plus* sh[rl]d. +# +# (***) SHAEXT result $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$ymm=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -123,7 +136,7 @@ $ymm=1 if ($xmm && =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19); # first version supporting AVX -$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && +$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.03); # first version supporting AVX @@ -131,7 +144,7 @@ $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); # first version supporting AVX -$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && +$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0); # first version supporting AVX $shaext=$xmm; ### set to zero if compiling for 1.0.1 @@ -536,7 +549,7 @@ for($i=0;$i<20-4;$i+=2) { # being implemented in SSSE3). Once 8 quadruples or 32 elements are # collected, it switches to routine proposed by Max Locktyukhin. # -# Calculations inevitably require temporary reqisters, and there are +# Calculations inevitably require temporary registers, and there are # no %xmm registers left to spare. For this reason part of the ring # buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring # buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - @@ -647,7 +660,7 @@ my $_ror=sub { &ror(@_) }; &jmp (&label("loop")); ###################################################################### -# SSE instruction sequence is first broken to groups of indepentent +# SSE instruction sequence is first broken to groups of independent # instructions, independent in respect to their inputs and shifter # (not all architectures have more than one). Then IALU instructions # are "knitted in" between the SSE groups. Distance is maintained for @@ -656,14 +669,14 @@ my $_ror=sub { &ror(@_) }; # # Temporary registers usage. X[2] is volatile at the entry and at the # end is restored from backtrace ring buffer. X[3] is expected to -# contain current K_XX_XX constant and is used to caclulate X[-1]+K +# contain current K_XX_XX constant and is used to calculate X[-1]+K # from previous round, it becomes volatile the moment the value is # saved to stack for transfer to IALU. X[4] becomes volatile whenever # X[-4] is accumulated and offloaded to backtrace ring buffer, at the # end it is loaded with next K_XX_XX [which becomes X[3] in next # round]... 
# -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1186,7 +1199,7 @@ my $_ror=sub { &shrd(@_[0],@_) }; &and (@T[0],@T[1]); &jmp (&label("loop")); -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1474,3 +1487,5 @@ sub Xtail_avx() &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl index b2c30322c351..7ff5bfbba6cb 100755 --- a/crypto/sha/asm/sha1-armv4-large.pl +++ b/crypto/sha/asm/sha1-armv4-large.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -60,14 +67,28 @@ # is ~2.5x larger and there are some redundant instructions executed # when processing last block, improvement is not as big for smallest # blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per -# byte, which is also >80% faster than integer-only code. +# byte, which is also >80% faster than integer-only code. Cortex-A15 +# is even faster spending 5.6 cycles per byte outperforming integer- +# only code by factor of 2. # May 2014. # # Add ARMv8 code path performing at 2.35 cpb on Apple A7. 
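To put cycles-per-byte figures like these in perspective, throughput follows directly from cpb and clock rate; a back-of-envelope Perl sketch (the 1.4 GHz Apple A7 clock is an assumption, not a measurement from this module):

# bytes/s = clock / cpb; e.g. the ARMv8 path quoted at 2.35 cpb
my ($cpb, $ghz) = (2.35, 1.4);                    # assumed A7 clock
printf "~%.0f MB/s\n", $ghz * 1e9 / $cpb / 1e6;   # prints ~596 MB/s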
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $inp="r1"; @@ -167,7 +188,12 @@ $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) +.syntax unified +.thumb +#else .code 32 +#endif .global sha1_block_data_order .type sha1_block_data_order,%function @@ -175,9 +201,13 @@ $code=<<___; .align 5 sha1_block_data_order: #if __ARM_MAX_ARCH__>=7 - sub r3,pc,#8 @ sha1_block_data_order +.Lsha1_block: + adr r3,.Lsha1_block ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#ARMV8_SHA1 bne .LARMv8 tst r12,#ARMV7_NEON @@ -199,7 +229,12 @@ for($i=0;$i<5;$i++) { &BODY_00_15(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_00_15 @ [((11+4)*5+2)*3] sub sp,sp,#25*4 ___ @@ -218,7 +253,12 @@ for($i=0;$i<5;$i++) { &BODY_20_39(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp @ preserve carry +#endif bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes @@ -230,7 +270,12 @@ for($i=0;$i<5;$i++) { &BODY_40_59(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_40_59 @ [+((12+5)*5+2)*4] ldr $K,.LK_60_79 @@ -266,7 +311,7 @@ $code.=<<___; .LK_60_79: .word 0xca62c1d6 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha1_block_data_order +.word OPENSSL_armcap_P-.Lsha1_block #endif .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 5 @@ -441,6 +486,7 @@ sub Xuplast_80 () &teq ($inp,$len); &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX + &it ("eq"); &subeq ($inp,$inp,64); # reload last block to avoid SEGV &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); eval(shift(@insns)); @@ -491,12 +537,12 @@ sha1_block_data_order_neon: @ dmb @ errata #451034 on early Cortex A8 @ vstmdb sp!,{d8-d15} @ ABI specification says so mov $saved_sp,sp - sub sp,sp,#64 @ alloca + sub $Xfer,sp,#64 adr $K_XX_XX,.LK_00_19 - bic sp,sp,#15 @ align for 128-bit stores + bic $Xfer,$Xfer,#15 @ align for 128-bit stores ldmia $ctx,{$a,$b,$c,$d,$e} @ load context - mov $Xfer,sp + mov sp,$Xfer @ alloca vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned veor $zero,$zero,$zero @@ -543,10 +589,13 @@ $code.=<<___; add $b,$b,$t0 add $c,$c,$t1 add $d,$d,$Xfer + it eq moveq sp,$saved_sp add $e,$e,$Ki + it ne ldrne $Ki,[sp] stmia $ctx,{$a,$b,$c,$d,$e} + itt ne addne $Xfer,sp,#3*16 bne .Loop_neon @@ -567,6 +616,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); $code.=<<___; #if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + .type sha1_block_data_order_armv8,%function .align 5 sha1_block_data_order_armv8: @@ -660,7 +716,10 @@ ___ # since ARMv7 instructions are always encoded little-endian. 
# correct solution is to use .inst directive, but older # assemblers don't implement it:-( - sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + + # this fix-up provides Thumb encoding in conjunction with INST + $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000); + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl index c04432a54394..3ba871fedee6 100755 --- a/crypto/sha/asm/sha1-armv8.pl +++ b/crypto/sha/asm/sha1-armv8.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -18,13 +25,23 @@ # Cortex-A57 2.35 7.88 (+74%) # Denver 2.13 3.97 (+0%)(**) # X-Gene 8.80 (+200%) +# Mongoose 2.05 6.50 (+160%) +# Kryo 1.88 8.00 (+90%) # # (*) Software results are presented mostly for reference purposes. # (**) Keep in mind that Denver relies on binary translation, which # optimizes compiler output at run-time. $flavour = shift; -open STDOUT,">".shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; ($ctx,$inp,$num)=("x0","x1","x2"); @Xw=map("w$_",(3..17,19)); @@ -158,11 +175,16 @@ $code.=<<___; .text +.extern OPENSSL_armcap_P .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 sha1_block_data_order: +#ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +#else ldr x16,.LOPENSSL_armcap_P +#endif adr x17,.LOPENSSL_armcap_P add x16,x16,x17 ldr w16,[x16] @@ -300,7 +322,11 @@ $code.=<<___; .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 .LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else .quad OPENSSL_armcap_P-. +#endif .asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 .comm OPENSSL_armcap_P,4,4 diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64xplus.pl new file mode 100755 index 000000000000..4db2bcb06b31 --- /dev/null +++ b/crypto/sha/asm/sha1-c64xplus.pl @@ -0,0 +1,337 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA1 for C64x+. 
+# +# November 2011 +# +# If compared to compiler-generated code with similar characteristics, +# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs, +# this implementation is 25% smaller and >2x faster. In absolute terms +# performance is (quite impressive) ~6.5 cycles per processed byte. +# Fully unrolled assembler would be ~5x larger and is likely to be +# ~15% faster. It would be free from references to intermediate ring +# buffer, but put more pressure on L1P [both because the code would be +# larger and won't be using SPLOOP buffer]. There are no plans to +# realize fully unrolled variant though... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments + +($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25)); +($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27"); +($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31)); +($XPA,$XPB) = ("A5","B5"); # X circular buffer +($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM + +$code=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg sha1_block_data_order,_sha1_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg MV,SWAP2 + .asg MV,SWAP4 + .endif + + .global _sha1_block_data_order +_sha1_block_data_order: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] LDW *${CTX}[0],$A ; load A-E... +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + [A0] LDW *${CTX}[1],$B +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + [A0] LDW *${CTX}[2],$C +|| [A0] MVK 0x00404,B0 + [A0] LDW *${CTX}[3],$D +|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB] + [A0] LDW *${CTX}[4],$E +|| [A0] MVC B0,AMR ; setup circular addressing + LDNW *${INP}++,$TX1 ; pre-fetch input + NOP 1 + +loop?: + MVK 0x00007999,$K +|| ADDAW SP,2,$XPA +|| SUB A0,1,A0 +|| MVK 13,B0 + MVKH 0x5a820000,$K ; K_00_19 +|| ADDAW SP,2,$XPB +|| MV $A,$Actx +|| MV $B,$Bctx +;;================================================== + SPLOOPD 5 ; BODY_00_13 +|| MV $C,$Cctx +|| MV $D,$Dctx +|| MV $E,$Ectx +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX3 ; byte swap + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A + + ADD $TX3,$T,$A ; A=T+Xi +|| STW $TX3,*${XPB}++ + SPKERNEL +;;================================================== + ROTL $A,5,$Arot ; BODY_14 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 +|| LDNW *${INP}++,$TX1 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are +|| LDW *${XPB}[4],$X2 ; 2 iterations ahead + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ 
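; The AND/ANDN/XOR packets in these bodies implement the standard
; SHA-1 round functions; in scalar Perl (32-bit word inputs assumed):
;
;   sub F_00_19 { my ($b,$c,$d) = @_; ($b & $c) ^ (~$b & $d) }            # Ch
;   sub F_20_39 { my ($b,$c,$d) = @_; $b ^ $c ^ $d }                      # Parity
;   sub F_40_59 { my ($b,$c,$d) = @_; ($b & $c) ^ ($b & $d) ^ ($c & $d) } # Maj
;
; BODY_00_14 above materializes F_00_19 as AND(C,B) XOR ANDN(D,B),
; i.e. the Ch form with the complement absorbed into ANDN.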
+;;================================================== + ROTL $A,5,$Arot ; BODY_15 +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C +|| SWAP2 $TX1,$TX2 + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| SWAP4 $TX2,$TX2 ; byte swap +|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| MVK 3,B0 +;;================================================== + SPLOOPD 5 ; BODY_16_19 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| AND $C,$B,$F +|| ANDN $D,$B,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_00_19(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_00_19(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 + SPKERNEL + + MVK 0xffffeba1,$K +|| MVK 19,B0 + MVKH 0x6ed90000,$K ; K_20_39 +___ +sub BODY_20_39 { +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_20_39 +|| MVC B0,ILC + + ROTL $A,5,$Arot +|| XOR $B,$C,$F +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $D,$F,$F ; F_20_39(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ ; last one is redundant +|| XOR $TX0,$TX1,$TX1 + SPKERNEL +___ +$code.=<<___ if (!shift); + MVK 0xffffbcdc,$K + MVKH 0x8f1b0000,$K ; K_40_59 +___ +} &BODY_20_39(); +$code.=<<___; +;;================================================== + SPLOOPD 5 ; BODY_40_59 +|| MVC B0,ILC +|| AND $B,$C,$F +|| AND $B,$D,$F0 + + ROTL $A,5,$Arot +|| XOR $F0,$F,$F +|| AND $C,$D,$F0 +|| ADD $K,$E,$T ; T=E+K +|| ROTL $TX1,1,$TX2 ; Xupdate output + + XOR $F0,$F,$F ; F_40_59(B,C,D) +|| MV $D,$E ; E=D +|| MV $C,$D ; D=C + + ADD $F,$T,$T ; T+=F_40_59(B,C,D) +|| ROTL $B,30,$C ; C=ROL(B,30) +|| XOR $X0,$X2,$TX0 +|| LDW *${XPA}++,$X0 +|| LDW *${XPB}[4],$X2 + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| MV $A,$B ; B=A +|| XOR $X8,$X13,$TX1 +|| LDW *${XPA}[7],$X8 +|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13 +|| MV $TX2,$TX3 + + ADD $TX2,$T,$A ; A=T+Xi +|| STW $TX2,*${XPB}++ +|| XOR $TX0,$TX1,$TX1 +|| AND $B,$C,$F +|| AND $B,$D,$F0 + SPKERNEL + + MVK 0xffffc1d6,$K +|| MVK 18,B0 + MVKH 0xca620000,$K ; K_60_79 +___ + &BODY_20_39(-1); # BODY_60_78 +$code.=<<___; +;;================================================== + [A0] B loop? 
+|| ROTL $A,5,$Arot ; BODY_79 +|| XOR $B,$C,$F +|| ROTL $TX1,1,$TX2 ; Xupdate output + + [A0] LDNW *${INP}++,$TX1 ; pre-fetch input +|| ADD $K,$E,$T ; T=E+K +|| XOR $D,$F,$F ; F_20_39(B,C,D) + + ADD $F,$T,$T ; T+=F_20_39(B,C,D) +|| ADD $Ectx,$D,$E ; E=D,E+=Ectx +|| ADD $Dctx,$C,$D ; D=C,D+=Dctx +|| ROTL $B,30,$C ; C=ROL(B,30) + + ADD $Arot,$T,$T ; T+=ROL(A,5) +|| ADD $Bctx,$A,$B ; B=A,B+=Bctx + + ADD $TX2,$T,$A ; A=T+Xi + + ADD $Actx,$A,$A ; A+=Actx +|| ADD $Cctx,$C,$C ; C+=Cctx +;; end of loop? + + BNOP RA ; return +|| MV FP,SP ; restore stack pointer +|| LDW *FP[0],FP ; restore frame pointer + STW $A,*${CTX}[0] ; emit A-E... +|| MVK 0,B0 + STW $B,*${CTX}[1] +|| MVC B0,AMR ; clear AMR + STW $C,*${CTX}[2] + STW $D,*${CTX}[3] + STW $E,*${CTX}[4] + .endasmfunc + + .sect .const + .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-ia64.pl b/crypto/sha/asm/sha1-ia64.pl index 02d35d1614c1..bf1d2ebeb0ab 100644 --- a/crypto/sha/asm/sha1-ia64.pl +++ b/crypto/sha/asm/sha1-ia64.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -14,6 +21,8 @@ # Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which # is >50% better than HP C and >2x better than gcc. +$output = pop; + $code=<<___; .ident \"sha1-ia64.s, version 1.3\" .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\" @@ -301,5 +310,5 @@ $code.=<<___; stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>" ___ -$output=shift and open STDOUT,">$output"; +open STDOUT,">$output" if $output; print $code; diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl index a8d8708d4b75..443b649830f4 100755 --- a/crypto/sha/asm/sha1-mb-x86_64.pl +++ b/crypto/sha/asm/sha1-mb-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -19,6 +26,7 @@ # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% # Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% +# Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145% # Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% # # (i) multi-block CBC encrypt with 128-bit key; @@ -62,7 +70,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void sha1_multi_block ( @@ -87,7 +95,7 @@ $K="%xmm15"; if (1) { # Atom-specific optimization aiming to eliminate pshufb with high - # registers [and thus get rid of 48 cycles accumulated penalty] + # registers [and thus get rid of 48 cycles accumulated penalty] @Xi=map("%xmm$_",(0..4)); ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9)); @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14)); @@ -118,7 +126,7 @@ my $k=$i+2; # ... # $i==13: 14,15,15,15, # $i==14: 15 -# +# # Then at $i==15 Xupdate is applied one iteration in advance... $code.=<<___ if ($i==0); movd (@ptr[0]),@Xi[0] @@ -355,6 +363,7 @@ $code.=<<___; .type sha1_multi_block,\@function,3 .align 32 sha1_multi_block: +.cfi_startproc mov OPENSSL_ia32cap_P+4(%rip),%rcx bt \$61,%rcx # check SHA bit jc _shaext_shortcut @@ -365,8 +374,11 @@ $code.=<<___ if ($avx); ___ $code.=<<___; mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbx ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -385,6 +397,7 @@ $code.=<<___; sub \$`$REG_SZ*18`,%rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody: lea K_XX_XX(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx @@ -431,7 +444,7 @@ for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; movdqa (%rbx),@Xi[0] # pull counters mov \$1,%ecx - cmp 4*0(%rbx),%ecx # examinte counters + cmp 4*0(%rbx),%ecx # examine counters pxor $t2,$t2 cmovge $Tbl,@ptr[0] # cancel input cmp 4*1(%rbx),%ecx @@ -478,7 +491,8 @@ $code.=<<___; jnz .Loop_grande .Ldone: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp +.cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 @@ -494,10 +508,14 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: ret +.cfi_endproc .size sha1_multi_block,.-sha1_multi_block ___ {{{ @@ -509,10 +527,14 @@ $code.=<<___; .type sha1_multi_block_shaext,\@function,3 .align 32 sha1_multi_block_shaext: +.cfi_startproc _shaext_shortcut: mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -538,7 +560,7 @@ $code.=<<___; movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap .Loop_grande_shaext: - mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num + mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<2;$i++) { @@ -748,10 +770,14 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_shaext: ret 
+.cfi_endproc .size sha1_multi_block_shaext,.-sha1_multi_block_shaext ___ }}} @@ -994,6 +1020,7 @@ $code.=<<___; .type sha1_multi_block_avx,\@function,3 .align 32 sha1_multi_block_avx: +.cfi_startproc _avx_shortcut: ___ $code.=<<___ if ($avx>1); @@ -1008,8 +1035,11 @@ $code.=<<___ if ($avx>1); ___ $code.=<<___; mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -1028,6 +1058,7 @@ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx: lea K_XX_XX(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx @@ -1116,7 +1147,8 @@ $code.=<<___; jnz .Loop_grande_avx .Ldone_avx: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp +.cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); @@ -1133,10 +1165,14 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: ret +.cfi_endproc .size sha1_multi_block_avx,.-sha1_multi_block_avx ___ @@ -1156,14 +1192,22 @@ $code.=<<___; .type sha1_multi_block_avx2,\@function,3 .align 32 sha1_multi_block_avx2: +.cfi_startproc _avx2_shortcut: mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -1182,6 +1226,7 @@ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx2: lea K_XX_XX(%rip),$Tbl shr \$1,$num @@ -1271,7 +1316,8 @@ $code.=<<___; #jnz .Loop_grande_avx2 .Ldone_avx2: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp +.cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); @@ -1288,14 +1334,22 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: ret +.cfi_endproc .size sha1_multi_block_avx2,.-sha1_multi_block_avx2 ___ } }}} @@ -1454,10 +1508,10 @@ avx2_handler: mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore cotnext->R12 - mov %r13,224($context) # restore cotnext->R13 - mov %r14,232($context) # restore cotnext->R14 - mov %r15,240($context) # restore cotnext->R15 + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 lea -56-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl index 340849389993..08f84bc3b3d9 100755 --- a/crypto/sha/asm/sha1-mips.pl +++ b/crypto/sha/asm/sha1-mips.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -49,15 +56,15 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { - $PTR_ADD="dadd"; # incidentally works even on n32 - $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_ADD="daddu"; # incidentally works even on n32 + $PTR_SUB="dsubu"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { - $PTR_ADD="add"; - $PTR_SUB="sub"; + $PTR_ADD="addu"; + $PTR_SUB="subu"; $REG_S="sw"; $REG_L="lw"; $PTR_SLL="sll"; @@ -68,9 +75,9 @@ if ($flavour =~ /64|n32/i) { # ###################################################################### -$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); +$big_endian=(`echo MIPSEB | $ENV{CC} -E -`=~/MIPSEB/)?0:1 if ($ENV{CC}); -for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; if (!defined($big_endian)) @@ -119,10 +126,14 @@ $code.=<<___; addu $e,$K # $i xor $t0,$c,$d rotr $t1,$a,27 - lwl @X[$j],$j*4+$MSB($inp) and $t0,$b addu $e,$t1 +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lw @X[$j],$j*4($inp) +#else + lwl @X[$j],$j*4+$MSB($inp) lwr @X[$j],$j*4+$LSB($inp) +#endif xor $t0,$d addu $e,@X[$i] rotr $b,$b,2 @@ -325,17 +336,11 @@ $code.=<<___ if ($i<79); ___ } -$FRAMESIZE=16; # large enough to accomodate NUBI saved registers -$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; +$FRAMESIZE=16; # large enough to accommodate NUBI saved registers +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000"; $code=<<___; -#ifdef OPENSSL_FIPSCANISTER -# include <openssl/fipssyms.h> -#endif - -#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) -#define _MIPS_ARCH_MIPS32R2 -#endif +#include "mips_arch.h" .text @@ -380,10 +385,16 @@ $code.=<<___; .align 4 .Loop: .set reorder +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + lui $K,0x5a82 + lw @X[0],($inp) + ori $K,0x7999 # K_00_19 +#else lwl @X[0],$MSB($inp) lui $K,0x5a82 lwr @X[0],$LSB($inp) ori $K,0x7999 # K_00_19 +#endif ___ for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } diff --git a/crypto/sha/asm/sha1-parisc.pl b/crypto/sha/asm/sha1-parisc.pl index 6e5a328a6f1f..b001be16a23c 100755 --- a/crypto/sha/asm/sha1-parisc.pl +++ b/crypto/sha/asm/sha1-parisc.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -253,8 +260,20 @@ $code.=<<___; .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/,\*/,/gm if ($SIZE_T==4); -$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8); -print $code; +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler/) { + $gnuas = 1; +} + +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8); + s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8); + s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8); + s/,\*/,/ if ($SIZE_T==4); + s/\bbv\b/bve/ if ($SIZE_T==8); + + print $_,"\n"; +} close STDOUT; diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl index ab655021ccd6..0cda0a3e1517 100755 --- a/crypto/sha/asm/sha1-ppc.pl +++ b/crypto/sha/asm/sha1-ppc.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -37,7 +44,7 @@ if ($flavour =~ /64/) { $PUSH ="stw"; } else { die "nonsense $flavour"; } -# Define endianess based on flavour +# Define endianness based on flavour # i.e.: linux64le $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; diff --git a/crypto/sha/asm/sha1-s390x.pl b/crypto/sha/asm/sha1-s390x.pl index d5cf1640a120..5729c3089877 100755 --- a/crypto/sha/asm/sha1-s390x.pl +++ b/crypto/sha/asm/sha1-s390x.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -28,7 +35,8 @@ # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". 
Latter implies that the code -# remains z/Architecture specific. +# remains z/Architecture specific. On z990 it was measured to perform +# 23% better than code generated by gcc 4.3. $kimdfunc=1; # magic function code for kimd instruction @@ -42,7 +50,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $K_00_39="%r0"; $K=$K_00_39; @@ -152,6 +160,8 @@ ___ } $code.=<<___; +#include "s390x_arch.h" + .text .align 64 .type Ktable,\@object @@ -164,10 +174,7 @@ sha1_block_data_order: ___ $code.=<<___ if ($kimdfunc); larl %r1,OPENSSL_s390xcap_P - lg %r0,0(%r1) - tmhl %r0,0x4000 # check for message-security assist - jz .Lsoftware - lg %r0,16(%r1) # check kimd capabilities + lg %r0,S390X_KIMD(%r1) # check kimd capabilities tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware lghi %r0,$kimdfunc @@ -234,7 +241,6 @@ $code.=<<___; br %r14 .size sha1_block_data_order,.-sha1_block_data_order .string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" -.comm OPENSSL_s390xcap_P,80,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/crypto/sha/asm/sha1-sparcv9.pl b/crypto/sha/asm/sha1-sparcv9.pl index b5efcde5c139..3e612e3d5f68 100755 --- a/crypto/sha/asm/sha1-sparcv9.pl +++ b/crypto/sha/asm/sha1-sparcv9.pl @@ -1,12 +1,19 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # -# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>. +# Hardware SPARC T4 support by David S. Miller # ==================================================================== # Performance improvement is not really impressive on pre-T1 CPU: +8% @@ -25,7 +32,7 @@ # single-process result on 8-core processor, or ~9GBps per 2.85GHz # socket. -$output=shift; +$output=pop; open STDOUT,">$output"; @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7"); @@ -220,7 +227,7 @@ sha1_block_data_order: ldd [%o1 + 0x20], %f16 ldd [%o1 + 0x28], %f18 ldd [%o1 + 0x30], %f20 - subcc %o2, 1, %o2 ! done yet? + subcc %o2, 1, %o2 ! done yet? ldd [%o1 + 0x38], %f22 add %o1, 0x40, %o1 prefetch [%o1 + 63], 20 @@ -368,7 +375,7 @@ ___ # Purpose of these subroutines is to explicitly encode VIS instructions, # so that one can compile the module without having to specify VIS -# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. # Idea is to reserve for option to produce "universal" binary and let # programmer detect if current CPU is VIS capable at run-time. sub unvis { diff --git a/crypto/sha/asm/sha1-sparcv9a.pl b/crypto/sha/asm/sha1-sparcv9a.pl index e65291bbd979..50d3e136a12d 100755 --- a/crypto/sha/asm/sha1-sparcv9a.pl +++ b/crypto/sha/asm/sha1-sparcv9a.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! 
/usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -512,7 +519,7 @@ $code.=<<___; mov $Cctx,$C mov $Dctx,$D mov $Ectx,$E - alignaddr %g0,$tmp0,%g0 + alignaddr %g0,$tmp0,%g0 dec 1,$len ba .Loop mov $nXfer,$Xfer @@ -544,7 +551,7 @@ ___ # Purpose of these subroutines is to explicitly encode VIS instructions, # so that one can compile the module without having to specify VIS -# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. # Idea is to reserve for option to produce "universal" binary and let # programmer detect if current CPU is VIS capable at run-time. sub unvis { diff --git a/crypto/sha/asm/sha1-thumb.pl b/crypto/sha/asm/sha1-thumb.pl index 7c9ea9b0296c..ac74a25d6ead 100755 --- a/crypto/sha/asm/sha1-thumb.pl +++ b/crypto/sha/asm/sha1-thumb.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -14,7 +21,7 @@ # The code does not present direct interest to OpenSSL, because of low # performance. Its purpose is to establish _size_ benchmark. Pretty # useless one I must say, because 30% or 88 bytes larger ARMv4 code -# [avialable on demand] is almost _twice_ as fast. It should also be +# [available on demand] is almost _twice_ as fast. It should also be # noted that in-lining of .Lcommon and .Lrotate improves performance # by over 40%, while code increases by only 10% or 32 bytes. But once # again, the goal was to establish _size_ benchmark, not performance. diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl index 752138b0eac1..60819f61867c 100755 --- a/crypto/sha/asm/sha1-x86_64.pl +++ b/crypto/sha/asm/sha1-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -73,13 +80,18 @@ # Sandy Bridge 7.70 6.10/+26% 4.99/+54% # Ivy Bridge 6.06 4.67/+30% 4.60/+32% # Haswell 5.45 4.15/+31% 3.57/+53% +# Skylake 5.18 4.06/+28% 3.54/+46% # Bulldozer 9.11 5.95/+53% +# Ryzen 4.75 3.80/+24% 1.93/+150%(**) # VIA Nano 9.32 7.15/+30% # Atom 10.3 9.17/+12% # Silvermont 13.1(*) 9.37/+40% +# Knights L 13.2(*) 9.68/+36% 8.30/+59% +# Goldmont 8.13 6.42/+27% 1.70/+380%(**) # # (*) obviously suboptimal result, nothing was done about it, # because SSSE3 code is compiled unconditionally; +# (**) SHAEXT result $flavour = shift; $output = shift; @@ -114,7 +126,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $shaext=1; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $ctx="%rdi"; # 1st arg @@ -247,6 +259,7 @@ $code.=<<___; .type sha1_block_data_order,\@function,3 .align 16 sha1_block_data_order: +.cfi_startproc mov OPENSSL_ia32cap_P+0(%rip),%r9d mov OPENSSL_ia32cap_P+4(%rip),%r8d mov OPENSSL_ia32cap_P+8(%rip),%r10d @@ -254,7 +267,7 @@ sha1_block_data_order: jz .Lialu ___ $code.=<<___ if ($shaext); - test \$`1<<29`,%r10d # check SHA bit + test \$`1<<29`,%r10d # check SHA bit jnz _shaext_shortcut ___ $code.=<<___ if ($avx>1); @@ -275,17 +288,24 @@ $code.=<<___; .align 16 .Lialu: mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %rax,`16*4`(%rsp) +.cfi_cfa_expression %rsp+64,deref,+8 .Lprologue: mov 0($ctx),$A @@ -319,14 +339,22 @@ $code.=<<___; jnz .Lloop mov `16*4`(%rsp),%rsi +.cfi_def_cfa %rsi,8 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: ret +.cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order ___ if ($shaext) {{{ @@ -342,6 +370,7 @@ $code.=<<___; .align 32 sha1_block_data_order_shaext: _shaext_shortcut: +.cfi_startproc ___ $code.=<<___ if ($win64); lea `-8-4*16`(%rsp),%rsp @@ -439,6 +468,7 @@ $code.=<<___ if ($win64); .Lepilogue_shaext: ___ $code.=<<___; +.cfi_endproc ret .size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext ___ @@ -452,7 +482,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); my $j=0; my $rx=0; -my $K_XX_XX="%r11"; +my $K_XX_XX="%r14"; +my $fp="%r11"; my $_rol=sub { &rol(@_) }; my $_ror=sub { &ror(@_) }; @@ -473,25 +504,31 @@ $code.=<<___; .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: - mov %rsp,%rax +.cfi_startproc + mov %rsp,$fp # frame pointer +.cfi_def_cfa_register $fp push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 # redundant, done to share Win64 SE handler +.cfi_push %r13 push %r14 +.cfi_push %r14 lea `-64-($win64?6*16:0)`(%rsp),%rsp ___ $code.=<<___ if ($win64); - movaps %xmm6,-40-6*16(%rax) - movaps %xmm7,-40-5*16(%rax) - 
movaps %xmm8,-40-4*16(%rax) - movaps %xmm9,-40-3*16(%rax) - movaps %xmm10,-40-2*16(%rax) - movaps %xmm11,-40-1*16(%rax) + movaps %xmm6,-40-6*16($fp) + movaps %xmm7,-40-5*16($fp) + movaps %xmm8,-40-4*16($fp) + movaps %xmm9,-40-3*16($fp) + movaps %xmm10,-40-2*16($fp) + movaps %xmm11,-40-1*16($fp) .Lprologue_ssse3: ___ $code.=<<___; - mov %rax,%r14 # original %rsp and \$-64,%rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument @@ -541,7 +578,7 @@ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -898,23 +935,29 @@ $code.=<<___; mov $E,16($ctx) ___ $code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - movaps -40-1*16(%r14),%xmm11 + movaps -40-6*16($fp),%xmm6 + movaps -40-5*16($fp),%xmm7 + movaps -40-4*16($fp),%xmm8 + movaps -40-3*16($fp),%xmm9 + movaps -40-2*16($fp),%xmm10 + movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + mov -40($fp),%r14 +.cfi_restore %r14 + mov -32($fp),%r13 +.cfi_restore %r13 + mov -24($fp),%r12 +.cfi_restore %r12 + mov -16($fp),%rbp +.cfi_restore %rbp + mov -8($fp),%rbx +.cfi_restore %rbx + lea ($fp),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret +.cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 ___ @@ -935,26 +978,32 @@ $code.=<<___; .align 16 sha1_block_data_order_avx: _avx_shortcut: - mov %rsp,%rax +.cfi_startproc + mov %rsp,$fp +.cfi_def_cfa_register $fp push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 # redundant, done to share Win64 SE handler +.cfi_push %r13 push %r14 +.cfi_push %r14 lea `-64-($win64?6*16:0)`(%rsp),%rsp vzeroupper ___ $code.=<<___ if ($win64); - vmovaps %xmm6,-40-6*16(%rax) - vmovaps %xmm7,-40-5*16(%rax) - vmovaps %xmm8,-40-4*16(%rax) - vmovaps %xmm9,-40-3*16(%rax) - vmovaps %xmm10,-40-2*16(%rax) - vmovaps %xmm11,-40-1*16(%rax) + vmovaps %xmm6,-40-6*16($fp) + vmovaps %xmm7,-40-5*16($fp) + vmovaps %xmm8,-40-4*16($fp) + vmovaps %xmm9,-40-3*16($fp) + vmovaps %xmm10,-40-2*16($fp) + vmovaps %xmm11,-40-1*16($fp) .Lprologue_avx: ___ $code.=<<___; - mov %rax,%r14 # original %rsp and \$-64,%rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument @@ -994,7 +1043,7 @@ $code.=<<___; jmp .Loop_avx ___ -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1262,23 +1311,29 @@ $code.=<<___; mov $E,16($ctx) ___ $code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - movaps -40-1*16(%r14),%xmm11 + movaps -40-6*16($fp),%xmm6 + movaps -40-5*16($fp),%xmm7 + movaps -40-4*16($fp),%xmm8 + movaps -40-3*16($fp),%xmm9 + movaps -40-2*16($fp),%xmm10 + movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + mov 
-40($fp),%r14 +.cfi_restore %r14 + mov -32($fp),%r13 +.cfi_restore %r13 + mov -24($fp),%r12 +.cfi_restore %r12 + mov -16($fp),%rbp +.cfi_restore %rbp + mov -8($fp),%rbx +.cfi_restore %rbx + lea ($fp),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: ret +.cfi_endproc .size sha1_block_data_order_avx,.-sha1_block_data_order_avx ___ @@ -1302,26 +1357,32 @@ $code.=<<___; .align 16 sha1_block_data_order_avx2: _avx2_shortcut: - mov %rsp,%rax +.cfi_startproc + mov %rsp,$fp +.cfi_def_cfa_register $fp push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 vzeroupper ___ $code.=<<___ if ($win64); lea -6*16(%rsp),%rsp - vmovaps %xmm6,-40-6*16(%rax) - vmovaps %xmm7,-40-5*16(%rax) - vmovaps %xmm8,-40-4*16(%rax) - vmovaps %xmm9,-40-3*16(%rax) - vmovaps %xmm10,-40-2*16(%rax) - vmovaps %xmm11,-40-1*16(%rax) + vmovaps %xmm6,-40-6*16($fp) + vmovaps %xmm7,-40-5*16($fp) + vmovaps %xmm8,-40-4*16($fp) + vmovaps %xmm9,-40-3*16($fp) + vmovaps %xmm10,-40-2*16($fp) + vmovaps %xmm11,-40-1*16($fp) .Lprologue_avx2: ___ $code.=<<___; - mov %rax,%r14 # original %rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument @@ -1466,7 +1527,7 @@ sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path ) } -sub Xupdate_avx2_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx2_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions @@ -1741,23 +1802,29 @@ $code.=<<___; vzeroupper ___ $code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - movaps -40-1*16(%r14),%xmm11 + movaps -40-6*16($fp),%xmm6 + movaps -40-5*16($fp),%xmm7 + movaps -40-4*16($fp),%xmm8 + movaps -40-3*16($fp),%xmm9 + movaps -40-2*16($fp),%xmm10 + movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + mov -40($fp),%r14 +.cfi_restore %r14 + mov -32($fp),%r13 +.cfi_restore %r13 + mov -24($fp),%r12 +.cfi_restore %r12 + mov -16($fp),%rbp +.cfi_restore %rbp + mov -8($fp),%rbx +.cfi_restore %rbx + lea ($fp),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: ret +.cfi_endproc .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 ___ } @@ -1898,15 +1965,13 @@ ssse3_handler: cmp %r10,%rbx # context->Rip<prologue label jb .Lcommon_seh_tail - mov 152($context),%rax # pull context->Rsp + mov 208($context),%rax # pull context->R11 mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail - mov 232($context),%rax # pull context->R14 - lea -40-6*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$12,%ecx @@ -1919,9 +1984,9 @@ ssse3_handler: mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore cotnext->R12 - mov %r13,224($context) # restore cotnext->R13 - mov %r14,232($context) # restore cotnext->R14 + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 .Lcommon_seh_tail: mov 8(%rax),%rdi diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl index e9077143817c..dccc771ad584 100755 --- a/crypto/sha/asm/sha256-586.pl 
+++ b/crypto/sha/asm/sha256-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -11,7 +18,7 @@ # # Performance improvement over compiler generated code varies from # 10% to 40% [see below]. Not very impressive on some µ-archs, but -# it's 5 times smaller and optimizies amount of writes. +# it's 5 times smaller and optimizes amount of writes. # # May 2012. # @@ -40,7 +47,7 @@ # # Performance in clock cycles per processed byte (less is better): # -# gcc icc x86 asm(*) SIMD x86_64 asm(**) +# gcc icc x86 asm(*) SIMD x86_64 asm(**) # Pentium 46 57 40/38 - - # PIII 36 33 27/24 - - # P4 41 38 28 - 17.3 @@ -50,20 +57,26 @@ # Sandy Bridge 25 - 15.9 12.4 11.6 # Ivy Bridge 24 - 15.0 11.4 10.3 # Haswell 22 - 13.9 9.46 7.80 +# Skylake 20 - 14.9 9.50 7.70 # Bulldozer 36 - 27/22 17.0 13.6 # VIA Nano 36 - 25/22 16.8 16.5 # Atom 50 - 30/25 21.9 18.9 # Silvermont 40 - 34/31 22.9 20.6 +# Goldmont 29 - 20 16.3(***) # # (*) numbers after slash are for unrolled loop, where applicable; # (**) x86_64 assembly performance is presented for reference # purposes, results are best-available; +# (***) SHAEXT result is 4.1, strangely enough better than 64-bit one; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$avx=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -83,7 +96,7 @@ if ($xmm && !$avx && $ARGV[0] eq "win32" && $avx = ($1>=10) + ($1>=11); } -if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { +if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } @@ -266,7 +279,7 @@ my $suffix=shift; &mov ($Coff,"ecx"); &mov ($Doff,"edi"); &mov (&DWP(0,"esp"),"ebx"); # magic - &mov ($E,&DWP(16,"esi")); + &mov ($E,&DWP(16,"esi")); &mov ("ebx",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("edi",&DWP(28,"esi")); @@ -375,7 +388,7 @@ my @AH=($A,$K256); &xor ($AH[1],"ecx"); # magic &mov (&DWP(8,"esp"),"ecx"); &mov (&DWP(12,"esp"),"ebx"); - &mov ($E,&DWP(16,"esi")); + &mov ($E,&DWP(16,"esi")); &mov ("ebx",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("esi",&DWP(28,"esi")); @@ -1279,3 +1292,5 @@ sub bodyx_00_15 () { # +10% &function_end_B("sha256_block_data_order"); &asm_finish(); + +close STDOUT; diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl index 750216eb4267..edcfc31278e3 100755 --- a/crypto/sha/asm/sha256-armv4.pl +++ b/crypto/sha/asm/sha256-armv4.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -37,8 +44,20 @@ # # Add ARMv8 code path performing at 2.0 cpb on Apple A7. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; $t0="r0"; $inp="r1"; $t4="r1"; @@ -73,7 +92,9 @@ $code.=<<___ if ($i<16); eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` add $a,$a,$t2 @ h+=Maj(a,b,c) from the past eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ rev $t1,$t1 +# endif #else @ ldrb $t1,[$inp,#3] @ $i add $a,$a,$t2 @ h+=Maj(a,b,c) from the past @@ -161,15 +182,11 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 -.code 32 -#else +#if defined(__thumb2__) .syntax unified -# ifdef __thumb2__ .thumb -# else +#else .code 32 -# endif #endif .type K256,%object @@ -195,21 +212,25 @@ K256: .word 0 @ terminator #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha256_block_data_order +.word OPENSSL_armcap_P-.Lsha256_block_data_order #endif .align 5 .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: -#if __ARM_ARCH__<7 +.Lsha256_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha256_block_data_order #else - adr r3,. + adr r3,.Lsha256_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif tst r12,#ARMV8_SHA256 bne .LARMv8 tst r12,#ARMV7_NEON @@ -233,7 +254,7 @@ for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } $code.=".Lrounds_16_xx:\n"; for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } $code.=<<___; -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ ite eq @ Thumb2 thing, sanity check in ARM #endif ldreq $t3,[sp,#16*4] @ pull ctx @@ -454,7 +475,8 @@ $code.=<<___; .global sha256_block_data_order_neon .type sha256_block_data_order_neon,%function -.align 4 +.align 5 +.skip 16 sha256_block_data_order_neon: .LNEON: stmdb sp!,{r4-r12,lr} @@ -580,7 +602,7 @@ my $Ktbl="r3"; $code.=<<___; #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -# ifdef __thumb2__ +# if defined(__thumb2__) # define INST(a,b,c,d) .byte c,d|0xc,a,b # else # define INST(a,b,c,d) .byte a,b,c,d @@ -591,14 +613,11 @@ $code.=<<___; sha256_block_data_order_armv8: .LARMv8: vld1.32 {$ABCD,$EFGH},[$ctx] -# ifdef __thumb2__ - adr $Ktbl,.LARMv8 - sub $Ktbl,$Ktbl,#.LARMv8-K256 -# else - adrl $Ktbl,K256 -# endif + sub $Ktbl,$Ktbl,#256+32 add $len,$inp,$len,lsl#6 @ len to point at the end of inp + b .Loop_v8 +.align 4 .Loop_v8: vld1.8 {@MSG[0]-@MSG[1]},[$inp]! vld1.8 {@MSG[2]-@MSG[3]},[$inp]! diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64xplus.pl new file mode 100755 index 000000000000..3ab7d9b68946 --- /dev/null +++ b/crypto/sha/asm/sha256-c64xplus.pl @@ -0,0 +1,320 @@ +#! 
/usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA256 for C64x+. +# +# January 2012 +# +# Performance is just below 10 cycles per processed byte, which is +# almost 40% faster than compiler-generated code. Unroll is unlikely +# to give more than ~8% improvement... +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K256="A3"; + +($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14) + =map("A$_",(16..31)); +($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15) + =map("B$_",(16..31)); + +($Xia,$Xib)=("A5","B5"); # circular/ring buffer + $CTXB=$t2e; + +($Xn,$X0,$K)=("B7","B8","B9"); +($Maj,$Ch)=($T2,"B6"); + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg sha256_block_data_order,_sha256_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg SWAP2,MV + .asg SWAP4,MV + .endif + + .global _sha256_block_data_order +_sha256_block_data_order: +__sha256_block: + .asmfunc stack_usage(64) + MV $NUM,A0 ; reassign $NUM +|| MVK -64,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64) +|| [A0] MV SP,FP + [A0] ADDKPC __sha256_block,B2 +|| [A0] AND B0,SP,SP ; align stack at 64 bytes + .if __TI_EABI__ + [A0] MVK 0x00404,B1 +|| [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256 + .else + [A0] MVK 0x00404,B1 +|| [A0] MVKL (K256-__sha256_block),$K256 + [A0] MVKH 0x50000,B1 +|| [A0] MVKH (K256-__sha256_block),$K256 + .endif + [A0] MVC B1,AMR ; setup circular addressing +|| [A0] MV SP,$Xia + [A0] MV SP,$Xib +|| [A0] ADD B2,$K256,$K256 +|| [A0] MV $CTXA,$CTXB +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + LDW *${CTXA}[0],$A ; load ctx +|| LDW *${CTXB}[4],$E + LDW *${CTXA}[1],$B +|| LDW *${CTXB}[5],$F + LDW *${CTXA}[2],$C +|| LDW *${CTXB}[6],$G + LDW *${CTXA}[3],$D +|| LDW *${CTXB}[7],$H + + LDNW *$INP++,$Xn ; pre-fetch input + LDW *$K256++,$K ; pre-fetch K256[0] + MVK 14,B0 ; loop counters + MVK 47,B1 +|| ADDAW $Xia,9,$Xia +outerloop?: + SUB A0,1,A0 +|| MV $A,$Actx +|| MV $E,$Ectx +|| MVD $B,$Bctx +|| MVD $F,$Fctx + MV $C,$Cctx +|| MV $G,$Gctx +|| MVD $D,$Dctx +|| MVD $H,$Hctx +|| SWAP4 $Xn,$X0 + + SPLOOPD 8 ; BODY_00_14 +|| MVC B0,ILC +|| SWAP2 $X0,$X0 + + LDNW *$INP++,$Xn +|| ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = 
(e&f)^(~e&g) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $K,$H,$T1 ; T1 = h + K256[i] + ADD $X0,$T1,$T1 ; T1 += X[i]; +|| STW $X0,*$Xib++ +|| XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| LDW *$K256++,$K ; pre-fetch K256[i+1] +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| ROTL $G,0,$H ; h = g +|| MV $F,$G ; g = f +|| MV $X0,$X15 + MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; modulo-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled +|| ROTL $B,0,$C ; c = b +|| MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 + + SPLOOPD 10 ; BODY_16_63 +|| MVC B1,ILC +|| ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled + + XOR $t0e,$s0,$s0 +|| XOR $t0a,$s1,$s1 +|| MV $X15,$X14 +|| MV $X1,$Xn + XOR $t1e,$s0,$s0 ; sigma0(X[i+1]) +|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14]) +|| LDW *${Xib}[2],$X1 ; modulo-scheduled + ROTL $A,30,$S0 +|| OR $A,$B,$Maj +|| AND $A,$B,$t2a +|| ROTL $E,26,$S1 +|| AND $F,$E,$Ch +|| ANDN $G,$E,$t2e +|| ADD $X9,$X0,$X0 ; X[i] += X[i+9] + ROTL $A,19,$t0a +|| AND $C,$Maj,$Maj +|| ROTL $E,21,$t0e +|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g) +|| ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1]) + ROTL $A,10,$t1a +|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ROTL $E,7,$t1e +|| ADD $H,$K,$T1 ; T1 = h + K256[i] +|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14]) + XOR $t0a,$S0,$S0 +|| XOR $t0e,$S1,$S1 +|| ADD $X0,$T1,$T1 ; T1 += X[i] +|| STW $X0,*$Xib++ + XOR $t1a,$S0,$S0 ; Sigma0(a) +|| XOR $t1e,$S1,$S1 ; Sigma1(e) +|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g) +|| MV $X0,$X15 +|| ROTL $G,0,$H ; h = g +|| LDW *$K256++,$K ; pre-fetch K256[i+1] + ADD $S1,$T1,$T1 ; T1 += Sigma1(e) +|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c) +|| MV $F,$G ; g = f +|| MV $Xn,$X0 ; modulo-scheduled +|| LDW *++$Xia,$X9 ; modulo-scheduled +|| ROTL $X1,25,$t0e ; modulo-scheduled +|| ROTL $X14,15,$t0a ; modulo-scheduled + ROTL $X1,14,$t1e ; modulo-scheduled +|| ROTL $X14,13,$t1a ; modulo-scheduled +|| MV $E,$F ; f = e +|| ADD $D,$T1,$E ; e = d + T1 +|| MV $C,$D ; d = c +|| MV $B,$C ; c = b + MV $A,$B ; b = a +|| ADD $T1,$T2,$A ; a = T1 + T2 +|| SHRU $X1,3,$s0 ; modulo-scheduled +|| SHRU $X14,10,$s1 ; modulo-scheduled
+ SPKERNEL + + [A0] B outerloop? +|| [A0] LDNW *$INP++,$Xn ; pre-fetch input +|| [A0] ADDK -260,$K256 ; rewind K256 +|| ADD $Actx,$A,$A ; accumulate ctx +|| ADD $Ectx,$E,$E +|| ADD $Bctx,$B,$B + ADD $Fctx,$F,$F +|| ADD $Cctx,$C,$C +|| ADD $Gctx,$G,$G +|| ADD $Dctx,$D,$D +|| ADD $Hctx,$H,$H +|| [A0] LDW *$K256++,$K ; pre-fetch K256[0] + + [!A0] BNOP RA +||[!A0] MV $CTXA,$CTXB + [!A0] MV FP,SP ; restore stack pointer +||[!A0] LDW *FP[0],FP ; restore frame pointer + [!A0] STW $A,*${CTXA}[0] ; save ctx +||[!A0] STW $E,*${CTXB}[4] +||[!A0] MVK 0,B0 + [!A0] STW $B,*${CTXA}[1] +||[!A0] STW $F,*${CTXB}[5] +||[!A0] MVC B0,AMR ; clear AMR + STW $C,*${CTXA}[2] +|| STW $G,*${CTXB}[6] + STW $D,*${CTXA}[3] +|| STW $H,*${CTXB}[7] + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 128 +K256: + .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 + +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl index 9770286b9596..73978dbd81d6 100755 --- a/crypto/sha/asm/sha256-mb-x86_64.pl +++ b/crypto/sha/asm/sha256-mb-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -19,6 +26,7 @@ # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% +# Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170% # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100% # # (i) multi-block CBC encrypt with 128-bit key; @@ -28,7 +36,7 @@ # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 20.3+4.44=24.7; # (iv) presented improvement coefficients are asymptotic limits and -# in real-life application are somewhat lower, e.g. for 2KB +# in real-life application are somewhat lower, e.g. 
for 2KB # fragments they range from 75% to 130% (on Haswell); $flavour = shift; @@ -63,7 +71,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void sha256_multi_block ( @@ -236,6 +244,7 @@ $code.=<<___; .type sha256_multi_block,\@function,3 .align 32 sha256_multi_block: +.cfi_startproc mov OPENSSL_ia32cap_P+4(%rip),%rcx bt \$61,%rcx # check SHA bit jc _shaext_shortcut ___ $code.=<<___ if ($avx); ___ $code.=<<___; mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -266,6 +278,7 @@ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody: lea K256+128(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx @@ -382,7 +395,8 @@ $code.=<<___; jnz .Loop_grande .Ldone: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp +.cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 @@ -398,10 +412,14 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: ret +.cfi_endproc .size sha256_multi_block,.-sha256_multi_block ___ {{{ @@ -413,10 +431,14 @@ $code.=<<___; .type sha256_multi_block_shaext,\@function,3 .align 32 sha256_multi_block_shaext: +.cfi_startproc _shaext_shortcut: mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -442,7 +464,7 @@ $code.=<<___; lea K256_shaext+0x80(%rip),$Tbl .Loop_grande_shaext: - mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num + mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<2;$i++) { @@ -750,10 +772,14 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_shaext: ret +.cfi_endproc .size sha256_multi_block_shaext,.-sha256_multi_block_shaext ___ }}} @@ -913,6 +939,7 @@ $code.=<<___; .type sha256_multi_block_avx,\@function,3 .align 32 sha256_multi_block_avx: +.cfi_startproc _avx_shortcut: ___ $code.=<<___ if ($avx>1); @@ -927,8 +954,11 @@ $code.=<<___ if ($avx>1); ___ $code.=<<___; mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -947,6 +977,7 @@ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx: lea K256+128(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx @@ -1061,7 +1092,8 @@ $code.=<<___; jnz .Loop_grande_avx .Ldone_avx: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp +.cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); @@ -1078,10 +1110,14 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: ret +.cfi_endproc .size sha256_multi_block_avx,.-sha256_multi_block_avx ___ if ($avx>1) { @@ -1097,14 +1133,22 @@ $code.=<<___; .type sha256_multi_block_avx2,\@function,3 .align 32 sha256_multi_block_avx2: +.cfi_startproc
_avx2_shortcut: mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp @@ -1123,6 +1167,7 @@ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp +.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx2: lea K256+128(%rip),$Tbl lea 0x80($ctx),$ctx # size optimization @@ -1237,7 +1282,8 @@ $code.=<<___; #jnz .Loop_grande_avx2 .Ldone_avx2: - mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + mov `$REG_SZ*17`(%rsp),%rax # original %rsp +.cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); @@ -1254,14 +1300,22 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: ret +.cfi_endproc .size sha256_multi_block_avx2,.-sha256_multi_block_avx2 ___ } }}} @@ -1454,10 +1508,10 @@ avx2_handler: mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore cotnext->R12 - mov %r13,224($context) # restore cotnext->R13 - mov %r14,232($context) # restore cotnext->R14 - mov %r15,240($context) # restore cotnext->R15 + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 lea -56-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 diff --git a/crypto/sha/asm/sha512-586.pl b/crypto/sha/asm/sha512-586.pl index 2f6a202c3765..867ce30b9721 100755 --- a/crypto/sha/asm/sha512-586.pl +++ b/crypto/sha/asm/sha512-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -25,15 +32,17 @@ # Sandy Bridge 58 - 35 11.9 11.2 # Ivy Bridge 50 - 33 11.5 8.17 # Haswell 46 - 29 11.3 7.66 +# Skylake 40 - 26 13.3 7.25 # Bulldozer 121 - 50 14.0 13.5 # VIA Nano 91 - 52 33 14.7 # Atom 126 - 68 48(***) 14.7 # Silvermont 97 - 58 42(***) 17.5 +# Goldmont 80 - 48 19.5 12.0 # # (*) whichever best applicable. # (**) x86_64 assembler performance is presented for reference # purposes, the results are for integer-only code. -# (***) paddq is increadibly slow on Atom. +# (***) paddq is incredibly slow on Atom. # # IALU code-path is optimized for elder Pentiums. 
On vanilla Pentium # performance improvement over compiler generated code reaches ~60%, @@ -50,7 +59,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $sse2=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -373,7 +385,7 @@ if ($sse2) { &set_label("16_79_sse2",16); for ($j=0;$j<2;$j++) { # 2x unroll - #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15 + #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15 &movq ("mm5",&QWP(8*(9+16-14),"esp")); &movq ("mm1","mm7"); &psrlq ("mm7",1); @@ -909,3 +921,5 @@ sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2 &asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl index fb7dc506aca1..0b4c5674d9df 100755 --- a/crypto/sha/asm/sha512-armv4.pl +++ b/crypto/sha/asm/sha512-armv4.pl @@ -1,10 +1,19 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPL terms is granted. # ==================================================================== # SHA512 block procedure for ARMv4. September 2007. @@ -34,16 +43,9 @@ # terms it's 22.6 cycles per byte, which is disappointing result. # Technical writers asserted that 3-way S4 pipeline can sustain # multiple NEON instructions per cycle, but dual NEON issue could -# not be observed, and for NEON-only sequences IPC(*) was found to -# be limited by 1:-( 0.33 and 0.66 were measured for sequences with -# ILPs(*) of 1 and 2 respectively. This in turn means that you can -# even find yourself striving, as I did here, for achieving IPC -# adequate to one delivered by Cortex A8 [for reference, it's -# 0.5 for ILP of 1, and 1 for higher ILPs]. -# -# (*) ILP, instruction-level parallelism, how many instructions -# *can* execute at the same time. IPC, instructions per cycle, -# indicates how many instructions actually execute. +# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. On side note Cortex-A15 processes one byte in +# 16 cycles. # Byte order [in]dependence. 
========================================= # @@ -55,8 +57,20 @@ $hi="HI"; $lo="LO"; # ==================================================================== -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $ctx="r0"; # parameter block $inp="r1"; @@ -143,6 +157,9 @@ $code.=<<___; teq $t0,#$magic ldr $t3,[sp,#$Coff+0] @ c.lo +#ifdef __thumb2__ + it eq @ Thumb2 thing, sanity check in ARM +#endif orreq $Ktbl,$Ktbl,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @@ -180,7 +197,17 @@ $code.=<<___; ___ } $code=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + #ifdef __ARMEL__ # define LO 0 # define HI 4 @@ -192,7 +219,14 @@ $code=<<___; #endif .text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else .code 32 +#endif + .type K512,%object .align 5 K512: @@ -237,9 +271,9 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order +.word OPENSSL_armcap_P-.Lsha512_block_data_order .skip 32-4 #else .skip 32 @@ -248,14 +282,22 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha512_block_data_order - add $len,$inp,$len,lsl#7 @ len to point at the end of inp -#if __ARM_MAX_ARCH__>=7 +#else + adr r3,.Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#1 +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON bne .LNEON #endif + add $len,$inp,$len,lsl#7 @ len to point at the end of inp stmdb sp!,{r4-r12,lr} sub $Ktbl,r3,#672 @ K512 sub sp,sp,#9*8 @@ -369,6 +411,9 @@ $code.=<<___; ___ &BODY_00_15(0x17); $code.=<<___; +#ifdef __thumb2__ + ittt eq @ Thumb2 thing, sanity check in ARM +#endif ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] beq .L16_79 @@ -453,6 +498,7 @@ $code.=<<___; moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif +.size sha512_block_data_order,.-sha512_block_data_order ___ { @@ -559,11 +605,15 @@ $code.=<<___; .arch armv7-a .fpu neon +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function .align 4 +sha512_block_data_order_neon: .LNEON: dmb @ errata #451034 on early Cortex A8 - vstmdb sp!,{d8-d15} @ ABI specification says so - sub $Ktbl,r3,#672 @ K512 + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + adr 
$Ktbl,K512 + VFP_ABI_PUSH vldmia $ctx,{$A-$H} @ load context .Loop_neon: ___ @@ -588,16 +638,16 @@ $code.=<<___; sub $Ktbl,#640 @ rewind K512 bne .Loop_neon - vldmia sp!,{d8-d15} @ epilogue + VFP_ABI_POP ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon #endif ___ } $code.=<<___; -.size sha512_block_data_order,.-sha512_block_data_order .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 -#if __ARM_MAX_ARCH__>=7 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .comm OPENSSL_armcap_P,4,4 #endif ___ @@ -605,5 +655,14 @@ ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 $code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + print $code; close STDOUT; # enforce flush diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl index f7b36b986a61..ac84ebb52e4f 100755 --- a/crypto/sha/asm/sha512-armv8.pl +++ b/crypto/sha/asm/sha512-armv8.pl @@ -1,10 +1,18 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. # +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPLv2 terms is granted. # ==================================================================== # # SHA256/512 for ARMv8. @@ -18,7 +26,9 @@ # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) # Denver 2.01 10.5 (+26%) 6.70 (+8%) # X-Gene 20.0 (+100%) 12.8 (+300%(***)) -# +# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +# Kryo 1.92 17.4 (+30%) 11.2 (+8%) +# # (*) Software SHA256 results are of lesser relevance, presented # mostly for informational purposes. # (**) The result is a trade-off: it's possible to improve it by @@ -26,12 +36,37 @@ # on Cortex-A53 (or by 4 cycles per round). # (***) Super-impressive coefficients over gcc-generated code are # indication of some compiler "pathology", most notably code -# generated with -mgeneral-regs-only is significanty faster +# generated with -mgeneral-regs-only is significantly faster # and the gap is only 40-90%. - -$flavour=shift; -$output=shift; -open STDOUT,">$output"; +# +# October 2016. +# +# Originally it was reckoned that it makes no sense to implement NEON +# version of SHA256 for 64-bit processors. This is because performance +# improvement on most wide-spread Cortex-A5x processors was observed +# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was +# observed that 32-bit NEON SHA256 performs significantly better than +# 64-bit scalar version on *some* of the more recent processors. As +# result 64-bit NEON version of SHA256 was added to provide best +# all-round performance. For example it executes ~30% faster on X-Gene +# and Mongoose. [For reference, NEON version of SHA512 is bound to +# deliver much less improvement, likely *negative* on Cortex-A5x. +# Which is why NEON support is limited to SHA256.] 
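Throughout this patch the round functions use rewritten boolean forms of the FIPS-180 Ch and Maj primitives: the C64x+ code above annotates Ch(e,f,g) = (e&f)^(~e&g) and Maj(a,b,c) = ((a|b)&c)|(a&b), the sha512-c64xplus module further down uses Ch(e,f,g) = ((f^g)&e)^g, and the AArch64 body_00_15 below builds Ch from AND/BIC/ORR, i.e. (e&f)|(~e&g). All of these are equivalent to the canonical definitions; a minimal stand-alone Perl check (illustrative only, not part of the patch) brute-forces every bit triple:

    #!/usr/bin/env perl
    # Illustrative check, not from this patch: verify that the rewritten
    # Ch/Maj forms annotated in these modules match the canonical
    # FIPS-180 definitions for every one-bit triple (which suffices,
    # since all the operations are bitwise).
    use strict; use warnings;
    for my $x (0, 1) { for my $y (0, 1) { for my $z (0, 1) {
        my $nx = $x ^ 1;                                   # one-bit ~x
        my $ch_def  = ($x & $y) ^ ($nx & $z);              # (e&f)^(~e&g)
        my $ch_orr  = ($x & $y) | ($nx & $z);              # AND/BIC/ORR form
        my $ch_xor  = (($y ^ $z) & $x) ^ $z;               # ((f^g)&e)^g
        my $maj_def = ($x & $y) ^ ($x & $z) ^ ($y & $z);   # canonical Maj
        my $maj_alt = (($x | $y) & $z) | ($x & $y);        # ((a|b)&c)|(a&b)
        die "Ch mismatch at $x$y$z"  if $ch_def != $ch_orr || $ch_def != $ch_xor;
        die "Maj mismatch at $x$y$z" if $maj_def != $maj_alt;
    }}}
    print "Ch/Maj rewrites match the canonical definitions\n";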
+ +$output=pop; +$flavour=pop; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" $xlate $flavour $output"; + *STDOUT=*OUT; +} else { + open STDOUT,">$output"; +} if ($output =~ /512/) { $BITS=512; @@ -68,7 +103,7 @@ my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); $T0=@X[$i+3] if ($i<11); $code.=<<___ if ($i<16); -#ifndef __ARMEB__ +#ifndef __AARCH64EB__ rev @X[$i],@X[$i] // $i #endif ___ @@ -151,24 +186,39 @@ ___ } $code.=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +#endif .text +.extern OPENSSL_armcap_P .globl $func .type $func,%function .align 6 $func: -___ -$code.=<<___ if ($SZ==4); +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else ldr x16,.LOPENSSL_armcap_P +# endif adr x17,.LOPENSSL_armcap_P add x16,x16,x17 ldr w16,[x16] +___ +$code.=<<___ if ($SZ==4); tst w16,#ARMV8_SHA256 b.ne .Lv8_entry + tst w16,#ARMV7_NEON + b.ne .Lneon_entry +___ +$code.=<<___ if ($SZ==8); + tst w16,#ARMV8_SHA512 + b.ne .Lv8_entry ___ $code.=<<___; +#endif stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -184,7 +234,7 @@ $code.=<<___; ldp $E,$F,[$ctx,#4*$SZ] add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,K$BITS + adr $Ktbl,.LK$BITS stp $ctx,$num,[x29,#96] .Loop: @@ -234,8 +284,8 @@ $code.=<<___; .size $func,.-$func .align 6 -.type K$BITS,%object -K$BITS: +.type .LK$BITS,%object +.LK$BITS: ___ $code.=<<___ if ($SZ==8); .quad 0x428a2f98d728ae22,0x7137449123ef65cd @@ -300,10 +350,16 @@ $code.=<<___ if ($SZ==4); .long 0 //terminator ___ $code.=<<___; -.size K$BITS,.-K$BITS +.size .LK$BITS,.-.LK$BITS +#ifndef __KERNEL__ .align 3 .LOPENSSL_armcap_P: +# ifdef __ILP32__ + .long OPENSSL_armcap_P-. +# else .quad OPENSSL_armcap_P-. +# endif +#endif .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 ___ @@ -317,6 +373,7 @@ my ($W0,$W1)=("v16.4s","v17.4s"); my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); $code.=<<___; +#ifndef __KERNEL__ .type sha256_block_armv8,%function .align 6 sha256_block_armv8: @@ -325,7 +382,7 @@ sha256_block_armv8: add x29,sp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,K256 + adr $Ktbl,.LK256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 @@ -385,11 +442,406 @@ $code.=<<___; ldr x29,[sp],#16 ret .size sha256_block_armv8,.-sha256_block_armv8 +#endif ___ } +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. 
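The NEON emitter below drives almost everything through a tiny AUTOLOAD thunk: any call to an undefined Perl sub is intercepted and turned into one line of assembly text. A self-contained sketch of the same idiom follows (the sample register operands are invented for illustration; the module's real thunk works together with the regex post-processing pass at the end of the file):

    #!/usr/bin/env perl
    # Stand-alone sketch of the perlasm AUTOLOAD idiom used below: the
    # undefined-sub name becomes the mnemonic (the first '_' turns into
    # '.'), a bare-number final argument is prefixed with '#' to form an
    # immediate, and the finished line is appended to $code.
    use strict;
    our $code = "";
    sub AUTOLOAD {
        our $AUTOLOAD;
        my $opcode = $AUTOLOAD;
        $opcode =~ s/.*:://;        # strip package qualifier
        $opcode =~ s/_/\./;         # e.g. ushr_32 -> ushr.32
        my $arg = pop;
        $arg = "#$arg" if ($arg*1 eq $arg);
        $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
    }
    &add_32 ("v4.4s","v4.4s","v0.4s");   # emits: add.32  v4.4s,v4.4s,v0.4s
    &ushr_32("v6.4s","v0.4s",7);         # emits: ushr.32 v6.4s,v0.4s,#7
    print $code;

In the module itself, the dotted width suffixes emitted this way are later normalized by the final foreach loop over $code (for instance s/\.\w?32\b// strips them while mapping .16b arrangements to .4s), so the thunk only has to produce a uniform intermediate form.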
+ +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) { eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub 
Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#ifdef __KERNEL__ +.globl sha256_block_neon +#endif +.type sha256_block_neon,%function +.align 4 +sha256_block_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size sha256_block_neon,.-sha256_block_neon +___ +} + +if ($SZ==8) { +my $Ktbl="x3"; + +my @H = map("v$_.16b",(0..4)); +my ($fg,$de,$m9_10)=map("v$_.16b",(5..7)); +my @MSG=map("v$_.16b",(16..23)); +my ($W0,$W1)=("v24.2d","v25.2d"); +my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29)); + $code.=<<___; +#ifndef __KERNEL__ +.type sha512_block_armv8,%function +.align 6 +sha512_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input + ld1 {@MSG[4]-@MSG[7]},[$inp],#64 + + ld1.64 {@H[0]-@H[3]},[$ctx] // load context + adr $Ktbl,.LK512 + + rev64 @MSG[0],@MSG[0] + rev64 @MSG[1],@MSG[1] + rev64 @MSG[2],@MSG[2] + rev64 @MSG[3],@MSG[3] + rev64 @MSG[4],@MSG[4] + rev64 @MSG[5],@MSG[5] + rev64 @MSG[6],@MSG[6] + rev64 @MSG[7],@MSG[7] + b .Loop_hw + +.align 4 +.Loop_hw: + ld1.64 {$W0},[$Ktbl],#16 + subs $num,$num,#1 + sub x4,$inp,#128 + orr $AB,@H[0],@H[0] // offload + orr $CD,@H[1],@H[1] + orr $EF,@H[2],@H[2] + orr $GH,@H[3],@H[3] + csel $inp,$inp,x4,ne // conditional rewind +___ +for($i=0;$i<32;$i++) { +$code.=<<___; + add.i64 $W0,$W0,@MSG[0] + ld1.64 {$W1},[$Ktbl],#16 + ext $W0,$W0,$W0,#8 + ext $fg,@H[2],@H[3],#8 + ext $de,@H[1],@H[2],#8 + add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" + sha512su0 @MSG[0],@MSG[1] + ext $m9_10,@MSG[4],@MSG[5],#8 + sha512h @H[3],$fg,$de + sha512su1 @MSG[0],@MSG[7],$m9_10 + add.i64 @H[4],@H[1],@H[3] // "D + T1" + sha512h2 @H[3],$H[1],@H[0] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); + @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); +} +for(;$i<40;$i++) { +$code.=<<___ if ($i<39); + ld1.64 {$W1},[$Ktbl],#16 +___ +$code.=<<___ if ($i==39); + sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind +___ +$code.=<<___; + add.i64 $W0,$W0,@MSG[0] + ld1 {@MSG[0]},[$inp],#16 // load next input + ext $W0,$W0,$W0,#8 + ext $fg,@H[2],@H[3],#8 + ext $de,@H[1],@H[2],#8 + add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" + sha512h @H[3],$fg,$de + rev64 @MSG[0],@MSG[0] + add.i64 @H[4],@H[1],@H[3] // "D + T1" + sha512h2 @H[3],$H[1],@H[0] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); + @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); +} +$code.=<<___; + add.i64 @H[0],@H[0],$AB // accumulate + add.i64 @H[1],@H[1],$CD + add.i64 @H[2],@H[2],$EF + add.i64 @H[3],@H[3],$GH + + cbnz $num,.Loop_hw + + st1.64 {@H[0]-@H[3]},[$ctx] // store context + + ldr x29,[sp],#16 + ret +.size sha512_block_armv8,.-sha512_block_armv8 +#endif +___ +} + +$code.=<<___; +#ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 +#endif ___ { my %opcode = ( @@ -407,14 +859,43 @@ ___ } } +{ my %opcode = ( + "sha512h" => 0xce608000, "sha512h2" => 0xce608400, + "sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 ); + + sub unsha512 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + foreach(split("\n",$code)) { - s/\`([^\`]*)\`/eval($1)/geo; + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; - s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers - s/\.\w?32\b//o and s/\.16b/\.4s/go; - m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; + s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; print $_,"\n"; } diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64xplus.pl new file mode 100755 index 000000000000..9ebfc92e23ae --- /dev/null +++ b/crypto/sha/asm/sha512-c64xplus.pl @@ -0,0 +1,438 @@ +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). 
You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# SHA512 for C64x+. +# +# January 2012 +# +# Performance is 19 cycles per processed byte. Compared to block +# transform function from sha512.c compiled with cl6x with -mv6400+ +# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller. +# Loop unroll won't make it, this implementation, any faster, because +# it's effectively dominated by SHRU||SHL pairs and you can't schedule +# more of them. +# +# !!! Note that this module uses AMR, which means that all interrupt +# service routines are expected to preserve it and for own well-being +# zero it upon entry. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments + $K512="A3"; + +($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi, + $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31)); +($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo, + $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31)); + +($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13)); +($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13)); +($T1hi, $T2hi)= ("A6","A7"); +($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9"); +($Khi,$Klo)=("A9","A8"); +($MAJhi,$MAJlo)=($T2hi,$T2lo); +($t1hi,$t1lo)=($Khi,"B2"); + $CTXB=$t1lo; + +($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer + +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .nocmp + .asg sha512_block_data_order,_sha512_block_data_order + .endif + + .asg B3,RA + .asg A15,FP + .asg B15,SP + + .if .BIG_ENDIAN + .asg $Khi,KHI + .asg $Klo,KLO + .else + .asg $Khi,KLO + .asg $Klo,KHI + .endif + + .global _sha512_block_data_order +_sha512_block_data_order: +__sha512_block: + .asmfunc stack_usage(40+128) + MV $NUM,A0 ; reassign $NUM +|| MVK -128,B0 + [!A0] BNOP RA ; if ($NUM==0) return; +|| [A0] STW FP,*SP--(40) ; save frame pointer +|| [A0] MV SP,FP + [A0] STDW B13:B12,*SP[4] +|| [A0] MVK 0x00404,B1 + [A0] STDW B11:B10,*SP[3] +|| [A0] STDW A13:A12,*FP[-3] +|| [A0] MVKH 0x60000,B1 + [A0] STDW A11:A10,*SP[1] +|| [A0] MVC B1,AMR ; setup circular addressing +|| [A0] ADD B0,SP,SP ; alloca(128) + .if __TI_EABI__ + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC __sha512_block,B1 +|| [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512 + [A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .else + [A0] AND B0,SP,SP ; align stack at 128 bytes +|| [A0] ADDKPC __sha512_block,B1 +|| [A0] MVKL (K512-__sha512_block),$K512 + [A0] MVKH (K512-__sha512_block),$K512 +|| [A0] SUBAW SP,2,SP ; reserve two words above buffer + .endif + ADDAW SP,3,$Xilo + ADDAW SP,2,$Xihi + +|| MV $CTXA,$CTXB + LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx +|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo +|| ADD B1,$K512,$K512 + LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi +|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo + LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi +|| 
LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo + LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi +|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo + LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi +|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo + LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi +|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo + LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi +|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo + LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi +|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo + + LDNDW *$INP++,B11:B10 ; pre-fetch input + LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] +outerloop?: + MVK 15,B0 ; loop counters +|| MVK 64,B1 +|| SUB A0,1,A0 + MV $Ahi,$Actxhi +|| MV $Alo,$Actxlo +|| MV $Bhi,$Bctxhi +|| MV $Blo,$Bctxlo +|| MV $Chi,$Cctxhi +|| MV $Clo,$Cctxlo +|| MVD $Dhi,$Dctxhi +|| MVD $Dlo,$Dctxlo + MV $Ehi,$Ectxhi +|| MV $Elo,$Ectxlo +|| MV $Fhi,$Fctxhi +|| MV $Flo,$Fctxlo +|| MV $Ghi,$Gctxhi +|| MV $Glo,$Gctxlo +|| MVD $Hhi,$Hctxhi +|| MVD $Hlo,$Hctxlo +loop0_15?: + .if .BIG_ENDIAN + MV B11,$T1hi +|| MV B10,$T1lo + .else + SWAP4 B10,$T1hi +|| SWAP4 B11,$T1lo + SWAP2 $T1hi,$T1hi +|| SWAP2 $T1lo,$T1lo + .endif +loop16_79?: + STW $T1hi,*$Xihi++[2] +|| STW $T1lo,*$Xilo++[2] ; X[i] = T1 +|| ADD $Hhi,$T1hi,$T1hi +|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h +|| SHRU $Ehi,14,$S1hi +|| SHL $Ehi,32-14,$S1lo + XOR $Fhi,$Ghi,$CHhi +|| XOR $Flo,$Glo,$CHlo +|| ADD KHI,$T1hi,$T1hi +|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i] +|| SHRU $Elo,14,$t0lo +|| SHL $Elo,32-14,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Ehi,$CHhi,$CHhi +|| AND $Elo,$CHlo,$CHlo +|| ROTL $Ghi,0,$Hhi +|| ROTL $Glo,0,$Hlo ; h = g +|| SHRU $Ehi,18,$t0hi +|| SHL $Ehi,32-18,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| XOR $Ghi,$CHhi,$CHhi +|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g +|| ROTL $Fhi,0,$Ghi +|| ROTL $Flo,0,$Glo ; g = f +|| SHRU $Elo,18,$t0lo +|| SHL $Elo,32-18,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| OR $Ahi,$Bhi,$MAJhi +|| OR $Alo,$Blo,$MAJlo +|| ROTL $Ehi,0,$Fhi +|| ROTL $Elo,0,$Flo ; f = e +|| SHRU $Ehi,41-32,$t0lo +|| SHL $Ehi,64-41,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| AND $Chi,$MAJhi,$MAJhi +|| AND $Clo,$MAJlo,$MAJlo +|| ROTL $Dhi,0,$Ehi +|| ROTL $Dlo,0,$Elo ; e = d +|| SHRU $Elo,41-32,$t0hi +|| SHL $Elo,64-41,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e) +|| AND $Ahi,$Bhi,$t1hi +|| AND $Alo,$Blo,$t1lo +|| ROTL $Chi,0,$Dhi +|| ROTL $Clo,0,$Dlo ; d = c +|| SHRU $Ahi,28,$S0hi +|| SHL $Ahi,32-28,$S0lo + OR $t1hi,$MAJhi,$MAJhi +|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b) +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g) +|| ROTL $Bhi,0,$Chi +|| ROTL $Blo,0,$Clo ; c = b +|| SHRU $Alo,28,$t0lo +|| SHL $Alo,32-28,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e) +|| ROTL $Ahi,0,$Bhi +|| ROTL $Alo,0,$Blo ; b = a +|| SHRU $Ahi,34-32,$t0lo +|| SHL $Ahi,64-34,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $MAJhi,$T1hi,$T2hi +|| ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c) +|| SHRU $Alo,34-32,$t0hi +|| SHL $Alo,64-34,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $Ehi,$T1hi,$T1hi +|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e +|| [B0] BNOP loop0_15? +|| SHRU $Ahi,39-32,$t0lo +|| SHL $Ahi,64-39,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input +||[!B1] BNOP break? 
+|| SHRU $Alo,39-32,$t0hi +|| SHL $Alo,64-39,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a) +|| ADD $T1carry,$T1hi,$Ehi +|| MV $T1lo,$Elo ; e = T1 +||[!B0] LDW *${Xihi}[28],$T1hi +||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14] + ADD $S0hi,$T2hi,$T2hi +|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a) +|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i] + NOP ; avoid cross-path stall + ADD $T2carry,$T2hi,$Ahi +|| MV $T2lo,$Alo ; a = T2 +|| [B0] SUB B0,1,B0 +;;===== branch to loop0_15? is taken here + NOP +;;===== branch to break? is taken here + LDW *${Xihi}[2],$T2hi +|| LDW *${Xilo}[2],$T2lo ; X[i+1] +|| SHRU $T1hi,19,$S1hi +|| SHL $T1hi,32-19,$S1lo + SHRU $T1lo,19,$t0lo +|| SHL $T1lo,32-19,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,61-32,$t0lo +|| SHL $T1hi,64-61,$t0hi + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,61-32,$t0hi +|| SHL $T1lo,64-61,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1hi,6,$t0hi +|| SHL $T1hi,32-6,$t0lo + XOR $t0hi,$S1hi,$S1hi +|| XOR $t0lo,$S1lo,$S1lo +|| SHRU $T1lo,6,$t0lo +|| LDW *${Xihi}[18],$T1hi +|| LDW *${Xilo}[18],$T1lo ; X[i+9] + XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14]) + +|| LDW *${Xihi}[0],$CHhi +|| LDW *${Xilo}[0],$CHlo ; X[i] +|| SHRU $T2hi,1,$S0hi +|| SHL $T2hi,32-1,$S0lo + SHRU $T2lo,1,$t0lo +|| SHL $T2lo,32-1,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2hi,8,$t0hi +|| SHL $T2hi,32-8,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| SHRU $T2lo,8,$t0lo +|| SHL $T2lo,32-8,$t0hi + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $S1hi,$T1hi,$T1hi +|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1() +|| [B1] BNOP loop16_79? +|| SHRU $T2hi,7,$t0hi +|| SHL $T2hi,32-7,$t0lo + XOR $t0hi,$S0hi,$S0hi +|| XOR $t0lo,$S0lo,$S0lo +|| ADD $CHhi,$T1hi,$T1hi +|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i] +|| SHRU $T2lo,7,$t0lo + XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1]) + + ADD $S0hi,$T1hi,$T1hi +|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0() +|| [B1] SUB B1,1,B1 + NOP ; avoid cross-path stall + ADD $T1carry,$T1hi,$T1hi +;;===== branch to loop16_79? is taken here + +break?: + ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx +|| ADDU $Alo,$Actxlo,$Actxlo:$Alo +|| [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input +|| [A0] ADDK -640,$K512 ; rewind pointer to K512 + ADD $Bhi,$Bctxhi,$Bhi +|| ADDU $Blo,$Bctxlo,$Bctxlo:$Blo +|| [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0] + ADD $Chi,$Cctxhi,$Chi +|| ADDU $Clo,$Cctxlo,$Cctxlo:$Clo +|| ADD $Actxlo,$Ahi,$Ahi +||[!A0] MV $CTXA,$CTXB + ADD $Dhi,$Dctxhi,$Dhi +|| ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo +|| ADD $Bctxlo,$Bhi,$Bhi +||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx +||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN] + ADD $Ehi,$Ectxhi,$Ehi +|| ADDU $Elo,$Ectxlo,$Ectxlo:$Elo +|| ADD $Cctxlo,$Chi,$Chi +|| [A0] BNOP outerloop?
+||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN] +||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN] + ADD $Fhi,$Fctxhi,$Fhi +|| ADDU $Flo,$Fctxlo,$Fctxlo:$Flo +|| ADD $Dctxlo,$Dhi,$Dhi +||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN] +||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN] + ADD $Ghi,$Gctxhi,$Ghi +|| ADDU $Glo,$Gctxlo,$Gctxlo:$Glo +|| ADD $Ectxlo,$Ehi,$Ehi +||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN] +||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN] + ADD $Hhi,$Hctxhi,$Hhi +|| ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo +|| ADD $Fctxlo,$Fhi,$Fhi +||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN] +||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN] + ADD $Gctxlo,$Ghi,$Ghi +||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN] +||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN] + ADD $Hctxlo,$Hhi,$Hhi +||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN] +||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN] +;;===== branch to outerloop? is taken here + + STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN] +|| STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN] +|| MVK -40,B0 + ADD FP,B0,SP ; destroy circular buffer +|| LDDW *FP[-4],A11:A10 + LDDW *SP[2],A13:A12 +|| LDDW *FP[-2],B11:B10 + LDDW *SP[4],B13:B12 +|| BNOP RA + LDW *++SP(40),FP ; restore frame pointer + MVK 0,B0 + MVC B0,AMR ; clear AMR + NOP 2 ; wait till FP is committed + .endasmfunc + + .if __TI_EABI__ + .sect ".text:sha_asm.const" + .else + .sect ".const:sha_asm" + .endif + .align 128 +K512: + .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd + .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc + .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 + .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 + .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe + .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 + .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 + .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 + .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 + .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 + .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 + .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 + .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 + .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 + .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 + .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 + .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 + .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df + .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 + .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b + .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 + .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 + .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 + .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 + .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 + .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 + .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb + .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 + .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 + .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec + .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 + .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b + .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 + .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 + .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 + .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b + .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 + .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c + .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a + .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 + .cstring "SHA512 
block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>" + .align 4 +___ + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha512-ia64.pl b/crypto/sha/asm/sha512-ia64.pl index 59f889a09594..356a46aced78 100755 --- a/crypto/sha/asm/sha512-ia64.pl +++ b/crypto/sha/asm/sha512-ia64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -68,7 +75,7 @@ # To generate code, pass the file name with either 256 or 512 in its # name and compiler flags. -$output=shift; +$output=pop; if ($output =~ /512.*\.[s|asm]/) { $SZ=8; diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl index b468cfb4569e..dab684dde5bc 100755 --- a/crypto/sha/asm/sha512-mips.pl +++ b/crypto/sha/asm/sha512-mips.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -52,15 +59,17 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { - $PTR_ADD="dadd"; # incidentally works even on n32 - $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_LA="dla"; + $PTR_ADD="daddu"; # incidentally works even on n32 + $PTR_SUB="dsubu"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $PTR_SLL="dsll"; # incidentally works even on n32 $SZREG=8; } else { - $PTR_ADD="add"; - $PTR_SUB="sub"; + $PTR_LA="la"; + $PTR_ADD="addu"; + $PTR_SUB="subu"; $REG_S="sw"; $REG_L="lw"; $PTR_SLL="sll"; @@ -72,9 +81,9 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2; # ###################################################################### -$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC}); +$big_endian=(`echo MIPSEB | $ENV{CC} -E -`=~/MIPSEB/)?0:1 if ($ENV{CC}); -for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } open STDOUT,">$output"; if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } @@ -126,8 +135,12 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); $code.=<<___ if ($i<15); +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + ${LD} @X[1],`($i+1)*$SZ`($inp) +#else ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) +#endif ___ $code.=<<___ if (!$big_endian && $i<16 && $SZ==4); #if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2) @@ -286,16 +299,10 @@ ___ } $FRAMESIZE=16*$SZ+16*$SZREG; -$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0xc0fff008" : "0xc0ff0000"; $code.=<<___; -#ifdef OPENSSL_FIPSCANISTER -# include <openssl/fipssyms.h> -#endif - -#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2) -#define _MIPS_ARCH_MIPS32R2 -#endif +#include "mips_arch.h" .text .set noat @@ -343,7 +350,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification ___ $code.=<<___; .set reorder - la $Ktbl,K${label} # PIC-ified 'load address' + $PTR_LA $Ktbl,K${label} # PIC-ified 'load address' $LD $A,0*$SZ($ctx) # load context $LD $B,1*$SZ($ctx) @@ -360,8 +367,12 @@ $code.=<<___; .align 5 .Loop: +#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6) + ${LD} @X[0],($inp) +#else ${LD}l @X[0],$MSB($inp) ${LD}r @X[0],$LSB($inp) +#endif ___ for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } diff --git a/crypto/sha/asm/sha512-parisc.pl b/crypto/sha/asm/sha512-parisc.pl index 6cad72e25573..59eb320ab6ed 100755 --- a/crypto/sha/asm/sha512-parisc.pl +++ b/crypto/sha/asm/sha512-parisc.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -361,7 +368,7 @@ L\$parisc1 ___ @V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, - $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = + $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); $a0 ="%r17"; @@ -412,7 +419,7 @@ $code.=<<___; add $t0,$hlo,$hlo shd $ahi,$alo,$Sigma0[0],$t0 addc $t1,$hhi,$hhi ; h += Sigma1(e) - shd $alo,$ahi,$Sigma0[0],$t1 + shd $alo,$ahi,$Sigma0[0],$t1 add $a0,$hlo,$hlo shd $ahi,$alo,$Sigma0[1],$t2 addc $a1,$hhi,$hhi ; h += Ch(e,f,g) @@ -760,13 +767,18 @@ sub assemble { ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; } +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler/) { + $gnuas = 1; +} + foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 : sprintf("shd\t%$1,%$2,%d",$3)/e or - # translate made up instructons: _ror, _shr, _align, _shl + # translate made up instructions: _ror, _shr, _align, _shl s/_ror(\s+)(%r[0-9]+),/ ($SZ==4 ? 
"shd" : "shrpd")."$1$2,$2,"/e or @@ -783,9 +795,11 @@ foreach (split("\n",$code)) { s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); - s/cmpb,\*/comb,/ if ($SIZE_T==4); - - s/\bbv\b/bve/ if ($SIZE_T==8); + s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8); + s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8); + s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8); + s/cmpb,\*/comb,/ if ($SIZE_T==4); + s/\bbv\b/bve/ if ($SIZE_T==8); print $_,"\n"; } diff --git a/crypto/sha/asm/sha512-ppc.pl b/crypto/sha/asm/sha512-ppc.pl index 17fdc6e8e5a9..71699f663706 100755 --- a/crypto/sha/asm/sha512-ppc.pl +++ b/crypto/sha/asm/sha512-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -19,7 +26,7 @@ # # (*) 64-bit code in 32-bit application context, which actually is # on TODO list. It should be noted that for safe deployment in -# 32-bit *mutli-threaded* context asyncronous signals should be +# 32-bit *multi-threaded* context asynchronous signals should be # blocked upon entry to SHA512 block routine. This is because # 32-bit signaling procedure invalidates upper halves of GPRs. # Context switch procedure preserves them, but not signaling:-( diff --git a/crypto/sha/asm/sha512-s390x.pl b/crypto/sha/asm/sha512-s390x.pl index 9c10e4e9ee74..4c0f4e79315b 100755 --- a/crypto/sha/asm/sha512-s390x.pl +++ b/crypto/sha/asm/sha512-s390x.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -33,7 +40,7 @@ # instructions and achieve "64-bit" performance even in 31-bit legacy # application context. The feature is not specific to any particular # processor, as long as it's "z-CPU". Latter implies that the code -# remains z/Architecture specific. On z900 SHA256 was measured to +# remains z/Architecture specific. On z990 SHA256 was measured to # perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. 
$flavour = shift; @@ -64,7 +71,7 @@ $tbl="%r13"; $T1="%r14"; $sp="%r15"; -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($output =~ /512/) { @@ -163,6 +170,8 @@ ___ } $code.=<<___; +#include "s390x_arch.h" + .text .align 64 .type $Table,\@object @@ -237,10 +246,7 @@ $Func: ___ $code.=<<___ if ($kimdfunc); larl %r1,OPENSSL_s390xcap_P - lg %r0,0(%r1) - tmhl %r0,0x4000 # check for message-security assist - jz .Lsoftware - lg %r0,16(%r1) # check kimd capabilities + lg %r0,S390X_KIMD(%r1) # check kimd capabilities tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware lghi %r0,$kimdfunc @@ -304,11 +310,10 @@ $code.=<<___; cl${g} $inp,`$frame+4*$SIZE_T`($sp) jne .Lloop - lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .size $Func,.-$Func .string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" -.comm OPENSSL_s390xcap_P,80,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/crypto/sha/asm/sha512-sparcv9.pl b/crypto/sha/asm/sha512-sparcv9.pl index 5a9c15d1d34a..4432bda65ab5 100755 --- a/crypto/sha/asm/sha512-sparcv9.pl +++ b/crypto/sha/asm/sha512-sparcv9.pl @@ -1,12 +1,19 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # -# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>. +# Hardware SPARC T4 support by David S. Miller # ==================================================================== # SHA256 performance improvement over compiler generated code varies @@ -49,7 +56,7 @@ # saturates at 11.5x single-process result on 8-core processor, or # ~11/16GBps per 2.85GHz socket. -$output=shift; +$output=pop; open STDOUT,">$output"; if ($output =~ /512/) { @@ -95,7 +102,7 @@ if ($output =~ /512/) { $locals=0; # X[16] is register resident @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7"); - + $A="%l0"; $B="%l1"; $C="%l2"; @@ -247,7 +254,7 @@ $code.=<<___; $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1 xor $tmp0,$h,$h $SRL $a,@Sigma0[2],$tmp0 - xor $tmp1,$h,$h + xor $tmp1,$h,$h $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1 xor $tmp0,$h,$h xor $tmp1,$h,$h ! Sigma0(a) @@ -791,7 +798,7 @@ ___ # Purpose of these subroutines is to explicitly encode VIS instructions, # so that one can compile the module without having to specify VIS -# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. # Idea is to reserve for option to produce "universal" binary and let # programmer detect if current CPU is VIS capable at run-time. sub unvis { diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl index 78e445f3fe4a..f2ebdfdb68b6 100755 --- a/crypto/sha/asm/sha512-x86_64.pl +++ b/crypto/sha/asm/sha512-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! 
/usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -34,7 +41,7 @@ # level parallelism, on a given CPU implementation in this case. # # Special note on Intel EM64T. While Opteron CPU exhibits perfect -# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], +# performance ratio of 1.5 between 64- and 32-bit flavors [see above], # [currently available] EM64T CPUs apparently are far from it. On the # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit # sha256_block:-( This is presumably because 64-bit shifts/rotates @@ -86,12 +93,16 @@ # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) +# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) +# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) # VIA Nano 23.0 16.5(+39%) - 14.7 - # Atom 23.0 18.9(+22%) - 14.7 - # Silvermont 27.4 20.6(+33%) - 17.5 - +# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) +# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - # -# (*) whichever best applicable; +# (*) whichever best applicable, including SHAEXT; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions @@ -131,7 +142,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $shaext=1; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if ($output =~ /512/) { @@ -167,7 +178,7 @@ $Tbl="%rbp"; $_ctx="16*$SZ+0*8(%rsp)"; $_inp="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; -$_rsp="16*$SZ+3*8(%rsp)"; +$_rsp="`16*$SZ+3*8`(%rsp)"; $framesz="16*$SZ+4*8"; @@ -260,6 +271,7 @@ $code=<<___; .type $func,\@function,3 .align 16 $func: +.cfi_startproc ___ $code.=<<___ if ($SZ==4 || $avx); lea OPENSSL_ia32cap_P(%rip),%r11 @@ -292,13 +304,20 @@ $code.=<<___ if ($SZ==4); jnz .Lssse3_shortcut ___ $code.=<<___; + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 shl \$4,%rdx # num*16 sub \$$framesz,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -306,7 +325,8 @@ $code.=<<___; mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 .Lprologue: mov $SZ*0($ctx),$A @@ -373,15 +393,24 @@ $code.=<<___; jb .Lloop mov $_rsp,%rsi - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 
+ mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: ret +.cfi_endproc .size $func,.-$func ___ @@ -751,14 +780,22 @@ $code.=<<___; .type ${func}_ssse3,\@function,3 .align 64 ${func}_ssse3: +.cfi_startproc .Lssse3_shortcut: + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*4`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -766,7 +803,8 @@ ${func}_ssse3: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -1065,6 +1103,7 @@ $code.=<<___; jb .Lloop_ssse3 mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 @@ -1073,15 +1112,23 @@ $code.=<<___ if ($win64); movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret +.cfi_endproc .size ${func}_ssse3,.-${func}_ssse3 ___ } @@ -1095,14 +1142,22 @@ $code.=<<___; .type ${func}_xop,\@function,3 .align 64 ${func}_xop: +.cfi_startproc .Lxop_shortcut: + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -1110,7 +1165,8 @@ ${func}_xop: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -1437,6 +1493,7 @@ $code.=<<___; jb .Lloop_xop mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); @@ -1450,15 +1507,23 @@ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_xop: ret +.cfi_endproc .size ${func}_xop,.-${func}_xop ___ } @@ -1471,14 +1536,22 @@ $code.=<<___; .type ${func}_avx,\@function,3 .align 64 ${func}_avx: +.cfi_startproc .Lavx_shortcut: + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 
push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -1486,7 +1559,8 @@ ${func}_avx: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -1745,6 +1819,7 @@ $code.=<<___; jb .Lloop_avx mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); @@ -1758,15 +1833,23 @@ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: ret +.cfi_endproc .size ${func}_avx,.-${func}_avx ___ @@ -1774,7 +1857,7 @@ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # -my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; @@ -1822,14 +1905,22 @@ $code.=<<___; .type ${func}_avx2,\@function,3 .align 64 ${func}_avx2: +.cfi_startproc .Lavx2_shortcut: + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp shl \$4,%rdx # num*16 and \$-256*$SZ,%rsp # align stack frame @@ -1838,7 +1929,8 @@ ${func}_avx2: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -2119,6 +2211,7 @@ $code.=<<___; .Ldone_avx2: lea ($Tbl),%rsp mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); @@ -2132,15 +2225,23 @@ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: ret +.cfi_endproc .size ${func}_avx2,.-${func}_avx2 ___ }} @@ -2200,7 +2301,6 @@ ___ $code.=<<___; mov %rax,%rsi # put aside Rsp mov 16*$SZ+3*8(%rax),%rax # pull $_rsp - lea 48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl index 47189502c6cc..2792800b475c 100755 --- a/crypto/sha/asm/sha512p8-ppc.pl +++ b/crypto/sha/asm/sha512p8-ppc.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved. 
+# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -18,11 +25,20 @@ # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting # result is degree of computational resources' utilization. POWER8 is # "massively multi-threaded chip" and difference between single- and -# maximum multi-process benchmark results tells that utlization is +# maximum multi-process benchmark results tells that utilization is # whooping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and # for sha1-ppc.pl - 73%. 100% means that multi-process result equals # to single-process one, given that all threads end up on the same # physical core. +# +###################################################################### +# Believed-to-be-accurate results in cycles per processed byte [on +# little-endian system]. Numbers in square brackets are for 64-bit +# build of sha512-ppc.pl, presented for reference. +# +# POWER8 POWER9 +# SHA256 9.7 [15.8] 11.2 [12.5] +# SHA512 6.1 [10.3] 7.0 [7.9] $flavour=shift; $output =shift; @@ -63,7 +79,8 @@ if ($output =~ /512/) { } $func="sha${bits}_block_p8"; -$FRAME=8*$SIZE_T; +$LOCALS=8*$SIZE_T+8*16; +$FRAME=$LOCALS+9*16+6*$SIZE_T; $sp ="r1"; $toc="r2"; @@ -75,16 +92,16 @@ $idx="r7"; $lrsave="r8"; $offload="r11"; $vrsave="r12"; -($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31)); - $x00=0 if ($flavour =~ /osx/); +@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0,map("r$_",(10,26..31))); @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7)); -@X=map("v$_",(8..23)); -($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31)); +@X=map("v$_",(8..19,24..27)); +($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31)); sub ROUND { my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; my $j=($i+1)%16; +my $k=($i+2)%8; $code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1)); lvx_u @X[$i+1],0,$inp ; load X[i] in advance @@ -96,26 +113,30 @@ ___ $code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0); vperm @X[$i],@X[$i],@X[$i],$lemask ___ +$code.=<<___ if ($i>=15); + vshasigma${sz} $Sigma,@X[($j+1)%16],0,0 + vaddu${sz}m @X[$j],@X[$j],$Sigma + vshasigma${sz} $Sigma,@X[($j+14)%16],0,15 + vaddu${sz}m @X[$j],@X[$j],$Sigma + vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16] +___ $code.=<<___; - `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)` - vsel $Func,$g,$f,$e ; Ch(e,f,g) - vshasigma${sz} $S1,$e,1,15 ; Sigma1(e) vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i] - vshasigma${sz} $S0,$a,1,0 ; Sigma0(a) - `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)` + vsel $Func,$g,$f,$e ; Ch(e,f,g) + vaddu${sz}m $g,$g,$Ki ; future h+=K[i] vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g) + vshasigma${sz} $Sigma,$e,1,15 ; Sigma1(e) + vaddu${sz}m $h,$h,$Sigma ; h+=Sigma1(e) vxor $Func,$a,$b - `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)` - vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e) vsel $Func,$b,$c,$Func ; Maj(a,b,c) - vaddu${sz}m $g,$g,$Ki ; future h+=K[i] vaddu${sz}m $d,$d,$h ; d+=h - vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c) - `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)` - lvx $Ki,$idx,$Tbl ; load next K[i] - addi $idx,$idx,16 - vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c) - `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)` + vshasigma${sz} $Sigma,$a,1,0 ; Sigma0(a) + vaddu${sz}m $Sigma,$Sigma,$Func ; 
Sigma0(a)+Maj(a,b,c) + vaddu${sz}m $h,$h,$Sigma ; h+=Sigma0(a)+Maj(a,b,c) + lvx $Ki,@I[$k],$idx ; load next K[i] +___ +$code.=<<___ if ($k == 7); + addi $idx,$idx,0x80 ___ } @@ -126,21 +147,13 @@ $code=<<___; .globl $func .align 6 $func: - $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + $STU $sp,-$FRAME($sp) mflr $lrsave - li r10,`$FRAME+8*16+15` - li r11,`$FRAME+8*16+31` - stvx v20,r10,$sp # ABI says so + li r10,`$LOCALS+15` + li r11,`$LOCALS+31` + stvx v24,r10,$sp # ABI says so addi r10,r10,32 mfspr $vrsave,256 - stvx v21,r11,$sp - addi r11,r11,32 - stvx v22,r10,$sp - addi r10,r10,32 - stvx v23,r11,$sp - addi r11,r11,32 - stvx v24,r10,$sp - addi r10,r10,32 stvx v25,r11,$sp addi r11,r11,32 stvx v26,r10,$sp @@ -153,26 +166,26 @@ $func: addi r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp - li r11,-1 - stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li r11,-4096+255 + stw $vrsave,`$FRAME+6*$SIZE_T-4`($sp) # save vrsave li $x10,0x10 - $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $PUSH r26,`$FRAME-6*$SIZE_T`($sp) li $x20,0x20 - $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $PUSH r27,`$FRAME-5*$SIZE_T`($sp) li $x30,0x30 - $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $PUSH r28,`$FRAME-4*$SIZE_T`($sp) li $x40,0x40 - $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $PUSH r29,`$FRAME-3*$SIZE_T`($sp) li $x50,0x50 - $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $PUSH r30,`$FRAME-2*$SIZE_T`($sp) li $x60,0x60 - $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + $PUSH r31,`$FRAME-1*$SIZE_T`($sp) li $x70,0x70 - $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + $PUSH $lrsave,`$FRAME+$LRSAVE`($sp) mtspr 256,r11 bl LPICmeup - addi $offload,$sp,$FRAME+15 + addi $offload,$sp,`8*$SIZE_T+15` ___ $code.=<<___ if ($LENDIAN); li $idx,8 @@ -206,9 +219,9 @@ $code.=<<___; .align 5 Loop: lvx $Ki,$x00,$Tbl - li $idx,16 lvx_u @X[0],0,$inp addi $inp,$inp,16 + mr $idx,$Tbl # copy $Tbl stvx $A,$x00,$offload # offload $A-$H stvx $B,$x10,$offload stvx $C,$x20,$offload @@ -218,8 +231,7 @@ Loop: stvx $G,$x60,$offload stvx $H,$x70,$offload vaddu${sz}m $H,$H,$Ki # h+K[i] - lvx $Ki,$idx,$Tbl - addi $idx,$idx,16 + lvx $Ki,$x10,$Tbl ___ for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); } $code.=<<___; @@ -252,10 +264,9 @@ $code.=<<___; bne Loop ___ $code.=<<___ if ($SZ==4); - lvx @X[0],$idx,$Tbl - addi $idx,$idx,16 + lvx @X[0],$x20,$idx vperm $A,$A,$B,$Ki # pack the answer - lvx @X[1],$idx,$Tbl + lvx @X[1],$x30,$idx vperm $E,$E,$F,$Ki vperm $A,$A,$C,@X[0] vperm $E,$E,$G,@X[0] @@ -275,19 +286,11 @@ $code.=<<___ if ($SZ==8); stvx_u $G,$x30,$ctx ___ $code.=<<___; - li r10,`$FRAME+8*16+15` + li r10,`$LOCALS+15` mtlr $lrsave - li r11,`$FRAME+8*16+31` + li r11,`$LOCALS+31` mtspr 256,$vrsave - lvx v20,r10,$sp # ABI says so - addi r10,r10,32 - lvx v21,r11,$sp - addi r11,r11,32 - lvx v22,r10,$sp - addi r10,r10,32 - lvx v23,r11,$sp - addi r11,r11,32 - lvx v24,r10,$sp + lvx v24,r10,$sp # ABI says so addi r10,r10,32 lvx v25,r11,$sp addi r11,r11,32 @@ -301,13 +304,13 @@ $code.=<<___; addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp - $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) - $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) - $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) - $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) - $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) - $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) - addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + $POP r26,`$FRAME-6*$SIZE_T`($sp) + $POP r27,`$FRAME-5*$SIZE_T`($sp) + $POP r28,`$FRAME-4*$SIZE_T`($sp) + $POP r29,`$FRAME-3*$SIZE_T`($sp) + $POP r30,`$FRAME-2*$SIZE_T`($sp) + $POP r31,`$FRAME-1*$SIZE_T`($sp) + addi $sp,$sp,$FRAME blr .long 0 .byte 
0,12,4,1,0x80,6,3,0 diff --git a/crypto/sha/build.info b/crypto/sha/build.info new file mode 100644 index 000000000000..5dd5a9941d34 --- /dev/null +++ b/crypto/sha/build.info @@ -0,0 +1,89 @@ +LIBS=../../libcrypto +SOURCE[../../libcrypto]=\ + sha1dgst.c sha1_one.c sha256.c sha512.c {- $target{sha1_asm_src} -} \ + {- $target{keccak1600_asm_src} -} + +GENERATE[sha1-586.s]=asm/sha1-586.pl \ + $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR) +DEPEND[sha1-586.s]=../perlasm/x86asm.pl +GENERATE[sha256-586.s]=asm/sha256-586.pl \ + $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR) +DEPEND[sha256-586.s]=../perlasm/x86asm.pl +GENERATE[sha512-586.s]=asm/sha512-586.pl \ + $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR) +DEPEND[sha512-586.s]=../perlasm/x86asm.pl + +GENERATE[sha1-ia64.s]=asm/sha1-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS) +GENERATE[sha256-ia64.s]=asm/sha512-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS) +GENERATE[sha512-ia64.s]=asm/sha512-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS) + +GENERATE[sha1-alpha.S]=asm/sha1-alpha.pl $(PERLASM_SCHEME) + +GENERATE[sha1-x86_64.s]=asm/sha1-x86_64.pl $(PERLASM_SCHEME) +GENERATE[sha1-mb-x86_64.s]=asm/sha1-mb-x86_64.pl $(PERLASM_SCHEME) +GENERATE[sha256-x86_64.s]=asm/sha512-x86_64.pl $(PERLASM_SCHEME) +GENERATE[sha256-mb-x86_64.s]=asm/sha256-mb-x86_64.pl $(PERLASM_SCHEME) +GENERATE[sha512-x86_64.s]=asm/sha512-x86_64.pl $(PERLASM_SCHEME) +GENERATE[keccak1600-x86_64.s]=asm/keccak1600-x86_64.pl $(PERLASM_SCHEME) + +GENERATE[sha1-sparcv9.S]=asm/sha1-sparcv9.pl $(PERLASM_SCHEME) +INCLUDE[sha1-sparcv9.o]=.. +GENERATE[sha256-sparcv9.S]=asm/sha512-sparcv9.pl $(PERLASM_SCHEME) +INCLUDE[sha256-sparcv9.o]=.. +GENERATE[sha512-sparcv9.S]=asm/sha512-sparcv9.pl $(PERLASM_SCHEME) +INCLUDE[sha512-sparcv9.o]=.. + +GENERATE[sha1-ppc.s]=asm/sha1-ppc.pl $(PERLASM_SCHEME) +GENERATE[sha256-ppc.s]=asm/sha512-ppc.pl $(PERLASM_SCHEME) +GENERATE[sha512-ppc.s]=asm/sha512-ppc.pl $(PERLASM_SCHEME) +GENERATE[sha256p8-ppc.s]=asm/sha512p8-ppc.pl $(PERLASM_SCHEME) +GENERATE[sha512p8-ppc.s]=asm/sha512p8-ppc.pl $(PERLASM_SCHEME) +GENERATE[keccak1600-ppc64.s]=asm/keccak1600-ppc64.pl $(PERLASM_SCHEME) + +GENERATE[sha1-parisc.s]=asm/sha1-parisc.pl $(PERLASM_SCHEME) +GENERATE[sha256-parisc.s]=asm/sha512-parisc.pl $(PERLASM_SCHEME) +GENERATE[sha512-parisc.s]=asm/sha512-parisc.pl $(PERLASM_SCHEME) + +GENERATE[sha1-mips.S]=asm/sha1-mips.pl $(PERLASM_SCHEME) +INCLUDE[sha1-mips.o]=.. +GENERATE[sha256-mips.S]=asm/sha512-mips.pl $(PERLASM_SCHEME) +INCLUDE[sha256-mips.o]=.. +GENERATE[sha512-mips.S]=asm/sha512-mips.pl $(PERLASM_SCHEME) +INCLUDE[sha512-mips.o]=.. + +GENERATE[sha1-armv4-large.S]=asm/sha1-armv4-large.pl $(PERLASM_SCHEME) +INCLUDE[sha1-armv4-large.o]=.. +GENERATE[sha256-armv4.S]=asm/sha256-armv4.pl $(PERLASM_SCHEME) +INCLUDE[sha256-armv4.o]=.. +GENERATE[sha512-armv4.S]=asm/sha512-armv4.pl $(PERLASM_SCHEME) +INCLUDE[sha512-armv4.o]=.. +GENERATE[keccak1600-armv4.S]=asm/keccak1600-armv4.pl $(PERLASM_SCHEME) +INCLUDE[keccak1600-armv4.o]=.. + +GENERATE[sha1-armv8.S]=asm/sha1-armv8.pl $(PERLASM_SCHEME) +INCLUDE[sha1-armv8.o]=.. +GENERATE[sha256-armv8.S]=asm/sha512-armv8.pl $(PERLASM_SCHEME) +INCLUDE[sha256-armv8.o]=.. +GENERATE[sha512-armv8.S]=asm/sha512-armv8.pl $(PERLASM_SCHEME) +INCLUDE[sha512-armv8.o]=.. +GENERATE[keccak1600-armv8.S]=asm/keccak1600-armv8.pl $(PERLASM_SCHEME) + +GENERATE[sha1-s390x.S]=asm/sha1-s390x.pl $(PERLASM_SCHEME) +INCLUDE[sha1-s390x.o]=.. +GENERATE[sha256-s390x.S]=asm/sha512-s390x.pl $(PERLASM_SCHEME) +INCLUDE[sha256-s390x.o]=.. 
+GENERATE[sha512-s390x.S]=asm/sha512-s390x.pl $(PERLASM_SCHEME) +INCLUDE[sha512-s390x.o]=.. +GENERATE[keccak1600-s390x.S]=asm/keccak1600-s390x.pl $(PERLASM_SCHEME) + +BEGINRAW[Makefile(unix)] +##### SHA assembler implementations + +# GNU make "catch all" +{- $builddir -}/sha1-%.S: {- $sourcedir -}/asm/sha1-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ +{- $builddir -}/sha256-%.S: {- $sourcedir -}/asm/sha512-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ +{- $builddir -}/sha512-%.S: {- $sourcedir -}/asm/sha512-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ +ENDRAW[Makefile(unix)] diff --git a/crypto/sha/keccak1600.c b/crypto/sha/keccak1600.c new file mode 100644 index 000000000000..e7223486af5b --- /dev/null +++ b/crypto/sha/keccak1600.c @@ -0,0 +1,1246 @@ +/* + * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#include <openssl/e_os2.h> +#include <string.h> +#include <assert.h> + +size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len, + size_t r); +void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r); + +#if !defined(KECCAK1600_ASM) || !defined(SELFTEST) + +/* + * Choose some sensible defaults + */ +#if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \ + !defined(KECCAK_2X) && !defined(KECCAK_INPLACE) +# define KECCAK_2X /* default to KECCAK_2X variant */ +#endif + +#if defined(__i386) || defined(__i386__) || defined(_M_IX86) +# define KECCAK_COMPLEMENTING_TRANSFORM +#endif + +#if defined(__x86_64__) || defined(__aarch64__) || \ + defined(__mips64) || defined(__ia64) || \ + (defined(__VMS) && !defined(__vax)) +/* + * These are available even in ILP32 flavours, but even then they are + * capable of performing 64-bit operations as efficiently as in *P64. + * Since it's not given that we can use sizeof(void *), just shunt it. + */ +# define BIT_INTERLEAVE (0) +#else +# define BIT_INTERLEAVE (sizeof(void *) < 8) +#endif + +#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31))) + +static uint64_t ROL64(uint64_t val, int offset) +{ + if (offset == 0) { + return val; + } else if (!BIT_INTERLEAVE) { + return (val << offset) | (val >> (64-offset)); + } else { + uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val; + + if (offset & 1) { + uint32_t tmp = hi; + + offset >>= 1; + hi = ROL32(lo, offset); + lo = ROL32(tmp, offset + 1); + } else { + offset >>= 1; + lo = ROL32(lo, offset); + hi = ROL32(hi, offset); + } + + return ((uint64_t)hi << 32) | lo; + } +} + +static const unsigned char rhotates[5][5] = { + { 0, 1, 62, 28, 27 }, + { 36, 44, 6, 55, 20 }, + { 3, 10, 43, 25, 39 }, + { 41, 45, 15, 21, 8 }, + { 18, 2, 61, 56, 14 } +}; + +static const uint64_t iotas[] = { + BIT_INTERLEAVE ? 0x0000000000000001U : 0x0000000000000001U, + BIT_INTERLEAVE ? 0x0000008900000000U : 0x0000000000008082U, + BIT_INTERLEAVE ? 0x8000008b00000000U : 0x800000000000808aU, + BIT_INTERLEAVE ? 0x8000808000000000U : 0x8000000080008000U, + BIT_INTERLEAVE ? 0x0000008b00000001U : 0x000000000000808bU, + BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U, + BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U, + BIT_INTERLEAVE ? 0x8000008200000001U : 0x8000000000008009U, + BIT_INTERLEAVE ? 
0x0000000b00000000U : 0x000000000000008aU, + BIT_INTERLEAVE ? 0x0000000a00000000U : 0x0000000000000088U, + BIT_INTERLEAVE ? 0x0000808200000001U : 0x0000000080008009U, + BIT_INTERLEAVE ? 0x0000800300000000U : 0x000000008000000aU, + BIT_INTERLEAVE ? 0x0000808b00000001U : 0x000000008000808bU, + BIT_INTERLEAVE ? 0x8000000b00000001U : 0x800000000000008bU, + BIT_INTERLEAVE ? 0x8000008a00000001U : 0x8000000000008089U, + BIT_INTERLEAVE ? 0x8000008100000001U : 0x8000000000008003U, + BIT_INTERLEAVE ? 0x8000008100000000U : 0x8000000000008002U, + BIT_INTERLEAVE ? 0x8000000800000000U : 0x8000000000000080U, + BIT_INTERLEAVE ? 0x0000008300000000U : 0x000000000000800aU, + BIT_INTERLEAVE ? 0x8000800300000000U : 0x800000008000000aU, + BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U, + BIT_INTERLEAVE ? 0x8000008800000000U : 0x8000000000008080U, + BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U, + BIT_INTERLEAVE ? 0x8000808200000000U : 0x8000000080008008U +}; + +#if defined(KECCAK_REF) +/* + * This is straightforward or "maximum clarity" implementation aiming + * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard: + * Permutation-Based Hash and Extendible-Output Functions" as much as + * possible. With one caveat. Because of the way C stores matrices, + * references to A[x,y] in the specification are presented as A[y][x]. + * Implementation unrolls inner x-loops so that modulo 5 operations are + * explicitly pre-computed. + */ +static void Theta(uint64_t A[5][5]) +{ + uint64_t C[5], D[5]; + size_t y; + + C[0] = A[0][0]; + C[1] = A[0][1]; + C[2] = A[0][2]; + C[3] = A[0][3]; + C[4] = A[0][4]; + + for (y = 1; y < 5; y++) { + C[0] ^= A[y][0]; + C[1] ^= A[y][1]; + C[2] ^= A[y][2]; + C[3] ^= A[y][3]; + C[4] ^= A[y][4]; + } + + D[0] = ROL64(C[1], 1) ^ C[4]; + D[1] = ROL64(C[2], 1) ^ C[0]; + D[2] = ROL64(C[3], 1) ^ C[1]; + D[3] = ROL64(C[4], 1) ^ C[2]; + D[4] = ROL64(C[0], 1) ^ C[3]; + + for (y = 0; y < 5; y++) { + A[y][0] ^= D[0]; + A[y][1] ^= D[1]; + A[y][2] ^= D[2]; + A[y][3] ^= D[3]; + A[y][4] ^= D[4]; + } +} + +static void Rho(uint64_t A[5][5]) +{ + size_t y; + + for (y = 0; y < 5; y++) { + A[y][0] = ROL64(A[y][0], rhotates[y][0]); + A[y][1] = ROL64(A[y][1], rhotates[y][1]); + A[y][2] = ROL64(A[y][2], rhotates[y][2]); + A[y][3] = ROL64(A[y][3], rhotates[y][3]); + A[y][4] = ROL64(A[y][4], rhotates[y][4]); + } +} + +static void Pi(uint64_t A[5][5]) +{ + uint64_t T[5][5]; + + /* + * T = A + * A[y][x] = T[x][(3*y+x)%5] + */ + memcpy(T, A, sizeof(T)); + + A[0][0] = T[0][0]; + A[0][1] = T[1][1]; + A[0][2] = T[2][2]; + A[0][3] = T[3][3]; + A[0][4] = T[4][4]; + + A[1][0] = T[0][3]; + A[1][1] = T[1][4]; + A[1][2] = T[2][0]; + A[1][3] = T[3][1]; + A[1][4] = T[4][2]; + + A[2][0] = T[0][1]; + A[2][1] = T[1][2]; + A[2][2] = T[2][3]; + A[2][3] = T[3][4]; + A[2][4] = T[4][0]; + + A[3][0] = T[0][4]; + A[3][1] = T[1][0]; + A[3][2] = T[2][1]; + A[3][3] = T[3][2]; + A[3][4] = T[4][3]; + + A[4][0] = T[0][2]; + A[4][1] = T[1][3]; + A[4][2] = T[2][4]; + A[4][3] = T[3][0]; + A[4][4] = T[4][1]; +} + +static void Chi(uint64_t A[5][5]) +{ + uint64_t C[5]; + size_t y; + + for (y = 0; y < 5; y++) { + C[0] = A[y][0] ^ (~A[y][1] & A[y][2]); + C[1] = A[y][1] ^ (~A[y][2] & A[y][3]); + C[2] = A[y][2] ^ (~A[y][3] & A[y][4]); + C[3] = A[y][3] ^ (~A[y][4] & A[y][0]); + C[4] = A[y][4] ^ (~A[y][0] & A[y][1]); + + A[y][0] = C[0]; + A[y][1] = C[1]; + A[y][2] = C[2]; + A[y][3] = C[3]; + A[y][4] = C[4]; + } +} + +static void Iota(uint64_t A[5][5], size_t i) +{ + assert(i < (sizeof(iotas) / sizeof(iotas[0]))); + A[0][0] ^= 
iotas[i]; +} + +static void KeccakF1600(uint64_t A[5][5]) +{ + size_t i; + + for (i = 0; i < 24; i++) { + Theta(A); + Rho(A); + Pi(A); + Chi(A); + Iota(A, i); + } +} + +#elif defined(KECCAK_1X) +/* + * This implementation is optimization of above code featuring unroll + * of even y-loops, their fusion and code motion. It also minimizes + * temporary storage. Compiler would normally do all these things for + * you, purpose of manual optimization is to provide "unobscured" + * reference for assembly implementation [in case this approach is + * chosen for implementation on some platform]. In the nutshell it's + * equivalent of "plane-per-plane processing" approach discussed in + * section 2.4 of "Keccak implementation overview". + */ +static void Round(uint64_t A[5][5], size_t i) +{ + uint64_t C[5], E[2]; /* registers */ + uint64_t D[5], T[2][5]; /* memory */ + + assert(i < (sizeof(iotas) / sizeof(iotas[0]))); + + C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; + C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; + C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; + C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; + C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; + +#if defined(__arm__) + D[1] = E[0] = ROL64(C[2], 1) ^ C[0]; + D[4] = E[1] = ROL64(C[0], 1) ^ C[3]; + D[0] = C[0] = ROL64(C[1], 1) ^ C[4]; + D[2] = C[1] = ROL64(C[3], 1) ^ C[1]; + D[3] = C[2] = ROL64(C[4], 1) ^ C[2]; + + T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */ + T[0][1] = A[0][1] ^ E[0]; /* D[1] */ + T[0][2] = A[0][2] ^ C[1]; /* D[2] */ + T[0][3] = A[0][3] ^ C[2]; /* D[3] */ + T[0][4] = A[0][4] ^ E[1]; /* D[4] */ + + C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ + C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ + C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */ + C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ + C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */ +#else + D[0] = ROL64(C[1], 1) ^ C[4]; + D[1] = ROL64(C[2], 1) ^ C[0]; + D[2] = ROL64(C[3], 1) ^ C[1]; + D[3] = ROL64(C[4], 1) ^ C[2]; + D[4] = ROL64(C[0], 1) ^ C[3]; + + T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */ + T[0][1] = A[0][1] ^ D[1]; + T[0][2] = A[0][2] ^ D[2]; + T[0][3] = A[0][3] ^ D[3]; + T[0][4] = A[0][4] ^ D[4]; + + C[0] = A[0][0] ^ D[0]; /* rotate by 0 */ + C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); + C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); + C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); + C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); +#endif + A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; + A[0][1] = C[1] ^ (~C[2] & C[3]); + A[0][2] = C[2] ^ (~C[3] & C[4]); + A[0][3] = C[3] ^ (~C[4] & C[0]); + A[0][4] = C[4] ^ (~C[0] & C[1]); + + T[1][0] = A[1][0] ^ (C[3] = D[0]); + T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */ + T[1][2] = A[1][2] ^ (E[0] = D[2]); + T[1][3] = A[1][3] ^ (E[1] = D[3]); + T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */ + + C[0] = ROL64(T[0][3], rhotates[0][3]); + C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */ + C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */ + C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */ + C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */ + + A[1][0] = C[0] ^ (~C[1] & C[2]); + A[1][1] = C[1] ^ (~C[2] & C[3]); + A[1][2] = C[2] ^ (~C[3] & C[4]); + A[1][3] = C[3] ^ (~C[4] & C[0]); + A[1][4] = C[4] ^ (~C[0] & C[1]); + + C[0] = ROL64(T[0][1], rhotates[0][1]); + C[1] = ROL64(T[1][2], rhotates[1][2]); + C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + C[3] = ROL64(A[3][4] ^ D[4], 
rhotates[3][4]); + C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); + + A[2][0] = C[0] ^ (~C[1] & C[2]); + A[2][1] = C[1] ^ (~C[2] & C[3]); + A[2][2] = C[2] ^ (~C[3] & C[4]); + A[2][3] = C[3] ^ (~C[4] & C[0]); + A[2][4] = C[4] ^ (~C[0] & C[1]); + + C[0] = ROL64(T[0][4], rhotates[0][4]); + C[1] = ROL64(T[1][0], rhotates[1][0]); + C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */ + C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); + C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); + + A[3][0] = C[0] ^ (~C[1] & C[2]); + A[3][1] = C[1] ^ (~C[2] & C[3]); + A[3][2] = C[2] ^ (~C[3] & C[4]); + A[3][3] = C[3] ^ (~C[4] & C[0]); + A[3][4] = C[4] ^ (~C[0] & C[1]); + + C[0] = ROL64(T[0][2], rhotates[0][2]); + C[1] = ROL64(T[1][3], rhotates[1][3]); + C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */ + C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */ + C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); + + A[4][0] = C[0] ^ (~C[1] & C[2]); + A[4][1] = C[1] ^ (~C[2] & C[3]); + A[4][2] = C[2] ^ (~C[3] & C[4]); + A[4][3] = C[3] ^ (~C[4] & C[0]); + A[4][4] = C[4] ^ (~C[0] & C[1]); +} + +static void KeccakF1600(uint64_t A[5][5]) +{ + size_t i; + + for (i = 0; i < 24; i++) { + Round(A, i); + } +} + +#elif defined(KECCAK_1X_ALT) +/* + * This is variant of above KECCAK_1X that reduces requirement for + * temporary storage even further, but at cost of more updates to A[][]. + * It's less suitable if A[][] is memory bound, but better if it's + * register bound. + */ + +static void Round(uint64_t A[5][5], size_t i) +{ + uint64_t C[5], D[5]; + + assert(i < (sizeof(iotas) / sizeof(iotas[0]))); + + C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; + C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; + C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; + C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; + C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; + + D[1] = C[0] ^ ROL64(C[2], 1); + D[2] = C[1] ^ ROL64(C[3], 1); + D[3] = C[2] ^= ROL64(C[4], 1); + D[4] = C[3] ^= ROL64(C[0], 1); + D[0] = C[4] ^= ROL64(C[1], 1); + + A[0][1] ^= D[1]; + A[1][1] ^= D[1]; + A[2][1] ^= D[1]; + A[3][1] ^= D[1]; + A[4][1] ^= D[1]; + + A[0][2] ^= D[2]; + A[1][2] ^= D[2]; + A[2][2] ^= D[2]; + A[3][2] ^= D[2]; + A[4][2] ^= D[2]; + + A[0][3] ^= C[2]; + A[1][3] ^= C[2]; + A[2][3] ^= C[2]; + A[3][3] ^= C[2]; + A[4][3] ^= C[2]; + + A[0][4] ^= C[3]; + A[1][4] ^= C[3]; + A[2][4] ^= C[3]; + A[3][4] ^= C[3]; + A[4][4] ^= C[3]; + + A[0][0] ^= C[4]; + A[1][0] ^= C[4]; + A[2][0] ^= C[4]; + A[3][0] ^= C[4]; + A[4][0] ^= C[4]; + + C[1] = A[0][1]; + C[2] = A[0][2]; + C[3] = A[0][3]; + C[4] = A[0][4]; + + A[0][1] = ROL64(A[1][1], rhotates[1][1]); + A[0][2] = ROL64(A[2][2], rhotates[2][2]); + A[0][3] = ROL64(A[3][3], rhotates[3][3]); + A[0][4] = ROL64(A[4][4], rhotates[4][4]); + + A[1][1] = ROL64(A[1][4], rhotates[1][4]); + A[2][2] = ROL64(A[2][3], rhotates[2][3]); + A[3][3] = ROL64(A[3][2], rhotates[3][2]); + A[4][4] = ROL64(A[4][1], rhotates[4][1]); + + A[1][4] = ROL64(A[4][2], rhotates[4][2]); + A[2][3] = ROL64(A[3][4], rhotates[3][4]); + A[3][2] = ROL64(A[2][1], rhotates[2][1]); + A[4][1] = ROL64(A[1][3], rhotates[1][3]); + + A[4][2] = ROL64(A[2][4], rhotates[2][4]); + A[3][4] = ROL64(A[4][3], rhotates[4][3]); + A[2][1] = ROL64(A[1][2], rhotates[1][2]); + A[1][3] = ROL64(A[3][1], rhotates[3][1]); + + A[2][4] = ROL64(A[4][0], rhotates[4][0]); + A[4][3] = ROL64(A[3][0], rhotates[3][0]); + A[1][2] = ROL64(A[2][0], rhotates[2][0]); + A[3][1] = ROL64(A[1][0], rhotates[1][0]); + + A[1][0] = ROL64(C[3], 
rhotates[0][3]); + A[2][0] = ROL64(C[1], rhotates[0][1]); + A[3][0] = ROL64(C[4], rhotates[0][4]); + A[4][0] = ROL64(C[2], rhotates[0][2]); + + C[0] = A[0][0]; + C[1] = A[1][0]; + D[0] = A[0][1]; + D[1] = A[1][1]; + + A[0][0] ^= (~A[0][1] & A[0][2]); + A[1][0] ^= (~A[1][1] & A[1][2]); + A[0][1] ^= (~A[0][2] & A[0][3]); + A[1][1] ^= (~A[1][2] & A[1][3]); + A[0][2] ^= (~A[0][3] & A[0][4]); + A[1][2] ^= (~A[1][3] & A[1][4]); + A[0][3] ^= (~A[0][4] & C[0]); + A[1][3] ^= (~A[1][4] & C[1]); + A[0][4] ^= (~C[0] & D[0]); + A[1][4] ^= (~C[1] & D[1]); + + C[2] = A[2][0]; + C[3] = A[3][0]; + D[2] = A[2][1]; + D[3] = A[3][1]; + + A[2][0] ^= (~A[2][1] & A[2][2]); + A[3][0] ^= (~A[3][1] & A[3][2]); + A[2][1] ^= (~A[2][2] & A[2][3]); + A[3][1] ^= (~A[3][2] & A[3][3]); + A[2][2] ^= (~A[2][3] & A[2][4]); + A[3][2] ^= (~A[3][3] & A[3][4]); + A[2][3] ^= (~A[2][4] & C[2]); + A[3][3] ^= (~A[3][4] & C[3]); + A[2][4] ^= (~C[2] & D[2]); + A[3][4] ^= (~C[3] & D[3]); + + C[4] = A[4][0]; + D[4] = A[4][1]; + + A[4][0] ^= (~A[4][1] & A[4][2]); + A[4][1] ^= (~A[4][2] & A[4][3]); + A[4][2] ^= (~A[4][3] & A[4][4]); + A[4][3] ^= (~A[4][4] & C[4]); + A[4][4] ^= (~C[4] & D[4]); + A[0][0] ^= iotas[i]; +} + +static void KeccakF1600(uint64_t A[5][5]) +{ + size_t i; + + for (i = 0; i < 24; i++) { + Round(A, i); + } +} + +#elif defined(KECCAK_2X) +/* + * This implementation is variant of KECCAK_1X above with outer-most + * round loop unrolled twice. This allows to take temporary storage + * out of round procedure and simplify references to it by alternating + * it with actual data (see round loop below). Originally it was meant + * rather as reference for an assembly implementation, but it seems to + * play best with compilers [as well as provide best instruction per + * processed byte ratio at minimal round unroll factor]... 
+ */ +static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i) +{ + uint64_t C[5], D[5]; + + assert(i < (sizeof(iotas) / sizeof(iotas[0]))); + + C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; + C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; + C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; + C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; + C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; + + D[0] = ROL64(C[1], 1) ^ C[4]; + D[1] = ROL64(C[2], 1) ^ C[0]; + D[2] = ROL64(C[3], 1) ^ C[1]; + D[3] = ROL64(C[4], 1) ^ C[2]; + D[4] = ROL64(C[0], 1) ^ C[3]; + + C[0] = A[0][0] ^ D[0]; /* rotate by 0 */ + C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); + C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); + C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); + C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); + +#ifdef KECCAK_COMPLEMENTING_TRANSFORM + R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]; + R[0][1] = C[1] ^ (~C[2] | C[3]); + R[0][2] = C[2] ^ ( C[3] & C[4]); + R[0][3] = C[3] ^ ( C[4] | C[0]); + R[0][4] = C[4] ^ ( C[0] & C[1]); +#else + R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; + R[0][1] = C[1] ^ (~C[2] & C[3]); + R[0][2] = C[2] ^ (~C[3] & C[4]); + R[0][3] = C[3] ^ (~C[4] & C[0]); + R[0][4] = C[4] ^ (~C[0] & C[1]); +#endif + + C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); + C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); + C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); + C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); + +#ifdef KECCAK_COMPLEMENTING_TRANSFORM + R[1][0] = C[0] ^ (C[1] | C[2]); + R[1][1] = C[1] ^ (C[2] & C[3]); + R[1][2] = C[2] ^ (C[3] | ~C[4]); + R[1][3] = C[3] ^ (C[4] | C[0]); + R[1][4] = C[4] ^ (C[0] & C[1]); +#else + R[1][0] = C[0] ^ (~C[1] & C[2]); + R[1][1] = C[1] ^ (~C[2] & C[3]); + R[1][2] = C[2] ^ (~C[3] & C[4]); + R[1][3] = C[3] ^ (~C[4] & C[0]); + R[1][4] = C[4] ^ (~C[0] & C[1]); +#endif + + C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); + C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); + C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); + C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); + +#ifdef KECCAK_COMPLEMENTING_TRANSFORM + R[2][0] = C[0] ^ ( C[1] | C[2]); + R[2][1] = C[1] ^ ( C[2] & C[3]); + R[2][2] = C[2] ^ (~C[3] & C[4]); + R[2][3] = ~C[3] ^ ( C[4] | C[0]); + R[2][4] = C[4] ^ ( C[0] & C[1]); +#else + R[2][0] = C[0] ^ (~C[1] & C[2]); + R[2][1] = C[1] ^ (~C[2] & C[3]); + R[2][2] = C[2] ^ (~C[3] & C[4]); + R[2][3] = C[3] ^ (~C[4] & C[0]); + R[2][4] = C[4] ^ (~C[0] & C[1]); +#endif + + C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); + C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); + C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); + C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); + C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); + +#ifdef KECCAK_COMPLEMENTING_TRANSFORM + R[3][0] = C[0] ^ ( C[1] & C[2]); + R[3][1] = C[1] ^ ( C[2] | C[3]); + R[3][2] = C[2] ^ (~C[3] | C[4]); + R[3][3] = ~C[3] ^ ( C[4] & C[0]); + R[3][4] = C[4] ^ ( C[0] | C[1]); +#else + R[3][0] = C[0] ^ (~C[1] & C[2]); + R[3][1] = C[1] ^ (~C[2] & C[3]); + R[3][2] = C[2] ^ (~C[3] & C[4]); + R[3][3] = C[3] ^ (~C[4] & C[0]); + R[3][4] = C[4] ^ (~C[0] & C[1]); +#endif + + C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); + C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); + C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); + C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); + C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); + +#ifdef KECCAK_COMPLEMENTING_TRANSFORM + R[4][0] = C[0] ^ (~C[1] & C[2]); + R[4][1] = ~C[1] ^ ( C[2] 
| C[3]); + R[4][2] = C[2] ^ ( C[3] & C[4]); + R[4][3] = C[3] ^ ( C[4] | C[0]); + R[4][4] = C[4] ^ ( C[0] & C[1]); +#else + R[4][0] = C[0] ^ (~C[1] & C[2]); + R[4][1] = C[1] ^ (~C[2] & C[3]); + R[4][2] = C[2] ^ (~C[3] & C[4]); + R[4][3] = C[3] ^ (~C[4] & C[0]); + R[4][4] = C[4] ^ (~C[0] & C[1]); +#endif +} + +static void KeccakF1600(uint64_t A[5][5]) +{ + uint64_t T[5][5]; + size_t i; + +#ifdef KECCAK_COMPLEMENTING_TRANSFORM + A[0][1] = ~A[0][1]; + A[0][2] = ~A[0][2]; + A[1][3] = ~A[1][3]; + A[2][2] = ~A[2][2]; + A[3][2] = ~A[3][2]; + A[4][0] = ~A[4][0]; +#endif + + for (i = 0; i < 24; i += 2) { + Round(T, A, i); + Round(A, T, i + 1); + } + +#ifdef KECCAK_COMPLEMENTING_TRANSFORM + A[0][1] = ~A[0][1]; + A[0][2] = ~A[0][2]; + A[1][3] = ~A[1][3]; + A[2][2] = ~A[2][2]; + A[3][2] = ~A[3][2]; + A[4][0] = ~A[4][0]; +#endif +} + +#else /* define KECCAK_INPLACE to compile this code path */ +/* + * This implementation is KECCAK_1X from above combined 4 times with + * a twist that allows to omit temporary storage and perform in-place + * processing. It's discussed in section 2.5 of "Keccak implementation + * overview". It's likely to be best suited for processors with large + * register bank... On the other hand processor with large register + * bank can as well use KECCAK_1X_ALT, it would be as fast but much + * more compact... + */ +static void FourRounds(uint64_t A[5][5], size_t i) +{ + uint64_t B[5], C[5], D[5]; + + assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4)); + + /* Round 4*n */ + C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; + C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; + C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; + C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; + C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; + + D[0] = ROL64(C[1], 1) ^ C[4]; + D[1] = ROL64(C[2], 1) ^ C[0]; + D[2] = ROL64(C[3], 1) ^ C[1]; + D[3] = ROL64(C[4], 1) ^ C[2]; + D[4] = ROL64(C[0], 1) ^ C[3]; + + B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ + B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); + B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); + B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); + B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); + + C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i]; + C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]); + C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]); + C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]); + C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); + B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); + B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); + B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); + + C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); + B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); + B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); + B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); + + C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); + B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); + B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); + B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); + B[4] = ROL64(A[4][3] ^ 
D[3], rhotates[4][3]); + + C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); + B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); + B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); + B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); + B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); + + C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]); + + /* Round 4*n+1 */ + D[0] = ROL64(C[1], 1) ^ C[4]; + D[1] = ROL64(C[2], 1) ^ C[0]; + D[2] = ROL64(C[3], 1) ^ C[1]; + D[3] = ROL64(C[4], 1) ^ C[2]; + D[4] = ROL64(C[0], 1) ^ C[3]; + + B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ + B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]); + B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]); + B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]); + B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]); + + C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1]; + C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]); + C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]); + C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]); + C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]); + B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]); + B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]); + B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]); + + C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]); + B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]); + B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]); + B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]); + + C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]); + B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]); + B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]); + B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); + B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]); + + C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]); + B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]); + B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]); + B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]); + B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); + + C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]); + + /* Round 4*n+2 */ + D[0] = ROL64(C[1], 1) ^ C[4]; + D[1] = ROL64(C[2], 1) ^ C[0]; + D[2] = ROL64(C[3], 1) ^ C[1]; + D[3] = ROL64(C[4], 1) ^ C[2]; + D[4] = ROL64(C[0], 1) ^ C[3]; + + B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ + B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]); + B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]); + B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]); + 
B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]); + + C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2]; + C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]); + C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]); + C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]); + C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]); + B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]); + B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]); + B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]); + + C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]); + B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]); + B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]); + B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]); + + C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]); + B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]); + B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]); + B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); + B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]); + + C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]); + B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]); + B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]); + B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]); + B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); + + C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]); + C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]); + C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]); + C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]); + C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]); + + /* Round 4*n+3 */ + D[0] = ROL64(C[1], 1) ^ C[4]; + D[1] = ROL64(C[2], 1) ^ C[0]; + D[2] = ROL64(C[3], 1) ^ C[1]; + D[3] = ROL64(C[4], 1) ^ C[2]; + D[4] = ROL64(C[0], 1) ^ C[3]; + + B[0] = A[0][0] ^ D[0]; /* rotate by 0 */ + B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]); + B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]); + B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]); + B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]); + + /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3]; + /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]); + /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]); + /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]); + /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]); + B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); + B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]); + B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]); + B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]); + + /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]); + /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]); + /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]); + /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]); + /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]); + + B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]); + B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]); + B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); + B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]); + B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]); + + /* C[0] ^= */ A[2][0] = B[0] ^ 
+
+    /* Round 4*n+3 */
+    D[0] = ROL64(C[1], 1) ^ C[4];
+    D[1] = ROL64(C[2], 1) ^ C[0];
+    D[2] = ROL64(C[3], 1) ^ C[1];
+    D[3] = ROL64(C[4], 1) ^ C[2];
+    D[4] = ROL64(C[0], 1) ^ C[3];
+
+    B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
+    B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
+    B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
+    B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
+    B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
+
+    /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
+    /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
+    /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
+    /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
+    /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
+
+    B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
+    B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
+    B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
+    B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
+    B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
+
+    /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
+    /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
+    /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
+    /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
+    /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
+
+    B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
+    B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
+    B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
+    B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
+    B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
+
+    /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
+    /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
+    /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
+    /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
+    /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
+
+    B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
+    B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
+    B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
+    B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
+    B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
+
+    /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
+    /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
+    /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
+    /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
+    /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
+
+    B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
+    B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
+    B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
+    B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
+    B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
+
+    /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
+    /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
+    /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
+    /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
+    /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
+}
+
+static void KeccakF1600(uint64_t A[5][5])
+{
+    size_t i;
+
+    for (i = 0; i < 24; i += 4) {
+        FourRounds(A, i);
+    }
+}
+
+#endif
+
+/*
+ * On 32-bit-only targets (BIT_INTERLEAVE) each 64-bit lane is kept with
+ * its even-numbered bits gathered in the low 32-bit word and its
+ * odd-numbered bits in the high word, so that the lane rotations above
+ * reduce to pairs of 32-bit rotations.
+ */
+static uint64_t BitInterleave(uint64_t Ai)
+{
+    if (BIT_INTERLEAVE) {
+        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
+        uint32_t t0, t1;
+
+        t0 = lo & 0x55555555;
+        t0 |= t0 >> 1;  t0 &= 0x33333333;
+        t0 |= t0 >> 2;  t0 &= 0x0f0f0f0f;
+        t0 |= t0 >> 4;  t0 &= 0x00ff00ff;
+        t0 |= t0 >> 8;  t0 &= 0x0000ffff;
+
+        t1 = hi & 0x55555555;
+        t1 |= t1 >> 1;  t1 &= 0x33333333;
+        t1 |= t1 >> 2;  t1 &= 0x0f0f0f0f;
+        t1 |= t1 >> 4;  t1 &= 0x00ff00ff;
+        t1 |= t1 >> 8;  t1 <<= 16;
+
+        lo &= 0xaaaaaaaa;
+        lo |= lo << 1;  lo &= 0xcccccccc;
+        lo |= lo << 2;  lo &= 0xf0f0f0f0;
+        lo |= lo << 4;  lo &= 0xff00ff00;
+        lo |= lo << 8;  lo >>= 16;
+
+        hi &= 0xaaaaaaaa;
+        hi |= hi << 1;  hi &= 0xcccccccc;
+        hi |= hi << 2;  hi &= 0xf0f0f0f0;
+        hi |= hi << 4;  hi &= 0xff00ff00;
+        hi |= hi << 8;  hi &= 0xffff0000;
+
+        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
+    }
+
+    return Ai;
+}
+
+/*
+ * Inverse of BitInterleave: scatter both 32-bit words back into
+ * alternating bit positions of the output lane.
+ */
+static uint64_t BitDeinterleave(uint64_t Ai)
+{
+    if (BIT_INTERLEAVE) {
+        uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
+        uint32_t t0, t1;
+
+        t0 = lo & 0x0000ffff;
+        t0 |= t0 << 8;  t0 &= 0x00ff00ff;
+        t0 |= t0 << 4;  t0 &= 0x0f0f0f0f;
+        t0 |= t0 << 2;  t0 &= 0x33333333;
+        t0 |= t0 << 1;  t0 &= 0x55555555;
+
+        t1 = hi << 16;
+        t1 |= t1 >> 8;  t1 &= 0xff00ff00;
+        t1 |= t1 >> 4;  t1 &= 0xf0f0f0f0;
+        t1 |= t1 >> 2;  t1 &= 0xcccccccc;
+        t1 |= t1 >> 1;  t1 &= 0xaaaaaaaa;
+
+        lo >>= 16;
+        lo |= lo << 8;  lo &= 0x00ff00ff;
+        lo |= lo << 4;  lo &= 0x0f0f0f0f;
+        lo |= lo << 2;  lo &= 0x33333333;
+        lo |= lo << 1;  lo &= 0x55555555;
+
+        hi &= 0xffff0000;
+        hi |= hi >> 8;  hi &= 0xff00ff00;
+        hi |= hi >> 4;  hi &= 0xf0f0f0f0;
+        hi |= hi >> 2;  hi &= 0xcccccccc;
+        hi |= hi >> 1;  hi &= 0xaaaaaaaa;
+
+        Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
+    }
+
+    return Ai;
+}
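
[Editor's note: illustration only, not part of the commit. The absorb/squeeze pair declared below leaves message padding entirely to the caller, as its comment explains. The sketch assumes the FIPS 202 SHA3 domain-separation suffix (0x06) and the closing pad10*1 bit (0x80), neither of which appears in this diff, and the helper name sha3_256_oneshot is invented for the example.]

    static void sha3_256_oneshot(const unsigned char *msg, size_t len,
                                 unsigned char md[32])
    {
        uint64_t A[5][5];
        unsigned char block[136];           /* r = (1600 - 2*256)/8 bytes */
        size_t rem, r = sizeof(block);

        memset(A, 0, sizeof(A));
        rem = SHA3_absorb(A, msg, len, r);  /* consumes all full blocks */

        /* pad the residual bytes into one last r-byte block */
        memset(block, 0, r);
        memcpy(block, msg + (len - rem), rem);
        block[rem] = 0x06;                  /* SHA3 suffix 01 + first pad bit */
        block[r - 1] |= 0x80;               /* final pad bit */
        (void)SHA3_absorb(A, block, r, r);

        SHA3_squeeze(A, md, 32, r);
    }
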
+/*
+ * SHA3_absorb can be called multiple times; at each invocation the
+ * largest multiple of |r| out of |len| bytes is processed, and the
+ * number of remaining bytes is returned, which spares the caller the
+ * trouble of calculating the largest multiple of |r|. |r| can be viewed
+ * as the block size: it is commonly (1600 - 256*n)/8, e.g. 168, 136,
+ * 104, 72, but can also be (1600 - 448)/8 = 144. All this means that
+ * message padding and intermediate sub-block buffering, byte- or
+ * bitwise, are the caller's responsibility.
+ */
+size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
+                   size_t r)
+{
+    uint64_t *A_flat = (uint64_t *)A;
+    size_t i, w = r / 8;
+
+    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
+
+    while (len >= r) {
+        for (i = 0; i < w; i++) {
+            uint64_t Ai = (uint64_t)inp[0]       | (uint64_t)inp[1] << 8  |
+                          (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
+                          (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
+                          (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
+            inp += 8;
+
+            A_flat[i] ^= BitInterleave(Ai);
+        }
+        KeccakF1600(A);
+        len -= r;
+    }
+
+    return len;
+}
+
+/*
+ * SHA3_squeeze is called once at the end to generate the |out| hash value
+ * of |len| bytes.
+ */
+void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
+{
+    uint64_t *A_flat = (uint64_t *)A;
+    size_t i, w = r / 8;
+
+    assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
+
+    while (len != 0) {
+        for (i = 0; i < w && len != 0; i++) {
+            uint64_t Ai = BitDeinterleave(A_flat[i]);
+
+            if (len < 8) {
+                for (i = 0; i < len; i++) {
+                    *out++ = (unsigned char)Ai;
+                    Ai >>= 8;
+                }
+                return;
+            }
+
+            out[0] = (unsigned char)(Ai);
+            out[1] = (unsigned char)(Ai >> 8);
+            out[2] = (unsigned char)(Ai >> 16);
+            out[3] = (unsigned char)(Ai >> 24);
+            out[4] = (unsigned char)(Ai >> 32);
+            out[5] = (unsigned char)(Ai >> 40);
+            out[6] = (unsigned char)(Ai >> 48);
+            out[7] = (unsigned char)(Ai >> 56);
+            out += 8;
+            len -= 8;
+        }
+        if (len)
+            KeccakF1600(A);
+    }
+}
+#endif
+
+#ifdef SELFTEST
+/*
+ * Post-padding one-shot implementations would look as follows:
+ *
+ * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
+ * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
+ * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
+ * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
+ * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
+ * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
+ */
+
+void SHA3_sponge(const unsigned char *inp, size_t len,
+                 unsigned char *out, size_t d, size_t r)
+{
+    uint64_t A[5][5];
+
+    memset(A, 0, sizeof(A));
+    SHA3_absorb(A, inp, len, r);
+    SHA3_squeeze(A, out, d, r);
+}
+
+# include <stdio.h>
+
+int main()
+{
+    /*
+     * This is the 5-bit SHAKE128 test from
+     * http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
+     */
+    unsigned char test[168] = { '\xf3', '\x3' };
+    unsigned char out[512];
+    size_t i;
+    static const unsigned char result[512] = {
+        0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
+        0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
+        0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
+        0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
+        0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
+        0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
+        0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
+        0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
+        0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
+        0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
+        0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
+        0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
+        0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
+        0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
+        0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
+        0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
+        0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
+        0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
+        0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
+        0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2,
0x05, + 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98, + 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2, + 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50, + 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75, + 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0, + 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04, + 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8, + 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03, + 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9, + 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B, + 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B, + 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73, + 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6, + 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C, + 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D, + 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20, + 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D, + 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F, + 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A, + 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15, + 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D, + 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6, + 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2, + 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D, + 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30, + 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C, + 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E, + 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28, + 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A, + 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25, + 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28, + 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C, + 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41, + 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B, + 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C, + 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C, + 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA, + 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F, + 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A, + 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60, + 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6, + 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50, + 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF, + 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9 + }; + + test[167] = '\x80'; + SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test)); + + /* + * Rationale behind keeping output [formatted as below] is that + * one should be able to redirect it to a file, then copy-n-paste + * final "output val" from official example to another file, and + * compare the two with diff(1). + */ + for (i = 0; i < sizeof(out);) { + printf("%02X", out[i]); + printf(++i % 16 && i != sizeof(out) ? " " : "\n"); + } + + if (memcmp(out,result,sizeof(out))) { + fprintf(stderr,"failure\n"); + return 1; + } else { + fprintf(stderr,"success\n"); + return 0; + } +} +#endif diff --git a/crypto/sha/sha.c b/crypto/sha/sha.c deleted file mode 100644 index cfc12f3edc68..000000000000 --- a/crypto/sha/sha.c +++ /dev/null @@ -1,118 +0,0 @@ -/* crypto/sha/sha.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <openssl/sha.h> - -#define BUFSIZE 1024*16 - -void do_fp(FILE *f); -void pt(unsigned char *md); -int read(int, void *, unsigned int); -int main(int argc, char **argv) -{ - int i, err = 0; - FILE *IN; - - if (argc == 1) { - do_fp(stdin); - } else { - for (i = 1; i < argc; i++) { - IN = fopen(argv[i], "r"); - if (IN == NULL) { - perror(argv[i]); - err++; - continue; - } - printf("SHA(%s)= ", argv[i]); - do_fp(IN); - fclose(IN); - } - } - exit(err); -} - -void do_fp(FILE *f) -{ - SHA_CTX c; - unsigned char md[SHA_DIGEST_LENGTH]; - int fd; - int i; - unsigned char buf[BUFSIZE]; - - fd = fileno(f); - SHA_Init(&c); - for (;;) { - i = read(fd, buf, BUFSIZE); - if (i <= 0) - break; - SHA_Update(&c, buf, (unsigned long)i); - } - SHA_Final(&(md[0]), &c); - pt(md); -} - -void pt(unsigned char *md) -{ - int i; - - for (i = 0; i < SHA_DIGEST_LENGTH; i++) - printf("%02x", md[i]); - printf("\n"); -} diff --git a/crypto/sha/sha.h b/crypto/sha/sha.h deleted file mode 100644 index e5169e4fee04..000000000000 --- a/crypto/sha/sha.h +++ /dev/null @@ -1,214 +0,0 @@ -/* crypto/sha/sha.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#ifndef HEADER_SHA_H -# define HEADER_SHA_H - -# include <openssl/e_os2.h> -# include <stddef.h> - -#ifdef __cplusplus -extern "C" { -#endif - -# if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1)) -# error SHA is disabled. -# endif - -# if defined(OPENSSL_FIPS) -# define FIPS_SHA_SIZE_T size_t -# endif - -/*- - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - * ! SHA_LONG has to be at least 32 bits wide. If it's wider, then ! - * ! SHA_LONG_LOG2 has to be defined along. ! - * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - */ - -# if defined(__LP32__) -# define SHA_LONG unsigned long -# elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__) -# define SHA_LONG unsigned long -# define SHA_LONG_LOG2 3 -# else -# define SHA_LONG unsigned int -# endif - -# define SHA_LBLOCK 16 -# define SHA_CBLOCK (SHA_LBLOCK*4)/* SHA treats input data as a - * contiguous array of 32 bit wide - * big-endian values. */ -# define SHA_LAST_BLOCK (SHA_CBLOCK-8) -# define SHA_DIGEST_LENGTH 20 - -typedef struct SHAstate_st { - SHA_LONG h0, h1, h2, h3, h4; - SHA_LONG Nl, Nh; - SHA_LONG data[SHA_LBLOCK]; - unsigned int num; -} SHA_CTX; - -# ifndef OPENSSL_NO_SHA0 -# ifdef OPENSSL_FIPS -int private_SHA_Init(SHA_CTX *c); -# endif -int SHA_Init(SHA_CTX *c); -int SHA_Update(SHA_CTX *c, const void *data, size_t len); -int SHA_Final(unsigned char *md, SHA_CTX *c); -unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md); -void SHA_Transform(SHA_CTX *c, const unsigned char *data); -# endif -# ifndef OPENSSL_NO_SHA1 -# ifdef OPENSSL_FIPS -int private_SHA1_Init(SHA_CTX *c); -# endif -int SHA1_Init(SHA_CTX *c); -int SHA1_Update(SHA_CTX *c, const void *data, size_t len); -int SHA1_Final(unsigned char *md, SHA_CTX *c); -unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md); -void SHA1_Transform(SHA_CTX *c, const unsigned char *data); -# endif - -# define SHA256_CBLOCK (SHA_LBLOCK*4)/* SHA-256 treats input data as a - * contiguous array of 32 bit wide - * big-endian values. 
*/ -# define SHA224_DIGEST_LENGTH 28 -# define SHA256_DIGEST_LENGTH 32 - -typedef struct SHA256state_st { - SHA_LONG h[8]; - SHA_LONG Nl, Nh; - SHA_LONG data[SHA_LBLOCK]; - unsigned int num, md_len; -} SHA256_CTX; - -# ifndef OPENSSL_NO_SHA256 -# ifdef OPENSSL_FIPS -int private_SHA224_Init(SHA256_CTX *c); -int private_SHA256_Init(SHA256_CTX *c); -# endif -int SHA224_Init(SHA256_CTX *c); -int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); -int SHA224_Final(unsigned char *md, SHA256_CTX *c); -unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md); -int SHA256_Init(SHA256_CTX *c); -int SHA256_Update(SHA256_CTX *c, const void *data, size_t len); -int SHA256_Final(unsigned char *md, SHA256_CTX *c); -unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md); -void SHA256_Transform(SHA256_CTX *c, const unsigned char *data); -# endif - -# define SHA384_DIGEST_LENGTH 48 -# define SHA512_DIGEST_LENGTH 64 - -# ifndef OPENSSL_NO_SHA512 -/* - * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64 - * being exactly 64-bit wide. See Implementation Notes in sha512.c - * for further details. - */ -/* - * SHA-512 treats input data as a - * contiguous array of 64 bit - * wide big-endian values. - */ -# define SHA512_CBLOCK (SHA_LBLOCK*8) -# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) -# define SHA_LONG64 unsigned __int64 -# define U64(C) C##UI64 -# elif defined(__arch64__) -# define SHA_LONG64 unsigned long -# define U64(C) C##UL -# else -# define SHA_LONG64 unsigned long long -# define U64(C) C##ULL -# endif - -typedef struct SHA512state_st { - SHA_LONG64 h[8]; - SHA_LONG64 Nl, Nh; - union { - SHA_LONG64 d[SHA_LBLOCK]; - unsigned char p[SHA512_CBLOCK]; - } u; - unsigned int num, md_len; -} SHA512_CTX; -# endif - -# ifndef OPENSSL_NO_SHA512 -# ifdef OPENSSL_FIPS -int private_SHA384_Init(SHA512_CTX *c); -int private_SHA512_Init(SHA512_CTX *c); -# endif -int SHA384_Init(SHA512_CTX *c); -int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); -int SHA384_Final(unsigned char *md, SHA512_CTX *c); -unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md); -int SHA512_Init(SHA512_CTX *c); -int SHA512_Update(SHA512_CTX *c, const void *data, size_t len); -int SHA512_Final(unsigned char *md, SHA512_CTX *c); -unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md); -void SHA512_Transform(SHA512_CTX *c, const unsigned char *data); -# endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/crypto/sha/sha1.c b/crypto/sha/sha1.c deleted file mode 100644 index 8dd19431b48d..000000000000 --- a/crypto/sha/sha1.c +++ /dev/null @@ -1,121 +0,0 @@ -/* crypto/sha/sha1.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <openssl/sha.h> - -#define BUFSIZE 1024*16 - -void do_fp(FILE *f); -void pt(unsigned char *md); -#ifndef _OSD_POSIX -int read(int, void *, unsigned int); -#endif - -int main(int argc, char **argv) -{ - int i, err = 0; - FILE *IN; - - if (argc == 1) { - do_fp(stdin); - } else { - for (i = 1; i < argc; i++) { - IN = fopen(argv[i], "r"); - if (IN == NULL) { - perror(argv[i]); - err++; - continue; - } - printf("SHA1(%s)= ", argv[i]); - do_fp(IN); - fclose(IN); - } - } - exit(err); -} - -void do_fp(FILE *f) -{ - SHA_CTX c; - unsigned char md[SHA_DIGEST_LENGTH]; - int fd; - int i; - unsigned char buf[BUFSIZE]; - - fd = fileno(f); - SHA1_Init(&c); - for (;;) { - i = read(fd, buf, BUFSIZE); - if (i <= 0) - break; - SHA1_Update(&c, buf, (unsigned long)i); - } - SHA1_Final(&(md[0]), &c); - pt(md); -} - -void pt(unsigned char *md) -{ - int i; - - for (i = 0; i < SHA_DIGEST_LENGTH; i++) - printf("%02x", md[i]); - printf("\n"); -} diff --git a/crypto/sha/sha1_one.c b/crypto/sha/sha1_one.c index a6dd760a1e0e..e5b38211d2da 100644 --- a/crypto/sha/sha1_one.c +++ b/crypto/sha/sha1_one.c @@ -1,59 +1,10 @@ -/* crypto/sha/sha1_one.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ #include <stdio.h> @@ -61,7 +12,6 @@ #include <openssl/crypto.h> #include <openssl/sha.h> -#ifndef OPENSSL_NO_SHA1 unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md) { SHA_CTX c; @@ -74,6 +24,5 @@ unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md) SHA1_Update(&c, d, n); SHA1_Final(md, &c); OPENSSL_cleanse(&c, sizeof(c)); - return (md); + return md; } -#endif diff --git a/crypto/sha/sha1dgst.c b/crypto/sha/sha1dgst.c index a67f1fe36479..819370e61540 100644 --- a/crypto/sha/sha1dgst.c +++ b/crypto/sha/sha1dgst.c @@ -1,74 +1,17 @@ -/* crypto/sha/sha1dgst.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ #include <openssl/crypto.h> #include <openssl/opensslconf.h> -#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA) - -# undef SHA_0 -# define SHA_1 # include <openssl/opensslv.h> -const char SHA1_version[] = "SHA1" OPENSSL_VERSION_PTEXT; - /* The implementation is in ../md32_common.h */ # include "sha_locl.h" - -#endif diff --git a/crypto/sha/sha1test.c b/crypto/sha/sha1test.c deleted file mode 100644 index 551a348df37f..000000000000 --- a/crypto/sha/sha1test.c +++ /dev/null @@ -1,174 +0,0 @@ -/* crypto/sha/sha1test.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> - -#include "../e_os.h" - -#ifdef OPENSSL_NO_SHA -int main(int argc, char *argv[]) -{ - printf("No SHA support\n"); - return (0); -} -#else -# include <openssl/evp.h> -# include <openssl/sha.h> - -# ifdef CHARSET_EBCDIC -# include <openssl/ebcdic.h> -# endif - -# undef SHA_0 /* FIPS 180 */ -# define SHA_1 /* FIPS 180-1 */ - -static char *test[] = { - "abc", - "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", - NULL, -}; - -# ifdef SHA_0 -static char *ret[] = { - "0164b8a914cd2a5e74c4f7ff082c4d97f1edf880", - "d2516ee1acfa5baf33dfc1c471e438449ef134c8", -}; - -static char *bigret = "3232affa48628a26653b5aaa44541fd90d690603"; -# endif -# ifdef SHA_1 -static char *ret[] = { - "a9993e364706816aba3e25717850c26c9cd0d89d", - "84983e441c3bd26ebaae4aa1f95129e5e54670f1", -}; - -static char *bigret = "34aa973cd4c4daa4f61eeb2bdbad27316534016f"; -# endif - -static char *pt(unsigned char *md); -int main(int argc, char *argv[]) -{ - int i, err = 0; - char **P, **R; - static unsigned char buf[1000]; - char *p, *r; - EVP_MD_CTX c; - unsigned char md[SHA_DIGEST_LENGTH]; - -# ifdef CHARSET_EBCDIC - ebcdic2ascii(test[0], test[0], strlen(test[0])); - ebcdic2ascii(test[1], test[1], strlen(test[1])); -# endif - - EVP_MD_CTX_init(&c); - P = test; - R = ret; - i = 1; - while (*P != NULL) { - EVP_Digest(*P, strlen((char *)*P), md, NULL, EVP_sha1(), NULL); - p = pt(md); - if (strcmp(p, (char *)*R) != 0) { - printf("error calculating SHA1 on '%s'\n", *P); - printf("got %s instead of %s\n", p, *R); - err++; - } else - printf("test %d ok\n", i); - i++; - R++; - P++; - } - - memset(buf, 'a', 1000); -# ifdef CHARSET_EBCDIC - ebcdic2ascii(buf, buf, 1000); -# endif /* CHARSET_EBCDIC */ - EVP_DigestInit_ex(&c, EVP_sha1(), NULL); - for (i = 0; i < 1000; i++) - EVP_DigestUpdate(&c, buf, 1000); - EVP_DigestFinal_ex(&c, md, NULL); - p = pt(md); - - r = bigret; - if (strcmp(p, r) != 0) { - printf("error calculating SHA1 on 'a' * 1000\n"); - printf("got %s instead of %s\n", p, r); - err++; - } else - printf("test 3 ok\n"); - -# ifdef OPENSSL_SYS_NETWARE - if (err) - printf("ERROR: %d\n", err); -# endif - EVP_MD_CTX_cleanup(&c); - EXIT(err); - return (0); -} - -static char *pt(unsigned char *md) -{ - int i; - static char buf[80]; - - for (i = 0; i < SHA_DIGEST_LENGTH; i++) - sprintf(&(buf[i * 2]), "%02x", md[i]); - return (buf); -} -#endif diff --git a/crypto/sha/sha256.c b/crypto/sha/sha256.c index 72a11593697e..bf78f075eefb 100644 --- a/crypto/sha/sha256.c +++ b/crypto/sha/sha256.c @@ -1,22 +1,22 @@ -/* crypto/sha/sha256.c */ -/* ==================================================================== - * Copyright (c) 2004 The OpenSSL Project. All rights reserved - * according to the OpenSSL license [found in ../../LICENSE]. - * ==================================================================== +/* + * Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ -#include <openssl/opensslconf.h> -#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA256) -# include <stdlib.h> -# include <string.h> +#include <openssl/opensslconf.h> -# include <openssl/crypto.h> -# include <openssl/sha.h> -# include <openssl/opensslv.h> +#include <stdlib.h> +#include <string.h> -const char SHA256_version[] = "SHA-256" OPENSSL_VERSION_PTEXT; +#include <openssl/crypto.h> +#include <openssl/sha.h> +#include <openssl/opensslv.h> -fips_md_init_ctx(SHA224, SHA256) +int SHA224_Init(SHA256_CTX *c) { memset(c, 0, sizeof(*c)); c->h[0] = 0xc1059ed8UL; @@ -31,7 +31,7 @@ fips_md_init_ctx(SHA224, SHA256) return 1; } -fips_md_init(SHA256) +int SHA256_Init(SHA256_CTX *c) { memset(c, 0, sizeof(*c)); c->h[0] = 0x6a09e667UL; @@ -57,7 +57,7 @@ unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md) SHA256_Update(&c, d, n); SHA256_Final(md, &c); OPENSSL_cleanse(&c, sizeof(c)); - return (md); + return md; } unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md) @@ -71,7 +71,7 @@ unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md) SHA256_Update(&c, d, n); SHA256_Final(md, &c); OPENSSL_cleanse(&c, sizeof(c)); - return (md); + return md; } int SHA224_Update(SHA256_CTX *c, const void *data, size_t len) @@ -84,20 +84,21 @@ int SHA224_Final(unsigned char *md, SHA256_CTX *c) return SHA256_Final(md, c); } -# define DATA_ORDER_IS_BIG_ENDIAN +#define DATA_ORDER_IS_BIG_ENDIAN + +#define HASH_LONG SHA_LONG +#define HASH_CTX SHA256_CTX +#define HASH_CBLOCK SHA_CBLOCK -# define HASH_LONG SHA_LONG -# define HASH_CTX SHA256_CTX -# define HASH_CBLOCK SHA_CBLOCK /* * Note that FIPS180-2 discusses "Truncation of the Hash Function Output." * default: case below covers for it. It's not clear however if it's * permitted to truncate to amount of bytes not divisible by 4. I bet not, * but if it is, then default: case shall be extended. For reference. - * Idea behind separate cases for pre-defined lenghts is to let the + * Idea behind separate cases for pre-defined lengths is to let the * compiler decide if it's appropriate to unroll small loops. */ -# define HASH_MAKE_STRING(c,s) do { \ +#define HASH_MAKE_STRING(c,s) do { \ unsigned long ll; \ unsigned int nn; \ switch ((c)->md_len) \ @@ -118,18 +119,18 @@ int SHA224_Final(unsigned char *md, SHA256_CTX *c) } \ } while (0) -# define HASH_UPDATE SHA256_Update -# define HASH_TRANSFORM SHA256_Transform -# define HASH_FINAL SHA256_Final -# define HASH_BLOCK_DATA_ORDER sha256_block_data_order -# ifndef SHA256_ASM +#define HASH_UPDATE SHA256_Update +#define HASH_TRANSFORM SHA256_Transform +#define HASH_FINAL SHA256_Final +#define HASH_BLOCK_DATA_ORDER sha256_block_data_order +#ifndef SHA256_ASM static -# endif +#endif void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num); -# include "md32_common.h" +#include "internal/md32_common.h" -# ifndef SHA256_ASM +#ifndef SHA256_ASM static const SHA_LONG K256[64] = { 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, @@ -154,15 +155,15 @@ static const SHA_LONG K256[64] = { * is left one. This is why you might notice that rotation coefficients * differ from those observed in FIPS document by 32-N... 
*/ -# define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10)) -# define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7)) -# define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3)) -# define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10)) +# define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10)) +# define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7)) +# define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3)) +# define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10)) -# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) -# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) +# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) +# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) -# ifdef OPENSSL_SMALL_FOOTPRINT +# ifdef OPENSSL_SMALL_FOOTPRINT static void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num) @@ -184,7 +185,7 @@ static void sha256_block_data_order(SHA256_CTX *ctx, const void *in, h = ctx->h[7]; for (i = 0; i < 16; i++) { - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[i] = l; T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i]; T2 = Sigma0(a) + Maj(a, b, c); @@ -229,14 +230,14 @@ static void sha256_block_data_order(SHA256_CTX *ctx, const void *in, } } -# else +# else -# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \ +# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \ T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; \ h = Sigma0(a) + Maj(a,b,c); \ d += T1; h += T1; } while (0) -# define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \ +# define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \ s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \ s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \ T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \ @@ -308,52 +309,52 @@ static void sha256_block_data_order(SHA256_CTX *ctx, const void *in, } else { SHA_LONG l; - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[0] = l; ROUND_00_15(0, a, b, c, d, e, f, g, h); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[1] = l; ROUND_00_15(1, h, a, b, c, d, e, f, g); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[2] = l; ROUND_00_15(2, g, h, a, b, c, d, e, f); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[3] = l; ROUND_00_15(3, f, g, h, a, b, c, d, e); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[4] = l; ROUND_00_15(4, e, f, g, h, a, b, c, d); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[5] = l; ROUND_00_15(5, d, e, f, g, h, a, b, c); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[6] = l; ROUND_00_15(6, c, d, e, f, g, h, a, b); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[7] = l; ROUND_00_15(7, b, c, d, e, f, g, h, a); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[8] = l; ROUND_00_15(8, a, b, c, d, e, f, g, h); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[9] = l; ROUND_00_15(9, h, a, b, c, d, e, f, g); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[10] = l; ROUND_00_15(10, g, h, a, b, c, d, e, f); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[11] = l; ROUND_00_15(11, f, g, h, a, b, c, d, e); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[12] = l; ROUND_00_15(12, e, f, g, h, a, b, c, d); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[13] = l; ROUND_00_15(13, d, e, f, g, h, a, b, c); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[14] = l; ROUND_00_15(14, c, d, e, f, g, h, a, b); - HOST_c2l(data, l); + (void)HOST_c2l(data, l); T1 = X[15] = l; ROUND_00_15(15, b, c, d, e, f, g, h, a); } @@ -381,7 +382,5 @@ static void 
sha256_block_data_order(SHA256_CTX *ctx, const void *in, } } -# endif -# endif /* SHA256_ASM */ - -#endif /* OPENSSL_NO_SHA256 */ +# endif +#endif /* SHA256_ASM */ diff --git a/crypto/sha/sha256t.c b/crypto/sha/sha256t.c deleted file mode 100644 index 35dbbc2a96d7..000000000000 --- a/crypto/sha/sha256t.c +++ /dev/null @@ -1,158 +0,0 @@ -/* crypto/sha/sha256t.c */ -/* ==================================================================== - * Copyright (c) 2004 The OpenSSL Project. All rights reserved. - * ==================================================================== - */ -#include <stdio.h> -#include <string.h> -#include <stdlib.h> - -#include <openssl/sha.h> -#include <openssl/evp.h> - -#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA256) -int main(int argc, char *argv[]) -{ - printf("No SHA256 support\n"); - return (0); -} -#else - -unsigned char app_b1[SHA256_DIGEST_LENGTH] = { - 0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea, - 0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23, - 0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c, - 0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad -}; - -unsigned char app_b2[SHA256_DIGEST_LENGTH] = { - 0x24, 0x8d, 0x6a, 0x61, 0xd2, 0x06, 0x38, 0xb8, - 0xe5, 0xc0, 0x26, 0x93, 0x0c, 0x3e, 0x60, 0x39, - 0xa3, 0x3c, 0xe4, 0x59, 0x64, 0xff, 0x21, 0x67, - 0xf6, 0xec, 0xed, 0xd4, 0x19, 0xdb, 0x06, 0xc1 -}; - -unsigned char app_b3[SHA256_DIGEST_LENGTH] = { - 0xcd, 0xc7, 0x6e, 0x5c, 0x99, 0x14, 0xfb, 0x92, - 0x81, 0xa1, 0xc7, 0xe2, 0x84, 0xd7, 0x3e, 0x67, - 0xf1, 0x80, 0x9a, 0x48, 0xa4, 0x97, 0x20, 0x0e, - 0x04, 0x6d, 0x39, 0xcc, 0xc7, 0x11, 0x2c, 0xd0 -}; - -unsigned char addenum_1[SHA224_DIGEST_LENGTH] = { - 0x23, 0x09, 0x7d, 0x22, 0x34, 0x05, 0xd8, 0x22, - 0x86, 0x42, 0xa4, 0x77, 0xbd, 0xa2, 0x55, 0xb3, - 0x2a, 0xad, 0xbc, 0xe4, 0xbd, 0xa0, 0xb3, 0xf7, - 0xe3, 0x6c, 0x9d, 0xa7 -}; - -unsigned char addenum_2[SHA224_DIGEST_LENGTH] = { - 0x75, 0x38, 0x8b, 0x16, 0x51, 0x27, 0x76, 0xcc, - 0x5d, 0xba, 0x5d, 0xa1, 0xfd, 0x89, 0x01, 0x50, - 0xb0, 0xc6, 0x45, 0x5c, 0xb4, 0xf5, 0x8b, 0x19, - 0x52, 0x52, 0x25, 0x25 -}; - -unsigned char addenum_3[SHA224_DIGEST_LENGTH] = { - 0x20, 0x79, 0x46, 0x55, 0x98, 0x0c, 0x91, 0xd8, - 0xbb, 0xb4, 0xc1, 0xea, 0x97, 0x61, 0x8a, 0x4b, - 0xf0, 0x3f, 0x42, 0x58, 0x19, 0x48, 0xb2, 0xee, - 0x4e, 0xe7, 0xad, 0x67 -}; - -int main(int argc, char **argv) -{ - unsigned char md[SHA256_DIGEST_LENGTH]; - int i; - EVP_MD_CTX evp; - - fprintf(stdout, "Testing SHA-256 "); - - EVP_Digest("abc", 3, md, NULL, EVP_sha256(), NULL); - if (memcmp(md, app_b1, sizeof(app_b1))) { - fflush(stdout); - fprintf(stderr, "\nTEST 1 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - EVP_Digest("abcdbcde" "cdefdefg" "efghfghi" "ghijhijk" - "ijkljklm" "klmnlmno" "mnopnopq", 56, md, NULL, EVP_sha256(), - NULL); - if (memcmp(md, app_b2, sizeof(app_b2))) { - fflush(stdout); - fprintf(stderr, "\nTEST 2 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - EVP_MD_CTX_init(&evp); - EVP_DigestInit_ex(&evp, EVP_sha256(), NULL); - for (i = 0; i < 1000000; i += 160) - EVP_DigestUpdate(&evp, "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa", - (1000000 - i) < 160 ? 
1000000 - i : 160); - EVP_DigestFinal_ex(&evp, md, NULL); - EVP_MD_CTX_cleanup(&evp); - - if (memcmp(md, app_b3, sizeof(app_b3))) { - fflush(stdout); - fprintf(stderr, "\nTEST 3 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - fprintf(stdout, " passed.\n"); - fflush(stdout); - - fprintf(stdout, "Testing SHA-224 "); - - EVP_Digest("abc", 3, md, NULL, EVP_sha224(), NULL); - if (memcmp(md, addenum_1, sizeof(addenum_1))) { - fflush(stdout); - fprintf(stderr, "\nTEST 1 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - EVP_Digest("abcdbcde" "cdefdefg" "efghfghi" "ghijhijk" - "ijkljklm" "klmnlmno" "mnopnopq", 56, md, NULL, EVP_sha224(), - NULL); - if (memcmp(md, addenum_2, sizeof(addenum_2))) { - fflush(stdout); - fprintf(stderr, "\nTEST 2 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - EVP_MD_CTX_init(&evp); - EVP_DigestInit_ex(&evp, EVP_sha224(), NULL); - for (i = 0; i < 1000000; i += 64) - EVP_DigestUpdate(&evp, "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa", - (1000000 - i) < 64 ? 1000000 - i : 64); - EVP_DigestFinal_ex(&evp, md, NULL); - EVP_MD_CTX_cleanup(&evp); - - if (memcmp(md, addenum_3, sizeof(addenum_3))) { - fflush(stdout); - fprintf(stderr, "\nTEST 3 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - fprintf(stdout, " passed.\n"); - fflush(stdout); - - return 0; -} -#endif diff --git a/crypto/sha/sha512.c b/crypto/sha/sha512.c index 3bf66ae1987e..50b65ee811d6 100644 --- a/crypto/sha/sha512.c +++ b/crypto/sha/sha512.c @@ -1,17 +1,19 @@ -/* crypto/sha/sha512.c */ -/* ==================================================================== - * Copyright (c) 2004 The OpenSSL Project. All rights reserved - * according to the OpenSSL license [found in ../../LICENSE]. - * ==================================================================== +/* + * Copyright 2004-2018 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ + #include <openssl/opensslconf.h> -#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512) /*- * IMPLEMENTATION NOTES. * * As you might have noticed 32-bit hash algorithms: * - * - permit SHA_LONG to be wider than 32-bit (case on CRAY); + * - permit SHA_LONG to be wider than 32-bit * - optimized versions implement two transform functions: one operating * on [aligned] data in host byte order and one - on data in input * stream byte order; @@ -39,28 +41,62 @@ * As this implementation relies on 64-bit integer type, it's totally * inappropriate for platforms which don't support it, most notably * 16-bit platforms. 
- * <appro@fy.chalmers.se> */ -# include <stdlib.h> -# include <string.h> +#include <stdlib.h> +#include <string.h> -# include <openssl/crypto.h> -# include <openssl/sha.h> -# include <openssl/opensslv.h> +#include <openssl/crypto.h> +#include <openssl/sha.h> +#include <openssl/opensslv.h> -# include "cryptlib.h" +#include "internal/cryptlib.h" +#include "internal/sha.h" -const char SHA512_version[] = "SHA-512" OPENSSL_VERSION_PTEXT; - -# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ +#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \ defined(__s390__) || defined(__s390x__) || \ defined(__aarch64__) || \ defined(SHA512_ASM) -# define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA -# endif +# define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA +#endif -fips_md_init_ctx(SHA384, SHA512) +int sha512_224_init(SHA512_CTX *c) +{ + c->h[0] = U64(0x8c3d37c819544da2); + c->h[1] = U64(0x73e1996689dcd4d6); + c->h[2] = U64(0x1dfab7ae32ff9c82); + c->h[3] = U64(0x679dd514582f9fcf); + c->h[4] = U64(0x0f6d2b697bd44da8); + c->h[5] = U64(0x77e36f7304c48942); + c->h[6] = U64(0x3f9d85a86a1d36c8); + c->h[7] = U64(0x1112e6ad91d692a1); + + c->Nl = 0; + c->Nh = 0; + c->num = 0; + c->md_len = SHA224_DIGEST_LENGTH; + return 1; +} + +int sha512_256_init(SHA512_CTX *c) +{ + c->h[0] = U64(0x22312194fc2bf72c); + c->h[1] = U64(0x9f555fa3c84c64c2); + c->h[2] = U64(0x2393b86b6f53b151); + c->h[3] = U64(0x963877195940eabd); + c->h[4] = U64(0x96283ee2a88effe3); + c->h[5] = U64(0xbe5e1e2553863992); + c->h[6] = U64(0x2b0199fc2c85b8aa); + c->h[7] = U64(0x0eb72ddc81c52ca2); + + c->Nl = 0; + c->Nh = 0; + c->num = 0; + c->md_len = SHA256_DIGEST_LENGTH; + return 1; +} + +int SHA384_Init(SHA512_CTX *c) { c->h[0] = U64(0xcbbb9d5dc1059ed8); c->h[1] = U64(0x629a292a367cd507); @@ -78,7 +114,7 @@ fips_md_init_ctx(SHA384, SHA512) return 1; } -fips_md_init(SHA512) +int SHA512_Init(SHA512_CTX *c) { c->h[0] = U64(0x6a09e667f3bcc908); c->h[1] = U64(0xbb67ae8584caa73b); @@ -96,9 +132,9 @@ fips_md_init(SHA512) return 1; } -# ifndef SHA512_ASM +#ifndef SHA512_ASM static -# endif +#endif void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num); int SHA512_Final(unsigned char *md, SHA512_CTX *c) @@ -108,15 +144,17 @@ int SHA512_Final(unsigned char *md, SHA512_CTX *c) p[n] = 0x80; /* There always is a room for one */ n++; - if (n > (sizeof(c->u) - 16)) - memset(p + n, 0, sizeof(c->u) - n), n = 0, - sha512_block_data_order(c, p, 1); + if (n > (sizeof(c->u) - 16)) { + memset(p + n, 0, sizeof(c->u) - n); + n = 0; + sha512_block_data_order(c, p, 1); + } memset(p + n, 0, sizeof(c->u) - 16 - n); -# ifdef B_ENDIAN +#ifdef B_ENDIAN c->u.d[SHA_LBLOCK - 2] = c->Nh; c->u.d[SHA_LBLOCK - 1] = c->Nl; -# else +#else p[sizeof(c->u) - 1] = (unsigned char)(c->Nl); p[sizeof(c->u) - 2] = (unsigned char)(c->Nl >> 8); p[sizeof(c->u) - 3] = (unsigned char)(c->Nl >> 16); @@ -133,7 +171,7 @@ int SHA512_Final(unsigned char *md, SHA512_CTX *c) p[sizeof(c->u) - 14] = (unsigned char)(c->Nh >> 40); p[sizeof(c->u) - 15] = (unsigned char)(c->Nh >> 48); p[sizeof(c->u) - 16] = (unsigned char)(c->Nh >> 56); -# endif +#endif sha512_block_data_order(c, p, 1); @@ -141,7 +179,47 @@ int SHA512_Final(unsigned char *md, SHA512_CTX *c) return 0; switch (c->md_len) { - /* Let compiler decide if it's appropriate to unroll... */ + /* Let compiler decide if it's appropriate to unroll... 
*/ + case SHA224_DIGEST_LENGTH: + for (n = 0; n < SHA224_DIGEST_LENGTH / 8; n++) { + SHA_LONG64 t = c->h[n]; + + *(md++) = (unsigned char)(t >> 56); + *(md++) = (unsigned char)(t >> 48); + *(md++) = (unsigned char)(t >> 40); + *(md++) = (unsigned char)(t >> 32); + *(md++) = (unsigned char)(t >> 24); + *(md++) = (unsigned char)(t >> 16); + *(md++) = (unsigned char)(t >> 8); + *(md++) = (unsigned char)(t); + } + /* + * For 224 bits, there are four bytes left over that have to be + * processed separately. + */ + { + SHA_LONG64 t = c->h[SHA224_DIGEST_LENGTH / 8]; + + *(md++) = (unsigned char)(t >> 56); + *(md++) = (unsigned char)(t >> 48); + *(md++) = (unsigned char)(t >> 40); + *(md++) = (unsigned char)(t >> 32); + } + break; + case SHA256_DIGEST_LENGTH: + for (n = 0; n < SHA256_DIGEST_LENGTH / 8; n++) { + SHA_LONG64 t = c->h[n]; + + *(md++) = (unsigned char)(t >> 56); + *(md++) = (unsigned char)(t >> 48); + *(md++) = (unsigned char)(t >> 40); + *(md++) = (unsigned char)(t >> 32); + *(md++) = (unsigned char)(t >> 24); + *(md++) = (unsigned char)(t >> 16); + *(md++) = (unsigned char)(t >> 8); + *(md++) = (unsigned char)(t); + } + break; case SHA384_DIGEST_LENGTH: for (n = 0; n < SHA384_DIGEST_LENGTH / 8; n++) { SHA_LONG64 t = c->h[n]; @@ -170,7 +248,7 @@ int SHA512_Final(unsigned char *md, SHA512_CTX *c) *(md++) = (unsigned char)(t); } break; - /* ... as well as make sure md_len is not abused. */ + /* ... as well as make sure md_len is not abused. */ default: return 0; } @@ -213,16 +291,16 @@ int SHA512_Update(SHA512_CTX *c, const void *_data, size_t len) } if (len >= sizeof(c->u)) { -# ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA +#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA if ((size_t)data % sizeof(c->u.d[0]) != 0) while (len >= sizeof(c->u)) memcpy(p, data, sizeof(c->u)), - sha512_block_data_order(c, p, 1), - len -= sizeof(c->u), data += sizeof(c->u); + sha512_block_data_order(c, p, 1), + len -= sizeof(c->u), data += sizeof(c->u); else -# endif +#endif sha512_block_data_order(c, data, len / sizeof(c->u)), - data += len, len %= sizeof(c->u), data -= len; + data += len, len %= sizeof(c->u), data -= len; } if (len != 0) @@ -238,10 +316,10 @@ int SHA384_Update(SHA512_CTX *c, const void *data, size_t len) void SHA512_Transform(SHA512_CTX *c, const unsigned char *data) { -# ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA +#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA if ((size_t)data % sizeof(c->u.d[0]) != 0) memcpy(c->u.p, data, sizeof(c->u.p)), data = c->u.p; -# endif +#endif sha512_block_data_order(c, data, 1); } @@ -256,7 +334,7 @@ unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md) SHA512_Update(&c, d, n); SHA512_Final(md, &c); OPENSSL_cleanse(&c, sizeof(c)); - return (md); + return md; } unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md) @@ -270,10 +348,10 @@ unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md) SHA512_Update(&c, d, n); SHA512_Final(md, &c); OPENSSL_cleanse(&c, sizeof(c)); - return (md); + return md; } -# ifndef SHA512_ASM +#ifndef SHA512_ASM static const SHA_LONG64 K512[80] = { U64(0x428a2f98d728ae22), U64(0x7137449123ef65cd), U64(0xb5c0fbcfec4d3b2f), U64(0xe9b5dba58189dbbc), @@ -317,103 +395,111 @@ static const SHA_LONG64 K512[80] = { U64(0x5fcb6fab3ad6faec), U64(0x6c44198c4a475817) }; -# ifndef PEDANTIC -# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) -# if defined(__x86_64) || defined(__x86_64__) -# define ROTR(a,n) ({ SHA_LONG64 ret; \ +# ifndef 
PEDANTIC +# if defined(__GNUC__) && __GNUC__>=2 && \ + !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) +# if defined(__x86_64) || defined(__x86_64__) +# define ROTR(a,n) ({ SHA_LONG64 ret; \ asm ("rorq %1,%0" \ : "=r"(ret) \ : "J"(n),"0"(a) \ : "cc"); ret; }) -# if !defined(B_ENDIAN) -# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \ +# if !defined(B_ENDIAN) +# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \ asm ("bswapq %0" \ : "=r"(ret) \ : "0"(ret)); ret; }) -# endif -# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN) -# if defined(I386_ONLY) -# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\ - unsigned int hi=p[0],lo=p[1]; \ +# endif +# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN) +# if defined(I386_ONLY) +# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\ + unsigned int hi=p[0],lo=p[1]; \ asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\ "roll $16,%%eax; roll $16,%%edx; "\ - "xchgb %%ah,%%al;xchgb %%dh,%%dl;" \ + "xchgb %%ah,%%al;xchgb %%dh,%%dl;"\ : "=a"(lo),"=d"(hi) \ : "0"(lo),"1"(hi) : "cc"); \ ((SHA_LONG64)hi)<<32|lo; }) -# else -# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\ - unsigned int hi=p[0],lo=p[1]; \ +# else +# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\ + unsigned int hi=p[0],lo=p[1]; \ asm ("bswapl %0; bswapl %1;" \ : "=r"(lo),"=r"(hi) \ : "0"(lo),"1"(hi)); \ ((SHA_LONG64)hi)<<32|lo; }) -# endif -# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64) -# define ROTR(a,n) ({ SHA_LONG64 ret; \ +# endif +# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64) +# define ROTR(a,n) ({ SHA_LONG64 ret; \ asm ("rotrdi %0,%1,%2" \ : "=r"(ret) \ : "r"(a),"K"(n)); ret; }) -# elif defined(__aarch64__) -# define ROTR(a,n) ({ SHA_LONG64 ret; \ +# elif defined(__aarch64__) +# define ROTR(a,n) ({ SHA_LONG64 ret; \ asm ("ror %0,%1,%2" \ : "=r"(ret) \ : "r"(a),"I"(n)); ret; }) -# if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ +# if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__ -# define PULL64(x) ({ SHA_LONG64 ret; \ +# define PULL64(x) ({ SHA_LONG64 ret; \ asm ("rev %0,%1" \ : "=r"(ret) \ - : "r"(*((const SHA_LONG64 *)(&(x))))); ret; }) -# endif -# endif -# elif defined(_MSC_VER) -# if defined(_WIN64) /* applies to both IA-64 and AMD64 */ -# pragma intrinsic(_rotr64) -# define ROTR(a,n) _rotr64((a),n) + : "r"(*((const SHA_LONG64 *)(&(x))))); ret; }) # endif -# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) -# if defined(I386_ONLY) +# endif +# elif defined(_MSC_VER) +# if defined(_WIN64) /* applies to both IA-64 and AMD64 */ +# pragma intrinsic(_rotr64) +# define ROTR(a,n) _rotr64((a),n) +# endif +# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && \ + !defined(OPENSSL_NO_INLINE_ASM) +# if defined(I386_ONLY) static SHA_LONG64 __fastcall __pull64be(const void *x) { - _asm mov edx,[ecx + 0] - _asm mov eax,[ecx + 4] -_asm xchg dh, dl - _asm xchg ah, al - _asm rol edx, 16 _asm rol eax, 16 _asm xchg dh, dl _asm xchg ah, al} -# else + _asm mov edx,[ecx + 0] + _asm mov eax,[ecx + 4] + _asm xchg dh, dl + _asm xchg ah, al + _asm rol edx, 16 + _asm rol eax, 16 + _asm xchg dh, dl + _asm xchg ah, al +} +# else static SHA_LONG64 __fastcall __pull64be(const void *x) { - _asm mov edx,[ecx + 0] - _asm mov eax,[ecx + 4] -_asm bswap edx _asm bswap eax} -# endif -# define 
PULL64(x) __pull64be(&(x)) -# if _MSC_VER<=1200 -# pragma inline_depth(0) -# endif + _asm mov edx,[ecx + 0] + _asm mov eax,[ecx + 4] + _asm bswap edx + _asm bswap eax +} # endif +# define PULL64(x) __pull64be(&(x)) # endif # endif -# ifndef PULL64 -# define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8)) -# define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7)) -# endif -# ifndef ROTR -# define ROTR(x,s) (((x)>>s) | (x)<<(64-s)) -# endif -# define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) -# define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) -# define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) -# define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) -# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) -# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) -# if defined(__i386) || defined(__i386__) || defined(_M_IX86) +# endif +# ifndef PULL64 +# define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8)) +# define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7)) +# endif +# ifndef ROTR +# define ROTR(x,s) (((x)>>s) | (x)<<(64-s)) +# endif +# define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) +# define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) +# define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) +# define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) +# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) +# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) /* * This code should give better results on 32-bit CPU with less than * ~24 registers, both size and performance wise... - */ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, - size_t num) + */ + +static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, + size_t num) { const SHA_LONG64 *W = in; SHA_LONG64 A, E, T; @@ -433,11 +519,11 @@ _asm bswap edx _asm bswap eax} F[7] = ctx->h[7]; for (i = 0; i < 16; i++, F--) { -# ifdef B_ENDIAN +# ifdef B_ENDIAN T = W[i]; -# else +# else T = PULL64(W[i]); -# endif +# endif F[0] = A; F[4] = E; F[8] = T; @@ -472,7 +558,8 @@ _asm bswap edx _asm bswap eax} } } -# elif defined(OPENSSL_SMALL_FOOTPRINT) +# elif defined(OPENSSL_SMALL_FOOTPRINT) + static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num) { @@ -493,11 +580,11 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, h = ctx->h[7]; for (i = 0; i < 16; i++) { -# ifdef B_ENDIAN +# ifdef B_ENDIAN T1 = X[i] = W[i]; -# else +# else T1 = X[i] = PULL64(W[i]); -# endif +# endif T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i]; T2 = Sigma0(a) + Maj(a, b, c); h = g; @@ -542,16 +629,18 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, } } -# else -# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \ +# else +# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \ T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i]; \ h = Sigma0(a) + Maj(a,b,c); \ - d += T1; h += T1; } while (0) -# define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \ + d += T1; h += T1; } while (0) + +# define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \ s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \ s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \ T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \ ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0) + static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num) { @@ -571,7 +660,7 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, g = ctx->h[6]; h = 
ctx->h[7]; -# ifdef B_ENDIAN +# ifdef B_ENDIAN T1 = X[0] = W[0]; ROUND_00_15(0, a, b, c, d, e, f, g, h); T1 = X[1] = W[1]; @@ -604,7 +693,7 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, ROUND_00_15(14, c, d, e, f, g, h, a, b); T1 = X[15] = W[15]; ROUND_00_15(15, b, c, d, e, f, g, h, a); -# else +# else T1 = X[0] = PULL64(W[0]); ROUND_00_15(0, a, b, c, d, e, f, g, h); T1 = X[1] = PULL64(W[1]); @@ -637,7 +726,7 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, ROUND_00_15(14, c, d, e, f, g, h, a, b); T1 = X[15] = PULL64(W[15]); ROUND_00_15(15, b, c, d, e, f, g, h, a); -# endif +# endif for (i = 16; i < 80; i += 16) { ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X); @@ -671,14 +760,6 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in, } } -# endif - -# endif /* SHA512_ASM */ - -#else /* !OPENSSL_NO_SHA512 */ - -# if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX) -static void *dummy = &dummy; # endif -#endif /* !OPENSSL_NO_SHA512 */ +#endif /* SHA512_ASM */ diff --git a/crypto/sha/sha512t.c b/crypto/sha/sha512t.c deleted file mode 100644 index 178882fc76b6..000000000000 --- a/crypto/sha/sha512t.c +++ /dev/null @@ -1,196 +0,0 @@ -/* crypto/sha/sha512t.c */ -/* ==================================================================== - * Copyright (c) 2004 The OpenSSL Project. All rights reserved. - * ==================================================================== - */ -#include <stdio.h> -#include <string.h> -#include <stdlib.h> - -#include <openssl/sha.h> -#include <openssl/evp.h> -#include <openssl/crypto.h> - -#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA512) -int main(int argc, char *argv[]) -{ - printf("No SHA512 support\n"); - return (0); -} -#else - -unsigned char app_c1[SHA512_DIGEST_LENGTH] = { - 0xdd, 0xaf, 0x35, 0xa1, 0x93, 0x61, 0x7a, 0xba, - 0xcc, 0x41, 0x73, 0x49, 0xae, 0x20, 0x41, 0x31, - 0x12, 0xe6, 0xfa, 0x4e, 0x89, 0xa9, 0x7e, 0xa2, - 0x0a, 0x9e, 0xee, 0xe6, 0x4b, 0x55, 0xd3, 0x9a, - 0x21, 0x92, 0x99, 0x2a, 0x27, 0x4f, 0xc1, 0xa8, - 0x36, 0xba, 0x3c, 0x23, 0xa3, 0xfe, 0xeb, 0xbd, - 0x45, 0x4d, 0x44, 0x23, 0x64, 0x3c, 0xe8, 0x0e, - 0x2a, 0x9a, 0xc9, 0x4f, 0xa5, 0x4c, 0xa4, 0x9f -}; - -unsigned char app_c2[SHA512_DIGEST_LENGTH] = { - 0x8e, 0x95, 0x9b, 0x75, 0xda, 0xe3, 0x13, 0xda, - 0x8c, 0xf4, 0xf7, 0x28, 0x14, 0xfc, 0x14, 0x3f, - 0x8f, 0x77, 0x79, 0xc6, 0xeb, 0x9f, 0x7f, 0xa1, - 0x72, 0x99, 0xae, 0xad, 0xb6, 0x88, 0x90, 0x18, - 0x50, 0x1d, 0x28, 0x9e, 0x49, 0x00, 0xf7, 0xe4, - 0x33, 0x1b, 0x99, 0xde, 0xc4, 0xb5, 0x43, 0x3a, - 0xc7, 0xd3, 0x29, 0xee, 0xb6, 0xdd, 0x26, 0x54, - 0x5e, 0x96, 0xe5, 0x5b, 0x87, 0x4b, 0xe9, 0x09 -}; - -unsigned char app_c3[SHA512_DIGEST_LENGTH] = { - 0xe7, 0x18, 0x48, 0x3d, 0x0c, 0xe7, 0x69, 0x64, - 0x4e, 0x2e, 0x42, 0xc7, 0xbc, 0x15, 0xb4, 0x63, - 0x8e, 0x1f, 0x98, 0xb1, 0x3b, 0x20, 0x44, 0x28, - 0x56, 0x32, 0xa8, 0x03, 0xaf, 0xa9, 0x73, 0xeb, - 0xde, 0x0f, 0xf2, 0x44, 0x87, 0x7e, 0xa6, 0x0a, - 0x4c, 0xb0, 0x43, 0x2c, 0xe5, 0x77, 0xc3, 0x1b, - 0xeb, 0x00, 0x9c, 0x5c, 0x2c, 0x49, 0xaa, 0x2e, - 0x4e, 0xad, 0xb2, 0x17, 0xad, 0x8c, 0xc0, 0x9b -}; - -unsigned char app_d1[SHA384_DIGEST_LENGTH] = { - 0xcb, 0x00, 0x75, 0x3f, 0x45, 0xa3, 0x5e, 0x8b, - 0xb5, 0xa0, 0x3d, 0x69, 0x9a, 0xc6, 0x50, 0x07, - 0x27, 0x2c, 0x32, 0xab, 0x0e, 0xde, 0xd1, 0x63, - 0x1a, 0x8b, 0x60, 0x5a, 0x43, 0xff, 0x5b, 0xed, - 0x80, 0x86, 0x07, 0x2b, 0xa1, 0xe7, 0xcc, 0x23, - 0x58, 0xba, 0xec, 0xa1, 0x34, 0xc8, 0x25, 0xa7 -}; - -unsigned char app_d2[SHA384_DIGEST_LENGTH] = { - 0x09, 0x33, 0x0c, 
0x33, 0xf7, 0x11, 0x47, 0xe8, - 0x3d, 0x19, 0x2f, 0xc7, 0x82, 0xcd, 0x1b, 0x47, - 0x53, 0x11, 0x1b, 0x17, 0x3b, 0x3b, 0x05, 0xd2, - 0x2f, 0xa0, 0x80, 0x86, 0xe3, 0xb0, 0xf7, 0x12, - 0xfc, 0xc7, 0xc7, 0x1a, 0x55, 0x7e, 0x2d, 0xb9, - 0x66, 0xc3, 0xe9, 0xfa, 0x91, 0x74, 0x60, 0x39 -}; - -unsigned char app_d3[SHA384_DIGEST_LENGTH] = { - 0x9d, 0x0e, 0x18, 0x09, 0x71, 0x64, 0x74, 0xcb, - 0x08, 0x6e, 0x83, 0x4e, 0x31, 0x0a, 0x4a, 0x1c, - 0xed, 0x14, 0x9e, 0x9c, 0x00, 0xf2, 0x48, 0x52, - 0x79, 0x72, 0xce, 0xc5, 0x70, 0x4c, 0x2a, 0x5b, - 0x07, 0xb8, 0xb3, 0xdc, 0x38, 0xec, 0xc4, 0xeb, - 0xae, 0x97, 0xdd, 0xd8, 0x7f, 0x3d, 0x89, 0x85 -}; - -int main(int argc, char **argv) -{ - unsigned char md[SHA512_DIGEST_LENGTH]; - int i; - EVP_MD_CTX evp; - -# ifdef OPENSSL_IA32_SSE2 - /* - * Alternative to this is to call OpenSSL_add_all_algorithms... The below - * code is retained exclusively for debugging purposes. - */ - { - char *env; - - if ((env = getenv("OPENSSL_ia32cap"))) - OPENSSL_ia32cap = strtoul(env, NULL, 0); - } -# endif - - fprintf(stdout, "Testing SHA-512 "); - - EVP_Digest("abc", 3, md, NULL, EVP_sha512(), NULL); - if (memcmp(md, app_c1, sizeof(app_c1))) { - fflush(stdout); - fprintf(stderr, "\nTEST 1 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - EVP_Digest("abcdefgh" "bcdefghi" "cdefghij" "defghijk" - "efghijkl" "fghijklm" "ghijklmn" "hijklmno" - "ijklmnop" "jklmnopq" "klmnopqr" "lmnopqrs" - "mnopqrst" "nopqrstu", 112, md, NULL, EVP_sha512(), NULL); - if (memcmp(md, app_c2, sizeof(app_c2))) { - fflush(stdout); - fprintf(stderr, "\nTEST 2 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - EVP_MD_CTX_init(&evp); - EVP_DigestInit_ex(&evp, EVP_sha512(), NULL); - for (i = 0; i < 1000000; i += 288) - EVP_DigestUpdate(&evp, "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa", - (1000000 - i) < 288 ? 1000000 - i : 288); - EVP_DigestFinal_ex(&evp, md, NULL); - EVP_MD_CTX_cleanup(&evp); - - if (memcmp(md, app_c3, sizeof(app_c3))) { - fflush(stdout); - fprintf(stderr, "\nTEST 3 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - fprintf(stdout, " passed.\n"); - fflush(stdout); - - fprintf(stdout, "Testing SHA-384 "); - - EVP_Digest("abc", 3, md, NULL, EVP_sha384(), NULL); - if (memcmp(md, app_d1, sizeof(app_d1))) { - fflush(stdout); - fprintf(stderr, "\nTEST 1 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - EVP_Digest("abcdefgh" "bcdefghi" "cdefghij" "defghijk" - "efghijkl" "fghijklm" "ghijklmn" "hijklmno" - "ijklmnop" "jklmnopq" "klmnopqr" "lmnopqrs" - "mnopqrst" "nopqrstu", 112, md, NULL, EVP_sha384(), NULL); - if (memcmp(md, app_d2, sizeof(app_d2))) { - fflush(stdout); - fprintf(stderr, "\nTEST 2 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - EVP_MD_CTX_init(&evp); - EVP_DigestInit_ex(&evp, EVP_sha384(), NULL); - for (i = 0; i < 1000000; i += 64) - EVP_DigestUpdate(&evp, "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" - "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa", - (1000000 - i) < 64 ? 
1000000 - i : 64); - EVP_DigestFinal_ex(&evp, md, NULL); - EVP_MD_CTX_cleanup(&evp); - - if (memcmp(md, app_d3, sizeof(app_d3))) { - fflush(stdout); - fprintf(stderr, "\nTEST 3 of 3 failed.\n"); - return 1; - } else - fprintf(stdout, "."); - fflush(stdout); - - fprintf(stdout, " passed.\n"); - fflush(stdout); - - return 0; -} -#endif diff --git a/crypto/sha/sha_dgst.c b/crypto/sha/sha_dgst.c deleted file mode 100644 index f77cf5e38d8e..000000000000 --- a/crypto/sha/sha_dgst.c +++ /dev/null @@ -1,74 +0,0 @@ -/* crypto/sha/sha1dgst.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#include <openssl/crypto.h> -#include <openssl/opensslconf.h> -#if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA) - -# undef SHA_1 -# define SHA_0 - -# include <openssl/opensslv.h> - -const char SHA_version[] = "SHA" OPENSSL_VERSION_PTEXT; - -/* The implementation is in ../md32_common.h */ - -# include "sha_locl.h" - -#endif diff --git a/crypto/sha/sha_locl.h b/crypto/sha/sha_locl.h index 03bd411ede69..4e5a09038267 100644 --- a/crypto/sha/sha_locl.h +++ b/crypto/sha/sha_locl.h @@ -1,59 +1,10 @@ -/* crypto/sha/sha_locl.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. +/* + * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html */ #include <stdlib.h> @@ -76,45 +27,22 @@ ll=(c)->h4; (void)HOST_l2c(ll,(s)); \ } while (0) -#if defined(SHA_0) - -# define HASH_UPDATE SHA_Update -# define HASH_TRANSFORM SHA_Transform -# define HASH_FINAL SHA_Final -# define HASH_INIT SHA_Init -# define HASH_BLOCK_DATA_ORDER sha_block_data_order -# define Xupdate(a,ix,ia,ib,ic,id) (ix=(a)=(ia^ib^ic^id)) - -static void sha_block_data_order(SHA_CTX *c, const void *p, size_t num); - -#elif defined(SHA_1) - -# define HASH_UPDATE SHA1_Update -# define HASH_TRANSFORM SHA1_Transform -# define HASH_FINAL SHA1_Final -# define HASH_INIT SHA1_Init -# define HASH_BLOCK_DATA_ORDER sha1_block_data_order -# if defined(__MWERKS__) && defined(__MC68K__) - /* Metrowerks for Motorola fails otherwise:-( <appro@fy.chalmers.se> */ -# define Xupdate(a,ix,ia,ib,ic,id) do { (a)=(ia^ib^ic^id); \ - ix=(a)=ROTATE((a),1); \ - } while (0) -# else -# define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \ +#define HASH_UPDATE SHA1_Update +#define HASH_TRANSFORM SHA1_Transform +#define HASH_FINAL SHA1_Final +#define HASH_INIT SHA1_Init +#define HASH_BLOCK_DATA_ORDER sha1_block_data_order +#define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \ ix=(a)=ROTATE((a),1) \ ) -# endif - -# ifndef SHA1_ASM -static -# endif -void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num); +#ifndef SHA1_ASM +static void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num); #else -# error "Either SHA_0 or SHA_1 must be defined." +void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num); #endif -#include "md32_common.h" +#include "internal/md32_common.h" #define INIT_DATA_h0 0x67452301UL #define INIT_DATA_h1 0xefcdab89UL @@ -122,11 +50,7 @@ void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num); #define INIT_DATA_h3 0x10325476UL #define INIT_DATA_h4 0xc3d2e1f0UL -#ifdef SHA_0 -fips_md_init(SHA) -#else -fips_md_init_ctx(SHA1, SHA) -#endif +int HASH_INIT(SHA_CTX *c) { memset(c, 0, sizeof(*c)); c->h0 = INIT_DATA_h0; @@ -143,11 +67,12 @@ fips_md_init_ctx(SHA1, SHA) #define K_60_79 0xca62c1d6UL /* - * As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified - * to the code in F_00_19. Wei attributes these optimisations to Peter - * Gutmann's SHS code, and he attributes it to Rich Schroeppel. 
#define - * F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another - * tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a + * As pointed out by Wei Dai, F() below can be simplified to the code in + * F_00_19. Wei attributes these optimizations to Peter Gutmann's SHS code, + * and he attributes it to Rich Schroeppel. + * #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) + * I've just become aware of another tweak to be made, again from Wei Dai, + * in F_40_59, (x&a)|(y&a) -> (x|y)&a */ #define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d)) #define F_20_39(b,c,d) ((b) ^ (c) ^ (d)) @@ -191,12 +116,11 @@ fips_md_init_ctx(SHA1, SHA) # ifndef MD32_XARRAY /* * Originally X was an array. As it's automatic it's natural - * to expect RISC compiler to accomodate at least part of it in + * to expect RISC compiler to accommodate at least part of it in * the register bank, isn't it? Unfortunately not all compilers * "find" this expectation reasonable:-( On order to make such * compilers generate better code I replace X[] with a bunch of * X0, X1, etc. See the function body below... - * <appro@fy.chalmers.se> */ # define X(i) XX##i # else @@ -208,7 +132,7 @@ fips_md_init_ctx(SHA1, SHA) # define X(i) XX[i] # endif -# if !defined(SHA_1) || !defined(SHA1_ASM) +# if !defined(SHA1_ASM) static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num) { const unsigned char *data = p; @@ -442,7 +366,7 @@ static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num) E=D, D=C, C=ROTATE(B,30), B=A; \ A=ROTATE(A,5)+T+xa; } while(0) -# if !defined(SHA_1) || !defined(SHA1_ASM) +# if !defined(SHA1_ASM) static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num) { const unsigned char *data = p; @@ -458,7 +382,7 @@ static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num) for (;;) { for (i = 0; i < 16; i++) { - HOST_c2l(data, l); + (void)HOST_c2l(data, l); X[i] = l; BODY_00_15(X[i]); } diff --git a/crypto/sha/sha_one.c b/crypto/sha/sha_one.c deleted file mode 100644 index 0930b98a66b6..000000000000 --- a/crypto/sha/sha_one.c +++ /dev/null @@ -1,79 +0,0 @@ -/* crypto/sha/sha_one.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -#include <stdio.h> -#include <string.h> -#include <openssl/sha.h> -#include <openssl/crypto.h> - -#ifndef OPENSSL_NO_SHA0 -unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md) -{ - SHA_CTX c; - static unsigned char m[SHA_DIGEST_LENGTH]; - - if (md == NULL) - md = m; - if (!SHA_Init(&c)) - return NULL; - SHA_Update(&c, d, n); - SHA_Final(md, &c); - OPENSSL_cleanse(&c, sizeof(c)); - return (md); -} -#endif diff --git a/crypto/sha/shatest.c b/crypto/sha/shatest.c deleted file mode 100644 index 105060a7ec2d..000000000000 --- a/crypto/sha/shatest.c +++ /dev/null @@ -1,174 +0,0 @@ -/* crypto/sha/shatest.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> - -#include "../e_os.h" - -#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA0) -int main(int argc, char *argv[]) -{ - printf("No SHA0 support\n"); - return (0); -} -#else -# include <openssl/evp.h> -# include <openssl/sha.h> - -# ifdef CHARSET_EBCDIC -# include <openssl/ebcdic.h> -# endif - -# define SHA_0 /* FIPS 180 */ -# undef SHA_1 /* FIPS 180-1 */ - -static char *test[] = { - "abc", - "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", - NULL, -}; - -# ifdef SHA_0 -static char *ret[] = { - "0164b8a914cd2a5e74c4f7ff082c4d97f1edf880", - "d2516ee1acfa5baf33dfc1c471e438449ef134c8", -}; - -static char *bigret = "3232affa48628a26653b5aaa44541fd90d690603"; -# endif -# ifdef SHA_1 -static char *ret[] = { - "a9993e364706816aba3e25717850c26c9cd0d89d", - "84983e441c3bd26ebaae4aa1f95129e5e54670f1", -}; - -static char *bigret = "34aa973cd4c4daa4f61eeb2bdbad27316534016f"; -# endif - -static char *pt(unsigned char *md); -int main(int argc, char *argv[]) -{ - int i, err = 0; - char **P, **R; - static unsigned char buf[1000]; - char *p, *r; - EVP_MD_CTX c; - unsigned char md[SHA_DIGEST_LENGTH]; - -# ifdef CHARSET_EBCDIC - ebcdic2ascii(test[0], test[0], strlen(test[0])); - ebcdic2ascii(test[1], test[1], strlen(test[1])); -# endif - - EVP_MD_CTX_init(&c); - P = test; - R = ret; - i = 1; - while (*P != NULL) { - EVP_Digest(*P, strlen(*P), md, NULL, EVP_sha(), NULL); - p = pt(md); - if (strcmp(p, *R) != 0) { - printf("error calculating SHA on '%s'\n", *P); - printf("got %s instead of %s\n", p, *R); - err++; - } else - printf("test %d ok\n", i); - i++; - R++; - P++; - } - - memset(buf, 'a', 1000); -# ifdef CHARSET_EBCDIC - ebcdic2ascii(buf, buf, 1000); -# endif /* CHARSET_EBCDIC */ - EVP_DigestInit_ex(&c, EVP_sha(), NULL); - for (i = 0; i < 1000; i++) - EVP_DigestUpdate(&c, buf, 1000); - EVP_DigestFinal_ex(&c, md, NULL); - p = pt(md); - - r = bigret; - if (strcmp(p, r) != 0) { - printf("error calculating SHA on '%s'\n", p); - printf("got %s instead of %s\n", p, r); - err++; - } else - printf("test 3 ok\n"); - -# ifdef OPENSSL_SYS_NETWARE - if (err) - printf("ERROR: %d\n", err); -# endif - EVP_MD_CTX_cleanup(&c); - EXIT(err); - return (0); -} - -static char *pt(unsigned char *md) -{ - int i; - static char buf[80]; - - for (i = 0; i < SHA_DIGEST_LENGTH; i++) - sprintf(&(buf[i * 2]), "%02x", md[i]); - return (buf); -} -#endif
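The removed shatest.c above was the last of the deleted known-answer drivers, and the only one for SHA-0; EVP_sha() and SHA-0 support are gone from current OpenSSL. The pattern it used (one-shot digest, hex-format via pt(), strcmp() against a hard-coded vector) still applies to the surviving algorithms. A minimal self-contained sketch against SHA-1, reusing the FIPS 180-1 "abc" vector from the ret[] table above; unlike the original pt(), this variant writes into a caller-supplied buffer rather than a static one:

    #include <stdio.h>
    #include <string.h>
    #include <openssl/evp.h>
    #include <openssl/sha.h>

    /* Hex-format a SHA-1 digest into a caller-supplied buffer. */
    static char *pt(const unsigned char *md, char *buf)
    {
        int i;

        for (i = 0; i < SHA_DIGEST_LENGTH; i++)
            sprintf(&buf[i * 2], "%02x", md[i]);
        return buf;
    }

    int main(void)
    {
        unsigned char md[SHA_DIGEST_LENGTH];
        char buf[2 * SHA_DIGEST_LENGTH + 1];
        /* FIPS 180-1 known answer for SHA-1("abc") */
        static const char *want = "a9993e364706816aba3e25717850c26c9cd0d89d";

        if (!EVP_Digest("abc", 3, md, NULL, EVP_sha1(), NULL))
            return 1;
        if (strcmp(pt(md, buf), want) != 0) {
            printf("got %s instead of %s\n", buf, want);
            return 1;
        }
        printf("test ok\n");
        return 0;
    }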