Diffstat (limited to 'crypto/sha')
-rw-r--r-- | crypto/sha/Makefile | 177
-rw-r--r-- | crypto/sha/asm/README | 1
-rwxr-xr-x | crypto/sha/asm/keccak1600-armv4.pl | 1606
-rwxr-xr-x | crypto/sha/asm/keccak1600-armv8.pl | 866
-rwxr-xr-x | crypto/sha/asm/keccak1600-avx2.pl | 482
-rwxr-xr-x | crypto/sha/asm/keccak1600-avx512.pl | 551
-rwxr-xr-x | crypto/sha/asm/keccak1600-avx512vl.pl | 392
-rwxr-xr-x | crypto/sha/asm/keccak1600-c64x.pl | 885
-rwxr-xr-x | crypto/sha/asm/keccak1600-mmx.pl | 440
-rwxr-xr-x | crypto/sha/asm/keccak1600-ppc64.pl | 758
-rwxr-xr-x | crypto/sha/asm/keccak1600-s390x.pl | 560
-rwxr-xr-x | crypto/sha/asm/keccak1600-x86_64.pl | 607
-rwxr-xr-x | crypto/sha/asm/keccak1600p8-ppc.pl | 850
-rw-r--r-- | crypto/sha/asm/sha1-586.pl | 37
-rwxr-xr-x | crypto/sha/asm/sha1-armv4-large.pl | 79
-rwxr-xr-x | crypto/sha/asm/sha1-armv8.pl | 30
-rwxr-xr-x | crypto/sha/asm/sha1-c64xplus.pl | 337
-rw-r--r-- | crypto/sha/asm/sha1-ia64.pl | 15
-rwxr-xr-x | crypto/sha/asm/sha1-mb-x86_64.pl | 80
-rwxr-xr-x | crypto/sha/asm/sha1-mips.pl | 47
-rwxr-xr-x | crypto/sha/asm/sha1-parisc.pl | 31
-rwxr-xr-x | crypto/sha/asm/sha1-ppc.pl | 13
-rwxr-xr-x | crypto/sha/asm/sha1-s390x.pl | 24
-rwxr-xr-x | crypto/sha/asm/sha1-sparcv9.pl | 19
-rwxr-xr-x | crypto/sha/asm/sha1-sparcv9a.pl | 15
-rwxr-xr-x | crypto/sha/asm/sha1-thumb.pl | 13
-rwxr-xr-x | crypto/sha/asm/sha1-x86_64.pl | 217
-rwxr-xr-x | crypto/sha/asm/sha256-586.pl | 29
-rwxr-xr-x | crypto/sha/asm/sha256-armv4.pl | 61
-rwxr-xr-x | crypto/sha/asm/sha256-c64xplus.pl | 320
-rwxr-xr-x | crypto/sha/asm/sha256-mb-x86_64.pl | 76
-rwxr-xr-x | crypto/sha/asm/sha512-586.pl | 22
-rwxr-xr-x | crypto/sha/asm/sha512-armv4.pl | 107
-rwxr-xr-x | crypto/sha/asm/sha512-armv8.pl | 521
-rwxr-xr-x | crypto/sha/asm/sha512-c64xplus.pl | 438
-rwxr-xr-x | crypto/sha/asm/sha512-ia64.pl | 11
-rwxr-xr-x | crypto/sha/asm/sha512-mips.pl | 43
-rwxr-xr-x | crypto/sha/asm/sha512-parisc.pl | 30
-rwxr-xr-x | crypto/sha/asm/sha512-ppc.pl | 11
-rwxr-xr-x | crypto/sha/asm/sha512-s390x.pl | 25
-rwxr-xr-x | crypto/sha/asm/sha512-sparcv9.pl | 21
-rwxr-xr-x | crypto/sha/asm/sha512-x86_64.pl | 204
-rwxr-xr-x | crypto/sha/asm/sha512p8-ppc.pl | 137
-rw-r--r-- | crypto/sha/build.info | 89
-rw-r--r-- | crypto/sha/keccak1600.c | 1246
-rw-r--r-- | crypto/sha/sha.c | 118
-rw-r--r-- | crypto/sha/sha.h | 214
-rw-r--r-- | crypto/sha/sha1.c | 121
-rw-r--r-- | crypto/sha/sha1_one.c | 65
-rw-r--r-- | crypto/sha/sha1dgst.c | 69
-rw-r--r-- | crypto/sha/sha1test.c | 174
-rw-r--r-- | crypto/sha/sha256.c | 123
-rw-r--r-- | crypto/sha/sha256t.c | 158
-rw-r--r-- | crypto/sha/sha512.c | 331
-rw-r--r-- | crypto/sha/sha512t.c | 196
-rw-r--r-- | crypto/sha/sha_dgst.c | 74
-rw-r--r-- | crypto/sha/sha_locl.h | 130
-rw-r--r-- | crypto/sha/sha_one.c | 79
-rw-r--r-- | crypto/sha/shatest.c | 174
59 files changed, 12237 insertions, 2312 deletions
diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile
deleted file mode 100644
index 8b8f8b285f1c..000000000000
--- a/crypto/sha/Makefile
+++ /dev/null
@@ -1,177 +0,0 @@
-#
-# OpenSSL/crypto/sha/Makefile
-#
-
-DIR= sha
-TOP= ../..
-CC= cc
-CPP= $(CC) -E
-INCLUDES=
-CFLAG=-g
-MAKEFILE= Makefile
-AR= ar r
-
-SHA1_ASM_OBJ=
-
-CFLAGS= $(INCLUDES) $(CFLAG)
-ASFLAGS= $(INCLUDES) $(ASFLAG)
-AFLAGS= $(ASFLAGS)
-
-GENERAL=Makefile
-TEST=shatest.c sha1test.c sha256t.c sha512t.c
-APPS=
-
-LIB=$(TOP)/libcrypto.a
-LIBSRC=sha_dgst.c sha1dgst.c sha_one.c sha1_one.c sha256.c sha512.c
-LIBOBJ=sha_dgst.o sha1dgst.o sha_one.o sha1_one.o sha256.o sha512.o $(SHA1_ASM_OBJ)
-
-SRC= $(LIBSRC)
-
-EXHEADER= sha.h
-HEADER= sha_locl.h $(EXHEADER)
-
-ALL= $(GENERAL) $(SRC) $(HEADER)
-
-top:
- (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
-
-all: lib
-
-lib: $(LIBOBJ)
- $(AR) $(LIB) $(LIBOBJ)
- $(RANLIB) $(LIB) || echo Never mind.
- @touch lib
-
-sha1-586.s: asm/sha1-586.pl ../perlasm/x86asm.pl
- $(PERL) asm/sha1-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-sha256-586.s: asm/sha256-586.pl ../perlasm/x86asm.pl
- $(PERL) asm/sha256-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-sha512-586.s: asm/sha512-586.pl ../perlasm/x86asm.pl
- $(PERL) asm/sha512-586.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
-
-sha1-ia64.s: asm/sha1-ia64.pl
- (cd asm; $(PERL) sha1-ia64.pl ../$@ $(CFLAGS))
-sha256-ia64.s: asm/sha512-ia64.pl
- (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS))
-sha512-ia64.s: asm/sha512-ia64.pl
- (cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS))
-
-sha256-armv4.S: asm/sha256-armv4.pl
- $(PERL) $< $(PERLASM_SCHEME) $@
-
-sha1-alpha.s: asm/sha1-alpha.pl
- (preproc=$$$$.$@.S; trap "rm $$preproc" INT; \
- $(PERL) asm/sha1-alpha.pl > $$preproc && \
- $(CC) -E -P $$preproc > $@ && rm $$preproc)
-
-# Solaris make has to be explicitly told
-sha1-x86_64.s: asm/sha1-x86_64.pl; $(PERL) asm/sha1-x86_64.pl $(PERLASM_SCHEME) > $@
-sha1-mb-x86_64.s: asm/sha1-mb-x86_64.pl; $(PERL) asm/sha1-mb-x86_64.pl $(PERLASM_SCHEME) > $@
-sha256-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
-sha256-mb-x86_64.s: asm/sha256-mb-x86_64.pl; $(PERL) asm/sha256-mb-x86_64.pl $(PERLASM_SCHEME) > $@
-sha512-x86_64.s:asm/sha512-x86_64.pl; $(PERL) asm/sha512-x86_64.pl $(PERLASM_SCHEME) $@
-sha1-sparcv9.S: asm/sha1-sparcv9.pl; $(PERL) asm/sha1-sparcv9.pl $@ $(CFLAGS)
-sha256-sparcv9.S:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
-sha512-sparcv9.S:asm/sha512-sparcv9.pl; $(PERL) asm/sha512-sparcv9.pl $@ $(CFLAGS)
-
-sha1-ppc.s: asm/sha1-ppc.pl; $(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
-sha256-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
-sha512-ppc.s: asm/sha512-ppc.pl; $(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
-sha256p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
-sha512p8-ppc.s: asm/sha512p8-ppc.pl; $(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
-
-sha1-parisc.s: asm/sha1-parisc.pl; $(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@
-sha256-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@
-sha512-parisc.s:asm/sha512-parisc.pl; $(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@
-
-sha1-mips.S: asm/sha1-mips.pl; $(PERL) asm/sha1-mips.pl $(PERLASM_SCHEME) $@
-sha256-mips.S: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@
-sha512-mips.S: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@
-
-# GNU make "catch all"
-sha1-%.S: asm/sha1-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
-sha256-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
-sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
-
-sha1-armv4-large.o: sha1-armv4-large.S
-sha256-armv4.o: sha256-armv4.S
-sha512-armv4.o: sha512-armv4.S
-sha1-armv8.o: sha1-armv8.S
-sha256-armv8.o: sha256-armv8.S
-sha512-armv8.o: sha512-armv8.S
-
-files:
- $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
-
-links:
- @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
- @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
- @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
-
-install:
- @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
- @headerlist="$(EXHEADER)"; for i in $$headerlist ; \
- do \
- (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
- chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
- done;
-
-tags:
- ctags $(SRC)
-
-tests:
-
-lint:
- lint -DLINT $(INCLUDES) $(SRC)>fluff
-
-update: depend
-
-depend:
- @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
- $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
-
-dclean:
- $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
- mv -f Makefile.new $(MAKEFILE)
-
-clean:
- rm -f *.s *.S *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
-
-# DO NOT DELETE THIS LINE -- make depend depends on it.
-
-sha1_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-sha1_one.o: ../../include/openssl/opensslconf.h
-sha1_one.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-sha1_one.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
-sha1_one.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-sha1_one.o: sha1_one.c
-sha1dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-sha1dgst.o: ../../include/openssl/opensslconf.h
-sha1dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-sha1dgst.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
-sha1dgst.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-sha1dgst.o: ../md32_common.h sha1dgst.c sha_locl.h
-sha256.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-sha256.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
-sha256.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
-sha256.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
-sha256.o: ../../include/openssl/symhacks.h ../md32_common.h sha256.c
-sha512.o: ../../e_os.h ../../include/openssl/bio.h
-sha512.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-sha512.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-sha512.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-sha512.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-sha512.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
-sha512.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-sha512.o: ../cryptlib.h sha512.c
-sha_dgst.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-sha_dgst.o: ../../include/openssl/opensslconf.h
-sha_dgst.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-sha_dgst.o: ../../include/openssl/safestack.h ../../include/openssl/sha.h
-sha_dgst.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-sha_dgst.o: ../md32_common.h sha_dgst.c sha_locl.h
-sha_one.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-sha_one.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
-sha_one.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
-sha_one.o: ../../include/openssl/sha.h ../../include/openssl/stack.h
-sha_one.o: ../../include/openssl/symhacks.h sha_one.c
diff --git a/crypto/sha/asm/README b/crypto/sha/asm/README
deleted file mode 100644
index b7e755765fcc..000000000000
--- a/crypto/sha/asm/README
+++ /dev/null
@@ -1 +0,0 @@
-C2.pl works
diff --git a/crypto/sha/asm/keccak1600-armv4.pl b/crypto/sha/asm/keccak1600-armv4.pl
new file mode 100755
index 000000000000..8bf665c8b38d
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-armv4.pl
@@ -0,0 +1,1606 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for ARMv4.
+#
+# June 2017.
+#
+# The non-NEON code is the KECCAK_1X variant (see sha/keccak1600.c) with
+# bit interleaving. How does it compare to the Keccak Code Package? It's
+# as fast, but several times smaller, and is endian- and ISA-neutral.
+# ISA neutrality means that the minimum ISA requirement is ARMv4, yet it
+# can be assembled even as Thumb-2. The NEON code path is KECCAK_1X_ALT
+# with the register layout taken from the Keccak Code Package. It's also
+# as fast, in fact 10-15% faster on some processors, and endian-neutral.
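+#
+# (A word on bit interleaving, since the trick matters throughout: each
+# 64-bit lane is kept as two 32-bit words holding its even and odd bits.
+# A 64-bit rotation by an even amount 2k then becomes two 32-bit rotates
+# by k, one per half, while a rotation by an odd amount 2k+1 swaps the
+# halves, rotating one by k and the other by k+1. Either way only plain
+# 32-bit rotates are needed, which ARM provides for free as part of its
+# data-processing instructions.)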
+#
+# August 2017.
+#
+# Switch to the KECCAK_2X variant for non-NEON code and merge almost 1/2
+# of the rotate instructions with logical ones. This resulted in ~10%
+# improvement on most processors. Switching to KECCAK_2X effectively
+# minimizes re-loads from temporary storage, and the merged rotates just
+# eliminate the corresponding instructions. As for the latter: when
+# examining the code you'll notice commented-out ror instructions. These
+# are the eliminated ones; trace the destination register below to see
+# where the rotation is picked up instead. In case you wonder why not
+# all rotates are eliminated: the trouble is that some operations need
+# both inputs rotated, e.g. 'eor a,b>>>x,c>>>y'. This conundrum is
+# resolved by using 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in
+# the next operation that takes 'a' as input. The catch is that this
+# next operation can be in the next round. It's entirely possible to
+# "carry" rotate "factors" into the next round, but it makes the code
+# more complex, so "almost 1/2" acts as a complexity cap [for the
+# time being]...
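+#
+# (Purely illustrative sketch of the merge, not lines from this module:
+#	@ ror	r4,r4,#32-14		@ this rotate is NOT emitted...
+#	eor	r7,r8,r4,ror#32-14	@ ...it is folded into the consumer,
+# the barrel shifter rotating the second operand at no extra cost.)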
+#
+# Reduce per-round instruction count in Thumb-2 case by 16%. This is
+# achieved by folding ldr/str pairs to their double-word counterparts.
+# Theoretically this should have improved performance on single-issue
+# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
+# usual...
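+#
+# (Illustrative only: a marked pair such as
+#	ldr.l	r4,[sp,#56]
+#	ldr.h	r5,[sp,#60]
+# - the two needn't be adjacent in the source - is emitted as two plain
+# ldr instructions when targeting ARM, but the post-processing loop at
+# the bottom of this file folds it into a single
+#	ldrd	r4,r5,[sp,#56]
+# under __thumb2__.)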
+#
+########################################################################
+# Numbers are cycles per processed byte. Non-NEON results account even
+# for input bit interleaving.
+#
+#                  r=1088(*)    Thumb-2(**)  NEON
+#
+# ARM11xx          82/+150%
+# Cortex-A5        88/+160%,        86,        36
+# Cortex-A7        78/+160%,        68,        34
+# Cortex-A8        51/+230%,        57,        30
+# Cortex-A9        53/+210%,        51,        26
+# Cortex-A15       42/+160%,        38,        18
+# Snapdragon S4    43/+210%,        38,        24
+#
+# (*) Corresponds to SHA3-256. Percentage after slash is improvement
+# over compiler-generated KECCAK_2X reference code.
+# (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
+# Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
+# processors are presented mostly for reference purposes.
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my @C = map("r$_",(0..9));
+my @E = map("r$_",(10..12,14));
+
+########################################################################
+# Stack layout
+# ----->+-----------------------+
+# | uint64_t A[5][5] |
+# | ... |
+# +200->+-----------------------+
+# | uint64_t D[5] |
+# | ... |
+# +240->+-----------------------+
+# | uint64_t T[5][5] |
+# | ... |
+# +440->+-----------------------+
+# | saved lr |
+# +444->+-----------------------+
+# | loop counter |
+# +448->+-----------------------+
+# | ...
+
+my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
+my @D = map(8*$_, (25..29));
+my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
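+# (Sanity check against the picture above: $A[1][2] evaluates to
+# 8*(5+2) = 56, $D[0] to 8*25 = 200 and $T[0][0] to 8*30 = 240, i.e.
+# D[] indeed starts at +200 and T[][] at +240.)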
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.type iotas32, %object
+.align 5
+iotas32:
+ .long 0x00000001, 0x00000000
+ .long 0x00000000, 0x00000089
+ .long 0x00000000, 0x8000008b
+ .long 0x00000000, 0x80008080
+ .long 0x00000001, 0x0000008b
+ .long 0x00000001, 0x00008000
+ .long 0x00000001, 0x80008088
+ .long 0x00000001, 0x80000082
+ .long 0x00000000, 0x0000000b
+ .long 0x00000000, 0x0000000a
+ .long 0x00000001, 0x00008082
+ .long 0x00000000, 0x00008003
+ .long 0x00000001, 0x0000808b
+ .long 0x00000001, 0x8000000b
+ .long 0x00000001, 0x8000008a
+ .long 0x00000001, 0x80000081
+ .long 0x00000000, 0x80000081
+ .long 0x00000000, 0x80000008
+ .long 0x00000000, 0x00000083
+ .long 0x00000000, 0x80008003
+ .long 0x00000001, 0x80008088
+ .long 0x00000000, 0x80000088
+ .long 0x00000001, 0x00008000
+ .long 0x00000000, 0x80008082
+.size iotas32,.-iotas32
+
+.type KeccakF1600_int, %function
+.align 5
+KeccakF1600_int:
+ add @C[9],sp,#$A[4][2]
+ add @E[2],sp,#$A[0][0]
+ add @E[0],sp,#$A[1][0]
+ ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
+KeccakF1600_enter:
+ str lr,[sp,#440]
+ eor @E[1],@E[1],@E[1]
+ str @E[1],[sp,#444]
+ b .Lround2x
+
+.align 4
+.Lround2x:
+___
+sub Round {
+my (@A,@R); (@A[0..4],@R) = @_;
+
+$code.=<<___;
+ ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
+#ifdef __thumb2__
+ eor @C[0],@C[0],@E[0]
+ eor @C[1],@C[1],@E[1]
+ eor @C[2],@C[2],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[1][2]]
+ eor @C[3],@C[3],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[1][3]]
+ eor @C[4],@C[4],@E[0]
+ eor @C[5],@C[5],@E[1]
+ eor @C[6],@C[6],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[1][4]]
+ eor @C[7],@C[7],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[2][0]]
+ eor @C[8],@C[8],@E[0]
+ eor @C[9],@C[9],@E[1]
+ eor @C[0],@C[0],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[2][1]]
+ eor @C[1],@C[1],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[2][2]]
+ eor @C[2],@C[2],@E[0]
+ eor @C[3],@C[3],@E[1]
+ eor @C[4],@C[4],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[2][3]]
+ eor @C[5],@C[5],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[2][4]]
+ eor @C[6],@C[6],@E[0]
+ eor @C[7],@C[7],@E[1]
+ eor @C[8],@C[8],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[3][0]]
+ eor @C[9],@C[9],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[3][1]]
+ eor @C[0],@C[0],@E[0]
+ eor @C[1],@C[1],@E[1]
+ eor @C[2],@C[2],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[3][2]]
+ eor @C[3],@C[3],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[3][3]]
+ eor @C[4],@C[4],@E[0]
+ eor @C[5],@C[5],@E[1]
+ eor @C[6],@C[6],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[3][4]]
+ eor @C[7],@C[7],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[4][0]]
+ eor @C[8],@C[8],@E[0]
+ eor @C[9],@C[9],@E[1]
+ eor @C[0],@C[0],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[4][1]]
+ eor @C[1],@C[1],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[0][2]]
+ eor @C[2],@C[2],@E[0]
+ eor @C[3],@C[3],@E[1]
+ eor @C[4],@C[4],@E[2]
+ ldrd @E[0],@E[1],[sp,#$A[0][3]]
+ eor @C[5],@C[5],@E[3]
+ ldrd @E[2],@E[3],[sp,#$A[0][4]]
+#else
+ eor @C[0],@C[0],@E[0]
+ add @E[0],sp,#$A[1][2]
+ eor @C[1],@C[1],@E[1]
+ eor @C[2],@C[2],@E[2]
+ eor @C[3],@C[3],@E[3]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
+ eor @C[4],@C[4],@E[0]
+ add @E[0],sp,#$A[1][4]
+ eor @C[5],@C[5],@E[1]
+ eor @C[6],@C[6],@E[2]
+ eor @C[7],@C[7],@E[3]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
+ eor @C[8],@C[8],@E[0]
+ add @E[0],sp,#$A[2][1]
+ eor @C[9],@C[9],@E[1]
+ eor @C[0],@C[0],@E[2]
+ eor @C[1],@C[1],@E[3]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
+ eor @C[2],@C[2],@E[0]
+ add @E[0],sp,#$A[2][3]
+ eor @C[3],@C[3],@E[1]
+ eor @C[4],@C[4],@E[2]
+ eor @C[5],@C[5],@E[3]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
+ eor @C[6],@C[6],@E[0]
+ add @E[0],sp,#$A[3][0]
+ eor @C[7],@C[7],@E[1]
+ eor @C[8],@C[8],@E[2]
+ eor @C[9],@C[9],@E[3]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
+ eor @C[0],@C[0],@E[0]
+ add @E[0],sp,#$A[3][2]
+ eor @C[1],@C[1],@E[1]
+ eor @C[2],@C[2],@E[2]
+ eor @C[3],@C[3],@E[3]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
+ eor @C[4],@C[4],@E[0]
+ add @E[0],sp,#$A[3][4]
+ eor @C[5],@C[5],@E[1]
+ eor @C[6],@C[6],@E[2]
+ eor @C[7],@C[7],@E[3]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
+ eor @C[8],@C[8],@E[0]
+ ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
+ eor @C[9],@C[9],@E[1]
+ ldr @E[1],[sp,#$A[4][1]+4]
+ eor @C[0],@C[0],@E[2]
+ ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
+ eor @C[1],@C[1],@E[3]
+ ldr @E[3],[sp,#$A[0][2]+4]
+ eor @C[2],@C[2],@E[0]
+ add @E[0],sp,#$A[0][3]
+ eor @C[3],@C[3],@E[1]
+ eor @C[4],@C[4],@E[2]
+ eor @C[5],@C[5],@E[3]
+ ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
+#endif
+ eor @C[6],@C[6],@E[0]
+ eor @C[7],@C[7],@E[1]
+ eor @C[8],@C[8],@E[2]
+ eor @C[9],@C[9],@E[3]
+
+ eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
+ str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
+ eor @E[1],@C[1],@C[4]
+ str.h @E[1],[sp,#$D[1]+4]
+ eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
+ eor @E[3],@C[7],@C[0]
+ str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
+ eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
+ str.h @E[3],[sp,#$D[4]+4]
+ eor @C[1],@C[9],@C[2]
+ str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
+ eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
+ ldr.l @C[7],[sp,#$A[3][3]]
+ eor @C[3],@C[3],@C[6]
+ str.h @C[1],[sp,#$D[0]+4]
+ ldr.h @C[6],[sp,#$A[3][3]+4]
+ str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
+ eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
+ str.h @C[3],[sp,#$D[2]+4]
+ eor @C[5],@C[5],@C[8]
+
+ ldr.l @C[8],[sp,#$A[4][4]]
+ ldr.h @C[9],[sp,#$A[4][4]+4]
+ str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
+ eor @C[7],@C[7],@C[4]
+ str.h @C[5],[sp,#$D[3]+4]
+ eor @C[6],@C[6],@C[5]
+ ldr.l @C[4],[sp,#$A[0][0]]
+ @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
+ @ ror @C[6],@C[6],#32-11
+ ldr.h @C[5],[sp,#$A[0][0]+4]
+ eor @C[8],@C[8],@E[2]
+ eor @C[9],@C[9],@E[3]
+ ldr.l @E[2],[sp,#$A[2][2]]
+ eor @C[0],@C[0],@C[4]
+ ldr.h @E[3],[sp,#$A[2][2]+4]
+ @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
+ @ ror @C[9],@C[9],#32-7
+ eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
+ eor @E[2],@E[2],@C[2]
+ ldr.l @C[2],[sp,#$A[1][1]]
+ eor @E[3],@E[3],@C[3]
+ ldr.h @C[3],[sp,#$A[1][1]+4]
+ ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
+ ldr @E[2],[sp,#444] @ load counter
+ eor @C[2],@C[2],@E[0]
+ adr @E[0],iotas32
+ ror @C[4],@E[3],#32-22
+ add @E[3],@E[0],@E[2]
+ eor @C[3],@C[3],@E[1]
+___
+$code.=<<___ if ($A[0][0] != $T[0][0]);
+ ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
+___
+$code.=<<___ if ($A[0][0] == $T[0][0]);
+ ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
+ add @E[2],@E[2],#16
+ ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
+ cmp @E[2],#192
+ str @E[2],[sp,#444] @ store counter
+___
+$code.=<<___;
+ bic @E[2],@C[4],@C[2],ror#32-22
+ bic @E[3],@C[5],@C[3],ror#32-22
+ ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
+ ror @C[3],@C[3],#32-22
+ eor @E[2],@E[2],@C[0]
+ eor @E[3],@E[3],@C[1]
+ eor @E[0],@E[0],@E[2]
+ eor @E[1],@E[1],@E[3]
+ str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
+ bic @E[2],@C[6],@C[4],ror#11
+ str.h @E[1],[sp,#$R[0][0]+4]
+ bic @E[3],@C[7],@C[5],ror#10
+ bic @E[0],@C[8],@C[6],ror#32-(11-7)
+ bic @E[1],@C[9],@C[7],ror#32-(10-7)
+ eor @E[2],@C[2],@E[2],ror#32-11
+ str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
+ eor @E[3],@C[3],@E[3],ror#32-10
+ str.h @E[3],[sp,#$R[0][1]+4]
+ eor @E[0],@C[4],@E[0],ror#32-7
+ eor @E[1],@C[5],@E[1],ror#32-7
+ str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
+ bic @E[2],@C[0],@C[8],ror#32-7
+ str.h @E[1],[sp,#$R[0][2]+4]
+ bic @E[3],@C[1],@C[9],ror#32-7
+ eor @E[2],@E[2],@C[6],ror#32-11
+ str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
+ eor @E[3],@E[3],@C[7],ror#32-10
+ str.h @E[3],[sp,#$R[0][3]+4]
+ bic @E[0],@C[2],@C[0]
+ add @E[3],sp,#$D[3]
+ ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
+ bic @E[1],@C[3],@C[1]
+ ldr.h @C[1],[sp,#$A[0][3]+4]
+ eor @E[0],@E[0],@C[8],ror#32-7
+ eor @E[1],@E[1],@C[9],ror#32-7
+ str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
+ add @C[9],sp,#$D[0]
+ str.h @E[1],[sp,#$R[0][4]+4]
+
+ ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
+ ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
+
+ ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
+ eor @C[0],@C[0],@E[0]
+ ldr.h @C[3],[sp,#$A[1][4]+4]
+ eor @C[1],@C[1],@E[1]
+ @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
+ ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
+ @ ror @C[1],@C[1],#32-14
+ ldr.h @E[1],[sp,#$A[3][1]+4]
+
+ eor @C[2],@C[2],@E[2]
+ ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
+ eor @C[3],@C[3],@E[3]
+ ldr.h @C[5],[sp,#$A[2][0]+4]
+ @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
+ @ ror @C[3],@C[3],#32-10
+
+ eor @C[6],@C[6],@C[4]
+ ldr.l @E[2],[sp,#$D[2]] @ D[2]
+ eor @C[7],@C[7],@C[5]
+ ldr.h @E[3],[sp,#$D[2]+4]
+ ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
+ ror @C[4],@C[7],#32-2
+
+ eor @E[0],@E[0],@C[8]
+ ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
+ eor @E[1],@E[1],@C[9]
+ ldr.h @C[9],[sp,#$A[4][2]+4]
+ ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
+ ror @C[6],@E[1],#32-23
+
+ bic @E[0],@C[4],@C[2],ror#32-10
+ bic @E[1],@C[5],@C[3],ror#32-10
+ eor @E[2],@E[2],@C[8]
+ eor @E[3],@E[3],@C[9]
+ ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
+ ror @C[8],@E[3],#32-31
+ eor @E[0],@E[0],@C[0],ror#32-14
+ eor @E[1],@E[1],@C[1],ror#32-14
+ str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
+ bic @E[2],@C[6],@C[4]
+ str.h @E[1],[sp,#$R[1][0]+4]
+ bic @E[3],@C[7],@C[5]
+ eor @E[2],@E[2],@C[2],ror#32-10
+ str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
+ eor @E[3],@E[3],@C[3],ror#32-10
+ str.h @E[3],[sp,#$R[1][1]+4]
+ bic @E[0],@C[8],@C[6]
+ bic @E[1],@C[9],@C[7]
+ bic @E[2],@C[0],@C[8],ror#14
+ bic @E[3],@C[1],@C[9],ror#14
+ eor @E[0],@E[0],@C[4]
+ eor @E[1],@E[1],@C[5]
+ str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
+ bic @C[2],@C[2],@C[0],ror#32-(14-10)
+ str.h @E[1],[sp,#$R[1][2]+4]
+ eor @E[2],@C[6],@E[2],ror#32-14
+ bic @E[1],@C[3],@C[1],ror#32-(14-10)
+ str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
+ eor @E[3],@C[7],@E[3],ror#32-14
+ str.h @E[3],[sp,#$R[1][3]+4]
+ add @E[2],sp,#$D[1]
+ ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
+ eor @E[0],@C[8],@C[2],ror#32-10
+ ldr.h @C[0],[sp,#$A[0][1]+4]
+ eor @E[1],@C[9],@E[1],ror#32-10
+ str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
+ str.h @E[1],[sp,#$R[1][4]+4]
+
+ add @C[9],sp,#$D[3]
+ ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
+ ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
+ ldr.h @C[3],[sp,#$A[1][2]+4]
+ ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
+
+ eor @C[1],@C[1],@E[0]
+ ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
+ eor @C[0],@C[0],@E[1]
+ ldr.h @C[5],[sp,#$A[2][3]+4]
+ ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
+
+ eor @C[2],@C[2],@E[2]
+ ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
+ eor @C[3],@C[3],@E[3]
+ ldr.h @E[1],[sp,#$A[3][4]+4]
+ @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
+ ldr.l @E[2],[sp,#$D[0]] @ D[0]
+ @ ror @C[3],@C[3],#32-3
+ ldr.h @E[3],[sp,#$D[0]+4]
+
+ eor @C[4],@C[4],@C[6]
+ eor @C[5],@C[5],@C[7]
+ @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
+ @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
+
+ eor @E[0],@E[0],@C[8]
+ ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
+ eor @E[1],@E[1],@C[9]
+ ldr.h @C[9],[sp,#$A[4][0]+4]
+ ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
+ ror @C[7],@E[1],#32-4
+
+ eor @E[2],@E[2],@C[8]
+ eor @E[3],@E[3],@C[9]
+ ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
+ ror @C[9],@E[3],#32-9
+
+ bic @E[0],@C[5],@C[2],ror#13-3
+ bic @E[1],@C[4],@C[3],ror#12-3
+ bic @E[2],@C[6],@C[5],ror#32-13
+ bic @E[3],@C[7],@C[4],ror#32-12
+ eor @E[0],@C[0],@E[0],ror#32-13
+ eor @E[1],@C[1],@E[1],ror#32-12
+ str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
+ eor @E[2],@E[2],@C[2],ror#32-3
+ str.h @E[1],[sp,#$R[2][0]+4]
+ eor @E[3],@E[3],@C[3],ror#32-3
+ str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
+ bic @E[0],@C[8],@C[6]
+ bic @E[1],@C[9],@C[7]
+ str.h @E[3],[sp,#$R[2][1]+4]
+ eor @E[0],@E[0],@C[5],ror#32-13
+ eor @E[1],@E[1],@C[4],ror#32-12
+ str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
+ bic @E[2],@C[0],@C[8]
+ str.h @E[1],[sp,#$R[2][2]+4]
+ bic @E[3],@C[1],@C[9]
+ eor @E[2],@E[2],@C[6]
+ eor @E[3],@E[3],@C[7]
+ str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
+ bic @E[0],@C[2],@C[0],ror#3
+ str.h @E[3],[sp,#$R[2][3]+4]
+ bic @E[1],@C[3],@C[1],ror#3
+ ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
+ eor @E[0],@C[8],@E[0],ror#32-3
+ ldr.h @C[0],[sp,#$A[0][4]+4]
+ eor @E[1],@C[9],@E[1],ror#32-3
+ str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
+ add @C[9],sp,#$D[1]
+ str.h @E[1],[sp,#$R[2][4]+4]
+
+ ldr.l @E[0],[sp,#$D[4]] @ D[4]
+ ldr.h @E[1],[sp,#$D[4]+4]
+ ldr.l @E[2],[sp,#$D[0]] @ D[0]
+ ldr.h @E[3],[sp,#$D[0]+4]
+
+ ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
+
+ eor @C[1],@C[1],@E[0]
+ ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
+ eor @C[0],@C[0],@E[1]
+ ldr.h @C[3],[sp,#$A[1][0]+4]
+ @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
+ ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
+ @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
+ ldr.h @C[5],[sp,#$A[2][1]+4]
+
+ eor @C[2],@C[2],@E[2]
+ ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
+ eor @C[3],@C[3],@E[3]
+ ldr.h @E[1],[sp,#$A[3][2]+4]
+ @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
+ ldr.l @E[2],[sp,#$D[3]] @ D[3]
+ @ ror @C[3],@C[3],#32-18
+ ldr.h @E[3],[sp,#$D[3]+4]
+
+ eor @C[6],@C[6],@C[4]
+ eor @C[7],@C[7],@C[5]
+ ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
+ ror @C[5],@C[7],#32-5
+
+ eor @E[0],@E[0],@C[8]
+ ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
+ eor @E[1],@E[1],@C[9]
+ ldr.h @C[9],[sp,#$A[4][3]+4]
+ ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
+ ror @C[6],@E[1],#32-8
+
+ eor @E[2],@E[2],@C[8]
+ eor @E[3],@E[3],@C[9]
+ ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
+ ror @C[9],@E[3],#32-28
+
+ bic @E[0],@C[4],@C[2],ror#32-18
+ bic @E[1],@C[5],@C[3],ror#32-18
+ eor @E[0],@E[0],@C[0],ror#32-14
+ eor @E[1],@E[1],@C[1],ror#32-13
+ str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
+ bic @E[2],@C[6],@C[4]
+ str.h @E[1],[sp,#$R[3][0]+4]
+ bic @E[3],@C[7],@C[5]
+ eor @E[2],@E[2],@C[2],ror#32-18
+ str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
+ eor @E[3],@E[3],@C[3],ror#32-18
+ str.h @E[3],[sp,#$R[3][1]+4]
+ bic @E[0],@C[8],@C[6]
+ bic @E[1],@C[9],@C[7]
+ bic @E[2],@C[0],@C[8],ror#14
+ bic @E[3],@C[1],@C[9],ror#13
+ eor @E[0],@E[0],@C[4]
+ eor @E[1],@E[1],@C[5]
+ str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
+ bic @C[2],@C[2],@C[0],ror#18-14
+ str.h @E[1],[sp,#$R[3][2]+4]
+ eor @E[2],@C[6],@E[2],ror#32-14
+ bic @E[1],@C[3],@C[1],ror#18-13
+ eor @E[3],@C[7],@E[3],ror#32-13
+ str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
+ str.h @E[3],[sp,#$R[3][3]+4]
+ add @E[3],sp,#$D[2]
+ ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
+ eor @E[0],@C[8],@C[2],ror#32-18
+ ldr.h @C[1],[sp,#$A[0][2]+4]
+ eor @E[1],@C[9],@E[1],ror#32-18
+ str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
+ str.h @E[1],[sp,#$R[3][4]+4]
+
+ ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
+ ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
+ ldr.h @C[3],[sp,#$A[1][3]+4]
+ ldr.l @C[6],[sp,#$D[4]] @ D[4]
+ ldr.h @C[7],[sp,#$D[4]+4]
+
+ eor @C[0],@C[0],@E[0]
+ ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
+ eor @C[1],@C[1],@E[1]
+ ldr.h @C[5],[sp,#$A[2][4]+4]
+ @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
+ ldr.l @C[8],[sp,#$D[0]] @ D[0]
+ @ ror @C[1],@C[1],#32-31
+ ldr.h @C[9],[sp,#$D[0]+4]
+
+ eor @E[2],@E[2],@C[2]
+ ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
+ eor @E[3],@E[3],@C[3]
+ ldr.h @E[1],[sp,#$A[3][0]+4]
+ ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
+ ldr.l @E[2],[sp,#$D[1]] @ D[1]
+ ror @C[2],@E[3],#32-28
+ ldr.h @E[3],[sp,#$D[1]+4]
+
+ eor @C[6],@C[6],@C[4]
+ eor @C[7],@C[7],@C[5]
+ ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
+ ror @C[4],@C[7],#32-20
+
+ eor @E[0],@E[0],@C[8]
+ ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
+ eor @E[1],@E[1],@C[9]
+ ldr.h @C[9],[sp,#$A[4][1]+4]
+ ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
+ ror @C[6],@E[1],#32-21
+
+ eor @C[8],@C[8],@E[2]
+ eor @C[9],@C[9],@E[3]
+ @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
+ @ ror @C[9],@C[3],#32-1
+
+ bic @E[0],@C[4],@C[2]
+ bic @E[1],@C[5],@C[3]
+ eor @E[0],@E[0],@C[0],ror#32-31
+ str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
+ eor @E[1],@E[1],@C[1],ror#32-31
+ str.h @E[1],[sp,#$R[4][0]+4]
+ bic @E[2],@C[6],@C[4]
+ bic @E[3],@C[7],@C[5]
+ eor @E[2],@E[2],@C[2]
+ eor @E[3],@E[3],@C[3]
+ str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
+ bic @E[0],@C[8],@C[6],ror#1
+ str.h @E[3],[sp,#$R[4][1]+4]
+ bic @E[1],@C[9],@C[7],ror#1
+ bic @E[2],@C[0],@C[8],ror#31-1
+ bic @E[3],@C[1],@C[9],ror#31-1
+ eor @C[4],@C[4],@E[0],ror#32-1
+ str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
+ eor @C[5],@C[5],@E[1],ror#32-1
+ str.h @C[5],[sp,#$R[4][2]+4]
+ eor @C[6],@C[6],@E[2],ror#32-31
+ eor @C[7],@C[7],@E[3],ror#32-31
+ str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
+ bic @E[0],@C[2],@C[0],ror#32-31
+ str.h @C[7],[sp,#$R[4][3]+4]
+ bic @E[1],@C[3],@C[1],ror#32-31
+ add @E[2],sp,#$R[0][0]
+ eor @C[8],@E[0],@C[8],ror#32-1
+ add @E[0],sp,#$R[1][0]
+ eor @C[9],@E[1],@C[9],ror#32-1
+ str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
+ str.h @C[9],[sp,#$R[4][4]+4]
+___
+}
+ Round(@A,@T);
+ Round(@T,@A);
+$code.=<<___;
+ blo .Lround2x
+
+ ldr pc,[sp,#440]
+.size KeccakF1600_int,.-KeccakF1600_int
+
+.type KeccakF1600, %function
+.align 5
+KeccakF1600:
+ stmdb sp!,{r0,r4-r11,lr}
+ sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
+
+ add @E[0],r0,#$A[1][0]
+ add @E[1],sp,#$A[1][0]
+ ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
+ stmia sp, {@C[0]-@C[9]}
+ ldmia @E[0]!,{@C[0]-@C[9]}
+ stmia @E[1]!,{@C[0]-@C[9]}
+ ldmia @E[0]!,{@C[0]-@C[9]}
+ stmia @E[1]!,{@C[0]-@C[9]}
+ ldmia @E[0]!,{@C[0]-@C[9]}
+ stmia @E[1]!,{@C[0]-@C[9]}
+ ldmia @E[0], {@C[0]-@C[9]}
+ add @E[2],sp,#$A[0][0]
+ add @E[0],sp,#$A[1][0]
+ stmia @E[1], {@C[0]-@C[9]}
+
+ bl KeccakF1600_enter
+
+ ldr @E[1], [sp,#440+16] @ restore pointer to A
+ ldmia sp, {@C[0]-@C[9]}
+ stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
+ ldmia @E[0]!,{@C[0]-@C[9]}
+ stmia @E[1]!,{@C[0]-@C[9]}
+ ldmia @E[0]!,{@C[0]-@C[9]}
+ stmia @E[1]!,{@C[0]-@C[9]}
+ ldmia @E[0]!,{@C[0]-@C[9]}
+ stmia @E[1]!,{@C[0]-@C[9]}
+ ldmia @E[0], {@C[0]-@C[9]}
+ stmia @E[1], {@C[0]-@C[9]}
+
+ add sp,sp,#440+20
+ ldmia sp!,{r4-r11,pc}
+.size KeccakF1600,.-KeccakF1600
+___
+{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
+
+########################################################################
+# Stack layout
+# ----->+-----------------------+
+# | uint64_t A[5][5] |
+# | ... |
+# | ... |
+# +456->+-----------------------+
+# | 0x55555555 |
+# +460->+-----------------------+
+# | 0x33333333 |
+# +464->+-----------------------+
+# | 0x0f0f0f0f |
+# +468->+-----------------------+
+# | 0x00ff00ff |
+# +472->+-----------------------+
+# | uint64_t *A |
+# +476->+-----------------------+
+# | const void *inp |
+# +480->+-----------------------+
+# | size_t len |
+# +484->+-----------------------+
+# | size_t bs |
+# +488->+-----------------------+
+# | ....
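+#
+# (The +472..+484 slots are simply the r0-r3 pushed by SHA3_absorb below:
+# stmdb sp!,{r0-r12,lr} stores 14 words, i.e. 56 bytes, and sp is then
+# dropped by another 456+16 = 472, so the caller's r0 sits at sp+472,
+# inp at +476, len at +480 and bsz at +484, matching the picture above.)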
+
+$code.=<<___;
+.global SHA3_absorb
+.type SHA3_absorb,%function
+.align 5
+SHA3_absorb:
+ stmdb sp!,{r0-r12,lr}
+ sub sp,sp,#456+16
+
+ add $A_flat,r0,#$A[1][0]
+ @ mov $inp,r1
+ mov $len,r2
+ mov $bsz,r3
+ cmp r2,r3
+ blo .Labsorb_abort
+
+ add $inp,sp,#0
+ ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
+ stmia $inp!, {@C[0]-@C[9]}
+ ldmia $A_flat!,{@C[0]-@C[9]}
+ stmia $inp!, {@C[0]-@C[9]}
+ ldmia $A_flat!,{@C[0]-@C[9]}
+ stmia $inp!, {@C[0]-@C[9]}
+ ldmia $A_flat!,{@C[0]-@C[9]}
+ stmia $inp!, {@C[0]-@C[9]}
+ ldmia $A_flat!,{@C[0]-@C[9]}
+ stmia $inp, {@C[0]-@C[9]}
+
+ ldr $inp,[sp,#476] @ restore $inp
+#ifdef __thumb2__
+ mov r9,#0x00ff00ff
+ mov r8,#0x0f0f0f0f
+ mov r7,#0x33333333
+ mov r6,#0x55555555
+#else
+ mov r6,#0x11 @ compose constants
+ mov r8,#0x0f
+ mov r9,#0xff
+ orr r6,r6,r6,lsl#8
+ orr r8,r8,r8,lsl#8
+ orr r6,r6,r6,lsl#16 @ 0x11111111
+ orr r9,r9,r9,lsl#16 @ 0x00ff00ff
+ orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
+ orr r7,r6,r6,lsl#1 @ 0x33333333
+ orr r6,r6,r6,lsl#2 @ 0x55555555
+#endif
+ str r9,[sp,#468]
+ str r8,[sp,#464]
+ str r7,[sp,#460]
+ str r6,[sp,#456]
+ b .Loop_absorb
+
+.align 4
+.Loop_absorb:
+ subs r0,$len,$bsz
+ blo .Labsorbed
+ add $A_flat,sp,#0
+ str r0,[sp,#480] @ save len - bsz
+
+.align 4
+.Loop_block:
+ ldrb r0,[$inp],#1
+ ldrb r1,[$inp],#1
+ ldrb r2,[$inp],#1
+ ldrb r3,[$inp],#1
+ ldrb r4,[$inp],#1
+ orr r0,r0,r1,lsl#8
+ ldrb r1,[$inp],#1
+ orr r0,r0,r2,lsl#16
+ ldrb r2,[$inp],#1
+ orr r0,r0,r3,lsl#24 @ lo
+ ldrb r3,[$inp],#1
+ orr r1,r4,r1,lsl#8
+ orr r1,r1,r2,lsl#16
+ orr r1,r1,r3,lsl#24 @ hi
+
+ and r2,r0,r6 @ &=0x55555555
+ and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
+ and r3,r1,r6 @ &=0x55555555
+ and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
+ orr r2,r2,r2,lsr#1
+ orr r0,r0,r0,lsl#1
+ orr r3,r3,r3,lsr#1
+ orr r1,r1,r1,lsl#1
+ and r2,r2,r7 @ &=0x33333333
+ and r0,r0,r7,lsl#2 @ &=0xcccccccc
+ and r3,r3,r7 @ &=0x33333333
+ and r1,r1,r7,lsl#2 @ &=0xcccccccc
+ orr r2,r2,r2,lsr#2
+ orr r0,r0,r0,lsl#2
+ orr r3,r3,r3,lsr#2
+ orr r1,r1,r1,lsl#2
+ and r2,r2,r8 @ &=0x0f0f0f0f
+ and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
+ and r3,r3,r8 @ &=0x0f0f0f0f
+ and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
+ ldmia $A_flat,{r4-r5} @ A_flat[i]
+ orr r2,r2,r2,lsr#4
+ orr r0,r0,r0,lsl#4
+ orr r3,r3,r3,lsr#4
+ orr r1,r1,r1,lsl#4
+ and r2,r2,r9 @ &=0x00ff00ff
+ and r0,r0,r9,lsl#8 @ &=0xff00ff00
+ and r3,r3,r9 @ &=0x00ff00ff
+ and r1,r1,r9,lsl#8 @ &=0xff00ff00
+ orr r2,r2,r2,lsr#8
+ orr r0,r0,r0,lsl#8
+ orr r3,r3,r3,lsr#8
+ orr r1,r1,r1,lsl#8
+
+ lsl r2,r2,#16
+ lsr r1,r1,#16
+ eor r4,r4,r3,lsl#16
+ eor r5,r5,r0,lsr#16
+ eor r4,r4,r2,lsr#16
+ eor r5,r5,r1,lsl#16
+ stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
+
+ subs $bsz,$bsz,#8
+ bhi .Loop_block
+
+ str $inp,[sp,#476]
+
+ bl KeccakF1600_int
+
+ add r14,sp,#456
+ ldmia r14,{r6-r12,r14} @ restore constants and variables
+ b .Loop_absorb
+
+.align 4
+.Labsorbed:
+ add $inp,sp,#$A[1][0]
+ ldmia sp, {@C[0]-@C[9]}
+ stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
+ ldmia $inp!, {@C[0]-@C[9]}
+ stmia $A_flat!,{@C[0]-@C[9]}
+ ldmia $inp!, {@C[0]-@C[9]}
+ stmia $A_flat!,{@C[0]-@C[9]}
+ ldmia $inp!, {@C[0]-@C[9]}
+ stmia $A_flat!,{@C[0]-@C[9]}
+ ldmia $inp, {@C[0]-@C[9]}
+ stmia $A_flat, {@C[0]-@C[9]}
+
+.Labsorb_abort:
+ add sp,sp,#456+32
+ mov r0,$len @ return value
+ ldmia sp!,{r4-r12,pc}
+.size SHA3_absorb,.-SHA3_absorb
+___
+}
+{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
+
+$code.=<<___;
+.global SHA3_squeeze
+.type SHA3_squeeze,%function
+.align 5
+SHA3_squeeze:
+ stmdb sp!,{r0,r3-r10,lr}
+
+ mov $A_flat,r0
+ mov $out,r1
+ mov $len,r2
+ mov $bsz,r3
+
+#ifdef __thumb2__
+ mov r9,#0x00ff00ff
+ mov r8,#0x0f0f0f0f
+ mov r7,#0x33333333
+ mov r6,#0x55555555
+#else
+ mov r6,#0x11 @ compose constants
+ mov r8,#0x0f
+ mov r9,#0xff
+ orr r6,r6,r6,lsl#8
+ orr r8,r8,r8,lsl#8
+ orr r6,r6,r6,lsl#16 @ 0x11111111
+ orr r9,r9,r9,lsl#16 @ 0x00ff00ff
+ orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
+ orr r7,r6,r6,lsl#1 @ 0x33333333
+ orr r6,r6,r6,lsl#2 @ 0x55555555
+#endif
+ stmdb sp!,{r6-r9}
+
+ mov r14,$A_flat
+ b .Loop_squeeze
+
+.align 4
+.Loop_squeeze:
+ ldmia $A_flat!,{r0,r1} @ A_flat[i++]
+
+ lsl r2,r0,#16
+ lsl r3,r1,#16 @ r3 = r1 << 16
+ lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
+ lsr r1,r1,#16
+ lsr r0,r0,#16 @ r0 = r0 >> 16
+ lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
+
+ orr r2,r2,r2,lsl#8
+ orr r3,r3,r3,lsr#8
+ orr r0,r0,r0,lsl#8
+ orr r1,r1,r1,lsr#8
+ and r2,r2,r9 @ &=0x00ff00ff
+ and r3,r3,r9,lsl#8 @ &=0xff00ff00
+ and r0,r0,r9 @ &=0x00ff00ff
+ and r1,r1,r9,lsl#8 @ &=0xff00ff00
+ orr r2,r2,r2,lsl#4
+ orr r3,r3,r3,lsr#4
+ orr r0,r0,r0,lsl#4
+ orr r1,r1,r1,lsr#4
+ and r2,r2,r8 @ &=0x0f0f0f0f
+ and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
+ and r0,r0,r8 @ &=0x0f0f0f0f
+ and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
+ orr r2,r2,r2,lsl#2
+ orr r3,r3,r3,lsr#2
+ orr r0,r0,r0,lsl#2
+ orr r1,r1,r1,lsr#2
+ and r2,r2,r7 @ &=0x33333333
+ and r3,r3,r7,lsl#2 @ &=0xcccccccc
+ and r0,r0,r7 @ &=0x33333333
+ and r1,r1,r7,lsl#2 @ &=0xcccccccc
+ orr r2,r2,r2,lsl#1
+ orr r3,r3,r3,lsr#1
+ orr r0,r0,r0,lsl#1
+ orr r1,r1,r1,lsr#1
+ and r2,r2,r6 @ &=0x55555555
+ and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
+ and r0,r0,r6 @ &=0x55555555
+ and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
+
+ orr r2,r2,r3
+ orr r0,r0,r1
+
+ cmp $len,#8
+ blo .Lsqueeze_tail
+ lsr r1,r2,#8
+ strb r2,[$out],#1
+ lsr r3,r2,#16
+ strb r1,[$out],#1
+ lsr r2,r2,#24
+ strb r3,[$out],#1
+ strb r2,[$out],#1
+
+ lsr r1,r0,#8
+ strb r0,[$out],#1
+ lsr r3,r0,#16
+ strb r1,[$out],#1
+ lsr r0,r0,#24
+ strb r3,[$out],#1
+ strb r0,[$out],#1
+ subs $len,$len,#8
+ beq .Lsqueeze_done
+
+ subs $bsz,$bsz,#8 @ bsz -= 8
+ bhi .Loop_squeeze
+
+ mov r0,r14 @ original $A_flat
+
+ bl KeccakF1600
+
+ ldmia sp,{r6-r10,r12} @ restore constants and variables
+ mov r14,$A_flat
+ b .Loop_squeeze
+
+.align 4
+.Lsqueeze_tail:
+ strb r2,[$out],#1
+ lsr r2,r2,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb r2,[$out],#1
+ lsr r2,r2,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb r2,[$out],#1
+ lsr r2,r2,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb r2,[$out],#1
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+
+ strb r0,[$out],#1
+ lsr r0,r0,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb r0,[$out],#1
+ lsr r0,r0,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb r0,[$out]
+ b .Lsqueeze_done
+
+.align 4
+.Lsqueeze_done:
+ add sp,sp,#24
+ ldmia sp!,{r4-r10,pc}
+.size SHA3_squeeze,.-SHA3_squeeze
+___
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type iotas64, %object
+.align 5
+iotas64:
+ .quad 0x0000000000000001
+ .quad 0x0000000000008082
+ .quad 0x800000000000808a
+ .quad 0x8000000080008000
+ .quad 0x000000000000808b
+ .quad 0x0000000080000001
+ .quad 0x8000000080008081
+ .quad 0x8000000000008009
+ .quad 0x000000000000008a
+ .quad 0x0000000000000088
+ .quad 0x0000000080008009
+ .quad 0x000000008000000a
+ .quad 0x000000008000808b
+ .quad 0x800000000000008b
+ .quad 0x8000000000008089
+ .quad 0x8000000000008003
+ .quad 0x8000000000008002
+ .quad 0x8000000000000080
+ .quad 0x000000000000800a
+ .quad 0x800000008000000a
+ .quad 0x8000000080008081
+ .quad 0x8000000000008080
+ .quad 0x0000000080000001
+ .quad 0x8000000080008008
+.size iotas64,.-iotas64
+
+.type KeccakF1600_neon, %function
+.align 5
+KeccakF1600_neon:
+ add r1, r0, #16
+ adr r2, iotas64
+ mov r3, #24 @ loop counter
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ @ Theta
+ vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
+ veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
+ vst1.64 {d18}, [r1:64] @ offload A[2][4]
+ veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
+ veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
+ veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
+ veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
+ veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
+ veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
+ veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
+ veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
+ veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
+ veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
+ veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
+ veor d25, d25, d24 @ C[4]^=A[4][4]
+
+ vadd.u64 q4, q13, q13 @ C[0..1]<<1
+ vadd.u64 q15, q14, q14 @ C[2..3]<<1
+ vadd.u64 d18, d25, d25 @ C[4]<<1
+ vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
+ vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
+ vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
+ veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
+ veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
+ veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
+ veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
+
+ veor d0, d0, d25 @ A[0][0] ^= C[4]
+ veor d1, d1, d25 @ A[1][0] ^= C[4]
+ veor d10, d10, d25 @ A[2][0] ^= C[4]
+ veor d11, d11, d25 @ A[3][0] ^= C[4]
+ veor d20, d20, d25 @ A[4][0] ^= C[4]
+
+ veor d2, d2, d26 @ A[0][1] ^= D[1]
+ veor d3, d3, d26 @ A[1][1] ^= D[1]
+ veor d12, d12, d26 @ A[2][1] ^= D[1]
+ veor d13, d13, d26 @ A[3][1] ^= D[1]
+ veor d21, d21, d26 @ A[4][1] ^= D[1]
+ vmov d26, d27
+
+ veor d6, d6, d28 @ A[0][3] ^= C[2]
+ veor d7, d7, d28 @ A[1][3] ^= C[2]
+ veor d16, d16, d28 @ A[2][3] ^= C[2]
+ veor d17, d17, d28 @ A[3][3] ^= C[2]
+ veor d23, d23, d28 @ A[4][3] ^= C[2]
+ vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
+ vmov d28, d29
+
+ vld1.64 {d18}, [r1:64] @ restore A[2][4]
+ veor q2, q2, q13 @ A[0..1][2] ^= D[2]
+ veor q7, q7, q13 @ A[2..3][2] ^= D[2]
+ veor d22, d22, d27 @ A[4][2] ^= D[2]
+
+ veor q4, q4, q14 @ A[0..1][4] ^= C[3]
+ veor q9, q9, q14 @ A[2..3][4] ^= C[3]
+ veor d24, d24, d29 @ A[4][4] ^= C[3]
+
+ @ Rho + Pi
+ vmov d26, d2 @ C[1] = A[0][1]
+ vshl.u64 d2, d3, #44
+ vmov d27, d4 @ C[2] = A[0][2]
+ vshl.u64 d4, d14, #43
+ vmov d28, d6 @ C[3] = A[0][3]
+ vshl.u64 d6, d17, #21
+ vmov d29, d8 @ C[4] = A[0][4]
+ vshl.u64 d8, d24, #14
+ vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
+ vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
+ vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
+ vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
+
+ vshl.u64 d3, d9, #20
+ vshl.u64 d14, d16, #25
+ vshl.u64 d17, d15, #15
+ vshl.u64 d24, d21, #2
+ vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
+ vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
+ vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
+ vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
+
+ vshl.u64 d9, d22, #61
+ @ vshl.u64 d16, d19, #8
+ vshl.u64 d15, d12, #10
+ vshl.u64 d21, d7, #55
+ vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
+ vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
+ vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
+ vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
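+	@ (the two byte-aligned rotations in this block, by 8 above and by
+	@ 56 below, each use a single vext.8 in place of a vshl/vsri pair)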
+
+ vshl.u64 d22, d18, #39
+ @ vshl.u64 d19, d23, #56
+ vshl.u64 d12, d5, #6
+ vshl.u64 d7, d13, #45
+ vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
+ vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
+ vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
+ vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
+
+ vshl.u64 d18, d20, #18
+ vshl.u64 d23, d11, #41
+ vshl.u64 d5, d10, #3
+ vshl.u64 d13, d1, #36
+ vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
+ vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
+ vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
+ vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
+
+ vshl.u64 d1, d28, #28
+ vshl.u64 d10, d26, #1
+ vshl.u64 d11, d29, #27
+ vshl.u64 d20, d27, #62
+ vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
+ vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
+ vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
+ vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
+
+ @ Chi + Iota
+ vbic q13, q2, q1
+ vbic q14, q3, q2
+ vbic q15, q4, q3
+ veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
+ veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
+ veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
+ vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
+ vbic q13, q0, q4
+ vbic q15, q1, q0
+ vmov q1, q14 @ A[0..1][1]
+ veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
+ veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
+
+ vbic q13, q7, q6
+ vmov q0, q5 @ A[2..3][0]
+ vbic q14, q8, q7
+ vmov q15, q6 @ A[2..3][1]
+ veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
+ vbic q13, q9, q8
+ veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
+ vbic q14, q0, q9
+ veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
+ vbic q13, q15, q0
+ veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
+ vmov q14, q10 @ A[4][0..1]
+ veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
+
+ vld1.64 d25, [r2:64]! @ Iota[i++]
+ vbic d26, d22, d21
+ vbic d27, d23, d22
+ vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
+ veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
+ vbic d26, d24, d23
+ veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
+ vbic d27, d28, d24
+ veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
+ vbic d26, d29, d28
+ veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
+ veor d0, d0, d25 @ A[0][0] ^= Iota[i]
+ veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
+
+ subs r3, r3, #1
+ bne .Loop_neon
+
+ bx lr
+.size KeccakF1600_neon,.-KeccakF1600_neon
+
+.global SHA3_absorb_neon
+.type SHA3_absorb_neon, %function
+.align 5
+SHA3_absorb_neon:
+ stmdb sp!, {r4-r6,lr}
+ vstmdb sp!, {d8-d15}
+
+ mov r4, r1 @ inp
+ mov r5, r2 @ len
+ mov r6, r3 @ bsz
+
+ vld1.32 {d0}, [r0:64]! @ A[0][0]
+ vld1.32 {d2}, [r0:64]! @ A[0][1]
+ vld1.32 {d4}, [r0:64]! @ A[0][2]
+ vld1.32 {d6}, [r0:64]! @ A[0][3]
+ vld1.32 {d8}, [r0:64]! @ A[0][4]
+
+ vld1.32 {d1}, [r0:64]! @ A[1][0]
+ vld1.32 {d3}, [r0:64]! @ A[1][1]
+ vld1.32 {d5}, [r0:64]! @ A[1][2]
+ vld1.32 {d7}, [r0:64]! @ A[1][3]
+ vld1.32 {d9}, [r0:64]! @ A[1][4]
+
+ vld1.32 {d10}, [r0:64]! @ A[2][0]
+ vld1.32 {d12}, [r0:64]! @ A[2][1]
+ vld1.32 {d14}, [r0:64]! @ A[2][2]
+ vld1.32 {d16}, [r0:64]! @ A[2][3]
+ vld1.32 {d18}, [r0:64]! @ A[2][4]
+
+ vld1.32 {d11}, [r0:64]! @ A[3][0]
+ vld1.32 {d13}, [r0:64]! @ A[3][1]
+ vld1.32 {d15}, [r0:64]! @ A[3][2]
+ vld1.32 {d17}, [r0:64]! @ A[3][3]
+ vld1.32 {d19}, [r0:64]! @ A[3][4]
+
+ vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
+ vld1.32 {d24}, [r0:64] @ A[4][4]
+ sub r0, r0, #24*8 @ rewind
+ b .Loop_absorb_neon
+
+.align 4
+.Loop_absorb_neon:
+ subs r12, r5, r6 @ len - bsz
+ blo .Labsorbed_neon
+ mov r5, r12
+
+ vld1.8 {d31}, [r4]! @ endian-neutral loads...
+ cmp r6, #8*2
+ veor d0, d0, d31 @ A[0][0] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d2, d2, d31 @ A[0][1] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*4
+ veor d4, d4, d31 @ A[0][2] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d6, d6, d31 @ A[0][3] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31},[r4]!
+ cmp r6, #8*6
+ veor d8, d8, d31 @ A[0][4] ^= *inp++
+ blo .Lprocess_neon
+
+ vld1.8 {d31}, [r4]!
+ veor d1, d1, d31 @ A[1][0] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*8
+ veor d3, d3, d31 @ A[1][1] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d5, d5, d31 @ A[1][2] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*10
+ veor d7, d7, d31 @ A[1][3] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d9, d9, d31 @ A[1][4] ^= *inp++
+ beq .Lprocess_neon
+
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*12
+ veor d10, d10, d31 @ A[2][0] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d12, d12, d31 @ A[2][1] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*14
+ veor d14, d14, d31 @ A[2][2] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d16, d16, d31 @ A[2][3] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*16
+ veor d18, d18, d31 @ A[2][4] ^= *inp++
+ blo .Lprocess_neon
+
+ vld1.8 {d31}, [r4]!
+ veor d11, d11, d31 @ A[3][0] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*18
+ veor d13, d13, d31 @ A[3][1] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d15, d15, d31 @ A[3][2] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*20
+ veor d17, d17, d31 @ A[3][3] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d19, d19, d31 @ A[3][4] ^= *inp++
+ beq .Lprocess_neon
+
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*22
+ veor d20, d20, d31 @ A[4][0] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d21, d21, d31 @ A[4][1] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ cmp r6, #8*24
+ veor d22, d22, d31 @ A[4][2] ^= *inp++
+ blo .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d23, d23, d31 @ A[4][3] ^= *inp++
+ beq .Lprocess_neon
+ vld1.8 {d31}, [r4]!
+ veor d24, d24, d31 @ A[4][4] ^= *inp++
+
+.Lprocess_neon:
+ bl KeccakF1600_neon
+ b .Loop_absorb_neon
+
+.align 4
+.Labsorbed_neon:
+ vst1.32 {d0}, [r0:64]! @ A[0][0..4]
+ vst1.32 {d2}, [r0:64]!
+ vst1.32 {d4}, [r0:64]!
+ vst1.32 {d6}, [r0:64]!
+ vst1.32 {d8}, [r0:64]!
+
+ vst1.32 {d1}, [r0:64]! @ A[1][0..4]
+ vst1.32 {d3}, [r0:64]!
+ vst1.32 {d5}, [r0:64]!
+ vst1.32 {d7}, [r0:64]!
+ vst1.32 {d9}, [r0:64]!
+
+ vst1.32 {d10}, [r0:64]! @ A[2][0..4]
+ vst1.32 {d12}, [r0:64]!
+ vst1.32 {d14}, [r0:64]!
+ vst1.32 {d16}, [r0:64]!
+ vst1.32 {d18}, [r0:64]!
+
+ vst1.32 {d11}, [r0:64]! @ A[3][0..4]
+ vst1.32 {d13}, [r0:64]!
+ vst1.32 {d15}, [r0:64]!
+ vst1.32 {d17}, [r0:64]!
+ vst1.32 {d19}, [r0:64]!
+
+ vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
+ vst1.32 {d24}, [r0:64]
+
+ mov r0, r5 @ return value
+ vldmia sp!, {d8-d15}
+ ldmia sp!, {r4-r6,pc}
+.size SHA3_absorb_neon,.-SHA3_absorb_neon
+
+.global SHA3_squeeze_neon
+.type SHA3_squeeze_neon, %function
+.align 5
+SHA3_squeeze_neon:
+ stmdb sp!, {r4-r6,lr}
+
+ mov r4, r1 @ out
+ mov r5, r2 @ len
+ mov r6, r3 @ bsz
+ mov r12, r0 @ A_flat
+ mov r14, r3 @ bsz
+ b .Loop_squeeze_neon
+
+.align 4
+.Loop_squeeze_neon:
+ cmp r5, #8
+ blo .Lsqueeze_neon_tail
+ vld1.32 {d0}, [r12]!
+ vst1.8 {d0}, [r4]! @ endian-neutral store
+
+ subs r5, r5, #8 @ len -= 8
+ beq .Lsqueeze_neon_done
+
+ subs r14, r14, #8 @ bsz -= 8
+ bhi .Loop_squeeze_neon
+
+ vstmdb sp!, {d8-d15}
+
+ vld1.32 {d0}, [r0:64]! @ A[0][0..4]
+ vld1.32 {d2}, [r0:64]!
+ vld1.32 {d4}, [r0:64]!
+ vld1.32 {d6}, [r0:64]!
+ vld1.32 {d8}, [r0:64]!
+
+ vld1.32 {d1}, [r0:64]! @ A[1][0..4]
+ vld1.32 {d3}, [r0:64]!
+ vld1.32 {d5}, [r0:64]!
+ vld1.32 {d7}, [r0:64]!
+ vld1.32 {d9}, [r0:64]!
+
+ vld1.32 {d10}, [r0:64]! @ A[2][0..4]
+ vld1.32 {d12}, [r0:64]!
+ vld1.32 {d14}, [r0:64]!
+ vld1.32 {d16}, [r0:64]!
+ vld1.32 {d18}, [r0:64]!
+
+ vld1.32 {d11}, [r0:64]! @ A[3][0..4]
+ vld1.32 {d13}, [r0:64]!
+ vld1.32 {d15}, [r0:64]!
+ vld1.32 {d17}, [r0:64]!
+ vld1.32 {d19}, [r0:64]!
+
+ vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
+ vld1.32 {d24}, [r0:64]
+ sub r0, r0, #24*8 @ rewind
+
+ bl KeccakF1600_neon
+
+ mov r12, r0 @ A_flat
+ vst1.32 {d0}, [r0:64]! @ A[0][0..4]
+ vst1.32 {d2}, [r0:64]!
+ vst1.32 {d4}, [r0:64]!
+ vst1.32 {d6}, [r0:64]!
+ vst1.32 {d8}, [r0:64]!
+
+ vst1.32 {d1}, [r0:64]! @ A[1][0..4]
+ vst1.32 {d3}, [r0:64]!
+ vst1.32 {d5}, [r0:64]!
+ vst1.32 {d7}, [r0:64]!
+ vst1.32 {d9}, [r0:64]!
+
+ vst1.32 {d10}, [r0:64]! @ A[2][0..4]
+ vst1.32 {d12}, [r0:64]!
+ vst1.32 {d14}, [r0:64]!
+ vst1.32 {d16}, [r0:64]!
+ vst1.32 {d18}, [r0:64]!
+
+ vst1.32 {d11}, [r0:64]! @ A[3][0..4]
+ vst1.32 {d13}, [r0:64]!
+ vst1.32 {d15}, [r0:64]!
+ vst1.32 {d17}, [r0:64]!
+ vst1.32 {d19}, [r0:64]!
+
+ vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
+ mov r14, r6 @ bsz
+ vst1.32 {d24}, [r0:64]
+ mov r0, r12 @ rewind
+
+ vldmia sp!, {d8-d15}
+ b .Loop_squeeze_neon
+
+.align 4
+.Lsqueeze_neon_tail:
+ ldmia r12, {r2,r3}
+ cmp r5, #2
+ strb r2, [r4],#1 @ endian-neutral store
+ lsr r2, r2, #8
+ blo .Lsqueeze_neon_done
+ strb r2, [r4], #1
+ lsr r2, r2, #8
+ beq .Lsqueeze_neon_done
+ strb r2, [r4], #1
+ lsr r2, r2, #8
+ cmp r5, #4
+ blo .Lsqueeze_neon_done
+ strb r2, [r4], #1
+ beq .Lsqueeze_neon_done
+
+ strb r3, [r4], #1
+ lsr r3, r3, #8
+ cmp r5, #6
+ blo .Lsqueeze_neon_done
+ strb r3, [r4], #1
+ lsr r3, r3, #8
+ beq .Lsqueeze_neon_done
+ strb r3, [r4], #1
+
+.Lsqueeze_neon_done:
+ ldmia sp!, {r4-r6,pc}
+.size SHA3_squeeze_neon,.-SHA3_squeeze_neon
+#endif
+.asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+{
+ my %ldr, %str;
+
+ sub ldrd {
+ my ($mnemonic,$half,$reg,$ea) = @_;
+ my $op = $mnemonic eq "ldr" ? \%ldr : \%str;
+
+ if ($half eq "l") {
+ $$op{reg} = $reg;
+ $$op{ea} = $ea;
+ sprintf "#ifndef __thumb2__\n" .
+ " %s\t%s,%s\n" .
+ "#endif", $mnemonic,$reg,$ea;
+ } else {
+ sprintf "#ifndef __thumb2__\n" .
+ " %s\t%s,%s\n" .
+ "#else\n" .
+ " %sd\t%s,%s,%s\n" .
+ "#endif", $mnemonic,$reg,$ea,
+ $mnemonic,$$op{reg},$reg,$$op{ea};
+ }
+ }
+}
+
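+# Post-process $code: expand `...` constructs, fold the ldr.l/ldr.h and
+# str.l/str.h markers emitted above into ldrd/strd (Thumb-2) or plain
+# single-word accesses (ARM) via the ldrd() helper, rewrite standalone
+# ror/lsr/lsl into mov-with-shifted-operand form, and emit 'bx lr' as a
+# literal word so the result still assembles with -march=armv4.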
+foreach (split($/,$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
+ s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or
+ s/\bret\b/bx lr/g or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl
new file mode 100755
index 000000000000..704ab4a7e45a
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-armv8.pl
@@ -0,0 +1,866 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for ARMv8.
+#
+# June 2017.
+#
+# This is a straightforward KECCAK_1X_ALT implementation. It makes no
+# sense to attempt a SIMD/NEON implementation for the following reason:
+# 64-bit lanes of vector registers can't be addressed as easily as in
+# 32-bit mode. This means that 64-bit NEON is bound to be slower than
+# 32-bit NEON, and this scalar implementation is faster than 32-bit NEON
+# on the same processor. Even though it takes more scalar xor's and
+# andn's, that is compensated for by the availability of rotate. Not to
+# forget that most processors achieve a higher issue rate with scalar
+# instructions.
+#
+# February 2018.
+#
+# Add a hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
+# variant with a register permutation/rotation twist that eliminates
+# copies to temporary registers. If you look closely you'll notice
+# that it uses only one lane of the vector registers. The new
+# instructions effectively facilitate parallel hashing, which we don't
+# support [yet?], but the lowest-level core procedure is prepared for
+# it. The inner round is 67 [vector] instructions, so it's not
+# actually obvious that it will provide a performance improvement [in
+# serial hash] as long as the vector instruction issue rate is limited
+# to 1 per cycle...
+#
+######################################################################
+# Numbers are cycles per processed byte.
+#
+# r=1088(*)
+#
+# Cortex-A53 13
+# Cortex-A57 12
+# X-Gene 14
+# Mongoose 10
+# Kryo 12
+# Denver 7.8
+# Apple A7 7.2
+#
+# (*) Corresponds to SHA3-256. No improvement coefficients are listed
+#     because they vary too much from compiler to compiler. A newer
+#     compiler does much better, and the improvement varies from 5% on
+#     Cortex-A57 to 25% on Cortex-A53; in comparison to an older
+#     compiler this code is at least 2x faster...
+
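+# For reference, one Keccak-f[1600] round in its textbook C formulation,
+# which the scalar code below implements step by step with the state kept
+# entirely in registers. This is a hedged sketch: ROL64, RHO and Round
+# are illustrative names, not identifiers from this module; RHO[][]
+# holds the same offsets as the @rhotates table declared below.
+#
+#	#define ROL64(v, n) ((n) ? ((v) << (n)) | ((v) >> (64 - (n))) : (v))
+#
+#	static void Round(uint64_t A[5][5], uint64_t iota)
+#	{
+#	    uint64_t C[5], D[5], B[5][5];
+#	    int x, y;
+#
+#	    for (x = 0; x < 5; x++)             /* Theta: column parities */
+#	        C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x];
+#	    for (x = 0; x < 5; x++)
+#	        D[x] = C[(x + 4) % 5] ^ ROL64(C[(x + 1) % 5], 1);
+#	    for (y = 0; y < 5; y++)             /* Theta + Rho + Pi */
+#	        for (x = 0; x < 5; x++)
+#	            B[(2 * x + 3 * y) % 5][y] = ROL64(A[y][x] ^ D[x], RHO[y][x]);
+#	    for (y = 0; y < 5; y++)             /* Chi */
+#	        for (x = 0; x < 5; x++)
+#	            A[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5]);
+#	    A[0][0] ^= iota;                    /* Iota */
+#	}
+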
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 36, 44, 6, 55, 20 ],
+ [ 3, 10, 43, 25, 39 ],
+ [ 41, 45, 15, 21, 8 ],
+ [ 18, 2, 61, 56, 14 ]);
+
+$code.=<<___;
+.text
+
+.align 8	// strategic alignment and padding that allows using the
+		// address value as the loop termination condition...
+ .quad 0,0,0,0,0,0,0,0
+.type iotas,%object
+iotas:
+ .quad 0x0000000000000001
+ .quad 0x0000000000008082
+ .quad 0x800000000000808a
+ .quad 0x8000000080008000
+ .quad 0x000000000000808b
+ .quad 0x0000000080000001
+ .quad 0x8000000080008081
+ .quad 0x8000000000008009
+ .quad 0x000000000000008a
+ .quad 0x0000000000000088
+ .quad 0x0000000080008009
+ .quad 0x000000008000000a
+ .quad 0x000000008000808b
+ .quad 0x800000000000008b
+ .quad 0x8000000000008089
+ .quad 0x8000000000008003
+ .quad 0x8000000000008002
+ .quad 0x8000000000000080
+ .quad 0x000000000000800a
+ .quad 0x800000008000000a
+ .quad 0x8000000080008081
+ .quad 0x8000000000008080
+ .quad 0x0000000080000001
+ .quad 0x8000000080008008
+.size iotas,.-iotas
+___
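+
+# The 24 iota round constants above can be reproduced with the degree-8
+# LFSR from FIPS 202. A self-contained C sketch (illustrative only, not
+# code from this module):
+#
+#	#include <stdint.h>
+#	#include <stdio.h>
+#
+#	static int lfsr86540(uint8_t *r)        /* x^8 + x^6 + x^5 + x^4 + 1 */
+#	{
+#	    int bit = *r & 1;
+#	    *r = (*r & 0x80) ? (uint8_t)((*r << 1) ^ 0x71) : (uint8_t)(*r << 1);
+#	    return bit;
+#	}
+#
+#	int main(void)
+#	{
+#	    uint8_t r = 1;
+#	    for (int i = 0; i < 24; i++) {
+#	        uint64_t rc = 0;
+#	        for (int j = 0; j < 7; j++)     /* bit positions 2^j - 1 */
+#	            if (lfsr86540(&r))
+#	                rc |= (uint64_t)1 << ((1 << j) - 1);
+#	        printf("\t.quad\t0x%016llx\n", (unsigned long long)rc);
+#	    }
+#	    return 0;
+#	}
+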
+ {{{
+my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
+ (0, 5, 10, 15, 20));
+ $A[3][3] = "x25"; # x18 is reserved
+
+my @C = map("x$_", (26,27,28,30));
+
+$code.=<<___;
+.type KeccakF1600_int,%function
+.align 5
+KeccakF1600_int:
+ adr $C[2],iotas
+ stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
+ b .Loop
+.align 4
+.Loop:
+ ////////////////////////////////////////// Theta
+ eor $C[0],$A[0][0],$A[1][0]
+ stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
+ eor $C[1],$A[0][1],$A[1][1]
+ eor $C[2],$A[0][2],$A[1][2]
+ eor $C[3],$A[0][3],$A[1][3]
+___
+ $C[4]=$A[0][4];
+ $C[5]=$A[1][4];
+$code.=<<___;
+ eor $C[4],$A[0][4],$A[1][4]
+ eor $C[0],$C[0],$A[2][0]
+ eor $C[1],$C[1],$A[2][1]
+ eor $C[2],$C[2],$A[2][2]
+ eor $C[3],$C[3],$A[2][3]
+ eor $C[4],$C[4],$A[2][4]
+ eor $C[0],$C[0],$A[3][0]
+ eor $C[1],$C[1],$A[3][1]
+ eor $C[2],$C[2],$A[3][2]
+ eor $C[3],$C[3],$A[3][3]
+ eor $C[4],$C[4],$A[3][4]
+ eor $C[0],$C[0],$A[4][0]
+ eor $C[2],$C[2],$A[4][2]
+ eor $C[1],$C[1],$A[4][1]
+ eor $C[3],$C[3],$A[4][3]
+ eor $C[4],$C[4],$A[4][4]
+
+ eor $C[5],$C[0],$C[2],ror#63
+
+ eor $A[0][1],$A[0][1],$C[5]
+ eor $A[1][1],$A[1][1],$C[5]
+ eor $A[2][1],$A[2][1],$C[5]
+ eor $A[3][1],$A[3][1],$C[5]
+ eor $A[4][1],$A[4][1],$C[5]
+
+ eor $C[5],$C[1],$C[3],ror#63
+ eor $C[2],$C[2],$C[4],ror#63
+ eor $C[3],$C[3],$C[0],ror#63
+ eor $C[4],$C[4],$C[1],ror#63
+
+ eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
+ eor $A[1][2],$A[1][2],$C[5]
+ eor $A[2][2],$A[2][2],$C[5]
+ eor $A[3][2],$A[3][2],$C[5]
+ eor $A[4][2],$A[4][2],$C[5]
+
+ eor $A[0][0],$A[0][0],$C[4]
+ eor $A[1][0],$A[1][0],$C[4]
+ eor $A[2][0],$A[2][0],$C[4]
+ eor $A[3][0],$A[3][0],$C[4]
+ eor $A[4][0],$A[4][0],$C[4]
+___
+ $C[4]=undef;
+ $C[5]=undef;
+$code.=<<___;
+ ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
+ eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
+ eor $A[1][3],$A[1][3],$C[2]
+ eor $A[2][3],$A[2][3],$C[2]
+ eor $A[3][3],$A[3][3],$C[2]
+ eor $A[4][3],$A[4][3],$C[2]
+
+ eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
+ eor $A[1][4],$A[1][4],$C[3]
+ eor $A[2][4],$A[2][4],$C[3]
+ eor $A[3][4],$A[3][4],$C[3]
+ eor $A[4][4],$A[4][4],$C[3]
+
+ ////////////////////////////////////////// Rho+Pi
+ mov $C[3],$A[0][1]
+ ror $A[0][1],$A[1][1],#64-$rhotates[1][1]
+ //mov $C[1],$A[0][2]
+ ror $A[0][2],$A[2][2],#64-$rhotates[2][2]
+ //mov $C[0],$A[0][3]
+ ror $A[0][3],$A[3][3],#64-$rhotates[3][3]
+ //mov $C[2],$A[0][4]
+ ror $A[0][4],$A[4][4],#64-$rhotates[4][4]
+
+ ror $A[1][1],$A[1][4],#64-$rhotates[1][4]
+ ror $A[2][2],$A[2][3],#64-$rhotates[2][3]
+ ror $A[3][3],$A[3][2],#64-$rhotates[3][2]
+ ror $A[4][4],$A[4][1],#64-$rhotates[4][1]
+
+ ror $A[1][4],$A[4][2],#64-$rhotates[4][2]
+ ror $A[2][3],$A[3][4],#64-$rhotates[3][4]
+ ror $A[3][2],$A[2][1],#64-$rhotates[2][1]
+ ror $A[4][1],$A[1][3],#64-$rhotates[1][3]
+
+ ror $A[4][2],$A[2][4],#64-$rhotates[2][4]
+ ror $A[3][4],$A[4][3],#64-$rhotates[4][3]
+ ror $A[2][1],$A[1][2],#64-$rhotates[1][2]
+ ror $A[1][3],$A[3][1],#64-$rhotates[3][1]
+
+ ror $A[2][4],$A[4][0],#64-$rhotates[4][0]
+ ror $A[4][3],$A[3][0],#64-$rhotates[3][0]
+ ror $A[1][2],$A[2][0],#64-$rhotates[2][0]
+ ror $A[3][1],$A[1][0],#64-$rhotates[1][0]
+
+ ror $A[1][0],$C[0],#64-$rhotates[0][3]
+ ror $A[2][0],$C[3],#64-$rhotates[0][1]
+ ror $A[3][0],$C[2],#64-$rhotates[0][4]
+ ror $A[4][0],$C[1],#64-$rhotates[0][2]
+
+ ////////////////////////////////////////// Chi+Iota
+ bic $C[0],$A[0][2],$A[0][1]
+ bic $C[1],$A[0][3],$A[0][2]
+ bic $C[2],$A[0][0],$A[0][4]
+ bic $C[3],$A[0][1],$A[0][0]
+ eor $A[0][0],$A[0][0],$C[0]
+ bic $C[0],$A[0][4],$A[0][3]
+ eor $A[0][1],$A[0][1],$C[1]
+ ldr $C[1],[sp,#16]
+ eor $A[0][3],$A[0][3],$C[2]
+ eor $A[0][4],$A[0][4],$C[3]
+ eor $A[0][2],$A[0][2],$C[0]
+ ldr $C[3],[$C[1]],#8 // Iota[i++]
+
+ bic $C[0],$A[1][2],$A[1][1]
+ tst $C[1],#255 // are we done?
+ str $C[1],[sp,#16]
+ bic $C[1],$A[1][3],$A[1][2]
+ bic $C[2],$A[1][0],$A[1][4]
+ eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
+ bic $C[3],$A[1][1],$A[1][0]
+ eor $A[1][0],$A[1][0],$C[0]
+ bic $C[0],$A[1][4],$A[1][3]
+ eor $A[1][1],$A[1][1],$C[1]
+ eor $A[1][3],$A[1][3],$C[2]
+ eor $A[1][4],$A[1][4],$C[3]
+ eor $A[1][2],$A[1][2],$C[0]
+
+ bic $C[0],$A[2][2],$A[2][1]
+ bic $C[1],$A[2][3],$A[2][2]
+ bic $C[2],$A[2][0],$A[2][4]
+ bic $C[3],$A[2][1],$A[2][0]
+ eor $A[2][0],$A[2][0],$C[0]
+ bic $C[0],$A[2][4],$A[2][3]
+ eor $A[2][1],$A[2][1],$C[1]
+ eor $A[2][3],$A[2][3],$C[2]
+ eor $A[2][4],$A[2][4],$C[3]
+ eor $A[2][2],$A[2][2],$C[0]
+
+ bic $C[0],$A[3][2],$A[3][1]
+ bic $C[1],$A[3][3],$A[3][2]
+ bic $C[2],$A[3][0],$A[3][4]
+ bic $C[3],$A[3][1],$A[3][0]
+ eor $A[3][0],$A[3][0],$C[0]
+ bic $C[0],$A[3][4],$A[3][3]
+ eor $A[3][1],$A[3][1],$C[1]
+ eor $A[3][3],$A[3][3],$C[2]
+ eor $A[3][4],$A[3][4],$C[3]
+ eor $A[3][2],$A[3][2],$C[0]
+
+ bic $C[0],$A[4][2],$A[4][1]
+ bic $C[1],$A[4][3],$A[4][2]
+ bic $C[2],$A[4][0],$A[4][4]
+ bic $C[3],$A[4][1],$A[4][0]
+ eor $A[4][0],$A[4][0],$C[0]
+ bic $C[0],$A[4][4],$A[4][3]
+ eor $A[4][1],$A[4][1],$C[1]
+ eor $A[4][3],$A[4][3],$C[2]
+ eor $A[4][4],$A[4][4],$C[3]
+ eor $A[4][2],$A[4][2],$C[0]
+
+ bne .Loop
+
+ ldr x30,[sp,#24]
+ ret
+.size KeccakF1600_int,.-KeccakF1600_int
+
+.type KeccakF1600,%function
+.align 5
+KeccakF1600:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#48
+
+ str x0,[sp,#32] // offload argument
+ mov $C[0],x0
+ ldp $A[0][0],$A[0][1],[x0,#16*0]
+ ldp $A[0][2],$A[0][3],[$C[0],#16*1]
+ ldp $A[0][4],$A[1][0],[$C[0],#16*2]
+ ldp $A[1][1],$A[1][2],[$C[0],#16*3]
+ ldp $A[1][3],$A[1][4],[$C[0],#16*4]
+ ldp $A[2][0],$A[2][1],[$C[0],#16*5]
+ ldp $A[2][2],$A[2][3],[$C[0],#16*6]
+ ldp $A[2][4],$A[3][0],[$C[0],#16*7]
+ ldp $A[3][1],$A[3][2],[$C[0],#16*8]
+ ldp $A[3][3],$A[3][4],[$C[0],#16*9]
+ ldp $A[4][0],$A[4][1],[$C[0],#16*10]
+ ldp $A[4][2],$A[4][3],[$C[0],#16*11]
+ ldr $A[4][4],[$C[0],#16*12]
+
+ bl KeccakF1600_int
+
+ ldr $C[0],[sp,#32]
+ stp $A[0][0],$A[0][1],[$C[0],#16*0]
+ stp $A[0][2],$A[0][3],[$C[0],#16*1]
+ stp $A[0][4],$A[1][0],[$C[0],#16*2]
+ stp $A[1][1],$A[1][2],[$C[0],#16*3]
+ stp $A[1][3],$A[1][4],[$C[0],#16*4]
+ stp $A[2][0],$A[2][1],[$C[0],#16*5]
+ stp $A[2][2],$A[2][3],[$C[0],#16*6]
+ stp $A[2][4],$A[3][0],[$C[0],#16*7]
+ stp $A[3][1],$A[3][2],[$C[0],#16*8]
+ stp $A[3][3],$A[3][4],[$C[0],#16*9]
+ stp $A[4][0],$A[4][1],[$C[0],#16*10]
+ stp $A[4][2],$A[4][3],[$C[0],#16*11]
+ str $A[4][4],[$C[0],#16*12]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#48
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ ret
+.size KeccakF1600,.-KeccakF1600
+
+.globl SHA3_absorb
+.type SHA3_absorb,%function
+.align 5
+SHA3_absorb:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ stp x0,x1,[sp,#32] // offload arguments
+ stp x2,x3,[sp,#48]
+
+ mov $C[0],x0 // uint64_t A[5][5]
+ mov $C[1],x1 // const void *inp
+ mov $C[2],x2 // size_t len
+ mov $C[3],x3 // size_t bsz
+ ldp $A[0][0],$A[0][1],[$C[0],#16*0]
+ ldp $A[0][2],$A[0][3],[$C[0],#16*1]
+ ldp $A[0][4],$A[1][0],[$C[0],#16*2]
+ ldp $A[1][1],$A[1][2],[$C[0],#16*3]
+ ldp $A[1][3],$A[1][4],[$C[0],#16*4]
+ ldp $A[2][0],$A[2][1],[$C[0],#16*5]
+ ldp $A[2][2],$A[2][3],[$C[0],#16*6]
+ ldp $A[2][4],$A[3][0],[$C[0],#16*7]
+ ldp $A[3][1],$A[3][2],[$C[0],#16*8]
+ ldp $A[3][3],$A[3][4],[$C[0],#16*9]
+ ldp $A[4][0],$A[4][1],[$C[0],#16*10]
+ ldp $A[4][2],$A[4][3],[$C[0],#16*11]
+ ldr $A[4][4],[$C[0],#16*12]
+ b .Loop_absorb
+
+.align 4
+.Loop_absorb:
+ subs $C[0],$C[2],$C[3] // len - bsz
+ blo .Labsorbed
+
+ str $C[0],[sp,#48] // save len - bsz
+___
+for (my $i=0; $i<24; $i+=2) {
+my $j = $i+1;
+$code.=<<___;
+ ldr $C[0],[$C[1]],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev $C[0],$C[0]
+#endif
+ eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
+ cmp $C[3],#8*($i+2)
+ blo .Lprocess_block
+ ldr $C[0],[$C[1]],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev $C[0],$C[0]
+#endif
+ eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
+ beq .Lprocess_block
+___
+}
+$code.=<<___;
+ ldr $C[0],[$C[1]],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev $C[0],$C[0]
+#endif
+ eor $A[4][4],$A[4][4],$C[0]
+
+.Lprocess_block:
+ str $C[1],[sp,#40] // save inp
+
+ bl KeccakF1600_int
+
+ ldr $C[1],[sp,#40] // restore arguments
+ ldp $C[2],$C[3],[sp,#48]
+ b .Loop_absorb
+
+.align 4
+.Labsorbed:
+ ldr $C[1],[sp,#32]
+ stp $A[0][0],$A[0][1],[$C[1],#16*0]
+ stp $A[0][2],$A[0][3],[$C[1],#16*1]
+ stp $A[0][4],$A[1][0],[$C[1],#16*2]
+ stp $A[1][1],$A[1][2],[$C[1],#16*3]
+ stp $A[1][3],$A[1][4],[$C[1],#16*4]
+ stp $A[2][0],$A[2][1],[$C[1],#16*5]
+ stp $A[2][2],$A[2][3],[$C[1],#16*6]
+ stp $A[2][4],$A[3][0],[$C[1],#16*7]
+ stp $A[3][1],$A[3][2],[$C[1],#16*8]
+ stp $A[3][3],$A[3][4],[$C[1],#16*9]
+ stp $A[4][0],$A[4][1],[$C[1],#16*10]
+ stp $A[4][2],$A[4][3],[$C[1],#16*11]
+ str $A[4][4],[$C[1],#16*12]
+
+ mov x0,$C[2] // return value
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ ret
+.size SHA3_absorb,.-SHA3_absorb
+___
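+
+# In C terms, SHA3_absorb above behaves like the sketch below. This is a
+# hedged illustration, not code from keccak1600.c: it assumes
+# little-endian lanes (the assembly adds a rev under __AARCH64EB__), and
+# KeccakF1600() stands for the full 24-round permutation.
+#
+#	#include <stdint.h>
+#	#include <string.h>
+#
+#	size_t SHA3_absorb_sketch(uint64_t A[5][5], const unsigned char *inp,
+#	                          size_t len, size_t bsz)
+#	{
+#	    uint64_t *A_flat = (uint64_t *)A;
+#
+#	    while (len >= bsz) {                /* whole blocks only */
+#	        for (size_t i = 0; i < bsz / 8; i++) {
+#	            uint64_t lane;
+#	            memcpy(&lane, inp + 8 * i, 8);
+#	            A_flat[i] ^= lane;
+#	        }
+#	        KeccakF1600(A);
+#	        inp += bsz;
+#	        len -= bsz;
+#	    }
+#	    return len;                         /* leftover < bsz, caller buffers it */
+#	}
+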
+{
+my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
+$code.=<<___;
+.globl SHA3_squeeze
+.type SHA3_squeeze,%function
+.align 5
+SHA3_squeeze:
+ stp x29,x30,[sp,#-48]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+
+ mov $A_flat,x0 // put aside arguments
+ mov $out,x1
+ mov $len,x2
+ mov $bsz,x3
+
+.Loop_squeeze:
+ ldr x4,[x0],#8
+ cmp $len,#8
+ blo .Lsqueeze_tail
+#ifdef __AARCH64EB__
+ rev x4,x4
+#endif
+ str x4,[$out],#8
+ subs $len,$len,#8
+ beq .Lsqueeze_done
+
+ subs x3,x3,#8
+ bhi .Loop_squeeze
+
+ mov x0,$A_flat
+ bl KeccakF1600
+ mov x0,$A_flat
+ mov x3,$bsz
+ b .Loop_squeeze
+
+.align 4
+.Lsqueeze_tail:
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done
+ strb w4,[$out],#1
+
+.Lsqueeze_done:
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x29,x30,[sp],#48
+ ret
+.size SHA3_squeeze,.-SHA3_squeeze
+___
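+
+# SHA3_squeeze above corresponds roughly to this sketch (same assumptions
+# as the absorb sketch above; the state is permuted only when more output
+# is needed after a full block has been emitted):
+#
+#	void SHA3_squeeze_sketch(uint64_t A[5][5], unsigned char *out,
+#	                         size_t len, size_t bsz)
+#	{
+#	    while (len != 0) {
+#	        uint64_t *A_flat = (uint64_t *)A;
+#
+#	        for (size_t i = 0; i < bsz / 8 && len != 0; i++) {
+#	            size_t n = len < 8 ? len : 8;
+#
+#	            memcpy(out, &A_flat[i], n); /* endian caveat as above */
+#	            out += n;
+#	            len -= n;
+#	        }
+#	        if (len != 0)
+#	            KeccakF1600(A);
+#	    }
+#	}
+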
+} }}}
+ {{{
+my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
+ "v".($_+3).".16b", "v".($_+4).".16b" ],
+ (0, 5, 10, 15, 20));
+
+my @C = map("v$_.16b", (25..31));
+
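+# The ARMv8.2 SHA3 instructions used below each combine several Keccak
+# primitives (semantics as understood from the architecture reference,
+# with operands named in the order they appear in the code):
+#
+#	eor3	d,n,m,a		// d = n ^ m ^ a           (Theta parity)
+#	rax1	d,n,m		// d = n ^ ROL64(m,1)      (Theta D-values)
+#	xar	d,n,m,#imm	// d = ROR64(n ^ m, imm)   (Theta + Rho)
+#	bcax	d,n,m,a		// d = n ^ (m & ~a)        (Chi)
+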
+$code.=<<___;
+.type KeccakF1600_ce,%function
+.align 5
+KeccakF1600_ce:
+ mov x9,#12
+ adr x10,iotas
+ b .Loop_ce
+.align 4
+.Loop_ce:
+___
+for($i=0; $i<2; $i++) {
+$code.=<<___;
+ ////////////////////////////////////////////////// Theta
+ eor3 $C[0],$A[0][0],$A[1][0],$A[2][0]
+ eor3 $C[1],$A[0][1],$A[1][1],$A[2][1]
+ eor3 $C[2],$A[0][2],$A[1][2],$A[2][2]
+ eor3 $C[3],$A[0][3],$A[1][3],$A[2][3]
+ eor3 $C[4],$A[0][4],$A[1][4],$A[2][4]
+ eor3 $C[0],$C[0], $A[3][0],$A[4][0]
+ eor3 $C[1],$C[1], $A[3][1],$A[4][1]
+ eor3 $C[2],$C[2], $A[3][2],$A[4][2]
+ eor3 $C[3],$C[3], $A[3][3],$A[4][3]
+ eor3 $C[4],$C[4], $A[3][4],$A[4][4]
+
+ rax1 $C[5],$C[0],$C[2] // D[1]
+ rax1 $C[6],$C[1],$C[3] // D[2]
+ rax1 $C[2],$C[2],$C[4] // D[3]
+ rax1 $C[3],$C[3],$C[0] // D[4]
+ rax1 $C[4],$C[4],$C[1] // D[0]
+
+ ////////////////////////////////////////////////// Theta+Rho+Pi
+ xar $C[0], $A[1][1],$C[5],#64-$rhotates[1][1] // C[0]=A[0][1]
+ xar $A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
+ xar $A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
+ xar $A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
+ xar $A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
+
+ xar $A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
+
+ xar $A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
+ xar $A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
+ xar $A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
+ xar $A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
+ xar $A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
+
+ xar $A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
+
+ eor $A[0][0],$A[0][0],$C[4]
+ ldr x11,[x10],#8
+
+ xar $C[1], $A[3][3],$C[2],#64-$rhotates[3][3] // C[1]=A[0][3]
+ xar $A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
+ xar $A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
+ xar $A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
+ xar $A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
+
+ xar $A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1] // *
+
+ xar $A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
+ xar $A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
+ xar $A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
+ xar $A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
+ xar $A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
+
+ xar $C[2], $A[0][3],$C[2],#64-$rhotates[0][3] // C[2]=A[1][0]
+
+ ////////////////////////////////////////////////// Chi+Iota
+ dup $C[6],x11 // borrow C[6]
+ bcax $C[3], $A[0][0],$A[0][2],$C[0] // *
+ bcax $A[0][1],$C[0], $C[1], $A[0][2] // *
+ bcax $A[0][2],$A[0][2],$A[0][4],$C[1]
+ bcax $A[0][3],$C[1], $A[0][0],$A[0][4]
+ bcax $A[0][4],$A[0][4],$C[0], $A[0][0]
+
+ bcax $A[1][0],$C[2], $A[1][2],$A[1][1] // *
+ bcax $C[0], $A[1][1],$A[1][3],$A[1][2] // *
+ bcax $A[1][2],$A[1][2],$A[1][4],$A[1][3]
+ bcax $A[1][3],$A[1][3],$C[2], $A[1][4]
+ bcax $A[1][4],$A[1][4],$A[1][1],$C[2]
+
+ eor $A[0][0],$C[3],$C[6] // Iota
+
+ bcax $C[1], $A[2][0],$A[2][2],$A[2][1] // *
+ bcax $C[2], $A[2][1],$A[2][3],$A[2][2] // *
+ bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3]
+ bcax $A[2][3],$A[2][3],$A[2][0],$A[2][4]
+ bcax $A[2][4],$A[2][4],$A[2][1],$A[2][0]
+
+ bcax $C[3], $A[3][0],$A[3][2],$A[3][1] // *
+ bcax $C[4], $A[3][1],$A[3][3],$A[3][2] // *
+ bcax $A[3][2],$A[3][2],$A[3][4],$A[3][3]
+ bcax $A[3][3],$A[3][3],$A[3][0],$A[3][4]
+ bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0]
+
+ bcax $C[5], $A[4][0],$A[4][2],$A[4][1] // *
+ bcax $C[6], $A[4][1],$A[4][3],$A[4][2] // *
+ bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3]
+ bcax $A[4][3],$A[4][3],$A[4][0],$A[4][4]
+ bcax $A[4][4],$A[4][4],$A[4][1],$A[4][0]
+___
+ ( $A[1][1], $C[0]) = ( $C[0], $A[1][1]);
+ ($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
+ ($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
+ ($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
+}
+$code.=<<___;
+ subs x9,x9,#1
+ bne .Loop_ce
+
+ ret
+.size KeccakF1600_ce,.-KeccakF1600_ce
+
+.type KeccakF1600_cext,%function
+.align 5
+KeccakF1600_cext:
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#16] // per ABI requirement
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+___
+for($i=0; $i<24; $i+=2) { # load A[5][5]
+my $j=$i+1;
+$code.=<<___;
+ ldp d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+ ldr d24,[x0,#8*$i]
+ bl KeccakF1600_ce
+ ldr x30,[sp,#8]
+___
+for($i=0; $i<24; $i+=2) { # store A[5][5]
+my $j=$i+1;
+$code.=<<___;
+ stp d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+ str d24,[x0,#8*$i]
+
+ ldp d8,d9,[sp,#16]
+ ldp d10,d11,[sp,#32]
+ ldp d12,d13,[sp,#48]
+ ldp d14,d15,[sp,#64]
+ ldr x29,[sp],#80
+ ret
+.size KeccakF1600_cext,.-KeccakF1600_cext
+___
+
+{
+my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
+
+$code.=<<___;
+.globl SHA3_absorb_cext
+.type SHA3_absorb_cext,%function
+.align 5
+SHA3_absorb_cext:
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#16] // per ABI requirement
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+___
+for($i=0; $i<24; $i+=2) { # load A[5][5]
+my $j=$i+1;
+$code.=<<___;
+ ldp d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+ ldr d24,[x0,#8*$i]
+ b .Loop_absorb_ce
+
+.align 4
+.Loop_absorb_ce:
+ subs $len,$len,$bsz // len - bsz
+ blo .Labsorbed_ce
+___
+for (my $i=0; $i<24; $i+=2) {
+my $j = $i+1;
+$code.=<<___;
+ ldr d31,[$inp],#8 // *inp++
+#ifdef __AARCH64EB__
+ rev64 v31.16b,v31.16b
+#endif
+ eor $A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
+ cmp $bsz,#8*($i+2)
+ blo .Lprocess_block_ce
+ ldr d31,[$inp],#8 // *inp++
+#ifdef __AARCH64EB__
+	rev64	v31.16b,v31.16b
+#endif
+ eor $A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
+ beq .Lprocess_block_ce
+___
+}
+$code.=<<___;
+ ldr d31,[$inp],#8 // *inp++
+#ifdef __AARCH64EB__
+	rev64	v31.16b,v31.16b
+#endif
+ eor $A[4][4],$A[4][4],v31.16b
+
+.Lprocess_block_ce:
+
+ bl KeccakF1600_ce
+
+ b .Loop_absorb_ce
+
+.align 4
+.Labsorbed_ce:
+___
+for($i=0; $i<24; $i+=2) { # store A[5][5]
+my $j=$i+1;
+$code.=<<___;
+ stp d$i,d$j,[x0,#8*$i]
+___
+}
+$code.=<<___;
+ str d24,[x0,#8*$i]
+ add x0,$len,$bsz // return value
+
+ ldp d8,d9,[sp,#16]
+ ldp d10,d11,[sp,#32]
+ ldp d12,d13,[sp,#48]
+ ldp d14,d15,[sp,#64]
+ ldp x29,x30,[sp],#80
+ ret
+.size SHA3_absorb_cext,.-SHA3_absorb_cext
+___
+}
+{
+my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
+$code.=<<___;
+.globl SHA3_squeeze_cext
+.type SHA3_squeeze_cext,%function
+.align 5
+SHA3_squeeze_cext:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ mov x9,$ctx
+ mov x10,$bsz
+
+.Loop_squeeze_ce:
+ ldr x4,[x9],#8
+ cmp $len,#8
+ blo .Lsqueeze_tail_ce
+#ifdef __AARCH64EB__
+ rev x4,x4
+#endif
+ str x4,[$out],#8
+ beq .Lsqueeze_done_ce
+
+ sub $len,$len,#8
+ subs x10,x10,#8
+ bhi .Loop_squeeze_ce
+
+ bl KeccakF1600_cext
+ ldr x30,[sp,#8]
+ mov x9,$ctx
+ mov x10,$bsz
+ b .Loop_squeeze_ce
+
+.align 4
+.Lsqueeze_tail_ce:
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done_ce
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done_ce
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done_ce
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done_ce
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done_ce
+ strb w4,[$out],#1
+ lsr x4,x4,#8
+ subs $len,$len,#1
+ beq .Lsqueeze_done_ce
+ strb w4,[$out],#1
+
+.Lsqueeze_done_ce:
+ ldr x29,[sp],#16
+ ret
+.size SHA3_squeeze_cext,.-SHA3_squeeze_cext
+___
+} }}}
+$code.=<<___;
+.asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+{ my %opcode = (
+ "rax1" => 0xce608c00, "eor3" => 0xce000000,
+ "bcax" => 0xce200000, "xar" => 0xce800000 );
+
+ sub unsha3 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
+ $mnemonic,$arg;
+ }
+}
+
+foreach(split("\n",$code)) {
+
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ m/\bdup\b/ and s/\.16b/.2d/g or
+ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/keccak1600-avx2.pl b/crypto/sha/asm/keccak1600-avx2.pl
new file mode 100755
index 000000000000..d9fc1c59ec29
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-avx2.pl
@@ -0,0 +1,482 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for AVX2.
+#
+# July 2017.
+#
+# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
+# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
+# other than A[0][0] in magic order into 6 [256-bit] registers, *each
+# dedicated to one axis*, Pi permutation is reduced to intra-register
+# shuffles...
+#
+# It makes other steps more intricate, but overall, is it a win? To be
+# more specific, the index permutations, organized by quadruples, are:
+#
+# [4][4] [3][3] [2][2] [1][1]<-+
+# [0][4] [0][3] [0][2] [0][1]<-+
+# [3][0] [1][0] [4][0] [2][0] |
+# [4][3] [3][1] [2][4] [1][2] |
+# [3][4] [1][3] [4][2] [2][1] |
+# [2][3] [4][1] [1][4] [3][2] |
+# [2][2] [4][4] [1][1] [3][3] -+
+#
+# This, however, is highly impractical for Theta and Chi. What would
+# help Theta is if the x indices were aligned column-wise, or in other
+# words:
+#
+# [0][4] [0][3] [0][2] [0][1]
+# [3][0] [1][0] [4][0] [2][0]
+#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
+# [2][4] [4][3] [1][2] [3][1]
+#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
+# [3][4] [1][3] [4][2] [2][1]
+#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
+# [1][4] [2][3] [3][2] [4][1]
+#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
+# [4][4] [3][3] [2][2] [1][1]
+#
+# So here we have it: lines not marked with vpermq() represent the magic
+# order in which data is to be loaded and maintained. [And lines marked
+# with vpermq() represent the Pi circular permutation in the chosen
+# layout. Note that the first step is permutation-free.] A[0][0] is
+# loaded into a register of its own, broadcast to all lanes. [A[0][0] is
+# not part of the Pi permutation or Rho.] Digits in variables' names
+# denote the right-most coordinates:
+
+my ($A00, # [0][0] [0][0] [0][0] [0][0] # %ymm0
+ $A01, # [0][4] [0][3] [0][2] [0][1] # %ymm1
+ $A20, # [3][0] [1][0] [4][0] [2][0] # %ymm2
+ $A31, # [2][4] [4][3] [1][2] [3][1] # %ymm3
+ $A21, # [3][4] [1][3] [4][2] [2][1] # %ymm4
+ $A41, # [1][4] [2][3] [3][2] [4][1] # %ymm5
+ $A11) = # [4][4] [3][3] [2][2] [1][1] # %ymm6
+ map("%ymm$_",(0..6));
+
+# We also need to map the magic order into offsets within structure:
+
+my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
+ [2,2], [6,0], [3,1], [4,2], [5,3], # [1][0..4]
+ [2,0], [4,0], [6,1], [5,2], [3,3], # [2][0..4]
+ [2,3], [3,0], [5,1], [6,2], [4,3], # [3][0..4]
+ [2,1], [5,0], [4,1], [3,2], [6,3]); # [4][0..4]
+ @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged); # ... and now linear
+
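+# For example, A_jagged[7], i.e. lane A[1][2] sitting in pair [3,1],
+# becomes 8*(3*4+1) = 104: an offset into seven consecutive 32-byte
+# register images ($A00,$A01,$A20,$A31,$A21,$A41,$A11), which is how the
+# stack transfer area in SHA3_absorb below is laid out (a worked example
+# of the mapping, nothing more).
+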
+# On the other hand, Chi is much better off if the y indices are aligned
+# column-wise, not x. For this reason we have to shuffle the data prior
+# to Chi and revert it afterwards. The prior shuffle is naturally merged
+# with Pi itself:
+#
+# [0][4] [0][3] [0][2] [0][1]
+# [3][0] [1][0] [4][0] [2][0]
+#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
+#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
+# [3][1] [1][2] [4][3] [2][4]
+#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
+#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
+# [3][4] [1][3] [4][2] [2][1]
+#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
+#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
+# [3][2] [1][4] [4][1] [2][3]
+#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
+#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
+# [3][3] [1][1] [4][4] [2][2]
+#
+# And reverse post-Chi permutation:
+#
+# [0][4] [0][3] [0][2] [0][1]
+# [3][0] [1][0] [4][0] [2][0]
+#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
+# [2][4] [4][3] [1][2] [3][1]
+#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
+# [3][4] [1][3] [4][2] [2][1]
+#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
+# [1][4] [2][3] [3][2] [4][1]
+#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
+# [4][4] [3][3] [2][2] [1][1]
+#
+########################################################################
+# Numbers are cycles per processed byte out of a large message.
+#
+# r=1088(*)
+#
+# Haswell 8.7/+10%
+# Skylake 7.8/+20%
+# Ryzen 17(**)
+#
+# (*) Corresponds to SHA3-256. The percentage after the slash is the
+#     improvement coefficient in comparison to scalar keccak1600-x86_64.pl.
+# (**) Ryzen is expected to perform poorly, because its instruction
+#      issue rate is limited to two AVX2 instructions per cycle and,
+#      in addition, vpblendd is reportedly bound to a specific port.
+#      Obviously this code path should not be executed on Ryzen.
+
+my @T = map("%ymm$_",(7..15));
+my ($C14,$C00,$D00,$D14) = @T[5..8];
+
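+# AVX2 has no 64-bit variable-rotate instruction, so ROL64(x,r) is
+# emulated per lane as (x << r) | (x >> (64 - r)) with vpsllvq/vpsrlvq
+# and the per-lane shift counts kept in rhotates_left/rhotates_right
+# defined below. In C terms (an illustrative macro, valid for
+# 0 < r < 64, not an identifier from this module):
+#
+#	#define ROL64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
+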
+$code.=<<___;
+.text
+
+.type __KeccakF1600,\@function
+.align 32
+__KeccakF1600:
+ lea rhotates_left+96(%rip),%r8
+ lea rhotates_right+96(%rip),%r9
+ lea iotas(%rip),%r10
+ mov \$24,%eax
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+ ######################################### Theta
+ vpshufd \$0b01001110,$A20,$C00
+ vpxor $A31,$A41,$C14
+ vpxor $A11,$A21,@T[2]
+ vpxor $A01,$C14,$C14
+ vpxor @T[2],$C14,$C14 # C[1..4]
+
+ vpermq \$0b10010011,$C14,@T[4]
+ vpxor $A20,$C00,$C00
+ vpermq \$0b01001110,$C00,@T[0]
+
+ vpsrlq \$63,$C14,@T[1]
+ vpaddq $C14,$C14,@T[2]
+ vpor @T[2],@T[1],@T[1] # ROL64(C[1..4],1)
+
+ vpermq \$0b00111001,@T[1],$D14
+ vpxor @T[4],@T[1],$D00
+ vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
+
+ vpxor $A00,$C00,$C00
+ vpxor @T[0],$C00,$C00 # C[0..0]
+
+ vpsrlq \$63,$C00,@T[0]
+ vpaddq $C00,$C00,@T[1]
+ vpor @T[0],@T[1],@T[1] # ROL64(C[0..0],1)
+
+ vpxor $D00,$A20,$A20 # ^= D[0..0]
+ vpxor $D00,$A00,$A00 # ^= D[0..0]
+
+ vpblendd \$0b11000000,@T[1],$D14,$D14
+ vpblendd \$0b00000011,$C00,@T[4],@T[4]
+ vpxor @T[4],$D14,$D14 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]
+
+ ######################################### Rho + Pi + pre-Chi shuffle
+ vpsllvq 0*32-96(%r8),$A20,@T[3]
+ vpsrlvq 0*32-96(%r9),$A20,$A20
+ vpor @T[3],$A20,$A20
+
+ vpxor $D14,$A31,$A31 # ^= D[1..4] from Theta
+ vpsllvq 2*32-96(%r8),$A31,@T[4]
+ vpsrlvq 2*32-96(%r9),$A31,$A31
+ vpor @T[4],$A31,$A31
+
+ vpxor $D14,$A21,$A21 # ^= D[1..4] from Theta
+ vpsllvq 3*32-96(%r8),$A21,@T[5]
+ vpsrlvq 3*32-96(%r9),$A21,$A21
+ vpor @T[5],$A21,$A21
+
+ vpxor $D14,$A41,$A41 # ^= D[1..4] from Theta
+ vpsllvq 4*32-96(%r8),$A41,@T[6]
+ vpsrlvq 4*32-96(%r9),$A41,$A41
+ vpor @T[6],$A41,$A41
+
+ vpxor $D14,$A11,$A11 # ^= D[1..4] from Theta
+ vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31
+ vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21
+ vpsllvq 5*32-96(%r8),$A11,@T[7]
+ vpsrlvq 5*32-96(%r9),$A11,@T[1]
+ vpor @T[7],@T[1],@T[1] # $A11 -> future $A01
+
+ vpxor $D14,$A01,$A01 # ^= D[1..4] from Theta
+ vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41
+ vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11
+ vpsllvq 1*32-96(%r8),$A01,@T[8]
+ vpsrlvq 1*32-96(%r9),$A01,@T[2]
+ vpor @T[8],@T[2],@T[2] # $A01 -> future $A20
+
+ ######################################### Chi
+ vpsrldq \$8,@T[1],@T[7]
+ vpandn @T[7],@T[1],@T[0] # tgting [0][0] [0][0] [0][0] [0][0]
+
+ vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
+ vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
+ vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
+ vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0]
+ vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
+ vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
+ vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
+ vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0]
+ vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
+ vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
+ vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
+ vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0]
+ vpandn @T[8],$A31,$A31 # tgting [3][1] [1][2] [4][3] [2][4]
+ vpandn @T[7],$A41,$A41 # tgting [3][2] [1][4] [4][1] [2][3]
+
+ vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
+ vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4]
+ vpxor @T[3],$A31,$A31
+ vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
+ vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4]
+ vpxor @T[5],$A41,$A41
+ vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
+ vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4]
+ vpandn @T[8],$A11,$A11 # tgting [3][3] [1][1] [4][4] [2][2]
+ vpxor @T[6],$A11,$A11
+
+ vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3]
+ vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3]
+ vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
+ vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
+ vpandn @T[8],$A01,$A01 # tgting [0][4] [0][3] [0][2] [0][1]
+
+ vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
+ vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
+ vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
+ vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
+ vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
+ vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
+ vpandn @T[7],$A20,$A20 # tgting [3][0] [1][0] [4][0] [2][0]
+ vpxor @T[2],$A20,$A20
+
+ vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0]
+ vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
+ vpermq \$0b10001101,$A41,$A41
+ vpermq \$0b01110010,$A11,$A11
+
+ vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
+ vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
+ vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
+ vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
+ vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
+ vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
+ vpandn @T[7],$A21,$A21 # tgting [3][4] [1][3] [4][2] [2][1]
+
+ vpxor @T[0],$A00,$A00
+ vpxor @T[1],$A01,$A01
+ vpxor @T[4],$A21,$A21
+
+ ######################################### Iota
+ vpxor (%r10),$A00,$A00
+ lea 32(%r10),%r10
+
+ dec %eax
+ jnz .Loop_avx2
+
+ ret
+.size __KeccakF1600,.-__KeccakF1600
+___
+my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
+my $out = $inp; # in squeeze
+
+$code.=<<___;
+.globl SHA3_absorb
+.type SHA3_absorb,\@function
+.align 32
+SHA3_absorb:
+ mov %rsp,%r11
+
+ lea -240(%rsp),%rsp
+ and \$-32,%rsp
+
+ lea 96($A_flat),$A_flat
+ lea 96($inp),$inp
+ lea 96(%rsp),%r10
+
+ vzeroupper
+
+ vpbroadcastq -96($A_flat),$A00 # load A[5][5]
+ vmovdqu 8+32*0-96($A_flat),$A01
+ vmovdqu 8+32*1-96($A_flat),$A20
+ vmovdqu 8+32*2-96($A_flat),$A31
+ vmovdqu 8+32*3-96($A_flat),$A21
+ vmovdqu 8+32*4-96($A_flat),$A41
+ vmovdqu 8+32*5-96($A_flat),$A11
+
+ vpxor @T[0],@T[0],@T[0]
+ vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack
+ vmovdqa @T[0],32*3-96(%r10)
+ vmovdqa @T[0],32*4-96(%r10)
+ vmovdqa @T[0],32*5-96(%r10)
+ vmovdqa @T[0],32*6-96(%r10)
+
+.Loop_absorb_avx2:
+ mov $bsz,%rax
+ sub $bsz,$len
+ jc .Ldone_absorb_avx2
+
+ shr \$3,%eax
+ vpbroadcastq 0-96($inp),@T[0]
+ vmovdqu 8-96($inp),@T[1]
+ sub \$4,%eax
+___
+for(my $i=5; $i<25; $i++) {
+$code.=<<___
+ dec %eax
+ jz .Labsorved_avx2
+ mov 8*$i-96($inp),%r8
+ mov %r8,$A_jagged[$i]-96(%r10)
+___
+}
+$code.=<<___;
+.Labsorved_avx2:
+ lea ($inp,$bsz),$inp
+
+ vpxor @T[0],$A00,$A00
+ vpxor @T[1],$A01,$A01
+ vpxor 32*2-96(%r10),$A20,$A20
+ vpxor 32*3-96(%r10),$A31,$A31
+ vpxor 32*4-96(%r10),$A21,$A21
+ vpxor 32*5-96(%r10),$A41,$A41
+ vpxor 32*6-96(%r10),$A11,$A11
+
+ call __KeccakF1600
+
+ lea 96(%rsp),%r10
+ jmp .Loop_absorb_avx2
+
+.Ldone_absorb_avx2:
+ vmovq %xmm0,-96($A_flat)
+ vmovdqu $A01,8+32*0-96($A_flat)
+ vmovdqu $A20,8+32*1-96($A_flat)
+ vmovdqu $A31,8+32*2-96($A_flat)
+ vmovdqu $A21,8+32*3-96($A_flat)
+ vmovdqu $A41,8+32*4-96($A_flat)
+ vmovdqu $A11,8+32*5-96($A_flat)
+
+ vzeroupper
+
+ lea (%r11),%rsp
+ lea ($len,$bsz),%rax # return value
+ ret
+.size SHA3_absorb,.-SHA3_absorb
+
+.globl SHA3_squeeze
+.type SHA3_squeeze,\@function
+.align 32
+SHA3_squeeze:
+ mov %rsp,%r11
+
+ lea 96($A_flat),$A_flat
+ shr \$3,$bsz
+
+ vzeroupper
+
+ vpbroadcastq -96($A_flat),$A00
+ vpxor @T[0],@T[0],@T[0]
+ vmovdqu 8+32*0-96($A_flat),$A01
+ vmovdqu 8+32*1-96($A_flat),$A20
+ vmovdqu 8+32*2-96($A_flat),$A31
+ vmovdqu 8+32*3-96($A_flat),$A21
+ vmovdqu 8+32*4-96($A_flat),$A41
+ vmovdqu 8+32*5-96($A_flat),$A11
+
+ mov $bsz,%rax
+
+.Loop_squeeze_avx2:
+ mov @A_jagged[$i]-96($A_flat),%r8
+___
+for (my $i=0; $i<25; $i++) {
+$code.=<<___;
+ sub \$8,$len
+ jc .Ltail_squeeze_avx2
+ mov %r8,($out)
+ lea 8($out),$out
+ je .Ldone_squeeze_avx2
+ dec %eax
+ je .Lextend_output_avx2
+ mov @A_jagged[$i+1]-120($A_flat),%r8
+___
+}
+$code.=<<___;
+.Lextend_output_avx2:
+ call __KeccakF1600
+
+ vmovq %xmm0,-96($A_flat)
+ vmovdqu $A01,8+32*0-96($A_flat)
+ vmovdqu $A20,8+32*1-96($A_flat)
+ vmovdqu $A31,8+32*2-96($A_flat)
+ vmovdqu $A21,8+32*3-96($A_flat)
+ vmovdqu $A41,8+32*4-96($A_flat)
+ vmovdqu $A11,8+32*5-96($A_flat)
+
+ mov $bsz,%rax
+ jmp .Loop_squeeze_avx2
+
+
+.Ltail_squeeze_avx2:
+ add \$8,$len
+.Loop_tail_avx2:
+ mov %r8b,($out)
+ lea 1($out),$out
+ shr \$8,%r8
+ dec $len
+ jnz .Loop_tail_avx2
+
+.Ldone_squeeze_avx2:
+ vzeroupper
+
+ lea (%r11),%rsp
+ ret
+.size SHA3_squeeze,.-SHA3_squeeze
+
+.align 64
+rhotates_left:
+ .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0]
+ .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4]
+ .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4]
+ .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4]
+ .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4]
+ .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4]
+rhotates_right:
+ .quad 64-3, 64-18, 64-36, 64-41
+ .quad 64-1, 64-62, 64-28, 64-27
+ .quad 64-45, 64-6, 64-56, 64-39
+ .quad 64-10, 64-61, 64-55, 64-8
+ .quad 64-2, 64-15, 64-25, 64-20
+ .quad 64-44, 64-43, 64-21, 64-14
+iotas:
+ .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
+ .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
+ .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
+ .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
+ .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
+ .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
+ .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
+ .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
+ .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
+ .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
+ .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
+ .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
+ .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
+ .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
+ .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
+ .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
+ .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
+ .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
+ .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
+ .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
+ .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
+ .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
+ .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
+ .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
+
+.asciz "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=pop;
+open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/keccak1600-avx512.pl b/crypto/sha/asm/keccak1600-avx512.pl
new file mode 100755
index 000000000000..9074ff02dec3
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-avx512.pl
@@ -0,0 +1,551 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for AVX-512F.
+#
+# July 2017.
+#
+# The code below is a KECCAK_1X_ALT implementation (see sha/keccak1600.c).
+# It's pretty straightforward; the only "magic" is the data layout in
+# registers. It's impossible to have one layout that is optimal for every
+# step, hence it changes as the algorithm progresses. Data is saved in
+# linear order, but the in-register order morphs between rounds. Even
+# rounds take input in linear layout, and odd rounds take it transposed,
+# or "vertically-shaped"...
+#
+########################################################################
+# Numbers are cycles per processed byte out of a large message.
+#
+# r=1088(*)
+#
+# Knights Landing 7.6
+# Skylake-X 5.7
+#
+# (*) Corresponds to SHA3-256.
+
+########################################################################
+# The code below is a combination of two ideas. One is taken from the
+# Keccak Code Package, hereafter KCP, and another one from the initial
+# version of this module. What they have in common is the observation
+# that Pi's input and output are "mostly transposed", i.e. if the input
+# is aligned by the x coordinate, then the output is [mostly] aligned
+# by y. Both versions, KCP and the predecessor, were trying to use one
+# of them from round to round, which resulted in some kind of
+# transposition in each round. This version still does transpose data,
+# but only every second round. Another essential factor is that the KCP
+# transposition has to be performed with instructions that turned out
+# to be rather expensive on Knights Landing, both latency- and
+# throughput-wise. Not to mention that some of them have to depend on
+# each other. On the other hand, the initial version of this module
+# relied heavily on blend instructions. There were lots of them,
+# resulting in a higher instruction count, yet it performed better on
+# Knights Landing, because the processor can execute a pair of them
+# each cycle and they have minimal latency. This module is an attempt
+# to bring the best parts together:-)
+#
+# Coordinates below correspond to those in sha/keccak1600.c. Input
+# layout is straight linear:
+#
+# [0][4] [0][3] [0][2] [0][1] [0][0]
+# [1][4] [1][3] [1][2] [1][1] [1][0]
+# [2][4] [2][3] [2][2] [2][1] [2][0]
+# [3][4] [3][3] [3][2] [3][1] [3][0]
+# [4][4] [4][3] [4][2] [4][1] [4][0]
+#
+# It's perfect for Theta, while Pi is reduced to intra-register
+# permutations that yield a layout perfect for Chi:
+#
+# [4][0] [3][0] [2][0] [1][0] [0][0]
+# [4][1] [3][1] [2][1] [1][1] [0][1]
+# [4][2] [3][2] [2][2] [1][2] [0][2]
+# [4][3] [3][3] [2][3] [1][3] [0][3]
+# [4][4] [3][4] [2][4] [1][4] [0][4]
+#
+# Now, instead of performing a full transposition and feeding it to the
+# next identical round, we perform a kind of diagonal transposition to
+# the layout from the initial version of this module, and make it
+# suitable for Theta:
+#
+# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
+# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
+# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
+# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
+# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
+#
+# Now intra-register permutations yield initial [almost] straight
+# linear layout:
+#
+# [4][4] [3][3] [2][2] [1][1] [0][0]
+##[0][4] [0][3] [0][2] [0][1] [0][0]
+# [3][4] [2][3] [1][2] [0][1] [4][0]
+##[2][3] [2][2] [2][1] [2][0] [2][4]
+# [2][4] [1][3] [0][2] [4][1] [3][0]
+##[4][2] [4][1] [4][0] [4][4] [4][3]
+# [1][4] [0][3] [4][2] [3][1] [2][0]
+##[1][1] [1][0] [1][4] [1][3] [1][2]
+# [0][4] [4][3] [3][2] [2][1] [1][0]
+##[3][0] [3][4] [3][3] [3][2] [3][1]
+#
+# This means that the odd-round Chi is performed in a less suitable
+# layout, with a number of additional permutations. But overall it
+# turned out to be a win. The permutations are the fastest possible on
+# Knights Landing and they are laid down to be independent of each
+# other. In essence, I traded 20 blend instructions for 3 permutations.
+# The result is 13% faster than KCP on Skylake-X, and >40% faster on
+# Knights Landing.
+#
+# As implied, data is loaded in straight linear order. Digits in
+# variables' names represent the coordinates of the right-most element
+# of the loaded data chunk:
+
+my ($A00, # [0][4] [0][3] [0][2] [0][1] [0][0]
+ $A10, # [1][4] [1][3] [1][2] [1][1] [1][0]
+ $A20, # [2][4] [2][3] [2][2] [2][1] [2][0]
+ $A30, # [3][4] [3][3] [3][2] [3][1] [3][0]
+ $A40) = # [4][4] [4][3] [4][2] [4][1] [4][0]
+ map("%zmm$_",(0..4));
+
+# We also need to map the magic order into offsets within structure:
+
+my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
+ [1,0], [1,1], [1,2], [1,3], [1,4],
+ [2,0], [2,1], [2,2], [2,3], [2,4],
+ [3,0], [3,1], [3,2], [3,3], [3,4],
+ [4,0], [4,1], [4,2], [4,3], [4,4]);
+ @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged); # ... and now linear
+
+my @T = map("%zmm$_",(5..12));
+my @Theta = map("%zmm$_",(33,13..16)); # invalid @Theta[0] is not typo
+my @Pi0 = map("%zmm$_",(17..21));
+my @Rhotate0 = map("%zmm$_",(22..26));
+my @Rhotate1 = map("%zmm$_",(27..31));
+
+my ($C00,$D00) = @T[0..1];
+my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
+
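+# Two vpternlogq truth tables do most of the work below (an explanatory
+# note, not module code): 0x96 is the three-way XOR used for Theta's
+# column parity, and 0xD2 supplies one Chi term. With the AT&T operand
+# order used here,
+#
+#	vpternlogq $0x96,C,B,DST	// DST ^= B ^ C
+#	vpternlogq $0xD2,C,B,DST	// DST ^= ~B & C
+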
+$code.=<<___;
+.text
+
+.type __KeccakF1600,\@function
+.align 32
+__KeccakF1600:
+ lea iotas(%rip),%r10
+ mov \$12,%eax
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ ######################################### Theta, even round
+ vmovdqa64 $A00,@T[0] # put aside original A00
+ vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00"
+ vpternlogq \$0x96,$A40,$A30,$A00
+
+ vprolq \$1,$A00,$D00
+ vpermq $A00,@Theta[1],$A00
+ vpermq $D00,@Theta[4],$D00
+
+ vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00
+ vpternlogq \$0x96,$A00,$D00,$A10
+ vpternlogq \$0x96,$A00,$D00,$A20
+ vpternlogq \$0x96,$A00,$D00,$A30
+ vpternlogq \$0x96,$A00,$D00,$A40
+
+ ######################################### Rho
+ vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00
+ vprolvq @Rhotate0[1],$A10,$A10
+ vprolvq @Rhotate0[2],$A20,$A20
+ vprolvq @Rhotate0[3],$A30,$A30
+ vprolvq @Rhotate0[4],$A40,$A40
+
+ ######################################### Pi
+ vpermq $A00,@Pi0[0],$A00
+ vpermq $A10,@Pi0[1],$A10
+ vpermq $A20,@Pi0[2],$A20
+ vpermq $A30,@Pi0[3],$A30
+ vpermq $A40,@Pi0[4],$A40
+
+ ######################################### Chi
+ vmovdqa64 $A00,@T[0]
+ vmovdqa64 $A10,@T[1]
+ vpternlogq \$0xD2,$A20,$A10,$A00
+ vpternlogq \$0xD2,$A30,$A20,$A10
+ vpternlogq \$0xD2,$A40,$A30,$A20
+ vpternlogq \$0xD2,@T[0],$A40,$A30
+ vpternlogq \$0xD2,@T[1],@T[0],$A40
+
+ ######################################### Iota
+ vpxorq (%r10),$A00,${A00}{$k00001}
+ lea 16(%r10),%r10
+
+ ######################################### Harmonize rounds
+ vpblendmq $A20,$A10,@{T[1]}{$k00010}
+ vpblendmq $A30,$A20,@{T[2]}{$k00010}
+ vpblendmq $A40,$A30,@{T[3]}{$k00010}
+ vpblendmq $A10,$A00,@{T[0]}{$k00010}
+ vpblendmq $A00,$A40,@{T[4]}{$k00010}
+
+ vpblendmq $A30,@T[1],@{T[1]}{$k00100}
+ vpblendmq $A40,@T[2],@{T[2]}{$k00100}
+ vpblendmq $A20,@T[0],@{T[0]}{$k00100}
+ vpblendmq $A00,@T[3],@{T[3]}{$k00100}
+ vpblendmq $A10,@T[4],@{T[4]}{$k00100}
+
+ vpblendmq $A40,@T[1],@{T[1]}{$k01000}
+ vpblendmq $A30,@T[0],@{T[0]}{$k01000}
+ vpblendmq $A00,@T[2],@{T[2]}{$k01000}
+ vpblendmq $A10,@T[3],@{T[3]}{$k01000}
+ vpblendmq $A20,@T[4],@{T[4]}{$k01000}
+
+ vpblendmq $A40,@T[0],@{T[0]}{$k10000}
+ vpblendmq $A00,@T[1],@{T[1]}{$k10000}
+ vpblendmq $A10,@T[2],@{T[2]}{$k10000}
+ vpblendmq $A20,@T[3],@{T[3]}{$k10000}
+ vpblendmq $A30,@T[4],@{T[4]}{$k10000}
+
+ #vpermq @T[0],@Theta[0],$A00 # doesn't actually change order
+ vpermq @T[1],@Theta[1],$A10
+ vpermq @T[2],@Theta[2],$A20
+ vpermq @T[3],@Theta[3],$A30
+ vpermq @T[4],@Theta[4],$A40
+
+ ######################################### Theta, odd round
+ vmovdqa64 $T[0],$A00 # real A00
+ vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias
+ vpternlogq \$0x96,$A40,$A30,$C00
+
+ vprolq \$1,$C00,$D00
+ vpermq $C00,@Theta[1],$C00
+ vpermq $D00,@Theta[4],$D00
+
+ vpternlogq \$0x96,$C00,$D00,$A00
+ vpternlogq \$0x96,$C00,$D00,$A30
+ vpternlogq \$0x96,$C00,$D00,$A10
+ vpternlogq \$0x96,$C00,$D00,$A40
+ vpternlogq \$0x96,$C00,$D00,$A20
+
+ ######################################### Rho
+ vprolvq @Rhotate1[0],$A00,$A00
+ vprolvq @Rhotate1[3],$A30,@T[1]
+ vprolvq @Rhotate1[1],$A10,@T[2]
+ vprolvq @Rhotate1[4],$A40,@T[3]
+ vprolvq @Rhotate1[2],$A20,@T[4]
+
+ vpermq $A00,@Theta[4],@T[5]
+ vpermq $A00,@Theta[3],@T[6]
+
+ ######################################### Iota
+ vpxorq -8(%r10),$A00,${A00}{$k00001}
+
+ ######################################### Pi
+ vpermq @T[1],@Theta[2],$A10
+ vpermq @T[2],@Theta[4],$A20
+ vpermq @T[3],@Theta[1],$A30
+ vpermq @T[4],@Theta[3],$A40
+
+ ######################################### Chi
+ vpternlogq \$0xD2,@T[6],@T[5],$A00
+
+ vpermq @T[1],@Theta[1],@T[7]
+ #vpermq @T[1],@Theta[0],@T[1]
+ vpternlogq \$0xD2,@T[1],@T[7],$A10
+
+ vpermq @T[2],@Theta[3],@T[0]
+ vpermq @T[2],@Theta[2],@T[2]
+ vpternlogq \$0xD2,@T[2],@T[0],$A20
+
+ #vpermq @T[3],@Theta[0],@T[3]
+ vpermq @T[3],@Theta[4],@T[1]
+ vpternlogq \$0xD2,@T[1],@T[3],$A30
+
+ vpermq @T[4],@Theta[2],@T[0]
+ vpermq @T[4],@Theta[1],@T[4]
+ vpternlogq \$0xD2,@T[4],@T[0],$A40
+
+ dec %eax
+ jnz .Loop_avx512
+
+ ret
+.size __KeccakF1600,.-__KeccakF1600
+___
+
+my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
+my $out = $inp; # in squeeze
+
+$code.=<<___;
+.globl SHA3_absorb
+.type SHA3_absorb,\@function
+.align 32
+SHA3_absorb:
+ mov %rsp,%r11
+
+ lea -320(%rsp),%rsp
+ and \$-64,%rsp
+
+ lea 96($A_flat),$A_flat
+ lea 96($inp),$inp
+ lea 128(%rsp),%r9
+
+ lea theta_perm(%rip),%r8
+
+ kxnorw $k11111,$k11111,$k11111
+ kshiftrw \$15,$k11111,$k00001
+ kshiftrw \$11,$k11111,$k11111
+ kshiftlw \$1,$k00001,$k00010
+ kshiftlw \$2,$k00001,$k00100
+ kshiftlw \$3,$k00001,$k01000
+ kshiftlw \$4,$k00001,$k10000
+
+ #vmovdqa64 64*0(%r8),@Theta[0]
+ vmovdqa64 64*1(%r8),@Theta[1]
+ vmovdqa64 64*2(%r8),@Theta[2]
+ vmovdqa64 64*3(%r8),@Theta[3]
+ vmovdqa64 64*4(%r8),@Theta[4]
+
+ vmovdqa64 64*5(%r8),@Rhotate1[0]
+ vmovdqa64 64*6(%r8),@Rhotate1[1]
+ vmovdqa64 64*7(%r8),@Rhotate1[2]
+ vmovdqa64 64*8(%r8),@Rhotate1[3]
+ vmovdqa64 64*9(%r8),@Rhotate1[4]
+
+ vmovdqa64 64*10(%r8),@Rhotate0[0]
+ vmovdqa64 64*11(%r8),@Rhotate0[1]
+ vmovdqa64 64*12(%r8),@Rhotate0[2]
+ vmovdqa64 64*13(%r8),@Rhotate0[3]
+ vmovdqa64 64*14(%r8),@Rhotate0[4]
+
+ vmovdqa64 64*15(%r8),@Pi0[0]
+ vmovdqa64 64*16(%r8),@Pi0[1]
+ vmovdqa64 64*17(%r8),@Pi0[2]
+ vmovdqa64 64*18(%r8),@Pi0[3]
+ vmovdqa64 64*19(%r8),@Pi0[4]
+
+ vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
+ vpxorq @T[0],@T[0],@T[0]
+ vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
+ vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
+ vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
+ vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
+
+ vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack
+ vmovdqa64 @T[0],1*64-128(%r9)
+ vmovdqa64 @T[0],2*64-128(%r9)
+ vmovdqa64 @T[0],3*64-128(%r9)
+ vmovdqa64 @T[0],4*64-128(%r9)
+ jmp .Loop_absorb_avx512
+
+.align 32
+.Loop_absorb_avx512:
+ mov $bsz,%rax
+ sub $bsz,$len
+ jc .Ldone_absorb_avx512
+
+ shr \$3,%eax
+___
+for(my $i=0; $i<25; $i++) {
+$code.=<<___
+ mov 8*$i-96($inp),%r8
+ mov %r8,$A_jagged[$i]-128(%r9)
+ dec %eax
+ jz .Labsorved_avx512
+___
+}
+$code.=<<___;
+.Labsorved_avx512:
+ lea ($inp,$bsz),$inp
+
+ vpxorq 64*0-128(%r9),$A00,$A00
+ vpxorq 64*1-128(%r9),$A10,$A10
+ vpxorq 64*2-128(%r9),$A20,$A20
+ vpxorq 64*3-128(%r9),$A30,$A30
+ vpxorq 64*4-128(%r9),$A40,$A40
+
+ call __KeccakF1600
+
+ jmp .Loop_absorb_avx512
+
+.align 32
+.Ldone_absorb_avx512:
+ vmovdqu64 $A00,40*0-96($A_flat){$k11111}
+ vmovdqu64 $A10,40*1-96($A_flat){$k11111}
+ vmovdqu64 $A20,40*2-96($A_flat){$k11111}
+ vmovdqu64 $A30,40*3-96($A_flat){$k11111}
+ vmovdqu64 $A40,40*4-96($A_flat){$k11111}
+
+ vzeroupper
+
+ lea (%r11),%rsp
+ lea ($len,$bsz),%rax # return value
+ ret
+.size SHA3_absorb,.-SHA3_absorb
+
+.globl SHA3_squeeze
+.type SHA3_squeeze,\@function
+.align 32
+SHA3_squeeze:
+ mov %rsp,%r11
+
+ lea 96($A_flat),$A_flat
+ cmp $bsz,$len
+ jbe .Lno_output_extension_avx512
+
+ lea theta_perm(%rip),%r8
+
+ kxnorw $k11111,$k11111,$k11111
+ kshiftrw \$15,$k11111,$k00001
+ kshiftrw \$11,$k11111,$k11111
+ kshiftlw \$1,$k00001,$k00010
+ kshiftlw \$2,$k00001,$k00100
+ kshiftlw \$3,$k00001,$k01000
+ kshiftlw \$4,$k00001,$k10000
+
+ #vmovdqa64 64*0(%r8),@Theta[0]
+ vmovdqa64 64*1(%r8),@Theta[1]
+ vmovdqa64 64*2(%r8),@Theta[2]
+ vmovdqa64 64*3(%r8),@Theta[3]
+ vmovdqa64 64*4(%r8),@Theta[4]
+
+ vmovdqa64 64*5(%r8),@Rhotate1[0]
+ vmovdqa64 64*6(%r8),@Rhotate1[1]
+ vmovdqa64 64*7(%r8),@Rhotate1[2]
+ vmovdqa64 64*8(%r8),@Rhotate1[3]
+ vmovdqa64 64*9(%r8),@Rhotate1[4]
+
+ vmovdqa64 64*10(%r8),@Rhotate0[0]
+ vmovdqa64 64*11(%r8),@Rhotate0[1]
+ vmovdqa64 64*12(%r8),@Rhotate0[2]
+ vmovdqa64 64*13(%r8),@Rhotate0[3]
+ vmovdqa64 64*14(%r8),@Rhotate0[4]
+
+ vmovdqa64 64*15(%r8),@Pi0[0]
+ vmovdqa64 64*16(%r8),@Pi0[1]
+ vmovdqa64 64*17(%r8),@Pi0[2]
+ vmovdqa64 64*18(%r8),@Pi0[3]
+ vmovdqa64 64*19(%r8),@Pi0[4]
+
+ vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
+ vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
+ vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
+ vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
+ vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
+
+.Lno_output_extension_avx512:
+ shr \$3,$bsz
+ lea -96($A_flat),%r9
+ mov $bsz,%rax
+ jmp .Loop_squeeze_avx512
+
+.align 32
+.Loop_squeeze_avx512:
+ cmp \$8,$len
+ jb .Ltail_squeeze_avx512
+
+ mov (%r9),%r8
+ lea 8(%r9),%r9
+ mov %r8,($out)
+ lea 8($out),$out
+ sub \$8,$len # len -= 8
+ jz .Ldone_squeeze_avx512
+
+ sub \$1,%rax # bsz--
+ jnz .Loop_squeeze_avx512
+
+ #vpermq @Theta[4],@Theta[4],@Theta[3]
+ #vpermq @Theta[3],@Theta[4],@Theta[2]
+ #vpermq @Theta[3],@Theta[3],@Theta[1]
+
+ call __KeccakF1600
+
+ vmovdqu64 $A00,40*0-96($A_flat){$k11111}
+ vmovdqu64 $A10,40*1-96($A_flat){$k11111}
+ vmovdqu64 $A20,40*2-96($A_flat){$k11111}
+ vmovdqu64 $A30,40*3-96($A_flat){$k11111}
+ vmovdqu64 $A40,40*4-96($A_flat){$k11111}
+
+ lea -96($A_flat),%r9
+ mov $bsz,%rax
+ jmp .Loop_squeeze_avx512
+
+.Ltail_squeeze_avx512:
+ mov $out,%rdi
+ mov %r9,%rsi
+ mov $len,%rcx
+ .byte 0xf3,0xa4 # rep movsb
+
+.Ldone_squeeze_avx512:
+ vzeroupper
+
+ lea (%r11),%rsp
+ ret
+.size SHA3_squeeze,.-SHA3_squeeze
+
+.align 64
+theta_perm:
+ .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used]
+ .quad 4, 0, 1, 2, 3, 5, 6, 7
+ .quad 3, 4, 0, 1, 2, 5, 6, 7
+ .quad 2, 3, 4, 0, 1, 5, 6, 7
+ .quad 1, 2, 3, 4, 0, 5, 6, 7
+
+rhotates1:
+ .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4]
+ .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4]
+ .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4]
+ .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4]
+ .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4]
+
+rhotates0:
+ .quad 0, 1, 62, 28, 27, 0, 0, 0
+ .quad 36, 44, 6, 55, 20, 0, 0, 0
+ .quad 3, 10, 43, 25, 39, 0, 0, 0
+ .quad 41, 45, 15, 21, 8, 0, 0, 0
+ .quad 18, 2, 61, 56, 14, 0, 0, 0
+
+pi0_perm:
+ .quad 0, 3, 1, 4, 2, 5, 6, 7
+ .quad 1, 4, 2, 0, 3, 5, 6, 7
+ .quad 2, 0, 3, 1, 4, 5, 6, 7
+ .quad 3, 1, 4, 2, 0, 5, 6, 7
+ .quad 4, 2, 0, 3, 1, 5, 6, 7
+
+
+iotas:
+ .quad 0x0000000000000001
+ .quad 0x0000000000008082
+ .quad 0x800000000000808a
+ .quad 0x8000000080008000
+ .quad 0x000000000000808b
+ .quad 0x0000000080000001
+ .quad 0x8000000080008081
+ .quad 0x8000000000008009
+ .quad 0x000000000000008a
+ .quad 0x0000000000000088
+ .quad 0x0000000080008009
+ .quad 0x000000008000000a
+ .quad 0x000000008000808b
+ .quad 0x800000000000008b
+ .quad 0x8000000000008089
+ .quad 0x8000000000008003
+ .quad 0x8000000000008002
+ .quad 0x8000000000000080
+ .quad 0x000000000000800a
+ .quad 0x800000008000000a
+ .quad 0x8000000080008081
+ .quad 0x8000000000008080
+ .quad 0x0000000080000001
+ .quad 0x8000000080008008
+
+.asciz "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=pop;
+open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/keccak1600-avx512vl.pl b/crypto/sha/asm/keccak1600-avx512vl.pl
new file mode 100755
index 000000000000..a21bb8615a7c
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-avx512vl.pl
@@ -0,0 +1,392 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for AVX512VL.
+#
+# December 2017.
+#
+# This is an adaptation of the AVX2 module that reuses the register
+# data layout but utilizes the new 256-bit AVX512VL instructions. See
+# the AVX2 module for further information on the layout.
+#
+########################################################################
+# Numbers are cycles per processed byte out of a large message.
+#
+# r=1088(*)
+#
+# Skylake-X 6.4/+47%
+#
+# (*) Corresponds to SHA3-256. The percentage after the slash is the
+#     improvement coefficient in comparison to scalar keccak1600-x86_64.pl.
+
+# Digits in variables' names denote right-most coordinates:
+
+my ($A00, # [0][0] [0][0] [0][0] [0][0] # %ymm0
+ $A01, # [0][4] [0][3] [0][2] [0][1] # %ymm1
+ $A20, # [3][0] [1][0] [4][0] [2][0] # %ymm2
+ $A31, # [2][4] [4][3] [1][2] [3][1] # %ymm3
+ $A21, # [3][4] [1][3] [4][2] [2][1] # %ymm4
+ $A41, # [1][4] [2][3] [3][2] [4][1] # %ymm5
+ $A11) = # [4][4] [3][3] [2][2] [1][1] # %ymm6
+ map("%ymm$_",(0..6));
+
+# We also need to map the magic order into offsets within structure:
+
+my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
+ [2,2], [6,0], [3,1], [4,2], [5,3], # [1][0..4]
+ [2,0], [4,0], [6,1], [5,2], [3,3], # [2][0..4]
+ [2,3], [3,0], [5,1], [6,2], [4,3], # [3][0..4]
+ [2,1], [5,0], [4,1], [3,2], [6,3]); # [4][0..4]
+ @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged); # ... and now linear
+
+my @T = map("%ymm$_",(7..15));
+my ($C14,$C00,$D00,$D14) = @T[5..8];
+my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));
+
+$code.=<<___;
+.text
+
+.type __KeccakF1600,\@function
+.align 32
+__KeccakF1600:
+ lea iotas(%rip),%r10
+ mov \$24,%eax
+ jmp .Loop_avx512vl
+
+.align 32
+.Loop_avx512vl:
+ ######################################### Theta
+ vpshufd \$0b01001110,$A20,$C00
+ vpxor $A31,$A41,$C14
+ vpxor $A11,$A21,@T[2]
+ vpternlogq \$0x96,$A01,$T[2],$C14 # C[1..4]
+
+ vpxor $A20,$C00,$C00
+ vpermq \$0b01001110,$C00,@T[0]
+
+ vpermq \$0b10010011,$C14,@T[4]
+ vprolq \$1,$C14,@T[1] # ROL64(C[1..4],1)
+
+ vpermq \$0b00111001,@T[1],$D14
+ vpxor @T[4],@T[1],$D00
+ vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
+
+ vpternlogq \$0x96,@T[0],$A00,$C00 # C[0..0]
+ vprolq \$1,$C00,@T[1] # ROL64(C[0..0],1)
+
+ vpxor $D00,$A00,$A00 # ^= D[0..0]
+
+ vpblendd \$0b11000000,@T[1],$D14,$D14
+ vpblendd \$0b00000011,$C00,@T[4],@T[0]
+
+ ######################################### Rho + Pi + pre-Chi shuffle
+ vpxor $D00,$A20,$A20 # ^= D[0..0] from Theta
+ vprolvq $R20,$A20,$A20
+
+ vpternlogq \$0x96,@T[0],$D14,$A31 # ^= D[1..4] from Theta
+ vprolvq $R31,$A31,$A31
+
+ vpternlogq \$0x96,@T[0],$D14,$A21 # ^= D[1..4] from Theta
+ vprolvq $R21,$A21,$A21
+
+ vpternlogq \$0x96,@T[0],$D14,$A41 # ^= D[1..4] from Theta
+ vprolvq $R41,$A41,$A41
+
+ vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31
+ vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21
+ vpternlogq \$0x96,@T[0],$D14,$A11 # ^= D[1..4] from Theta
+ vprolvq $R11,$A11,@T[1] # $A11 -> future $A01
+
+ vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41
+ vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11
+ vpternlogq \$0x96,@T[0],$D14,$A01 # ^= D[1..4] from Theta
+ vprolvq $R01,$A01,@T[2] # $A01 -> future $A20
+
+ ######################################### Chi
+ vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
+ vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
+ vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
+ vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0]
+ vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
+ vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
+ vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
+ vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0]
+ vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
+ vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
+ vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
+ vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0]
+ vpternlogq \$0xC6,@T[8],@T[3],$A31 # [3][1] [1][2] [4][3] [2][4]
+ vpternlogq \$0xC6,@T[7],@T[5],$A41 # [3][2] [1][4] [4][1] [2][3]
+
+ vpsrldq \$8,@T[1],@T[0]
+	vpandn	@T[0],@T[1],@T[0]	# targeting [0][0] [0][0] [0][0] [0][0]
+
+ vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
+ vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4]
+ vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
+ vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4]
+ vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
+ vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4]
+ vpternlogq \$0xC6,@T[8],@T[6],$A11 # [3][3] [1][1] [4][4] [2][2]
+
+ vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3]
+ vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3]
+ vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
+ vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
+
+ vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
+ vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
+ vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
+ vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
+ vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
+ vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
+ vpternlogq \$0xC6,@T[7],@T[2],$A20 # [3][0] [1][0] [4][0] [2][0]
+
+ vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0]
+ vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
+ vpermq \$0b10001101,$A41,$A41
+ vpermq \$0b01110010,$A11,$A11
+
+ vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
+ vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
+ vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
+ vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
+ vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
+ vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
+
+ vpternlogq \$0xC6,@T[8],@T[1],$A01 # [0][4] [0][3] [0][2] [0][1]
+ vpternlogq \$0xC6,@T[7],@T[4],$A21 # [3][4] [1][3] [4][2] [2][1]
+
+ ######################################### Iota
+ vpternlogq \$0x96,(%r10),@T[0],$A00
+ lea 32(%r10),%r10
+
+ dec %eax
+ jnz .Loop_avx512vl
+
+ ret
+.size __KeccakF1600,.-__KeccakF1600
+___
+my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
+my $out = $inp; # in squeeze
+
+$code.=<<___;
+.globl SHA3_absorb
+.type SHA3_absorb,\@function
+.align 32
+SHA3_absorb:
+ mov %rsp,%r11
+
+ lea -240(%rsp),%rsp
+ and \$-32,%rsp
+
+ lea 96($A_flat),$A_flat
+ lea 96($inp),$inp
+ lea 96(%rsp),%r10
+ lea rhotates_left(%rip),%r8
+
+ vzeroupper
+
+ vpbroadcastq -96($A_flat),$A00 # load A[5][5]
+ vmovdqu 8+32*0-96($A_flat),$A01
+ vmovdqu 8+32*1-96($A_flat),$A20
+ vmovdqu 8+32*2-96($A_flat),$A31
+ vmovdqu 8+32*3-96($A_flat),$A21
+ vmovdqu 8+32*4-96($A_flat),$A41
+ vmovdqu 8+32*5-96($A_flat),$A11
+
+ vmovdqa64 0*32(%r8),$R20 # load "rhotate" indices
+ vmovdqa64 1*32(%r8),$R01
+ vmovdqa64 2*32(%r8),$R31
+ vmovdqa64 3*32(%r8),$R21
+ vmovdqa64 4*32(%r8),$R41
+ vmovdqa64 5*32(%r8),$R11
+
+ vpxor @T[0],@T[0],@T[0]
+ vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack
+ vmovdqa @T[0],32*3-96(%r10)
+ vmovdqa @T[0],32*4-96(%r10)
+ vmovdqa @T[0],32*5-96(%r10)
+ vmovdqa @T[0],32*6-96(%r10)
+
+.Loop_absorb_avx512vl:
+ mov $bsz,%rax
+ sub $bsz,$len
+ jc .Ldone_absorb_avx512vl
+
+ shr \$3,%eax
+ vpbroadcastq 0-96($inp),@T[0]
+ vmovdqu 8-96($inp),@T[1]
+ sub \$4,%eax
+___
+for(my $i=5; $i<25; $i++) {
+$code.=<<___
+ dec %eax
+ jz .Labsorved_avx512vl
+ mov 8*$i-96($inp),%r8
+ mov %r8,$A_jagged[$i]-96(%r10)
+___
+}
+$code.=<<___;
+.Labsorved_avx512vl:
+ lea ($inp,$bsz),$inp
+
+ vpxor @T[0],$A00,$A00
+ vpxor @T[1],$A01,$A01
+ vpxor 32*2-96(%r10),$A20,$A20
+ vpxor 32*3-96(%r10),$A31,$A31
+ vpxor 32*4-96(%r10),$A21,$A21
+ vpxor 32*5-96(%r10),$A41,$A41
+ vpxor 32*6-96(%r10),$A11,$A11
+
+ call __KeccakF1600
+
+ lea 96(%rsp),%r10
+ jmp .Loop_absorb_avx512vl
+
+.Ldone_absorb_avx512vl:
+ vmovq %xmm0,-96($A_flat)
+ vmovdqu $A01,8+32*0-96($A_flat)
+ vmovdqu $A20,8+32*1-96($A_flat)
+ vmovdqu $A31,8+32*2-96($A_flat)
+ vmovdqu $A21,8+32*3-96($A_flat)
+ vmovdqu $A41,8+32*4-96($A_flat)
+ vmovdqu $A11,8+32*5-96($A_flat)
+
+ vzeroupper
+
+ lea (%r11),%rsp
+ lea ($len,$bsz),%rax # return value
+ ret
+.size SHA3_absorb,.-SHA3_absorb
+
+.globl SHA3_squeeze
+.type SHA3_squeeze,\@function
+.align 32
+SHA3_squeeze:
+ mov %rsp,%r11
+
+ lea 96($A_flat),$A_flat
+ lea rhotates_left(%rip),%r8
+ shr \$3,$bsz
+
+ vzeroupper
+
+ vpbroadcastq -96($A_flat),$A00
+ vpxor @T[0],@T[0],@T[0]
+ vmovdqu 8+32*0-96($A_flat),$A01
+ vmovdqu 8+32*1-96($A_flat),$A20
+ vmovdqu 8+32*2-96($A_flat),$A31
+ vmovdqu 8+32*3-96($A_flat),$A21
+ vmovdqu 8+32*4-96($A_flat),$A41
+ vmovdqu 8+32*5-96($A_flat),$A11
+
+ vmovdqa64 0*32(%r8),$R20 # load "rhotate" indices
+ vmovdqa64 1*32(%r8),$R01
+ vmovdqa64 2*32(%r8),$R31
+ vmovdqa64 3*32(%r8),$R21
+ vmovdqa64 4*32(%r8),$R41
+ vmovdqa64 5*32(%r8),$R11
+
+ mov $bsz,%rax
+
+.Loop_squeeze_avx512vl:
+ mov @A_jagged[$i]-96($A_flat),%r8
+___
+for (my $i=0; $i<25; $i++) {
+$code.=<<___;
+ sub \$8,$len
+ jc .Ltail_squeeze_avx512vl
+ mov %r8,($out)
+ lea 8($out),$out
+ je .Ldone_squeeze_avx512vl
+ dec %eax
+ je .Lextend_output_avx512vl
+ mov @A_jagged[$i+1]-120($A_flat),%r8
+___
+}
+$code.=<<___;
+.Lextend_output_avx512vl:
+ call __KeccakF1600
+
+ vmovq %xmm0,-96($A_flat)
+ vmovdqu $A01,8+32*0-96($A_flat)
+ vmovdqu $A20,8+32*1-96($A_flat)
+ vmovdqu $A31,8+32*2-96($A_flat)
+ vmovdqu $A21,8+32*3-96($A_flat)
+ vmovdqu $A41,8+32*4-96($A_flat)
+ vmovdqu $A11,8+32*5-96($A_flat)
+
+ mov $bsz,%rax
+ jmp .Loop_squeeze_avx512vl
+
+
+.Ltail_squeeze_avx512vl:
+ add \$8,$len
+.Loop_tail_avx512vl:
+ mov %r8b,($out)
+ lea 1($out),$out
+ shr \$8,%r8
+ dec $len
+ jnz .Loop_tail_avx512vl
+
+.Ldone_squeeze_avx512vl:
+ vzeroupper
+
+ lea (%r11),%rsp
+ ret
+.size SHA3_squeeze,.-SHA3_squeeze
+
+.align 64
+rhotates_left:
+ .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0]
+ .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4]
+ .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4]
+ .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4]
+ .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4]
+ .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4]
+iotas:
+ .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
+ .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
+ .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
+ .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
+ .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
+ .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
+ .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
+ .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
+ .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
+ .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
+ .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
+ .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
+ .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
+ .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
+ .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
+ .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
+ .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
+ .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
+ .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
+ .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
+ .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
+ .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
+ .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
+ .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
+
+.asciz "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=pop;
+open STDOUT,">$output";
+print $code;
+close STDOUT;
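
Both AVX-512 modules above lean heavily on vpternlogq: immediate 0x96 selects the three-way XOR used throughout Theta (collapsing pairs of vpxor from the AVX2 original into single instructions), and immediate 0xC6 folds the Chi step's vpandn/vpxor pair. A minimal C sketch, written for this note rather than taken from the patch, showing how such a truth-table immediate is derived:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * vpternlogq computes, for every bit position, f(a, b, c) where a is the
     * old bit of the destination register, b and c are the bits of the two
     * source operands (Intel-syntax order), and bit (a*4 + b*2 + c) of the
     * immediate gives f(a, b, c).  For a symmetric function such as XOR the
     * operand order does not matter.
     */
    static uint8_t ternlog_imm(int (*f)(int, int, int))
    {
        uint8_t imm = 0;

        for (int a = 0; a < 2; a++)
            for (int b = 0; b < 2; b++)
                for (int c = 0; c < 2; c++)
                    imm |= (uint8_t)(f(a, b, c) << (a * 4 + b * 2 + c));
        return imm;
    }

    static int xor3(int a, int b, int c) { return a ^ b ^ c; }

    int main(void)
    {
        printf("0x%02x\n", ternlog_imm(xor3));      /* prints 0x96 */
        return 0;
    }
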
diff --git a/crypto/sha/asm/keccak1600-c64x.pl b/crypto/sha/asm/keccak1600-c64x.pl
new file mode 100755
index 000000000000..b00af9af91d8
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-c64x.pl
@@ -0,0 +1,885 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [ABI- and endian-neutral] Keccak-1600 for C64x.
+#
+# June 2017.
+#
+# This is a straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c)
+# with bit interleaving. 64-bit values are simply split between A- and
+# B-files, with A-file holding least significant halves. This works
+# out perfectly, because all operations including cross-communications
+# [in rotate operations] are always complementary. Performance is
+# [incredible for a 32-bit processor] 10.9 cycles per processed byte
+# for r=1088, which corresponds to SHA3-256. This is >15x faster than
+# compiler-generated KECCAK_1X_ALT code, and >10x than other variants.
+# On average the processor ends up issuing ~4.5 instructions per cycle...
+
+my @A = map([ $_, ($_+1), ($_+2), ($_+3), ($_+4) ], (5,10,16,21,26));
+ $A[1][4] = 31; # B14 is reserved, A14 is used as iota[]
+ ($A[3][0],$A[4][1]) = ($A[4][1],$A[3][0]);
+my @C = (0..4,$A[3][0],$A[4][0]);
+my $iotas = "A14";
+
+my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 36, 44, 6, 55, 20 ],
+ [ 3, 10, 43, 25, 39 ],
+ [ 41, 45, 15, 21, 8 ],
+ [ 18, 2, 61, 56, 14 ]);
+
+sub ROL64 {
+ my ($src,$rot,$dst,$p) = @_;
+
+ if ($rot&1) {
+$code.=<<___;
+$p ROTL B$src,$rot/2+1,A$dst
+|| ROTL A$src,$rot/2, B$dst
+___
+ } else {
+$code.=<<___;
+$p ROTL A$src,$rot/2,A$dst
+|| ROTL B$src,$rot/2,B$dst
+___
+ }
+}
+
+########################################################################
+# Stack frame layout
+#
+# SP--->+------+------+
+# | | |
+# +1--->+------+------+<- -9 below 4 slots are used by KeccakF1600_int
+# | | |
+# +2--->+------+------+<- -8
+# | | |
+# +3--->+------+------+<- -7
+# | A2 | A3 | A3:A2 are preserved by KeccakF1600_int
+# +4--->+------+------+<- -6
+# | B2 | B3 | B3:B2 are preserved by KeccakF1600_int
+# +5--->+------+------+<- -5 below is ABI-compliant layout
+# | A10 | A11 |
+# +6--->+------+------+<- -4
+# | A12 | A13 |
+# +7--->+------+------+<- -3
+# | A14 | B3 |
+# +8--->+------+------+<- -2
+# | B10 | B11 |
+# +9--->+------+------+<- -1
+# | B12 | B13 |
+# +------+------+<---FP
+# | A15 |
+# +------+--
+
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg KeccakF1600,_KeccakF1600
+ .asg SHA3_absorb,_SHA3_absorb
+ .asg SHA3_squeeze,_SHA3_squeeze
+ .endif
+
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+
+ .align 32
+_KeccakF1600_int:
+ .asmfunc
+ STDW A3:A2,*FP[-7]
+|| STDW B3:B2,*SP[4]
+_KeccakF1600_cheat:
+ .if __TI_EABI__
+ ADDKPC _KeccakF1600_int,B0
+|| MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
+ MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
+ .else
+ ADDKPC _KeccakF1600_int,B0
+|| MVKL (iotas-_KeccakF1600_int),$iotas
+ MVKH (iotas-_KeccakF1600_int),$iotas
+ .endif
+ ADD B0,$iotas,$iotas
+loop?:
+ XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta
+|| XOR B$A[0][2],B$A[1][2],B$C[2]
+|| XOR A$A[0][3],A$A[1][3],A$C[3]
+|| XOR B$A[0][3],B$A[1][3],B$C[3]
+|| XOR A$A[0][0],A$A[1][0],A$C[0]
+|| XOR B$A[0][0],B$A[1][0],B$C[0]
+ XOR A$A[2][2],A$C[2],A$C[2]
+|| XOR B$A[2][2],B$C[2],B$C[2]
+|| XOR A$A[2][3],A$C[3],A$C[3]
+|| XOR B$A[2][3],B$C[3],B$C[3]
+|| XOR A$A[2][0],A$C[0],A$C[0]
+|| XOR B$A[2][0],B$C[0],B$C[0]
+ XOR A$A[3][2],A$C[2],A$C[2]
+|| XOR B$A[3][2],B$C[2],B$C[2]
+|| XOR A$A[3][3],A$C[3],A$C[3]
+|| XOR B$A[3][3],B$C[3],B$C[3]
+|| XOR A$A[3][0],A$C[0],A$C[0]
+|| XOR B$A[3][0],B$C[0],B$C[0]
+ XOR A$A[4][2],A$C[2],A$C[2]
+|| XOR B$A[4][2],B$C[2],B$C[2]
+|| XOR A$A[4][3],A$C[3],A$C[3]
+|| XOR B$A[4][3],B$C[3],B$C[3]
+|| XOR A$A[4][0],A$C[0],A$C[0]
+|| XOR B$A[4][0],B$C[0],B$C[0]
+ XOR A$A[0][4],A$A[1][4],A$C[4]
+|| XOR B$A[0][4],B$A[1][4],B$C[4]
+|| XOR A$A[0][1],A$A[1][1],A$C[1]
+|| XOR B$A[0][1],B$A[1][1],B$C[1]
+|| STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data
+ STDW B$A[3][0]:B$A[4][0],*SP[2]
+|| XOR A$A[2][4],A$C[4],A$C[4]
+|| XOR B$A[2][4],B$C[4],B$C[4]
+|| XOR A$A[2][1],A$C[1],A$C[1]
+|| XOR B$A[2][1],B$C[1],B$C[1]
+|| ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1)
+|| ROTL A$C[2],0,B$C[5]
+ XOR A$A[3][4],A$C[4],A$C[4]
+|| XOR B$A[3][4],B$C[4],B$C[4]
+|| XOR A$A[3][1],A$C[1],A$C[1]
+|| XOR B$A[3][1],B$C[1],B$C[1]
+|| ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1)
+|| ROTL A$C[3],0,B$C[6]
+ XOR A$A[4][4],A$C[4],A$C[4]
+|| XOR B$A[4][4],B$C[4],B$C[4]
+|| XOR A$A[4][1],A$C[1],A$C[1]
+|| XOR B$A[4][1],B$C[1],B$C[1]
+|| XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1)
+|| XOR B$C[0],B$C[5],B$C[5]
+ XOR A$C[5],A$A[0][1],A$A[0][1]
+|| XOR B$C[5],B$A[0][1],B$A[0][1]
+|| XOR A$C[5],A$A[1][1],A$A[1][1]
+|| XOR B$C[5],B$A[1][1],B$A[1][1]
+|| XOR A$C[5],A$A[2][1],A$A[2][1]
+|| XOR B$C[5],B$A[2][1],B$A[2][1]
+ XOR A$C[5],A$A[3][1],A$A[3][1]
+|| XOR B$C[5],B$A[3][1],B$A[3][1]
+|| XOR A$C[5],A$A[4][1],A$A[4][1]
+|| XOR B$C[5],B$A[4][1],B$A[4][1]
+|| ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1)
+|| ROTL A$C[4],0,B$C[5]
+|| XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1)
+|| XOR B$C[1],B$C[6],B$C[6]
+ XOR A$C[6],A$A[0][2],A$A[0][2]
+|| XOR B$C[6],B$A[0][2],B$A[0][2]
+|| XOR A$C[6],A$A[1][2],A$A[1][2]
+|| XOR B$C[6],B$A[1][2],B$A[1][2]
+|| XOR A$C[6],A$A[2][2],A$A[2][2]
+|| XOR B$C[6],B$A[2][2],B$A[2][2]
+|| ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1)
+|| ROTL A$C[1],0,B$C[1]
+ XOR A$C[6],A$A[3][2],A$A[3][2]
+|| XOR B$C[6],B$A[3][2],B$A[3][2]
+|| XOR A$C[6],A$A[4][2],A$A[4][2]
+|| XOR B$C[6],B$A[4][2],B$A[4][2]
+|| ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1)
+|| ROTL A$C[0],0,B$C[6]
+|| XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1)
+|| XOR B$C[5],B$C[2],B$C[2]
+ XOR A$C[2],A$A[0][3],A$A[0][3]
+|| XOR B$C[2],B$A[0][3],B$A[0][3]
+|| XOR A$C[2],A$A[1][3],A$A[1][3]
+|| XOR B$C[2],B$A[1][3],B$A[1][3]
+|| XOR A$C[2],A$A[2][3],A$A[2][3]
+|| XOR B$C[2],B$A[2][3],B$A[2][3]
+ XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1)
+|| XOR B$C[6],B$C[3],B$C[3]
+|| LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data
+|| LDDW *SP[2],B$A[3][0]:B$A[4][0]
+|| XOR A$C[2],A$A[3][3],A$A[3][3]
+|| XOR B$C[2],B$A[3][3],B$A[3][3]
+ XOR A$C[2],A$A[4][3],A$A[4][3]
+|| XOR B$C[2],B$A[4][3],B$A[4][3]
+|| XOR A$C[3],A$A[0][4],A$A[0][4]
+|| XOR B$C[3],B$A[0][4],B$A[0][4]
+|| XOR A$C[3],A$A[1][4],A$A[1][4]
+|| XOR B$C[3],B$A[1][4],B$A[1][4]
+ XOR A$C[3],A$A[2][4],A$A[2][4]
+|| XOR B$C[3],B$A[2][4],B$A[2][4]
+|| XOR A$C[3],A$A[3][4],A$A[3][4]
+|| XOR B$C[3],B$A[3][4],B$A[3][4]
+|| XOR A$C[3],A$A[4][4],A$A[4][4]
+|| XOR B$C[3],B$A[4][4],B$A[4][4]
+ XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1)
+|| XOR B$C[1],B$C[4],B$C[4]
+|| MV A$A[0][1],A$C[1] ; Rho+Pi, "early start"
+|| MV B$A[0][1],B$C[1]
+___
+ &ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||");
+$code.=<<___;
+ XOR A$C[4],A$A[0][0],A$A[0][0]
+|| XOR B$C[4],B$A[0][0],B$A[0][0]
+|| XOR A$C[4],A$A[1][0],A$A[1][0]
+|| XOR B$C[4],B$A[1][0],B$A[1][0]
+|| MV A$A[0][3],A$C[3]
+|| MV B$A[0][3],B$C[3]
+___
+ &ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||");
+$code.=<<___;
+ XOR A$C[4],A$A[2][0],A$A[2][0]
+|| XOR B$C[4],B$A[2][0],B$A[2][0]
+|| XOR A$C[4],A$A[3][0],A$A[3][0]
+|| XOR B$C[4],B$A[3][0],B$A[3][0]
+|| MV A$A[0][2],A$C[2]
+|| MV B$A[0][2],B$C[2]
+___
+ &ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||");
+$code.=<<___;
+ XOR A$C[4],A$A[4][0],A$A[4][0]
+|| XOR B$C[4],B$A[4][0],B$A[4][0]
+|| MV A$A[0][4],A$C[4]
+|| MV B$A[0][4],B$C[4]
+___
+ &ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||");
+
+ &ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]);
+$code.=<<___;
+|| LDW *${iotas}++[2],A$C[0]
+___
+ &ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]);
+$code.=<<___;
+|| LDW *${iotas}[-1],B$C[0]
+___
+ &ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]);
+ &ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]);
+
+ &ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]);
+ &ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]);
+ &ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]);
+ &ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]);
+
+ &ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]);
+ &ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]);
+ &ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]);
+ &ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]);
+
+ &ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]);
+ &ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]);
+ &ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]);
+ &ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]);
+
+ #&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below
+ &ROL64 ($C[1], $rhotates[0][1],$A[2][0]);
+ &ROL64 ($C[4], $rhotates[0][4],$A[3][0]);
+ &ROL64 ($C[2], $rhotates[0][2],$A[4][0]);
+$code.=<<___;
+|| ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota
+|| ANDN B$A[0][2],B$A[0][1],B$C[4]
+|| ANDN A$A[0][3],A$A[0][2],A$C[1]
+|| ANDN B$A[0][3],B$A[0][2],B$C[1]
+|| ANDN A$A[0][4],A$A[0][3],A$C[2]
+|| ANDN B$A[0][4],B$A[0][3],B$C[2]
+___
+ &ROL64 ($C[3], $rhotates[0][3],$A[1][0]);
+$code.=<<___;
+|| ANDN A$A[0][0],A$A[0][4],A$C[3]
+|| ANDN B$A[0][0],B$A[0][4],B$C[3]
+|| XOR A$C[4],A$A[0][0],A$A[0][0]
+|| XOR B$C[4],B$A[0][0],B$A[0][0]
+|| ANDN A$A[0][1],A$A[0][0],A$C[4]
+|| ANDN B$A[0][1],B$A[0][0],B$C[4]
+ XOR A$C[1],A$A[0][1],A$A[0][1]
+|| XOR B$C[1],B$A[0][1],B$A[0][1]
+|| XOR A$C[2],A$A[0][2],A$A[0][2]
+|| XOR B$C[2],B$A[0][2],B$A[0][2]
+|| XOR A$C[3],A$A[0][3],A$A[0][3]
+|| XOR B$C[3],B$A[0][3],B$A[0][3]
+ XOR A$C[4],A$A[0][4],A$A[0][4]
+|| XOR B$C[4],B$A[0][4],B$A[0][4]
+|| XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++];
+|| XOR B$C[0],B$A[0][0],B$A[0][0]
+||	EXTU	$iotas,24,24,A0			; A0 is A$C[0], are we done?
+
+ ANDN A$A[1][2],A$A[1][1],A$C[4]
+|| ANDN B$A[1][2],B$A[1][1],B$C[4]
+|| ANDN A$A[1][3],A$A[1][2],A$C[1]
+|| ANDN B$A[1][3],B$A[1][2],B$C[1]
+|| ANDN A$A[1][4],A$A[1][3],A$C[2]
+|| ANDN B$A[1][4],B$A[1][3],B$C[2]
+ ANDN A$A[1][0],A$A[1][4],A$C[3]
+|| ANDN B$A[1][0],B$A[1][4],B$C[3]
+|| XOR A$C[4],A$A[1][0],A$A[1][0]
+|| XOR B$C[4],B$A[1][0],B$A[1][0]
+|| ANDN A$A[1][1],A$A[1][0],A$C[4]
+|| ANDN B$A[1][1],B$A[1][0],B$C[4]
+ XOR A$C[1],A$A[1][1],A$A[1][1]
+|| XOR B$C[1],B$A[1][1],B$A[1][1]
+|| XOR A$C[2],A$A[1][2],A$A[1][2]
+|| XOR B$C[2],B$A[1][2],B$A[1][2]
+|| XOR A$C[3],A$A[1][3],A$A[1][3]
+|| XOR B$C[3],B$A[1][3],B$A[1][3]
+ XOR A$C[4],A$A[1][4],A$A[1][4]
+|| XOR B$C[4],B$A[1][4],B$A[1][4]
+
+|| ANDN A$A[2][2],A$A[2][1],A$C[4]
+|| ANDN B$A[2][2],B$A[2][1],B$C[4]
+|| ANDN A$A[2][3],A$A[2][2],A$C[1]
+|| ANDN B$A[2][3],B$A[2][2],B$C[1]
+ ANDN A$A[2][4],A$A[2][3],A$C[2]
+|| ANDN B$A[2][4],B$A[2][3],B$C[2]
+|| ANDN A$A[2][0],A$A[2][4],A$C[3]
+|| ANDN B$A[2][0],B$A[2][4],B$C[3]
+|| XOR A$C[4],A$A[2][0],A$A[2][0]
+|| XOR B$C[4],B$A[2][0],B$A[2][0]
+ ANDN A$A[2][1],A$A[2][0],A$C[4]
+|| ANDN B$A[2][1],B$A[2][0],B$C[4]
+|| XOR A$C[1],A$A[2][1],A$A[2][1]
+|| XOR B$C[1],B$A[2][1],B$A[2][1]
+|| XOR A$C[2],A$A[2][2],A$A[2][2]
+|| XOR B$C[2],B$A[2][2],B$A[2][2]
+ XOR A$C[3],A$A[2][3],A$A[2][3]
+|| XOR B$C[3],B$A[2][3],B$A[2][3]
+|| XOR A$C[4],A$A[2][4],A$A[2][4]
+|| XOR B$C[4],B$A[2][4],B$A[2][4]
+
+ ANDN A$A[3][2],A$A[3][1],A$C[4]
+|| ANDN B$A[3][2],B$A[3][1],B$C[4]
+|| ANDN A$A[3][3],A$A[3][2],A$C[1]
+|| ANDN B$A[3][3],B$A[3][2],B$C[1]
+|| ANDN A$A[3][4],A$A[3][3],A$C[2]
+|| ANDN B$A[3][4],B$A[3][3],B$C[2]
+ ANDN A$A[3][0],A$A[3][4],A$C[3]
+|| ANDN B$A[3][0],B$A[3][4],B$C[3]
+|| XOR A$C[4],A$A[3][0],A$A[3][0]
+|| XOR B$C[4],B$A[3][0],B$A[3][0]
+|| ANDN A$A[3][1],A$A[3][0],A$C[4]
+|| ANDN B$A[3][1],B$A[3][0],B$C[4]
+ XOR A$C[1],A$A[3][1],A$A[3][1]
+|| XOR B$C[1],B$A[3][1],B$A[3][1]
+|| XOR A$C[2],A$A[3][2],A$A[3][2]
+|| XOR B$C[2],B$A[3][2],B$A[3][2]
+|| XOR A$C[3],A$A[3][3],A$A[3][3]
+||[A0] BNOP loop?
+ XOR B$C[3],B$A[3][3],B$A[3][3]
+|| XOR A$C[4],A$A[3][4],A$A[3][4]
+|| XOR B$C[4],B$A[3][4],B$A[3][4]
+||[!A0] LDDW *FP[-7],A3:A2
+||[!A0] LDDW *SP[4], RA:B2
+
+ ANDN A$A[4][2],A$A[4][1],A$C[4]
+|| ANDN B$A[4][2],B$A[4][1],B$C[4]
+|| ANDN A$A[4][3],A$A[4][2],A$C[1]
+|| ANDN B$A[4][3],B$A[4][2],B$C[1]
+|| ANDN A$A[4][4],A$A[4][3],A$C[2]
+|| ANDN B$A[4][4],B$A[4][3],B$C[2]
+ ANDN A$A[4][0],A$A[4][4],A$C[3]
+|| ANDN B$A[4][0],B$A[4][4],B$C[3]
+|| XOR A$C[4],A$A[4][0],A$A[4][0]
+|| XOR B$C[4],B$A[4][0],B$A[4][0]
+|| ANDN A$A[4][1],A$A[4][0],A$C[4]
+|| ANDN B$A[4][1],B$A[4][0],B$C[4]
+ XOR A$C[1],A$A[4][1],A$A[4][1]
+|| XOR B$C[1],B$A[4][1],B$A[4][1]
+|| XOR A$C[2],A$A[4][2],A$A[4][2]
+|| XOR B$C[2],B$A[4][2],B$A[4][2]
+|| XOR A$C[3],A$A[4][3],A$A[4][3]
+|| XOR B$C[3],B$A[4][3],B$A[4][3]
+ XOR A$C[4],A$A[4][4],A$A[4][4]
+|| XOR B$C[4],B$A[4][4],B$A[4][4]
+;;===== branch to loop? is taken here
+
+ BNOP RA,5
+ .endasmfunc
+
+ .newblock
+ .global _KeccakF1600
+ .align 32
+_KeccakF1600:
+ .asmfunc stack_usage(80)
+ STW FP,*SP--(80) ; save frame pointer
+|| MV SP,FP
+ STDW B13:B12,*SP[9]
+|| STDW A13:A12,*FP[-4]
+ STDW B11:B10,*SP[8]
+|| STDW A11:A10,*FP[-5]
+ STW RA, *SP[15]
+|| STW A14,*FP[-6]
+|| MV A4,A2
+|| ADD 4,A4,B2
+
+ LDW *A2++[2],A$A[0][0] ; load A[5][5]
+|| LDW *B2++[2],B$A[0][0]
+ LDW *A2++[2],A$A[0][1]
+|| LDW *B2++[2],B$A[0][1]
+ LDW *A2++[2],A$A[0][2]
+|| LDW *B2++[2],B$A[0][2]
+ LDW *A2++[2],A$A[0][3]
+|| LDW *B2++[2],B$A[0][3]
+ LDW *A2++[2],A$A[0][4]
+|| LDW *B2++[2],B$A[0][4]
+
+ LDW *A2++[2],A$A[1][0]
+|| LDW *B2++[2],B$A[1][0]
+ LDW *A2++[2],A$A[1][1]
+|| LDW *B2++[2],B$A[1][1]
+ LDW *A2++[2],A$A[1][2]
+|| LDW *B2++[2],B$A[1][2]
+ LDW *A2++[2],A$A[1][3]
+|| LDW *B2++[2],B$A[1][3]
+ LDW *A2++[2],A$A[1][4]
+|| LDW *B2++[2],B$A[1][4]
+
+ LDW *A2++[2],A$A[2][0]
+|| LDW *B2++[2],B$A[2][0]
+ LDW *A2++[2],A$A[2][1]
+|| LDW *B2++[2],B$A[2][1]
+ LDW *A2++[2],A$A[2][2]
+|| LDW *B2++[2],B$A[2][2]
+ LDW *A2++[2],A$A[2][3]
+|| LDW *B2++[2],B$A[2][3]
+ LDW *A2++[2],A$A[2][4]
+|| LDW *B2++[2],B$A[2][4]
+
+ LDW *A2++[2],A$A[3][0]
+|| LDW *B2++[2],B$A[3][0]
+ LDW *A2++[2],A$A[3][1]
+|| LDW *B2++[2],B$A[3][1]
+ LDW *A2++[2],A$A[3][2]
+|| LDW *B2++[2],B$A[3][2]
+ LDW *A2++[2],A$A[3][3]
+|| LDW *B2++[2],B$A[3][3]
+ LDW *A2++[2],A$A[3][4]
+|| LDW *B2++[2],B$A[3][4]
+|| BNOP _KeccakF1600_int
+
+ ADDKPC ret?,RA
+|| LDW *A2++[2],A$A[4][0]
+|| LDW *B2++[2],B$A[4][0]
+ LDW *A2++[2],A$A[4][1]
+|| LDW *B2++[2],B$A[4][1]
+ LDW *A2++[2],A$A[4][2]
+|| LDW *B2++[2],B$A[4][2]
+ LDW *A2++[2],A$A[4][3]
+|| LDW *B2++[2],B$A[4][3]
+ LDW *A2,A$A[4][4]
+|| LDW *B2,B$A[4][4]
+|| ADDK -192,A2 ; rewind
+|| ADDK -192,B2
+
+ .align 16
+ret?:
+ STW A$A[0][0],*A2++[2] ; store A[5][5]
+|| STW B$A[0][0],*B2++[2]
+ STW A$A[0][1],*A2++[2]
+|| STW B$A[0][1],*B2++[2]
+ STW A$A[0][2],*A2++[2]
+|| STW B$A[0][2],*B2++[2]
+ STW A$A[0][3],*A2++[2]
+|| STW B$A[0][3],*B2++[2]
+ STW A$A[0][4],*A2++[2]
+|| STW B$A[0][4],*B2++[2]
+
+ STW A$A[1][0],*A2++[2]
+|| STW B$A[1][0],*B2++[2]
+ STW A$A[1][1],*A2++[2]
+|| STW B$A[1][1],*B2++[2]
+ STW A$A[1][2],*A2++[2]
+|| STW B$A[1][2],*B2++[2]
+ STW A$A[1][3],*A2++[2]
+|| STW B$A[1][3],*B2++[2]
+ STW A$A[1][4],*A2++[2]
+|| STW B$A[1][4],*B2++[2]
+
+ STW A$A[2][0],*A2++[2]
+|| STW B$A[2][0],*B2++[2]
+ STW A$A[2][1],*A2++[2]
+|| STW B$A[2][1],*B2++[2]
+ STW A$A[2][2],*A2++[2]
+|| STW B$A[2][2],*B2++[2]
+ STW A$A[2][3],*A2++[2]
+|| STW B$A[2][3],*B2++[2]
+ STW A$A[2][4],*A2++[2]
+|| STW B$A[2][4],*B2++[2]
+
+ STW A$A[3][0],*A2++[2]
+|| STW B$A[3][0],*B2++[2]
+ STW A$A[3][1],*A2++[2]
+|| STW B$A[3][1],*B2++[2]
+ STW A$A[3][2],*A2++[2]
+|| STW B$A[3][2],*B2++[2]
+ STW A$A[3][3],*A2++[2]
+|| STW B$A[3][3],*B2++[2]
+ STW A$A[3][4],*A2++[2]
+|| STW B$A[3][4],*B2++[2]
+
+ LDW *SP[15],RA
+|| LDW *FP[-6],A14
+
+ STW A$A[4][0],*A2++[2]
+|| STW B$A[4][0],*B2++[2]
+ STW A$A[4][1],*A2++[2]
+|| STW B$A[4][1],*B2++[2]
+ STW A$A[4][2],*A2++[2]
+|| STW B$A[4][2],*B2++[2]
+ STW A$A[4][3],*A2++[2]
+|| STW B$A[4][3],*B2++[2]
+ STW A$A[4][4],*A2
+|| STW B$A[4][4],*B2
+|| ADDK -192,A2 ; rewind
+
+ MV A2,A4 ; return original A4
+|| LDDW *SP[8], B11:B10
+|| LDDW *FP[-5],A11:A10
+ LDDW *SP[9], B13:B12
+|| LDDW *FP[-4],A13:A12
+|| BNOP RA
+ LDW *++SP(80),FP ; restore frame pointer
+ NOP 4 ; wait till FP is committed
+ .endasmfunc
+
+ .newblock
+ .asg B2,BSZ
+ .asg A2,INP
+ .asg A3,LEN
+ .global _SHA3_absorb
+ .align 32
+_SHA3_absorb:
+ .asmfunc stack_usage(80)
+ STW FP,*SP--(80) ; save frame pointer
+|| MV SP,FP
+ STDW B13:B12,*SP[9]
+|| STDW A13:A12,*FP[-4]
+ STDW B11:B10,*SP[8]
+|| STDW A11:A10,*FP[-5]
+ STW RA, *SP[15]
+|| STW A14,*FP[-6]
+
+ STW A4,*SP[1] ; save A[][]
+|| MV B4,INP ; reassign arguments
+|| MV A6,LEN
+|| MV B6,BSZ
+|| ADD 4,A4,B4
+
+ LDW *A4++[2],A$A[0][0] ; load A[5][5]
+|| LDW *B4++[2],B$A[0][0]
+ LDW *A4++[2],A$A[0][1]
+|| LDW *B4++[2],B$A[0][1]
+ LDW *A4++[2],A$A[0][2]
+|| LDW *B4++[2],B$A[0][2]
+ LDW *A4++[2],A$A[0][3]
+|| LDW *B4++[2],B$A[0][3]
+ LDW *A4++[2],A$A[0][4]
+|| LDW *B4++[2],B$A[0][4]
+
+ LDW *A4++[2],A$A[1][0]
+|| LDW *B4++[2],B$A[1][0]
+ LDW *A4++[2],A$A[1][1]
+|| LDW *B4++[2],B$A[1][1]
+ LDW *A4++[2],A$A[1][2]
+|| LDW *B4++[2],B$A[1][2]
+ LDW *A4++[2],A$A[1][3]
+|| LDW *B4++[2],B$A[1][3]
+ LDW *A4++[2],A$A[1][4]
+|| LDW *B4++[2],B$A[1][4]
+
+ LDW *A4++[2],A$A[2][0]
+|| LDW *B4++[2],B$A[2][0]
+ LDW *A4++[2],A$A[2][1]
+|| LDW *B4++[2],B$A[2][1]
+ LDW *A4++[2],A$A[2][2]
+|| LDW *B4++[2],B$A[2][2]
+ LDW *A4++[2],A$A[2][3]
+|| LDW *B4++[2],B$A[2][3]
+ LDW *A4++[2],A$A[2][4]
+|| LDW *B4++[2],B$A[2][4]
+
+ LDW *A4++[2],A$A[3][0]
+|| LDW *B4++[2],B$A[3][0]
+ LDW *A4++[2],A$A[3][1]
+|| LDW *B4++[2],B$A[3][1]
+ LDW *A4++[2],A$A[3][2]
+|| LDW *B4++[2],B$A[3][2]
+ LDW *A4++[2],A$A[3][3]
+|| LDW *B4++[2],B$A[3][3]
+ LDW *A4++[2],A$A[3][4]
+|| LDW *B4++[2],B$A[3][4]
+
+ LDW *A4++[2],A$A[4][0]
+|| LDW *B4++[2],B$A[4][0]
+ LDW *A4++[2],A$A[4][1]
+|| LDW *B4++[2],B$A[4][1]
+ LDW *A4++[2],A$A[4][2]
+|| LDW *B4++[2],B$A[4][2]
+ LDW *A4++[2],A$A[4][3]
+|| LDW *B4++[2],B$A[4][3]
+ LDW *A4,A$A[4][4]
+|| LDW *B4,B$A[4][4]
+|| ADDKPC loop?,RA
+ STDW RA:BSZ,*SP[4]
+
+loop?:
+ CMPLTU LEN,BSZ,A0 ; len < bsz?
+|| SHRU BSZ,3,BSZ
+ [A0] BNOP ret?
+||[A0] ZERO BSZ
+||[A0] LDW *SP[1],A2 ; pull A[][]
+ [BSZ] LDNDW *INP++,A1:A0
+||[BSZ] SUB LEN,8,LEN
+||[BSZ] SUB BSZ,1,BSZ
+ NOP 4
+___
+for ($y = 0; $y < 5; $y++) {
+ for ($x = 0; $x < ($y<4 ? 5 : 4); $x++) {
+$code.=<<___;
+ .if .BIG_ENDIAN
+ SWAP2 A0,A1
+|| SWAP2 A1,A0
+ SWAP4 A0,A0
+ SWAP4 A1,A1
+||[!BSZ]BNOP _KeccakF1600_cheat
+||[!BSZ]STDW LEN:INP,*SP[3]
+|| DEAL A0,A0
+ .else
+ [!BSZ]BNOP _KeccakF1600_cheat
+||[!BSZ]STDW LEN:INP,*SP[3]
+|| DEAL A0,A0
+ .endif
+ [BSZ] LDNDW *INP++,A1:A0
+|| DEAL A1,A1
+ [BSZ] SUB LEN,8,LEN
+||[BSZ] SUB BSZ,1,BSZ
+ PACK2 A1,A0,A0
+|| PACKH2 A1,A0,A1
+ XOR A0,A$A[$y][$x],A$A[$y][$x]
+ XOR A1,B$A[$y][$x],B$A[$y][$x]
+___
+ }
+}
+$code.=<<___;
+ .if .BIG_ENDIAN
+ SWAP2 A0,A1
+|| SWAP2 A1,A0
+ SWAP4 A0,A0
+ SWAP4 A1,A1
+ .endif
+ BNOP _KeccakF1600_cheat
+|| STDW LEN:INP,*SP[3]
+|| DEAL A0,A0
+ DEAL A1,A1
+ NOP
+ PACK2 A1,A0,A0
+|| PACKH2 A1,A0,A1
+ XOR A0,A$A[4][4],A$A[4][4]
+ XOR A1,B$A[4][4],B$A[4][4]
+
+ .align 16
+ret?:
+ MV LEN,A4 ; return value
+|| ADD 4,A2,B2
+
+ STW A$A[0][0],*A2++[2] ; store A[5][5]
+|| STW B$A[0][0],*B2++[2]
+ STW A$A[0][1],*A2++[2]
+|| STW B$A[0][1],*B2++[2]
+ STW A$A[0][2],*A2++[2]
+|| STW B$A[0][2],*B2++[2]
+ STW A$A[0][3],*A2++[2]
+|| STW B$A[0][3],*B2++[2]
+ STW A$A[0][4],*A2++[2]
+|| STW B$A[0][4],*B2++[2]
+
+ STW A$A[1][0],*A2++[2]
+|| STW B$A[1][0],*B2++[2]
+ STW A$A[1][1],*A2++[2]
+|| STW B$A[1][1],*B2++[2]
+ STW A$A[1][2],*A2++[2]
+|| STW B$A[1][2],*B2++[2]
+ STW A$A[1][3],*A2++[2]
+|| STW B$A[1][3],*B2++[2]
+ STW A$A[1][4],*A2++[2]
+|| STW B$A[1][4],*B2++[2]
+
+ STW A$A[2][0],*A2++[2]
+|| STW B$A[2][0],*B2++[2]
+ STW A$A[2][1],*A2++[2]
+|| STW B$A[2][1],*B2++[2]
+ STW A$A[2][2],*A2++[2]
+|| STW B$A[2][2],*B2++[2]
+ STW A$A[2][3],*A2++[2]
+|| STW B$A[2][3],*B2++[2]
+ STW A$A[2][4],*A2++[2]
+|| STW B$A[2][4],*B2++[2]
+
+ LDW *SP[15],RA
+|| LDW *FP[-6],A14
+
+ STW A$A[3][0],*A2++[2]
+|| STW B$A[3][0],*B2++[2]
+ STW A$A[3][1],*A2++[2]
+|| STW B$A[3][1],*B2++[2]
+ STW A$A[3][2],*A2++[2]
+|| STW B$A[3][2],*B2++[2]
+ STW A$A[3][3],*A2++[2]
+|| STW B$A[3][3],*B2++[2]
+ STW A$A[3][4],*A2++[2]
+|| STW B$A[3][4],*B2++[2]
+
+ LDDW *SP[8], B11:B10
+|| LDDW *FP[-5],A11:A10
+ LDDW *SP[9], B13:B12
+|| LDDW *FP[-4],A13:A12
+ BNOP RA
+|| LDW *++SP(80),FP ; restore frame pointer
+
+ STW A$A[4][0],*A2++[2]
+|| STW B$A[4][0],*B2++[2]
+ STW A$A[4][1],*A2++[2]
+|| STW B$A[4][1],*B2++[2]
+ STW A$A[4][2],*A2++[2]
+|| STW B$A[4][2],*B2++[2]
+ STW A$A[4][3],*A2++[2]
+|| STW B$A[4][3],*B2++[2]
+ STW A$A[4][4],*A2++[2]
+|| STW B$A[4][4],*B2++[2]
+ .endasmfunc
+
+ .newblock
+ .global _SHA3_squeeze
+ .asg A12,OUT
+ .asg A13,LEN
+ .asg A14,BSZ
+ .align 32
+_SHA3_squeeze:
+ .asmfunc stack_usage(24)
+ STW FP,*SP--(24) ; save frame pointer
+|| MV SP,FP
+ STW RA, *SP[5]
+|| STW A14,*FP[-2]
+ STDW A13:A12,*FP[-2]
+|| MV B4,OUT ; reassign arguments
+ MV A6,LEN
+|| MV B6,BSZ
+
+loop?:
+ LDW *SP[5],RA ; reload RA
+|| SHRU BSZ,3,A1
+|| MV A4,A8
+|| ADD 4,A4,B8
+block?:
+ CMPLTU LEN,8,A0 ; len < 8?
+ [A0] BNOP tail?
+ LDW *A8++[2],A9
+|| LDW *B8++[2],B9
+|| SUB LEN,8,LEN ; len -= 8
+ MV LEN,A0
+|| SUB A1,1,A1 ; bsz--
+|| NOP 4
+ .if .BIG_ENDIAN
+ SWAP4 A9,A9
+|| SWAP4 B9,B9
+ SWAP2 A9,A9
+|| SWAP2 B9,B9
+ .endif
+ [!A0] BNOP ret?
+||[!A0] ZERO A1
+ PACK2 B9,A9,B7
+||[A1] BNOP block?
+ PACKH2 B9,A9,B9
+|| SHFL B7,B7
+ SHFL B9,B9
+ STNW B7,*OUT++
+ STNW B9,*OUT++
+ NOP
+
+ BNOP _KeccakF1600,4
+ ADDKPC loop?,RA
+
+ .align 16
+tail?:
+ .if .BIG_ENDIAN
+ SWAP4 A9,A9
+|| SWAP4 B9,B9
+ SWAP2 A9,A9
+|| SWAP2 B9,B9
+ .endif
+ PACK2 B9,A9,B7
+ PACKH2 B9,A9,B9
+|| SHFL B7,B7
+ SHFL B9,B9
+
+ STB B7,*OUT++
+|| SHRU B7,8,B7
+|| ADD LEN,7,A0
+ [A0] STB B7,*OUT++
+||[A0] SHRU B7,8,B7
+||[A0] SUB A0,1,A0
+ [A0] STB B7,*OUT++
+||[A0] SHRU B7,8,B7
+||[A0] SUB A0,1,A0
+ [A0] STB B7,*OUT++
+||[A0] SUB A0,1,A0
+ [A0] STB B9,*OUT++
+||[A0] SHRU B9,8,B9
+||[A0] SUB A0,1,A0
+ [A0] STB B9,*OUT++
+||[A0] SHRU B9,8,B9
+||[A0] SUB A0,1,A0
+ [A0] STB B9,*OUT++
+
+ret?:
+ LDDW *FP[-2],A13:A12
+ BNOP RA
+|| LDW *FP[-2],A14
+ LDW *++SP(24),FP ; restore frame pointer
+ NOP 4 ; wait till FP is committed
+ .endasmfunc
+
+ .if __TI_EABI__
+ .sect ".text:sha_asm.const"
+ .else
+ .sect ".const:sha_asm"
+ .endif
+ .align 256
+ .uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+iotas:
+ .uword 0x00000001, 0x00000000
+ .uword 0x00000000, 0x00000089
+ .uword 0x00000000, 0x8000008b
+ .uword 0x00000000, 0x80008080
+ .uword 0x00000001, 0x0000008b
+ .uword 0x00000001, 0x00008000
+ .uword 0x00000001, 0x80008088
+ .uword 0x00000001, 0x80000082
+ .uword 0x00000000, 0x0000000b
+ .uword 0x00000000, 0x0000000a
+ .uword 0x00000001, 0x00008082
+ .uword 0x00000000, 0x00008003
+ .uword 0x00000001, 0x0000808b
+ .uword 0x00000001, 0x8000000b
+ .uword 0x00000001, 0x8000008a
+ .uword 0x00000001, 0x80000081
+ .uword 0x00000000, 0x80000081
+ .uword 0x00000000, 0x80000008
+ .uword 0x00000000, 0x00000083
+ .uword 0x00000000, 0x80008003
+ .uword 0x00000001, 0x80008088
+ .uword 0x00000000, 0x80000088
+ .uword 0x00000001, 0x00008000
+ .uword 0x00000000, 0x80008082
+
+ .cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+$output=pop;
+open STDOUT,">$output";
+print $code;
+close STDOUT;
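
The C64x module above relies on the bit-interleaved lane representation mentioned in its header: the even-numbered bits of every 64-bit lane live in the A register file and the odd-numbered bits in the B file, which is also why its iotas table stores each round constant as a pair of 32-bit words. In that form a 64-bit rotation splits into two independent 32-bit rotations, exactly the two parallel ROTL instructions the perlasm ROL64() helper emits. A hedged C sketch of the representation and of the rotation identity; the helper names are invented for illustration and this is not part of the patch:

    #include <stdint.h>

    /* Split x into its even-numbered bits (low word) and odd-numbered bits (high word). */
    static uint64_t interleave(uint64_t x)
    {
        uint32_t even = 0, odd = 0;

        for (int i = 0; i < 32; i++) {
            even |= (uint32_t)((x >> (2 * i)) & 1) << i;       /* -> A register file */
            odd  |= (uint32_t)((x >> (2 * i + 1)) & 1) << i;   /* -> B register file */
        }
        return (uint64_t)odd << 32 | even;
    }

    static uint32_t rol32(uint32_t v, unsigned r)
    {
        return (v << (r & 31)) | (v >> (-r & 31));
    }

    /*
     * ROL64 on an interleaved pair: an even rotation r rotates both words by
     * r/2; an odd rotation swaps the words, rotating the incoming odd word by
     * r/2 + 1 and the incoming even word by r/2, the same split the ROL64()
     * helper above encodes.
     */
    static uint64_t rol64_interleaved(uint64_t ab, unsigned r)
    {
        uint32_t even = (uint32_t)ab, odd = (uint32_t)(ab >> 32);

        if (r & 1)
            return (uint64_t)rol32(even, r / 2) << 32 | rol32(odd, r / 2 + 1);
        return (uint64_t)rol32(odd, r / 2) << 32 | rol32(even, r / 2);
    }

For instance, interleave(0x8082) yields an even word of 0 and an odd word of 0x89, matching the second .uword pair in the C64x iotas table above.
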
diff --git a/crypto/sha/asm/keccak1600-mmx.pl b/crypto/sha/asm/keccak1600-mmx.pl
new file mode 100755
index 000000000000..c7685add79dd
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-mmx.pl
@@ -0,0 +1,440 @@
+#!/usr/bin/env perl
+# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for x86 MMX.
+#
+# June 2017.
+#
+# Below code is a KECCAK_2X implementation (see sha/keccak1600.c) with
+# C[5] held in the register bank and D[5] offloaded to memory. Though,
+# instead of actually unrolling the loop pair-wise, I simply flip the
+# pointers to T[][] and A[][] at the end of each round. Since the number
+# of rounds is even, the last round writes to A[][] and everything works out.
+# It's argued that MMX is the only code path meaningful to implement
+# for x86. This is because non-MMX-capable processors are an extinct
+# breed, and they might as well keep executing compiler-generated code.
+# For reference, gcc-5.x-generated KECCAK_2X code takes 89 cycles per
+# processed byte on Pentium, which is a fair result. But older compilers
+# produce worse code. On the other hand one can wonder why not 128-bit
+# SSE2? Well, SSE2 won't provide double improvement, rather far from
+# that, if any at all on some processors, because it will take extra
+# permutations and inter-bank data transfers. Besides, contemporary
+# CPUs are better off executing 64-bit code, and it makes less sense
+# to invest in fancy 32-bit code. And the decision doesn't seem to
+# be inadequate, if one compares below results to "64-bit platforms in
+# 32-bit mode" SIMD data points available at
+# http://keccak.noekeon.org/sw_performance.html.
+#
+########################################################################
+# Numbers are cycles per processed byte out of large message.
+#
+# r=1088(i)
+#
+# PIII 30/+150%
+# Pentium M 27/+150%
+# P4 40/+85%
+# Core 2 19/+170%
+# Sandy Bridge(ii) 18/+140%
+# Atom 33/+180%
+# Silvermont(ii) 30/+180%
+# VIA Nano(ii) 43/+60%
+# Sledgehammer(ii)(iii) 24/+130%
+#
+# (i) Corresponds to SHA3-256. Numbers after slash are improvement
+# coefficients over KECCAK_2X [with bit interleave and lane
+# complementing] position-independent *scalar* code generated
+#	by gcc-5.x. It's not exactly a fair comparison, but it's a
+# datapoint...
+# (ii) 64-bit processor executing 32-bit code.
+# (iii) Result is considered to be representative even for older AMD
+# processors.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
+
+my @C = map("mm$_",(0..4));
+my @T = map("mm$_",(5..7));
+my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
+ 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
+my @D = map(8*$_+4, (0..4));
+my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 36, 44, 6, 55, 20 ],
+ [ 3, 10, 43, 25, 39 ],
+ [ 41, 45, 15, 21, 8 ],
+ [ 18, 2, 61, 56, 14 ]);
+
+&static_label("iotas");
+
+&function_begin_B("_KeccakF1600");
+ &movq (@C[0],&QWP($A[4][0],"esi"));
+ &movq (@C[1],&QWP($A[4][1],"esi"));
+ &movq (@C[2],&QWP($A[4][2],"esi"));
+ &movq (@C[3],&QWP($A[4][3],"esi"));
+ &movq (@C[4],&QWP($A[4][4],"esi"));
+
+ &mov ("ecx",24); # loop counter
+ &jmp (&label("loop"));
+
+ &set_label("loop",16);
+ ######################################### Theta
+ &pxor (@C[0],&QWP($A[0][0],"esi"));
+ &pxor (@C[1],&QWP($A[0][1],"esi"));
+ &pxor (@C[2],&QWP($A[0][2],"esi"));
+ &pxor (@C[3],&QWP($A[0][3],"esi"));
+ &pxor (@C[4],&QWP($A[0][4],"esi"));
+
+ &pxor (@C[0],&QWP($A[1][0],"esi"));
+ &pxor (@C[1],&QWP($A[1][1],"esi"));
+ &pxor (@C[2],&QWP($A[1][2],"esi"));
+ &pxor (@C[3],&QWP($A[1][3],"esi"));
+ &pxor (@C[4],&QWP($A[1][4],"esi"));
+
+ &pxor (@C[0],&QWP($A[2][0],"esi"));
+ &pxor (@C[1],&QWP($A[2][1],"esi"));
+ &pxor (@C[2],&QWP($A[2][2],"esi"));
+ &pxor (@C[3],&QWP($A[2][3],"esi"));
+ &pxor (@C[4],&QWP($A[2][4],"esi"));
+
+ &pxor (@C[2],&QWP($A[3][2],"esi"));
+ &pxor (@C[0],&QWP($A[3][0],"esi"));
+ &pxor (@C[1],&QWP($A[3][1],"esi"));
+ &pxor (@C[3],&QWP($A[3][3],"esi"));
+ &movq (@T[0],@C[2]);
+ &pxor (@C[4],&QWP($A[3][4],"esi"));
+
+ &movq (@T[2],@C[2]);
+ &psrlq (@T[0],63);
+ &movq (@T[1],@C[0]);
+ &psllq (@T[2],1);
+ &pxor (@T[0],@C[0]);
+ &psrlq (@C[0],63);
+ &pxor (@T[0],@T[2]);
+ &psllq (@T[1],1);
+ &movq (@T[2],@C[1]);
+ &movq (&QWP(@D[1],"esp"),@T[0]); # D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
+
+ &pxor (@T[1],@C[0]);
+ &psrlq (@T[2],63);
+ &pxor (@T[1],@C[3]);
+ &movq (@C[0],@C[1]);
+ &movq (&QWP(@D[4],"esp"),@T[1]); # D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
+
+ &psllq (@C[0],1);
+ &pxor (@T[2],@C[4]);
+ &pxor (@C[0],@T[2]);
+
+ &movq (@T[2],@C[3]);
+ &psrlq (@C[3],63);
+ &movq (&QWP(@D[0],"esp"),@C[0]); # D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
+ &psllq (@T[2],1);
+ &movq (@T[0],@C[4]);
+ &psrlq (@C[4],63);
+ &pxor (@C[1],@C[3]);
+ &psllq (@T[0],1);
+ &pxor (@C[1],@T[2]);
+ &pxor (@C[2],@C[4]);
+ &movq (&QWP(@D[2],"esp"),@C[1]); # D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
+ &pxor (@C[2],@T[0]);
+
+ ######################################### first Rho(0) is special
+ &movq (@C[3],&QWP($A[3][3],"esi"));
+ &movq (&QWP(@D[3],"esp"),@C[2]); # D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
+ &pxor (@C[3],@C[2]);
+ &movq (@C[4],&QWP($A[4][4],"esi"));
+ &movq (@T[2],@C[3]);
+ &psrlq (@C[3],64-$rhotates[3][3]);
+ &pxor (@C[4],@T[1]);
+ &psllq (@T[2],$rhotates[3][3]);
+ &movq (@T[1],@C[4]);
+ &psrlq (@C[4],64-$rhotates[4][4]);
+ &por (@C[3],@T[2]); # C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
+ &psllq (@T[1],$rhotates[4][4]);
+
+ &movq (@C[2],&QWP($A[2][2],"esi"));
+ &por (@C[4],@T[1]); # C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
+ &pxor (@C[2],@C[1]);
+ &movq (@C[1],&QWP($A[1][1],"esi"));
+ &movq (@T[1],@C[2]);
+ &psrlq (@C[2],64-$rhotates[2][2]);
+ &pxor (@C[1],&QWP(@D[1],"esp"));
+ &psllq (@T[1],$rhotates[2][2]);
+
+ &movq (@T[2],@C[1]);
+ &psrlq (@C[1],64-$rhotates[1][1]);
+ &por (@C[2],@T[1]); # C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
+ &psllq (@T[2],$rhotates[1][1]);
+ &pxor (@C[0],&QWP($A[0][0],"esi")); # /* rotate by 0 */ /* D[0] */
+ &por (@C[1],@T[2]); # C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
+
+sub Chi() { ######### regular Chi step
+ my ($y,$xrho) = @_;
+
+ &movq (@T[0],@C[1]);
+ &movq (@T[1],@C[2]);
+ &pandn (@T[0],@C[2]);
+ &pandn (@C[2],@C[3]);
+ &pxor (@T[0],@C[0]);
+ &pxor (@C[2],@C[1]);
+ &pxor (@T[0],&QWP(0,"ebx")) if ($y == 0);
+ &lea ("ebx",&DWP(8,"ebx")) if ($y == 0);
+
+ &movq (@T[2],@C[3]);
+ &movq (&QWP($A[$y][0],"edi"),@T[0]); # R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
+ &movq (@T[0],@C[4]);
+ &pandn (@C[3],@C[4]);
+ &pandn (@C[4],@C[0]);
+ &pxor (@C[3],@T[1]);
+ &movq (&QWP($A[$y][1],"edi"),@C[2]); # R[0][1] = C[1] ^ (~C[2] & C[3]);
+ &pxor (@C[4],@T[2]);
+ &movq (@T[2],&QWP($A[0][$xrho],"esi")) if (defined($xrho));
+
+ &movq (&QWP($A[$y][2],"edi"),@C[3]); # R[0][2] = C[2] ^ (~C[3] & C[4]);
+ &pandn (@C[0],@C[1]);
+ &movq (&QWP($A[$y][3],"edi"),@C[4]); # R[0][3] = C[3] ^ (~C[4] & C[0]);
+ &pxor (@C[0],@T[0]);
+ &pxor (@T[2],&QWP(@D[$xrho],"esp")) if (defined($xrho));
+ &movq (&QWP($A[$y][4],"edi"),@C[0]); # R[0][4] = C[4] ^ (~C[0] & C[1]);
+}
+ &Chi (0, 3);
+
+sub Rho() { ######### regular Rho step
+ my $x = shift;
+
+ #&movq (@T[2],&QWP($A[0][$x],"esi")); # moved to Chi
+ #&pxor (@T[2],&QWP(@D[$x],"esp")); # moved to Chi
+ &movq (@C[0],@T[2]);
+ &psrlq (@T[2],64-$rhotates[0][$x]);
+ &movq (@C[1],&QWP($A[1][($x+1)%5],"esi"));
+ &psllq (@C[0],$rhotates[0][$x]);
+ &pxor (@C[1],&QWP(@D[($x+1)%5],"esp"));
+ &por (@C[0],@T[2]); # C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
+
+ &movq (@T[1],@C[1]);
+ &psrlq (@C[1],64-$rhotates[1][($x+1)%5]);
+ &movq (@C[2],&QWP($A[2][($x+2)%5],"esi"));
+ &psllq (@T[1],$rhotates[1][($x+1)%5]);
+ &pxor (@C[2],&QWP(@D[($x+2)%5],"esp"));
+ &por (@C[1],@T[1]); # C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
+
+ &movq (@T[2],@C[2]);
+ &psrlq (@C[2],64-$rhotates[2][($x+2)%5]);
+ &movq (@C[3],&QWP($A[3][($x+3)%5],"esi"));
+ &psllq (@T[2],$rhotates[2][($x+2)%5]);
+ &pxor (@C[3],&QWP(@D[($x+3)%5],"esp"));
+ &por (@C[2],@T[2]); # C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
+
+ &movq (@T[0],@C[3]);
+ &psrlq (@C[3],64-$rhotates[3][($x+3)%5]);
+ &movq (@C[4],&QWP($A[4][($x+4)%5],"esi"));
+ &psllq (@T[0],$rhotates[3][($x+3)%5]);
+ &pxor (@C[4],&QWP(@D[($x+4)%5],"esp"));
+ &por (@C[3],@T[0]); # C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
+
+ &movq (@T[1],@C[4]);
+ &psrlq (@C[4],64-$rhotates[4][($x+4)%5]);
+ &psllq (@T[1],$rhotates[4][($x+4)%5]);
+ &por (@C[4],@T[1]); # C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
+}
+ &Rho (3); &Chi (1, 1);
+ &Rho (1); &Chi (2, 4);
+ &Rho (4); &Chi (3, 2);
+ &Rho (2); ###&Chi (4);
+
+ &movq (@T[0],@C[0]); ######### last Chi(4) is special
+ &xor ("edi","esi"); # &xchg ("esi","edi");
+ &movq (&QWP(@D[1],"esp"),@C[1]);
+ &xor ("esi","edi");
+ &xor ("edi","esi");
+
+ &movq (@T[1],@C[1]);
+ &movq (@T[2],@C[2]);
+ &pandn (@T[1],@C[2]);
+ &pandn (@T[2],@C[3]);
+ &pxor (@C[0],@T[1]);
+ &pxor (@C[1],@T[2]);
+
+ &movq (@T[1],@C[3]);
+ &movq (&QWP($A[4][0],"esi"),@C[0]); # R[4][0] = C[0] ^= (~C[1] & C[2]);
+ &pandn (@T[1],@C[4]);
+ &movq (&QWP($A[4][1],"esi"),@C[1]); # R[4][1] = C[1] ^= (~C[2] & C[3]);
+ &pxor (@C[2],@T[1]);
+ &movq (@T[2],@C[4]);
+ &movq (&QWP($A[4][2],"esi"),@C[2]); # R[4][2] = C[2] ^= (~C[3] & C[4]);
+
+ &pandn (@T[2],@T[0]);
+ &pandn (@T[0],&QWP(@D[1],"esp"));
+ &pxor (@C[3],@T[2]);
+ &pxor (@C[4],@T[0]);
+ &movq (&QWP($A[4][3],"esi"),@C[3]); # R[4][3] = C[3] ^= (~C[4] & D[0]);
+ &sub ("ecx",1);
+ &movq (&QWP($A[4][4],"esi"),@C[4]); # R[4][4] = C[4] ^= (~D[0] & D[1]);
+ &jnz (&label("loop"));
+
+ &lea ("ebx",&DWP(-192,"ebx")); # rewind iotas
+ &ret ();
+&function_end_B("_KeccakF1600");
+
+&function_begin("KeccakF1600");
+ &mov ("esi",&wparam(0));
+ &mov ("ebp","esp");
+ &sub ("esp",240);
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
+ &and ("esp",-8);
+ &lea ("esi",&DWP(100,"esi")); # size optimization
+ &lea ("edi",&DWP(8*5+100,"esp")); # size optimization
+
+ &call ("_KeccakF1600");
+
+ &mov ("esp","ebp");
+ &emms ();
+&function_end("KeccakF1600");
+
+&function_begin("SHA3_absorb");
+ &mov ("esi",&wparam(0)); # A[][]
+ &mov ("eax",&wparam(1)); # inp
+ &mov ("ecx",&wparam(2)); # len
+ &mov ("edx",&wparam(3)); # bsz
+ &mov ("ebp","esp");
+ &sub ("esp",240+8);
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
+ &and ("esp",-8);
+
+ &mov ("edi","esi");
+ &lea ("esi",&DWP(100,"esi")); # size optimization
+ &mov (&DWP(-4,"ebp"),"edx"); # save bsz
+ &jmp (&label("loop"));
+
+&set_label("loop",16);
+ &cmp ("ecx","edx"); # len < bsz?
+ &jc (&label("absorbed"));
+
+ &shr ("edx",3); # bsz /= 8
+&set_label("block");
+ &movq ("mm0",&QWP(0,"eax"));
+ &lea ("eax",&DWP(8,"eax"));
+ &pxor ("mm0",&QWP(0,"edi"));
+ &lea ("edi",&DWP(8,"edi"));
+ &sub ("ecx",8); # len -= 8
+ &movq (&QWP(-8,"edi"),"mm0");
+ &dec ("edx"); # bsz--
+ &jnz (&label("block"));
+
+ &lea ("edi",&DWP(8*5+100,"esp")); # size optimization
+ &mov (&DWP(-8,"ebp"),"ecx"); # save len
+ &call ("_KeccakF1600");
+ &mov ("ecx",&DWP(-8,"ebp")); # pull len
+ &mov ("edx",&DWP(-4,"ebp")); # pull bsz
+ &lea ("edi",&DWP(-100,"esi"));
+ &jmp (&label("loop"));
+
+&set_label("absorbed",16);
+ &mov ("eax","ecx"); # return value
+ &mov ("esp","ebp");
+ &emms ();
+&function_end("SHA3_absorb");
+
+&function_begin("SHA3_squeeze");
+ &mov ("esi",&wparam(0)); # A[][]
+ &mov ("eax",&wparam(1)); # out
+ &mov ("ecx",&wparam(2)); # len
+ &mov ("edx",&wparam(3)); # bsz
+ &mov ("ebp","esp");
+ &sub ("esp",240+8);
+ &call (&label("pic_point"));
+ &set_label("pic_point");
+ &blindpop("ebx");
+ &lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
+ &and ("esp",-8);
+
+ &shr ("edx",3); # bsz /= 8
+ &mov ("edi","esi");
+ &lea ("esi",&DWP(100,"esi")); # size optimization
+ &mov (&DWP(-4,"ebp"),"edx"); # save bsz
+ &jmp (&label("loop"));
+
+&set_label("loop",16);
+ &cmp ("ecx",8); # len < 8?
+ &jc (&label("tail"));
+
+ &movq ("mm0",&QWP(0,"edi"));
+ &lea ("edi",&DWP(8,"edi"));
+ &movq (&QWP(0,"eax"),"mm0");
+ &lea ("eax",&DWP(8,"eax"));
+ &sub ("ecx",8); # len -= 8
+ &jz (&label("done"));
+
+ &dec ("edx"); # bsz--
+ &jnz (&label("loop"));
+
+ &lea ("edi",&DWP(8*5+100,"esp")); # size optimization
+ &mov (&DWP(-8,"ebp"),"ecx"); # save len
+ &call ("_KeccakF1600");
+ &mov ("ecx",&DWP(-8,"ebp")); # pull len
+ &mov ("edx",&DWP(-4,"ebp")); # pull bsz
+ &lea ("edi",&DWP(-100,"esi"));
+ &jmp (&label("loop"));
+
+&set_label("tail",16);
+ &mov ("esi","edi");
+ &mov ("edi","eax");
+ &data_word("0xA4F39066"); # rep movsb
+
+&set_label("done");
+ &mov ("esp","ebp");
+ &emms ();
+&function_end("SHA3_squeeze");
+
+&set_label("iotas",32);
+ &data_word(0x00000001,0x00000000);
+ &data_word(0x00008082,0x00000000);
+ &data_word(0x0000808a,0x80000000);
+ &data_word(0x80008000,0x80000000);
+ &data_word(0x0000808b,0x00000000);
+ &data_word(0x80000001,0x00000000);
+ &data_word(0x80008081,0x80000000);
+ &data_word(0x00008009,0x80000000);
+ &data_word(0x0000008a,0x00000000);
+ &data_word(0x00000088,0x00000000);
+ &data_word(0x80008009,0x00000000);
+ &data_word(0x8000000a,0x00000000);
+ &data_word(0x8000808b,0x00000000);
+ &data_word(0x0000008b,0x80000000);
+ &data_word(0x00008089,0x80000000);
+ &data_word(0x00008003,0x80000000);
+ &data_word(0x00008002,0x80000000);
+ &data_word(0x00000080,0x80000000);
+ &data_word(0x0000800a,0x00000000);
+ &data_word(0x8000000a,0x80000000);
+ &data_word(0x80008081,0x80000000);
+ &data_word(0x00008080,0x80000000);
+ &data_word(0x80000001,0x00000000);
+ &data_word(0x80008008,0x80000000);
+&asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
+
+close STDOUT;
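
The header of the MMX module above describes the KECCAK_2X arrangement: one round body that reads one 5x5 state and writes the other, with the source and destination pointers exchanged at the end of every round (the esi/edi xor-swap), valid because the round count is even so the final round lands back in A[][]. Below is a compact, unoptimized C restatement of that structure, written for this note; it is not the code from sha/keccak1600.c and the identifiers are illustrative:

    #include <stdint.h>

    static const uint64_t iotas[24] = {
        0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
        0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
        0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
        0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
        0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
        0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
        0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
        0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
    };

    static const unsigned char rhotates[5][5] = {
        {  0,  1, 62, 28, 27 },
        { 36, 44,  6, 55, 20 },
        {  3, 10, 43, 25, 39 },
        { 41, 45, 15, 21,  8 },
        { 18,  2, 61, 56, 14 }
    };

    static uint64_t rol64(uint64_t v, unsigned r)
    {
        return r ? (v << r) | (v >> (64 - r)) : v;
    }

    /* One round: reads src[y][x], writes dst[y][x]; both are 5x5 lane arrays. */
    static void keccak_round(uint64_t dst[5][5], uint64_t src[5][5], uint64_t iota)
    {
        uint64_t C[5], D[5], B[5][5];
        int x, y;

        for (x = 0; x < 5; x++)                         /* Theta */
            C[x] = src[0][x] ^ src[1][x] ^ src[2][x] ^ src[3][x] ^ src[4][x];
        for (x = 0; x < 5; x++)
            D[x] = C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1);
        for (y = 0; y < 5; y++)                         /* Rho + Pi */
            for (x = 0; x < 5; x++)
                B[(2 * x + 3 * y) % 5][y] = rol64(src[y][x] ^ D[x], rhotates[y][x]);
        for (y = 0; y < 5; y++)                         /* Chi */
            for (x = 0; x < 5; x++)
                dst[y][x] = B[y][x] ^ (~B[y][(x + 1) % 5] & B[y][(x + 2) % 5]);
        dst[0][0] ^= iota;                              /* Iota */
    }

    /*
     * One round body with the source and destination states swapped after
     * every round; since 24 is even, the last round writes back into A[][].
     */
    static void keccak_f1600(uint64_t A[5][5], uint64_t T[5][5])
    {
        uint64_t (*src)[5] = A, (*dst)[5] = T, (*tmp)[5];

        for (int i = 0; i < 24; i++) {
            keccak_round(dst, src, iotas[i]);
            tmp = src; src = dst; dst = tmp;            /* the esi/edi exchange, in MMX terms */
        }
    }
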
diff --git a/crypto/sha/asm/keccak1600-ppc64.pl b/crypto/sha/asm/keccak1600-ppc64.pl
new file mode 100755
index 000000000000..30e70c5d6d7b
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-ppc64.pl
@@ -0,0 +1,758 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for PPC64.
+#
+# June 2017.
+#
+# This is a straightforward KECCAK_1X_ALT implementation that works on
+# *any* PPC64. PowerISA 2.07 then adds a 2x64-bit vector rotate, with
+# which it's possible to achieve better performance than below, but that
+# is naturally an option only for POWER8 and its successors...
+#
+######################################################################
+# Numbers are cycles per processed byte.
+#
+# r=1088(*)
+#
+# PPC970/G5 14.6/+120%
+# POWER7 10.3/+100%
+# POWER8 11.5/+85%
+# POWER9 9.4/+45%
+#
+# (*) Corresponds to SHA3-256. Percentage after slash is improvement
+# over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
+# much better (but watch out for them generating code specific
+# to processor they execute on).
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $UCMP ="cmpld";
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=24*$SIZE_T+6*$SIZE_T+32;
+$LOCALS=6*$SIZE_T;
+$TEMP=$LOCALS+6*$SIZE_T;
+
+my $sp ="r1";
+
+my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ],
+ (7, 12, 17, 22, 27));
+ $A[1][1] = "r6"; # r13 is reserved
+
+my @C = map("r$_", (0,3,4,5));
+
+my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 36, 44, 6, 55, 20 ],
+ [ 3, 10, 43, 25, 39 ],
+ [ 41, 45, 15, 21, 8 ],
+ [ 18, 2, 61, 56, 14 ]);
+
+$code.=<<___;
+.text
+
+.type KeccakF1600_int,\@function
+.align 5
+KeccakF1600_int:
+ li r0,24
+ mtctr r0
+ b .Loop
+.align 4
+.Loop:
+ xor $C[0],$A[0][0],$A[1][0] ; Theta
+ std $A[0][4],`$TEMP+0`($sp)
+ xor $C[1],$A[0][1],$A[1][1]
+ std $A[1][4],`$TEMP+8`($sp)
+ xor $C[2],$A[0][2],$A[1][2]
+ std $A[2][4],`$TEMP+16`($sp)
+ xor $C[3],$A[0][3],$A[1][3]
+ std $A[3][4],`$TEMP+24`($sp)
+___
+ $C[4]=$A[0][4];
+ $C[5]=$A[1][4];
+ $C[6]=$A[2][4];
+ $C[7]=$A[3][4];
+$code.=<<___;
+ xor $C[4],$A[0][4],$A[1][4]
+ xor $C[0],$C[0],$A[2][0]
+ xor $C[1],$C[1],$A[2][1]
+ xor $C[2],$C[2],$A[2][2]
+ xor $C[3],$C[3],$A[2][3]
+ xor $C[4],$C[4],$A[2][4]
+ xor $C[0],$C[0],$A[3][0]
+ xor $C[1],$C[1],$A[3][1]
+ xor $C[2],$C[2],$A[3][2]
+ xor $C[3],$C[3],$A[3][3]
+ xor $C[4],$C[4],$A[3][4]
+ xor $C[0],$C[0],$A[4][0]
+ xor $C[2],$C[2],$A[4][2]
+ xor $C[1],$C[1],$A[4][1]
+ xor $C[3],$C[3],$A[4][3]
+ rotldi $C[5],$C[2],1
+ xor $C[4],$C[4],$A[4][4]
+ rotldi $C[6],$C[3],1
+ xor $C[5],$C[5],$C[0]
+ rotldi $C[7],$C[4],1
+
+ xor $A[0][1],$A[0][1],$C[5]
+ xor $A[1][1],$A[1][1],$C[5]
+ xor $A[2][1],$A[2][1],$C[5]
+ xor $A[3][1],$A[3][1],$C[5]
+ xor $A[4][1],$A[4][1],$C[5]
+
+ rotldi $C[5],$C[0],1
+ xor $C[6],$C[6],$C[1]
+ xor $C[2],$C[2],$C[7]
+ rotldi $C[7],$C[1],1
+ xor $C[3],$C[3],$C[5]
+ xor $C[4],$C[4],$C[7]
+
+ xor $C[1], $A[0][2],$C[6] ;mr $C[1],$A[0][2]
+ xor $A[1][2],$A[1][2],$C[6]
+ xor $A[2][2],$A[2][2],$C[6]
+ xor $A[3][2],$A[3][2],$C[6]
+ xor $A[4][2],$A[4][2],$C[6]
+
+ xor $A[0][0],$A[0][0],$C[4]
+ xor $A[1][0],$A[1][0],$C[4]
+ xor $A[2][0],$A[2][0],$C[4]
+ xor $A[3][0],$A[3][0],$C[4]
+ xor $A[4][0],$A[4][0],$C[4]
+___
+ $C[4]=undef;
+ $C[5]=undef;
+ $C[6]=undef;
+ $C[7]=undef;
+$code.=<<___;
+ ld $A[0][4],`$TEMP+0`($sp)
+ xor $C[0], $A[0][3],$C[2] ;mr $C[0],$A[0][3]
+ ld $A[1][4],`$TEMP+8`($sp)
+ xor $A[1][3],$A[1][3],$C[2]
+ ld $A[2][4],`$TEMP+16`($sp)
+ xor $A[2][3],$A[2][3],$C[2]
+ ld $A[3][4],`$TEMP+24`($sp)
+ xor $A[3][3],$A[3][3],$C[2]
+ xor $A[4][3],$A[4][3],$C[2]
+
+ xor $C[2], $A[0][4],$C[3] ;mr $C[2],$A[0][4]
+ xor $A[1][4],$A[1][4],$C[3]
+ xor $A[2][4],$A[2][4],$C[3]
+ xor $A[3][4],$A[3][4],$C[3]
+ xor $A[4][4],$A[4][4],$C[3]
+
+ mr $C[3],$A[0][1] ; Rho+Pi
+ rotldi $A[0][1],$A[1][1],$rhotates[1][1]
+ ;mr $C[1],$A[0][2]
+ rotldi $A[0][2],$A[2][2],$rhotates[2][2]
+ ;mr $C[0],$A[0][3]
+ rotldi $A[0][3],$A[3][3],$rhotates[3][3]
+ ;mr $C[2],$A[0][4]
+ rotldi $A[0][4],$A[4][4],$rhotates[4][4]
+
+ rotldi $A[1][1],$A[1][4],$rhotates[1][4]
+ rotldi $A[2][2],$A[2][3],$rhotates[2][3]
+ rotldi $A[3][3],$A[3][2],$rhotates[3][2]
+ rotldi $A[4][4],$A[4][1],$rhotates[4][1]
+
+ rotldi $A[1][4],$A[4][2],$rhotates[4][2]
+ rotldi $A[2][3],$A[3][4],$rhotates[3][4]
+ rotldi $A[3][2],$A[2][1],$rhotates[2][1]
+ rotldi $A[4][1],$A[1][3],$rhotates[1][3]
+
+ rotldi $A[4][2],$A[2][4],$rhotates[2][4]
+ rotldi $A[3][4],$A[4][3],$rhotates[4][3]
+ rotldi $A[2][1],$A[1][2],$rhotates[1][2]
+ rotldi $A[1][3],$A[3][1],$rhotates[3][1]
+
+ rotldi $A[2][4],$A[4][0],$rhotates[4][0]
+ rotldi $A[4][3],$A[3][0],$rhotates[3][0]
+ rotldi $A[1][2],$A[2][0],$rhotates[2][0]
+ rotldi $A[3][1],$A[1][0],$rhotates[1][0]
+
+ rotldi $A[1][0],$C[0],$rhotates[0][3]
+ rotldi $A[2][0],$C[3],$rhotates[0][1]
+ rotldi $A[3][0],$C[2],$rhotates[0][4]
+ rotldi $A[4][0],$C[1],$rhotates[0][2]
+
+ andc $C[0],$A[0][2],$A[0][1] ; Chi+Iota
+ andc $C[1],$A[0][3],$A[0][2]
+ andc $C[2],$A[0][0],$A[0][4]
+ andc $C[3],$A[0][1],$A[0][0]
+ xor $A[0][0],$A[0][0],$C[0]
+ andc $C[0],$A[0][4],$A[0][3]
+ xor $A[0][1],$A[0][1],$C[1]
+ ld $C[1],`$LOCALS+4*$SIZE_T`($sp)
+ xor $A[0][3],$A[0][3],$C[2]
+ xor $A[0][4],$A[0][4],$C[3]
+ xor $A[0][2],$A[0][2],$C[0]
+ ldu $C[3],8($C[1]) ; Iota[i++]
+
+ andc $C[0],$A[1][2],$A[1][1]
+ std $C[1],`$LOCALS+4*$SIZE_T`($sp)
+ andc $C[1],$A[1][3],$A[1][2]
+ andc $C[2],$A[1][0],$A[1][4]
+ xor $A[0][0],$A[0][0],$C[3] ; A[0][0] ^= Iota
+ andc $C[3],$A[1][1],$A[1][0]
+ xor $A[1][0],$A[1][0],$C[0]
+ andc $C[0],$A[1][4],$A[1][3]
+ xor $A[1][1],$A[1][1],$C[1]
+ xor $A[1][3],$A[1][3],$C[2]
+ xor $A[1][4],$A[1][4],$C[3]
+ xor $A[1][2],$A[1][2],$C[0]
+
+ andc $C[0],$A[2][2],$A[2][1]
+ andc $C[1],$A[2][3],$A[2][2]
+ andc $C[2],$A[2][0],$A[2][4]
+ andc $C[3],$A[2][1],$A[2][0]
+ xor $A[2][0],$A[2][0],$C[0]
+ andc $C[0],$A[2][4],$A[2][3]
+ xor $A[2][1],$A[2][1],$C[1]
+ xor $A[2][3],$A[2][3],$C[2]
+ xor $A[2][4],$A[2][4],$C[3]
+ xor $A[2][2],$A[2][2],$C[0]
+
+ andc $C[0],$A[3][2],$A[3][1]
+ andc $C[1],$A[3][3],$A[3][2]
+ andc $C[2],$A[3][0],$A[3][4]
+ andc $C[3],$A[3][1],$A[3][0]
+ xor $A[3][0],$A[3][0],$C[0]
+ andc $C[0],$A[3][4],$A[3][3]
+ xor $A[3][1],$A[3][1],$C[1]
+ xor $A[3][3],$A[3][3],$C[2]
+ xor $A[3][4],$A[3][4],$C[3]
+ xor $A[3][2],$A[3][2],$C[0]
+
+ andc $C[0],$A[4][2],$A[4][1]
+ andc $C[1],$A[4][3],$A[4][2]
+ andc $C[2],$A[4][0],$A[4][4]
+ andc $C[3],$A[4][1],$A[4][0]
+ xor $A[4][0],$A[4][0],$C[0]
+ andc $C[0],$A[4][4],$A[4][3]
+ xor $A[4][1],$A[4][1],$C[1]
+ xor $A[4][3],$A[4][3],$C[2]
+ xor $A[4][4],$A[4][4],$C[3]
+ xor $A[4][2],$A[4][2],$C[0]
+
+ bdnz .Loop
+
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size KeccakF1600_int,.-KeccakF1600_int
+
+.type KeccakF1600,\@function
+.align 5
+KeccakF1600:
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+ $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
+ $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
+ $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
+ $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
+ $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
+ $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
+ $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
+ $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
+ $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
+ $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
+ $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
+ $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
+ $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ bl PICmeup
+ subi r12,r12,8 ; prepare for ldu
+
+ $PUSH r3,`$LOCALS+0*$SIZE_T`($sp)
+ ;$PUSH r4,`$LOCALS+1*$SIZE_T`($sp)
+ ;$PUSH r5,`$LOCALS+2*$SIZE_T`($sp)
+ ;$PUSH r6,`$LOCALS+3*$SIZE_T`($sp)
+ $PUSH r12,`$LOCALS+4*$SIZE_T`($sp)
+
+ ld $A[0][0],`8*0`(r3) ; load A[5][5]
+ ld $A[0][1],`8*1`(r3)
+ ld $A[0][2],`8*2`(r3)
+ ld $A[0][3],`8*3`(r3)
+ ld $A[0][4],`8*4`(r3)
+ ld $A[1][0],`8*5`(r3)
+ ld $A[1][1],`8*6`(r3)
+ ld $A[1][2],`8*7`(r3)
+ ld $A[1][3],`8*8`(r3)
+ ld $A[1][4],`8*9`(r3)
+ ld $A[2][0],`8*10`(r3)
+ ld $A[2][1],`8*11`(r3)
+ ld $A[2][2],`8*12`(r3)
+ ld $A[2][3],`8*13`(r3)
+ ld $A[2][4],`8*14`(r3)
+ ld $A[3][0],`8*15`(r3)
+ ld $A[3][1],`8*16`(r3)
+ ld $A[3][2],`8*17`(r3)
+ ld $A[3][3],`8*18`(r3)
+ ld $A[3][4],`8*19`(r3)
+ ld $A[4][0],`8*20`(r3)
+ ld $A[4][1],`8*21`(r3)
+ ld $A[4][2],`8*22`(r3)
+ ld $A[4][3],`8*23`(r3)
+ ld $A[4][4],`8*24`(r3)
+
+ bl KeccakF1600_int
+
+ $POP r3,`$LOCALS+0*$SIZE_T`($sp)
+ std $A[0][0],`8*0`(r3) ; return A[5][5]
+ std $A[0][1],`8*1`(r3)
+ std $A[0][2],`8*2`(r3)
+ std $A[0][3],`8*3`(r3)
+ std $A[0][4],`8*4`(r3)
+ std $A[1][0],`8*5`(r3)
+ std $A[1][1],`8*6`(r3)
+ std $A[1][2],`8*7`(r3)
+ std $A[1][3],`8*8`(r3)
+ std $A[1][4],`8*9`(r3)
+ std $A[2][0],`8*10`(r3)
+ std $A[2][1],`8*11`(r3)
+ std $A[2][2],`8*12`(r3)
+ std $A[2][3],`8*13`(r3)
+ std $A[2][4],`8*14`(r3)
+ std $A[3][0],`8*15`(r3)
+ std $A[3][1],`8*16`(r3)
+ std $A[3][2],`8*17`(r3)
+ std $A[3][3],`8*18`(r3)
+ std $A[3][4],`8*19`(r3)
+ std $A[4][0],`8*20`(r3)
+ std $A[4][1],`8*21`(r3)
+ std $A[4][2],`8*22`(r3)
+ std $A[4][3],`8*23`(r3)
+ std $A[4][4],`8*24`(r3)
+
+ $POP r0,`$FRAME+$LRSAVE`($sp)
+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,1,0
+ .long 0
+.size KeccakF1600,.-KeccakF1600
+
+.type dword_le_load,\@function
+.align 5
+dword_le_load:
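+	; assemble the next 8 input bytes (r3 is advanced by each lbzu) into a
+	; little-endian 64-bit lane in r0, so absorbing is endian-neutral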
+ lbzu r0,1(r3)
+ lbzu r4,1(r3)
+ lbzu r5,1(r3)
+ insrdi r0,r4,8,48
+ lbzu r4,1(r3)
+ insrdi r0,r5,8,40
+ lbzu r5,1(r3)
+ insrdi r0,r4,8,32
+ lbzu r4,1(r3)
+ insrdi r0,r5,8,24
+ lbzu r5,1(r3)
+ insrdi r0,r4,8,16
+ lbzu r4,1(r3)
+ insrdi r0,r5,8,8
+ insrdi r0,r4,8,0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,1,0
+ .long 0
+.size dword_le_load,.-dword_le_load
+
+.globl SHA3_absorb
+.type SHA3_absorb,\@function
+.align 5
+SHA3_absorb:
+ $STU $sp,-$FRAME($sp)
+ mflr r0
+ $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
+ $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
+ $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
+ $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
+ $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
+ $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
+ $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
+ $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
+ $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
+ $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
+ $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
+ $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
+ $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$FRAME+$LRSAVE`($sp)
+
+ bl PICmeup
+ subi r4,r4,1 ; prepare for lbzu
+ subi r12,r12,8 ; prepare for ldu
+
+ $PUSH r3,`$LOCALS+0*$SIZE_T`($sp) ; save A[][]
+ $PUSH r4,`$LOCALS+1*$SIZE_T`($sp) ; save inp
+ $PUSH r5,`$LOCALS+2*$SIZE_T`($sp) ; save len
+ $PUSH r6,`$LOCALS+3*$SIZE_T`($sp) ; save bsz
+ mr r0,r6
+ $PUSH r12,`$LOCALS+4*$SIZE_T`($sp)
+
+ ld $A[0][0],`8*0`(r3) ; load A[5][5]
+ ld $A[0][1],`8*1`(r3)
+ ld $A[0][2],`8*2`(r3)
+ ld $A[0][3],`8*3`(r3)
+ ld $A[0][4],`8*4`(r3)
+ ld $A[1][0],`8*5`(r3)
+ ld $A[1][1],`8*6`(r3)
+ ld $A[1][2],`8*7`(r3)
+ ld $A[1][3],`8*8`(r3)
+ ld $A[1][4],`8*9`(r3)
+ ld $A[2][0],`8*10`(r3)
+ ld $A[2][1],`8*11`(r3)
+ ld $A[2][2],`8*12`(r3)
+ ld $A[2][3],`8*13`(r3)
+ ld $A[2][4],`8*14`(r3)
+ ld $A[3][0],`8*15`(r3)
+ ld $A[3][1],`8*16`(r3)
+ ld $A[3][2],`8*17`(r3)
+ ld $A[3][3],`8*18`(r3)
+ ld $A[3][4],`8*19`(r3)
+ ld $A[4][0],`8*20`(r3)
+ ld $A[4][1],`8*21`(r3)
+ ld $A[4][2],`8*22`(r3)
+ ld $A[4][3],`8*23`(r3)
+ ld $A[4][4],`8*24`(r3)
+
+ mr r3,r4
+ mr r4,r5
+ mr r5,r0
+
+ b .Loop_absorb
+
+.align 4
+.Loop_absorb:
+ $UCMP r4,r5 ; len < bsz?
+ blt .Labsorbed
+
+ sub r4,r4,r5 ; len -= bsz
+ srwi r5,r5,3
+ $PUSH r4,`$LOCALS+2*$SIZE_T`($sp) ; save len
+ mtctr r5
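+	; ctr holds bsz/8; each dword_le_load/xor pair below absorbs one lane,
+	; and bdz drops into the permutation once the whole block is in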
+ bl dword_le_load ; *inp++
+ xor $A[0][0],$A[0][0],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[0][1],$A[0][1],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[0][2],$A[0][2],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[0][3],$A[0][3],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[0][4],$A[0][4],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[1][0],$A[1][0],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[1][1],$A[1][1],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[1][2],$A[1][2],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[1][3],$A[1][3],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[1][4],$A[1][4],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[2][0],$A[2][0],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[2][1],$A[2][1],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[2][2],$A[2][2],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[2][3],$A[2][3],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[2][4],$A[2][4],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[3][0],$A[3][0],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[3][1],$A[3][1],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[3][2],$A[3][2],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[3][3],$A[3][3],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[3][4],$A[3][4],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[4][0],$A[4][0],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[4][1],$A[4][1],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[4][2],$A[4][2],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[4][3],$A[4][3],r0
+ bdz .Lprocess_block
+ bl dword_le_load ; *inp++
+ xor $A[4][4],$A[4][4],r0
+
+.Lprocess_block:
+ $PUSH r3,`$LOCALS+1*$SIZE_T`($sp) ; save inp
+
+ bl KeccakF1600_int
+
+ $POP r0,`$LOCALS+4*$SIZE_T`($sp) ; pull iotas[24]
+ $POP r5,`$LOCALS+3*$SIZE_T`($sp) ; restore bsz
+ $POP r4,`$LOCALS+2*$SIZE_T`($sp) ; restore len
+ $POP r3,`$LOCALS+1*$SIZE_T`($sp) ; restore inp
+ addic r0,r0,`-8*24` ; rewind iotas
+ $PUSH r0,`$LOCALS+4*$SIZE_T`($sp)
+
+ b .Loop_absorb
+
+.align 4
+.Labsorbed:
+ $POP r3,`$LOCALS+0*$SIZE_T`($sp)
+ std $A[0][0],`8*0`(r3) ; return A[5][5]
+ std $A[0][1],`8*1`(r3)
+ std $A[0][2],`8*2`(r3)
+ std $A[0][3],`8*3`(r3)
+ std $A[0][4],`8*4`(r3)
+ std $A[1][0],`8*5`(r3)
+ std $A[1][1],`8*6`(r3)
+ std $A[1][2],`8*7`(r3)
+ std $A[1][3],`8*8`(r3)
+ std $A[1][4],`8*9`(r3)
+ std $A[2][0],`8*10`(r3)
+ std $A[2][1],`8*11`(r3)
+ std $A[2][2],`8*12`(r3)
+ std $A[2][3],`8*13`(r3)
+ std $A[2][4],`8*14`(r3)
+ std $A[3][0],`8*15`(r3)
+ std $A[3][1],`8*16`(r3)
+ std $A[3][2],`8*17`(r3)
+ std $A[3][3],`8*18`(r3)
+ std $A[3][4],`8*19`(r3)
+ std $A[4][0],`8*20`(r3)
+ std $A[4][1],`8*21`(r3)
+ std $A[4][2],`8*22`(r3)
+ std $A[4][3],`8*23`(r3)
+ std $A[4][4],`8*24`(r3)
+
+ mr r3,r4 ; return value
+ $POP r0,`$FRAME+$LRSAVE`($sp)
+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
+ mtlr r0
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,18,4,0
+ .long 0
+.size SHA3_absorb,.-SHA3_absorb
+___
+{
+my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
+$code.=<<___;
+.globl SHA3_squeeze
+.type SHA3_squeeze,\@function
+.align 5
+SHA3_squeeze:
+ $STU $sp,`-10*$SIZE_T`($sp)
+ mflr r0
+ $PUSH r28,`6*$SIZE_T`($sp)
+ $PUSH r29,`7*$SIZE_T`($sp)
+ $PUSH r30,`8*$SIZE_T`($sp)
+ $PUSH r31,`9*$SIZE_T`($sp)
+ $PUSH r0,`10*$SIZE_T+$LRSAVE`($sp)
+
+ mr $A_flat,r3
+ subi r3,r3,8 ; prepare for ldu
+ subi $out,r4,1 ; prepare for stbu
+ mr $len,r5
+ mr $bsz,r6
+ b .Loop_squeeze
+
+.align 4
+.Loop_squeeze:
+ ldu r0,8(r3)
+ ${UCMP}i $len,8
+ blt .Lsqueeze_tail
+
+ stbu r0,1($out)
+ srdi r0,r0,8
+ stbu r0,1($out)
+ srdi r0,r0,8
+ stbu r0,1($out)
+ srdi r0,r0,8
+ stbu r0,1($out)
+ srdi r0,r0,8
+ stbu r0,1($out)
+ srdi r0,r0,8
+ stbu r0,1($out)
+ srdi r0,r0,8
+ stbu r0,1($out)
+ srdi r0,r0,8
+ stbu r0,1($out)
+
+ subic. $len,$len,8
+ beq .Lsqueeze_done
+
+ subic. r6,r6,8
+ bgt .Loop_squeeze
+
+ mr r3,$A_flat
+ bl KeccakF1600
+ subi r3,$A_flat,8 ; prepare for ldu
+ mr r6,$bsz
+ b .Loop_squeeze
+
+.align 4
+.Lsqueeze_tail:
+ mtctr $len
+.Loop_tail:
+ stbu r0,1($out)
+ srdi r0,r0,8
+ bdnz .Loop_tail
+
+.Lsqueeze_done:
+ $POP r0,`10*$SIZE_T+$LRSAVE`($sp)
+ $POP r28,`6*$SIZE_T`($sp)
+ $POP r29,`7*$SIZE_T`($sp)
+ $POP r30,`8*$SIZE_T`($sp)
+ $POP r31,`9*$SIZE_T`($sp)
+ mtlr r0
+ addi $sp,$sp,`10*$SIZE_T`
+ blr
+ .long 0
+ .byte 0,12,4,1,0x80,4,4,0
+ .long 0
+.size SHA3_squeeze,.-SHA3_squeeze
+___
+}
+
+# Ugly hack here, because PPC assembler syntax seems to vary too
+# much from platform to platform...
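+#
+# PICmeup returns the address of the data that follows it in r12: bcl sets
+# the link register to the address of the next instruction, and the
+# hard-coded offset covers the rest of the 64-byte-aligned block, leaving
+# r12 at the first data entry (here the iotas table).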
+$code.=<<___;
+.align 6
+PICmeup:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr r12 ; vvvvvv "distance" between . and 1st data entry
+ addi r12,r12,`64-8`
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
+.type iotas,\@object
+iotas:
+ .quad 0x0000000000000001
+ .quad 0x0000000000008082
+ .quad 0x800000000000808a
+ .quad 0x8000000080008000
+ .quad 0x000000000000808b
+ .quad 0x0000000080000001
+ .quad 0x8000000080008081
+ .quad 0x8000000000008009
+ .quad 0x000000000000008a
+ .quad 0x0000000000000088
+ .quad 0x0000000080008009
+ .quad 0x000000008000000a
+ .quad 0x000000008000808b
+ .quad 0x800000000000008b
+ .quad 0x8000000000008089
+ .quad 0x8000000000008003
+ .quad 0x8000000000008002
+ .quad 0x8000000000000080
+ .quad 0x000000000000800a
+ .quad 0x800000008000000a
+ .quad 0x8000000080008081
+ .quad 0x8000000000008080
+ .quad 0x0000000080000001
+ .quad 0x8000000080008008
+.size iotas,.-iotas
+.asciz "Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/keccak1600-s390x.pl b/crypto/sha/asm/keccak1600-s390x.pl
new file mode 100755
index 000000000000..3bce19be9ea4
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-s390x.pl
@@ -0,0 +1,560 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for s390x.
+#
+# June 2017.
+#
+# The code below is a [lane complementing] KECCAK_2X implementation (see
+# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though
+# instead of actually unrolling the loop pair-wise, I simply flip the
+# pointers to T[][] and A[][] at the end of each round. Since the number
+# of rounds is even, the last round writes to A[][] and everything works
+# out. In a nutshell it's a transliteration of the x86_64 module, because
+# both architectures have similar capabilities/limitations. Performance
+# measurement is problematic as I don't have access to an idle system.
+# It looks like z13 processes one byte [out of a long message] in ~14
+# cycles. At least the result is consistent with an estimate based on
+# instruction count and assumed instruction issue rate. It's ~2.5x
+# faster than compiler-generated code.
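+#
+# For reference, a rough C sketch of the two-buffer round structure referred
+# to above (simplified; Round() and the names are illustrative only, not the
+# actual keccak1600.c interface):
+#
+#	uint64_t A[25], T[25], *src = A, *dst = T, *tmp;
+#	for (int i = 0; i < 24; i++) {
+#		Round(dst, src, iotas[i]);	/* read src, write dst       */
+#		tmp = src; src = dst; dst = tmp;/* flip instead of unrolling */
+#	}
+#	/* 24 is even, so the final result lands back in A[] */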
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+ $SIZE_T=4;
+ $g="";
+} else {
+ $SIZE_T=8;
+ $g="g";
+}
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
+
+my @C = map("%r$_",(0,1,5..7));
+my @D = map("%r$_",(8..12));
+my @T = map("%r$_",(13..14));
+my ($src,$dst,$iotas) = map("%r$_",(2..4));
+my $sp = "%r15";
+
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+25*8;
+
+my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 36, 44, 6, 55, 20 ],
+ [ 3, 10, 43, 25, 39 ],
+ [ 41, 45, 15, 21, 8 ],
+ [ 18, 2, 61, 56, 14 ]);
+
+{ my @C = @C; # copy, because we mess them up...
+ my @D = @D;
+
+$code.=<<___;
+.text
+
+.type __KeccakF1600,\@function
+.align 32
+__KeccakF1600:
+ st${g} %r14,$SIZE_T*14($sp)
+ lg @C[0],$A[4][0]($src)
+ lg @C[1],$A[4][1]($src)
+ lg @C[2],$A[4][2]($src)
+ lg @C[3],$A[4][3]($src)
+ lg @C[4],$A[4][4]($src)
+ larl $iotas,iotas
+ j .Loop
+
+.align 16
+.Loop:
+ lg @D[0],$A[0][0]($src)
+ lg @D[1],$A[1][1]($src)
+ lg @D[2],$A[2][2]($src)
+ lg @D[3],$A[3][3]($src)
+
+ xgr @C[0],@D[0]
+ xg @C[1],$A[0][1]($src)
+ xg @C[2],$A[0][2]($src)
+ xg @C[3],$A[0][3]($src)
+ lgr @D[4],@C[4]
+ xg @C[4],$A[0][4]($src)
+
+ xg @C[0],$A[1][0]($src)
+ xgr @C[1],@D[1]
+ xg @C[2],$A[1][2]($src)
+ xg @C[3],$A[1][3]($src)
+ xg @C[4],$A[1][4]($src)
+
+ xg @C[0],$A[2][0]($src)
+ xg @C[1],$A[2][1]($src)
+ xgr @C[2],@D[2]
+ xg @C[3],$A[2][3]($src)
+ xg @C[4],$A[2][4]($src)
+
+ xg @C[0],$A[3][0]($src)
+ xg @C[1],$A[3][1]($src)
+ xg @C[2],$A[3][2]($src)
+ xgr @C[3],@D[3]
+ xg @C[4],$A[3][4]($src)
+
+ lgr @T[0],@C[2]
+ rllg @C[2],@C[2],1
+ xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0]
+
+ rllg @C[0],@C[0],1
+ xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3]
+
+ rllg @C[3],@C[3],1
+ xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1]
+
+ rllg @C[1],@C[1],1
+ xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4]
+
+ rllg @C[4],@C[4],1
+ xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2]
+___
+ (@D[0..4], @C) = (@C[1..4,0], @D);
+$code.=<<___;
+ xgr @C[1],@D[1]
+ xgr @C[2],@D[2]
+ xgr @C[3],@D[3]
+ rllg @C[1],@C[1],$rhotates[1][1]
+ xgr @C[4],@D[4]
+ rllg @C[2],@C[2],$rhotates[2][2]
+ xgr @C[0],@D[0]
+
+ lgr @T[0],@C[1]
+ ogr @C[1],@C[2]
+ rllg @C[3],@C[3],$rhotates[3][3]
+ xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2])
+ rllg @C[4],@C[4],$rhotates[4][4]
+ xg @C[1],0($iotas)
+ la $iotas,8($iotas)
+ stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
+
+ lgr @T[1],@C[4]
+ ngr @C[4],@C[3]
+ lghi @C[1],-1 # no 'not' instruction :-(
+ xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3])
+ xgr @C[2],@C[1] # not @C[2]
+ stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3])
+ ogr @C[2],@C[3]
+ xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3])
+
+ ngr @T[0],@C[0]
+ stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3])
+ xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0])
+ ogr @T[1],@C[0]
+ stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0])
+ xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0])
+ stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0])
+
+
+ lg @C[0],$A[0][3]($src)
+ lg @C[4],$A[4][2]($src)
+ lg @C[3],$A[3][1]($src)
+ lg @C[1],$A[1][4]($src)
+ lg @C[2],$A[2][0]($src)
+
+ xgr @C[0],@D[3]
+ xgr @C[4],@D[2]
+ rllg @C[0],@C[0],$rhotates[0][3]
+ xgr @C[3],@D[1]
+ rllg @C[4],@C[4],$rhotates[4][2]
+ xgr @C[1],@D[4]
+ rllg @C[3],@C[3],$rhotates[3][1]
+ xgr @C[2],@D[0]
+
+ lgr @T[0],@C[0]
+ ogr @C[0],@C[4]
+ rllg @C[1],@C[1],$rhotates[1][4]
+ xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4])
+ rllg @C[2],@C[2],$rhotates[2][0]
+ stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4])
+
+ lgr @T[1],@C[1]
+ ngr @C[1],@T[0]
+ lghi @C[0],-1 # no 'not' instruction :-(
+ xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0])
+ xgr @C[4],@C[0] # not @C[4]
+ stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0])
+
+ ogr @C[4],@C[3]
+ xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3])
+
+ ngr @C[3],@C[2]
+ stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3])
+ xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2])
+ ogr @T[1],@C[2]
+ stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2])
+ xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2])
+ stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2])
+
+
+ lg @C[2],$A[2][3]($src)
+ lg @C[3],$A[3][4]($src)
+ lg @C[1],$A[1][2]($src)
+ lg @C[4],$A[4][0]($src)
+ lg @C[0],$A[0][1]($src)
+
+ xgr @C[2],@D[3]
+ xgr @C[3],@D[4]
+ rllg @C[2],@C[2],$rhotates[2][3]
+ xgr @C[1],@D[2]
+ rllg @C[3],@C[3],$rhotates[3][4]
+ xgr @C[4],@D[0]
+ rllg @C[1],@C[1],$rhotates[1][2]
+ xgr @C[0],@D[1]
+
+ lgr @T[0],@C[2]
+ ngr @C[2],@C[3]
+ rllg @C[4],@C[4],$rhotates[4][0]
+ xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3])
+ lghi @T[1],-1 # no 'not' instruction :-(
+ stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3])
+
+ xgr @C[3],@T[1] # not @C[3]
+ lgr @T[1],@C[4]
+ ngr @C[4],@C[3]
+ rllg @C[0],@C[0],$rhotates[0][1]
+ xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3])
+ ogr @T[0],@C[1]
+ stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
+ xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1])
+
+ ngr @C[1],@C[0]
+ stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1])
+ xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0])
+ ogr @C[0],@T[1]
+ stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0])
+ xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4])
+ stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
+
+
+ lg @C[2],$A[2][1]($src)
+ lg @C[3],$A[3][2]($src)
+ lg @C[1],$A[1][0]($src)
+ lg @C[4],$A[4][3]($src)
+ lg @C[0],$A[0][4]($src)
+
+ xgr @C[2],@D[1]
+ xgr @C[3],@D[2]
+ rllg @C[2],@C[2],$rhotates[2][1]
+ xgr @C[1],@D[0]
+ rllg @C[3],@C[3],$rhotates[3][2]
+ xgr @C[4],@D[3]
+ rllg @C[1],@C[1],$rhotates[1][0]
+ xgr @C[0],@D[4]
+ rllg @C[4],@C[4],$rhotates[4][3]
+
+ lgr @T[0],@C[2]
+ ogr @C[2],@C[3]
+ lghi @T[1],-1 # no 'not' instruction :-(
+ xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3])
+ xgr @C[3],@T[1] # not @C[3]
+ stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3])
+
+ lgr @T[1],@C[4]
+ ogr @C[4],@C[3]
+ rllg @C[0],@C[0],$rhotates[0][4]
+ xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3])
+ ngr @T[0],@C[1]
+ stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
+ xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1])
+
+ ogr @C[1],@C[0]
+ stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1])
+ xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0])
+ ngr @C[0],@T[1]
+ stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0])
+ xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4])
+ stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
+
+
+ xg @D[2],$A[0][2]($src)
+ xg @D[3],$A[1][3]($src)
+ xg @D[1],$A[4][1]($src)
+ xg @D[4],$A[2][4]($src)
+ xgr $dst,$src # xchg $dst,$src
+ rllg @D[2],@D[2],$rhotates[0][2]
+ xg @D[0],$A[3][0]($src)
+ rllg @D[3],@D[3],$rhotates[1][3]
+ xgr $src,$dst
+ rllg @D[1],@D[1],$rhotates[4][1]
+ xgr $dst,$src
+ rllg @D[4],@D[4],$rhotates[2][4]
+___
+ @C = @D[2..4,0,1];
+$code.=<<___;
+ lgr @T[0],@C[0]
+ ngr @C[0],@C[1]
+ lghi @T[1],-1 # no 'not' instruction :-(
+ xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1])
+ xgr @C[1],@T[1] # not @C[1]
+ stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1])
+
+ lgr @T[1],@C[2]
+ ngr @C[2],@C[1]
+ rllg @D[0],@D[0],$rhotates[3][0]
+ xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1])
+ ogr @T[0],@C[4]
+ stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
+ xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4])
+
+ ngr @C[4],@C[3]
+ stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4])
+ xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3])
+ ogr @C[3],@T[1]
+ stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3])
+ xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3])
+
+ lgr @C[1],@C[0] # harmonize with the loop top
+ lgr @C[0],@T[0]
+ stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
+
+ tmll $iotas,255
+ jnz .Loop
+
+ l${g} %r14,$SIZE_T*14($sp)
+ br %r14
+.size __KeccakF1600,.-__KeccakF1600
+___
+}
+{
+$code.=<<___;
+.type KeccakF1600,\@function
+.align 32
+KeccakF1600:
+.LKeccakF1600:
+ lghi %r1,-$frame
+ stm${g} %r6,%r15,$SIZE_T*6($sp)
+ lgr %r0,$sp
+ la $sp,0(%r1,$sp)
+ st${g} %r0,0($sp)
+
+ lghi @D[0],-1 # no 'not' instruction :-(
+ lghi @D[1],-1
+ lghi @D[2],-1
+ lghi @D[3],-1
+ lghi @D[4],-1
+ lghi @T[0],-1
+ xg @D[0],$A[0][1]($src)
+ xg @D[1],$A[0][2]($src)
+ xg @D[2],$A[1][3]($src)
+ xg @D[3],$A[2][2]($src)
+ xg @D[4],$A[3][2]($src)
+ xg @T[0],$A[4][0]($src)
+ stmg @D[0],@D[1],$A[0][1]($src)
+ stg @D[2],$A[1][3]($src)
+ stg @D[3],$A[2][2]($src)
+ stg @D[4],$A[3][2]($src)
+ stg @T[0],$A[4][0]($src)
+
+ la $dst,$stdframe($sp)
+
+ bras %r14,__KeccakF1600
+
+ lghi @D[0],-1 # no 'not' instruction :-(
+ lghi @D[1],-1
+ lghi @D[2],-1
+ lghi @D[3],-1
+ lghi @D[4],-1
+ lghi @T[0],-1
+ xg @D[0],$A[0][1]($src)
+ xg @D[1],$A[0][2]($src)
+ xg @D[2],$A[1][3]($src)
+ xg @D[3],$A[2][2]($src)
+ xg @D[4],$A[3][2]($src)
+ xg @T[0],$A[4][0]($src)
+ stmg @D[0],@D[1],$A[0][1]($src)
+ stg @D[2],$A[1][3]($src)
+ stg @D[3],$A[2][2]($src)
+ stg @D[4],$A[3][2]($src)
+ stg @T[0],$A[4][0]($src)
+
+ lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
+ br %r14
+.size KeccakF1600,.-KeccakF1600
+___
+}
+{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
+
+$code.=<<___;
+.globl SHA3_absorb
+.type SHA3_absorb,\@function
+.align 32
+SHA3_absorb:
+ lghi %r1,-$frame
+ stm${g} %r5,%r15,$SIZE_T*5($sp)
+ lgr %r0,$sp
+ la $sp,0(%r1,$sp)
+ st${g} %r0,0($sp)
+
+ lghi @D[0],-1 # no 'not' instruction :-(
+ lghi @D[1],-1
+ lghi @D[2],-1
+ lghi @D[3],-1
+ lghi @D[4],-1
+ lghi @T[0],-1
+ xg @D[0],$A[0][1]($src)
+ xg @D[1],$A[0][2]($src)
+ xg @D[2],$A[1][3]($src)
+ xg @D[3],$A[2][2]($src)
+ xg @D[4],$A[3][2]($src)
+ xg @T[0],$A[4][0]($src)
+ stmg @D[0],@D[1],$A[0][1]($src)
+ stg @D[2],$A[1][3]($src)
+ stg @D[3],$A[2][2]($src)
+ stg @D[4],$A[3][2]($src)
+ stg @T[0],$A[4][0]($src)
+
+.Loop_absorb:
+ cl${g}r $len,$bsz
+ jl .Ldone_absorb
+
+ srl${g} $bsz,3
+ la %r1,0($A_flat)
+
+.Lblock_absorb:
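+	# lrvg reads the next 8 input bytes byte-reversed, i.e. as a
+	# little-endian lane, before it is xor-ed into the state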
+ lrvg %r0,0($inp)
+ la $inp,8($inp)
+ xg %r0,0(%r1)
+ la %r1,8(%r1)
+ a${g}hi $len,-8
+ stg %r0,-8(%r1)
+ brct $bsz,.Lblock_absorb
+
+ stm${g} $inp,$len,$frame+3*$SIZE_T($sp)
+ la $dst,$stdframe($sp)
+ bras %r14,__KeccakF1600
+ lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp)
+ j .Loop_absorb
+
+.align 16
+.Ldone_absorb:
+ lghi @D[0],-1 # no 'not' instruction :-(
+ lghi @D[1],-1
+ lghi @D[2],-1
+ lghi @D[3],-1
+ lghi @D[4],-1
+ lghi @T[0],-1
+ xg @D[0],$A[0][1]($src)
+ xg @D[1],$A[0][2]($src)
+ xg @D[2],$A[1][3]($src)
+ xg @D[3],$A[2][2]($src)
+ xg @D[4],$A[3][2]($src)
+ xg @T[0],$A[4][0]($src)
+ stmg @D[0],@D[1],$A[0][1]($src)
+ stg @D[2],$A[1][3]($src)
+ stg @D[3],$A[2][2]($src)
+ stg @D[4],$A[3][2]($src)
+ stg @T[0],$A[4][0]($src)
+
+ lgr %r2,$len # return value
+
+ lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
+ br %r14
+.size SHA3_absorb,.-SHA3_absorb
+___
+}
+{ my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
+
+$code.=<<___;
+.globl SHA3_squeeze
+.type SHA3_squeeze,\@function
+.align 32
+SHA3_squeeze:
+ srl${g} $bsz,3
+ st${g} %r14,2*$SIZE_T($sp)
+ lghi %r14,8
+ st${g} $bsz,5*$SIZE_T($sp)
+ la %r1,0($A_flat)
+
+ j .Loop_squeeze
+
+.align 16
+.Loop_squeeze:
+ cl${g}r $len,%r14
+ jl .Ltail_squeeze
+
+ lrvg %r0,0(%r1)
+ la %r1,8(%r1)
+ stg %r0,0($out)
+ la $out,8($out)
+ a${g}hi $len,-8 # len -= 8
+ jz .Ldone_squeeze
+
+ brct $bsz,.Loop_squeeze # bsz--
+
+ stm${g} $out,$len,3*$SIZE_T($sp)
+ bras %r14,.LKeccakF1600
+ lm${g} $out,$bsz,3*$SIZE_T($sp)
+ lghi %r14,8
+ la %r1,0($A_flat)
+ j .Loop_squeeze
+
+.Ltail_squeeze:
+ lg %r0,0(%r1)
+.Loop_tail_squeeze:
+ stc %r0,0($out)
+ la $out,1($out)
+ srlg %r0,8
+ brct $len,.Loop_tail_squeeze
+
+.Ldone_squeeze:
+ l${g} %r14,2*$SIZE_T($sp)
+ br %r14
+.size SHA3_squeeze,.-SHA3_squeeze
+___
+}
+$code.=<<___;
+.align 256
+ .quad 0,0,0,0,0,0,0,0
+.type iotas,\@object
+iotas:
+ .quad 0x0000000000000001
+ .quad 0x0000000000008082
+ .quad 0x800000000000808a
+ .quad 0x8000000080008000
+ .quad 0x000000000000808b
+ .quad 0x0000000080000001
+ .quad 0x8000000080008081
+ .quad 0x8000000000008009
+ .quad 0x000000000000008a
+ .quad 0x0000000000000088
+ .quad 0x0000000080008009
+ .quad 0x000000008000000a
+ .quad 0x000000008000808b
+ .quad 0x800000000000008b
+ .quad 0x8000000000008089
+ .quad 0x8000000000008003
+ .quad 0x8000000000008002
+ .quad 0x8000000000000080
+ .quad 0x000000000000800a
+ .quad 0x800000008000000a
+ .quad 0x8000000080008081
+ .quad 0x8000000000008080
+ .quad 0x0000000080000001
+ .quad 0x8000000080008008
+.size iotas,.-iotas
+.asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# unlike the 32-bit shift, the 64-bit one takes three arguments
+$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/keccak1600-x86_64.pl b/crypto/sha/asm/keccak1600-x86_64.pl
new file mode 100755
index 000000000000..42de5bf12344
--- /dev/null
+++ b/crypto/sha/asm/keccak1600-x86_64.pl
@@ -0,0 +1,607 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for x86_64.
+#
+# June 2017.
+#
+# The code below is a [lane complementing] KECCAK_2X implementation (see
+# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though
+# instead of actually unrolling the loop pair-wise, I simply flip the
+# pointers to T[][] and A[][] at the end of each round. Since the number
+# of rounds is even, the last round writes to A[][] and everything works
+# out. How does it compare to the x86_64 assembly module in the Keccak
+# Code Package? Depending on the processor it's either as fast or faster
+# by up to 15%...
+#
+########################################################################
+# Numbers are cycles per processed byte out of a large message.
+#
+# r=1088(*)
+#
+# P4 25.8
+# Core 2 12.9
+# Westmere 13.7
+# Sandy Bridge 12.9(**)
+# Haswell 9.6
+# Skylake 9.4
+# Silvermont 22.8
+# Goldmont 15.8
+# VIA Nano 17.3
+# Sledgehammer 13.3
+# Bulldozer 16.5
+# Ryzen 8.8
+#
+# (*)	Corresponds to SHA3-256. Improvement over compiler-generated code
+#	varies a lot; the most common coefficient is 15% in comparison to
+#	gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
+# (**)	Sandy Bridge has a broken rotate instruction. Performance can be
+#	improved by 14% by replacing rotates with a double-precision
+#	shift with the same register as source and destination.
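+#
+# On "lane complementing": six lanes of A[][] are kept in bitwise-complemented
+# form (the notq's applied on entry to and exit from KeccakF1600/SHA3_absorb
+# below), which cuts down the NOT operations needed in the Chi step: most
+# terms reduce to a plain AND or OR plus XOR. A rough sketch of the
+# identities being exploited (illustrative only):
+#
+#	 r = a ^ (~b & c)	# plain Chi term
+#	   = a ^ (b' & c)	# when b is stored as b' = ~b
+#	~r = a ^ (b | c')	# when c and the result are stored complemented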
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
+ 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
+
+my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
+my @D = map("%r$_",(8..12));
+my @T = map("%r$_",(13..14));
+my $iotas = "%r15";
+
+my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 36, 44, 6, 55, 20 ],
+ [ 3, 10, 43, 25, 39 ],
+ [ 41, 45, 15, 21, 8 ],
+ [ 18, 2, 61, 56, 14 ]);
+
+$code.=<<___;
+.text
+
+.type __KeccakF1600,\@abi-omnipotent
+.align 32
+__KeccakF1600:
+ mov $A[4][0](%rdi),@C[0]
+ mov $A[4][1](%rdi),@C[1]
+ mov $A[4][2](%rdi),@C[2]
+ mov $A[4][3](%rdi),@C[3]
+ mov $A[4][4](%rdi),@C[4]
+ jmp .Loop
+
+.align 32
+.Loop:
+ mov $A[0][0](%rdi),@D[0]
+ mov $A[1][1](%rdi),@D[1]
+ mov $A[2][2](%rdi),@D[2]
+ mov $A[3][3](%rdi),@D[3]
+
+ xor $A[0][2](%rdi),@C[2]
+ xor $A[0][3](%rdi),@C[3]
+ xor @D[0], @C[0]
+ xor $A[0][1](%rdi),@C[1]
+ xor $A[1][2](%rdi),@C[2]
+ xor $A[1][0](%rdi),@C[0]
+ mov @C[4],@D[4]
+ xor $A[0][4](%rdi),@C[4]
+
+ xor @D[2], @C[2]
+ xor $A[2][0](%rdi),@C[0]
+ xor $A[1][3](%rdi),@C[3]
+ xor @D[1], @C[1]
+ xor $A[1][4](%rdi),@C[4]
+
+ xor $A[3][2](%rdi),@C[2]
+ xor $A[3][0](%rdi),@C[0]
+ xor $A[2][3](%rdi),@C[3]
+ xor $A[2][1](%rdi),@C[1]
+ xor $A[2][4](%rdi),@C[4]
+
+ mov @C[2],@T[0]
+ rol \$1,@C[2]
+ xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
+ xor @D[3], @C[3]
+
+ rol \$1,@C[0]
+ xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
+ xor $A[3][1](%rdi),@C[1]
+
+ rol \$1,@C[3]
+ xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
+ xor $A[3][4](%rdi),@C[4]
+
+ rol \$1,@C[1]
+ xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
+
+ rol \$1,@C[4]
+ xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
+___
+ (@D[0..4], @C) = (@C[1..4,0], @D);
+$code.=<<___;
+ xor @D[1],@C[1]
+ xor @D[2],@C[2]
+ rol \$$rhotates[1][1],@C[1]
+ xor @D[3],@C[3]
+ xor @D[4],@C[4]
+ rol \$$rhotates[2][2],@C[2]
+ xor @D[0],@C[0]
+ mov @C[1],@T[0]
+ rol \$$rhotates[3][3],@C[3]
+ or @C[2],@C[1]
+ xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
+ rol \$$rhotates[4][4],@C[4]
+
+ xor ($iotas),@C[1]
+ lea 8($iotas),$iotas
+
+ mov @C[4],@T[1]
+ and @C[3],@C[4]
+ mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
+ xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
+ not @C[2]
+ mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
+
+ or @C[3],@C[2]
+ mov $A[4][2](%rdi),@C[4]
+ xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
+ mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
+
+ and @C[0],@T[0]
+ mov $A[1][4](%rdi),@C[1]
+ xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
+ mov $A[2][0](%rdi),@C[2]
+ mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
+
+ or @C[0],@T[1]
+ mov $A[0][3](%rdi),@C[0]
+ xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
+ mov $A[3][1](%rdi),@C[3]
+ mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
+
+
+ xor @D[3],@C[0]
+ xor @D[2],@C[4]
+ rol \$$rhotates[0][3],@C[0]
+ xor @D[1],@C[3]
+ xor @D[4],@C[1]
+ rol \$$rhotates[4][2],@C[4]
+ rol \$$rhotates[3][1],@C[3]
+ xor @D[0],@C[2]
+ rol \$$rhotates[1][4],@C[1]
+ mov @C[0],@T[0]
+ or @C[4],@C[0]
+ rol \$$rhotates[2][0],@C[2]
+
+ xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
+ mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
+
+ mov @C[1],@T[1]
+ and @T[0],@C[1]
+ mov $A[0][1](%rdi),@C[0]
+ xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
+ not @C[4]
+ mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
+
+ or @C[3],@C[4]
+ mov $A[1][2](%rdi),@C[1]
+ xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
+ mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
+
+ and @C[2],@C[3]
+ mov $A[4][0](%rdi),@C[4]
+ xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
+ mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
+
+ or @C[2],@T[1]
+ mov $A[2][3](%rdi),@C[2]
+ xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
+ mov $A[3][4](%rdi),@C[3]
+ mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
+
+
+ xor @D[3],@C[2]
+ xor @D[4],@C[3]
+ rol \$$rhotates[2][3],@C[2]
+ xor @D[2],@C[1]
+ rol \$$rhotates[3][4],@C[3]
+ xor @D[0],@C[4]
+ rol \$$rhotates[1][2],@C[1]
+ xor @D[1],@C[0]
+ rol \$$rhotates[4][0],@C[4]
+ mov @C[2],@T[0]
+ and @C[3],@C[2]
+ rol \$$rhotates[0][1],@C[0]
+
+ not @C[3]
+ xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
+ mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
+
+ mov @C[4],@T[1]
+ and @C[3],@C[4]
+ mov $A[2][1](%rdi),@C[2]
+ xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
+ mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
+
+ or @C[1],@T[0]
+ mov $A[4][3](%rdi),@C[4]
+ xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
+ mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
+
+ and @C[0],@C[1]
+ xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
+ mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
+
+ or @C[0],@T[1]
+ mov $A[1][0](%rdi),@C[1]
+ xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
+ mov $A[3][2](%rdi),@C[3]
+ mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
+
+
+ mov $A[0][4](%rdi),@C[0]
+
+ xor @D[1],@C[2]
+ xor @D[2],@C[3]
+ rol \$$rhotates[2][1],@C[2]
+ xor @D[0],@C[1]
+ rol \$$rhotates[3][2],@C[3]
+ xor @D[3],@C[4]
+ rol \$$rhotates[1][0],@C[1]
+ xor @D[4],@C[0]
+ rol \$$rhotates[4][3],@C[4]
+ mov @C[2],@T[0]
+ or @C[3],@C[2]
+ rol \$$rhotates[0][4],@C[0]
+
+ not @C[3]
+ xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
+ mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
+
+ mov @C[4],@T[1]
+ or @C[3],@C[4]
+ xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
+ mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
+
+ and @C[1],@T[0]
+ xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
+ mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
+
+ or @C[0],@C[1]
+ xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
+ mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
+
+ and @T[1],@C[0]
+ xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
+ mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
+
+
+ xor $A[0][2](%rdi),@D[2]
+ xor $A[1][3](%rdi),@D[3]
+ rol \$$rhotates[0][2],@D[2]
+ xor $A[4][1](%rdi),@D[1]
+ rol \$$rhotates[1][3],@D[3]
+ xor $A[2][4](%rdi),@D[4]
+ rol \$$rhotates[4][1],@D[1]
+ xor $A[3][0](%rdi),@D[0]
+ xchg %rsi,%rdi
+ rol \$$rhotates[2][4],@D[4]
+ rol \$$rhotates[3][0],@D[0]
+___
+ @C = @D[2..4,0,1];
+$code.=<<___;
+ mov @C[0],@T[0]
+ and @C[1],@C[0]
+ not @C[1]
+ xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
+ mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
+
+ mov @C[2],@T[1]
+ and @C[1],@C[2]
+ xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
+ mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
+
+ or @C[4],@T[0]
+ xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
+ mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
+
+ and @C[3],@C[4]
+ xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
+ mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
+
+ or @T[1],@C[3]
+ xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
+ mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
+
+ mov @C[0],@C[1] # harmonize with the loop top
+ mov @T[0],@C[0]
+
+ test \$255,$iotas
+ jnz .Loop
+
+ lea -192($iotas),$iotas # rewind iotas
+ ret
+.size __KeccakF1600,.-__KeccakF1600
+
+.type KeccakF1600,\@abi-omnipotent
+.align 32
+KeccakF1600:
+.cfi_startproc
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+
+ lea 100(%rdi),%rdi # size optimization
+ sub \$200,%rsp
+.cfi_adjust_cfa_offset 200
+
+ notq $A[0][1](%rdi)
+ notq $A[0][2](%rdi)
+ notq $A[1][3](%rdi)
+ notq $A[2][2](%rdi)
+ notq $A[3][2](%rdi)
+ notq $A[4][0](%rdi)
+
+ lea iotas(%rip),$iotas
+ lea 100(%rsp),%rsi # size optimization
+
+ call __KeccakF1600
+
+ notq $A[0][1](%rdi)
+ notq $A[0][2](%rdi)
+ notq $A[1][3](%rdi)
+ notq $A[2][2](%rdi)
+ notq $A[3][2](%rdi)
+ notq $A[4][0](%rdi)
+ lea -100(%rdi),%rdi # preserve A[][]
+
+ add \$200,%rsp
+.cfi_adjust_cfa_offset -200
+
+ pop %r15
+.cfi_pop %r15
+ pop %r14
+.cfi_pop %r14
+ pop %r13
+.cfi_pop %r13
+ pop %r12
+.cfi_pop %r12
+ pop %rbp
+.cfi_pop %rbp
+ pop %rbx
+.cfi_pop %rbx
+ ret
+.cfi_endproc
+.size KeccakF1600,.-KeccakF1600
+___
+
+{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
+ ($A_flat,$inp) = ("%r8","%r9");
+$code.=<<___;
+.globl SHA3_absorb
+.type SHA3_absorb,\@function,4
+.align 32
+SHA3_absorb:
+.cfi_startproc
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+
+ lea 100(%rdi),%rdi # size optimization
+ sub \$232,%rsp
+.cfi_adjust_cfa_offset 232
+
+ mov %rsi,$inp
+ lea 100(%rsp),%rsi # size optimization
+
+ notq $A[0][1](%rdi)
+ notq $A[0][2](%rdi)
+ notq $A[1][3](%rdi)
+ notq $A[2][2](%rdi)
+ notq $A[3][2](%rdi)
+ notq $A[4][0](%rdi)
+ lea iotas(%rip),$iotas
+
+ mov $bsz,216-100(%rsi) # save bsz
+
+.Loop_absorb:
+ cmp $bsz,$len
+ jc .Ldone_absorb
+
+ shr \$3,$bsz
+ lea -100(%rdi),$A_flat
+
+.Lblock_absorb:
+ mov ($inp),%rax
+ lea 8($inp),$inp
+ xor ($A_flat),%rax
+ lea 8($A_flat),$A_flat
+ sub \$8,$len
+ mov %rax,-8($A_flat)
+ sub \$1,$bsz
+ jnz .Lblock_absorb
+
+ mov $inp,200-100(%rsi) # save inp
+ mov $len,208-100(%rsi) # save len
+ call __KeccakF1600
+ mov 200-100(%rsi),$inp # pull inp
+ mov 208-100(%rsi),$len # pull len
+ mov 216-100(%rsi),$bsz # pull bsz
+ jmp .Loop_absorb
+
+.align 32
+.Ldone_absorb:
+ mov $len,%rax # return value
+
+ notq $A[0][1](%rdi)
+ notq $A[0][2](%rdi)
+ notq $A[1][3](%rdi)
+ notq $A[2][2](%rdi)
+ notq $A[3][2](%rdi)
+ notq $A[4][0](%rdi)
+
+ add \$232,%rsp
+.cfi_adjust_cfa_offset -232
+
+ pop %r15
+.cfi_pop %r15
+ pop %r14
+.cfi_pop %r14
+ pop %r13
+.cfi_pop %r13
+ pop %r12
+.cfi_pop %r12
+ pop %rbp
+.cfi_pop %rbp
+ pop %rbx
+.cfi_pop %rbx
+ ret
+.cfi_endproc
+.size SHA3_absorb,.-SHA3_absorb
+___
+}
+{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
+ ($out,$len,$bsz) = ("%r12","%r13","%r14");
+
+$code.=<<___;
+.globl SHA3_squeeze
+.type SHA3_squeeze,\@function,4
+.align 32
+SHA3_squeeze:
+.cfi_startproc
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+
+ shr \$3,%rcx
+ mov $A_flat,%r8
+ mov %rsi,$out
+ mov %rdx,$len
+ mov %rcx,$bsz
+ jmp .Loop_squeeze
+
+.align 32
+.Loop_squeeze:
+ cmp \$8,$len
+ jb .Ltail_squeeze
+
+ mov (%r8),%rax
+ lea 8(%r8),%r8
+ mov %rax,($out)
+ lea 8($out),$out
+ sub \$8,$len # len -= 8
+ jz .Ldone_squeeze
+
+ sub \$1,%rcx # bsz--
+ jnz .Loop_squeeze
+
+ call KeccakF1600
+ mov $A_flat,%r8
+ mov $bsz,%rcx
+ jmp .Loop_squeeze
+
+.Ltail_squeeze:
+ mov %r8, %rsi
+ mov $out,%rdi
+ mov $len,%rcx
+ .byte 0xf3,0xa4 # rep movsb
+
+.Ldone_squeeze:
+ pop %r14
+.cfi_pop %r14
+ pop %r13
+.cfi_pop %r13
+ pop %r12
+.cfi_pop	%r12
+ ret
+.cfi_endproc
+.size SHA3_squeeze,.-SHA3_squeeze
+___
+}
+$code.=<<___;
+.align 256
+ .quad 0,0,0,0,0,0,0,0
+.type iotas,\@object
+iotas:
+ .quad 0x0000000000000001
+ .quad 0x0000000000008082
+ .quad 0x800000000000808a
+ .quad 0x8000000080008000
+ .quad 0x000000000000808b
+ .quad 0x0000000080000001
+ .quad 0x8000000080008081
+ .quad 0x8000000000008009
+ .quad 0x000000000000008a
+ .quad 0x0000000000000088
+ .quad 0x0000000080008009
+ .quad 0x000000008000000a
+ .quad 0x000000008000808b
+ .quad 0x800000000000008b
+ .quad 0x8000000000008089
+ .quad 0x8000000000008003
+ .quad 0x8000000000008002
+ .quad 0x8000000000000080
+ .quad 0x000000000000800a
+ .quad 0x800000008000000a
+ .quad 0x8000000080008081
+ .quad 0x8000000000008080
+ .quad 0x0000000080000001
+ .quad 0x8000000080008008
+.size iotas,.-iotas
+.asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+foreach (split("\n",$code)) {
+	# The replacement below results in 11.2 on Sandy Bridge, 9.4 on
+	# Haswell, but it hurts other processors by up to 2-4x...
+	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
+	# The replacement below results in 9.3 on Haswell [as well as
+	# on Ryzen, i.e. it *hurts* Ryzen]...
+	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
+
+ print $_, "\n";
+}
+
+close STDOUT;
diff --git a/crypto/sha/asm/keccak1600p8-ppc.pl b/crypto/sha/asm/keccak1600p8-ppc.pl
new file mode 100755
index 000000000000..de2bcd660a09
--- /dev/null
+++ b/crypto/sha/asm/keccak1600p8-ppc.pl
@@ -0,0 +1,850 @@
+#!/usr/bin/env perl
+# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Keccak-1600 for PowerISA 2.07.
+#
+# June 2017.
+#
+# This is a straightforward KECCAK_1X_ALT SIMD implementation, but with
+# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
+# A POWER8 processor spends 9.8 cycles to process a byte out of a large
+# buffer for r=1088, which matches SHA3-256. This is 17% better than the
+# scalar PPC64 code. It probably should be noted that if POWER8's
+# successor can achieve a higher scalar instruction issue rate, then
+# this module will lose... And it does on POWER9 with 12.0 vs. 9.4.
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+ $SIZE_T =8;
+ $LRSAVE =2*$SIZE_T;
+ $UCMP ="cmpld";
+ $STU ="stdu";
+ $POP ="ld";
+ $PUSH ="std";
+} elsif ($flavour =~ /32/) {
+ $SIZE_T =4;
+ $LRSAVE =$SIZE_T;
+ $STU ="stwu";
+ $POP ="lwz";
+ $PUSH ="stw";
+ $UCMP ="cmplw";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
+
+my $sp ="r1";
+
+my $iotas = "r12";
+
+########################################################################
+# Register layout:
+#
+# v0 A[0][0] A[1][0]
+# v1 A[0][1] A[1][1]
+# v2 A[0][2] A[1][2]
+# v3 A[0][3] A[1][3]
+# v4 A[0][4] A[1][4]
+#
+# v5 A[2][0] A[3][0]
+# v6 A[2][1] A[3][1]
+# v7 A[2][2] A[3][2]
+# v8 A[2][3] A[3][3]
+# v9 A[2][4] A[3][4]
+#
+# v10 A[4][0] A[4][1]
+# v11 A[4][2] A[4][3]
+# v12 A[4][4] A[4][4]
+#
+# v13..25 rhotates[][]
+# v26..31 volatile
+#
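+# Each vector register thus carries two 64-bit lanes from adjacent rows, so a
+# single vector op advances rows 0..1 (or 2..3) in lockstep; A[4][4] is kept
+# duplicated in both halves of v12 so that row 4 needs no special casing.
+#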
+$code.=<<___;
+.machine "any"
+.text
+
+.type KeccakF1600_int,\@function
+.align 5
+KeccakF1600_int:
+ li r0,24
+ mtctr r0
+ li r0,0
+ b .Loop
+
+.align 4
+.Loop:
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
+ vxor v26,v0, v5 ; A[0..1][0]^A[2..3][0]
+ vxor v27,v1, v6 ; A[0..1][1]^A[2..3][1]
+ vxor v28,v2, v7 ; A[0..1][2]^A[2..3][2]
+ vxor v29,v3, v8 ; A[0..1][3]^A[2..3][3]
+ vxor v30,v4, v9 ; A[0..1][4]^A[2..3][4]
+ vpermdi v31,v26,v27,0b00 ; A[0][0..1]^A[2][0..1]
+ vpermdi v26,v26,v27,0b11 ; A[1][0..1]^A[3][0..1]
+ vpermdi v27,v28,v29,0b00 ; A[0][2..3]^A[2][2..3]
+ vpermdi v28,v28,v29,0b11 ; A[1][2..3]^A[3][2..3]
+ vpermdi v29,v30,v30,0b10 ; A[1..0][4]^A[3..2][4]
+ vxor v26,v26,v31 ; C[0..1]
+ vxor v27,v27,v28 ; C[2..3]
+ vxor v28,v29,v30 ; C[4..4]
+ vspltisb v31,1
+ vxor v26,v26,v10 ; C[0..1] ^= A[4][0..1]
+ vxor v27,v27,v11 ; C[2..3] ^= A[4][2..3]
+ vxor v28,v28,v12 ; C[4..4] ^= A[4][4..4], low!
+
+ vrld v29,v26,v31 ; ROL64(C[0..1],1)
+ vrld v30,v27,v31 ; ROL64(C[2..3],1)
+ vrld v31,v28,v31 ; ROL64(C[4..4],1)
+ vpermdi v31,v31,v29,0b10
+ vxor v26,v26,v30 ; C[0..1] ^= ROL64(C[2..3],1)
+ vxor v27,v27,v31 ; C[2..3] ^= ROL64(C[4..0],1)
+ vxor v28,v28,v29 ; C[4..4] ^= ROL64(C[0..1],1), low!
+
+ vpermdi v29,v26,v26,0b00 ; C[0..0]
+ vpermdi v30,v28,v26,0b10 ; C[4..0]
+ vpermdi v31,v28,v28,0b11 ; C[4..4]
+ vxor v1, v1, v29 ; A[0..1][1] ^= C[0..0]
+ vxor v6, v6, v29 ; A[2..3][1] ^= C[0..0]
+ vxor v10,v10,v30 ; A[4][0..1] ^= C[4..0]
+ vxor v0, v0, v31 ; A[0..1][0] ^= C[4..4]
+ vxor v5, v5, v31 ; A[2..3][0] ^= C[4..4]
+
+ vpermdi v29,v27,v27,0b00 ; C[2..2]
+ vpermdi v30,v26,v26,0b11 ; C[1..1]
+ vpermdi v31,v26,v27,0b10 ; C[1..2]
+ vxor v3, v3, v29 ; A[0..1][3] ^= C[2..2]
+ vxor v8, v8, v29 ; A[2..3][3] ^= C[2..2]
+ vxor v2, v2, v30 ; A[0..1][2] ^= C[1..1]
+ vxor v7, v7, v30 ; A[2..3][2] ^= C[1..1]
+ vxor v11,v11,v31 ; A[4][2..3] ^= C[1..2]
+
+ vpermdi v29,v27,v27,0b11 ; C[3..3]
+ vxor v4, v4, v29 ; A[0..1][4] ^= C[3..3]
+ vxor v9, v9, v29 ; A[2..3][4] ^= C[3..3]
+ vxor v12,v12,v29 ; A[4..4][4] ^= C[3..3]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
+ vrld v26,v0, v13 ; v0
+ vrld v1, v1, v14
+ vrld v27,v2, v15 ; v2
+ vrld v28,v3, v16 ; v3
+ vrld v4, v4, v17
+ vrld v5, v5, v18
+ vrld v6, v6, v19
+ vrld v29,v7, v20 ; v7
+ vrld v8, v8, v21
+ vrld v9, v9, v22
+ vrld v10,v10,v23
+ vrld v30,v11,v24 ; v11
+ vrld v12,v12,v25
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
+ vpermdi v0, v26,v28,0b00 ; [0][0] [1][0] < [0][0] [0][3]
+ vpermdi v2, v29,v5, 0b00 ; [0][2] [1][2] < [2][2] [2][0]
+ vpermdi v11,v9, v5, 0b01 ; [4][2] [4][3] < [2][4] [3][0]
+ vpermdi v5, v1, v4, 0b00 ; [2][0] [3][0] < [0][1] [0][4]
+ vpermdi v1, v1, v4, 0b11 ; [0][1] [1][1] < [1][1] [1][4]
+ vpermdi v3, v8, v6, 0b11 ; [0][3] [1][3] < [3][3] [3][1]
+ vpermdi v4, v12,v30,0b10 ; [0][4] [1][4] < [4][4] [4][2]
+ vpermdi v7, v8, v6, 0b00 ; [2][2] [3][2] < [2][3] [2][1]
+ vpermdi v6, v27,v26,0b11 ; [2][1] [3][1] < [1][2] [1][0]
+ vpermdi v8, v9, v29,0b11 ; [2][3] [3][3] < [3][4] [3][2]
+ vpermdi v12,v10,v10,0b11 ; [4][4] [4][4] < [4][1] [4][1]
+ vpermdi v9, v10,v30,0b01 ; [2][4] [3][4] < [4][0] [4][3]
+ vpermdi v10,v27,v28,0b01 ; [4][0] [4][1] < [0][2] [1][3]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
+ lvx_u v31,$iotas,r0 ; iotas[index]
+ addic r0,r0,16 ; index++
+
+ vandc v26,v2, v1 ; (~A[0..1][1] & A[0..1][2])
+ vandc v27,v3, v2 ; (~A[0..1][2] & A[0..1][3])
+ vandc v28,v4, v3 ; (~A[0..1][3] & A[0..1][4])
+ vandc v29,v0, v4 ; (~A[0..1][4] & A[0..1][0])
+ vandc v30,v1, v0 ; (~A[0..1][0] & A[0..1][1])
+ vxor v0, v0, v26 ; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
+ vxor v1, v1, v27 ; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
+ vxor v2, v2, v28 ; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
+ vxor v3, v3, v29 ; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
+ vxor v4, v4, v30 ; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
+
+ vandc v26,v7, v6 ; (~A[2..3][1] & A[2..3][2])
+ vandc v27,v8, v7 ; (~A[2..3][2] & A[2..3][3])
+ vandc v28,v9, v8 ; (~A[2..3][3] & A[2..3][4])
+ vandc v29,v5, v9 ; (~A[2..3][4] & A[2..3][0])
+ vandc v30,v6, v5 ; (~A[2..3][0] & A[2..3][1])
+ vxor v5, v5, v26 ; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
+ vxor v6, v6, v27 ; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
+ vxor v7, v7, v28 ; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
+ vxor v8, v8, v29 ; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
+ vxor v9, v9, v30 ; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
+
+ vxor v0, v0, v31 ; A[0][0] ^= iotas[index++]
+
+ vpermdi v26,v10,v11,0b10 ; A[4][1..2]
+ vpermdi v27,v12,v10,0b00 ; A[4][4..0]
+ vpermdi v28,v11,v12,0b10 ; A[4][3..4]
+ vpermdi v29,v10,v10,0b10 ; A[4][1..0]
+ vandc v26,v11,v26 ; (~A[4][1..2] & A[4][2..3])
+ vandc v27,v27,v28 ; (~A[4][3..4] & A[4][4..0])
+ vandc v28,v10,v29 ; (~A[4][1..0] & A[4][0..1])
+ vxor v10,v10,v26 ; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
+ vxor v11,v11,v27 ; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
+ vxor v12,v12,v28 ; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])
+
+ bdnz .Loop
+
+ vpermdi v12,v12,v12,0b11 ; broadcast A[4][4]
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size KeccakF1600_int,.-KeccakF1600_int
+
+.type KeccakF1600,\@function
+.align 5
+KeccakF1600:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r8
+ mfspr r7, 256 ; save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r7,`$FRAME-4`($sp) ; save vrsave
+ li r0, -1
+ $PUSH r8,`$FRAME+$LRSAVE`($sp)
+ mtspr 256, r0 ; preserve all AltiVec registers
+
+ li r11,16
+ lvx_4w v0,0,r3 ; load A[5][5]
+ li r10,32
+ lvx_4w v1,r11,r3
+ addi r11,r11,32
+ lvx_4w v2,r10,r3
+ addi r10,r10,32
+ lvx_4w v3,r11,r3
+ addi r11,r11,32
+ lvx_4w v4,r10,r3
+ addi r10,r10,32
+ lvx_4w v5,r11,r3
+ addi r11,r11,32
+ lvx_4w v6,r10,r3
+ addi r10,r10,32
+ lvx_4w v7,r11,r3
+ addi r11,r11,32
+ lvx_4w v8,r10,r3
+ addi r10,r10,32
+ lvx_4w v9,r11,r3
+ addi r11,r11,32
+ lvx_4w v10,r10,r3
+ addi r10,r10,32
+ lvx_4w v11,r11,r3
+ lvx_splt v12,r10,r3
+
+ bl PICmeup
+
+ li r11,16
+ lvx_u v13,0,r12 ; load rhotates
+ li r10,32
+ lvx_u v14,r11,r12
+ addi r11,r11,32
+ lvx_u v15,r10,r12
+ addi r10,r10,32
+ lvx_u v16,r11,r12
+ addi r11,r11,32
+ lvx_u v17,r10,r12
+ addi r10,r10,32
+ lvx_u v18,r11,r12
+ addi r11,r11,32
+ lvx_u v19,r10,r12
+ addi r10,r10,32
+ lvx_u v20,r11,r12
+ addi r11,r11,32
+ lvx_u v21,r10,r12
+ addi r10,r10,32
+ lvx_u v22,r11,r12
+ addi r11,r11,32
+ lvx_u v23,r10,r12
+ addi r10,r10,32
+ lvx_u v24,r11,r12
+ lvx_u v25,r10,r12
+ addi r12,r12,`16*16` ; points at iotas
+
+ bl KeccakF1600_int
+
+ li r11,16
+ stvx_4w v0,0,r3 ; return A[5][5]
+ li r10,32
+ stvx_4w v1,r11,r3
+ addi r11,r11,32
+ stvx_4w v2,r10,r3
+ addi r10,r10,32
+ stvx_4w v3,r11,r3
+ addi r11,r11,32
+ stvx_4w v4,r10,r3
+ addi r10,r10,32
+ stvx_4w v5,r11,r3
+ addi r11,r11,32
+ stvx_4w v6,r10,r3
+ addi r10,r10,32
+ stvx_4w v7,r11,r3
+ addi r11,r11,32
+ stvx_4w v8,r10,r3
+ addi r10,r10,32
+ stvx_4w v9,r11,r3
+ addi r11,r11,32
+ stvx_4w v10,r10,r3
+ addi r10,r10,32
+ stvx_4w v11,r11,r3
+ stvdx_u v12,r10,r3
+
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtlr r8
+ mtspr 256, r7 ; restore vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,1,0
+ .long 0
+.size KeccakF1600,.-KeccakF1600
+___
+{
+my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));
+
+$code.=<<___;
+.globl SHA3_absorb
+.type SHA3_absorb,\@function
+.align 5
+SHA3_absorb:
+ $STU $sp,-$FRAME($sp)
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mflr r8
+ mfspr r7, 256 ; save vrsave
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r7,`$FRAME-4`($sp) ; save vrsave
+ li r0, -1
+ $PUSH r8,`$FRAME+$LRSAVE`($sp)
+ mtspr 256, r0 ; preserve all AltiVec registers
+
+ li r11,16
+ lvx_4w v0,0,$A_jagged ; load A[5][5]
+ li r10,32
+ lvx_4w v1,r11,$A_jagged
+ addi r11,r11,32
+ lvx_4w v2,r10,$A_jagged
+ addi r10,r10,32
+ lvx_4w v3,r11,$A_jagged
+ addi r11,r11,32
+ lvx_4w v4,r10,$A_jagged
+ addi r10,r10,32
+ lvx_4w v5,r11,$A_jagged
+ addi r11,r11,32
+ lvx_4w v6,r10,$A_jagged
+ addi r10,r10,32
+ lvx_4w v7,r11,$A_jagged
+ addi r11,r11,32
+ lvx_4w v8,r10,$A_jagged
+ addi r10,r10,32
+ lvx_4w v9,r11,$A_jagged
+ addi r11,r11,32
+ lvx_4w v10,r10,$A_jagged
+ addi r10,r10,32
+ lvx_4w v11,r11,$A_jagged
+ lvx_splt v12,r10,$A_jagged
+
+ bl PICmeup
+
+ li r11,16
+ lvx_u v13,0,r12 ; load rhotates
+ li r10,32
+ lvx_u v14,r11,r12
+ addi r11,r11,32
+ lvx_u v15,r10,r12
+ addi r10,r10,32
+ lvx_u v16,r11,r12
+ addi r11,r11,32
+ lvx_u v17,r10,r12
+ addi r10,r10,32
+ lvx_u v18,r11,r12
+ addi r11,r11,32
+ lvx_u v19,r10,r12
+ addi r10,r10,32
+ lvx_u v20,r11,r12
+ addi r11,r11,32
+ lvx_u v21,r10,r12
+ addi r10,r10,32
+ lvx_u v22,r11,r12
+ addi r11,r11,32
+ lvx_u v23,r10,r12
+ addi r10,r10,32
+ lvx_u v24,r11,r12
+ lvx_u v25,r10,r12
+ li r10,-32
+ li r11,-16
+ addi r12,r12,`16*16` ; points at iotas
+ b .Loop_absorb
+
+.align 4
+.Loop_absorb:
+ $UCMP $len,$bsz ; len < bsz?
+ blt .Labsorbed
+
+ sub $len,$len,$bsz ; len -= bsz
+ srwi r0,$bsz,3
+ mtctr r0
+
+ lvx_u v30,r10,r12 ; permutation masks
+ lvx_u v31,r11,r12
+ ?vspltisb v27,7 ; prepare masks for byte swap
+ ?vxor v30,v30,v27 ; on big-endian
+ ?vxor v31,v31,v27
+
+ vxor v27,v27,v27 ; zero
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v0, v0, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v1, v1, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v2, v2, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v3, v3, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v4, v4, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v0, v0, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v1, v1, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v2, v2, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v3, v3, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v4, v4, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v5, v5, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v6, v6, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v7, v7, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v8, v8, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v9, v9, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v5, v5, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v6, v6, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v7, v7, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v8, v8, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v9, v9, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v10, v10, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v10, v10, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v30
+ vxor v11, v11, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v11, v11, v26
+ bdz .Lprocess_block
+ lvdx_u v26,0,$inp
+ addi $inp,$inp,8
+ vperm v26,v26,v27,v31
+ vxor v12, v12, v26
+
+.Lprocess_block:
+ bl KeccakF1600_int
+
+ b .Loop_absorb
+
+.align 4
+.Labsorbed:
+ li r11,16
+ stvx_4w v0,0,$A_jagged ; return A[5][5]
+ li r10,32
+ stvx_4w v1,r11,$A_jagged
+ addi r11,r11,32
+ stvx_4w v2,r10,$A_jagged
+ addi r10,r10,32
+ stvx_4w v3,r11,$A_jagged
+ addi r11,r11,32
+ stvx_4w v4,r10,$A_jagged
+ addi r10,r10,32
+ stvx_4w v5,r11,$A_jagged
+ addi r11,r11,32
+ stvx_4w v6,r10,$A_jagged
+ addi r10,r10,32
+ stvx_4w v7,r11,$A_jagged
+ addi r11,r11,32
+ stvx_4w v8,r10,$A_jagged
+ addi r10,r10,32
+ stvx_4w v9,r11,$A_jagged
+ addi r11,r11,32
+ stvx_4w v10,r10,$A_jagged
+ addi r10,r10,32
+ stvx_4w v11,r11,$A_jagged
+ stvdx_u v12,r10,$A_jagged
+
+ mr r3,$len ; return value
+ li r10,`15+6*$SIZE_T`
+ li r11,`31+6*$SIZE_T`
+ mtlr r8
+ mtspr 256, r7 ; restore vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ addi $sp,$sp,$FRAME
+ blr
+ .long 0
+ .byte 0,12,0x04,1,0x80,0,4,0
+ .long 0
+.size SHA3_absorb,.-SHA3_absorb
+___
+}
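Functionally, the vectorized SHA3_absorb above is the generic sponge absorb: XOR rate-sized chunks of input into the low lanes of the 5x5 state (with a byte swap on big-endian), permute after each full block, and return however many tail bytes were left over. A minimal Perl sketch of that contract, not the code emitted above; keccak_f1600 is a hypothetical scalar permutation helper and a 64-bit Perl build is assumed:

# Reference-model absorb: $A is a ref to 25 lane values, $bsz is the rate in bytes.
sub sha3_absorb_ref {
    my ($A, $inp, $len, $bsz) = @_;
    while ($len >= $bsz) {
        for (my $i = 0; $i < $bsz; $i++) {
            $A->[$i >> 3] ^= ord(substr($inp, $i, 1)) << (8 * ($i & 7));
        }
        keccak_f1600($A);            # hypothetical helper, stands in for KeccakF1600_int
        $inp  = substr($inp, $bsz);
        $len -= $bsz;
    }
    return $len;                     # leftover bytes, same as the assembly's return value
}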
+{
+my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));
+
+$code.=<<___;
+.globl SHA3_squeeze
+.type SHA3_squeeze,\@function
+.align 5
+SHA3_squeeze:
+ mflr r9 ; r9 is not touched by KeccakF1600
+ subi $out,$out,1 ; prepare for stbu
+ addi r8,$A_jagged,4 ; prepare volatiles
+ mr r10,$bsz
+ li r11,0
+ b .Loop_squeeze
+.align 4
+.Loop_squeeze:
+ lwzx r7,r11,r8 ; lo
+ lwzx r0,r11,$A_jagged ; hi
+ ${UCMP}i $len,8
+ blt .Lsqueeze_tail
+
+ stbu r7,1($out) ; write lo
+ srwi r7,r7,8
+ stbu r7,1($out)
+ srwi r7,r7,8
+ stbu r7,1($out)
+ srwi r7,r7,8
+ stbu r7,1($out)
+ stbu r0,1($out) ; write hi
+ srwi r0,r0,8
+ stbu r0,1($out)
+ srwi r0,r0,8
+ stbu r0,1($out)
+ srwi r0,r0,8
+ stbu r0,1($out)
+
+ subic. $len,$len,8
+ beqlr ; return if done
+
+ subic. r10,r10,8
+ ble .Loutput_expand
+
+ addi r11,r11,16 ; calculate jagged index
+ cmplwi r11,`16*5`
+ blt .Loop_squeeze
+ subi r11,r11,72
+ beq .Loop_squeeze
+ addi r11,r11,72
+ cmplwi r11,`16*5+8`
+ subi r11,r11,8
+ beq .Loop_squeeze
+ addi r11,r11,8
+ cmplwi r11,`16*10`
+ subi r11,r11,72
+ beq .Loop_squeeze
+ addi r11,r11,72
+ blt .Loop_squeeze
+ subi r11,r11,8
+ b .Loop_squeeze
+
+.align 4
+.Loutput_expand:
+ bl KeccakF1600
+ mtlr r9
+
+ addi r8,$A_jagged,4 ; restore volatiles
+ mr r10,$bsz
+ li r11,0
+ b .Loop_squeeze
+
+.align 4
+.Lsqueeze_tail:
+ mtctr $len
+ subic. $len,$len,4
+ ble .Loop_tail_lo
+ li r8,4
+ mtctr r8
+.Loop_tail_lo:
+ stbu r7,1($out)
+ srdi r7,r7,8
+ bdnz .Loop_tail_lo
+ ble .Lsqueeze_done
+ mtctr $len
+.Loop_tail_hi:
+ stbu r0,1($out)
+ srdi r0,r0,8
+ bdnz .Loop_tail_hi
+
+.Lsqueeze_done:
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+ .long 0
+.size SHA3_squeeze,.-SHA3_squeeze
+___
+}
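SHA3_squeeze mirrors that on the output side: each 64-bit lane is fetched as the lo/hi 32-bit halves (the lwzx pair) from the jagged A[5][5] layout, written out a byte at a time with stbu, and when a full rate's worth has been emitted but more output is still wanted, .Loutput_expand re-runs KeccakF1600; .Lsqueeze_tail handles a final partial 8 bytes. A hedged byte-oriented Perl sketch of the same contract, reusing the hypothetical keccak_f1600 helper and flat 25-lane state from the previous sketch:

sub sha3_squeeze_ref {
    my ($A, $len, $bsz) = @_;        # $A: ref to 25 lanes; returns $len output bytes
    my $out = "";
    while ($len > 0) {
        my $n = $len < $bsz ? $len : $bsz;
        for (my $i = 0; $i < $n; $i++) {
            $out .= chr(($A->[$i >> 3] >> (8 * ($i & 7))) & 0xff);
        }
        $len -= $n;
        keccak_f1600($A) if ($len);  # expand only if more output is still required
    }
    return $out;
}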
+$code.=<<___;
+.align 6
+PICmeup:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr r12 ; vvvvvv "distance" between . and 1st data entry
+ addi r12,r12,`64-8`
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
+.type rhotates,\@object
+.align 6
+rhotates:
+ .quad 0, 36
+ .quad 1, 44
+ .quad 62, 6
+ .quad 28, 55
+ .quad 27, 20
+ .quad 3, 41
+ .quad 10, 45
+ .quad 43, 15
+ .quad 25, 21
+ .quad 39, 8
+ .quad 18, 2
+ .quad 61, 56
+ .quad 14, 14
+.size rhotates,.-rhotates
+ .quad 0,0
+ .quad 0x0001020304050607,0x1011121314151617
+ .quad 0x1011121314151617,0x0001020304050607
+.type iotas,\@object
+iotas:
+ .quad 0x0000000000000001,0
+ .quad 0x0000000000008082,0
+ .quad 0x800000000000808a,0
+ .quad 0x8000000080008000,0
+ .quad 0x000000000000808b,0
+ .quad 0x0000000080000001,0
+ .quad 0x8000000080008081,0
+ .quad 0x8000000000008009,0
+ .quad 0x000000000000008a,0
+ .quad 0x0000000000000088,0
+ .quad 0x0000000080008009,0
+ .quad 0x000000008000000a,0
+ .quad 0x000000008000808b,0
+ .quad 0x800000000000008b,0
+ .quad 0x8000000000008089,0
+ .quad 0x8000000000008003,0
+ .quad 0x8000000000008002,0
+ .quad 0x8000000000000080,0
+ .quad 0x000000000000800a,0
+ .quad 0x800000008000000a,0
+ .quad 0x8000000080008081,0
+ .quad 0x8000000000008080,0
+ .quad 0x0000000080000001,0
+ .quad 0x8000000080008008,0
+.size iotas,.-iotas
+.asciz "Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+___
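The iotas table above holds the 24 Keccak round constants in the low quadword of each 16-byte entry. They are not arbitrary values: each one can be regenerated from the degree-8 LFSR defined in FIPS 202. A small Perl cross-check of the table, assuming a 64-bit Perl build (this is illustration only, not code the module runs):

sub rc_bit {                          # rc(t) from FIPS 202, section 3.2.5
    my $R = 1;
    for (1 .. shift() % 255) {
        $R <<= 1;
        $R ^= 0x171 if ($R & 0x100);  # x^8 = x^6 + x^5 + x^4 + 1
    }
    return $R & 1;
}
for my $ir (0 .. 23) {                # reproduces the .quad values above
    my $rc = 0;
    for my $j (0 .. 6) {
        $rc |= rc_bit($j + 7 * $ir) << ((1 << $j) - 1);
    }
    printf ".quad\t0x%016x,0\n", $rc;
}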
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ if ($flavour =~ /le$/) { # little-endian
+ s/\?([a-z]+)/;$1/;
+ } else { # big-endian
+ s/\?([a-z]+)/$1/;
+ }
+
+ print $_,"\n";
+}
+
+close STDOUT;
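The final foreach pass deserves a note: backticked expressions are evaluated with eval, and instructions written with a leading "?" survive only on big-endian flavours; on little-endian builds the "?" becomes ";", the same character the listing already uses as its comment delimiter, so the byte-swap setup simply drops out of the LE output. A trimmed-down illustration of the idiom, with made-up input lines rather than ones taken from the module:

my $flavour = "linux64le";                        # e.g. taken from the command line
my @lines   = ("?vspltisb v27,7", "vxor v0,v0,v26");
foreach my $line (@lines) {
    if ($flavour =~ /le$/) { $line =~ s/\?([a-z]+)/;$1/; }   # comment it out on LE
    else                   { $line =~ s/\?([a-z]+)/$1/;  }   # just drop the marker on BE
    print $line, "\n";
}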
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl
index e0b5d83b6201..9d4ff7f39a52 100644
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 1998-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -28,10 +35,9 @@
# P4 +85%(!) +45%
#
# As you can see Pentium came out as looser:-( Yet I reckoned that
-# improvement on P4 outweights the loss and incorporate this
+# improvement on P4 outweighs the loss and incorporate this
# re-tuned code to 0.9.7 and later.
# ----------------------------------------------------------------
-# <appro@fy.chalmers.se>
# August 2009.
#
@@ -97,10 +103,12 @@
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73%
# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
# Haswell 6.5 4.3/+51% 4.1(**)/+58%
+# Skylake 6.4 4.1/+55% 4.1(**)/+55%
# Bulldozer 11.6 6.0/+92%
# VIA Nano 10.6 7.5/+41%
# Atom 12.5 9.3(*)/+35%
# Silvermont 14.5 9.9(*)/+46%
+# Goldmont 8.8 6.7/+30% 1.7(***)/+415%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
# The discrepancy is because of front-end limitations, so
@@ -108,12 +116,17 @@
# limited parallelism.
#
# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
+#
+# (***) SHAEXT result
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
@@ -123,7 +136,7 @@ $ymm=1 if ($xmm &&
=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
$1>=2.19); # first version supporting AVX
-$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
$1>=2.03); # first version supporting AVX
@@ -131,7 +144,7 @@ $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
`ml 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10); # first version supporting AVX
-$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ &&
+$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ &&
$2>=3.0); # first version supporting AVX
$shaext=$xmm; ### set to zero if compiling for 1.0.1
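The block above is the usual perlasm pattern for deciding whether AVX code may be emitted: probe whichever assembler will consume the output (GNU as, nasm for win32n, ml for win32, or clang acting as the assembler) and compare its reported version against the first release that understood AVX mnemonics; the widened regex also matches the "LLVM version"/"based on LLVM" banners that Apple clang prints. A standalone sketch of the same probe, using the thresholds quoted above and whatever compiler $ENV{CC} points at:

my $avx_ok = 0;
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx_ok = ($1 >= 2.19);          # first binutils release with AVX support
} elsif (`$ENV{CC} -v 2>&1`
        =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
    $avx_ok = ($2 >= 3.0);           # first clang release with AVX support
}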
@@ -536,7 +549,7 @@ for($i=0;$i<20-4;$i+=2) {
# being implemented in SSSE3). Once 8 quadruples or 32 elements are
# collected, it switches to routine proposed by Max Locktyukhin.
#
-# Calculations inevitably require temporary reqisters, and there are
+# Calculations inevitably require temporary registers, and there are
# no %xmm registers left to spare. For this reason part of the ring
# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring
# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
@@ -647,7 +660,7 @@ my $_ror=sub { &ror(@_) };
&jmp (&label("loop"));
######################################################################
-# SSE instruction sequence is first broken to groups of indepentent
+# SSE instruction sequence is first broken to groups of independent
# instructions, independent in respect to their inputs and shifter
# (not all architectures have more than one). Then IALU instructions
# are "knitted in" between the SSE groups. Distance is maintained for
@@ -656,14 +669,14 @@ my $_ror=sub { &ror(@_) };
#
# Temporary registers usage. X[2] is volatile at the entry and at the
# end is restored from backtrace ring buffer. X[3] is expected to
-# contain current K_XX_XX constant and is used to caclulate X[-1]+K
+# contain current K_XX_XX constant and is used to calculate X[-1]+K
# from previous round, it becomes volatile the moment the value is
# saved to stack for transfer to IALU. X[4] becomes volatile whenever
# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
# end it is loaded with next K_XX_XX [which becomes X[3] in next
# round]...
#
-sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
+sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -1186,7 +1199,7 @@ my $_ror=sub { &shrd(@_[0],@_) };
&and (@T[0],@T[1]);
&jmp (&label("loop"));
-sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
+sub Xupdate_avx_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -1474,3 +1487,5 @@ sub Xtail_avx()
&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index b2c30322c351..7ff5bfbba6cb 100755
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -60,14 +67,28 @@
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
-# byte, which is also >80% faster than integer-only code.
+# byte, which is also >80% faster than integer-only code. Cortex-A15
+# is even faster spending 5.6 cycles per byte outperforming integer-
+# only code by factor of 2.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0";
$inp="r1";
@@ -167,7 +188,12 @@ $code=<<___;
#include "arm_arch.h"
.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
.global sha1_block_data_order
.type sha1_block_data_order,%function
@@ -175,9 +201,13 @@ $code=<<___;
.align 5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
- sub r3,pc,#8 @ sha1_block_data_order
+.Lsha1_block:
+ adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
@@ -199,7 +229,12 @@ for($i=0;$i<5;$i++) {
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp
+#endif
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
@@ -218,7 +253,12 @@ for($i=0;$i<5;$i++) {
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp @ preserve carry
+#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
@@ -230,7 +270,12 @@ for($i=0;$i<5;$i++) {
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp
+#endif
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
@@ -266,7 +311,7 @@ $code.=<<___;
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha1_block_data_order
+.word OPENSSL_armcap_P-.Lsha1_block
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
@@ -441,6 +486,7 @@ sub Xuplast_80 ()
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
+ &it ("eq");
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
@@ -491,12 +537,12 @@ sha1_block_data_order_neon:
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
- sub sp,sp,#64 @ alloca
+ sub $Xfer,sp,#64
adr $K_XX_XX,.LK_00_19
- bic sp,sp,#15 @ align for 128-bit stores
+ bic $Xfer,$Xfer,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
- mov $Xfer,sp
+ mov sp,$Xfer @ alloca
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
@@ -543,10 +589,13 @@ $code.=<<___;
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
+ it eq
moveq sp,$saved_sp
add $e,$e,$Ki
+ it ne
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
+ itt ne
addne $Xfer,sp,#3*16
bne .Loop_neon
@@ -567,6 +616,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# endif
+
.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
@@ -660,7 +716,10 @@ ___
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+
+ # this fix-up provides Thumb encoding in conjunction with INST
+ $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
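The closing changes to this file are about hand-encoding the ARMv8 SHA-1 instructions that older assemblers reject: the 32-bit instruction word is emitted as raw bytes, and the INST macro together with the new bit-28 fix-up rearranges and adjusts those bytes so the one source assembles correctly in both ARM and Thumb-2 mode. A reduced illustration of that emission step; the $word value below is a placeholder, not a real encoding:

my $word = 0xf2000c40;                            # placeholder instruction word
$word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);  # Thumb-mode fix-up
printf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\n",
       $word & 0xff, ($word >> 8) & 0xff, ($word >> 16) & 0xff, ($word >> 24) & 0xff;
# __thumb2__ builds expand INST as ".byte c,d|0xf,a,b" (halfwords swapped),
# plain ARM builds as ".byte a,b,c,d|0x10", per the macro added above.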
diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
index c04432a54394..3ba871fedee6 100755
--- a/crypto/sha/asm/sha1-armv8.pl
+++ b/crypto/sha/asm/sha1-armv8.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -18,13 +25,23 @@
# Cortex-A57 2.35 7.88 (+74%)
# Denver 2.13 3.97 (+0%)(**)
# X-Gene 8.80 (+200%)
+# Mongoose 2.05 6.50 (+160%)
+# Kryo 1.88 8.00 (+90%)
#
# (*) Software results are presented mostly for reference purposes.
# (**) Keep in mind that Denver relies on binary translation, which
# optimizes compiler output at run-time.
$flavour = shift;
-open STDOUT,">".shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
($ctx,$inp,$num)=("x0","x1","x2");
@Xw=map("w$_",(3..17,19));
@@ -158,11 +175,16 @@ $code.=<<___;
.text
+.extern OPENSSL_armcap_P
.globl sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
sha1_block_data_order:
+#ifdef __ILP32__
+ ldrsw x16,.LOPENSSL_armcap_P
+#else
ldr x16,.LOPENSSL_armcap_P
+#endif
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
@@ -300,7 +322,11 @@ $code.=<<___;
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+#else
.quad OPENSSL_armcap_P-.
+#endif
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64xplus.pl
new file mode 100755
index 000000000000..4db2bcb06b31
--- /dev/null
+++ b/crypto/sha/asm/sha1-c64xplus.pl
@@ -0,0 +1,337 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x+.
+#
+# November 2011
+#
+# If compared to compiler-generated code with similar characteristics,
+# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
+# this implementation is 25% smaller and >2x faster. In absolute terms
+# performance is (quite impressive) ~6.5 cycles per processed byte.
+# Fully unrolled assembler would be ~5x larger and is likely to be
+# ~15% faster. It would be free from references to intermediate ring
+# buffer, but put more pressure on L1P [both because the code would be
+# larger and won't be using SPLOOP buffer]. There are no plans to
+# realize fully unrolled variant though...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
+($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
+($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
+($XPA,$XPB) = ("A5","B5"); # X circular buffer
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
+
+$code=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .asg sha1_block_data_order,_sha1_block_data_order
+ .endif
+
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+
+ .if .BIG_ENDIAN
+ .asg MV,SWAP2
+ .asg MV,SWAP4
+ .endif
+
+ .global _sha1_block_data_order
+_sha1_block_data_order:
+ .asmfunc stack_usage(64)
+ MV $NUM,A0 ; reassign $NUM
+|| MVK -64,B0
+ [!A0] BNOP RA ; if ($NUM==0) return;
+|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
+|| [A0] MV SP,FP
+ [A0] LDW *${CTX}[0],$A ; load A-E...
+|| [A0] AND B0,SP,SP ; align stack at 64 bytes
+ [A0] LDW *${CTX}[1],$B
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ [A0] LDW *${CTX}[2],$C
+|| [A0] MVK 0x00404,B0
+ [A0] LDW *${CTX}[3],$D
+|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB]
+ [A0] LDW *${CTX}[4],$E
+|| [A0] MVC B0,AMR ; setup circular addressing
+ LDNW *${INP}++,$TX1 ; pre-fetch input
+ NOP 1
+
+loop?:
+ MVK 0x00007999,$K
+|| ADDAW SP,2,$XPA
+|| SUB A0,1,A0
+|| MVK 13,B0
+ MVKH 0x5a820000,$K ; K_00_19
+|| ADDAW SP,2,$XPB
+|| MV $A,$Actx
+|| MV $B,$Bctx
+;;==================================================
+ SPLOOPD 5 ; BODY_00_13
+|| MV $C,$Cctx
+|| MV $D,$Dctx
+|| MV $E,$Ectx
+|| MVC B0,ILC
+
+ ROTL $A,5,$Arot
+|| AND $C,$B,$F
+|| ANDN $D,$B,$F0
+|| ADD $K,$E,$T ; T=E+K
+
+ XOR $F0,$F,$F ; F_00_19(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+|| SWAP2 $TX1,$TX2
+|| LDNW *${INP}++,$TX1
+
+ ADD $F,$T,$T ; T+=F_00_19(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| SWAP4 $TX2,$TX3 ; byte swap
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+
+ ADD $TX3,$T,$A ; A=T+Xi
+|| STW $TX3,*${XPB}++
+ SPKERNEL
+;;==================================================
+ ROTL $A,5,$Arot ; BODY_14
+|| AND $C,$B,$F
+|| ANDN $D,$B,$F0
+|| ADD $K,$E,$T ; T=E+K
+
+ XOR $F0,$F,$F ; F_00_19(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+|| SWAP2 $TX1,$TX2
+|| LDNW *${INP}++,$TX1
+
+ ADD $F,$T,$T ; T+=F_00_19(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| SWAP4 $TX2,$TX2 ; byte swap
+|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are
+|| LDW *${XPB}[4],$X2 ; 2 iterations ahead
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++
+;;==================================================
+ ROTL $A,5,$Arot ; BODY_15
+|| AND $C,$B,$F
+|| ANDN $D,$B,$F0
+|| ADD $K,$E,$T ; T=E+K
+
+ XOR $F0,$F,$F ; F_00_19(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+|| SWAP2 $TX1,$TX2
+
+ ADD $F,$T,$T ; T+=F_00_19(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| SWAP4 $TX2,$TX2 ; byte swap
+|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead
+|| LDW *${XPA}++,$X0
+|| LDW *${XPB}[4],$X2
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| XOR $X8,$X13,$TX1
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++
+|| XOR $TX0,$TX1,$TX1
+|| MVK 3,B0
+;;==================================================
+ SPLOOPD 5 ; BODY_16_19
+|| MVC B0,ILC
+
+ ROTL $A,5,$Arot
+|| AND $C,$B,$F
+|| ANDN $D,$B,$F0
+|| ADD $K,$E,$T ; T=E+K
+|| ROTL $TX1,1,$TX2 ; Xupdate output
+
+ XOR $F0,$F,$F ; F_00_19(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+
+ ADD $F,$T,$T ; T+=F_00_19(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| XOR $X0,$X2,$TX0
+|| LDW *${XPA}++,$X0
+|| LDW *${XPB}[4],$X2
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| XOR $X8,$X13,$TX1
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++
+|| XOR $TX0,$TX1,$TX1
+ SPKERNEL
+
+ MVK 0xffffeba1,$K
+|| MVK 19,B0
+ MVKH 0x6ed90000,$K ; K_20_39
+___
+sub BODY_20_39 {
+$code.=<<___;
+;;==================================================
+ SPLOOPD 5 ; BODY_20_39
+|| MVC B0,ILC
+
+ ROTL $A,5,$Arot
+|| XOR $B,$C,$F
+|| ADD $K,$E,$T ; T=E+K
+|| ROTL $TX1,1,$TX2 ; Xupdate output
+
+ XOR $D,$F,$F ; F_20_39(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+
+ ADD $F,$T,$T ; T+=F_20_39(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| XOR $X0,$X2,$TX0
+|| LDW *${XPA}++,$X0
+|| LDW *${XPB}[4],$X2
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| XOR $X8,$X13,$TX1
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++ ; last one is redundant
+|| XOR $TX0,$TX1,$TX1
+ SPKERNEL
+___
+$code.=<<___ if (!shift);
+ MVK 0xffffbcdc,$K
+ MVKH 0x8f1b0000,$K ; K_40_59
+___
+} &BODY_20_39();
+$code.=<<___;
+;;==================================================
+ SPLOOPD 5 ; BODY_40_59
+|| MVC B0,ILC
+|| AND $B,$C,$F
+|| AND $B,$D,$F0
+
+ ROTL $A,5,$Arot
+|| XOR $F0,$F,$F
+|| AND $C,$D,$F0
+|| ADD $K,$E,$T ; T=E+K
+|| ROTL $TX1,1,$TX2 ; Xupdate output
+
+ XOR $F0,$F,$F ; F_40_59(B,C,D)
+|| MV $D,$E ; E=D
+|| MV $C,$D ; D=C
+
+ ADD $F,$T,$T ; T+=F_40_59(B,C,D)
+|| ROTL $B,30,$C ; C=ROL(B,30)
+|| XOR $X0,$X2,$TX0
+|| LDW *${XPA}++,$X0
+|| LDW *${XPB}[4],$X2
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| MV $A,$B ; B=A
+|| XOR $X8,$X13,$TX1
+|| LDW *${XPA}[7],$X8
+|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
+|| MV $TX2,$TX3
+
+ ADD $TX2,$T,$A ; A=T+Xi
+|| STW $TX2,*${XPB}++
+|| XOR $TX0,$TX1,$TX1
+|| AND $B,$C,$F
+|| AND $B,$D,$F0
+ SPKERNEL
+
+ MVK 0xffffc1d6,$K
+|| MVK 18,B0
+ MVKH 0xca620000,$K ; K_60_79
+___
+ &BODY_20_39(-1); # BODY_60_78
+$code.=<<___;
+;;==================================================
+ [A0] B loop?
+|| ROTL $A,5,$Arot ; BODY_79
+|| XOR $B,$C,$F
+|| ROTL $TX1,1,$TX2 ; Xupdate output
+
+ [A0] LDNW *${INP}++,$TX1 ; pre-fetch input
+|| ADD $K,$E,$T ; T=E+K
+|| XOR $D,$F,$F ; F_20_39(B,C,D)
+
+ ADD $F,$T,$T ; T+=F_20_39(B,C,D)
+|| ADD $Ectx,$D,$E ; E=D,E+=Ectx
+|| ADD $Dctx,$C,$D ; D=C,D+=Dctx
+|| ROTL $B,30,$C ; C=ROL(B,30)
+
+ ADD $Arot,$T,$T ; T+=ROL(A,5)
+|| ADD $Bctx,$A,$B ; B=A,B+=Bctx
+
+ ADD $TX2,$T,$A ; A=T+Xi
+
+ ADD $Actx,$A,$A ; A+=Actx
+|| ADD $Cctx,$C,$C ; C+=Cctx
+;; end of loop?
+
+ BNOP RA ; return
+|| MV FP,SP ; restore stack pointer
+|| LDW *FP[0],FP ; restore frame pointer
+ STW $A,*${CTX}[0] ; emit A-E...
+|| MVK 0,B0
+ STW $B,*${CTX}[1]
+|| MVC B0,AMR ; clear AMR
+ STW $C,*${CTX}[2]
+ STW $D,*${CTX}[3]
+ STW $E,*${CTX}[4]
+ .endasmfunc
+
+ .sect .const
+ .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
+close STDOUT;
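All of the SPLOOP bodies in this new C64x+ module share one skeleton; what changes between BODY_00_19, BODY_20_39 and BODY_40_59 is only the boolean function F and the round constant K, built up with the MVK/MVKH pairs and the AND/ANDN/XOR combinations called out in the comments. For orientation, the reference SHA-1 round step being scheduled there, written as plain Perl with the 32-bit masking made explicit (sha1_round_ref is an illustrative name, not part of the module):

sub sha1_round_ref {
    my ($a, $b, $c, $d, $e, $x, $i) = @_;       # $x = expanded message word X[i]
    my ($f, $k);
    if    ($i < 20) { $f = ($b & $c) | (~$b & $d);            $k = 0x5a827999; }
    elsif ($i < 40) { $f = $b ^ $c ^ $d;                      $k = 0x6ed9eba1; }
    elsif ($i < 60) { $f = ($b & $c) | ($b & $d) | ($c & $d); $k = 0x8f1bbcdc; }
    else            { $f = $b ^ $c ^ $d;                      $k = 0xca62c1d6; }
    my $rol5 = (($a << 5) | ($a >> 27)) & 0xffffffff;
    my $t    = ($rol5 + $f + $e + $k + $x) & 0xffffffff;
    return ($t, $a, (($b << 30) | ($b >> 2)) & 0xffffffff, $c, $d);  # new (a..e)
}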
diff --git a/crypto/sha/asm/sha1-ia64.pl b/crypto/sha/asm/sha1-ia64.pl
index 02d35d1614c1..bf1d2ebeb0ab 100644
--- a/crypto/sha/asm/sha1-ia64.pl
+++ b/crypto/sha/asm/sha1-ia64.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -14,6 +21,8 @@
# Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which
# is >50% better than HP C and >2x better than gcc.
+$output = pop;
+
$code=<<___;
.ident \"sha1-ia64.s, version 1.3\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
@@ -301,5 +310,5 @@ $code.=<<___;
stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
-$output=shift and open STDOUT,">$output";
+open STDOUT,">$output" if $output;
print $code;
diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl
index a8d8708d4b75..443b649830f4 100755
--- a/crypto/sha/asm/sha1-mb-x86_64.pl
+++ b/crypto/sha/asm/sha1-mb-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -19,6 +26,7 @@
# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
+# Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145%
# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64%
#
# (i) multi-block CBC encrypt with 128-bit key;
@@ -62,7 +70,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
$avx = ($2>=3.0) + ($2>3.0);
}
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
# void sha1_multi_block (
@@ -87,7 +95,7 @@ $K="%xmm15";
if (1) {
# Atom-specific optimization aiming to eliminate pshufb with high
- # registers [and thus get rid of 48 cycles accumulated penalty]
+ # registers [and thus get rid of 48 cycles accumulated penalty]
@Xi=map("%xmm$_",(0..4));
($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
@V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
@@ -118,7 +126,7 @@ my $k=$i+2;
# ...
# $i==13: 14,15,15,15,
# $i==14: 15
-#
+#
# Then at $i==15 Xupdate is applied one iteration in advance...
$code.=<<___ if ($i==0);
movd (@ptr[0]),@Xi[0]
@@ -355,6 +363,7 @@ $code.=<<___;
.type sha1_multi_block,\@function,3
.align 32
sha1_multi_block:
+.cfi_startproc
mov OPENSSL_ia32cap_P+4(%rip),%rcx
bt \$61,%rcx # check SHA bit
jc _shaext_shortcut
@@ -365,8 +374,11 @@ $code.=<<___ if ($avx);
___
$code.=<<___;
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push	%rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -385,6 +397,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`,%rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody:
lea K_XX_XX(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
@@ -431,7 +444,7 @@ for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
movdqa (%rbx),@Xi[0] # pull counters
mov \$1,%ecx
- cmp 4*0(%rbx),%ecx # examinte counters
+ cmp 4*0(%rbx),%ecx # examine counters
pxor $t2,$t2
cmovge $Tbl,@ptr[0] # cancel input
cmp 4*1(%rbx),%ecx
@@ -478,7 +491,8 @@ $code.=<<___;
jnz .Loop_grande
.Ldone:
- mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
+ mov `$REG_SZ*17`(%rsp),%rax # original %rsp
+.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps -0xb8(%rax),%xmm6
@@ -494,10 +508,14 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue:
ret
+.cfi_endproc
.size sha1_multi_block,.-sha1_multi_block
___
{{{
@@ -509,10 +527,14 @@ $code.=<<___;
.type sha1_multi_block_shaext,\@function,3
.align 32
sha1_multi_block_shaext:
+.cfi_startproc
_shaext_shortcut:
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -538,7 +560,7 @@ $code.=<<___;
movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap
.Loop_grande_shaext:
- mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num
+ mov $num,`$REG_SZ*17+8`(%rsp) # original $num
xor $num,$num
___
for($i=0;$i<2;$i++) {
@@ -748,10 +770,14 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_shaext:
ret
+.cfi_endproc
.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
___
}}}
@@ -994,6 +1020,7 @@ $code.=<<___;
.type sha1_multi_block_avx,\@function,3
.align 32
sha1_multi_block_avx:
+.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
@@ -1008,8 +1035,11 @@ $code.=<<___ if ($avx>1);
___
$code.=<<___;
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -1028,6 +1058,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
lea K_XX_XX(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
@@ -1116,7 +1147,8 @@ $code.=<<___;
jnz .Loop_grande_avx
.Ldone_avx:
- mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
+ mov `$REG_SZ*17`(%rsp),%rax # original %rsp
+.cfi_def_cfa %rax,8
vzeroupper
___
$code.=<<___ if ($win64);
@@ -1133,10 +1165,14 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
+.cfi_endproc
.size sha1_multi_block_avx,.-sha1_multi_block_avx
___
@@ -1156,14 +1192,22 @@ $code.=<<___;
.type sha1_multi_block_avx2,\@function,3
.align 32
sha1_multi_block_avx2:
+.cfi_startproc
_avx2_shortcut:
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -1182,6 +1226,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
lea K_XX_XX(%rip),$Tbl
shr \$1,$num
@@ -1271,7 +1316,8 @@ $code.=<<___;
#jnz .Loop_grande_avx2
.Ldone_avx2:
- mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
+ mov `$REG_SZ*17`(%rsp),%rax # original %rsp
+.cfi_def_cfa %rax,8
vzeroupper
___
$code.=<<___ if ($win64);
@@ -1288,14 +1334,22 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -48(%rax),%r15
+.cfi_restore %r15
mov -40(%rax),%r14
+.cfi_restore %r14
mov -32(%rax),%r13
+.cfi_restore %r13
mov -24(%rax),%r12
+.cfi_restore %r12
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
+.cfi_endproc
.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
___
} }}}
@@ -1454,10 +1508,10 @@ avx2_handler:
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore cotnext->R12
- mov %r13,224($context) # restore cotnext->R13
- mov %r14,232($context) # restore cotnext->R14
- mov %r15,240($context) # restore cotnext->R15
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
lea -56-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
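Beyond the CFI annotations, the mechanism worth noting in this multi-block file is how lanes of unequal length are retired: each input stream carries a block counter next to the data pointers, and once a lane's counter runs out its pointer is redirected at the constant table ("cancel input"), so the SIMD loads remain valid while that lane's result is simply discarded. A scalar Perl sketch of that bookkeeping, with hypothetical field names and @messages standing in for already-padded inputs:

my @lanes = map { { inp => $_, blocks => length($_) / 64 } } @messages;
my ($max) = sort { $b <=> $a } map { $_->{blocks} } @lanes;
my $dummy = "\0" x 64;                      # plays the role of the K_XX_XX table
for my $round (1 .. $max) {
    for my $lane (@lanes) {
        $lane->{cur} = $lane->{blocks} >= $round
                     ? substr($lane->{inp}, ($round - 1) * 64, 64)
                     : $dummy;              # "cancel input" once this stream is done
    }
    # ... hash one block per lane in parallel; cancelled lanes' outputs are ignored ...
}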
diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl
index 340849389993..08f84bc3b3d9 100755
--- a/crypto/sha/asm/sha1-mips.pl
+++ b/crypto/sha/asm/sha1-mips.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -49,15 +56,15 @@
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
- $PTR_ADD="dadd"; # incidentally works even on n32
- $PTR_SUB="dsub"; # incidentally works even on n32
+ $PTR_ADD="daddu"; # incidentally works even on n32
+ $PTR_SUB="dsubu"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
- $PTR_ADD="add";
- $PTR_SUB="sub";
+ $PTR_ADD="addu";
+ $PTR_SUB="subu";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
@@ -68,9 +75,9 @@ if ($flavour =~ /64|n32/i) {
#
######################################################################
-$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});
+$big_endian=(`echo MIPSEB | $ENV{CC} -E -`=~/MIPSEB/)?0:1 if ($ENV{CC});
-for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
+for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
if (!defined($big_endian))
@@ -119,10 +126,14 @@ $code.=<<___;
addu $e,$K # $i
xor $t0,$c,$d
rotr $t1,$a,27
- lwl @X[$j],$j*4+$MSB($inp)
and $t0,$b
addu $e,$t1
+#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
+ lw @X[$j],$j*4($inp)
+#else
+ lwl @X[$j],$j*4+$MSB($inp)
lwr @X[$j],$j*4+$LSB($inp)
+#endif
xor $t0,$d
addu $e,@X[$i]
rotr $b,$b,2
@@ -325,17 +336,11 @@ $code.=<<___ if ($i<79);
___
}
-$FRAMESIZE=16; # large enough to accomodate NUBI saved registers
-$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000";
$code=<<___;
-#ifdef OPENSSL_FIPSCANISTER
-# include <openssl/fipssyms.h>
-#endif
-
-#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
-#define _MIPS_ARCH_MIPS32R2
-#endif
+#include "mips_arch.h"
.text
@@ -380,10 +385,16 @@ $code.=<<___;
.align 4
.Loop:
.set reorder
+#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
+ lui $K,0x5a82
+ lw @X[0],($inp)
+ ori $K,0x7999 # K_00_19
+#else
lwl @X[0],$MSB($inp)
lui $K,0x5a82
lwr @X[0],$LSB($inp)
ori $K,0x7999 # K_00_19
+#endif
___
for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
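The MIPS changes largely track release 6 of the architecture: R6 removed the lwl/lwr pair the code used for unaligned loads, so the generator now emits a plain lw under _MIPS_ARCH_MIPS32R6/_MIPS_ARCH_MIPS64R6 and keeps lwl/lwr elsewhere (the endianness probe is likewise flipped to test MIPSEB). A condensed sketch of that conditional emission as a Perl helper; load_word is an illustrative name, and $msb/$lsb are the 0/3 byte offsets the module already defines:

sub load_word {
    my ($reg, $off, $ptr, $msb, $lsb) = @_;
    return <<___;
#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
	lw	$reg,$off($ptr)		# R6 has no lwl/lwr
#else
	lwl	$reg,$off+$msb($ptr)	# unaligned, endian-aware load
	lwr	$reg,$off+$lsb($ptr)
#endif
___
}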
diff --git a/crypto/sha/asm/sha1-parisc.pl b/crypto/sha/asm/sha1-parisc.pl
index 6e5a328a6f1f..b001be16a23c 100755
--- a/crypto/sha/asm/sha1-parisc.pl
+++ b/crypto/sha/asm/sha1-parisc.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -253,8 +260,20 @@ $code.=<<___;
.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/,\*/,/gm if ($SIZE_T==4);
-$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
-print $code;
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler/) {
+ $gnuas = 1;
+}
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+
+ s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
+ s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
+ s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
+ s/,\*/,/ if ($SIZE_T==4);
+ s/\bbv\b/bve/ if ($SIZE_T==8);
+
+ print $_,"\n";
+}
close STDOUT;
diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl
index ab655021ccd6..0cda0a3e1517 100755
--- a/crypto/sha/asm/sha1-ppc.pl
+++ b/crypto/sha/asm/sha1-ppc.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -37,7 +44,7 @@ if ($flavour =~ /64/) {
$PUSH ="stw";
} else { die "nonsense $flavour"; }
-# Define endianess based on flavour
+# Define endianness based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
diff --git a/crypto/sha/asm/sha1-s390x.pl b/crypto/sha/asm/sha1-s390x.pl
index d5cf1640a120..5729c3089877 100755
--- a/crypto/sha/asm/sha1-s390x.pl
+++ b/crypto/sha/asm/sha1-s390x.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -28,7 +35,8 @@
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
-# remains z/Architecture specific.
+# remains z/Architecture specific. On z990 it was measured to perform
+# 23% better than code generated by gcc 4.3.
$kimdfunc=1; # magic function code for kimd instruction
@@ -42,7 +50,7 @@ if ($flavour =~ /3[12]/) {
$g="g";
}
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$K_00_39="%r0"; $K=$K_00_39;
@@ -152,6 +160,8 @@ ___
}
$code.=<<___;
+#include "s390x_arch.h"
+
.text
.align 64
.type Ktable,\@object
@@ -164,10 +174,7 @@ sha1_block_data_order:
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
- lg %r0,0(%r1)
- tmhl %r0,0x4000 # check for message-security assist
- jz .Lsoftware
- lg %r0,16(%r1) # check kimd capabilities
+ lg %r0,S390X_KIMD(%r1) # check kimd capabilities
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
@@ -234,7 +241,6 @@ $code.=<<___;
br %r14
.size sha1_block_data_order,.-sha1_block_data_order
.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm OPENSSL_s390xcap_P,80,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/sha/asm/sha1-sparcv9.pl b/crypto/sha/asm/sha1-sparcv9.pl
index b5efcde5c139..3e612e3d5f68 100755
--- a/crypto/sha/asm/sha1-sparcv9.pl
+++ b/crypto/sha/asm/sha1-sparcv9.pl
@@ -1,12 +1,19 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
-# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+# Hardware SPARC T4 support by David S. Miller
# ====================================================================
# Performance improvement is not really impressive on pre-T1 CPU: +8%
@@ -25,7 +32,7 @@
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.
-$output=shift;
+$output=pop;
open STDOUT,">$output";
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
@@ -220,7 +227,7 @@ sha1_block_data_order:
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
- subcc %o2, 1, %o2 ! done yet?
+ subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
@@ -368,7 +375,7 @@ ___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
diff --git a/crypto/sha/asm/sha1-sparcv9a.pl b/crypto/sha/asm/sha1-sparcv9a.pl
index e65291bbd979..50d3e136a12d 100755
--- a/crypto/sha/asm/sha1-sparcv9a.pl
+++ b/crypto/sha/asm/sha1-sparcv9a.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -512,7 +519,7 @@ $code.=<<___;
mov $Cctx,$C
mov $Dctx,$D
mov $Ectx,$E
- alignaddr %g0,$tmp0,%g0
+ alignaddr %g0,$tmp0,%g0
dec 1,$len
ba .Loop
mov $nXfer,$Xfer
@@ -544,7 +551,7 @@ ___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
diff --git a/crypto/sha/asm/sha1-thumb.pl b/crypto/sha/asm/sha1-thumb.pl
index 7c9ea9b0296c..ac74a25d6ead 100755
--- a/crypto/sha/asm/sha1-thumb.pl
+++ b/crypto/sha/asm/sha1-thumb.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -14,7 +21,7 @@
# The code does not present direct interest to OpenSSL, because of low
# performance. Its purpose is to establish _size_ benchmark. Pretty
# useless one I must say, because 30% or 88 bytes larger ARMv4 code
-# [avialable on demand] is almost _twice_ as fast. It should also be
+# [available on demand] is almost _twice_ as fast. It should also be
# noted that in-lining of .Lcommon and .Lrotate improves performance
# by over 40%, while code increases by only 10% or 32 bytes. But once
# again, the goal was to establish _size_ benchmark, not performance.
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index 752138b0eac1..60819f61867c 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -73,13 +80,18 @@
# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
# Haswell 5.45 4.15/+31% 3.57/+53%
+# Skylake 5.18 4.06/+28% 3.54/+46%
# Bulldozer 9.11 5.95/+53%
+# Ryzen 4.75 3.80/+24% 1.93/+150%(**)
# VIA Nano 9.32 7.15/+30%
# Atom 10.3 9.17/+12%
# Silvermont 13.1(*) 9.37/+40%
+# Knights L 13.2(*) 9.68/+36% 8.30/+59%
+# Goldmont 8.13 6.42/+27% 1.70/+380%(**)
#
# (*) obviously suboptimal result, nothing was done about it,
# because SSSE3 code is compiled unconditionally;
+# (**) SHAEXT result
$flavour = shift;
$output = shift;
@@ -114,7 +126,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
$shaext=1; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$ctx="%rdi"; # 1st arg
@@ -247,6 +259,7 @@ $code.=<<___;
.type sha1_block_data_order,\@function,3
.align 16
sha1_block_data_order:
+.cfi_startproc
mov OPENSSL_ia32cap_P+0(%rip),%r9d
mov OPENSSL_ia32cap_P+4(%rip),%r8d
mov OPENSSL_ia32cap_P+8(%rip),%r10d
@@ -254,7 +267,7 @@ sha1_block_data_order:
jz .Lialu
___
$code.=<<___ if ($shaext);
- test \$`1<<29`,%r10d # check SHA bit
+ test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);
@@ -275,17 +288,24 @@ $code.=<<___;
.align 16
.Lialu:
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
mov %rdi,$ctx # reassigned argument
sub \$`8+16*4`,%rsp
mov %rsi,$inp # reassigned argument
and \$-64,%rsp
mov %rdx,$num # reassigned argument
mov %rax,`16*4`(%rsp)
+.cfi_cfa_expression %rsp+64,deref,+8
.Lprologue:
mov 0($ctx),$A
@@ -319,14 +339,22 @@ $code.=<<___;
jnz .Lloop
mov `16*4`(%rsp),%rsi
+.cfi_def_cfa %rsi,8
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue:
ret
+.cfi_endproc
.size sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
@@ -342,6 +370,7 @@ $code.=<<___;
.align 32
sha1_block_data_order_shaext:
_shaext_shortcut:
+.cfi_startproc
___
$code.=<<___ if ($win64);
lea `-8-4*16`(%rsp),%rsp
@@ -439,6 +468,7 @@ $code.=<<___ if ($win64);
.Lepilogue_shaext:
___
$code.=<<___;
+.cfi_endproc
ret
.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
@@ -452,7 +482,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
-my $K_XX_XX="%r11";
+my $K_XX_XX="%r14";
+my $fp="%r11";
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
@@ -473,25 +504,31 @@ $code.=<<___;
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
- mov %rsp,%rax
+.cfi_startproc
+ mov %rsp,$fp # frame pointer
+.cfi_def_cfa_register $fp
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13 # redundant, done to share Win64 SE handler
+.cfi_push %r13
push %r14
+.cfi_push %r14
lea `-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-40-6*16(%rax)
- movaps %xmm7,-40-5*16(%rax)
- movaps %xmm8,-40-4*16(%rax)
- movaps %xmm9,-40-3*16(%rax)
- movaps %xmm10,-40-2*16(%rax)
- movaps %xmm11,-40-1*16(%rax)
+ movaps %xmm6,-40-6*16($fp)
+ movaps %xmm7,-40-5*16($fp)
+ movaps %xmm8,-40-4*16($fp)
+ movaps %xmm9,-40-3*16($fp)
+ movaps %xmm10,-40-2*16($fp)
+ movaps %xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
@@ -541,7 +578,7 @@ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
-sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
+sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -898,23 +935,29 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_ssse3:
ret
+.cfi_endproc
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
@@ -935,26 +978,32 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
- mov %rsp,%rax
+.cfi_startproc
+ mov %rsp,$fp
+.cfi_def_cfa_register $fp
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13 # redundant, done to share Win64 SE handler
+.cfi_push %r13
push %r14
+.cfi_push %r14
lea `-64-($win64?6*16:0)`(%rsp),%rsp
vzeroupper
___
$code.=<<___ if ($win64);
- vmovaps %xmm6,-40-6*16(%rax)
- vmovaps %xmm7,-40-5*16(%rax)
- vmovaps %xmm8,-40-4*16(%rax)
- vmovaps %xmm9,-40-3*16(%rax)
- vmovaps %xmm10,-40-2*16(%rax)
- vmovaps %xmm11,-40-1*16(%rax)
+ vmovaps %xmm6,-40-6*16($fp)
+ vmovaps %xmm7,-40-5*16($fp)
+ vmovaps %xmm8,-40-4*16($fp)
+ vmovaps %xmm9,-40-3*16($fp)
+ vmovaps %xmm10,-40-2*16($fp)
+ vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
@@ -994,7 +1043,7 @@ $code.=<<___;
jmp .Loop_avx
___
-sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
+sub Xupdate_avx_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -1262,23 +1311,29 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
+.cfi_endproc
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
@@ -1302,26 +1357,32 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx2:
_avx2_shortcut:
- mov %rsp,%rax
+.cfi_startproc
+ mov %rsp,$fp
+.cfi_def_cfa_register $fp
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
vzeroupper
___
$code.=<<___ if ($win64);
lea -6*16(%rsp),%rsp
- vmovaps %xmm6,-40-6*16(%rax)
- vmovaps %xmm7,-40-5*16(%rax)
- vmovaps %xmm8,-40-4*16(%rax)
- vmovaps %xmm9,-40-3*16(%rax)
- vmovaps %xmm10,-40-2*16(%rax)
- vmovaps %xmm11,-40-1*16(%rax)
+ vmovaps %xmm6,-40-6*16($fp)
+ vmovaps %xmm7,-40-5*16($fp)
+ vmovaps %xmm8,-40-4*16($fp)
+ vmovaps %xmm9,-40-3*16($fp)
+ vmovaps %xmm10,-40-2*16($fp)
+ vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
@@ -1466,7 +1527,7 @@ sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path
)
}
-sub Xupdate_avx2_16_31() # recall that $Xi starts wtih 4
+sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions
@@ -1741,23 +1802,29 @@ $code.=<<___;
vzeroupper
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
+.cfi_endproc
.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
@@ -1898,15 +1965,13 @@ ssse3_handler:
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
- mov 152($context),%rax # pull context->Rsp
+ mov 208($context),%rax # pull context->R11
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- mov 232($context),%rax # pull context->R14
-
lea -40-6*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$12,%ecx
@@ -1919,9 +1984,9 @@ ssse3_handler:
mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore cotnext->R12
- mov %r13,224($context) # restore cotnext->R13
- mov %r14,232($context) # restore cotnext->R14
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
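
The SEH handler hunk above reflects the switch to keeping the caller's stack pointer in %r11 for the body of each routine: ssse3_handler now recovers the pre-prologue %rsp from the saved R11 slot of the Win64 CONTEXT record (offset 208) instead of the R14 slot (offset 232). A minimal Perl sketch of the offsets involved, with a hypothetical hash name used only for illustration:

    # Hypothetical helper, not part of the patch: the Win64 CONTEXT offsets
    # the handler reads and writes.  With the frame pointer now kept in
    # %r11, the pre-prologue %rsp comes from offset 208 rather than 232.
    my %context_off = (
        Rbx => 144, Rsp => 152, Rbp => 160,
        R11 => 208, R12 => 216, R13 => 224, R14 => 232, R15 => 240,
        Xmm6 => 512,                      # start of the saved XMM area
    );
    printf "frame-pointer slot: %d\n", $context_off{R11};   # prints 208
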
diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl
index e9077143817c..dccc771ad584 100755
--- a/crypto/sha/asm/sha256-586.pl
+++ b/crypto/sha/asm/sha256-586.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -11,7 +18,7 @@
#
# Performance improvement over compiler generated code varies from
# 10% to 40% [see below]. Not very impressive on some µ-archs, but
-# it's 5 times smaller and optimizies amount of writes.
+# it's 5 times smaller and optimizes amount of writes.
#
# May 2012.
#
@@ -40,7 +47,7 @@
#
# Performance in clock cycles per processed byte (less is better):
#
-# gcc icc x86 asm(*) SIMD x86_64 asm(**)
+# gcc icc x86 asm(*) SIMD x86_64 asm(**)
# Pentium 46 57 40/38 - -
# PIII 36 33 27/24 - -
# P4 41 38 28 - 17.3
@@ -50,20 +57,26 @@
# Sandy Bridge 25 - 15.9 12.4 11.6
# Ivy Bridge 24 - 15.0 11.4 10.3
# Haswell 22 - 13.9 9.46 7.80
+# Skylake 20 - 14.9 9.50 7.70
# Bulldozer 36 - 27/22 17.0 13.6
# VIA Nano 36 - 25/22 16.8 16.5
# Atom 50 - 30/25 21.9 18.9
# Silvermont 40 - 34/31 22.9 20.6
+# Goldmont 29 - 20 16.3(***)
#
# (*) numbers after slash are for unrolled loop, where applicable;
# (**) x86_64 assembly performance is presented for reference
# purposes, results are best-available;
+# (***) SHAEXT result is 4.1, strangely enough better than 64-bit one;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
$xmm=$avx=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
@@ -83,7 +96,7 @@ if ($xmm && !$avx && $ARGV[0] eq "win32" &&
$avx = ($1>=10) + ($1>=11);
}
-if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
@@ -266,7 +279,7 @@ my $suffix=shift;
&mov ($Coff,"ecx");
&mov ($Doff,"edi");
&mov (&DWP(0,"esp"),"ebx"); # magic
- &mov ($E,&DWP(16,"esi"));
+ &mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("edi",&DWP(28,"esi"));
@@ -375,7 +388,7 @@ my @AH=($A,$K256);
&xor ($AH[1],"ecx"); # magic
&mov (&DWP(8,"esp"),"ecx");
&mov (&DWP(12,"esp"),"ebx");
- &mov ($E,&DWP(16,"esi"));
+ &mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("esi",&DWP(28,"esi"));
@@ -1279,3 +1292,5 @@ sub bodyx_00_15 () { # +10%
&function_end_B("sha256_block_data_order");
&asm_finish();
+
+close STDOUT;
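
Together with the matching sha512-586.pl hunks further down, this moves the x86 scripts to the newer perlasm convention: the output file is the last command-line argument, asm_init() no longer takes a script-name argument, and STDOUT is closed explicitly at the end so the output is flushed. A minimal sketch of that convention, assuming nothing beyond what the hunks show (error checks added here for illustration; the scripts themselves omit them):

    # Minimal sketch of the shared output handling in the 586 scripts.
    $output = pop(@ARGV);                        # output file is the last arg
    open STDOUT, ">$output" or die "can't open $output: $!";
    &asm_init($ARGV[0], $ARGV[$#ARGV] eq "386"); # flavour, plus the 386 flag
    # ... emit the module ...
    &asm_finish();
    close STDOUT or die "error writing $output: $!";
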
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 750216eb4267..edcfc31278e3 100755
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -37,8 +44,20 @@
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
@@ -73,7 +92,9 @@ $code.=<<___ if ($i<16);
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
rev $t1,$t1
+# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
@@ -161,15 +182,11 @@ $code=<<___;
#endif
.text
-#if __ARM_ARCH__<7
-.code 32
-#else
+#if defined(__thumb2__)
.syntax unified
-# ifdef __thumb2__
.thumb
-# else
+#else
.code 32
-# endif
#endif
.type K256,%object
@@ -195,21 +212,25 @@ K256:
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_block_data_order
+.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
-#if __ARM_ARCH__<7
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha256_block_data_order
#else
- adr r3,.
+ adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
@@ -233,7 +254,7 @@ for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
@@ -454,7 +475,8 @@ $code.=<<___;
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
-.align 4
+.align 5
+.skip 16
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
@@ -580,7 +602,7 @@ my $Ktbl="r3";
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-# ifdef __thumb2__
+# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
@@ -591,14 +613,11 @@ $code.=<<___;
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
-# ifdef __thumb2__
- adr $Ktbl,.LARMv8
- sub $Ktbl,$Ktbl,#.LARMv8-K256
-# else
- adrl $Ktbl,K256
-# endif
+ sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ b .Loop_v8
+.align 4
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64xplus.pl
new file mode 100755
index 000000000000..3ab7d9b68946
--- /dev/null
+++ b/crypto/sha/asm/sha256-c64xplus.pl
@@ -0,0 +1,320 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256 for C64x+.
+#
+# January 2012
+#
+# Performance is just below 10 cycles per processed byte, which is
+# almost 40% faster than compiler-generated code. Unroll is unlikely
+# to give more than ~8% improvement...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and, for their own
+# well-being, zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
+ $K256="A3";
+
+($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
+ =map("A$_",(16..31));
+($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
+ =map("B$_",(16..31));
+
+($Xia,$Xib)=("A5","B5"); # circular/ring buffer
+ $CTXB=$t2e;
+
+($Xn,$X0,$K)=("B7","B8","B9");
+($Maj,$Ch)=($T2,"B6");
+
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg sha256_block_data_order,_sha256_block_data_order
+ .endif
+
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+
+ .if .BIG_ENDIAN
+ .asg SWAP2,MV
+ .asg SWAP4,MV
+ .endif
+
+ .global _sha256_block_data_order
+_sha256_block_data_order:
+__sha256_block:
+ .asmfunc stack_usage(64)
+ MV $NUM,A0 ; reassign $NUM
+|| MVK -64,B0
+ [!A0] BNOP RA ; if ($NUM==0) return;
+|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
+|| [A0] MV SP,FP
+ [A0] ADDKPC __sha256_block,B2
+|| [A0] AND B0,SP,SP ; align stack at 64 bytes
+ .if __TI_EABI__
+ [A0] MVK 0x00404,B1
+|| [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256
+ [A0] MVKH 0x50000,B1
+|| [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256
+ .else
+ [A0] MVK 0x00404,B1
+|| [A0] MVKL (K256-__sha256_block),$K256
+ [A0] MVKH 0x50000,B1
+|| [A0] MVKH (K256-__sha256_block),$K256
+ .endif
+ [A0] MVC B1,AMR ; setup circular addressing
+|| [A0] MV SP,$Xia
+ [A0] MV SP,$Xib
+|| [A0] ADD B2,$K256,$K256
+|| [A0] MV $CTXA,$CTXB
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ LDW *${CTXA}[0],$A ; load ctx
+|| LDW *${CTXB}[4],$E
+ LDW *${CTXA}[1],$B
+|| LDW *${CTXB}[5],$F
+ LDW *${CTXA}[2],$C
+|| LDW *${CTXB}[6],$G
+ LDW *${CTXA}[3],$D
+|| LDW *${CTXB}[7],$H
+
+ LDNW *$INP++,$Xn ; pre-fetch input
+ LDW *$K256++,$K ; pre-fetch K256[0]
+ MVK 14,B0 ; loop counters
+ MVK 47,B1
+|| ADDAW $Xia,9,$Xia
+outerloop?:
+ SUB A0,1,A0
+|| MV $A,$Actx
+|| MV $E,$Ectx
+|| MVD $B,$Bctx
+|| MVD $F,$Fctx
+ MV $C,$Cctx
+|| MV $G,$Gctx
+|| MVD $D,$Dctx
+|| MVD $H,$Hctx
+|| SWAP4 $Xn,$X0
+
+ SPLOOPD 8 ; BODY_00_14
+|| MVC B0,ILC
+|| SWAP2 $X0,$X0
+
+ LDNW *$INP++,$Xn
+|| ROTL $A,30,$S0
+|| OR $A,$B,$Maj
+|| AND $A,$B,$t2a
+|| ROTL $E,26,$S1
+|| AND $F,$E,$Ch
+|| ANDN $G,$E,$t2e
+ ROTL $A,19,$t0a
+|| AND $C,$Maj,$Maj
+|| ROTL $E,21,$t0e
+|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
+ ROTL $A,10,$t1a
+|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+|| ROTL $E,7,$t1e
+|| ADD $K,$H,$T1 ; T1 = h + K256[i]
+ ADD $X0,$T1,$T1 ; T1 += X[i];
+|| STW $X0,*$Xib++
+|| XOR $t0a,$S0,$S0
+|| XOR $t0e,$S1,$S1
+ XOR $t1a,$S0,$S0 ; Sigma0(a)
+|| XOR $t1e,$S1,$S1 ; Sigma1(e)
+|| LDW *$K256++,$K ; pre-fetch K256[i+1]
+|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
+ ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
+|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
+|| ROTL $G,0,$H ; h = g
+|| MV $F,$G ; g = f
+|| MV $X0,$X14
+|| SWAP4 $Xn,$X0
+ SWAP2 $X0,$X0
+|| MV $E,$F ; f = e
+|| ADD $D,$T1,$E ; e = d + T1
+|| MV $C,$D ; d = c
+ MV $B,$C ; c = b
+|| MV $A,$B ; b = a
+|| ADD $T1,$T2,$A ; a = T1 + T2
+ SPKERNEL
+
+ ROTL $A,30,$S0 ; BODY_15
+|| OR $A,$B,$Maj
+|| AND $A,$B,$t2a
+|| ROTL $E,26,$S1
+|| AND $F,$E,$Ch
+|| ANDN $G,$E,$t2e
+|| LDW *${Xib}[1],$Xn ; modulo-scheduled
+ ROTL $A,19,$t0a
+|| AND $C,$Maj,$Maj
+|| ROTL $E,21,$t0e
+|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
+|| LDW *${Xib}[2],$X1 ; modulo-scheduled
+ ROTL $A,10,$t1a
+|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+|| ROTL $E,7,$t1e
+|| ADD $K,$H,$T1 ; T1 = h + K256[i]
+ ADD $X0,$T1,$T1 ; T1 += X[i];
+|| STW $X0,*$Xib++
+|| XOR $t0a,$S0,$S0
+|| XOR $t0e,$S1,$S1
+ XOR $t1a,$S0,$S0 ; Sigma0(a)
+|| XOR $t1e,$S1,$S1 ; Sigma1(e)
+|| LDW *$K256++,$K ; pre-fetch K256[i+1]
+|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
+ ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
+|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
+|| ROTL $G,0,$H ; h = g
+|| MV $F,$G ; g = f
+|| MV $X0,$X15
+ MV $E,$F ; f = e
+|| ADD $D,$T1,$E ; e = d + T1
+|| MV $C,$D ; d = c
+|| MV $Xn,$X0 ; modulo-scheduled
+|| LDW *$Xia,$X9 ; modulo-scheduled
+|| ROTL $X1,25,$t0e ; modulo-scheduled
+|| ROTL $X14,15,$t0a ; modulo-scheduled
+ SHRU $X1,3,$s0 ; modulo-scheduled
+|| SHRU $X14,10,$s1 ; modulo-scheduled
+|| ROTL $B,0,$C ; c = b
+|| MV $A,$B ; b = a
+|| ADD $T1,$T2,$A ; a = T1 + T2
+
+ SPLOOPD 10 ; BODY_16_63
+|| MVC B1,ILC
+|| ROTL $X1,14,$t1e ; modulo-scheduled
+|| ROTL $X14,13,$t1a ; modulo-scheduled
+
+ XOR $t0e,$s0,$s0
+|| XOR $t0a,$s1,$s1
+|| MV $X15,$X14
+|| MV $X1,$Xn
+ XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
+|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
+|| LDW *${Xib}[2],$X1 ; module-scheduled
+ ROTL $A,30,$S0
+|| OR $A,$B,$Maj
+|| AND $A,$B,$t2a
+|| ROTL $E,26,$S1
+|| AND $F,$E,$Ch
+|| ANDN $G,$E,$t2e
+|| ADD $X9,$X0,$X0 ; X[i] += X[i+9]
+ ROTL $A,19,$t0a
+|| AND $C,$Maj,$Maj
+|| ROTL $E,21,$t0e
+|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
+|| ADD $s0,$X0,$X0 ; X[i] += sigma1(X[i+1])
+ ROTL $A,10,$t1a
+|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+|| ROTL $E,7,$t1e
+|| ADD $H,$K,$T1 ; T1 = h + K256[i]
+|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
+ XOR $t0a,$S0,$S0
+|| XOR $t0e,$S1,$S1
+|| ADD $X0,$T1,$T1 ; T1 += X[i]
+|| STW $X0,*$Xib++
+ XOR $t1a,$S0,$S0 ; Sigma0(a)
+|| XOR $t1e,$S1,$S1 ; Sigma1(e)
+|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
+|| MV $X0,$X15
+|| ROTL $G,0,$H ; h = g
+|| LDW *$K256++,$K ; pre-fetch K256[i+1]
+ ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
+|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
+|| MV $F,$G ; g = f
+|| MV $Xn,$X0 ; modulo-scheduled
+|| LDW *++$Xia,$X9 ; modulo-scheduled
+|| ROTL $X1,25,$t0e ; module-scheduled
+|| ROTL $X14,15,$t0a ; modulo-scheduled
+ ROTL $X1,14,$t1e ; modulo-scheduled
+|| ROTL $X14,13,$t1a ; modulo-scheduled
+|| MV $E,$F ; f = e
+|| ADD $D,$T1,$E ; e = d + T1
+|| MV $C,$D ; d = c
+|| MV $B,$C ; c = b
+ MV $A,$B ; b = a
+|| ADD $T1,$T2,$A ; a = T1 + T2
+|| SHRU $X1,3,$s0 ; modulo-scheduled
+|| SHRU $X14,10,$s1 ; modulo-scheduled
+ SPKERNEL
+
+ [A0] B outerloop?
+|| [A0] LDNW *$INP++,$Xn ; pre-fetch input
+|| [A0] ADDK -260,$K256 ; rewind K256
+|| ADD $Actx,$A,$A ; accumulate ctx
+|| ADD $Ectx,$E,$E
+|| ADD $Bctx,$B,$B
+ ADD $Fctx,$F,$F
+|| ADD $Cctx,$C,$C
+|| ADD $Gctx,$G,$G
+|| ADD $Dctx,$D,$D
+|| ADD $Hctx,$H,$H
+|| [A0] LDW *$K256++,$K ; pre-fetch K256[0]
+
+ [!A0] BNOP RA
+||[!A0] MV $CTXA,$CTXB
+ [!A0] MV FP,SP ; restore stack pointer
+||[!A0] LDW *FP[0],FP ; restore frame pointer
+ [!A0] STW $A,*${CTXA}[0] ; save ctx
+||[!A0] STW $E,*${CTXB}[4]
+||[!A0] MVK 0,B0
+ [!A0] STW $B,*${CTXA}[1]
+||[!A0] STW $F,*${CTXB}[5]
+||[!A0] MVC B0,AMR ; clear AMR
+ STW $C,*${CTXA}[2]
+|| STW $G,*${CTXB}[6]
+ STW $D,*${CTXA}[3]
+|| STW $H,*${CTXB}[7]
+ .endasmfunc
+
+ .if __TI_EABI__
+ .sect ".text:sha_asm.const"
+ .else
+ .sect ".const:sha_asm"
+ .endif
+ .align 128
+K256:
+ .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+
+___
+
+print $code;
+close STDOUT;
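
For readers following the per-instruction comments in the C64x+ schedule above (Ch, Maj, Sigma0/Sigma1, sigma0/sigma1), this is the plain scalar round those comments spell out, written as an illustrative Perl sketch rather than anything taken from the patch:

    # Reference sketch only: the scalar SHA-256 round the software-pipelined
    # loops above implement.  Rotation counts are the FIPS 180-4 ones.
    use integer;
    sub ror32  { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
    sub Ch     { my ($e,$f,$g) = @_; ($e & $f) ^ (~$e & $g) }        # (e&f)^(~e&g)
    sub Maj    { my ($a,$b,$c) = @_; (($a | $b) & $c) | ($a & $b) }  # ((a|b)&c)|(a&b)
    sub Sigma0 { my $a = shift; ror32($a,2)  ^ ror32($a,13) ^ ror32($a,22) }
    sub Sigma1 { my $e = shift; ror32($e,6)  ^ ror32($e,11) ^ ror32($e,25) }
    sub sigma0 { my $x = shift; ror32($x,7)  ^ ror32($x,18) ^ ($x >> 3)  }
    sub sigma1 { my $x = shift; ror32($x,17) ^ ror32($x,19) ^ ($x >> 10) }
    # Per round: T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i];
    #            T2 = Sigma0(a) + Maj(a,b,c);
    #            (a,b,c,d,e,f,g,h) = (T1+T2, a, b, c, d+T1, e, f, g).

The Maj(a,b,c) = ((a|b)&c)|(a&b) form used in the comments is an equivalent rewrite of the usual (a&b)^(a&c)^(b&c) majority function.
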
diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl
index 9770286b9596..73978dbd81d6 100755
--- a/crypto/sha/asm/sha256-mb-x86_64.pl
+++ b/crypto/sha/asm/sha256-mb-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -19,6 +26,7 @@
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
+# Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170%
# Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
#
# (i) multi-block CBC encrypt with 128-bit key;
@@ -28,7 +36,7 @@
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 20.3+4.44=24.7;
# (iv) presented improvement coefficients are asymptotic limits and
-# in real-life application are somewhat lower, e.g. for 2KB
+# in real-life application are somewhat lower, e.g. for 2KB
# fragments they range from 75% to 130% (on Haswell);
$flavour = shift;
@@ -63,7 +71,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
$avx = ($2>=3.0) + ($2>3.0);
}
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
# void sha256_multi_block (
@@ -236,6 +244,7 @@ $code.=<<___;
.type sha256_multi_block,\@function,3
.align 32
sha256_multi_block:
+.cfi_startproc
mov OPENSSL_ia32cap_P+4(%rip),%rcx
bt \$61,%rcx # check SHA bit
jc _shaext_shortcut
@@ -246,8 +255,11 @@ $code.=<<___ if ($avx);
___
$code.=<<___;
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -266,6 +278,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody:
lea K256+128(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
@@ -382,7 +395,8 @@ $code.=<<___;
jnz .Loop_grande
.Ldone:
- mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
+ mov `$REG_SZ*17`(%rsp),%rax # original %rsp
+.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps -0xb8(%rax),%xmm6
@@ -398,10 +412,14 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue:
ret
+.cfi_endproc
.size sha256_multi_block,.-sha256_multi_block
___
{{{
@@ -413,10 +431,14 @@ $code.=<<___;
.type sha256_multi_block_shaext,\@function,3
.align 32
sha256_multi_block_shaext:
+.cfi_startproc
_shaext_shortcut:
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -442,7 +464,7 @@ $code.=<<___;
lea K256_shaext+0x80(%rip),$Tbl
.Loop_grande_shaext:
- mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num
+ mov $num,`$REG_SZ*17+8`(%rsp) # original $num
xor $num,$num
___
for($i=0;$i<2;$i++) {
@@ -750,10 +772,14 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_shaext:
ret
+.cfi_endproc
.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
}}}
@@ -913,6 +939,7 @@ $code.=<<___;
.type sha256_multi_block_avx,\@function,3
.align 32
sha256_multi_block_avx:
+.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
@@ -927,8 +954,11 @@ $code.=<<___ if ($avx>1);
___
$code.=<<___;
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -947,6 +977,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
lea K256+128(%rip),$Tbl
lea `$REG_SZ*16`(%rsp),%rbx
@@ -1061,7 +1092,8 @@ $code.=<<___;
jnz .Loop_grande_avx
.Ldone_avx:
- mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
+ mov `$REG_SZ*17`(%rsp),%rax # original %rsp
+.cfi_def_cfa %rax,8
vzeroupper
___
$code.=<<___ if ($win64);
@@ -1078,10 +1110,14 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
+.cfi_endproc
.size sha256_multi_block_avx,.-sha256_multi_block_avx
___
if ($avx>1) {
@@ -1097,14 +1133,22 @@ $code.=<<___;
.type sha256_multi_block_avx2,\@function,3
.align 32
sha256_multi_block_avx2:
+.cfi_startproc
_avx2_shortcut:
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
@@ -1123,6 +1167,7 @@ $code.=<<___;
sub \$`$REG_SZ*18`, %rsp
and \$-256,%rsp
mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
+.cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
lea K256+128(%rip),$Tbl
lea 0x80($ctx),$ctx # size optimization
@@ -1237,7 +1282,8 @@ $code.=<<___;
#jnz .Loop_grande_avx2
.Ldone_avx2:
- mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
+ mov `$REG_SZ*17`(%rsp),%rax # original %rsp
+.cfi_def_cfa %rax,8
vzeroupper
___
$code.=<<___ if ($win64);
@@ -1254,14 +1300,22 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -48(%rax),%r15
+.cfi_restore %r15
mov -40(%rax),%r14
+.cfi_restore %r14
mov -32(%rax),%r13
+.cfi_restore %r13
mov -24(%rax),%r12
+.cfi_restore %r12
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
+.cfi_endproc
.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
} }}}
@@ -1454,10 +1508,10 @@ avx2_handler:
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore cotnext->R12
- mov %r13,224($context) # restore cotnext->R13
- mov %r14,232($context) # restore cotnext->R14
- mov %r15,240($context) # restore cotnext->R15
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
lea -56-10*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
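
All four sha256_multi_block entry points above gain call-frame information so the unwinder can walk the stack from any instruction; .cfi_push and .cfi_cfa_expression as written here appear to be perlasm-level shorthands expanded by the x86_64 translator rather than raw assembler directives. A small illustrative Perl loop showing the save/restore pairing the annotations follow (register list and offsets match the avx2 prologue and epilogue above):

    # Illustrative only: every callee-saved register pushed in the prologue
    # gets a matching restore plus .cfi_restore on the epilogue path.
    my @saved = qw(rbx rbp r12 r13 r14 r15);             # push order
    print  "\tpush\t%$_\n.cfi_push\t%$_\n" for @saved;
    printf "\tmov\t-%d(%%rax),%%%s\n.cfi_restore\t%%%s\n",
           8*($_+1), $saved[$_], $saved[$_]              for reverse 0..$#saved;
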
diff --git a/crypto/sha/asm/sha512-586.pl b/crypto/sha/asm/sha512-586.pl
index 2f6a202c3765..867ce30b9721 100755
--- a/crypto/sha/asm/sha512-586.pl
+++ b/crypto/sha/asm/sha512-586.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -25,15 +32,17 @@
# Sandy Bridge 58 - 35 11.9 11.2
# Ivy Bridge 50 - 33 11.5 8.17
# Haswell 46 - 29 11.3 7.66
+# Skylake 40 - 26 13.3 7.25
# Bulldozer 121 - 50 14.0 13.5
# VIA Nano 91 - 52 33 14.7
# Atom 126 - 68 48(***) 14.7
# Silvermont 97 - 58 42(***) 17.5
+# Goldmont 80 - 48 19.5 12.0
#
# (*) whichever best applicable.
# (**) x86_64 assembler performance is presented for reference
# purposes, the results are for integer-only code.
-# (***) paddq is increadibly slow on Atom.
+# (***) paddq is incredibly slow on Atom.
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
@@ -50,7 +59,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
@@ -373,7 +385,7 @@ if ($sse2) {
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
- #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
+ #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);
@@ -909,3 +921,5 @@ sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2
&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
+
+close STDOUT;
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index fb7dc506aca1..0b4c5674d9df 100755
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -1,10 +1,19 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
@@ -34,16 +43,9 @@
# terms it's 22.6 cycles per byte, which is disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, and for NEON-only sequences IPC(*) was found to
-# be limited by 1:-( 0.33 and 0.66 were measured for sequences with
-# ILPs(*) of 1 and 2 respectively. This in turn means that you can
-# even find yourself striving, as I did here, for achieving IPC
-# adequate to one delivered by Cortex A8 [for reference, it's
-# 0.5 for ILP of 1, and 1 for higher ILPs].
-#
-# (*) ILP, instruction-level parallelism, how many instructions
-# *can* execute at the same time. IPC, instructions per cycle,
-# indicates how many instructions actually execute.
+# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. On a side note, Cortex-A15 processes one byte in
+# 16 cycles.
# Byte order [in]dependence. =========================================
#
@@ -55,8 +57,20 @@ $hi="HI";
$lo="LO";
# ====================================================================
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; # parameter block
$inp="r1";
@@ -143,6 +157,9 @@ $code.=<<___;
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
+#ifdef __thumb2__
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@@ -180,7 +197,17 @@ $code.=<<___;
___
}
$code=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
#ifdef __ARMEL__
# define LO 0
# define HI 4
@@ -192,7 +219,14 @@ $code=<<___;
#endif
.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+# define adrl adr
+#else
.code 32
+#endif
+
.type K512,%object
.align 5
K512:
@@ -237,9 +271,9 @@ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
+.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
#else
.skip 32
@@ -248,14 +282,22 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha512_block_data_order
- add $len,$inp,$len,lsl#7 @ len to point at the end of inp
-#if __ARM_MAX_ARCH__>=7
+#else
+ adr r3,.Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#1
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV7_NEON
bne .LNEON
#endif
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
@@ -369,6 +411,9 @@ $code.=<<___;
___
&BODY_00_15(0x17);
$code.=<<___;
+#ifdef __thumb2__
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
@@ -453,6 +498,7 @@ $code.=<<___;
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
+.size sha512_block_data_order,.-sha512_block_data_order
___
{
@@ -559,11 +605,15 @@ $code.=<<___;
.arch armv7-a
.fpu neon
+.global sha512_block_data_order_neon
+.type sha512_block_data_order_neon,%function
.align 4
+sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
- vstmdb sp!,{d8-d15} @ ABI specification says so
- sub $Ktbl,r3,#672 @ K512
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+ adr $Ktbl,K512
+ VFP_ABI_PUSH
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
@@ -588,16 +638,16 @@ $code.=<<___;
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
- vldmia sp!,{d8-d15} @ epilogue
+ VFP_ABI_POP
ret @ bx lr
+.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
-.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___
@@ -605,5 +655,14 @@ ___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
print $code;
close STDOUT; # enforce flush
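
The loop appended near the end of sha512-armv4.pl (a '//' variant of the same loop appears in sha512-armv8.pl below) copies the script's own leading comment block into the generated file, rewriting '#' comments as the assembler's '@' comments, skipping the shebang and stopping at the first line that is neither a comment nor blank, which is how the copyright/licence block makes it into the generated assembly. A stand-alone sketch of the same idiom, using a lexical filehandle purely for illustration:

    # Stand-alone sketch: re-emit this script's leading '#' comments as
    # '@' assembler comments, skip the shebang, stop at the first code line.
    open my $self, '<', $0 or die "can't open $0: $!";
    while (<$self>) {
        next if /^#!/;                    # skip the shebang
        last if (!s/^#/@/ and !/^$/);     # stop at the first non-comment line
        print;                            # comment (or blank) line goes through
    }
    close $self;
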
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
index f7b36b986a61..ac84ebb52e4f 100755
--- a/crypto/sha/asm/sha512-armv8.pl
+++ b/crypto/sha/asm/sha512-armv8.pl
@@ -1,10 +1,18 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPLv2 terms is granted.
# ====================================================================
#
# SHA256/512 for ARMv8.
@@ -18,7 +26,9 @@
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
-#
+# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+# Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
@@ -26,12 +36,37 @@
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
-# generated with -mgeneral-regs-only is significanty faster
+# generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%.
-
-$flavour=shift;
-$output=shift;
-open STDOUT,">$output";
+#
+# October 2016.
+#
+# Originally it was reckoned that it makes no sense to implement a NEON
+# version of SHA256 for 64-bit processors. This is because the performance
+# improvement on the most widespread Cortex-A5x processors was observed
+# to be marginal, roughly the same on Cortex-A53 and ~10% on A57. But then
+# it was observed that 32-bit NEON SHA256 performs significantly better
+# than the 64-bit scalar version on *some* of the more recent processors.
+# As a result, a 64-bit NEON version of SHA256 was added to provide the
+# best all-round performance. For example, it executes ~30% faster on
+# X-Gene and Mongoose. [For reference, a NEON version of SHA512 is bound
+# to deliver much less improvement, likely *negative* on Cortex-A5x,
+# which is why NEON support is limited to SHA256.]
+
+$output=pop;
+$flavour=pop;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+} else {
+ open STDOUT,">$output";
+}
if ($output =~ /512/) {
$BITS=512;
@@ -68,7 +103,7 @@ my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);
$code.=<<___ if ($i<16);
-#ifndef __ARMEB__
+#ifndef __AARCH64EB__
rev @X[$i],@X[$i] // $i
#endif
___
@@ -151,24 +186,39 @@ ___
}
$code.=<<___;
-#include "arm_arch.h"
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#endif
.text
+.extern OPENSSL_armcap_P
.globl $func
.type $func,%function
.align 6
$func:
-___
-$code.=<<___ if ($SZ==4);
+#ifndef __KERNEL__
+# ifdef __ILP32__
+ ldrsw x16,.LOPENSSL_armcap_P
+# else
ldr x16,.LOPENSSL_armcap_P
+# endif
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
+___
+$code.=<<___ if ($SZ==4);
tst w16,#ARMV8_SHA256
b.ne .Lv8_entry
+ tst w16,#ARMV7_NEON
+ b.ne .Lneon_entry
+___
+$code.=<<___ if ($SZ==8);
+ tst w16,#ARMV8_SHA512
+ b.ne .Lv8_entry
___
$code.=<<___;
+#endif
stp x29,x30,[sp,#-128]!
add x29,sp,#0
@@ -184,7 +234,7 @@ $code.=<<___;
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
- adr $Ktbl,K$BITS
+ adr $Ktbl,.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
@@ -234,8 +284,8 @@ $code.=<<___;
.size $func,.-$func
.align 6
-.type K$BITS,%object
-K$BITS:
+.type .LK$BITS,%object
+.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
@@ -300,10 +350,16 @@ $code.=<<___ if ($SZ==4);
.long 0 //terminator
___
$code.=<<___;
-.size K$BITS,.-K$BITS
+.size .LK$BITS,.-.LK$BITS
+#ifndef __KERNEL__
.align 3
.LOPENSSL_armcap_P:
+# ifdef __ILP32__
+ .long OPENSSL_armcap_P-.
+# else
.quad OPENSSL_armcap_P-.
+# endif
+#endif
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
@@ -317,6 +373,7 @@ my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
+#ifndef __KERNEL__
.type sha256_block_armv8,%function
.align 6
sha256_block_armv8:
@@ -325,7 +382,7 @@ sha256_block_armv8:
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
- adr $Ktbl,K256
+ adr $Ktbl,.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
@@ -385,11 +442,406 @@ $code.=<<___;
ldr x29,[sp],#16
ret
.size sha256_block_armv8,.-sha256_block_armv8
+#endif
___
}
+if ($SZ==4) { ######################################### NEON stuff #
+# You'll surely note a lot of similarities with sha256-armv4 module,
+# and of course it's not a coincidence. sha256-armv4 was used as
+# initial template, but was adapted for ARMv8 instruction set and
+# extensively re-tuned for all-round performance.
+
+my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
+my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
+my $Ktbl="x16";
+my $Xfer="x17";
+my @X = map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
+my $j=0;
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
+sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
+sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ &sli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T4,$T7,$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T4,$T7,32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T5,$T7,$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T3,$T7,$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_u32 ($T3,$T7,32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T6,@X[0],$sigma1[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T7,@X[0],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T6,@X[0],32-$sigma1[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T5,@X[0],$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T7,$T7,$T6);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T5,@X[0],32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_32 ("{$T0}","[$Ktbl], #16");
+ eval(shift(@insns));
+ &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T5);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &mov (&Dhi($T5), &Dlo($T7));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 ($T0,$T0,@X[0]);
+ while($#insns>=1) { eval(shift(@insns)); }
+ &st1_32 ("{$T0}","[$Xfer], #16");
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_8 ("{@X[0]}","[$inp],#16");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_32 ("{$T0}","[$Ktbl],#16");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &rev32 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &st1_32 ("{$T0}","[$Xfer], #16");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
+ '&and ($t1,$f,$e)',
+ '&bic ($t4,$g,$e)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&orr ($t1,$t1,$t4)', # Ch(e,f,g)
+ '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ror ($t0,$t0,"#$Sigma1[0]")',
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t0)', # h+=Sigma1(e)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&ror ($t4,$t4,"#$Sigma0[0]")',
+ '&add ($d,$d,$h)', # d+=h
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#ifdef __KERNEL__
+.globl sha256_block_neon
+#endif
+.type sha256_block_neon,%function
+.align 4
+sha256_block_neon:
+.Lneon_entry:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ sub sp,sp,#16*4
+
+ adr $Ktbl,.LK256
+ add $num,$inp,$num,lsl#6 // len to point at the end of inp
+
+ ld1.8 {@X[0]},[$inp], #16
+ ld1.8 {@X[1]},[$inp], #16
+ ld1.8 {@X[2]},[$inp], #16
+ ld1.8 {@X[3]},[$inp], #16
+ ld1.32 {$T0},[$Ktbl], #16
+ ld1.32 {$T1},[$Ktbl], #16
+ ld1.32 {$T2},[$Ktbl], #16
+ ld1.32 {$T3},[$Ktbl], #16
+ rev32 @X[0],@X[0] // yes, even on
+ rev32 @X[1],@X[1] // big-endian
+ rev32 @X[2],@X[2]
+ rev32 @X[3],@X[3]
+ mov $Xfer,sp
+ add.32 $T0,$T0,@X[0]
+ add.32 $T1,$T1,@X[1]
+ add.32 $T2,$T2,@X[2]
+ st1.32 {$T0-$T1},[$Xfer], #32
+ add.32 $T3,$T3,@X[3]
+ st1.32 {$T2-$T3},[$Xfer]
+ sub $Xfer,$Xfer,#32
+
+ ldp $A,$B,[$ctx]
+ ldp $C,$D,[$ctx,#8]
+ ldp $E,$F,[$ctx,#16]
+ ldp $G,$H,[$ctx,#24]
+ ldr $t1,[sp,#0]
+ mov $t2,wzr
+ eor $t3,$B,$C
+ mov $t4,wzr
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ cmp $t1,#0 // check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
+ cmp $inp,$num
+ mov $Xfer, #64
+ csel $Xfer, $Xfer, xzr, eq
+ sub $inp,$inp,$Xfer // avoid SEGV
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ add $A,$A,$t4 // h+=Sigma0(a) from the past
+ ldp $t0,$t1,[$ctx,#0]
+ add $A,$A,$t2 // h+=Maj(a,b,c) from the past
+ ldp $t2,$t3,[$ctx,#8]
+ add $A,$A,$t0 // accumulate
+ add $B,$B,$t1
+ ldp $t0,$t1,[$ctx,#16]
+ add $C,$C,$t2
+ add $D,$D,$t3
+ ldp $t2,$t3,[$ctx,#24]
+ add $E,$E,$t0
+ add $F,$F,$t1
+ ldr $t1,[sp,#0]
+ stp $A,$B,[$ctx,#0]
+ add $G,$G,$t2
+ mov $t2,wzr
+ stp $C,$D,[$ctx,#8]
+ add $H,$H,$t3
+ stp $E,$F,[$ctx,#16]
+ eor $t3,$B,$C
+ stp $G,$H,[$ctx,#24]
+ mov $t4,wzr
+ mov $Xfer,sp
+ b.ne .L_00_48
+
+ ldr x29,[x29]
+ add sp,sp,#16*4+16
+ ret
+.size sha256_block_neon,.-sha256_block_neon
+___
+}
+
+if ($SZ==8) {
+my $Ktbl="x3";
+
+my @H = map("v$_.16b",(0..4));
+my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
+my @MSG=map("v$_.16b",(16..23));
+my ($W0,$W1)=("v24.2d","v25.2d");
+my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));
+
$code.=<<___;
+#ifndef __KERNEL__
+.type sha512_block_armv8,%function
+.align 6
+sha512_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input
+ ld1 {@MSG[4]-@MSG[7]},[$inp],#64
+
+ ld1.64 {@H[0]-@H[3]},[$ctx] // load context
+ adr $Ktbl,.LK512
+
+ rev64 @MSG[0],@MSG[0]
+ rev64 @MSG[1],@MSG[1]
+ rev64 @MSG[2],@MSG[2]
+ rev64 @MSG[3],@MSG[3]
+ rev64 @MSG[4],@MSG[4]
+ rev64 @MSG[5],@MSG[5]
+ rev64 @MSG[6],@MSG[6]
+ rev64 @MSG[7],@MSG[7]
+ b .Loop_hw
+
+.align 4
+.Loop_hw:
+ ld1.64 {$W0},[$Ktbl],#16
+ subs $num,$num,#1
+ sub x4,$inp,#128
+ orr $AB,@H[0],@H[0] // offload
+ orr $CD,@H[1],@H[1]
+ orr $EF,@H[2],@H[2]
+ orr $GH,@H[3],@H[3]
+ csel $inp,$inp,x4,ne // conditional rewind
+___
+for($i=0;$i<32;$i++) {
+$code.=<<___;
+ add.i64 $W0,$W0,@MSG[0]
+ ld1.64 {$W1},[$Ktbl],#16
+ ext $W0,$W0,$W0,#8
+ ext $fg,@H[2],@H[3],#8
+ ext $de,@H[1],@H[2],#8
+ add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
+ sha512su0 @MSG[0],@MSG[1]
+ ext $m9_10,@MSG[4],@MSG[5],#8
+ sha512h @H[3],$fg,$de
+ sha512su1 @MSG[0],@MSG[7],$m9_10
+ add.i64 @H[4],@H[1],@H[3] // "D + T1"
+ sha512h2 @H[3],$H[1],@H[0]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+ @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+}
+for(;$i<40;$i++) {
+$code.=<<___ if ($i<39);
+ ld1.64 {$W1},[$Ktbl],#16
+___
+$code.=<<___ if ($i==39);
+ sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind
+___
+$code.=<<___;
+ add.i64 $W0,$W0,@MSG[0]
+ ld1 {@MSG[0]},[$inp],#16 // load next input
+ ext $W0,$W0,$W0,#8
+ ext $fg,@H[2],@H[3],#8
+ ext $de,@H[1],@H[2],#8
+ add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
+ sha512h @H[3],$fg,$de
+ rev64 @MSG[0],@MSG[0]
+ add.i64 @H[4],@H[1],@H[3] // "D + T1"
+ sha512h2 @H[3],$H[1],@H[0]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+ @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+}
+$code.=<<___;
+ add.i64 @H[0],@H[0],$AB // accumulate
+ add.i64 @H[1],@H[1],$CD
+ add.i64 @H[2],@H[2],$EF
+ add.i64 @H[3],@H[3],$GH
+
+ cbnz $num,.Loop_hw
+
+ st1.64 {@H[0]-@H[3]},[$ctx] // store context
+
+ ldr x29,[sp],#16
+ ret
+.size sha512_block_armv8,.-sha512_block_armv8
+#endif
+___
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
.comm OPENSSL_armcap_P,4,4
+#endif
___
{ my %opcode = (
@@ -407,14 +859,43 @@ ___
}
}
+{ my %opcode = (
+ "sha512h" => 0xce608000, "sha512h2" => 0xce608400,
+ "sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 );
+
+ sub unsha512 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
foreach(split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/geo;
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or
+ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
- s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
+ s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
- s/\.\w?32\b//o and s/\.16b/\.4s/go;
- m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
+ s/\.[ui]?8(\s)/$1/;
+ s/\.\w?64\b// and s/\.16b/\.2d/g or
+ s/\.\w?32\b// and s/\.16b/\.4s/g;
+ m/\bext\b/ and s/\.2d/\.16b/g or
+ m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
print $_,"\n";
}
diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64xplus.pl
new file mode 100755
index 000000000000..9ebfc92e23ae
--- /dev/null
+++ b/crypto/sha/asm/sha512-c64xplus.pl
@@ -0,0 +1,438 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA512 for C64x+.
+#
+# January 2012
+#
+# Performance is 19 cycles per processed byte. Compared to block
+# transform function from sha512.c compiled with cl6x with -mv6400+
+# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
+# Loop unrolling won't make this implementation any faster, because
+# it's effectively dominated by SHRU||SHL pairs and you can't schedule
+# more of them.
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and, for their own
+# well-being, zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
+ $K512="A3";
+
+($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
+ $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
+($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
+ $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
+
+($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
+($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
+($T1hi, $T2hi)= ("A6","A7");
+($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
+($Khi,$Klo)=("A9","A8");
+($MAJhi,$MAJlo)=($T2hi,$T2lo);
+($t1hi,$t1lo)=($Khi,"B2");
+ $CTXB=$t1lo;
+
+($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer
+
+$code.=<<___;
+ .text
+
+ .if .ASSEMBLER_VERSION<7000000
+ .asg 0,__TI_EABI__
+ .endif
+ .if __TI_EABI__
+ .nocmp
+ .asg sha512_block_data_order,_sha512_block_data_order
+ .endif
+
+ .asg B3,RA
+ .asg A15,FP
+ .asg B15,SP
+
+ .if .BIG_ENDIAN
+ .asg $Khi,KHI
+ .asg $Klo,KLO
+ .else
+ .asg $Khi,KLO
+ .asg $Klo,KHI
+ .endif
+
+ .global _sha512_block_data_order
+_sha512_block_data_order:
+__sha512_block:
+ .asmfunc stack_usage(40+128)
+ MV $NUM,A0 ; reassign $NUM
+|| MVK -128,B0
+ [!A0] BNOP RA ; if ($NUM==0) return;
+|| [A0] STW FP,*SP--(40) ; save frame pointer
+|| [A0] MV SP,FP
+ [A0] STDW B13:B12,*SP[4]
+|| [A0] MVK 0x00404,B1
+ [A0] STDW B11:B10,*SP[3]
+|| [A0] STDW A13:A12,*FP[-3]
+|| [A0] MVKH 0x60000,B1
+ [A0] STDW A11:A10,*SP[1]
+|| [A0] MVC B1,AMR ; setup circular addressing
+|| [A0] ADD B0,SP,SP ; alloca(128)
+ .if __TI_EABI__
+ [A0] AND B0,SP,SP ; align stack at 128 bytes
+|| [A0] ADDKPC __sha512_block,B1
+|| [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512
+ [A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ .else
+ [A0] AND B0,SP,SP ; align stack at 128 bytes
+|| [A0] ADDKPC __sha512_block,B1
+|| [A0] MVKL (K512-__sha512_block),$K512
+ [A0] MVKH (K512-__sha512_block),$K512
+|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
+ .endif
+ ADDAW SP,3,$Xilo
+ ADDAW SP,2,$Xihi
+
+|| MV $CTXA,$CTXB
+ LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
+|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo
+|| ADD B1,$K512,$K512
+ LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi
+|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo
+ LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi
+|| LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo
+ LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi
+|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo
+ LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi
+|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo
+ LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi
+|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo
+ LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi
+|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo
+ LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi
+|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo
+
+ LDNDW *$INP++,B11:B10 ; pre-fetch input
+ LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
+outerloop?:
+ MVK 15,B0 ; loop counters
+|| MVK 64,B1
+|| SUB A0,1,A0
+ MV $Ahi,$Actxhi
+|| MV $Alo,$Actxlo
+|| MV $Bhi,$Bctxhi
+|| MV $Blo,$Bctxlo
+|| MV $Chi,$Cctxhi
+|| MV $Clo,$Cctxlo
+|| MVD $Dhi,$Dctxhi
+|| MVD $Dlo,$Dctxlo
+ MV $Ehi,$Ectxhi
+|| MV $Elo,$Ectxlo
+|| MV $Fhi,$Fctxhi
+|| MV $Flo,$Fctxlo
+|| MV $Ghi,$Gctxhi
+|| MV $Glo,$Gctxlo
+|| MVD $Hhi,$Hctxhi
+|| MVD $Hlo,$Hctxlo
+loop0_15?:
+ .if .BIG_ENDIAN
+ MV B11,$T1hi
+|| MV B10,$T1lo
+ .else
+ SWAP4 B10,$T1hi
+|| SWAP4 B11,$T1lo
+ SWAP2 $T1hi,$T1hi
+|| SWAP2 $T1lo,$T1lo
+ .endif
+loop16_79?:
+ STW $T1hi,*$Xihi++[2]
+|| STW $T1lo,*$Xilo++[2] ; X[i] = T1
+|| ADD $Hhi,$T1hi,$T1hi
+|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
+|| SHRU $Ehi,14,$S1hi
+|| SHL $Ehi,32-14,$S1lo
+ XOR $Fhi,$Ghi,$CHhi
+|| XOR $Flo,$Glo,$CHlo
+|| ADD KHI,$T1hi,$T1hi
+|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i]
+|| SHRU $Elo,14,$t0lo
+|| SHL $Elo,32-14,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| AND $Ehi,$CHhi,$CHhi
+|| AND $Elo,$CHlo,$CHlo
+|| ROTL $Ghi,0,$Hhi
+|| ROTL $Glo,0,$Hlo ; h = g
+|| SHRU $Ehi,18,$t0hi
+|| SHL $Ehi,32-18,$t0lo
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| XOR $Ghi,$CHhi,$CHhi
+|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g
+|| ROTL $Fhi,0,$Ghi
+|| ROTL $Flo,0,$Glo ; g = f
+|| SHRU $Elo,18,$t0lo
+|| SHL $Elo,32-18,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| OR $Ahi,$Bhi,$MAJhi
+|| OR $Alo,$Blo,$MAJlo
+|| ROTL $Ehi,0,$Fhi
+|| ROTL $Elo,0,$Flo ; f = e
+|| SHRU $Ehi,41-32,$t0lo
+|| SHL $Ehi,64-41,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| AND $Chi,$MAJhi,$MAJhi
+|| AND $Clo,$MAJlo,$MAJlo
+|| ROTL $Dhi,0,$Ehi
+|| ROTL $Dlo,0,$Elo ; e = d
+|| SHRU $Elo,41-32,$t0hi
+|| SHL $Elo,64-41,$t0lo
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e)
+|| AND $Ahi,$Bhi,$t1hi
+|| AND $Alo,$Blo,$t1lo
+|| ROTL $Chi,0,$Dhi
+|| ROTL $Clo,0,$Dlo ; d = c
+|| SHRU $Ahi,28,$S0hi
+|| SHL $Ahi,32-28,$S0lo
+ OR $t1hi,$MAJhi,$MAJhi
+|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+|| ADD $CHhi,$T1hi,$T1hi
+|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g)
+|| ROTL $Bhi,0,$Chi
+|| ROTL $Blo,0,$Clo ; c = b
+|| SHRU $Alo,28,$t0lo
+|| SHL $Alo,32-28,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $S1hi,$T1hi,$T1hi
+|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e)
+|| ROTL $Ahi,0,$Bhi
+|| ROTL $Alo,0,$Blo ; b = a
+|| SHRU $Ahi,34-32,$t0lo
+|| SHL $Ahi,64-34,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $MAJhi,$T1hi,$T2hi
+|| ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c)
+|| SHRU $Alo,34-32,$t0hi
+|| SHL $Alo,64-34,$t0lo
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $Ehi,$T1hi,$T1hi
+|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e
+|| [B0] BNOP loop0_15?
+|| SHRU $Ahi,39-32,$t0lo
+|| SHL $Ahi,64-39,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input
+||[!B1] BNOP break?
+|| SHRU $Alo,39-32,$t0hi
+|| SHL $Alo,64-39,$t0lo
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a)
+|| ADD $T1carry,$T1hi,$Ehi
+|| MV $T1lo,$Elo ; e = T1
+||[!B0] LDW *${Xihi}[28],$T1hi
+||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14]
+ ADD $S0hi,$T2hi,$T2hi
+|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a)
+|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i]
+ NOP ; avoid cross-path stall
+ ADD $T2carry,$T2hi,$Ahi
+|| MV $T2lo,$Alo ; a = T2
+|| [B0] SUB B0,1,B0
+;;===== branch to loop0_15? is taken here
+ NOP
+;;===== branch to break? is taken here
+ LDW *${Xihi}[2],$T2hi
+|| LDW *${Xilo}[2],$T2lo ; X[i+1]
+|| SHRU $T1hi,19,$S1hi
+|| SHL $T1hi,32-19,$S1lo
+ SHRU $T1lo,19,$t0lo
+|| SHL $T1lo,32-19,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| SHRU $T1hi,61-32,$t0lo
+|| SHL $T1hi,64-61,$t0hi
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| SHRU $T1lo,61-32,$t0hi
+|| SHL $T1lo,64-61,$t0lo
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| SHRU $T1hi,6,$t0hi
+|| SHL $T1hi,32-6,$t0lo
+ XOR $t0hi,$S1hi,$S1hi
+|| XOR $t0lo,$S1lo,$S1lo
+|| SHRU $T1lo,6,$t0lo
+|| LDW *${Xihi}[18],$T1hi
+|| LDW *${Xilo}[18],$T1lo ; X[i+9]
+ XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14])
+
+|| LDW *${Xihi}[0],$CHhi
+|| LDW *${Xilo}[0],$CHlo ; X[i]
+|| SHRU $T2hi,1,$S0hi
+|| SHL $T2hi,32-1,$S0lo
+ SHRU $T2lo,1,$t0lo
+|| SHL $T2lo,32-1,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| SHRU $T2hi,8,$t0hi
+|| SHL $T2hi,32-8,$t0lo
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| SHRU $T2lo,8,$t0lo
+|| SHL $T2lo,32-8,$t0hi
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $S1hi,$T1hi,$T1hi
+|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1()
+|| [B1] BNOP loop16_79?
+|| SHRU $T2hi,7,$t0hi
+|| SHL $T2hi,32-7,$t0lo
+ XOR $t0hi,$S0hi,$S0hi
+|| XOR $t0lo,$S0lo,$S0lo
+|| ADD $CHhi,$T1hi,$T1hi
+|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i]
+|| SHRU $T2lo,7,$t0lo
+ XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1])
+
+ ADD $S0hi,$T1hi,$T1hi
+|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0()
+|| [B1] SUB B1,1,B1
+ NOP ; avoid cross-path stall
+ ADD $T1carry,$T1hi,$T1hi
+;;===== branch to loop16_79? is taken here
+
+break?:
+ ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx
+|| ADDU $Alo,$Actxlo,$Actxlo:$Alo
+|| [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input
+|| [A0] ADDK -640,$K512 ; rewind pointer to K512
+ ADD $Bhi,$Bctxhi,$Bhi
+|| ADDU $Blo,$Bctxlo,$Bctxlo:$Blo
+|| [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
+ ADD $Chi,$Cctxhi,$Chi
+|| ADDU $Clo,$Cctxlo,$Cctxlo:$Clo
+|| ADD $Actxlo,$Ahi,$Ahi
+||[!A0] MV $CTXA,$CTXB
+ ADD $Dhi,$Dctxhi,$Dhi
+|| ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo
+|| ADD $Bctxlo,$Bhi,$Bhi
+||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx
+||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN]
+ ADD $Ehi,$Ectxhi,$Ehi
+|| ADDU $Elo,$Ectxlo,$Ectxlo:$Elo
+|| ADD $Cctxlo,$Chi,$Chi
+|| [A0] BNOP outerloop?
+||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
+||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN]
+ ADD $Fhi,$Fctxhi,$Fhi
+|| ADDU $Flo,$Fctxlo,$Fctxlo:$Flo
+|| ADD $Dctxlo,$Dhi,$Dhi
+||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN]
+||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN]
+ ADD $Ghi,$Gctxhi,$Ghi
+|| ADDU $Glo,$Gctxlo,$Gctxlo:$Glo
+|| ADD $Ectxlo,$Ehi,$Ehi
+||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
+||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
+ ADD $Hhi,$Hctxhi,$Hhi
+|| ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo
+|| ADD $Fctxlo,$Fhi,$Fhi
+||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
+||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN]
+ ADD $Gctxlo,$Ghi,$Ghi
+||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
+||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN]
+ ADD $Hctxlo,$Hhi,$Hhi
+||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
+||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN]
+;;===== branch to outerloop? is taken here
+
+ STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
+|| STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
+|| MVK -40,B0
+ ADD FP,B0,SP ; destroy circular buffer
+|| LDDW *FP[-4],A11:A10
+ LDDW *SP[2],A13:A12
+|| LDDW *FP[-2],B11:B10
+ LDDW *SP[4],B13:B12
+|| BNOP RA
+ LDW *++SP(40),FP ; restore frame pointer
+ MVK 0,B0
+ MVC B0,AMR ; clear AMR
+ NOP 2 ; wait till FP is committed
+ .endasmfunc
+
+ .if __TI_EABI__
+ .sect ".text:sha_asm.const"
+ .else
+ .sect ".const:sha_asm"
+ .endif
+ .align 128
+K512:
+ .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+ .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+ .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+ .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+ .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+ .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+ .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+ .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+ .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+ .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+ .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+ .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+ .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+ .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+ .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+ .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+ .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+ .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+ .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+ .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+ .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+ .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+ .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+ .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+ .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+ .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+ .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+ .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+ .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+ .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+ .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+ .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+ .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+ .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+ .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+ .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+ .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+ .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+ .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+ .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+ .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 4
+___
+
+print $code;
+close STDOUT;
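
The instruction bundles above implement the standard FIPS 180-4 SHA-512 round, spread across 32-bit hi/lo register pairs because C64x+ has no 64-bit general-purpose registers: Ch(e,f,g) = ((f^g)&e)^g and Maj(a,b,c) = ((a|b)&c)|(a&b) as named in the comments, Sigma1/Sigma0 assembled from the 14/18/41 and 28/34/39 rotations, and sigma1/sigma0 (19/61/6 and 1/8/7) for the message schedule. As a plain-C reference for what one iteration of that loop computes, here is a scalar sketch; the names are illustrative and this is not code from the module:

    #include <stdint.h>

    static uint64_t ror64(uint64_t v, int n)        /* n in 1..63 */
    {
        return (v >> n) | (v << (64 - n));
    }

    #define Sigma0(a)    (ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39))
    #define Sigma1(e)    (ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41))
    #define sigma0(x)    (ror64(x, 1)  ^ ror64(x, 8)  ^ ((x) >> 7))
    #define sigma1(x)    (ror64(x, 19) ^ ror64(x, 61) ^ ((x) >> 6))
    #define Ch(e, f, g)  ((((f) ^ (g)) & (e)) ^ (g))
    #define Maj(a, b, c) ((((a) | (b)) & (c)) | ((a) & (b)))

    /* One round of the 80-round loop: S[0..7] holds a..h, X[] is the
     * 16-entry sliding message window, K512[] the round constants above. */
    static void sha512_round(uint64_t S[8], uint64_t X[16],
                             const uint64_t K512[80], int i)
    {
        uint64_t T1, T2;

        if (i >= 16)                    /* schedule: next X[i] from the window */
            X[i % 16] += sigma1(X[(i + 14) % 16]) + X[(i + 9) % 16]
                       + sigma0(X[(i + 1) % 16]);

        T1 = S[7] + Sigma1(S[4]) + Ch(S[4], S[5], S[6]) + K512[i] + X[i % 16];
        T2 = Sigma0(S[0]) + Maj(S[0], S[1], S[2]);

        S[7] = S[6];                    /* h = g */
        S[6] = S[5];                    /* g = f */
        S[5] = S[4];                    /* f = e */
        S[4] = S[3] + T1;               /* e = d + T1 */
        S[3] = S[2];                    /* d = c */
        S[2] = S[1];                    /* c = b */
        S[1] = S[0];                    /* b = a */
        S[0] = T1 + T2;                 /* a = T1 + T2 */
    }

The assembly performs the same arithmetic on 32-bit halves, propagating carries between the lo and hi words through ADDU's carry output (the T1carry/T2carry pairs seen above).
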
diff --git a/crypto/sha/asm/sha512-ia64.pl b/crypto/sha/asm/sha512-ia64.pl
index 59f889a09594..356a46aced78 100755
--- a/crypto/sha/asm/sha512-ia64.pl
+++ b/crypto/sha/asm/sha512-ia64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -68,7 +75,7 @@
# To generate code, pass the file name with either 256 or 512 in its
# name and compiler flags.
-$output=shift;
+$output=pop;
if ($output =~ /512.*\.[s|asm]/) {
$SZ=8;
diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl
index b468cfb4569e..dab684dde5bc 100755
--- a/crypto/sha/asm/sha512-mips.pl
+++ b/crypto/sha/asm/sha512-mips.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -52,15 +59,17 @@
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
- $PTR_ADD="dadd"; # incidentally works even on n32
- $PTR_SUB="dsub"; # incidentally works even on n32
+ $PTR_LA="dla";
+ $PTR_ADD="daddu"; # incidentally works even on n32
+ $PTR_SUB="dsubu"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
- $PTR_ADD="add";
- $PTR_SUB="sub";
+ $PTR_LA="la";
+ $PTR_ADD="addu";
+ $PTR_SUB="subu";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
@@ -72,9 +81,9 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
######################################################################
-$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});
+$big_endian=(`echo MIPSEB | $ENV{CC} -E -`=~/MIPSEB/)?0:1 if ($ENV{CC});
-for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
+for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
@@ -126,8 +135,12 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
$code.=<<___ if ($i<15);
+#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
+ ${LD} @X[1],`($i+1)*$SZ`($inp)
+#else
${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
+#endif
___
$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
@@ -286,16 +299,10 @@ ___
}
$FRAMESIZE=16*$SZ+16*$SZREG;
-$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000";
$code.=<<___;
-#ifdef OPENSSL_FIPSCANISTER
-# include <openssl/fipssyms.h>
-#endif
-
-#if defined(__mips_smartmips) && !defined(_MIPS_ARCH_MIPS32R2)
-#define _MIPS_ARCH_MIPS32R2
-#endif
+#include "mips_arch.h"
.text
.set noat
@@ -343,7 +350,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
___
$code.=<<___;
.set reorder
- la $Ktbl,K${label} # PIC-ified 'load address'
+ $PTR_LA $Ktbl,K${label} # PIC-ified 'load address'
$LD $A,0*$SZ($ctx) # load context
$LD $B,1*$SZ($ctx)
@@ -360,8 +367,12 @@ $code.=<<___;
.align 5
.Loop:
+#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
+ ${LD} @X[0],($inp)
+#else
${LD}l @X[0],$MSB($inp)
${LD}r @X[0],$LSB($inp)
+#endif
___
for ($i=0;$i<16;$i++)
{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
diff --git a/crypto/sha/asm/sha512-parisc.pl b/crypto/sha/asm/sha512-parisc.pl
index 6cad72e25573..59eb320ab6ed 100755
--- a/crypto/sha/asm/sha512-parisc.pl
+++ b/crypto/sha/asm/sha512-parisc.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -361,7 +368,7 @@ L\$parisc1
___
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
- $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
+ $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
@@ -412,7 +419,7 @@ $code.=<<___;
add $t0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[0],$t0
addc $t1,$hhi,$hhi ; h += Sigma1(e)
- shd $alo,$ahi,$Sigma0[0],$t1
+ shd $alo,$ahi,$Sigma0[0],$t1
add $a0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[1],$t2
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
@@ -760,13 +767,18 @@ sub assemble {
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler/) {
+ $gnuas = 1;
+}
+
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
: sprintf("shd\t%$1,%$2,%d",$3)/e or
- # translate made up instructons: _ror, _shr, _align, _shl
+ # translate made up instructions: _ror, _shr, _align, _shl
s/_ror(\s+)(%r[0-9]+),/
($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
@@ -783,9 +795,11 @@ foreach (split("\n",$code)) {
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
- s/cmpb,\*/comb,/ if ($SIZE_T==4);
-
- s/\bbv\b/bve/ if ($SIZE_T==8);
+ s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
+ s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
+ s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
+ s/cmpb,\*/comb,/ if ($SIZE_T==4);
+ s/\bbv\b/bve/ if ($SIZE_T==8);
print $_,"\n";
}
diff --git a/crypto/sha/asm/sha512-ppc.pl b/crypto/sha/asm/sha512-ppc.pl
index 17fdc6e8e5a9..71699f663706 100755
--- a/crypto/sha/asm/sha512-ppc.pl
+++ b/crypto/sha/asm/sha512-ppc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -19,7 +26,7 @@
#
# (*) 64-bit code in 32-bit application context, which actually is
# on TODO list. It should be noted that for safe deployment in
-# 32-bit *mutli-threaded* context asyncronous signals should be
+# 32-bit *multi-threaded* context asynchronous signals should be
# blocked upon entry to SHA512 block routine. This is because
# 32-bit signaling procedure invalidates upper halves of GPRs.
# Context switch procedure preserves them, but not signaling:-(
diff --git a/crypto/sha/asm/sha512-s390x.pl b/crypto/sha/asm/sha512-s390x.pl
index 9c10e4e9ee74..4c0f4e79315b 100755
--- a/crypto/sha/asm/sha512-s390x.pl
+++ b/crypto/sha/asm/sha512-s390x.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -33,7 +40,7 @@
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
-# remains z/Architecture specific. On z900 SHA256 was measured to
+# remains z/Architecture specific. On z990 SHA256 was measured to
# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
$flavour = shift;
@@ -64,7 +71,7 @@ $tbl="%r13";
$T1="%r14";
$sp="%r15";
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($output =~ /512/) {
@@ -163,6 +170,8 @@ ___
}
$code.=<<___;
+#include "s390x_arch.h"
+
.text
.align 64
.type $Table,\@object
@@ -237,10 +246,7 @@ $Func:
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
- lg %r0,0(%r1)
- tmhl %r0,0x4000 # check for message-security assist
- jz .Lsoftware
- lg %r0,16(%r1) # check kimd capabilities
+ lg %r0,S390X_KIMD(%r1) # check kimd capabilities
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
@@ -304,11 +310,10 @@ $code.=<<___;
cl${g} $inp,`$frame+4*$SIZE_T`($sp)
jne .Lloop
- lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
+ lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size $Func,.-$Func
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm OPENSSL_s390xcap_P,80,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/sha/asm/sha512-sparcv9.pl b/crypto/sha/asm/sha512-sparcv9.pl
index 5a9c15d1d34a..4432bda65ab5 100755
--- a/crypto/sha/asm/sha512-sparcv9.pl
+++ b/crypto/sha/asm/sha512-sparcv9.pl
@@ -1,12 +1,19 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
-# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
+# Hardware SPARC T4 support by David S. Miller
# ====================================================================
# SHA256 performance improvement over compiler generated code varies
@@ -49,7 +56,7 @@
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.
-$output=shift;
+$output=pop;
open STDOUT,">$output";
if ($output =~ /512/) {
@@ -95,7 +102,7 @@ if ($output =~ /512/) {
$locals=0; # X[16] is register resident
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
-
+
$A="%l0";
$B="%l1";
$C="%l2";
@@ -247,7 +254,7 @@ $code.=<<___;
$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $a,@Sigma0[2],$tmp0
- xor $tmp1,$h,$h
+ xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
xor $tmp0,$h,$h
xor $tmp1,$h,$h ! Sigma0(a)
@@ -791,7 +798,7 @@ ___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
-# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
+# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index 78e445f3fe4a..f2ebdfdb68b6 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -34,7 +41,7 @@
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
-# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above],
+# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
@@ -86,12 +93,16 @@
# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
+# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
+# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%)
# VIA Nano 23.0 16.5(+39%) - 14.7 -
# Atom 23.0 18.9(+22%) - 14.7 -
# Silvermont 27.4 20.6(+33%) - 17.5 -
+# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%)
+# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 -
#
-# (*) whichever best applicable;
+# (*) whichever best applicable, including SHAEXT;
# (**) switch from ror to shrd stands for fair share of improvement;
# (***) execution time is fully determined by remaining integer-only
# part, body_00_15; reducing the amount of SIMD instructions
@@ -131,7 +142,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
$shaext=1; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
if ($output =~ /512/) {
@@ -167,7 +178,7 @@ $Tbl="%rbp";
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
-$_rsp="16*$SZ+3*8(%rsp)";
+$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";
@@ -260,6 +271,7 @@ $code=<<___;
.type $func,\@function,3
.align 16
$func:
+.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
lea OPENSSL_ia32cap_P(%rip),%r11
@@ -292,13 +304,20 @@ $code.=<<___ if ($SZ==4);
jnz .Lssse3_shortcut
___
$code.=<<___;
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
shl \$4,%rdx # num*16
sub \$$framesz,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -306,7 +325,8 @@ $code.=<<___;
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
.Lprologue:
mov $SZ*0($ctx),$A
@@ -373,15 +393,24 @@ $code.=<<___;
jb .Lloop
mov $_rsp,%rsi
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+.cfi_def_cfa %rsi,8
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue:
ret
+.cfi_endproc
.size $func,.-$func
___
@@ -751,14 +780,22 @@ $code.=<<___;
.type ${func}_ssse3,\@function,3
.align 64
${func}_ssse3:
+.cfi_startproc
.Lssse3_shortcut:
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*4`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -766,7 +803,8 @@ ${func}_ssse3:
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1065,6 +1103,7 @@ $code.=<<___;
jb .Lloop_ssse3
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
___
$code.=<<___ if ($win64);
movaps 16*$SZ+32(%rsp),%xmm6
@@ -1073,15 +1112,23 @@ $code.=<<___ if ($win64);
movaps 16*$SZ+80(%rsp),%xmm9
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_ssse3:
ret
+.cfi_endproc
.size ${func}_ssse3,.-${func}_ssse3
___
}
@@ -1095,14 +1142,22 @@ $code.=<<___;
.type ${func}_xop,\@function,3
.align 64
${func}_xop:
+.cfi_startproc
.Lxop_shortcut:
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -1110,7 +1165,8 @@ ${func}_xop:
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1437,6 +1493,7 @@ $code.=<<___;
jb .Lloop_xop
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vzeroupper
___
$code.=<<___ if ($win64);
@@ -1450,15 +1507,23 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_xop:
ret
+.cfi_endproc
.size ${func}_xop,.-${func}_xop
___
}
@@ -1471,14 +1536,22 @@ $code.=<<___;
.type ${func}_avx,\@function,3
.align 64
${func}_avx:
+.cfi_startproc
.Lavx_shortcut:
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -1486,7 +1559,8 @@ ${func}_avx:
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1745,6 +1819,7 @@ $code.=<<___;
jb .Lloop_avx
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vzeroupper
___
$code.=<<___ if ($win64);
@@ -1758,15 +1833,23 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
+.cfi_endproc
.size ${func}_avx,.-${func}_avx
___
@@ -1774,7 +1857,7 @@ if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
-my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
@@ -1822,14 +1905,22 @@ $code.=<<___;
.type ${func}_avx2,\@function,3
.align 64
${func}_avx2:
+.cfi_startproc
.Lavx2_shortcut:
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
shl \$4,%rdx # num*16
and \$-256*$SZ,%rsp # align stack frame
@@ -1838,7 +1929,8 @@ ${func}_avx2:
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -2119,6 +2211,7 @@ $code.=<<___;
.Ldone_avx2:
lea ($Tbl),%rsp
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vzeroupper
___
$code.=<<___ if ($win64);
@@ -2132,15 +2225,23 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
+.cfi_endproc
.size ${func}_avx2,.-${func}_avx2
___
}}
@@ -2200,7 +2301,6 @@ ___
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
- lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
index 47189502c6cc..2792800b475c 100755
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -18,11 +25,20 @@
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is degree of computational resources' utilization. POWER8 is
# "massively multi-threaded chip" and difference between single- and
-# maximum multi-process benchmark results tells that utlization is
+# maximum multi-process benchmark results tells that utilization is
# whooping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
# to single-process one, given that all threads end up on the same
# physical core.
+#
+######################################################################
+# Believed-to-be-accurate results in cycles per processed byte [on
+# little-endian system]. Numbers in square brackets are for 64-bit
+# build of sha512-ppc.pl, presented for reference.
+#
+# POWER8 POWER9
+# SHA256 9.7 [15.8] 11.2 [12.5]
+# SHA512 6.1 [10.3] 7.0 [7.9]
$flavour=shift;
$output =shift;
@@ -63,7 +79,8 @@ if ($output =~ /512/) {
}
$func="sha${bits}_block_p8";
-$FRAME=8*$SIZE_T;
+$LOCALS=8*$SIZE_T+8*16;
+$FRAME=$LOCALS+9*16+6*$SIZE_T;
$sp ="r1";
$toc="r2";
@@ -75,16 +92,16 @@ $idx="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
-($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
- $x00=0 if ($flavour =~ /osx/);
+@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0,map("r$_",(10,26..31)));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
-@X=map("v$_",(8..23));
-($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+@X=map("v$_",(8..19,24..27));
+($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;
+my $k=($i+2)%8;
$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
lvx_u @X[$i+1],0,$inp ; load X[i] in advance
@@ -96,26 +113,30 @@ ___
$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
vperm @X[$i],@X[$i],@X[$i],$lemask
___
+$code.=<<___ if ($i>=15);
+ vshasigma${sz} $Sigma,@X[($j+1)%16],0,0
+ vaddu${sz}m @X[$j],@X[$j],$Sigma
+ vshasigma${sz} $Sigma,@X[($j+14)%16],0,15
+ vaddu${sz}m @X[$j],@X[$j],$Sigma
+ vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]
+___
$code.=<<___;
- `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
- vsel $Func,$g,$f,$e ; Ch(e,f,g)
- vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
- vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
- `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
+ vsel $Func,$g,$f,$e ; Ch(e,f,g)
+ vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
+ vshasigma${sz} $Sigma,$e,1,15 ; Sigma1(e)
+ vaddu${sz}m $h,$h,$Sigma ; h+=Sigma1(e)
vxor $Func,$a,$b
- `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
- vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
vsel $Func,$b,$c,$Func ; Maj(a,b,c)
- vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $d,$d,$h ; d+=h
- vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
- `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
- lvx $Ki,$idx,$Tbl ; load next K[i]
- addi $idx,$idx,16
- vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
- `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
+ vshasigma${sz} $Sigma,$a,1,0 ; Sigma0(a)
+ vaddu${sz}m $Sigma,$Sigma,$Func ; Sigma0(a)+Maj(a,b,c)
+ vaddu${sz}m $h,$h,$Sigma ; h+=Sigma0(a)+Maj(a,b,c)
+ lvx $Ki,@I[$k],$idx ; load next K[i]
+___
+$code.=<<___ if ($k == 7);
+ addi $idx,$idx,0x80
___
}
@@ -126,21 +147,13 @@ $code=<<___;
.globl $func
.align 6
$func:
- $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ $STU $sp,-$FRAME($sp)
mflr $lrsave
- li r10,`$FRAME+8*16+15`
- li r11,`$FRAME+8*16+31`
- stvx v20,r10,$sp # ABI says so
+ li r10,`$LOCALS+15`
+ li r11,`$LOCALS+31`
+ stvx v24,r10,$sp # ABI says so
addi r10,r10,32
mfspr $vrsave,256
- stvx v21,r11,$sp
- addi r11,r11,32
- stvx v22,r10,$sp
- addi r10,r10,32
- stvx v23,r11,$sp
- addi r11,r11,32
- stvx v24,r10,$sp
- addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
@@ -153,26 +166,26 @@ $func:
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
- li r11,-1
- stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li r11,-4096+255
+ stw $vrsave,`$FRAME+6*$SIZE_T-4`($sp) # save vrsave
li $x10,0x10
- $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $PUSH r26,`$FRAME-6*$SIZE_T`($sp)
li $x20,0x20
- $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $PUSH r27,`$FRAME-5*$SIZE_T`($sp)
li $x30,0x30
- $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $PUSH r28,`$FRAME-4*$SIZE_T`($sp)
li $x40,0x40
- $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $PUSH r29,`$FRAME-3*$SIZE_T`($sp)
li $x50,0x50
- $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $PUSH r30,`$FRAME-2*$SIZE_T`($sp)
li $x60,0x60
- $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ $PUSH r31,`$FRAME-1*$SIZE_T`($sp)
li $x70,0x70
- $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+ $PUSH $lrsave,`$FRAME+$LRSAVE`($sp)
mtspr 256,r11
bl LPICmeup
- addi $offload,$sp,$FRAME+15
+ addi $offload,$sp,`8*$SIZE_T+15`
___
$code.=<<___ if ($LENDIAN);
li $idx,8
@@ -206,9 +219,9 @@ $code.=<<___;
.align 5
Loop:
lvx $Ki,$x00,$Tbl
- li $idx,16
lvx_u @X[0],0,$inp
addi $inp,$inp,16
+ mr $idx,$Tbl # copy $Tbl
stvx $A,$x00,$offload # offload $A-$H
stvx $B,$x10,$offload
stvx $C,$x20,$offload
@@ -218,8 +231,7 @@ Loop:
stvx $G,$x60,$offload
stvx $H,$x70,$offload
vaddu${sz}m $H,$H,$Ki # h+K[i]
- lvx $Ki,$idx,$Tbl
- addi $idx,$idx,16
+ lvx $Ki,$x10,$Tbl
___
for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
@@ -252,10 +264,9 @@ $code.=<<___;
bne Loop
___
$code.=<<___ if ($SZ==4);
- lvx @X[0],$idx,$Tbl
- addi $idx,$idx,16
+ lvx @X[0],$x20,$idx
vperm $A,$A,$B,$Ki # pack the answer
- lvx @X[1],$idx,$Tbl
+ lvx @X[1],$x30,$idx
vperm $E,$E,$F,$Ki
vperm $A,$A,$C,@X[0]
vperm $E,$E,$G,@X[0]
@@ -275,19 +286,11 @@ $code.=<<___ if ($SZ==8);
stvx_u $G,$x30,$ctx
___
$code.=<<___;
- li r10,`$FRAME+8*16+15`
+ li r10,`$LOCALS+15`
mtlr $lrsave
- li r11,`$FRAME+8*16+31`
+ li r11,`$LOCALS+31`
mtspr 256,$vrsave
- lvx v20,r10,$sp # ABI says so
- addi r10,r10,32
- lvx v21,r11,$sp
- addi r11,r11,32
- lvx v22,r10,$sp
- addi r10,r10,32
- lvx v23,r11,$sp
- addi r11,r11,32
- lvx v24,r10,$sp
+ lvx v24,r10,$sp # ABI says so
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
@@ -301,13 +304,13 @@ $code.=<<___;
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
- $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
- $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
- $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
- $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
- $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
- $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
- addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ $POP r26,`$FRAME-6*$SIZE_T`($sp)
+ $POP r27,`$FRAME-5*$SIZE_T`($sp)
+ $POP r28,`$FRAME-4*$SIZE_T`($sp)
+ $POP r29,`$FRAME-3*$SIZE_T`($sp)
+ $POP r30,`$FRAME-2*$SIZE_T`($sp)
+ $POP r31,`$FRAME-1*$SIZE_T`($sp)
+ addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,6,3,0
diff --git a/crypto/sha/build.info b/crypto/sha/build.info
new file mode 100644
index 000000000000..5dd5a9941d34
--- /dev/null
+++ b/crypto/sha/build.info
@@ -0,0 +1,89 @@
+LIBS=../../libcrypto
+SOURCE[../../libcrypto]=\
+ sha1dgst.c sha1_one.c sha256.c sha512.c {- $target{sha1_asm_src} -} \
+ {- $target{keccak1600_asm_src} -}
+
+GENERATE[sha1-586.s]=asm/sha1-586.pl \
+ $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
+DEPEND[sha1-586.s]=../perlasm/x86asm.pl
+GENERATE[sha256-586.s]=asm/sha256-586.pl \
+ $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
+DEPEND[sha256-586.s]=../perlasm/x86asm.pl
+GENERATE[sha512-586.s]=asm/sha512-586.pl \
+ $(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
+DEPEND[sha512-586.s]=../perlasm/x86asm.pl
+
+GENERATE[sha1-ia64.s]=asm/sha1-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS)
+GENERATE[sha256-ia64.s]=asm/sha512-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS)
+GENERATE[sha512-ia64.s]=asm/sha512-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS)
+
+GENERATE[sha1-alpha.S]=asm/sha1-alpha.pl $(PERLASM_SCHEME)
+
+GENERATE[sha1-x86_64.s]=asm/sha1-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[sha1-mb-x86_64.s]=asm/sha1-mb-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[sha256-x86_64.s]=asm/sha512-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[sha256-mb-x86_64.s]=asm/sha256-mb-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[sha512-x86_64.s]=asm/sha512-x86_64.pl $(PERLASM_SCHEME)
+GENERATE[keccak1600-x86_64.s]=asm/keccak1600-x86_64.pl $(PERLASM_SCHEME)
+
+GENERATE[sha1-sparcv9.S]=asm/sha1-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[sha1-sparcv9.o]=..
+GENERATE[sha256-sparcv9.S]=asm/sha512-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[sha256-sparcv9.o]=..
+GENERATE[sha512-sparcv9.S]=asm/sha512-sparcv9.pl $(PERLASM_SCHEME)
+INCLUDE[sha512-sparcv9.o]=..
+
+GENERATE[sha1-ppc.s]=asm/sha1-ppc.pl $(PERLASM_SCHEME)
+GENERATE[sha256-ppc.s]=asm/sha512-ppc.pl $(PERLASM_SCHEME)
+GENERATE[sha512-ppc.s]=asm/sha512-ppc.pl $(PERLASM_SCHEME)
+GENERATE[sha256p8-ppc.s]=asm/sha512p8-ppc.pl $(PERLASM_SCHEME)
+GENERATE[sha512p8-ppc.s]=asm/sha512p8-ppc.pl $(PERLASM_SCHEME)
+GENERATE[keccak1600-ppc64.s]=asm/keccak1600-ppc64.pl $(PERLASM_SCHEME)
+
+GENERATE[sha1-parisc.s]=asm/sha1-parisc.pl $(PERLASM_SCHEME)
+GENERATE[sha256-parisc.s]=asm/sha512-parisc.pl $(PERLASM_SCHEME)
+GENERATE[sha512-parisc.s]=asm/sha512-parisc.pl $(PERLASM_SCHEME)
+
+GENERATE[sha1-mips.S]=asm/sha1-mips.pl $(PERLASM_SCHEME)
+INCLUDE[sha1-mips.o]=..
+GENERATE[sha256-mips.S]=asm/sha512-mips.pl $(PERLASM_SCHEME)
+INCLUDE[sha256-mips.o]=..
+GENERATE[sha512-mips.S]=asm/sha512-mips.pl $(PERLASM_SCHEME)
+INCLUDE[sha512-mips.o]=..
+
+GENERATE[sha1-armv4-large.S]=asm/sha1-armv4-large.pl $(PERLASM_SCHEME)
+INCLUDE[sha1-armv4-large.o]=..
+GENERATE[sha256-armv4.S]=asm/sha256-armv4.pl $(PERLASM_SCHEME)
+INCLUDE[sha256-armv4.o]=..
+GENERATE[sha512-armv4.S]=asm/sha512-armv4.pl $(PERLASM_SCHEME)
+INCLUDE[sha512-armv4.o]=..
+GENERATE[keccak1600-armv4.S]=asm/keccak1600-armv4.pl $(PERLASM_SCHEME)
+INCLUDE[keccak1600-armv4.o]=..
+
+GENERATE[sha1-armv8.S]=asm/sha1-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[sha1-armv8.o]=..
+GENERATE[sha256-armv8.S]=asm/sha512-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[sha256-armv8.o]=..
+GENERATE[sha512-armv8.S]=asm/sha512-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[sha512-armv8.o]=..
+GENERATE[keccak1600-armv8.S]=asm/keccak1600-armv8.pl $(PERLASM_SCHEME)
+
+GENERATE[sha1-s390x.S]=asm/sha1-s390x.pl $(PERLASM_SCHEME)
+INCLUDE[sha1-s390x.o]=..
+GENERATE[sha256-s390x.S]=asm/sha512-s390x.pl $(PERLASM_SCHEME)
+INCLUDE[sha256-s390x.o]=..
+GENERATE[sha512-s390x.S]=asm/sha512-s390x.pl $(PERLASM_SCHEME)
+INCLUDE[sha512-s390x.o]=..
+GENERATE[keccak1600-s390x.S]=asm/keccak1600-s390x.pl $(PERLASM_SCHEME)
+
+BEGINRAW[Makefile(unix)]
+##### SHA assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/sha1-%.S: {- $sourcedir -}/asm/sha1-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+{- $builddir -}/sha256-%.S: {- $sourcedir -}/asm/sha512-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+{- $builddir -}/sha512-%.S: {- $sourcedir -}/asm/sha512-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+ENDRAW[Makefile(unix)]
diff --git a/crypto/sha/keccak1600.c b/crypto/sha/keccak1600.c
new file mode 100644
index 000000000000..e7223486af5b
--- /dev/null
+++ b/crypto/sha/keccak1600.c
@@ -0,0 +1,1246 @@
+/*
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/e_os2.h>
+#include <string.h>
+#include <assert.h>
+
+size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
+ size_t r);
+void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
+
+#if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
+
+/*
+ * Choose some sensible defaults
+ */
+#if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
+ !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
+# define KECCAK_2X /* default to KECCAK_2X variant */
+#endif
+
+#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# define KECCAK_COMPLEMENTING_TRANSFORM
+#endif
+
+#if defined(__x86_64__) || defined(__aarch64__) || \
+ defined(__mips64) || defined(__ia64) || \
+ (defined(__VMS) && !defined(__vax))
+/*
+ * These architectures are available even in ILP32 flavours, but even
+ * then they can perform 64-bit operations as efficiently as in *P64.
+ * Since sizeof(void *) would misclassify those flavours, the check is
+ * simply bypassed and bit interleaving is disabled for them.
+ */
+# define BIT_INTERLEAVE (0)
+#else
+# define BIT_INTERLEAVE (sizeof(void *) < 8)
+#endif
+
+#define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
+
+static uint64_t ROL64(uint64_t val, int offset)
+{
+ if (offset == 0) {
+ return val;
+ } else if (!BIT_INTERLEAVE) {
+ return (val << offset) | (val >> (64-offset));
+ } else {
+ uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
+
+ if (offset & 1) {
+ uint32_t tmp = hi;
+
+ offset >>= 1;
+ hi = ROL32(lo, offset);
+ lo = ROL32(tmp, offset + 1);
+ } else {
+ offset >>= 1;
+ lo = ROL32(lo, offset);
+ hi = ROL32(hi, offset);
+ }
+
+ return ((uint64_t)hi << 32) | lo;
+ }
+}
+
+static const unsigned char rhotates[5][5] = {
+ { 0, 1, 62, 28, 27 },
+ { 36, 44, 6, 55, 20 },
+ { 3, 10, 43, 25, 39 },
+ { 41, 45, 15, 21, 8 },
+ { 18, 2, 61, 56, 14 }
+};
+
+static const uint64_t iotas[] = {
+ BIT_INTERLEAVE ? 0x0000000000000001U : 0x0000000000000001U,
+ BIT_INTERLEAVE ? 0x0000008900000000U : 0x0000000000008082U,
+ BIT_INTERLEAVE ? 0x8000008b00000000U : 0x800000000000808aU,
+ BIT_INTERLEAVE ? 0x8000808000000000U : 0x8000000080008000U,
+ BIT_INTERLEAVE ? 0x0000008b00000001U : 0x000000000000808bU,
+ BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U,
+ BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U,
+ BIT_INTERLEAVE ? 0x8000008200000001U : 0x8000000000008009U,
+ BIT_INTERLEAVE ? 0x0000000b00000000U : 0x000000000000008aU,
+ BIT_INTERLEAVE ? 0x0000000a00000000U : 0x0000000000000088U,
+ BIT_INTERLEAVE ? 0x0000808200000001U : 0x0000000080008009U,
+ BIT_INTERLEAVE ? 0x0000800300000000U : 0x000000008000000aU,
+ BIT_INTERLEAVE ? 0x0000808b00000001U : 0x000000008000808bU,
+ BIT_INTERLEAVE ? 0x8000000b00000001U : 0x800000000000008bU,
+ BIT_INTERLEAVE ? 0x8000008a00000001U : 0x8000000000008089U,
+ BIT_INTERLEAVE ? 0x8000008100000001U : 0x8000000000008003U,
+ BIT_INTERLEAVE ? 0x8000008100000000U : 0x8000000000008002U,
+ BIT_INTERLEAVE ? 0x8000000800000000U : 0x8000000000000080U,
+ BIT_INTERLEAVE ? 0x0000008300000000U : 0x000000000000800aU,
+ BIT_INTERLEAVE ? 0x8000800300000000U : 0x800000008000000aU,
+ BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U,
+ BIT_INTERLEAVE ? 0x8000008800000000U : 0x8000000000008080U,
+ BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U,
+ BIT_INTERLEAVE ? 0x8000808200000000U : 0x8000000080008008U
+};
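
ROL64() rotates lanes held in bit-interleaved form, splitting one 64-bit rotation into two 32-bit ones. The standalone sketch below assumes the even-numbered bits of a lane go to the low 32-bit word and the odd-numbered bits to the high word (a convention consistent with both ROL64()'s odd/even handling and the pre-interleaved left-hand iotas[] values), and it repeats ROL64()'s case analysis locally so it compiles on its own; none of these helper names are part of keccak1600.c:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t rotl64_plain(uint64_t v, int n)
    {
        return n == 0 ? v : (v << n) | (v >> (64 - n));
    }

    static uint32_t rotl32(uint32_t v, int n)        /* n in 0..32 */
    {
        return (v << (n & 31)) | (v >> ((32 - n) & 31));
    }

    /* Pack the even-numbered bits of |v| into the low word and the
     * odd-numbered bits into the high word (the assumed convention). */
    static uint64_t interleave(uint64_t v)
    {
        uint32_t lo = 0, hi = 0;
        int i;

        for (i = 0; i < 32; i++) {
            lo |= (uint32_t)((v >> (2 * i)) & 1) << i;
            hi |= (uint32_t)((v >> (2 * i + 1)) & 1) << i;
        }
        return ((uint64_t)hi << 32) | lo;
    }

    /* Same case analysis as ROL64() above, repeated for self-containment. */
    static uint64_t rol64_interleaved(uint64_t val, int offset)
    {
        uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;

        if (offset == 0)
            return val;
        if (offset & 1) {
            uint32_t tmp = hi;

            offset >>= 1;
            hi = rotl32(lo, offset);
            lo = rotl32(tmp, offset + 1);
        } else {
            offset >>= 1;
            lo = rotl32(lo, offset);
            hi = rotl32(hi, offset);
        }
        return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
        uint64_t x = 0x0123456789abcdefULL;
        int n;

        /* Rotating the interleaved form tracks rotating the plain form. */
        for (n = 0; n < 64; n++)
            assert(rol64_interleaved(interleave(x), n) ==
                   interleave(rotl64_plain(x, n)));

        /* iotas[1]: 0x0000008900000000 is 0x0000000000008082 interleaved. */
        assert(interleave(0x0000000000008082ULL) == 0x0000008900000000ULL);

        printf("bit-interleaving sketch OK\n");
        return 0;
    }
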
+
+#if defined(KECCAK_REF)
+/*
+ * This is a straightforward, "maximum clarity" implementation aiming
+ * to resemble section 3.2 of FIPS PUB 202, "SHA-3 Standard:
+ * Permutation-Based Hash and Extendable-Output Functions", as closely
+ * as possible, with one caveat: because of the way C stores matrices,
+ * references to A[x,y] in the specification appear here as A[y][x].
+ * The implementation unrolls the inner x-loops so that the modulo 5
+ * operations are explicitly pre-computed.
+ */
+static void Theta(uint64_t A[5][5])
+{
+ uint64_t C[5], D[5];
+ size_t y;
+
+ C[0] = A[0][0];
+ C[1] = A[0][1];
+ C[2] = A[0][2];
+ C[3] = A[0][3];
+ C[4] = A[0][4];
+
+ for (y = 1; y < 5; y++) {
+ C[0] ^= A[y][0];
+ C[1] ^= A[y][1];
+ C[2] ^= A[y][2];
+ C[3] ^= A[y][3];
+ C[4] ^= A[y][4];
+ }
+
+ D[0] = ROL64(C[1], 1) ^ C[4];
+ D[1] = ROL64(C[2], 1) ^ C[0];
+ D[2] = ROL64(C[3], 1) ^ C[1];
+ D[3] = ROL64(C[4], 1) ^ C[2];
+ D[4] = ROL64(C[0], 1) ^ C[3];
+
+ for (y = 0; y < 5; y++) {
+ A[y][0] ^= D[0];
+ A[y][1] ^= D[1];
+ A[y][2] ^= D[2];
+ A[y][3] ^= D[3];
+ A[y][4] ^= D[4];
+ }
+}
+
+static void Rho(uint64_t A[5][5])
+{
+ size_t y;
+
+ for (y = 0; y < 5; y++) {
+ A[y][0] = ROL64(A[y][0], rhotates[y][0]);
+ A[y][1] = ROL64(A[y][1], rhotates[y][1]);
+ A[y][2] = ROL64(A[y][2], rhotates[y][2]);
+ A[y][3] = ROL64(A[y][3], rhotates[y][3]);
+ A[y][4] = ROL64(A[y][4], rhotates[y][4]);
+ }
+}
+
+static void Pi(uint64_t A[5][5])
+{
+ uint64_t T[5][5];
+
+ /*
+ * T = A
+ * A[y][x] = T[x][(3*y+x)%5]
+ */
+ memcpy(T, A, sizeof(T));
+
+ A[0][0] = T[0][0];
+ A[0][1] = T[1][1];
+ A[0][2] = T[2][2];
+ A[0][3] = T[3][3];
+ A[0][4] = T[4][4];
+
+ A[1][0] = T[0][3];
+ A[1][1] = T[1][4];
+ A[1][2] = T[2][0];
+ A[1][3] = T[3][1];
+ A[1][4] = T[4][2];
+
+ A[2][0] = T[0][1];
+ A[2][1] = T[1][2];
+ A[2][2] = T[2][3];
+ A[2][3] = T[3][4];
+ A[2][4] = T[4][0];
+
+ A[3][0] = T[0][4];
+ A[3][1] = T[1][0];
+ A[3][2] = T[2][1];
+ A[3][3] = T[3][2];
+ A[3][4] = T[4][3];
+
+ A[4][0] = T[0][2];
+ A[4][1] = T[1][3];
+ A[4][2] = T[2][4];
+ A[4][3] = T[3][0];
+ A[4][4] = T[4][1];
+}
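
The unrolled assignments in Pi() are exactly the mapping quoted in its comment, A[y][x] = T[x][(3*y+x) % 5]. For reference, an equivalent loop form (a sketch, not code from this file):

    #include <stdint.h>
    #include <string.h>

    static void Pi_as_loop(uint64_t A[5][5])
    {
        uint64_t T[5][5];
        size_t x, y;

        memcpy(T, A, sizeof(T));                 /* T = A */
        for (y = 0; y < 5; y++)
            for (x = 0; x < 5; x++)
                A[y][x] = T[x][(3 * y + x) % 5]; /* same lane movement as above */
    }
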
+
+static void Chi(uint64_t A[5][5])
+{
+ uint64_t C[5];
+ size_t y;
+
+ for (y = 0; y < 5; y++) {
+ C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
+ C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
+ C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
+ C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
+ C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
+
+ A[y][0] = C[0];
+ A[y][1] = C[1];
+ A[y][2] = C[2];
+ A[y][3] = C[3];
+ A[y][4] = C[4];
+ }
+}
+
+static void Iota(uint64_t A[5][5], size_t i)
+{
+ assert(i < (sizeof(iotas) / sizeof(iotas[0])));
+ A[0][0] ^= iotas[i];
+}
+
+static void KeccakF1600(uint64_t A[5][5])
+{
+ size_t i;
+
+ for (i = 0; i < 24; i++) {
+ Theta(A);
+ Rho(A);
+ Pi(A);
+ Chi(A);
+ Iota(A, i);
+ }
+}
+
+#elif defined(KECCAK_1X)
+/*
+ * This implementation is an optimization of the code above: even the
+ * y-loops are unrolled, fused and subjected to code motion, and
+ * temporary storage is minimized. A compiler would normally do all of
+ * this for you; the purpose of the manual optimization is to provide
+ * an "unobscured" reference for an assembly implementation [in case
+ * this approach is chosen for some platform]. In a nutshell it is
+ * equivalent to the "plane-per-plane processing" approach discussed
+ * in section 2.4 of the "Keccak implementation overview".
+ */
+static void Round(uint64_t A[5][5], size_t i)
+{
+ uint64_t C[5], E[2]; /* registers */
+ uint64_t D[5], T[2][5]; /* memory */
+
+ assert(i < (sizeof(iotas) / sizeof(iotas[0])));
+
+ C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
+ C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
+ C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
+ C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
+ C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
+
+#if defined(__arm__)
+ D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
+ D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
+ D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
+ D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
+ D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
+
+ T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
+ T[0][1] = A[0][1] ^ E[0]; /* D[1] */
+ T[0][2] = A[0][2] ^ C[1]; /* D[2] */
+ T[0][3] = A[0][3] ^ C[2]; /* D[3] */
+ T[0][4] = A[0][4] ^ E[1]; /* D[4] */
+
+ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
+ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
+ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
+ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
+ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
+#else
+ D[0] = ROL64(C[1], 1) ^ C[4];
+ D[1] = ROL64(C[2], 1) ^ C[0];
+ D[2] = ROL64(C[3], 1) ^ C[1];
+ D[3] = ROL64(C[4], 1) ^ C[2];
+ D[4] = ROL64(C[0], 1) ^ C[3];
+
+ T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
+ T[0][1] = A[0][1] ^ D[1];
+ T[0][2] = A[0][2] ^ D[2];
+ T[0][3] = A[0][3] ^ D[3];
+ T[0][4] = A[0][4] ^ D[4];
+
+ C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
+ C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
+ C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
+ C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
+ C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
+#endif
+ A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
+ A[0][1] = C[1] ^ (~C[2] & C[3]);
+ A[0][2] = C[2] ^ (~C[3] & C[4]);
+ A[0][3] = C[3] ^ (~C[4] & C[0]);
+ A[0][4] = C[4] ^ (~C[0] & C[1]);
+
+ T[1][0] = A[1][0] ^ (C[3] = D[0]);
+ T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
+ T[1][2] = A[1][2] ^ (E[0] = D[2]);
+ T[1][3] = A[1][3] ^ (E[1] = D[3]);
+ T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
+
+ C[0] = ROL64(T[0][3], rhotates[0][3]);
+ C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
+ C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
+ C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
+ C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
+
+ A[1][0] = C[0] ^ (~C[1] & C[2]);
+ A[1][1] = C[1] ^ (~C[2] & C[3]);
+ A[1][2] = C[2] ^ (~C[3] & C[4]);
+ A[1][3] = C[3] ^ (~C[4] & C[0]);
+ A[1][4] = C[4] ^ (~C[0] & C[1]);
+
+ C[0] = ROL64(T[0][1], rhotates[0][1]);
+ C[1] = ROL64(T[1][2], rhotates[1][2]);
+ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
+ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
+ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
+
+ A[2][0] = C[0] ^ (~C[1] & C[2]);
+ A[2][1] = C[1] ^ (~C[2] & C[3]);
+ A[2][2] = C[2] ^ (~C[3] & C[4]);
+ A[2][3] = C[3] ^ (~C[4] & C[0]);
+ A[2][4] = C[4] ^ (~C[0] & C[1]);
+
+ C[0] = ROL64(T[0][4], rhotates[0][4]);
+ C[1] = ROL64(T[1][0], rhotates[1][0]);
+ C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
+ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
+ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
+
+ A[3][0] = C[0] ^ (~C[1] & C[2]);
+ A[3][1] = C[1] ^ (~C[2] & C[3]);
+ A[3][2] = C[2] ^ (~C[3] & C[4]);
+ A[3][3] = C[3] ^ (~C[4] & C[0]);
+ A[3][4] = C[4] ^ (~C[0] & C[1]);
+
+ C[0] = ROL64(T[0][2], rhotates[0][2]);
+ C[1] = ROL64(T[1][3], rhotates[1][3]);
+ C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
+ C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
+ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
+
+ A[4][0] = C[0] ^ (~C[1] & C[2]);
+ A[4][1] = C[1] ^ (~C[2] & C[3]);
+ A[4][2] = C[2] ^ (~C[3] & C[4]);
+ A[4][3] = C[3] ^ (~C[4] & C[0]);
+ A[4][4] = C[4] ^ (~C[0] & C[1]);
+}
+
+static void KeccakF1600(uint64_t A[5][5])
+{
+ size_t i;
+
+ for (i = 0; i < 24; i++) {
+ Round(A, i);
+ }
+}
+
+#elif defined(KECCAK_1X_ALT)
+/*
+ * This is a variant of KECCAK_1X above that reduces the requirement
+ * for temporary storage even further, at the cost of more updates to
+ * A[][]. It is less suitable if A[][] is memory-bound, but better if
+ * it is register-bound.
+ */
+
+static void Round(uint64_t A[5][5], size_t i)
+{
+ uint64_t C[5], D[5];
+
+ assert(i < (sizeof(iotas) / sizeof(iotas[0])));
+
+ C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
+ C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
+ C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
+ C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
+ C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
+
+ D[1] = C[0] ^ ROL64(C[2], 1);
+ D[2] = C[1] ^ ROL64(C[3], 1);
+ D[3] = C[2] ^= ROL64(C[4], 1);
+ D[4] = C[3] ^= ROL64(C[0], 1);
+ D[0] = C[4] ^= ROL64(C[1], 1);
+
+ A[0][1] ^= D[1];
+ A[1][1] ^= D[1];
+ A[2][1] ^= D[1];
+ A[3][1] ^= D[1];
+ A[4][1] ^= D[1];
+
+ A[0][2] ^= D[2];
+ A[1][2] ^= D[2];
+ A[2][2] ^= D[2];
+ A[3][2] ^= D[2];
+ A[4][2] ^= D[2];
+
+ A[0][3] ^= C[2];
+ A[1][3] ^= C[2];
+ A[2][3] ^= C[2];
+ A[3][3] ^= C[2];
+ A[4][3] ^= C[2];
+
+ A[0][4] ^= C[3];
+ A[1][4] ^= C[3];
+ A[2][4] ^= C[3];
+ A[3][4] ^= C[3];
+ A[4][4] ^= C[3];
+
+ A[0][0] ^= C[4];
+ A[1][0] ^= C[4];
+ A[2][0] ^= C[4];
+ A[3][0] ^= C[4];
+ A[4][0] ^= C[4];
+
+ C[1] = A[0][1];
+ C[2] = A[0][2];
+ C[3] = A[0][3];
+ C[4] = A[0][4];
+
+ A[0][1] = ROL64(A[1][1], rhotates[1][1]);
+ A[0][2] = ROL64(A[2][2], rhotates[2][2]);
+ A[0][3] = ROL64(A[3][3], rhotates[3][3]);
+ A[0][4] = ROL64(A[4][4], rhotates[4][4]);
+
+ A[1][1] = ROL64(A[1][4], rhotates[1][4]);
+ A[2][2] = ROL64(A[2][3], rhotates[2][3]);
+ A[3][3] = ROL64(A[3][2], rhotates[3][2]);
+ A[4][4] = ROL64(A[4][1], rhotates[4][1]);
+
+ A[1][4] = ROL64(A[4][2], rhotates[4][2]);
+ A[2][3] = ROL64(A[3][4], rhotates[3][4]);
+ A[3][2] = ROL64(A[2][1], rhotates[2][1]);
+ A[4][1] = ROL64(A[1][3], rhotates[1][3]);
+
+ A[4][2] = ROL64(A[2][4], rhotates[2][4]);
+ A[3][4] = ROL64(A[4][3], rhotates[4][3]);
+ A[2][1] = ROL64(A[1][2], rhotates[1][2]);
+ A[1][3] = ROL64(A[3][1], rhotates[3][1]);
+
+ A[2][4] = ROL64(A[4][0], rhotates[4][0]);
+ A[4][3] = ROL64(A[3][0], rhotates[3][0]);
+ A[1][2] = ROL64(A[2][0], rhotates[2][0]);
+ A[3][1] = ROL64(A[1][0], rhotates[1][0]);
+
+ A[1][0] = ROL64(C[3], rhotates[0][3]);
+ A[2][0] = ROL64(C[1], rhotates[0][1]);
+ A[3][0] = ROL64(C[4], rhotates[0][4]);
+ A[4][0] = ROL64(C[2], rhotates[0][2]);
+
+ C[0] = A[0][0];
+ C[1] = A[1][0];
+ D[0] = A[0][1];
+ D[1] = A[1][1];
+
+ A[0][0] ^= (~A[0][1] & A[0][2]);
+ A[1][0] ^= (~A[1][1] & A[1][2]);
+ A[0][1] ^= (~A[0][2] & A[0][3]);
+ A[1][1] ^= (~A[1][2] & A[1][3]);
+ A[0][2] ^= (~A[0][3] & A[0][4]);
+ A[1][2] ^= (~A[1][3] & A[1][4]);
+ A[0][3] ^= (~A[0][4] & C[0]);
+ A[1][3] ^= (~A[1][4] & C[1]);
+ A[0][4] ^= (~C[0] & D[0]);
+ A[1][4] ^= (~C[1] & D[1]);
+
+ C[2] = A[2][0];
+ C[3] = A[3][0];
+ D[2] = A[2][1];
+ D[3] = A[3][1];
+
+ A[2][0] ^= (~A[2][1] & A[2][2]);
+ A[3][0] ^= (~A[3][1] & A[3][2]);
+ A[2][1] ^= (~A[2][2] & A[2][3]);
+ A[3][1] ^= (~A[3][2] & A[3][3]);
+ A[2][2] ^= (~A[2][3] & A[2][4]);
+ A[3][2] ^= (~A[3][3] & A[3][4]);
+ A[2][3] ^= (~A[2][4] & C[2]);
+ A[3][3] ^= (~A[3][4] & C[3]);
+ A[2][4] ^= (~C[2] & D[2]);
+ A[3][4] ^= (~C[3] & D[3]);
+
+ C[4] = A[4][0];
+ D[4] = A[4][1];
+
+ A[4][0] ^= (~A[4][1] & A[4][2]);
+ A[4][1] ^= (~A[4][2] & A[4][3]);
+ A[4][2] ^= (~A[4][3] & A[4][4]);
+ A[4][3] ^= (~A[4][4] & C[4]);
+ A[4][4] ^= (~C[4] & D[4]);
+ A[0][0] ^= iotas[i];
+}
+
+static void KeccakF1600(uint64_t A[5][5])
+{
+ size_t i;
+
+ for (i = 0; i < 24; i++) {
+ Round(A, i);
+ }
+}
+
+#elif defined(KECCAK_2X)
+/*
+ * This implementation is a variant of KECCAK_1X above with the
+ * outermost round loop unrolled twice. This allows the temporary
+ * storage to be taken out of the round procedure and references to it
+ * to be simplified by alternating it with the actual data (see the
+ * round loop below). It was originally meant as a reference for an
+ * assembly implementation, but it turns out to play best with
+ * compilers [as well as provide the best instruction per processed
+ * byte ratio at the minimal round unroll factor]...
+ */
+static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
+{
+ uint64_t C[5], D[5];
+
+ assert(i < (sizeof(iotas) / sizeof(iotas[0])));
+
+ C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
+ C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
+ C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
+ C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
+ C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
+
+ D[0] = ROL64(C[1], 1) ^ C[4];
+ D[1] = ROL64(C[2], 1) ^ C[0];
+ D[2] = ROL64(C[3], 1) ^ C[1];
+ D[3] = ROL64(C[4], 1) ^ C[2];
+ D[4] = ROL64(C[0], 1) ^ C[3];
+
+ C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
+ C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
+ C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
+ C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
+ C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
+
+#ifdef KECCAK_COMPLEMENTING_TRANSFORM
+ R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
+ R[0][1] = C[1] ^ (~C[2] | C[3]);
+ R[0][2] = C[2] ^ ( C[3] & C[4]);
+ R[0][3] = C[3] ^ ( C[4] | C[0]);
+ R[0][4] = C[4] ^ ( C[0] & C[1]);
+#else
+ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
+ R[0][1] = C[1] ^ (~C[2] & C[3]);
+ R[0][2] = C[2] ^ (~C[3] & C[4]);
+ R[0][3] = C[3] ^ (~C[4] & C[0]);
+ R[0][4] = C[4] ^ (~C[0] & C[1]);
+#endif
+
+ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
+ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
+ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
+ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
+ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
+
+#ifdef KECCAK_COMPLEMENTING_TRANSFORM
+ R[1][0] = C[0] ^ (C[1] | C[2]);
+ R[1][1] = C[1] ^ (C[2] & C[3]);
+ R[1][2] = C[2] ^ (C[3] | ~C[4]);
+ R[1][3] = C[3] ^ (C[4] | C[0]);
+ R[1][4] = C[4] ^ (C[0] & C[1]);
+#else
+ R[1][0] = C[0] ^ (~C[1] & C[2]);
+ R[1][1] = C[1] ^ (~C[2] & C[3]);
+ R[1][2] = C[2] ^ (~C[3] & C[4]);
+ R[1][3] = C[3] ^ (~C[4] & C[0]);
+ R[1][4] = C[4] ^ (~C[0] & C[1]);
+#endif
+
+ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
+ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
+ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
+ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
+ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
+
+#ifdef KECCAK_COMPLEMENTING_TRANSFORM
+ R[2][0] = C[0] ^ ( C[1] | C[2]);
+ R[2][1] = C[1] ^ ( C[2] & C[3]);
+ R[2][2] = C[2] ^ (~C[3] & C[4]);
+ R[2][3] = ~C[3] ^ ( C[4] | C[0]);
+ R[2][4] = C[4] ^ ( C[0] & C[1]);
+#else
+ R[2][0] = C[0] ^ (~C[1] & C[2]);
+ R[2][1] = C[1] ^ (~C[2] & C[3]);
+ R[2][2] = C[2] ^ (~C[3] & C[4]);
+ R[2][3] = C[3] ^ (~C[4] & C[0]);
+ R[2][4] = C[4] ^ (~C[0] & C[1]);
+#endif
+
+ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
+ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
+ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
+ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
+ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
+
+#ifdef KECCAK_COMPLEMENTING_TRANSFORM
+ R[3][0] = C[0] ^ ( C[1] & C[2]);
+ R[3][1] = C[1] ^ ( C[2] | C[3]);
+ R[3][2] = C[2] ^ (~C[3] | C[4]);
+ R[3][3] = ~C[3] ^ ( C[4] & C[0]);
+ R[3][4] = C[4] ^ ( C[0] | C[1]);
+#else
+ R[3][0] = C[0] ^ (~C[1] & C[2]);
+ R[3][1] = C[1] ^ (~C[2] & C[3]);
+ R[3][2] = C[2] ^ (~C[3] & C[4]);
+ R[3][3] = C[3] ^ (~C[4] & C[0]);
+ R[3][4] = C[4] ^ (~C[0] & C[1]);
+#endif
+
+ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
+ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
+ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
+ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
+ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
+
+#ifdef KECCAK_COMPLEMENTING_TRANSFORM
+ R[4][0] = C[0] ^ (~C[1] & C[2]);
+ R[4][1] = ~C[1] ^ ( C[2] | C[3]);
+ R[4][2] = C[2] ^ ( C[3] & C[4]);
+ R[4][3] = C[3] ^ ( C[4] | C[0]);
+ R[4][4] = C[4] ^ ( C[0] & C[1]);
+#else
+ R[4][0] = C[0] ^ (~C[1] & C[2]);
+ R[4][1] = C[1] ^ (~C[2] & C[3]);
+ R[4][2] = C[2] ^ (~C[3] & C[4]);
+ R[4][3] = C[3] ^ (~C[4] & C[0]);
+ R[4][4] = C[4] ^ (~C[0] & C[1]);
+#endif
+}
+
+static void KeccakF1600(uint64_t A[5][5])
+{
+ uint64_t T[5][5];
+ size_t i;
+
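+    /*
+     * With KECCAK_COMPLEMENTING_TRANSFORM, six of the lanes are kept in
+     * complemented form across the rounds, which lets the chi step in
+     * Round() above be expressed with fewer NOT operations; see the
+     * lane-complementing discussion in "Keccak implementation overview".
+     * The lanes are complemented on entry below and restored on exit.
+     */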
+#ifdef KECCAK_COMPLEMENTING_TRANSFORM
+ A[0][1] = ~A[0][1];
+ A[0][2] = ~A[0][2];
+ A[1][3] = ~A[1][3];
+ A[2][2] = ~A[2][2];
+ A[3][2] = ~A[3][2];
+ A[4][0] = ~A[4][0];
+#endif
+
+ for (i = 0; i < 24; i += 2) {
+ Round(T, A, i);
+ Round(A, T, i + 1);
+ }
+
+#ifdef KECCAK_COMPLEMENTING_TRANSFORM
+ A[0][1] = ~A[0][1];
+ A[0][2] = ~A[0][2];
+ A[1][3] = ~A[1][3];
+ A[2][2] = ~A[2][2];
+ A[3][2] = ~A[3][2];
+ A[4][0] = ~A[4][0];
+#endif
+}
+
+#else /* define KECCAK_INPLACE to compile this code path */
+/*
+ * This implementation is KECCAK_1X from above combined 4 times with
+ * a twist that allows the temporary storage to be omitted and the
+ * processing to be performed in place. It's discussed in section 2.5
+ * of "Keccak implementation overview". It's likely to be best suited
+ * for processors with a large register bank... On the other hand, a
+ * processor with a large register bank can just as well use
+ * KECCAK_1X_ALT, which would be as fast but much more compact...
+ */
+static void FourRounds(uint64_t A[5][5], size_t i)
+{
+ uint64_t B[5], C[5], D[5];
+
+ assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
+
+ /* Round 4*n */
+ C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
+ C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
+ C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
+ C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
+ C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
+
+ D[0] = ROL64(C[1], 1) ^ C[4];
+ D[1] = ROL64(C[2], 1) ^ C[0];
+ D[2] = ROL64(C[3], 1) ^ C[1];
+ D[3] = ROL64(C[4], 1) ^ C[2];
+ D[4] = ROL64(C[0], 1) ^ C[3];
+
+ B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
+ B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
+ B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
+ B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
+ B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
+
+ C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
+ C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
+ B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
+ B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
+ B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
+ B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
+
+ C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
+ B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
+ B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
+ B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
+ B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
+
+ C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
+ B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
+ B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
+ B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
+ B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
+
+ C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
+ B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
+ B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
+ B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
+ B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
+
+ C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
+
+ /* Round 4*n+1 */
+ D[0] = ROL64(C[1], 1) ^ C[4];
+ D[1] = ROL64(C[2], 1) ^ C[0];
+ D[2] = ROL64(C[3], 1) ^ C[1];
+ D[3] = ROL64(C[4], 1) ^ C[2];
+ D[4] = ROL64(C[0], 1) ^ C[3];
+
+ B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
+ B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
+ B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
+ B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
+ B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
+
+ C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
+ C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
+ B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
+ B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
+ B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
+ B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
+
+ C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
+ B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
+ B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
+ B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
+ B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
+
+ C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
+ B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
+ B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
+ B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
+ B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
+
+ C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
+ B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
+ B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
+ B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
+ B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
+
+ C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
+
+ /* Round 4*n+2 */
+ D[0] = ROL64(C[1], 1) ^ C[4];
+ D[1] = ROL64(C[2], 1) ^ C[0];
+ D[2] = ROL64(C[3], 1) ^ C[1];
+ D[3] = ROL64(C[4], 1) ^ C[2];
+ D[4] = ROL64(C[0], 1) ^ C[3];
+
+ B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
+ B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
+ B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
+ B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
+ B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
+
+ C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
+ C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
+ B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
+ B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
+ B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
+ B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
+
+ C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
+ B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
+ B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
+ B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
+ B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
+
+ C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
+ B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
+ B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
+ B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
+ B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
+
+ C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
+ B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
+ B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
+ B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
+ B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
+
+ C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
+ C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
+ C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
+ C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
+ C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
+
+ /* Round 4*n+3 */
+ D[0] = ROL64(C[1], 1) ^ C[4];
+ D[1] = ROL64(C[2], 1) ^ C[0];
+ D[2] = ROL64(C[3], 1) ^ C[1];
+ D[3] = ROL64(C[4], 1) ^ C[2];
+ D[4] = ROL64(C[0], 1) ^ C[3];
+
+ B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
+ B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
+ B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
+ B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
+ B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
+
+ /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
+ /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
+ /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
+ /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
+ /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
+ B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
+ B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
+ B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
+ B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
+
+ /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
+ /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
+ /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
+ /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
+ /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
+ B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
+ B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
+ B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
+ B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
+
+ /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
+ /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
+ /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
+ /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
+ /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
+ B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
+ B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
+ B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
+ B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
+
+ /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
+ /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
+ /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
+ /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
+ /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
+
+ B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
+ B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
+ B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
+ B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
+ B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
+
+ /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
+ /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
+ /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
+ /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
+ /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
+}
+
+static void KeccakF1600(uint64_t A[5][5])
+{
+ size_t i;
+
+ for (i = 0; i < 24; i += 4) {
+ FourRounds(A, i);
+ }
+}
+
+#endif
+
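+/*
+ * BitInterleave/BitDeinterleave convert a 64-bit lane to and from the
+ * bit-interleaved representation used when BIT_INTERLEAVE is non-zero
+ * (typically on 32-bit targets): even-numbered bits are gathered into
+ * one 32-bit half and odd-numbered bits into the other, so that a
+ * 64-bit rotation can be carried out as two 32-bit rotations. When
+ * BIT_INTERLEAVE is zero both functions return their argument
+ * unchanged.
+ */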
+static uint64_t BitInterleave(uint64_t Ai)
+{
+ if (BIT_INTERLEAVE) {
+ uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
+ uint32_t t0, t1;
+
+ t0 = lo & 0x55555555;
+ t0 |= t0 >> 1; t0 &= 0x33333333;
+ t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
+ t0 |= t0 >> 4; t0 &= 0x00ff00ff;
+ t0 |= t0 >> 8; t0 &= 0x0000ffff;
+
+ t1 = hi & 0x55555555;
+ t1 |= t1 >> 1; t1 &= 0x33333333;
+ t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
+ t1 |= t1 >> 4; t1 &= 0x00ff00ff;
+ t1 |= t1 >> 8; t1 <<= 16;
+
+ lo &= 0xaaaaaaaa;
+ lo |= lo << 1; lo &= 0xcccccccc;
+ lo |= lo << 2; lo &= 0xf0f0f0f0;
+ lo |= lo << 4; lo &= 0xff00ff00;
+ lo |= lo << 8; lo >>= 16;
+
+ hi &= 0xaaaaaaaa;
+ hi |= hi << 1; hi &= 0xcccccccc;
+ hi |= hi << 2; hi &= 0xf0f0f0f0;
+ hi |= hi << 4; hi &= 0xff00ff00;
+ hi |= hi << 8; hi &= 0xffff0000;
+
+ Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
+ }
+
+ return Ai;
+}
+
+static uint64_t BitDeinterleave(uint64_t Ai)
+{
+ if (BIT_INTERLEAVE) {
+ uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
+ uint32_t t0, t1;
+
+ t0 = lo & 0x0000ffff;
+ t0 |= t0 << 8; t0 &= 0x00ff00ff;
+ t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
+ t0 |= t0 << 2; t0 &= 0x33333333;
+ t0 |= t0 << 1; t0 &= 0x55555555;
+
+ t1 = hi << 16;
+ t1 |= t1 >> 8; t1 &= 0xff00ff00;
+ t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
+ t1 |= t1 >> 2; t1 &= 0xcccccccc;
+ t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
+
+ lo >>= 16;
+ lo |= lo << 8; lo &= 0x00ff00ff;
+ lo |= lo << 4; lo &= 0x0f0f0f0f;
+ lo |= lo << 2; lo &= 0x33333333;
+ lo |= lo << 1; lo &= 0x55555555;
+
+ hi &= 0xffff0000;
+ hi |= hi >> 8; hi &= 0xff00ff00;
+ hi |= hi >> 4; hi &= 0xf0f0f0f0;
+ hi |= hi >> 2; hi &= 0xcccccccc;
+ hi |= hi >> 1; hi &= 0xaaaaaaaa;
+
+ Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
+ }
+
+ return Ai;
+}
+
+/*
+ * SHA3_absorb can be called multiple times; at each invocation the
+ * largest multiple of |r| out of |len| bytes is processed, and the
+ * number of remaining bytes is returned. This spares the caller the
+ * trouble of calculating the largest multiple of |r|. |r| can be viewed
+ * as the block size. It is commonly (1600 - 256*n)/8, e.g. 168, 136,
+ * 104, 72, but can also be (1600 - 448)/8 = 144. All this means that
+ * message padding and intermediate sub-block buffering, byte- or
+ * bit-wise, are the caller's responsibility.
+ */
+size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
+ size_t r)
+{
+ uint64_t *A_flat = (uint64_t *)A;
+ size_t i, w = r / 8;
+
+ assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
+
+ while (len >= r) {
+ for (i = 0; i < w; i++) {
+ uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 |
+ (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
+ (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
+ (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
+ inp += 8;
+
+ A_flat[i] ^= BitInterleave(Ai);
+ }
+ KeccakF1600(A);
+ len -= r;
+ }
+
+ return len;
+}
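+
+/*
+ * Illustrative call pattern: a caller streaming data would typically do
+ *
+ *     rem = SHA3_absorb(A, buf, n, r);
+ *
+ * after which the first n - rem bytes have been absorbed and the
+ * trailing rem bytes must stay buffered until more data arrives or the
+ * final padded block is assembled.
+ */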
+
+/*
+ * SHA3_squeeze is called once at the end to generate the |len|-byte
+ * hash value into |out|. Note that when |len| exceeds the rate |r|, the
+ * state is re-permuted between output blocks, which is what lets the
+ * SHAKE XOFs produce arbitrary-length output.
+ */
+void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
+{
+ uint64_t *A_flat = (uint64_t *)A;
+ size_t i, w = r / 8;
+
+ assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
+
+ while (len != 0) {
+ for (i = 0; i < w && len != 0; i++) {
+ uint64_t Ai = BitDeinterleave(A_flat[i]);
+
+ if (len < 8) {
+ for (i = 0; i < len; i++) {
+ *out++ = (unsigned char)Ai;
+ Ai >>= 8;
+ }
+ return;
+ }
+
+ out[0] = (unsigned char)(Ai);
+ out[1] = (unsigned char)(Ai >> 8);
+ out[2] = (unsigned char)(Ai >> 16);
+ out[3] = (unsigned char)(Ai >> 24);
+ out[4] = (unsigned char)(Ai >> 32);
+ out[5] = (unsigned char)(Ai >> 40);
+ out[6] = (unsigned char)(Ai >> 48);
+ out[7] = (unsigned char)(Ai >> 56);
+ out += 8;
+ len -= 8;
+ }
+ if (len)
+ KeccakF1600(A);
+ }
+}
+#endif
+
+#ifdef SELFTEST
+/*
+ * Post-padding one-shot implementations would look as follows:
+ *
+ * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
+ * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
+ * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
+ * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
+ * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8);
+ * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8);
+ */
+
+void SHA3_sponge(const unsigned char *inp, size_t len,
+ unsigned char *out, size_t d, size_t r)
+{
+ uint64_t A[5][5];
+
+ memset(A, 0, sizeof(A));
+ SHA3_absorb(A, inp, len, r);
+ SHA3_squeeze(A, out, d, r);
+}
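+
+/*
+ * Illustrative sketch only (not part of the OpenSSL API): a one-shot
+ * SHA3-256 built on SHA3_absorb/SHA3_squeeze that also performs the
+ * pad10*1 padding with the 0x06 SHA-3 domain byte, which the functions
+ * above deliberately leave to the caller. The name SHA3_256_example is
+ * hypothetical.
+ */
+void SHA3_256_example(const unsigned char *inp, size_t len,
+                      unsigned char out[32])
+{
+    uint64_t A[5][5];
+    unsigned char block[(1600 - 512) / 8];      /* rate r = 136 bytes */
+    size_t r = sizeof(block), rem;
+
+    memset(A, 0, sizeof(A));
+
+    /* absorb all complete blocks; |rem| bytes remain unprocessed */
+    rem = SHA3_absorb(A, inp, len, r);
+    inp += len - rem;
+
+    /* buffer the tail and append the domain byte plus pad10*1 padding */
+    memset(block, 0, sizeof(block));
+    memcpy(block, inp, rem);
+    block[rem] = 0x06;                          /* SHA-3 '01' suffix + pad start */
+    block[r - 1] |= 0x80;                       /* final padding bit */
+
+    (void)SHA3_absorb(A, block, r, r);
+    SHA3_squeeze(A, out, 32, r);
+}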
+
+# include <stdio.h>
+
+int main()
+{
+ /*
+     * This is the 5-bit SHAKE128 test from
+     * http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing;
+     * the input here is supplied already padded, since padding is the
+     * caller's responsibility.
+ */
+ unsigned char test[168] = { '\xf3', '\x3' };
+ unsigned char out[512];
+ size_t i;
+ static const unsigned char result[512] = {
+ 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
+ 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
+ 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
+ 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
+ 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
+ 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
+ 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
+ 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
+ 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
+ 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
+ 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
+ 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
+ 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
+ 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
+ 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
+ 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
+ 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
+ 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
+ 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
+ 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
+ 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
+ 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
+ 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
+ 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
+ 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
+ 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
+ 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
+ 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
+ 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
+ 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
+ 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
+ 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
+ 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
+ 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
+ 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
+ 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
+ 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
+ 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
+ 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
+ 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
+ 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
+ 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
+ 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
+ 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
+ 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
+ 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
+ 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
+ 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
+ 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
+ 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
+ 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
+ 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
+ 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
+ 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
+ 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
+ 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
+ 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
+ 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
+ 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
+ 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
+ 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
+ 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
+ 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
+ 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
+ };
+
+ test[167] = '\x80';
+ SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
+
+ /*
+     * The rationale behind keeping the output [formatted as below] is
+     * that one should be able to redirect it to a file, copy-and-paste
+     * the final "output val" from the official example into another
+     * file, and compare the two with diff(1).
+ */
+ for (i = 0; i < sizeof(out);) {
+ printf("%02X", out[i]);
+ printf(++i % 16 && i != sizeof(out) ? " " : "\n");
+ }
+
+ if (memcmp(out,result,sizeof(out))) {
+ fprintf(stderr,"failure\n");
+ return 1;
+ } else {
+ fprintf(stderr,"success\n");
+ return 0;
+ }
+}
+#endif
diff --git a/crypto/sha/sha.c b/crypto/sha/sha.c
deleted file mode 100644
index cfc12f3edc68..000000000000
--- a/crypto/sha/sha.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/* crypto/sha/sha.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <openssl/sha.h>
-
-#define BUFSIZE 1024*16
-
-void do_fp(FILE *f);
-void pt(unsigned char *md);
-int read(int, void *, unsigned int);
-int main(int argc, char **argv)
-{
- int i, err = 0;
- FILE *IN;
-
- if (argc == 1) {
- do_fp(stdin);
- } else {
- for (i = 1; i < argc; i++) {
- IN = fopen(argv[i], "r");
- if (IN == NULL) {
- perror(argv[i]);
- err++;
- continue;
- }
- printf("SHA(%s)= ", argv[i]);
- do_fp(IN);
- fclose(IN);
- }
- }
- exit(err);
-}
-
-void do_fp(FILE *f)
-{
- SHA_CTX c;
- unsigned char md[SHA_DIGEST_LENGTH];
- int fd;
- int i;
- unsigned char buf[BUFSIZE];
-
- fd = fileno(f);
- SHA_Init(&c);
- for (;;) {
- i = read(fd, buf, BUFSIZE);
- if (i <= 0)
- break;
- SHA_Update(&c, buf, (unsigned long)i);
- }
- SHA_Final(&(md[0]), &c);
- pt(md);
-}
-
-void pt(unsigned char *md)
-{
- int i;
-
- for (i = 0; i < SHA_DIGEST_LENGTH; i++)
- printf("%02x", md[i]);
- printf("\n");
-}
diff --git a/crypto/sha/sha.h b/crypto/sha/sha.h
deleted file mode 100644
index e5169e4fee04..000000000000
--- a/crypto/sha/sha.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* crypto/sha/sha.h */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
-#ifndef HEADER_SHA_H
-# define HEADER_SHA_H
-
-# include <openssl/e_os2.h>
-# include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-# if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1))
-# error SHA is disabled.
-# endif
-
-# if defined(OPENSSL_FIPS)
-# define FIPS_SHA_SIZE_T size_t
-# endif
-
-/*-
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- * ! SHA_LONG has to be at least 32 bits wide. If it's wider, then !
- * ! SHA_LONG_LOG2 has to be defined along. !
- * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- */
-
-# if defined(__LP32__)
-# define SHA_LONG unsigned long
-# elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__)
-# define SHA_LONG unsigned long
-# define SHA_LONG_LOG2 3
-# else
-# define SHA_LONG unsigned int
-# endif
-
-# define SHA_LBLOCK 16
-# define SHA_CBLOCK (SHA_LBLOCK*4)/* SHA treats input data as a
- * contiguous array of 32 bit wide
- * big-endian values. */
-# define SHA_LAST_BLOCK (SHA_CBLOCK-8)
-# define SHA_DIGEST_LENGTH 20
-
-typedef struct SHAstate_st {
- SHA_LONG h0, h1, h2, h3, h4;
- SHA_LONG Nl, Nh;
- SHA_LONG data[SHA_LBLOCK];
- unsigned int num;
-} SHA_CTX;
-
-# ifndef OPENSSL_NO_SHA0
-# ifdef OPENSSL_FIPS
-int private_SHA_Init(SHA_CTX *c);
-# endif
-int SHA_Init(SHA_CTX *c);
-int SHA_Update(SHA_CTX *c, const void *data, size_t len);
-int SHA_Final(unsigned char *md, SHA_CTX *c);
-unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md);
-void SHA_Transform(SHA_CTX *c, const unsigned char *data);
-# endif
-# ifndef OPENSSL_NO_SHA1
-# ifdef OPENSSL_FIPS
-int private_SHA1_Init(SHA_CTX *c);
-# endif
-int SHA1_Init(SHA_CTX *c);
-int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
-int SHA1_Final(unsigned char *md, SHA_CTX *c);
-unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md);
-void SHA1_Transform(SHA_CTX *c, const unsigned char *data);
-# endif
-
-# define SHA256_CBLOCK (SHA_LBLOCK*4)/* SHA-256 treats input data as a
- * contiguous array of 32 bit wide
- * big-endian values. */
-# define SHA224_DIGEST_LENGTH 28
-# define SHA256_DIGEST_LENGTH 32
-
-typedef struct SHA256state_st {
- SHA_LONG h[8];
- SHA_LONG Nl, Nh;
- SHA_LONG data[SHA_LBLOCK];
- unsigned int num, md_len;
-} SHA256_CTX;
-
-# ifndef OPENSSL_NO_SHA256
-# ifdef OPENSSL_FIPS
-int private_SHA224_Init(SHA256_CTX *c);
-int private_SHA256_Init(SHA256_CTX *c);
-# endif
-int SHA224_Init(SHA256_CTX *c);
-int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
-int SHA224_Final(unsigned char *md, SHA256_CTX *c);
-unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md);
-int SHA256_Init(SHA256_CTX *c);
-int SHA256_Update(SHA256_CTX *c, const void *data, size_t len);
-int SHA256_Final(unsigned char *md, SHA256_CTX *c);
-unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md);
-void SHA256_Transform(SHA256_CTX *c, const unsigned char *data);
-# endif
-
-# define SHA384_DIGEST_LENGTH 48
-# define SHA512_DIGEST_LENGTH 64
-
-# ifndef OPENSSL_NO_SHA512
-/*
- * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64
- * being exactly 64-bit wide. See Implementation Notes in sha512.c
- * for further details.
- */
-/*
- * SHA-512 treats input data as a
- * contiguous array of 64 bit
- * wide big-endian values.
- */
-# define SHA512_CBLOCK (SHA_LBLOCK*8)
-# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
-# define SHA_LONG64 unsigned __int64
-# define U64(C) C##UI64
-# elif defined(__arch64__)
-# define SHA_LONG64 unsigned long
-# define U64(C) C##UL
-# else
-# define SHA_LONG64 unsigned long long
-# define U64(C) C##ULL
-# endif
-
-typedef struct SHA512state_st {
- SHA_LONG64 h[8];
- SHA_LONG64 Nl, Nh;
- union {
- SHA_LONG64 d[SHA_LBLOCK];
- unsigned char p[SHA512_CBLOCK];
- } u;
- unsigned int num, md_len;
-} SHA512_CTX;
-# endif
-
-# ifndef OPENSSL_NO_SHA512
-# ifdef OPENSSL_FIPS
-int private_SHA384_Init(SHA512_CTX *c);
-int private_SHA512_Init(SHA512_CTX *c);
-# endif
-int SHA384_Init(SHA512_CTX *c);
-int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
-int SHA384_Final(unsigned char *md, SHA512_CTX *c);
-unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md);
-int SHA512_Init(SHA512_CTX *c);
-int SHA512_Update(SHA512_CTX *c, const void *data, size_t len);
-int SHA512_Final(unsigned char *md, SHA512_CTX *c);
-unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md);
-void SHA512_Transform(SHA512_CTX *c, const unsigned char *data);
-# endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/crypto/sha/sha1.c b/crypto/sha/sha1.c
deleted file mode 100644
index 8dd19431b48d..000000000000
--- a/crypto/sha/sha1.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/* crypto/sha/sha1.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <openssl/sha.h>
-
-#define BUFSIZE 1024*16
-
-void do_fp(FILE *f);
-void pt(unsigned char *md);
-#ifndef _OSD_POSIX
-int read(int, void *, unsigned int);
-#endif
-
-int main(int argc, char **argv)
-{
- int i, err = 0;
- FILE *IN;
-
- if (argc == 1) {
- do_fp(stdin);
- } else {
- for (i = 1; i < argc; i++) {
- IN = fopen(argv[i], "r");
- if (IN == NULL) {
- perror(argv[i]);
- err++;
- continue;
- }
- printf("SHA1(%s)= ", argv[i]);
- do_fp(IN);
- fclose(IN);
- }
- }
- exit(err);
-}
-
-void do_fp(FILE *f)
-{
- SHA_CTX c;
- unsigned char md[SHA_DIGEST_LENGTH];
- int fd;
- int i;
- unsigned char buf[BUFSIZE];
-
- fd = fileno(f);
- SHA1_Init(&c);
- for (;;) {
- i = read(fd, buf, BUFSIZE);
- if (i <= 0)
- break;
- SHA1_Update(&c, buf, (unsigned long)i);
- }
- SHA1_Final(&(md[0]), &c);
- pt(md);
-}
-
-void pt(unsigned char *md)
-{
- int i;
-
- for (i = 0; i < SHA_DIGEST_LENGTH; i++)
- printf("%02x", md[i]);
- printf("\n");
-}
diff --git a/crypto/sha/sha1_one.c b/crypto/sha/sha1_one.c
index a6dd760a1e0e..e5b38211d2da 100644
--- a/crypto/sha/sha1_one.c
+++ b/crypto/sha/sha1_one.c
@@ -1,59 +1,10 @@
-/* crypto/sha/sha1_one.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
*/
#include <stdio.h>
@@ -61,7 +12,6 @@
#include <openssl/crypto.h>
#include <openssl/sha.h>
-#ifndef OPENSSL_NO_SHA1
unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
{
SHA_CTX c;
@@ -74,6 +24,5 @@ unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
SHA1_Update(&c, d, n);
SHA1_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
- return (md);
+ return md;
}
-#endif
diff --git a/crypto/sha/sha1dgst.c b/crypto/sha/sha1dgst.c
index a67f1fe36479..819370e61540 100644
--- a/crypto/sha/sha1dgst.c
+++ b/crypto/sha/sha1dgst.c
@@ -1,74 +1,17 @@
-/* crypto/sha/sha1dgst.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
*/
#include <openssl/crypto.h>
#include <openssl/opensslconf.h>
-#if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
-
-# undef SHA_0
-# define SHA_1
# include <openssl/opensslv.h>
-const char SHA1_version[] = "SHA1" OPENSSL_VERSION_PTEXT;
-
/* The implementation is in ../md32_common.h */
# include "sha_locl.h"
-
-#endif
diff --git a/crypto/sha/sha1test.c b/crypto/sha/sha1test.c
deleted file mode 100644
index 551a348df37f..000000000000
--- a/crypto/sha/sha1test.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/* crypto/sha/sha1test.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include "../e_os.h"
-
-#ifdef OPENSSL_NO_SHA
-int main(int argc, char *argv[])
-{
- printf("No SHA support\n");
- return (0);
-}
-#else
-# include <openssl/evp.h>
-# include <openssl/sha.h>
-
-# ifdef CHARSET_EBCDIC
-# include <openssl/ebcdic.h>
-# endif
-
-# undef SHA_0 /* FIPS 180 */
-# define SHA_1 /* FIPS 180-1 */
-
-static char *test[] = {
- "abc",
- "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
- NULL,
-};
-
-# ifdef SHA_0
-static char *ret[] = {
- "0164b8a914cd2a5e74c4f7ff082c4d97f1edf880",
- "d2516ee1acfa5baf33dfc1c471e438449ef134c8",
-};
-
-static char *bigret = "3232affa48628a26653b5aaa44541fd90d690603";
-# endif
-# ifdef SHA_1
-static char *ret[] = {
- "a9993e364706816aba3e25717850c26c9cd0d89d",
- "84983e441c3bd26ebaae4aa1f95129e5e54670f1",
-};
-
-static char *bigret = "34aa973cd4c4daa4f61eeb2bdbad27316534016f";
-# endif
-
-static char *pt(unsigned char *md);
-int main(int argc, char *argv[])
-{
- int i, err = 0;
- char **P, **R;
- static unsigned char buf[1000];
- char *p, *r;
- EVP_MD_CTX c;
- unsigned char md[SHA_DIGEST_LENGTH];
-
-# ifdef CHARSET_EBCDIC
- ebcdic2ascii(test[0], test[0], strlen(test[0]));
- ebcdic2ascii(test[1], test[1], strlen(test[1]));
-# endif
-
- EVP_MD_CTX_init(&c);
- P = test;
- R = ret;
- i = 1;
- while (*P != NULL) {
- EVP_Digest(*P, strlen((char *)*P), md, NULL, EVP_sha1(), NULL);
- p = pt(md);
- if (strcmp(p, (char *)*R) != 0) {
- printf("error calculating SHA1 on '%s'\n", *P);
- printf("got %s instead of %s\n", p, *R);
- err++;
- } else
- printf("test %d ok\n", i);
- i++;
- R++;
- P++;
- }
-
- memset(buf, 'a', 1000);
-# ifdef CHARSET_EBCDIC
- ebcdic2ascii(buf, buf, 1000);
-# endif /* CHARSET_EBCDIC */
- EVP_DigestInit_ex(&c, EVP_sha1(), NULL);
- for (i = 0; i < 1000; i++)
- EVP_DigestUpdate(&c, buf, 1000);
- EVP_DigestFinal_ex(&c, md, NULL);
- p = pt(md);
-
- r = bigret;
- if (strcmp(p, r) != 0) {
- printf("error calculating SHA1 on 'a' * 1000\n");
- printf("got %s instead of %s\n", p, r);
- err++;
- } else
- printf("test 3 ok\n");
-
-# ifdef OPENSSL_SYS_NETWARE
- if (err)
- printf("ERROR: %d\n", err);
-# endif
- EVP_MD_CTX_cleanup(&c);
- EXIT(err);
- return (0);
-}
-
-static char *pt(unsigned char *md)
-{
- int i;
- static char buf[80];
-
- for (i = 0; i < SHA_DIGEST_LENGTH; i++)
- sprintf(&(buf[i * 2]), "%02x", md[i]);
- return (buf);
-}
-#endif
diff --git a/crypto/sha/sha256.c b/crypto/sha/sha256.c
index 72a11593697e..bf78f075eefb 100644
--- a/crypto/sha/sha256.c
+++ b/crypto/sha/sha256.c
@@ -1,22 +1,22 @@
-/* crypto/sha/sha256.c */
-/* ====================================================================
- * Copyright (c) 2004 The OpenSSL Project. All rights reserved
- * according to the OpenSSL license [found in ../../LICENSE].
- * ====================================================================
+/*
+ * Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
*/
-#include <openssl/opensslconf.h>
-#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA256)
-# include <stdlib.h>
-# include <string.h>
+#include <openssl/opensslconf.h>
-# include <openssl/crypto.h>
-# include <openssl/sha.h>
-# include <openssl/opensslv.h>
+#include <stdlib.h>
+#include <string.h>
-const char SHA256_version[] = "SHA-256" OPENSSL_VERSION_PTEXT;
+#include <openssl/crypto.h>
+#include <openssl/sha.h>
+#include <openssl/opensslv.h>
-fips_md_init_ctx(SHA224, SHA256)
+int SHA224_Init(SHA256_CTX *c)
{
memset(c, 0, sizeof(*c));
c->h[0] = 0xc1059ed8UL;
@@ -31,7 +31,7 @@ fips_md_init_ctx(SHA224, SHA256)
return 1;
}
-fips_md_init(SHA256)
+int SHA256_Init(SHA256_CTX *c)
{
memset(c, 0, sizeof(*c));
c->h[0] = 0x6a09e667UL;
@@ -57,7 +57,7 @@ unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md)
SHA256_Update(&c, d, n);
SHA256_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
- return (md);
+ return md;
}
unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
@@ -71,7 +71,7 @@ unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
SHA256_Update(&c, d, n);
SHA256_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
- return (md);
+ return md;
}
int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
@@ -84,20 +84,21 @@ int SHA224_Final(unsigned char *md, SHA256_CTX *c)
return SHA256_Final(md, c);
}
-# define DATA_ORDER_IS_BIG_ENDIAN
+#define DATA_ORDER_IS_BIG_ENDIAN
+
+#define HASH_LONG SHA_LONG
+#define HASH_CTX SHA256_CTX
+#define HASH_CBLOCK SHA_CBLOCK
-# define HASH_LONG SHA_LONG
-# define HASH_CTX SHA256_CTX
-# define HASH_CBLOCK SHA_CBLOCK
/*
* Note that FIPS180-2 discusses "Truncation of the Hash Function Output."
* default: case below covers for it. It's not clear however if it's
* permitted to truncate to amount of bytes not divisible by 4. I bet not,
* but if it is, then default: case shall be extended. For reference.
- * Idea behind separate cases for pre-defined lenghts is to let the
+ * Idea behind separate cases for pre-defined lengths is to let the
* compiler decide if it's appropriate to unroll small loops.
*/
-# define HASH_MAKE_STRING(c,s) do { \
+#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
unsigned int nn; \
switch ((c)->md_len) \
@@ -118,18 +119,18 @@ int SHA224_Final(unsigned char *md, SHA256_CTX *c)
} \
} while (0)
-# define HASH_UPDATE SHA256_Update
-# define HASH_TRANSFORM SHA256_Transform
-# define HASH_FINAL SHA256_Final
-# define HASH_BLOCK_DATA_ORDER sha256_block_data_order
-# ifndef SHA256_ASM
+#define HASH_UPDATE SHA256_Update
+#define HASH_TRANSFORM SHA256_Transform
+#define HASH_FINAL SHA256_Final
+#define HASH_BLOCK_DATA_ORDER sha256_block_data_order
+#ifndef SHA256_ASM
static
-# endif
+#endif
void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num);
-# include "md32_common.h"
+#include "internal/md32_common.h"
-# ifndef SHA256_ASM
+#ifndef SHA256_ASM
static const SHA_LONG K256[64] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
@@ -154,15 +155,15 @@ static const SHA_LONG K256[64] = {
* is left one. This is why you might notice that rotation coefficients
* differ from those observed in FIPS document by 32-N...
*/
-# define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10))
-# define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7))
-# define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
-# define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
+# define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10))
+# define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7))
+# define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
+# define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
-# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
-# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
+# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
-# ifdef OPENSSL_SMALL_FOOTPRINT
+# ifdef OPENSSL_SMALL_FOOTPRINT
static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
size_t num)
@@ -184,7 +185,7 @@ static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
h = ctx->h[7];
for (i = 0; i < 16; i++) {
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[i] = l;
T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i];
T2 = Sigma0(a) + Maj(a, b, c);
@@ -229,14 +230,14 @@ static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
}
}
-# else
+# else
-# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
+# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; \
h = Sigma0(a) + Maj(a,b,c); \
d += T1; h += T1; } while (0)
-# define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \
+# define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \
s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \
s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \
T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \
@@ -308,52 +309,52 @@ static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
} else {
SHA_LONG l;
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[0] = l;
ROUND_00_15(0, a, b, c, d, e, f, g, h);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[1] = l;
ROUND_00_15(1, h, a, b, c, d, e, f, g);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[2] = l;
ROUND_00_15(2, g, h, a, b, c, d, e, f);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[3] = l;
ROUND_00_15(3, f, g, h, a, b, c, d, e);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[4] = l;
ROUND_00_15(4, e, f, g, h, a, b, c, d);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[5] = l;
ROUND_00_15(5, d, e, f, g, h, a, b, c);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[6] = l;
ROUND_00_15(6, c, d, e, f, g, h, a, b);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[7] = l;
ROUND_00_15(7, b, c, d, e, f, g, h, a);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[8] = l;
ROUND_00_15(8, a, b, c, d, e, f, g, h);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[9] = l;
ROUND_00_15(9, h, a, b, c, d, e, f, g);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[10] = l;
ROUND_00_15(10, g, h, a, b, c, d, e, f);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[11] = l;
ROUND_00_15(11, f, g, h, a, b, c, d, e);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[12] = l;
ROUND_00_15(12, e, f, g, h, a, b, c, d);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[13] = l;
ROUND_00_15(13, d, e, f, g, h, a, b, c);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[14] = l;
ROUND_00_15(14, c, d, e, f, g, h, a, b);
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
T1 = X[15] = l;
ROUND_00_15(15, b, c, d, e, f, g, h, a);
}
@@ -381,7 +382,5 @@ static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
}
}
-# endif
-# endif /* SHA256_ASM */
-
-#endif /* OPENSSL_NO_SHA256 */
+# endif
+#endif /* SHA256_ASM */
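
For reference, a minimal caller-side sketch of the one-shot SHA256()/SHA224() entry points shown above; the message and buffer names are illustrative only, and both calls write into (and return) the caller-supplied buffer:

    #include <stdio.h>
    #include <openssl/sha.h>

    /* Minimal sketch: one-shot SHA-256 and SHA-224 of "abc" into caller-supplied buffers. */
    int main(void)
    {
        static const unsigned char msg[] = "abc";
        unsigned char md256[SHA256_DIGEST_LENGTH];   /* 32 bytes */
        unsigned char md224[SHA224_DIGEST_LENGTH];   /* 28 bytes */
        unsigned int i;

        SHA256(msg, sizeof(msg) - 1, md256);
        SHA224(msg, sizeof(msg) - 1, md224);

        for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
            printf("%02x", md256[i]);
        printf("\n");
        for (i = 0; i < SHA224_DIGEST_LENGTH; i++)
            printf("%02x", md224[i]);
        printf("\n");
        return 0;
    }
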
diff --git a/crypto/sha/sha256t.c b/crypto/sha/sha256t.c
deleted file mode 100644
index 35dbbc2a96d7..000000000000
--- a/crypto/sha/sha256t.c
+++ /dev/null
@@ -1,158 +0,0 @@
-/* crypto/sha/sha256t.c */
-/* ====================================================================
- * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
- * ====================================================================
- */
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include <openssl/sha.h>
-#include <openssl/evp.h>
-
-#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA256)
-int main(int argc, char *argv[])
-{
- printf("No SHA256 support\n");
- return (0);
-}
-#else
-
-unsigned char app_b1[SHA256_DIGEST_LENGTH] = {
- 0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
- 0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
- 0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
- 0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad
-};
-
-unsigned char app_b2[SHA256_DIGEST_LENGTH] = {
- 0x24, 0x8d, 0x6a, 0x61, 0xd2, 0x06, 0x38, 0xb8,
- 0xe5, 0xc0, 0x26, 0x93, 0x0c, 0x3e, 0x60, 0x39,
- 0xa3, 0x3c, 0xe4, 0x59, 0x64, 0xff, 0x21, 0x67,
- 0xf6, 0xec, 0xed, 0xd4, 0x19, 0xdb, 0x06, 0xc1
-};
-
-unsigned char app_b3[SHA256_DIGEST_LENGTH] = {
- 0xcd, 0xc7, 0x6e, 0x5c, 0x99, 0x14, 0xfb, 0x92,
- 0x81, 0xa1, 0xc7, 0xe2, 0x84, 0xd7, 0x3e, 0x67,
- 0xf1, 0x80, 0x9a, 0x48, 0xa4, 0x97, 0x20, 0x0e,
- 0x04, 0x6d, 0x39, 0xcc, 0xc7, 0x11, 0x2c, 0xd0
-};
-
-unsigned char addenum_1[SHA224_DIGEST_LENGTH] = {
- 0x23, 0x09, 0x7d, 0x22, 0x34, 0x05, 0xd8, 0x22,
- 0x86, 0x42, 0xa4, 0x77, 0xbd, 0xa2, 0x55, 0xb3,
- 0x2a, 0xad, 0xbc, 0xe4, 0xbd, 0xa0, 0xb3, 0xf7,
- 0xe3, 0x6c, 0x9d, 0xa7
-};
-
-unsigned char addenum_2[SHA224_DIGEST_LENGTH] = {
- 0x75, 0x38, 0x8b, 0x16, 0x51, 0x27, 0x76, 0xcc,
- 0x5d, 0xba, 0x5d, 0xa1, 0xfd, 0x89, 0x01, 0x50,
- 0xb0, 0xc6, 0x45, 0x5c, 0xb4, 0xf5, 0x8b, 0x19,
- 0x52, 0x52, 0x25, 0x25
-};
-
-unsigned char addenum_3[SHA224_DIGEST_LENGTH] = {
- 0x20, 0x79, 0x46, 0x55, 0x98, 0x0c, 0x91, 0xd8,
- 0xbb, 0xb4, 0xc1, 0xea, 0x97, 0x61, 0x8a, 0x4b,
- 0xf0, 0x3f, 0x42, 0x58, 0x19, 0x48, 0xb2, 0xee,
- 0x4e, 0xe7, 0xad, 0x67
-};
-
-int main(int argc, char **argv)
-{
- unsigned char md[SHA256_DIGEST_LENGTH];
- int i;
- EVP_MD_CTX evp;
-
- fprintf(stdout, "Testing SHA-256 ");
-
- EVP_Digest("abc", 3, md, NULL, EVP_sha256(), NULL);
- if (memcmp(md, app_b1, sizeof(app_b1))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 1 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- EVP_Digest("abcdbcde" "cdefdefg" "efghfghi" "ghijhijk"
- "ijkljklm" "klmnlmno" "mnopnopq", 56, md, NULL, EVP_sha256(),
- NULL);
- if (memcmp(md, app_b2, sizeof(app_b2))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 2 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- EVP_MD_CTX_init(&evp);
- EVP_DigestInit_ex(&evp, EVP_sha256(), NULL);
- for (i = 0; i < 1000000; i += 160)
- EVP_DigestUpdate(&evp, "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa",
- (1000000 - i) < 160 ? 1000000 - i : 160);
- EVP_DigestFinal_ex(&evp, md, NULL);
- EVP_MD_CTX_cleanup(&evp);
-
- if (memcmp(md, app_b3, sizeof(app_b3))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 3 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- fprintf(stdout, " passed.\n");
- fflush(stdout);
-
- fprintf(stdout, "Testing SHA-224 ");
-
- EVP_Digest("abc", 3, md, NULL, EVP_sha224(), NULL);
- if (memcmp(md, addenum_1, sizeof(addenum_1))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 1 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- EVP_Digest("abcdbcde" "cdefdefg" "efghfghi" "ghijhijk"
- "ijkljklm" "klmnlmno" "mnopnopq", 56, md, NULL, EVP_sha224(),
- NULL);
- if (memcmp(md, addenum_2, sizeof(addenum_2))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 2 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- EVP_MD_CTX_init(&evp);
- EVP_DigestInit_ex(&evp, EVP_sha224(), NULL);
- for (i = 0; i < 1000000; i += 64)
- EVP_DigestUpdate(&evp, "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa",
- (1000000 - i) < 64 ? 1000000 - i : 64);
- EVP_DigestFinal_ex(&evp, md, NULL);
- EVP_MD_CTX_cleanup(&evp);
-
- if (memcmp(md, addenum_3, sizeof(addenum_3))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 3 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- fprintf(stdout, " passed.\n");
- fflush(stdout);
-
- return 0;
-}
-#endif
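
The deleted self-test above drove the FIPS 180-2 vectors through a stack-allocated EVP_MD_CTX with EVP_MD_CTX_init()/EVP_MD_CTX_cleanup(); with the opaque-context EVP API the equivalent "abc" check looks roughly like this sketch (expected digest copied from app_b1 above):

    #include <string.h>
    #include <openssl/evp.h>

    /* Sketch of the removed "abc" check against the opaque EVP_MD_CTX API. */
    int main(void)
    {
        static const unsigned char app_b1[32] = {
            0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
            0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
            0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
            0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad
        };
        unsigned char md[EVP_MAX_MD_SIZE];
        unsigned int mdlen = 0;
        EVP_MD_CTX *ctx = EVP_MD_CTX_new();
        int ok;

        ok = ctx != NULL
             && EVP_DigestInit_ex(ctx, EVP_sha256(), NULL)
             && EVP_DigestUpdate(ctx, "abc", 3)
             && EVP_DigestFinal_ex(ctx, md, &mdlen);
        EVP_MD_CTX_free(ctx);

        return !(ok && mdlen == sizeof(app_b1) && memcmp(md, app_b1, mdlen) == 0);
    }
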
diff --git a/crypto/sha/sha512.c b/crypto/sha/sha512.c
index 3bf66ae1987e..50b65ee811d6 100644
--- a/crypto/sha/sha512.c
+++ b/crypto/sha/sha512.c
@@ -1,17 +1,19 @@
-/* crypto/sha/sha512.c */
-/* ====================================================================
- * Copyright (c) 2004 The OpenSSL Project. All rights reserved
- * according to the OpenSSL license [found in ../../LICENSE].
- * ====================================================================
+/*
+ * Copyright 2004-2018 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
*/
+
#include <openssl/opensslconf.h>
-#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA512)
/*-
* IMPLEMENTATION NOTES.
*
* As you might have noticed 32-bit hash algorithms:
*
- * - permit SHA_LONG to be wider than 32-bit (case on CRAY);
+ * - permit SHA_LONG to be wider than 32-bit
* - optimized versions implement two transform functions: one operating
* on [aligned] data in host byte order and one - on data in input
* stream byte order;
@@ -39,28 +41,62 @@
* As this implementation relies on 64-bit integer type, it's totally
* inappropriate for platforms which don't support it, most notably
* 16-bit platforms.
- * <appro@fy.chalmers.se>
*/
-# include <stdlib.h>
-# include <string.h>
+#include <stdlib.h>
+#include <string.h>
-# include <openssl/crypto.h>
-# include <openssl/sha.h>
-# include <openssl/opensslv.h>
+#include <openssl/crypto.h>
+#include <openssl/sha.h>
+#include <openssl/opensslv.h>
-# include "cryptlib.h"
+#include "internal/cryptlib.h"
+#include "internal/sha.h"
-const char SHA512_version[] = "SHA-512" OPENSSL_VERSION_PTEXT;
-
-# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
+#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \
defined(__s390__) || defined(__s390x__) || \
defined(__aarch64__) || \
defined(SHA512_ASM)
-# define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
-# endif
+# define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+#endif
-fips_md_init_ctx(SHA384, SHA512)
+int sha512_224_init(SHA512_CTX *c)
+{
+ c->h[0] = U64(0x8c3d37c819544da2);
+ c->h[1] = U64(0x73e1996689dcd4d6);
+ c->h[2] = U64(0x1dfab7ae32ff9c82);
+ c->h[3] = U64(0x679dd514582f9fcf);
+ c->h[4] = U64(0x0f6d2b697bd44da8);
+ c->h[5] = U64(0x77e36f7304c48942);
+ c->h[6] = U64(0x3f9d85a86a1d36c8);
+ c->h[7] = U64(0x1112e6ad91d692a1);
+
+ c->Nl = 0;
+ c->Nh = 0;
+ c->num = 0;
+ c->md_len = SHA224_DIGEST_LENGTH;
+ return 1;
+}
+
+int sha512_256_init(SHA512_CTX *c)
+{
+ c->h[0] = U64(0x22312194fc2bf72c);
+ c->h[1] = U64(0x9f555fa3c84c64c2);
+ c->h[2] = U64(0x2393b86b6f53b151);
+ c->h[3] = U64(0x963877195940eabd);
+ c->h[4] = U64(0x96283ee2a88effe3);
+ c->h[5] = U64(0xbe5e1e2553863992);
+ c->h[6] = U64(0x2b0199fc2c85b8aa);
+ c->h[7] = U64(0x0eb72ddc81c52ca2);
+
+ c->Nl = 0;
+ c->Nh = 0;
+ c->num = 0;
+ c->md_len = SHA256_DIGEST_LENGTH;
+ return 1;
+}
+
+int SHA384_Init(SHA512_CTX *c)
{
c->h[0] = U64(0xcbbb9d5dc1059ed8);
c->h[1] = U64(0x629a292a367cd507);
@@ -78,7 +114,7 @@ fips_md_init_ctx(SHA384, SHA512)
return 1;
}
-fips_md_init(SHA512)
+int SHA512_Init(SHA512_CTX *c)
{
c->h[0] = U64(0x6a09e667f3bcc908);
c->h[1] = U64(0xbb67ae8584caa73b);
@@ -96,9 +132,9 @@ fips_md_init(SHA512)
return 1;
}
-# ifndef SHA512_ASM
+#ifndef SHA512_ASM
static
-# endif
+#endif
void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num);
int SHA512_Final(unsigned char *md, SHA512_CTX *c)
@@ -108,15 +144,17 @@ int SHA512_Final(unsigned char *md, SHA512_CTX *c)
p[n] = 0x80; /* There always is a room for one */
n++;
- if (n > (sizeof(c->u) - 16))
- memset(p + n, 0, sizeof(c->u) - n), n = 0,
- sha512_block_data_order(c, p, 1);
+ if (n > (sizeof(c->u) - 16)) {
+ memset(p + n, 0, sizeof(c->u) - n);
+ n = 0;
+ sha512_block_data_order(c, p, 1);
+ }
memset(p + n, 0, sizeof(c->u) - 16 - n);
-# ifdef B_ENDIAN
+#ifdef B_ENDIAN
c->u.d[SHA_LBLOCK - 2] = c->Nh;
c->u.d[SHA_LBLOCK - 1] = c->Nl;
-# else
+#else
p[sizeof(c->u) - 1] = (unsigned char)(c->Nl);
p[sizeof(c->u) - 2] = (unsigned char)(c->Nl >> 8);
p[sizeof(c->u) - 3] = (unsigned char)(c->Nl >> 16);
@@ -133,7 +171,7 @@ int SHA512_Final(unsigned char *md, SHA512_CTX *c)
p[sizeof(c->u) - 14] = (unsigned char)(c->Nh >> 40);
p[sizeof(c->u) - 15] = (unsigned char)(c->Nh >> 48);
p[sizeof(c->u) - 16] = (unsigned char)(c->Nh >> 56);
-# endif
+#endif
sha512_block_data_order(c, p, 1);
@@ -141,7 +179,47 @@ int SHA512_Final(unsigned char *md, SHA512_CTX *c)
return 0;
switch (c->md_len) {
- /* Let compiler decide if it's appropriate to unroll... */
+ /* Let compiler decide if it's appropriate to unroll... */
+ case SHA224_DIGEST_LENGTH:
+ for (n = 0; n < SHA224_DIGEST_LENGTH / 8; n++) {
+ SHA_LONG64 t = c->h[n];
+
+ *(md++) = (unsigned char)(t >> 56);
+ *(md++) = (unsigned char)(t >> 48);
+ *(md++) = (unsigned char)(t >> 40);
+ *(md++) = (unsigned char)(t >> 32);
+ *(md++) = (unsigned char)(t >> 24);
+ *(md++) = (unsigned char)(t >> 16);
+ *(md++) = (unsigned char)(t >> 8);
+ *(md++) = (unsigned char)(t);
+ }
+ /*
+ * For 224 bits, there are four bytes left over that have to be
+ * processed separately.
+ */
+ {
+ SHA_LONG64 t = c->h[SHA224_DIGEST_LENGTH / 8];
+
+ *(md++) = (unsigned char)(t >> 56);
+ *(md++) = (unsigned char)(t >> 48);
+ *(md++) = (unsigned char)(t >> 40);
+ *(md++) = (unsigned char)(t >> 32);
+ }
+ break;
+ case SHA256_DIGEST_LENGTH:
+ for (n = 0; n < SHA256_DIGEST_LENGTH / 8; n++) {
+ SHA_LONG64 t = c->h[n];
+
+ *(md++) = (unsigned char)(t >> 56);
+ *(md++) = (unsigned char)(t >> 48);
+ *(md++) = (unsigned char)(t >> 40);
+ *(md++) = (unsigned char)(t >> 32);
+ *(md++) = (unsigned char)(t >> 24);
+ *(md++) = (unsigned char)(t >> 16);
+ *(md++) = (unsigned char)(t >> 8);
+ *(md++) = (unsigned char)(t);
+ }
+ break;
case SHA384_DIGEST_LENGTH:
for (n = 0; n < SHA384_DIGEST_LENGTH / 8; n++) {
SHA_LONG64 t = c->h[n];
@@ -170,7 +248,7 @@ int SHA512_Final(unsigned char *md, SHA512_CTX *c)
*(md++) = (unsigned char)(t);
}
break;
- /* ... as well as make sure md_len is not abused. */
+ /* ... as well as make sure md_len is not abused. */
default:
return 0;
}
@@ -213,16 +291,16 @@ int SHA512_Update(SHA512_CTX *c, const void *_data, size_t len)
}
if (len >= sizeof(c->u)) {
-# ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
if ((size_t)data % sizeof(c->u.d[0]) != 0)
while (len >= sizeof(c->u))
memcpy(p, data, sizeof(c->u)),
- sha512_block_data_order(c, p, 1),
- len -= sizeof(c->u), data += sizeof(c->u);
+ sha512_block_data_order(c, p, 1),
+ len -= sizeof(c->u), data += sizeof(c->u);
else
-# endif
+#endif
sha512_block_data_order(c, data, len / sizeof(c->u)),
- data += len, len %= sizeof(c->u), data -= len;
+ data += len, len %= sizeof(c->u), data -= len;
}
if (len != 0)
@@ -238,10 +316,10 @@ int SHA384_Update(SHA512_CTX *c, const void *data, size_t len)
void SHA512_Transform(SHA512_CTX *c, const unsigned char *data)
{
-# ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
if ((size_t)data % sizeof(c->u.d[0]) != 0)
memcpy(c->u.p, data, sizeof(c->u.p)), data = c->u.p;
-# endif
+#endif
sha512_block_data_order(c, data, 1);
}
@@ -256,7 +334,7 @@ unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
SHA512_Update(&c, d, n);
SHA512_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
- return (md);
+ return md;
}
unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
@@ -270,10 +348,10 @@ unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
SHA512_Update(&c, d, n);
SHA512_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
- return (md);
+ return md;
}
-# ifndef SHA512_ASM
+#ifndef SHA512_ASM
static const SHA_LONG64 K512[80] = {
U64(0x428a2f98d728ae22), U64(0x7137449123ef65cd),
U64(0xb5c0fbcfec4d3b2f), U64(0xe9b5dba58189dbbc),
@@ -317,103 +395,111 @@ static const SHA_LONG64 K512[80] = {
U64(0x5fcb6fab3ad6faec), U64(0x6c44198c4a475817)
};
-# ifndef PEDANTIC
-# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-# if defined(__x86_64) || defined(__x86_64__)
-# define ROTR(a,n) ({ SHA_LONG64 ret; \
+# ifndef PEDANTIC
+# if defined(__GNUC__) && __GNUC__>=2 && \
+ !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+# if defined(__x86_64) || defined(__x86_64__)
+# define ROTR(a,n) ({ SHA_LONG64 ret; \
asm ("rorq %1,%0" \
: "=r"(ret) \
: "J"(n),"0"(a) \
: "cc"); ret; })
-# if !defined(B_ENDIAN)
-# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
+# if !defined(B_ENDIAN)
+# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
asm ("bswapq %0" \
: "=r"(ret) \
: "0"(ret)); ret; })
-# endif
-# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN)
-# if defined(I386_ONLY)
-# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
- unsigned int hi=p[0],lo=p[1]; \
+# endif
+# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN)
+# if defined(I386_ONLY)
+# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
+ unsigned int hi=p[0],lo=p[1]; \
asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
"roll $16,%%eax; roll $16,%%edx; "\
- "xchgb %%ah,%%al;xchgb %%dh,%%dl;" \
+ "xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
: "=a"(lo),"=d"(hi) \
: "0"(lo),"1"(hi) : "cc"); \
((SHA_LONG64)hi)<<32|lo; })
-# else
-# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
- unsigned int hi=p[0],lo=p[1]; \
+# else
+# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
+ unsigned int hi=p[0],lo=p[1]; \
asm ("bswapl %0; bswapl %1;" \
: "=r"(lo),"=r"(hi) \
: "0"(lo),"1"(hi)); \
((SHA_LONG64)hi)<<32|lo; })
-# endif
-# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
-# define ROTR(a,n) ({ SHA_LONG64 ret; \
+# endif
+# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
+# define ROTR(a,n) ({ SHA_LONG64 ret; \
asm ("rotrdi %0,%1,%2" \
: "=r"(ret) \
: "r"(a),"K"(n)); ret; })
-# elif defined(__aarch64__)
-# define ROTR(a,n) ({ SHA_LONG64 ret; \
+# elif defined(__aarch64__)
+# define ROTR(a,n) ({ SHA_LONG64 ret; \
asm ("ror %0,%1,%2" \
: "=r"(ret) \
: "r"(a),"I"(n)); ret; })
-# if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+# if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
__BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
-# define PULL64(x) ({ SHA_LONG64 ret; \
+# define PULL64(x) ({ SHA_LONG64 ret; \
asm ("rev %0,%1" \
: "=r"(ret) \
- : "r"(*((const SHA_LONG64 *)(&(x))))); ret; })
-# endif
-# endif
-# elif defined(_MSC_VER)
-# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
-# pragma intrinsic(_rotr64)
-# define ROTR(a,n) _rotr64((a),n)
+ : "r"(*((const SHA_LONG64 *)(&(x))))); ret; })
# endif
-# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-# if defined(I386_ONLY)
+# endif
+# elif defined(_MSC_VER)
+# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
+# pragma intrinsic(_rotr64)
+# define ROTR(a,n) _rotr64((a),n)
+# endif
+# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && \
+ !defined(OPENSSL_NO_INLINE_ASM)
+# if defined(I386_ONLY)
static SHA_LONG64 __fastcall __pull64be(const void *x)
{
- _asm mov edx,[ecx + 0]
- _asm mov eax,[ecx + 4]
-_asm xchg dh, dl
- _asm xchg ah, al
- _asm rol edx, 16 _asm rol eax, 16 _asm xchg dh, dl _asm xchg ah, al}
-# else
+ _asm mov edx,[ecx + 0]
+ _asm mov eax,[ecx + 4]
+ _asm xchg dh, dl
+ _asm xchg ah, al
+ _asm rol edx, 16
+ _asm rol eax, 16
+ _asm xchg dh, dl
+ _asm xchg ah, al
+}
+# else
static SHA_LONG64 __fastcall __pull64be(const void *x)
{
- _asm mov edx,[ecx + 0]
- _asm mov eax,[ecx + 4]
-_asm bswap edx _asm bswap eax}
-# endif
-# define PULL64(x) __pull64be(&(x))
-# if _MSC_VER<=1200
-# pragma inline_depth(0)
-# endif
+ _asm mov edx,[ecx + 0]
+ _asm mov eax,[ecx + 4]
+ _asm bswap edx
+ _asm bswap eax
+}
# endif
+# define PULL64(x) __pull64be(&(x))
# endif
# endif
-# ifndef PULL64
-# define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
-# define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
-# endif
-# ifndef ROTR
-# define ROTR(x,s) (((x)>>s) | (x)<<(64-s))
-# endif
-# define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
-# define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
-# define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
-# define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
-# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
-# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
-# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# endif
+# ifndef PULL64
+# define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
+# define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
+# endif
+# ifndef ROTR
+# define ROTR(x,s) (((x)>>s) | (x)<<(64-s))
+# endif
+# define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+# define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+# define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+# define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
+# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
/*
* This code should give better results on 32-bit CPU with less than
* ~24 registers, both size and performance wise...
- */ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
- size_t num)
+ */
+
+static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
+ size_t num)
{
const SHA_LONG64 *W = in;
SHA_LONG64 A, E, T;
@@ -433,11 +519,11 @@ _asm bswap edx _asm bswap eax}
F[7] = ctx->h[7];
for (i = 0; i < 16; i++, F--) {
-# ifdef B_ENDIAN
+# ifdef B_ENDIAN
T = W[i];
-# else
+# else
T = PULL64(W[i]);
-# endif
+# endif
F[0] = A;
F[4] = E;
F[8] = T;
@@ -472,7 +558,8 @@ _asm bswap edx _asm bswap eax}
}
}
-# elif defined(OPENSSL_SMALL_FOOTPRINT)
+# elif defined(OPENSSL_SMALL_FOOTPRINT)
+
static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
size_t num)
{
@@ -493,11 +580,11 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
h = ctx->h[7];
for (i = 0; i < 16; i++) {
-# ifdef B_ENDIAN
+# ifdef B_ENDIAN
T1 = X[i] = W[i];
-# else
+# else
T1 = X[i] = PULL64(W[i]);
-# endif
+# endif
T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i];
T2 = Sigma0(a) + Maj(a, b, c);
h = g;
@@ -542,16 +629,18 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
}
}
-# else
-# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
+# else
+# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i]; \
h = Sigma0(a) + Maj(a,b,c); \
- d += T1; h += T1; } while (0)
-# define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
+ d += T1; h += T1; } while (0)
+
+# define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \
s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \
T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \
ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0)
+
static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
size_t num)
{
@@ -571,7 +660,7 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
g = ctx->h[6];
h = ctx->h[7];
-# ifdef B_ENDIAN
+# ifdef B_ENDIAN
T1 = X[0] = W[0];
ROUND_00_15(0, a, b, c, d, e, f, g, h);
T1 = X[1] = W[1];
@@ -604,7 +693,7 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
ROUND_00_15(14, c, d, e, f, g, h, a, b);
T1 = X[15] = W[15];
ROUND_00_15(15, b, c, d, e, f, g, h, a);
-# else
+# else
T1 = X[0] = PULL64(W[0]);
ROUND_00_15(0, a, b, c, d, e, f, g, h);
T1 = X[1] = PULL64(W[1]);
@@ -637,7 +726,7 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
ROUND_00_15(14, c, d, e, f, g, h, a, b);
T1 = X[15] = PULL64(W[15]);
ROUND_00_15(15, b, c, d, e, f, g, h, a);
-# endif
+# endif
for (i = 16; i < 80; i += 16) {
ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X);
@@ -671,14 +760,6 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
}
}
-# endif
-
-# endif /* SHA512_ASM */
-
-#else /* !OPENSSL_NO_SHA512 */
-
-# if defined(PEDANTIC) || defined(__DECC) || defined(OPENSSL_SYS_MACOSX)
-static void *dummy = &dummy;
# endif
-#endif /* !OPENSSL_NO_SHA512 */
+#endif /* SHA512_ASM */
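
The sha512_224_init()/sha512_256_init() routines added above load the FIPS 180-4 SHA-512/224 and SHA-512/256 initial hash values and set md_len so that the new truncation cases in SHA512_Final() emit 28 or 32 bytes. They are internal (declared via "internal/sha.h"); a sketch of how an application would reach them, assuming this build exposes the matching EVP_sha512_256() digest method:

    #include <stdio.h>
    #include <openssl/evp.h>

    /* Sketch only: assumes EVP_sha512_256() is exposed by this OpenSSL build. */
    int main(void)
    {
        unsigned char md[EVP_MAX_MD_SIZE];
        unsigned int mdlen = 0, i;

        if (!EVP_Digest("abc", 3, md, &mdlen, EVP_sha512_256(), NULL))
            return 1;
        /* 32-byte digest computed with the SHA-512 compression function but a
         * distinct IV, so it is not a prefix of SHA-512("abc"). */
        for (i = 0; i < mdlen; i++)
            printf("%02x", md[i]);
        printf("\n");
        return 0;
    }
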
diff --git a/crypto/sha/sha512t.c b/crypto/sha/sha512t.c
deleted file mode 100644
index 178882fc76b6..000000000000
--- a/crypto/sha/sha512t.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/* crypto/sha/sha512t.c */
-/* ====================================================================
- * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
- * ====================================================================
- */
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include <openssl/sha.h>
-#include <openssl/evp.h>
-#include <openssl/crypto.h>
-
-#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA512)
-int main(int argc, char *argv[])
-{
- printf("No SHA512 support\n");
- return (0);
-}
-#else
-
-unsigned char app_c1[SHA512_DIGEST_LENGTH] = {
- 0xdd, 0xaf, 0x35, 0xa1, 0x93, 0x61, 0x7a, 0xba,
- 0xcc, 0x41, 0x73, 0x49, 0xae, 0x20, 0x41, 0x31,
- 0x12, 0xe6, 0xfa, 0x4e, 0x89, 0xa9, 0x7e, 0xa2,
- 0x0a, 0x9e, 0xee, 0xe6, 0x4b, 0x55, 0xd3, 0x9a,
- 0x21, 0x92, 0x99, 0x2a, 0x27, 0x4f, 0xc1, 0xa8,
- 0x36, 0xba, 0x3c, 0x23, 0xa3, 0xfe, 0xeb, 0xbd,
- 0x45, 0x4d, 0x44, 0x23, 0x64, 0x3c, 0xe8, 0x0e,
- 0x2a, 0x9a, 0xc9, 0x4f, 0xa5, 0x4c, 0xa4, 0x9f
-};
-
-unsigned char app_c2[SHA512_DIGEST_LENGTH] = {
- 0x8e, 0x95, 0x9b, 0x75, 0xda, 0xe3, 0x13, 0xda,
- 0x8c, 0xf4, 0xf7, 0x28, 0x14, 0xfc, 0x14, 0x3f,
- 0x8f, 0x77, 0x79, 0xc6, 0xeb, 0x9f, 0x7f, 0xa1,
- 0x72, 0x99, 0xae, 0xad, 0xb6, 0x88, 0x90, 0x18,
- 0x50, 0x1d, 0x28, 0x9e, 0x49, 0x00, 0xf7, 0xe4,
- 0x33, 0x1b, 0x99, 0xde, 0xc4, 0xb5, 0x43, 0x3a,
- 0xc7, 0xd3, 0x29, 0xee, 0xb6, 0xdd, 0x26, 0x54,
- 0x5e, 0x96, 0xe5, 0x5b, 0x87, 0x4b, 0xe9, 0x09
-};
-
-unsigned char app_c3[SHA512_DIGEST_LENGTH] = {
- 0xe7, 0x18, 0x48, 0x3d, 0x0c, 0xe7, 0x69, 0x64,
- 0x4e, 0x2e, 0x42, 0xc7, 0xbc, 0x15, 0xb4, 0x63,
- 0x8e, 0x1f, 0x98, 0xb1, 0x3b, 0x20, 0x44, 0x28,
- 0x56, 0x32, 0xa8, 0x03, 0xaf, 0xa9, 0x73, 0xeb,
- 0xde, 0x0f, 0xf2, 0x44, 0x87, 0x7e, 0xa6, 0x0a,
- 0x4c, 0xb0, 0x43, 0x2c, 0xe5, 0x77, 0xc3, 0x1b,
- 0xeb, 0x00, 0x9c, 0x5c, 0x2c, 0x49, 0xaa, 0x2e,
- 0x4e, 0xad, 0xb2, 0x17, 0xad, 0x8c, 0xc0, 0x9b
-};
-
-unsigned char app_d1[SHA384_DIGEST_LENGTH] = {
- 0xcb, 0x00, 0x75, 0x3f, 0x45, 0xa3, 0x5e, 0x8b,
- 0xb5, 0xa0, 0x3d, 0x69, 0x9a, 0xc6, 0x50, 0x07,
- 0x27, 0x2c, 0x32, 0xab, 0x0e, 0xde, 0xd1, 0x63,
- 0x1a, 0x8b, 0x60, 0x5a, 0x43, 0xff, 0x5b, 0xed,
- 0x80, 0x86, 0x07, 0x2b, 0xa1, 0xe7, 0xcc, 0x23,
- 0x58, 0xba, 0xec, 0xa1, 0x34, 0xc8, 0x25, 0xa7
-};
-
-unsigned char app_d2[SHA384_DIGEST_LENGTH] = {
- 0x09, 0x33, 0x0c, 0x33, 0xf7, 0x11, 0x47, 0xe8,
- 0x3d, 0x19, 0x2f, 0xc7, 0x82, 0xcd, 0x1b, 0x47,
- 0x53, 0x11, 0x1b, 0x17, 0x3b, 0x3b, 0x05, 0xd2,
- 0x2f, 0xa0, 0x80, 0x86, 0xe3, 0xb0, 0xf7, 0x12,
- 0xfc, 0xc7, 0xc7, 0x1a, 0x55, 0x7e, 0x2d, 0xb9,
- 0x66, 0xc3, 0xe9, 0xfa, 0x91, 0x74, 0x60, 0x39
-};
-
-unsigned char app_d3[SHA384_DIGEST_LENGTH] = {
- 0x9d, 0x0e, 0x18, 0x09, 0x71, 0x64, 0x74, 0xcb,
- 0x08, 0x6e, 0x83, 0x4e, 0x31, 0x0a, 0x4a, 0x1c,
- 0xed, 0x14, 0x9e, 0x9c, 0x00, 0xf2, 0x48, 0x52,
- 0x79, 0x72, 0xce, 0xc5, 0x70, 0x4c, 0x2a, 0x5b,
- 0x07, 0xb8, 0xb3, 0xdc, 0x38, 0xec, 0xc4, 0xeb,
- 0xae, 0x97, 0xdd, 0xd8, 0x7f, 0x3d, 0x89, 0x85
-};
-
-int main(int argc, char **argv)
-{
- unsigned char md[SHA512_DIGEST_LENGTH];
- int i;
- EVP_MD_CTX evp;
-
-# ifdef OPENSSL_IA32_SSE2
- /*
- * Alternative to this is to call OpenSSL_add_all_algorithms... The below
- * code is retained exclusively for debugging purposes.
- */
- {
- char *env;
-
- if ((env = getenv("OPENSSL_ia32cap")))
- OPENSSL_ia32cap = strtoul(env, NULL, 0);
- }
-# endif
-
- fprintf(stdout, "Testing SHA-512 ");
-
- EVP_Digest("abc", 3, md, NULL, EVP_sha512(), NULL);
- if (memcmp(md, app_c1, sizeof(app_c1))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 1 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- EVP_Digest("abcdefgh" "bcdefghi" "cdefghij" "defghijk"
- "efghijkl" "fghijklm" "ghijklmn" "hijklmno"
- "ijklmnop" "jklmnopq" "klmnopqr" "lmnopqrs"
- "mnopqrst" "nopqrstu", 112, md, NULL, EVP_sha512(), NULL);
- if (memcmp(md, app_c2, sizeof(app_c2))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 2 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- EVP_MD_CTX_init(&evp);
- EVP_DigestInit_ex(&evp, EVP_sha512(), NULL);
- for (i = 0; i < 1000000; i += 288)
- EVP_DigestUpdate(&evp, "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa",
- (1000000 - i) < 288 ? 1000000 - i : 288);
- EVP_DigestFinal_ex(&evp, md, NULL);
- EVP_MD_CTX_cleanup(&evp);
-
- if (memcmp(md, app_c3, sizeof(app_c3))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 3 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- fprintf(stdout, " passed.\n");
- fflush(stdout);
-
- fprintf(stdout, "Testing SHA-384 ");
-
- EVP_Digest("abc", 3, md, NULL, EVP_sha384(), NULL);
- if (memcmp(md, app_d1, sizeof(app_d1))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 1 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- EVP_Digest("abcdefgh" "bcdefghi" "cdefghij" "defghijk"
- "efghijkl" "fghijklm" "ghijklmn" "hijklmno"
- "ijklmnop" "jklmnopq" "klmnopqr" "lmnopqrs"
- "mnopqrst" "nopqrstu", 112, md, NULL, EVP_sha384(), NULL);
- if (memcmp(md, app_d2, sizeof(app_d2))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 2 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- EVP_MD_CTX_init(&evp);
- EVP_DigestInit_ex(&evp, EVP_sha384(), NULL);
- for (i = 0; i < 1000000; i += 64)
- EVP_DigestUpdate(&evp, "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa"
- "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa",
- (1000000 - i) < 64 ? 1000000 - i : 64);
- EVP_DigestFinal_ex(&evp, md, NULL);
- EVP_MD_CTX_cleanup(&evp);
-
- if (memcmp(md, app_d3, sizeof(app_d3))) {
- fflush(stdout);
- fprintf(stderr, "\nTEST 3 of 3 failed.\n");
- return 1;
- } else
- fprintf(stdout, ".");
- fflush(stdout);
-
- fprintf(stdout, " passed.\n");
- fflush(stdout);
-
- return 0;
-}
-#endif
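
The deleted million-character case above feeds EVP_DigestUpdate() in chunks; the same streaming pattern is available directly on the low-level API. A sketch for the SHA-512 long-message vector (app_c3 above), with an arbitrary chunk size:

    #include <stdio.h>
    #include <string.h>
    #include <openssl/sha.h>

    /* Streaming sketch: hash 1,000,000 'a' bytes in fixed-size chunks. */
    int main(void)
    {
        unsigned char chunk[288];
        unsigned char md[SHA512_DIGEST_LENGTH];
        size_t total = 1000000, done = 0;
        unsigned int i;
        SHA512_CTX ctx;

        memset(chunk, 'a', sizeof(chunk));
        SHA512_Init(&ctx);
        while (done < total) {
            size_t n = total - done < sizeof(chunk) ? total - done : sizeof(chunk);
            SHA512_Update(&ctx, chunk, n);
            done += n;
        }
        SHA512_Final(md, &ctx);

        for (i = 0; i < SHA512_DIGEST_LENGTH; i++)
            printf("%02x", md[i]);
        printf("\n");
        return 0;
    }
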
diff --git a/crypto/sha/sha_dgst.c b/crypto/sha/sha_dgst.c
deleted file mode 100644
index f77cf5e38d8e..000000000000
--- a/crypto/sha/sha_dgst.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/* crypto/sha/sha1dgst.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
-#include <openssl/crypto.h>
-#include <openssl/opensslconf.h>
-#if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
-
-# undef SHA_1
-# define SHA_0
-
-# include <openssl/opensslv.h>
-
-const char SHA_version[] = "SHA" OPENSSL_VERSION_PTEXT;
-
-/* The implementation is in ../md32_common.h */
-
-# include "sha_locl.h"
-
-#endif
diff --git a/crypto/sha/sha_locl.h b/crypto/sha/sha_locl.h
index 03bd411ede69..4e5a09038267 100644
--- a/crypto/sha/sha_locl.h
+++ b/crypto/sha/sha_locl.h
@@ -1,59 +1,10 @@
-/* crypto/sha/sha_locl.h */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+/*
+ * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
*/
#include <stdlib.h>
@@ -76,45 +27,22 @@
ll=(c)->h4; (void)HOST_l2c(ll,(s)); \
} while (0)
-#if defined(SHA_0)
-
-# define HASH_UPDATE SHA_Update
-# define HASH_TRANSFORM SHA_Transform
-# define HASH_FINAL SHA_Final
-# define HASH_INIT SHA_Init
-# define HASH_BLOCK_DATA_ORDER sha_block_data_order
-# define Xupdate(a,ix,ia,ib,ic,id) (ix=(a)=(ia^ib^ic^id))
-
-static void sha_block_data_order(SHA_CTX *c, const void *p, size_t num);
-
-#elif defined(SHA_1)
-
-# define HASH_UPDATE SHA1_Update
-# define HASH_TRANSFORM SHA1_Transform
-# define HASH_FINAL SHA1_Final
-# define HASH_INIT SHA1_Init
-# define HASH_BLOCK_DATA_ORDER sha1_block_data_order
-# if defined(__MWERKS__) && defined(__MC68K__)
- /* Metrowerks for Motorola fails otherwise:-( <appro@fy.chalmers.se> */
-# define Xupdate(a,ix,ia,ib,ic,id) do { (a)=(ia^ib^ic^id); \
- ix=(a)=ROTATE((a),1); \
- } while (0)
-# else
-# define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \
+#define HASH_UPDATE SHA1_Update
+#define HASH_TRANSFORM SHA1_Transform
+#define HASH_FINAL SHA1_Final
+#define HASH_INIT SHA1_Init
+#define HASH_BLOCK_DATA_ORDER sha1_block_data_order
+#define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \
ix=(a)=ROTATE((a),1) \
)
-# endif
-
-# ifndef SHA1_ASM
-static
-# endif
-void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);
+#ifndef SHA1_ASM
+static void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);
#else
-# error "Either SHA_0 or SHA_1 must be defined."
+void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);
#endif
-#include "md32_common.h"
+#include "internal/md32_common.h"
#define INIT_DATA_h0 0x67452301UL
#define INIT_DATA_h1 0xefcdab89UL
@@ -122,11 +50,7 @@ void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);
#define INIT_DATA_h3 0x10325476UL
#define INIT_DATA_h4 0xc3d2e1f0UL
-#ifdef SHA_0
-fips_md_init(SHA)
-#else
-fips_md_init_ctx(SHA1, SHA)
-#endif
+int HASH_INIT(SHA_CTX *c)
{
memset(c, 0, sizeof(*c));
c->h0 = INIT_DATA_h0;
@@ -143,11 +67,12 @@ fips_md_init_ctx(SHA1, SHA)
#define K_60_79 0xca62c1d6UL
/*
- * As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
- * to the code in F_00_19. Wei attributes these optimisations to Peter
- * Gutmann's SHS code, and he attributes it to Rich Schroeppel. #define
- * F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another
- * tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a
+ * As pointed out by Wei Dai, F() below can be simplified to the code in
+ * F_00_19. Wei attributes these optimizations to Peter Gutmann's SHS code,
+ * and he attributes it to Rich Schroeppel.
+ * #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
+ * I've just become aware of another tweak to be made, again from Wei Dai,
+ * in F_40_59, (x&a)|(y&a) -> (x|y)&a
*/
#define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b,c,d) ((b) ^ (c) ^ (d))
@@ -191,12 +116,11 @@ fips_md_init_ctx(SHA1, SHA)
# ifndef MD32_XARRAY
/*
* Originally X was an array. As it's automatic it's natural
- * to expect RISC compiler to accomodate at least part of it in
+ * to expect RISC compiler to accommodate at least part of it in
* the register bank, isn't it? Unfortunately not all compilers
* "find" this expectation reasonable:-( On order to make such
* compilers generate better code I replace X[] with a bunch of
* X0, X1, etc. See the function body below...
- * <appro@fy.chalmers.se>
*/
# define X(i) XX##i
# else
@@ -208,7 +132,7 @@ fips_md_init_ctx(SHA1, SHA)
# define X(i) XX[i]
# endif
-# if !defined(SHA_1) || !defined(SHA1_ASM)
+# if !defined(SHA1_ASM)
static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num)
{
const unsigned char *data = p;
@@ -442,7 +366,7 @@ static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num)
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T+xa; } while(0)
-# if !defined(SHA_1) || !defined(SHA1_ASM)
+# if !defined(SHA1_ASM)
static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num)
{
const unsigned char *data = p;
@@ -458,7 +382,7 @@ static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num)
for (;;) {
for (i = 0; i < 16; i++) {
- HOST_c2l(data, l);
+ (void)HOST_c2l(data, l);
X[i] = l;
BODY_00_15(X[i]);
}
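
The Wei Dai remark kept in the header above is a plain boolean identity: per bit, F picks c when b is set and d otherwise, so the textbook choice function (b & c) | (~b & d) and the cheaper F_00_19 form ((c ^ d) & b) ^ d agree on every input. A self-contained check of that identity:

    #include <stdio.h>

    /* Exhaustive per-bit check: the F_00_19 form equals the classic choice function. */
    int main(void)
    {
        unsigned int b, c, d;

        for (b = 0; b <= 1; b++)
            for (c = 0; c <= 1; c++)
                for (d = 0; d <= 1; d++) {
                    unsigned int f_ref = ((b & c) | (~b & d)) & 1u; /* (b&c)|(~b&d) */
                    unsigned int f_opt = (((c ^ d) & b) ^ d) & 1u;  /* F_00_19 */

                    if (f_ref != f_opt) {
                        printf("mismatch at b=%u c=%u d=%u\n", b, c, d);
                        return 1;
                    }
                }
        printf("identity holds\n");
        return 0;
    }
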
diff --git a/crypto/sha/sha_one.c b/crypto/sha/sha_one.c
deleted file mode 100644
index 0930b98a66b6..000000000000
--- a/crypto/sha/sha_one.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/* crypto/sha/sha_one.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <openssl/sha.h>
-#include <openssl/crypto.h>
-
-#ifndef OPENSSL_NO_SHA0
-unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md)
-{
- SHA_CTX c;
- static unsigned char m[SHA_DIGEST_LENGTH];
-
- if (md == NULL)
- md = m;
- if (!SHA_Init(&c))
- return NULL;
- SHA_Update(&c, d, n);
- SHA_Final(md, &c);
- OPENSSL_cleanse(&c, sizeof(c));
- return (md);
-}
-#endif
diff --git a/crypto/sha/shatest.c b/crypto/sha/shatest.c
deleted file mode 100644
index 105060a7ec2d..000000000000
--- a/crypto/sha/shatest.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/* crypto/sha/shatest.c */
-/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
- *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- *
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to. The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code. The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- *
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * "This product includes cryptographic software written by
- * Eric Young (eay@cryptsoft.com)"
- * The word 'cryptographic' can be left out if the rouines from the library
- * being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from
- * the apps directory (application code) you must include an acknowledgement:
- * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- *
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed. i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include "../e_os.h"
-
-#if defined(OPENSSL_NO_SHA) || defined(OPENSSL_NO_SHA0)
-int main(int argc, char *argv[])
-{
- printf("No SHA0 support\n");
- return (0);
-}
-#else
-# include <openssl/evp.h>
-# include <openssl/sha.h>
-
-# ifdef CHARSET_EBCDIC
-# include <openssl/ebcdic.h>
-# endif
-
-# define SHA_0 /* FIPS 180 */
-# undef SHA_1 /* FIPS 180-1 */
-
-static char *test[] = {
- "abc",
- "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
- NULL,
-};
-
-# ifdef SHA_0
-static char *ret[] = {
- "0164b8a914cd2a5e74c4f7ff082c4d97f1edf880",
- "d2516ee1acfa5baf33dfc1c471e438449ef134c8",
-};
-
-static char *bigret = "3232affa48628a26653b5aaa44541fd90d690603";
-# endif
-# ifdef SHA_1
-static char *ret[] = {
- "a9993e364706816aba3e25717850c26c9cd0d89d",
- "84983e441c3bd26ebaae4aa1f95129e5e54670f1",
-};
-
-static char *bigret = "34aa973cd4c4daa4f61eeb2bdbad27316534016f";
-# endif
-
-static char *pt(unsigned char *md);
-int main(int argc, char *argv[])
-{
- int i, err = 0;
- char **P, **R;
- static unsigned char buf[1000];
- char *p, *r;
- EVP_MD_CTX c;
- unsigned char md[SHA_DIGEST_LENGTH];
-
-# ifdef CHARSET_EBCDIC
- ebcdic2ascii(test[0], test[0], strlen(test[0]));
- ebcdic2ascii(test[1], test[1], strlen(test[1]));
-# endif
-
- EVP_MD_CTX_init(&c);
- P = test;
- R = ret;
- i = 1;
- while (*P != NULL) {
- EVP_Digest(*P, strlen(*P), md, NULL, EVP_sha(), NULL);
- p = pt(md);
- if (strcmp(p, *R) != 0) {
- printf("error calculating SHA on '%s'\n", *P);
- printf("got %s instead of %s\n", p, *R);
- err++;
- } else
- printf("test %d ok\n", i);
- i++;
- R++;
- P++;
- }
-
- memset(buf, 'a', 1000);
-# ifdef CHARSET_EBCDIC
- ebcdic2ascii(buf, buf, 1000);
-# endif /* CHARSET_EBCDIC */
- EVP_DigestInit_ex(&c, EVP_sha(), NULL);
- for (i = 0; i < 1000; i++)
- EVP_DigestUpdate(&c, buf, 1000);
- EVP_DigestFinal_ex(&c, md, NULL);
- p = pt(md);
-
- r = bigret;
- if (strcmp(p, r) != 0) {
- printf("error calculating SHA on '%s'\n", p);
- printf("got %s instead of %s\n", p, r);
- err++;
- } else
- printf("test 3 ok\n");
-
-# ifdef OPENSSL_SYS_NETWARE
- if (err)
- printf("ERROR: %d\n", err);
-# endif
- EVP_MD_CTX_cleanup(&c);
- EXIT(err);
- return (0);
-}
-
-static char *pt(unsigned char *md)
-{
- int i;
- static char buf[80];
-
- for (i = 0; i < SHA_DIGEST_LENGTH; i++)
- sprintf(&(buf[i * 2]), "%02x", md[i]);
- return (buf);
-}
-#endif