author     Andrew Turner <andrew@FreeBSD.org>    2016-09-19 13:12:09 +0000
committer  Andrew Turner <andrew@FreeBSD.org>    2016-09-19 13:12:09 +0000
commit     09a53ad8f1318c5daae6cfb19d97f4f6459f0013 (patch)
tree       2db2658fb2b0e98f2e5771acfa810aead6052d20 /contrib/cortex-strings
parent     bddfc749fafad1d8ccd1f2b612da2a527fe86c1f (diff)
parent     5a194ab47811dee4fd1bd7c1fe163865fb468ae1 (diff)
Diffstat (limited to 'contrib/cortex-strings')
-rw-r--r--  contrib/cortex-strings/.gitignore  11
-rw-r--r--  contrib/cortex-strings/Makefile.am  327
-rw-r--r--  contrib/cortex-strings/README  111
-rwxr-xr-x  contrib/cortex-strings/autogen.sh  69
-rw-r--r--  contrib/cortex-strings/benchmarks/dhry/dhry.h  311
-rw-r--r--  contrib/cortex-strings/benchmarks/dhry/dhry_1.c  778
-rw-r--r--  contrib/cortex-strings/benchmarks/dhry/dhry_2.c  186
-rw-r--r--  contrib/cortex-strings/benchmarks/multi/harness.c  407
-rw-r--r--  contrib/cortex-strings/configure.ac  88
-rwxr-xr-x  contrib/cortex-strings/scripts/add-license.sh  79
-rw-r--r--  contrib/cortex-strings/scripts/bench.py  175
-rw-r--r--  contrib/cortex-strings/scripts/fixup.py  27
-rw-r--r--  contrib/cortex-strings/scripts/libplot.py  78
-rw-r--r--  contrib/cortex-strings/scripts/plot-align.py  67
-rw-r--r--  contrib/cortex-strings/scripts/plot-sizes.py  120
-rw-r--r--  contrib/cortex-strings/scripts/plot-top.py  61
-rw-r--r--  contrib/cortex-strings/scripts/plot.py  123
-rwxr-xr-x  contrib/cortex-strings/scripts/trim.sh  9
-rw-r--r--  contrib/cortex-strings/src/aarch64/memchr.S  172
-rw-r--r--  contrib/cortex-strings/src/aarch64/memcmp.S  162
-rw-r--r--  contrib/cortex-strings/src/aarch64/memcpy.S  225
-rw-r--r--  contrib/cortex-strings/src/aarch64/memmove.S  150
-rw-r--r--  contrib/cortex-strings/src/aarch64/memset.S  235
-rw-r--r--  contrib/cortex-strings/src/aarch64/strchr.S  159
-rw-r--r--  contrib/cortex-strings/src/aarch64/strchrnul.S  144
-rw-r--r--  contrib/cortex-strings/src/aarch64/strcmp.S  166
-rw-r--r--  contrib/cortex-strings/src/aarch64/strcpy.S  336
-rw-r--r--  contrib/cortex-strings/src/aarch64/strlen.S  233
-rw-r--r--  contrib/cortex-strings/src/aarch64/strncmp.S  222
-rw-r--r--  contrib/cortex-strings/src/aarch64/strnlen.S  181
-rw-r--r--  contrib/cortex-strings/src/arm/memchr.S  155
-rw-r--r--  contrib/cortex-strings/src/arm/memcpy.S  617
-rw-r--r--  contrib/cortex-strings/src/arm/memset.S  122
-rw-r--r--  contrib/cortex-strings/src/arm/strchr.S  80
-rw-r--r--  contrib/cortex-strings/src/arm/strcmp.S  500
-rw-r--r--  contrib/cortex-strings/src/thumb-2/strcpy.c  173
-rw-r--r--  contrib/cortex-strings/src/thumb-2/strlen.S  150
-rw-r--r--  contrib/cortex-strings/src/thumb/aeabi_idiv.S  318
-rw-r--r--  contrib/cortex-strings/src/thumb/strcmp-armv6m.S  143
39 files changed, 7670 insertions, 0 deletions
diff --git a/contrib/cortex-strings/.gitignore b/contrib/cortex-strings/.gitignore
new file mode 100644
index 000000000000..558ca155c736
--- /dev/null
+++ b/contrib/cortex-strings/.gitignore
@@ -0,0 +1,11 @@
+*.a
+*.o
+*.la
+*.lo
+*.png
+*.pyc
+.deps
+.dirstamp
+.libs
+try-*
+cache.txt
diff --git a/contrib/cortex-strings/Makefile.am b/contrib/cortex-strings/Makefile.am
new file mode 100644
index 000000000000..db6bb93254a4
--- /dev/null
+++ b/contrib/cortex-strings/Makefile.am
@@ -0,0 +1,327 @@
+# Copyright (c) 2011, Linaro Limited
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the Linaro nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# Top level Makefile for cortex-strings
+
+# Used to record the compiler version in the executables
+COMPILER = $(shell $(CC) --version 2>&1 | head -n1)
+
+# The main library
+lib_LTLIBRARIES = \
+ libcortex-strings.la
+
+## Test suite
+check_PROGRAMS = \
+ tests/test-memchr \
+ tests/test-memcmp \
+ tests/test-memcpy \
+ tests/test-memmove \
+ tests/test-memset \
+ tests/test-strchr \
+ tests/test-strcmp \
+ tests/test-strcpy \
+ tests/test-strlen \
+ tests/test-strncmp \
+ tests/test-strnlen
+
+# Options for the tests
+tests_cflags = -I$(srcdir)/tests $(AM_CFLAGS)
+tests_ldadd = libcortex-strings.la
+tests_test_memchr_LDADD = $(tests_ldadd)
+tests_test_memchr_CFLAGS = $(tests_cflags)
+tests_test_memcmp_LDADD = $(tests_ldadd)
+tests_test_memcmp_CFLAGS = $(tests_cflags)
+tests_test_memcpy_LDADD = $(tests_ldadd)
+tests_test_memcpy_CFLAGS = $(tests_cflags)
+tests_test_memmove_LDADD = $(tests_ldadd)
+tests_test_memmove_CFLAGS = $(tests_cflags)
+tests_test_memset_LDADD = $(tests_ldadd)
+tests_test_memset_CFLAGS = $(tests_cflags)
+tests_test_strchr_LDADD = $(tests_ldadd)
+tests_test_strchr_CFLAGS = $(tests_cflags)
+tests_test_strcmp_LDADD = $(tests_ldadd)
+tests_test_strcmp_CFLAGS = $(tests_cflags)
+tests_test_strcpy_LDADD = $(tests_ldadd)
+tests_test_strcpy_CFLAGS = $(tests_cflags)
+tests_test_strlen_LDADD = $(tests_ldadd)
+tests_test_strlen_CFLAGS = $(tests_cflags)
+tests_test_strncmp_LDADD = $(tests_ldadd)
+tests_test_strncmp_CFLAGS = $(tests_cflags)
+
+TESTS = $(check_PROGRAMS)
+
+## Benchmarks
+noinst_PROGRAMS = \
+ dhry \
+ dhry-native \
+ try-none \
+ try-this \
+ try-plain \
+ try-newlib-c \
+ try-bionic-c \
+ try-glibc-c
+
+# Good ol' Dhrystone
+dhry_SOURCES = \
+ benchmarks/dhry/dhry_1.c \
+ benchmarks/dhry/dhry_2.c \
+ benchmarks/dhry/dhry.h
+
+dhry_CFLAGS = -Dcompiler="\"$(COMPILER)\"" -Doptions="\"$(CFLAGS)\""
+dhry_LDADD = libcortex-strings.la
+
+dhry_native_SOURCES = $(dhry_SOURCES)
+dhry_native_CFLAGS = $(dhry_CFLAGS)
+
+# Benchmark harness
+noinst_LIBRARIES = \
+ libmulti.a \
+ libbionic-c.a \
+ libglibc-c.a \
+ libnewlib-c.a \
+ libplain.a
+
+libmulti_a_SOURCES = \
+ benchmarks/multi/harness.c
+
+libmulti_a_CFLAGS = -DVERSION=\"$(VERSION)\" $(AM_CFLAGS)
+
+## Other architecture-independent implementations
+libbionic_c_a_SOURCES = \
+ reference/bionic-c/bcopy.c \
+ reference/bionic-c/memchr.c \
+ reference/bionic-c/memcmp.c \
+ reference/bionic-c/memcpy.c \
+ reference/bionic-c/memset.c \
+ reference/bionic-c/strchr.c \
+ reference/bionic-c/strcmp.c \
+ reference/bionic-c/strcpy.c \
+ reference/bionic-c/strlen.c
+
+libglibc_c_a_SOURCES = \
+ reference/glibc-c/memchr.c \
+ reference/glibc-c/memcmp.c \
+ reference/glibc-c/memcpy.c \
+ reference/glibc-c/memset.c \
+ reference/glibc-c/strchr.c \
+ reference/glibc-c/strcmp.c \
+ reference/glibc-c/strcpy.c \
+ reference/glibc-c/strlen.c \
+ reference/glibc-c/wordcopy.c \
+ reference/glibc-c/memcopy.h \
+ reference/glibc-c/pagecopy.h
+
+libnewlib_c_a_SOURCES = \
+ reference/newlib-c/memchr.c \
+ reference/newlib-c/memcmp.c \
+ reference/newlib-c/memcpy.c \
+ reference/newlib-c/memset.c \
+ reference/newlib-c/strchr.c \
+ reference/newlib-c/strcmp.c \
+ reference/newlib-c/strcpy.c \
+ reference/newlib-c/strlen.c \
+ reference/newlib-c/shim.h
+
+libplain_a_SOURCES = \
+ reference/plain/memset.c \
+ reference/plain/memcpy.c \
+ reference/plain/strcmp.c \
+ reference/plain/strcpy.c
+
+try_none_SOURCES =
+try_none_LDADD = libmulti.a -lrt
+try_this_SOURCES =
+try_this_LDADD = libmulti.a libcortex-strings.la -lrt
+try_bionic_c_SOURCES =
+try_bionic_c_LDADD = libmulti.a libbionic-c.a -lrt
+try_glibc_c_SOURCES =
+try_glibc_c_LDADD = libmulti.a libglibc-c.a -lrt
+try_newlib_c_SOURCES =
+try_newlib_c_LDADD = libmulti.a libnewlib-c.a -lrt
+try_plain_SOURCES =
+try_plain_LDADD = libmulti.a libplain.a -lrt
+
+# Architecture specific
+
+if HOST_AARCH32
+
+if WITH_NEON
+# Pull in the NEON specific files
+neon_bionic_a9_sources = \
+ reference/bionic-a9/memcpy.S \
+ reference/bionic-a9/memset.S
+neon_bionic_a15_sources = \
+ reference/bionic-a15/memcpy.S \
+ reference/bionic-a15/memset.S
+fpu_flags = -mfpu=neon
+else
+if WITH_VFP
+fpu_flags = -mfpu=vfp
+else
+fpu_flags = -msoft-float
+endif
+endif
+
+# Benchmarks and example programs
+noinst_PROGRAMS += \
+ try-bionic-a9 \
+ try-bionic-a15 \
+ try-csl \
+ try-glibc \
+ try-newlib \
+ try-newlib-xscale
+
+# Libraries used in the benchmarks and examples
+noinst_LIBRARIES += \
+ libbionic-a9.a \
+ libbionic-a15.a \
+ libcsl.a \
+ libglibc.a \
+ libnewlib.a \
+ libnewlib-xscale.a
+
+# Main library
+libcortex_strings_la_SOURCES = \
+ src/thumb-2/strcpy.c \
+ src/arm/memchr.S \
+ src/arm/strchr.S \
+ src/thumb-2/strlen.S \
+ src/arm/memset.S \
+ src/arm/memcpy.S \
+ src/arm/strcmp.S
+
+# Libraries containing the different reference versions
+libbionic_a9_a_SOURCES = \
+ $(neon_bionic_a9_sources) \
+ reference/bionic-a9/memcmp.S \
+ reference/bionic-a9/strcmp.S \
+ reference/bionic-a9/strcpy.S \
+ reference/bionic-a9/strlen.c
+
+libbionic_a9_a_CFLAGS = -Wa,-mimplicit-it=thumb
+
+libbionic_a15_a_SOURCES = \
+ $(neon_bionic_a15_sources) \
+ reference/bionic-a15/memcmp.S \
+ reference/bionic-a15/strcmp.S \
+ reference/bionic-a15/strcpy.S \
+ reference/bionic-a15/strlen.c
+
+libbionic_a15_a_CFLAGS = -Wa,-mimplicit-it=thumb
+
+libcsl_a_SOURCES = \
+ reference/csl/memcpy.c \
+ reference/csl/memset.c \
+ reference/csl/arm_asm.h
+
+libglibc_a_SOURCES = \
+ reference/glibc/memcpy.S \
+ reference/glibc/memset.S \
+ reference/glibc/strchr.S \
+ reference/glibc/strlen.S
+
+libnewlib_a_SOURCES = \
+ reference/newlib/memcpy.S \
+ reference/newlib/strcmp.S \
+ reference/newlib/strcpy.c \
+ reference/newlib/strlen.c \
+ reference/newlib/arm_asm.h \
+ reference/newlib/shim.h
+
+libnewlib_xscale_a_SOURCES = \
+ reference/newlib-xscale/memchr.c \
+ reference/newlib-xscale/memcpy.c \
+ reference/newlib-xscale/memset.c \
+ reference/newlib-xscale/strchr.c \
+ reference/newlib-xscale/strcmp.c \
+ reference/newlib-xscale/strcpy.c \
+ reference/newlib-xscale/strlen.c \
+ reference/newlib-xscale/xscale.h
+
+# Flags for the benchmark helpers
+try_bionic_a9_SOURCES =
+try_bionic_a9_LDADD = libmulti.a libbionic-a9.a -lrt
+try_bionic_a15_SOURCES =
+try_bionic_a15_LDADD = libmulti.a libbionic-a15.a -lrt
+try_csl_SOURCES =
+try_csl_LDADD = libmulti.a libcsl.a -lrt
+try_glibc_SOURCES =
+try_glibc_LDADD = libmulti.a libglibc.a -lrt
+try_newlib_SOURCES =
+try_newlib_LDADD = libmulti.a libnewlib.a -lrt
+try_newlib_xscale_SOURCES =
+try_newlib_xscale_LDADD = libmulti.a libnewlib-xscale.a -lrt
+
+AM_CPPFLAGS = $(fpu_flags)
+AM_LDFLAGS = $(fpu_flags)
+
+endif
+
+# aarch64 specific
+if HOST_AARCH64
+
+libcortex_strings_la_SOURCES = \
+ src/aarch64/memchr.S \
+ src/aarch64/memcmp.S \
+ src/aarch64/memcpy.S \
+ src/aarch64/memmove.S \
+ src/aarch64/memset.S \
+ src/aarch64/strchr.S \
+ src/aarch64/strchrnul.S \
+ src/aarch64/strcmp.S \
+ src/aarch64/strcpy.S \
+ src/aarch64/strlen.S \
+ src/aarch64/strncmp.S \
+ src/aarch64/strnlen.S
+
+endif
+
+libcortex_strings_la_LDFLAGS = -version-info 1:0:0
+
+AM_CFLAGS = \
+ -std=gnu99 -Wall \
+ -fno-builtin -fno-stack-protector -U_FORTIFY_SOURCE \
+ $(AM_CPPFLAGS)
+
+if WITH_SUBMACHINE
+AM_CFLAGS += \
+ -mtune=$(submachine)
+endif
+
+EXTRA_DIST = \
+ tests/hp-timing.h \
+ tests/test-string.h \
+ tests/test-skeleton.c \
+ scripts/add-license.sh \
+ scripts/bench.py \
+ scripts/fixup.py \
+ scripts/libplot.py \
+ scripts/plot-align.py \
+ scripts/plot.py \
+ scripts/plot-sizes.py \
+ scripts/plot-top.py \
+ scripts/trim.sh \
+ autogen.sh
diff --git a/contrib/cortex-strings/README b/contrib/cortex-strings/README
new file mode 100644
index 000000000000..5e9e9d3f1d6f
--- /dev/null
+++ b/contrib/cortex-strings/README
@@ -0,0 +1,111 @@
+= Cortex-A String Routines =
+
+This package contains optimised string routines including memcpy(), memset(),
+strcpy(), strlen() for the ARM Cortex-A series of cores.
+
+Various implementations of these routines are provided, including generic
+implementations for ARMv7-A cores with/without Neon, Thumb2 implementations
+and generic implementations for cores supporting AArch64.
+
+== Getting started ==
+First configure and then install libcortex-strings.so. To make other
+applications use this library, either add -lcortex-strings to the link
+command or use LD_PRELOAD to load the library into existing applications.
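
A minimal sketch of those steps (the install prefix and the preloaded program
are only examples; adjust them for your system):

  $ ./autogen.sh
  $ ./configure
  $ make && sudo make install
  $ # run an existing, dynamically linked program with the optimised routines
  $ LD_PRELOAD=/usr/local/lib/libcortex-strings.so ls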
+
+Our intent is to get these routines into the common C libraries such
+as GLIBC, Bionic, and Newlib. Your system may already include them!
+
+== Contents ==
+ * src/ contains the routines themselves
+ * tests/ contains the unit tests
+ * reference/ contains reference copies of other ARM-focused
+ implementations gathered from around the Internet
+ * benchmarks/ contains various benchmarks, tools, and scripts used to
+ check and report on the different implementations.
+
+The src directory contains different variants organised by the
+implementation they run on and optional features used. For example:
+ * src/thumb-2 contains generic non-NEON routines for AArch32 (with Thumb-2).
+ * src/arm contains tuned routines for Cortex-A class processors.
+ * src/aarch64 contains generic routines for AArch64.
+ * src/thumb contains generic routines for armv6-M (with Thumb).
+
+== Reference versions ==
+reference/ contains versions collected from various popular Open
+Source libraries. These have been modified for use in benchmarking.
+Please refer to the individual files for any licensing terms.
+
+The routines were collected from the following releases:
+ * EGLIBC 2.13
+ * Newlib 1.19.0
+ * Bionic android-2.3.5_r1
+
+== Licensing ==
+All Linaro-authored routines are under the modified BSD license:
+
+Copyright (c) 2011, Linaro Limited
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+All ARM-authored routines are under the modified BSD license:
+
+Copyright (c) 2014 ARM Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+All third party routines are under a GPL compatible license.
+
+== Notes and Limitations ==
+Some of the implementations have been collected from other
+projects and have a variety of licenses and copyright holders.
+
+== Style ==
+Assembly code attempts to follow the GLIBC coding conventions. They
+are:
+ * Copyright headers in C style comment blocks
+ * Instructions indented with one tab
+ * Operands indented with one tab
+ * Text is wrapped at 70 characters
+ * End of line comments are fine
diff --git a/contrib/cortex-strings/autogen.sh b/contrib/cortex-strings/autogen.sh
new file mode 100755
index 000000000000..8e0591cc315a
--- /dev/null
+++ b/contrib/cortex-strings/autogen.sh
@@ -0,0 +1,69 @@
+#!/bin/sh
+#
+# autogen.sh glue for hplip
+#
+# HPLIP used to have five or so different autotools trees. Upstream
+# has reduced it to two. Still, this script is capable of cleaning
+# just about any possible mess of autoconf files.
+#
+# BE CAREFUL with trees that are not completely automake-generated,
+# this script deletes all Makefile.in files it can find.
+#
+# Requires: automake 1.9, autoconf 2.57+
+# Conflicts: autoconf 2.13
+set -e
+
+# Refresh GNU autotools toolchain.
+echo Cleaning autotools files...
+find -type d -name autom4te.cache -print0 | xargs -0 rm -rf
+find -type f \( -name missing -o -name install-sh -o -name mkinstalldirs \
+ -o -name depcomp -o -name ltmain.sh -o -name configure \
+ -o -name config.sub -o -name config.guess \
+ -o -name Makefile.in \) -print0 | xargs -0 rm -f
+
+echo Running autoreconf...
+autoreconf --force --install
+
+# For the Debian package build
+test -d debian && {
+ # link these in Debian builds
+ rm -f config.sub config.guess
+ ln -s /usr/share/misc/config.sub .
+ ln -s /usr/share/misc/config.guess .
+
+ # refresh list of executable scripts, to avoid possible breakage if
+ # upstream tarball does not include the file or if it is mispackaged
+ # for whatever reason.
+ [ "$1" = "updateexec" ] && {
+ echo Generating list of executable files...
+ rm -f debian/executable.files
+ find -type f -perm +111 ! -name '.*' -fprint debian/executable.files
+ }
+
+ # Remove any files in upstream tarball that we don't have in the Debian
+ # package (because diff cannot remove files)
+ version=`dpkg-parsechangelog | awk '/Version:/ { print $2 }' | sed -e 's/-[^-]\+$//'`
+ source=`dpkg-parsechangelog | awk '/Source:/ { print $2 }' | tr -d ' '`
+ if test -r ../${source}_${version}.orig.tar.gz ; then
+ echo Generating list of files that should be removed...
+ rm -f debian/deletable.files
+ touch debian/deletable.files
+ [ -e debian/tmp ] && rm -rf debian/tmp
+ mkdir debian/tmp
+ ( cd debian/tmp ; tar -zxf ../../../${source}_${version}.orig.tar.gz )
+ find debian/tmp/ -type f ! -name '.*' -print0 | xargs -0 -ri echo '{}' | \
+ while read -r i ; do
+ if test -e "${i}" ; then
+ filename=$(echo "${i}" | sed -e 's#.*debian/tmp/[^/]\+/##')
+ test -e "${filename}" || echo "${filename}" >>debian/deletable.files
+ fi
+ done
+ rm -fr debian/tmp
+ else
+ echo Emptying list of files that should be deleted...
+ rm -f debian/deletable.files
+ touch debian/deletable.files
+ fi
+}
+
+exit 0
diff --git a/contrib/cortex-strings/benchmarks/dhry/dhry.h b/contrib/cortex-strings/benchmarks/dhry/dhry.h
new file mode 100644
index 000000000000..3010aecdd31f
--- /dev/null
+++ b/contrib/cortex-strings/benchmarks/dhry/dhry.h
@@ -0,0 +1,311 @@
+/*
+ **************************************************************************
+ * DHRYSTONE 2.1 BENCHMARK PC VERSION
+ **************************************************************************
+ *
+ * "DHRYSTONE" Benchmark Program
+ * -----------------------------
+ *
+ * Version: C, Version 2.1
+ *
+ * File: dhry.h (part 1 of 3)
+ *
+ * Date: May 25, 1988
+ *
+ * Author: Reinhold P. Weicker
+ * Siemens AG, AUT E 51
+ * Postfach 3220
+ * 8520 Erlangen
+ * Germany (West)
+ * Phone: [+49]-9131-7-20330
+ * (8-17 Central European Time)
+ * Usenet: ..!mcsun!unido!estevax!weicker
+ *
+ * Original Version (in Ada) published in
+ * "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
+ * pp. 1013 - 1030, together with the statistics
+ * on which the distribution of statements etc. is based.
+ *
+ * In this C version, the following C library functions are used:
+ * - strcpy, strcmp (inside the measurement loop)
+ * - printf, scanf (outside the measurement loop)
+ * In addition, Berkeley UNIX system calls "times ()" or "time ()"
+ * are used for execution time measurement. For measurements
+ * on other systems, these calls have to be changed.
+ *
+ * Collection of Results:
+ * Reinhold Weicker (address see above) and
+ *
+ * Rick Richardson
+ * PC Research. Inc.
+ * 94 Apple Orchard Drive
+ * Tinton Falls, NJ 07724
+ * Phone: (201) 389-8963 (9-17 EST)
+ * Usenet: ...!uunet!pcrat!rick
+ *
+ * Please send results to Rick Richardson and/or Reinhold Weicker.
+ * Complete information should be given on hardware and software used.
+ * Hardware information includes: Machine type, CPU, type and size
+ * of caches; for microprocessors: clock frequency, memory speed
+ * (number of wait states).
+ * Software information includes: Compiler (and runtime library)
+ * manufacturer and version, compilation switches, OS version.
+ * The Operating System version may give an indication about the
+ * compiler; Dhrystone itself performs no OS calls in the measurement
+ * loop.
+ *
+ * The complete output generated by the program should be mailed
+ * such that at least some checks for correctness can be made.
+ *
+ **************************************************************************
+ *
+ * This version has changes made by Roy Longbottom to conform to a common
+ * format for a series of standard benchmarks for PCs:
+ *
+ * Running time greater than 5 seconds due to inaccuracy of the PC clock.
+ *
+ * Automatic adjustment of run time, no manually inserted parameters.
+ *
+ * Initial display of calibration times to confirm linearity.
+ *
+ * Display of results within one screen (or at a slow speed as the test
+ * progresses) so that it can be seen to have run successfully.
+ *
+ * Facilities to type in details of system used etc.
+ *
+ * All results and details appended to a results file.
+ *
+ *
+ * Roy Longbottom
+ * 101323.2241@compuserve.com
+ *
+ **************************************************************************
+ *
+ * For details of history, changes, other defines, benchmark construction
+ * statistics see official versions from ftp.nosc.mil/pub/aburto where
+ * the latest table of results (dhry.tbl) are available. See also
+ * netlib@ornl.gov
+ *
+ **************************************************************************
+ *
+ * Defines: The following "Defines" are possible:
+ * -DREG=register (default: Not defined)
+ * As an approximation to what an average C programmer
+ * might do, the "register" storage class is applied
+ * (if enabled by -DREG=register)
+ * - for local variables, if they are used (dynamically)
+ * five or more times
+ * - for parameters if they are used (dynamically)
+ * six or more times
+ * Note that an optimal "register" strategy is
+ * compiler-dependent, and that "register" declarations
+ * do not necessarily lead to faster execution.
+ * -DNOSTRUCTASSIGN (default: Not defined)
+ * Define if the C compiler does not support
+ * assignment of structures.
+ * -DNOENUMS (default: Not defined)
+ * Define if the C compiler does not support
+ * enumeration types.
+ ***************************************************************************
+ *
+ * Compilation model and measurement (IMPORTANT):
+ *
+ * This C version of Dhrystone consists of three files:
+ * - dhry.h (this file, containing global definitions and comments)
+ * - dhry_1.c (containing the code corresponding to Ada package Pack_1)
+ * - dhry_2.c (containing the code corresponding to Ada package Pack_2)
+ *
+ * The following "ground rules" apply for measurements:
+ * - Separate compilation
+ * - No procedure merging
+ * - Otherwise, compiler optimizations are allowed but should be indicated
+ * - Default results are those without register declarations
+ * See the companion paper "Rationale for Dhrystone Version 2" for a more
+ * detailed discussion of these ground rules.
+ *
+ * For 16-Bit processors (e.g. 80186, 80286), times for all compilation
+ * models ("small", "medium", "large" etc.) should be given if possible,
+ * together with a definition of these models for the compiler system used.
+ *
+ **************************************************************************
+ * Examples of Pentium Results
+ *
+ * Dhrystone Benchmark Version 2.1 (Language: C)
+ *
+ * Month run 4/1996
+ * PC model Escom
+ * CPU Pentium
+ * Clock MHz 100
+ * Cache 256K
+ * Options Neptune chipset
+ * OS/DOS Windows 95
+ * Compiler Watcom C/ C++ 10.5 Win386
+ * OptLevel -otexan -zp8 -fp5 -5r
+ * Run by Roy Longbottom
+ * From UK
+ * Mail 101323.2241@compuserve.com
+ *
+ * Final values (* implementation-dependent):
+ *
+ * Int_Glob: O.K. 5
+ * Bool_Glob: O.K. 1
+ * Ch_1_Glob: O.K. A
+ * Ch_2_Glob: O.K. B
+ * Arr_1_Glob[8]: O.K. 7
+ * Arr_2_Glob8/7: O.K. 1600010
+ * Ptr_Glob->
+ * Ptr_Comp: * 98008
+ * Discr: O.K. 0
+ * Enum_Comp: O.K. 2
+ * Int_Comp: O.K. 17
+ * Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING
+ * Next_Ptr_Glob->
+ * Ptr_Comp: * 98008 same as above
+ * Discr: O.K. 0
+ * Enum_Comp: O.K. 1
+ * Int_Comp: O.K. 18
+ * Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING
+ * Int_1_Loc: O.K. 5
+ * Int_2_Loc: O.K. 13
+ * Int_3_Loc: O.K. 7
+ * Enum_Loc: O.K. 1
+ * Str_1_Loc: O.K. DHRYSTONE PROGRAM, 1'ST STRING
+ * Str_2_Loc: O.K. DHRYSTONE PROGRAM, 2'ND STRING
+ *
+ * Register option Selected.
+ *
+ * Microseconds 1 loop: 4.53
+ * Dhrystones / second: 220690
+ * VAX MIPS rating: 125.61
+ *
+ *
+ * Dhrystone Benchmark Version 2.1 (Language: C)
+ *
+ * Month run 4/1996
+ * PC model Escom
+ * CPU Pentium
+ * Clock MHz 100
+ * Cache 256K
+ * Options Neptune chipset
+ * OS/DOS Windows 95
+ * Compiler Watcom C/ C++ 10.5 Win386
+ * OptLevel No optimisation
+ * Run by Roy Longbottom
+ * From UK
+ * Mail 101323.2241@compuserve.com
+ *
+ * Final values (* implementation-dependent):
+ *
+ * Int_Glob: O.K. 5
+ * Bool_Glob: O.K. 1
+ * Ch_1_Glob: O.K. A
+ * Ch_2_Glob: O.K. B
+ * Arr_1_Glob[8]: O.K. 7
+ * Arr_2_Glob8/7: O.K. 320010
+ * Ptr_Glob->
+ * Ptr_Comp: * 98004
+ * Discr: O.K. 0
+ * Enum_Comp: O.K. 2
+ * Int_Comp: O.K. 17
+ * Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING
+ * Next_Ptr_Glob->
+ * Ptr_Comp: * 98004 same as above
+ * Discr: O.K. 0
+ * Enum_Comp: O.K. 1
+ * Int_Comp: O.K. 18
+ * Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING
+ * Int_1_Loc: O.K. 5
+ * Int_2_Loc: O.K. 13
+ * Int_3_Loc: O.K. 7
+ * Enum_Loc: O.K. 1
+ * Str_1_Loc: O.K. DHRYSTONE PROGRAM, 1'ST STRING
+ * Str_2_Loc: O.K. DHRYSTONE PROGRAM, 2'ND STRING
+ *
+ * Register option Not selected.
+ *
+ * Microseconds 1 loop: 20.06
+ * Dhrystones / second: 49844
+ * VAX MIPS rating: 28.37
+ *
+ **************************************************************************
+ */
+
+/* Compiler and system dependent definitions: */
+
+#ifndef TIME
+#define TIMES
+#endif
+ /* Use times(2) time function unless */
+ /* explicitly defined otherwise */
+
+#ifdef TIMES
+/* #include <sys/types.h>
+ #include <sys/times.h> */
+ /* for "times" */
+#endif
+
+#define Mic_secs_Per_Second 1000000.0
+ /* Berkeley UNIX C returns process times in seconds/HZ */
+
+#ifdef NOSTRUCTASSIGN
+#define structassign(d, s) memcpy(&(d), &(s), sizeof(d))
+#else
+#define structassign(d, s) d = s
+#endif
+
+#ifdef NOENUM
+#define Ident_1 0
+#define Ident_2 1
+#define Ident_3 2
+#define Ident_4 3
+#define Ident_5 4
+ typedef int Enumeration;
+#else
+ typedef enum {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5}
+ Enumeration;
+#endif
+ /* for boolean and enumeration types in Ada, Pascal */
+
+/* General definitions: */
+
+#include <stdio.h>
+#include <string.h>
+
+ /* for strcpy, strcmp */
+
+#define Null 0
+ /* Value of a Null pointer */
+#define true 1
+#define false 0
+
+typedef int One_Thirty;
+typedef int One_Fifty;
+typedef char Capital_Letter;
+typedef int Boolean;
+typedef char Str_30 [31];
+typedef int Arr_1_Dim [50];
+typedef int Arr_2_Dim [50] [50];
+
+typedef struct record
+ {
+ struct record *Ptr_Comp;
+ Enumeration Discr;
+ union {
+ struct {
+ Enumeration Enum_Comp;
+ int Int_Comp;
+ char Str_Comp [31];
+ } var_1;
+ struct {
+ Enumeration E_Comp_2;
+ char Str_2_Comp [31];
+ } var_2;
+ struct {
+ char Ch_1_Comp;
+ char Ch_2_Comp;
+ } var_3;
+ } variant;
+ } Rec_Type, *Rec_Pointer;
+
+
+
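The "Compilation model and measurement" notes in dhry.h above call for building
dhry_1.c and dhry_2.c separately, and Makefile.am feeds the compiler name and
flags to the benchmark through the compiler/options macros. A minimal
standalone build in that spirit might look like the following (the compiler and
flag strings are only examples):

  $ cc -O2 -Dcompiler='"cc"' -Doptions='"-O2"' -c benchmarks/dhry/dhry_1.c
  $ cc -O2 -c benchmarks/dhry/dhry_2.c
  $ cc -o dhry dhry_1.o dhry_2.o
  $ ./dhry N      # 'N' skips the interactive system-details prompts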
diff --git a/contrib/cortex-strings/benchmarks/dhry/dhry_1.c b/contrib/cortex-strings/benchmarks/dhry/dhry_1.c
new file mode 100644
index 000000000000..da272e4c4234
--- /dev/null
+++ b/contrib/cortex-strings/benchmarks/dhry/dhry_1.c
@@ -0,0 +1,778 @@
+/*
+ *************************************************************************
+ *
+ * "DHRYSTONE" Benchmark Program
+ * -----------------------------
+ *
+ * Version: C, Version 2.1
+ *
+ * File: dhry_1.c (part 2 of 3)
+ *
+ * Date: May 25, 1988
+ *
+ * Author: Reinhold P. Weicker
+ *
+ *************************************************************************
+ */
+
+ #include <time.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include "dhry.h"
+ /*COMPILER COMPILER COMPILER COMPILER COMPILER COMPILER COMPILER*/
+
+ #ifdef COW
+ #define compiler "Watcom C/C++ 10.5 Win386"
+ #define options " -otexan -zp8 -5r -ms"
+ #endif
+ #ifdef CNW
+ #define compiler "Watcom C/C++ 10.5 Win386"
+ #define options " No optimisation"
+ #endif
+ #ifdef COD
+ #define compiler "Watcom C/C++ 10.5 Dos4GW"
+ #define options " -otexan -zp8 -5r -ms"
+ #endif
+ #ifdef CND
+ #define compiler "Watcom C/C++ 10.5 Dos4GW"
+ #define options " No optimisation"
+ #endif
+ #ifdef CONT
+ #define compiler "Watcom C/C++ 10.5 Win32NT"
+ #define options " -otexan -zp8 -5r -ms"
+ #endif
+ #ifdef CNNT
+ #define compiler "Watcom C/C++ 10.5 Win32NT"
+ #define options " No optimisation"
+ #endif
+ #ifdef COO2
+ #define compiler "Watcom C/C++ 10.5 OS/2-32"
+ #define options " -otexan -zp8 -5r -ms"
+ #endif
+ #ifdef CNO2
+ #define compiler "Watcom C/C++ 10.5 OS/2-32"
+ #define options " No optimisation"
+ #endif
+
+
+/* Global Variables: */
+
+Rec_Pointer Ptr_Glob,
+ Next_Ptr_Glob;
+int Int_Glob;
+ Boolean Bool_Glob;
+ char Ch_1_Glob,
+ Ch_2_Glob;
+ int Arr_1_Glob [50];
+ int Arr_2_Glob [50] [50];
+ int getinput = 1;
+
+
+ char Reg_Define[100] = "Register option Selected.";
+
+ Enumeration Func_1 (Capital_Letter Ch_1_Par_Val,
+ Capital_Letter Ch_2_Par_Val);
+ /*
+ forward declaration necessary since Enumeration may not simply be int
+ */
+
+ #ifndef ROPT
+ #define REG
+ /* REG becomes defined as empty */
+ /* i.e. no register variables */
+ #else
+ #define REG register
+ #endif
+
+ void Proc_1 (REG Rec_Pointer Ptr_Val_Par);
+ void Proc_2 (One_Fifty *Int_Par_Ref);
+ void Proc_3 (Rec_Pointer *Ptr_Ref_Par);
+ void Proc_4 ();
+ void Proc_5 ();
+ void Proc_6 (Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par);
+ void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
+ One_Fifty *Int_Par_Ref);
+ void Proc_8 (Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref,
+ int Int_1_Par_Val, int Int_2_Par_Val);
+
+ Boolean Func_2 (Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref);
+
+
+ /* variables for time measurement: */
+
+ #define Too_Small_Time 2
+ /* Measurements should last at least 2 seconds */
+
+ double Begin_Time,
+ End_Time,
+ User_Time;
+
+ double Microseconds,
+ Dhrystones_Per_Second,
+ Vax_Mips;
+
+ /* end of variables for time measurement */
+
+
+ void main (int argc, char *argv[])
+ /*****/
+
+ /* main program, corresponds to procedures */
+ /* Main and Proc_0 in the Ada version */
+ {
+ double dtime();
+
+ One_Fifty Int_1_Loc;
+ REG One_Fifty Int_2_Loc;
+ One_Fifty Int_3_Loc;
+ REG char Ch_Index;
+ Enumeration Enum_Loc;
+ Str_30 Str_1_Loc;
+ Str_30 Str_2_Loc;
+ REG int Run_Index;
+ REG int Number_Of_Runs;
+ int endit, count = 10;
+ FILE *Ap;
+ char general[9][80] = {" "};
+
+ /* Initializations */
+ if (argc > 1)
+ {
+ switch (argv[1][0])
+ {
+ case 'N':
+ getinput = 0;
+ break;
+ case 'n':
+ getinput = 0;
+ break;
+ }
+ }
+
+ if ((Ap = fopen("Dhry.txt","a+")) == NULL)
+ {
+ printf("Can not open Dhry.txt\n\n");
+ printf("Press any key\n");
+ exit(1);
+ }
+
+/***********************************************************************
+ * Change for compiler and optimisation used *
+ ***********************************************************************/
+
+ Next_Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));
+ Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));
+
+ Ptr_Glob->Ptr_Comp = Next_Ptr_Glob;
+ Ptr_Glob->Discr = Ident_1;
+ Ptr_Glob->variant.var_1.Enum_Comp = Ident_3;
+ Ptr_Glob->variant.var_1.Int_Comp = 40;
+ strcpy (Ptr_Glob->variant.var_1.Str_Comp,
+ "DHRYSTONE PROGRAM, SOME STRING");
+ strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
+
+ Arr_2_Glob [8][7] = 10;
+ /* Was missing in published program. Without this statement, */
+ /* Arr_2_Glob [8][7] would have an undefined value. */
+ /* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */
+ /* overflow may occur for this array element. */
+
+ printf ("\n");
+ printf ("Dhrystone Benchmark, Version 2.1 (Language: C or C++)\n");
+ printf ("\n");
+
+ if (getinput == 0)
+ {
+ printf ("No run time input data\n\n");
+ }
+ else
+ {
+ printf ("With run time input data\n\n");
+ }
+
+ printf ("Compiler %s\n", compiler);
+ printf ("Optimisation %s\n", options);
+ #ifdef ROPT
+ printf ("Register option selected\n\n");
+ #else
+ printf ("Register option not selected\n\n");
+ strcpy(Reg_Define, "Register option Not selected.");
+ #endif
+
+ /*
+ if (Reg)
+ {
+ printf ("Program compiled with 'register' attribute\n");
+ printf ("\n");
+ }
+ else
+ {
+ printf ("Program compiled without 'register' attribute\n");
+ printf ("\n");
+ }
+
+ printf ("Please give the number of runs through the benchmark: ");
+ {
+ int n;
+ scanf ("%d", &n);
+ Number_Of_Runs = n;
+ }
+ printf ("\n");
+ printf ("Execution starts, %d runs through Dhrystone\n",
+ Number_Of_Runs);
+ */
+
+ Number_Of_Runs = 5000;
+
+ do
+ {
+
+ Number_Of_Runs = Number_Of_Runs * 2;
+ count = count - 1;
+ Arr_2_Glob [8][7] = 10;
+
+ /***************/
+ /* Start timer */
+ /***************/
+
+ Begin_Time = dtime();
+
+ for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
+ {
+
+ Proc_5();
+ Proc_4();
+ /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
+ Int_1_Loc = 2;
+ Int_2_Loc = 3;
+ strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
+ Enum_Loc = Ident_2;
+ Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
+ /* Bool_Glob == 1 */
+ while (Int_1_Loc < Int_2_Loc) /* loop body executed once */
+ {
+ Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
+ /* Int_3_Loc == 7 */
+ Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
+ /* Int_3_Loc == 7 */
+ Int_1_Loc += 1;
+ } /* while */
+ /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+ Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
+ /* Int_Glob == 5 */
+ Proc_1 (Ptr_Glob);
+ for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
+ /* loop body executed twice */
+ {
+ if (Enum_Loc == Func_1 (Ch_Index, 'C'))
+ /* then, not executed */
+ {
+ Proc_6 (Ident_1, &Enum_Loc);
+ strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
+ Int_2_Loc = Run_Index;
+ Int_Glob = Run_Index;
+ }
+ }
+ /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+ Int_2_Loc = Int_2_Loc * Int_1_Loc;
+ Int_1_Loc = Int_2_Loc / Int_3_Loc;
+ Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
+ /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
+ Proc_2 (&Int_1_Loc);
+ /* Int_1_Loc == 5 */
+
+ } /* loop "for Run_Index" */
+
+ /**************/
+ /* Stop timer */
+ /**************/
+
+ End_Time = dtime();
+ User_Time = End_Time - Begin_Time;
+
+ printf ("%12.0f runs %6.2f seconds \n",(double) Number_Of_Runs, User_Time);
+ if (User_Time > 5)
+ {
+ count = 0;
+ }
+ else
+ {
+ if (User_Time < 0.1)
+ {
+ Number_Of_Runs = Number_Of_Runs * 5;
+ }
+ }
+ } /* calibrate/run do while */
+ while (count >0);
+
+ printf ("\n");
+ printf ("Final values (* implementation-dependent):\n");
+ printf ("\n");
+ printf ("Int_Glob: ");
+ if (Int_Glob == 5) printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d ", Int_Glob);
+
+ printf ("Bool_Glob: ");
+ if (Bool_Glob == 1) printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d\n", Bool_Glob);
+
+ printf ("Ch_1_Glob: ");
+ if (Ch_1_Glob == 'A') printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%c ", Ch_1_Glob);
+
+ printf ("Ch_2_Glob: ");
+ if (Ch_2_Glob == 'B') printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%c\n", Ch_2_Glob);
+
+ printf ("Arr_1_Glob[8]: ");
+ if (Arr_1_Glob[8] == 7) printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d ", Arr_1_Glob[8]);
+
+ printf ("Arr_2_Glob8/7: ");
+ if (Arr_2_Glob[8][7] == Number_Of_Runs + 10)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%10d\n", Arr_2_Glob[8][7]);
+
+ printf ("Ptr_Glob-> ");
+ printf (" Ptr_Comp: * %d\n", (int) Ptr_Glob->Ptr_Comp);
+
+ printf (" Discr: ");
+ if (Ptr_Glob->Discr == 0) printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d ", Ptr_Glob->Discr);
+
+ printf ("Enum_Comp: ");
+ if (Ptr_Glob->variant.var_1.Enum_Comp == 2)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+
+ printf (" Int_Comp: ");
+ if (Ptr_Glob->variant.var_1.Int_Comp == 17) printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d ", Ptr_Glob->variant.var_1.Int_Comp);
+
+ printf ("Str_Comp: ");
+ if (strcmp(Ptr_Glob->variant.var_1.Str_Comp,
+ "DHRYSTONE PROGRAM, SOME STRING") == 0)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%s\n", Ptr_Glob->variant.var_1.Str_Comp);
+
+ printf ("Next_Ptr_Glob-> ");
+ printf (" Ptr_Comp: * %d", (int) Next_Ptr_Glob->Ptr_Comp);
+ printf (" same as above\n");
+
+ printf (" Discr: ");
+ if (Next_Ptr_Glob->Discr == 0)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d ", Next_Ptr_Glob->Discr);
+
+ printf ("Enum_Comp: ");
+ if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+
+ printf (" Int_Comp: ");
+ if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d ", Next_Ptr_Glob->variant.var_1.Int_Comp);
+
+ printf ("Str_Comp: ");
+ if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp,
+ "DHRYSTONE PROGRAM, SOME STRING") == 0)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
+
+ printf ("Int_1_Loc: ");
+ if (Int_1_Loc == 5)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d ", Int_1_Loc);
+
+ printf ("Int_2_Loc: ");
+ if (Int_2_Loc == 13)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d\n", Int_2_Loc);
+
+ printf ("Int_3_Loc: ");
+ if (Int_3_Loc == 7)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d ", Int_3_Loc);
+
+ printf ("Enum_Loc: ");
+ if (Enum_Loc == 1)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%d\n", Enum_Loc);
+
+ printf ("Str_1_Loc: ");
+ if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%s\n", Str_1_Loc);
+
+ printf ("Str_2_Loc: ");
+ if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)
+ printf ("O.K. ");
+ else printf ("WRONG ");
+ printf ("%s\n", Str_2_Loc);
+
+ printf ("\n");
+
+
+ if (User_Time < Too_Small_Time)
+ {
+ printf ("Measured time too small to obtain meaningful results\n");
+ printf ("Please increase number of runs\n");
+ printf ("\n");
+ }
+ else
+ {
+ Microseconds = User_Time * Mic_secs_Per_Second
+ / (double) Number_Of_Runs;
+ Dhrystones_Per_Second = (double) Number_Of_Runs / User_Time;
+ Vax_Mips = Dhrystones_Per_Second / 1757.0;
+
+ printf ("Microseconds for one run through Dhrystone: ");
+ printf ("%12.2lf \n", Microseconds);
+ printf ("Dhrystones per Second: ");
+ printf ("%10.0lf \n", Dhrystones_Per_Second);
+ printf ("VAX MIPS rating = ");
+ printf ("%12.2lf \n",Vax_Mips);
+ printf ("\n");
+
+/************************************************************************
+ * Type details of hardware, software etc. *
+ ************************************************************************/
+
+ if (getinput == 1)
+ {
+ printf ("Enter the following which will be added with results to file DHRY.TXT\n");
+ printf ("When submitting a number of results you need only provide details once\n");
+ printf ("but a cross reference such as an abbreviated CPU type would be useful.\n");
+ printf ("You can kill (exit or close) the program now and no data will be added.\n\n");
+
+ printf ("PC Supplier/model ? ");
+ gets(general[1]);
+
+ printf ("CPU chip ? ");
+ gets(general[2]);
+
+ printf ("Clock MHz ? ");
+ gets(general[3]);
+
+ printf ("Cache size ? ");
+ gets(general[4]);
+
+ printf ("Chipset & H/W options ? ");
+ gets(general[5]);
+
+ printf ("OS/DOS version ? ");
+ gets(general[6]);
+
+ printf ("Your name ? ");
+ gets(general[7]);
+
+ printf ("Company/Location ? ");
+ gets(general[8]);
+
+ printf ("E-mail address ? ");
+ gets(general[0]);
+ }
+/************************************************************************
+ * Add results to output file Dhry.txt *
+ ************************************************************************/
+ fprintf (Ap, "-------------------- -----------------------------------"
+ "\n");
+ fprintf (Ap, "Dhrystone Benchmark Version 2.1 (Language: C++)\n\n");
+ fprintf (Ap, "PC model %s\n", general[1]);
+ fprintf (Ap, "CPU %s\n", general[2]);
+ fprintf (Ap, "Clock MHz %s\n", general[3]);
+ fprintf (Ap, "Cache %s\n", general[4]);
+ fprintf (Ap, "Options %s\n", general[5]);
+ fprintf (Ap, "OS/DOS %s\n", general[6]);
+ fprintf (Ap, "Compiler %s\n", compiler);
+ fprintf (Ap, "OptLevel %s\n", options);
+ fprintf (Ap, "Run by %s\n", general[7]);
+ fprintf (Ap, "From %s\n", general[8]);
+ fprintf (Ap, "Mail %s\n\n", general[0]);
+
+ fprintf (Ap, "Final values (* implementation-dependent):\n");
+ fprintf (Ap, "\n");
+ fprintf (Ap, "Int_Glob: ");
+ if (Int_Glob == 5) fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Int_Glob);
+
+ fprintf (Ap, "Bool_Glob: ");
+ if (Bool_Glob == 1) fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Bool_Glob);
+
+ fprintf (Ap, "Ch_1_Glob: ");
+ if (Ch_1_Glob == 'A') fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%c\n", Ch_1_Glob);
+
+ fprintf (Ap, "Ch_2_Glob: ");
+ if (Ch_2_Glob == 'B') fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%c\n", Ch_2_Glob);
+
+ fprintf (Ap, "Arr_1_Glob[8]: ");
+ if (Arr_1_Glob[8] == 7) fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Arr_1_Glob[8]);
+
+ fprintf (Ap, "Arr_2_Glob8/7: ");
+ if (Arr_2_Glob[8][7] == Number_Of_Runs + 10)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%10d\n", Arr_2_Glob[8][7]);
+
+ fprintf (Ap, "Ptr_Glob-> \n");
+ fprintf (Ap, " Ptr_Comp: * %d\n", (int) Ptr_Glob->Ptr_Comp);
+
+ fprintf (Ap, " Discr: ");
+ if (Ptr_Glob->Discr == 0) fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Ptr_Glob->Discr);
+
+ fprintf (Ap, " Enum_Comp: ");
+ if (Ptr_Glob->variant.var_1.Enum_Comp == 2)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+
+ fprintf (Ap, " Int_Comp: ");
+ if (Ptr_Glob->variant.var_1.Int_Comp == 17) fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Ptr_Glob->variant.var_1.Int_Comp);
+
+ fprintf (Ap, " Str_Comp: ");
+ if (strcmp(Ptr_Glob->variant.var_1.Str_Comp,
+ "DHRYSTONE PROGRAM, SOME STRING") == 0)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%s\n", Ptr_Glob->variant.var_1.Str_Comp);
+
+ fprintf (Ap, "Next_Ptr_Glob-> \n");
+ fprintf (Ap, " Ptr_Comp: * %d", (int) Next_Ptr_Glob->Ptr_Comp);
+ fprintf (Ap, " same as above\n");
+
+ fprintf (Ap, " Discr: ");
+ if (Next_Ptr_Glob->Discr == 0)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Next_Ptr_Glob->Discr);
+
+ fprintf (Ap, " Enum_Comp: ");
+ if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+
+ fprintf (Ap, " Int_Comp: ");
+ if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
+
+ fprintf (Ap, " Str_Comp: ");
+ if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp,
+ "DHRYSTONE PROGRAM, SOME STRING") == 0)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
+
+ fprintf (Ap, "Int_1_Loc: ");
+ if (Int_1_Loc == 5)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Int_1_Loc);
+
+ fprintf (Ap, "Int_2_Loc: ");
+ if (Int_2_Loc == 13)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Int_2_Loc);
+
+ fprintf (Ap, "Int_3_Loc: ");
+ if (Int_3_Loc == 7)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Int_3_Loc);
+
+ fprintf (Ap, "Enum_Loc: ");
+ if (Enum_Loc == 1)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%d\n", Enum_Loc);
+
+ fprintf (Ap, "Str_1_Loc: ");
+ if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%s\n", Str_1_Loc);
+
+ fprintf (Ap, "Str_2_Loc: ");
+ if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)
+ fprintf (Ap, "O.K. ");
+ else fprintf (Ap, "WRONG ");
+ fprintf (Ap, "%s\n", Str_2_Loc);
+
+
+ fprintf (Ap, "\n");
+ fprintf(Ap,"%s\n",Reg_Define);
+ fprintf (Ap, "\n");
+ fprintf(Ap,"Microseconds 1 loop: %12.2lf\n",Microseconds);
+ fprintf(Ap,"Dhrystones / second: %10.0lf\n",Dhrystones_Per_Second);
+ fprintf(Ap,"VAX MIPS rating: %12.2lf\n\n",Vax_Mips);
+ fclose(Ap);
+ }
+
+ printf ("\n");
+ printf ("A new results file will have been created in the same directory as the\n");
+ printf (".EXE files if one did not already exist. If you made a mistake on input, \n");
+ printf ("you can use a text editor to correct it, delete the results or copy \n");
+ printf ("them to a different file name. If you intend to run multiple tests you\n");
+ printf ("you may wish to rename DHRY.TXT with a more informative title.\n\n");
+ printf ("Please submit feedback and results files as a posting in Section 12\n");
+ printf ("or to Roy_Longbottom@compuserve.com\n\n");
+
+ if (getinput == 1)
+ {
+ printf("Press any key to exit\n");
+ printf ("\nIf this is displayed you must close the window in the normal way\n");
+ }
+ }
+
+
+ void Proc_1 (REG Rec_Pointer Ptr_Val_Par)
+ /******************/
+
+ /* executed once */
+ {
+ REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
+ /* == Ptr_Glob_Next */
+ /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp, */
+ /* corresponds to "rename" in Ada, "with" in Pascal */
+
+ structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
+ Ptr_Val_Par->variant.var_1.Int_Comp = 5;
+ Next_Record->variant.var_1.Int_Comp
+ = Ptr_Val_Par->variant.var_1.Int_Comp;
+ Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
+ Proc_3 (&Next_Record->Ptr_Comp);
+ /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
+ == Ptr_Glob->Ptr_Comp */
+ if (Next_Record->Discr == Ident_1)
+ /* then, executed */
+ {
+ Next_Record->variant.var_1.Int_Comp = 6;
+ Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,
+ &Next_Record->variant.var_1.Enum_Comp);
+ Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
+ Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,
+ &Next_Record->variant.var_1.Int_Comp);
+ }
+ else /* not executed */
+ structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
+ } /* Proc_1 */
+
+
+ void Proc_2 (One_Fifty *Int_Par_Ref)
+ /******************/
+ /* executed once */
+ /* *Int_Par_Ref == 1, becomes 4 */
+
+ {
+ One_Fifty Int_Loc;
+ Enumeration Enum_Loc;
+
+ Int_Loc = *Int_Par_Ref + 10;
+ do /* executed once */
+ if (Ch_1_Glob == 'A')
+ /* then, executed */
+ {
+ Int_Loc -= 1;
+ *Int_Par_Ref = Int_Loc - Int_Glob;
+ Enum_Loc = Ident_1;
+ } /* if */
+ while (Enum_Loc != Ident_1); /* true */
+ } /* Proc_2 */
+
+
+ void Proc_3 (Rec_Pointer *Ptr_Ref_Par)
+ /******************/
+ /* executed once */
+ /* Ptr_Ref_Par becomes Ptr_Glob */
+
+ {
+ if (Ptr_Glob != Null)
+ /* then, executed */
+ *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
+ Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
+ } /* Proc_3 */
+
+
+void Proc_4 () /* without parameters */
+ /*******/
+ /* executed once */
+ {
+ Boolean Bool_Loc;
+
+ Bool_Loc = Ch_1_Glob == 'A';
+ Bool_Glob = Bool_Loc | Bool_Glob;
+ Ch_2_Glob = 'B';
+ } /* Proc_4 */
+
+
+ void Proc_5 () /* without parameters */
+ /*******/
+ /* executed once */
+ {
+ Ch_1_Glob = 'A';
+ Bool_Glob = false;
+ } /* Proc_5 */
+
+
+ /* Procedure for the assignment of structures, */
+ /* if the C compiler doesn't support this feature */
+ #ifdef NOSTRUCTASSIGN
+ memcpy (d, s, l)
+ register char *d;
+ register char *s;
+ register int l;
+ {
+ while (l--) *d++ = *s++;
+ }
+ #endif
+
+
+double dtime()
+{
+
+ /* #include <ctype.h> */
+
+ #define HZ CLOCKS_PER_SEC
+ clock_t tnow;
+
+ double q;
+ tnow = clock();
+ q = (double)tnow / (double)HZ;
+ return q;
+}
diff --git a/contrib/cortex-strings/benchmarks/dhry/dhry_2.c b/contrib/cortex-strings/benchmarks/dhry/dhry_2.c
new file mode 100644
index 000000000000..434945c99344
--- /dev/null
+++ b/contrib/cortex-strings/benchmarks/dhry/dhry_2.c
@@ -0,0 +1,186 @@
+ /*
+ *************************************************************************
+ *
+ * "DHRYSTONE" Benchmark Program
+ * -----------------------------
+ *
+ * Version: C, Version 2.1
+ *
+ * File: dhry_2.c (part 3 of 3)
+ *
+ * Date: May 25, 1988
+ *
+ * Author: Reinhold P. Weicker
+ *
+ *************************************************************************
+ */
+
+ #include "dhry.h"
+
+ #ifndef REG
+ #define REG
+ /* REG becomes defined as empty */
+ /* i.e. no register variables */
+ #else
+ #define REG register
+ #endif
+
+ extern int Int_Glob;
+ extern char Ch_1_Glob;
+
+ Boolean Func_3 (Enumeration Enum_Par_Val);
+
+ void Proc_6 (Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par)
+ /*********************************/
+ /* executed once */
+ /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
+
+ {
+ *Enum_Ref_Par = Enum_Val_Par;
+ if (! Func_3 (Enum_Val_Par))
+ /* then, not executed */
+ *Enum_Ref_Par = Ident_4;
+ switch (Enum_Val_Par)
+ {
+ case Ident_1:
+ *Enum_Ref_Par = Ident_1;
+ break;
+ case Ident_2:
+ if (Int_Glob > 100)
+ /* then */
+ *Enum_Ref_Par = Ident_1;
+ else *Enum_Ref_Par = Ident_4;
+ break;
+ case Ident_3: /* executed */
+ *Enum_Ref_Par = Ident_2;
+ break;
+ case Ident_4: break;
+ case Ident_5:
+ *Enum_Ref_Par = Ident_3;
+ break;
+ } /* switch */
+ } /* Proc_6 */
+
+
+ void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
+ One_Fifty *Int_Par_Ref)
+ /**********************************************/
+ /* executed three times */
+ /* first call: Int_1_Par_Val == 2, Int_2_Par_Val == 3, */
+ /* Int_Par_Ref becomes 7 */
+ /* second call: Int_1_Par_Val == 10, Int_2_Par_Val == 5, */
+ /* Int_Par_Ref becomes 17 */
+ /* third call: Int_1_Par_Val == 6, Int_2_Par_Val == 10, */
+ /* Int_Par_Ref becomes 18 */
+
+ {
+ One_Fifty Int_Loc;
+
+ Int_Loc = Int_1_Par_Val + 2;
+ *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
+ } /* Proc_7 */
+
+
+ void Proc_8 (Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref,
+ int Int_1_Par_Val, int Int_2_Par_Val)
+ /*********************************************************************/
+ /* executed once */
+ /* Int_Par_Val_1 == 3 */
+ /* Int_Par_Val_2 == 7 */
+
+ {
+ REG One_Fifty Int_Index;
+ REG One_Fifty Int_Loc;
+
+ Int_Loc = Int_1_Par_Val + 5;
+ Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
+ Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
+ Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
+ for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
+ Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
+ Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;
+ Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
+ Int_Glob = 5;
+ } /* Proc_8 */
+
+
+ Enumeration Func_1 (Capital_Letter Ch_1_Par_Val,
+ Capital_Letter Ch_2_Par_Val)
+ /*************************************************/
+ /* executed three times */
+ /* first call: Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R' */
+ /* second call: Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C' */
+ /* third call: Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C' */
+
+ {
+ Capital_Letter Ch_1_Loc;
+ Capital_Letter Ch_2_Loc;
+
+ Ch_1_Loc = Ch_1_Par_Val;
+ Ch_2_Loc = Ch_1_Loc;
+ if (Ch_2_Loc != Ch_2_Par_Val)
+ /* then, executed */
+ return (Ident_1);
+ else /* not executed */
+ {
+ Ch_1_Glob = Ch_1_Loc;
+ return (Ident_2);
+ }
+ } /* Func_1 */
+
+
+ Boolean Func_2 (Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref)
+ /*************************************************/
+ /* executed once */
+ /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
+ /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
+
+ {
+ REG One_Thirty Int_Loc;
+ Capital_Letter Ch_Loc;
+
+ Int_Loc = 2;
+ while (Int_Loc <= 2) /* loop body executed once */
+ if (Func_1 (Str_1_Par_Ref[Int_Loc],
+ Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
+ /* then, executed */
+ {
+ Ch_Loc = 'A';
+ Int_Loc += 1;
+ } /* if, while */
+ if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
+ /* then, not executed */
+ Int_Loc = 7;
+ if (Ch_Loc == 'R')
+ /* then, not executed */
+ return (true);
+ else /* executed */
+ {
+ if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
+ /* then, not executed */
+ {
+ Int_Loc += 7;
+ Int_Glob = Int_Loc;
+ return (true);
+ }
+ else /* executed */
+ return (false);
+ } /* if Ch_Loc */
+ } /* Func_2 */
+
+
+ Boolean Func_3 (Enumeration Enum_Par_Val)
+ /***************************/
+ /* executed once */
+ /* Enum_Par_Val == Ident_3 */
+
+ {
+ Enumeration Enum_Loc;
+
+ Enum_Loc = Enum_Par_Val;
+ if (Enum_Loc == Ident_3)
+ /* then, executed */
+ return (true);
+ else /* not executed */
+ return (false);
+ } /* Func_3 */
diff --git a/contrib/cortex-strings/benchmarks/multi/harness.c b/contrib/cortex-strings/benchmarks/multi/harness.c
new file mode 100644
index 000000000000..257a308e6b4e
--- /dev/null
+++ b/contrib/cortex-strings/benchmarks/multi/harness.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright (c) 2011, Linaro Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Linaro nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** A simple harness that times how long a string function takes to
+ * run.
+ */
+
+/* PENDING: Add EPL */
+
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <unistd.h>
+#include <errno.h>
+
+#define NUM_ELEMS(_x) (sizeof(_x) / sizeof((_x)[0]))
+
+#ifndef VERSION
+#define VERSION "(unknown version)"
+#endif
+
+/** Make sure a function is called by using the return value */
+#define SPOIL(_x) volatile long x = (long)(_x); (void)x
+
+/** Type of functions that can be tested */
+typedef void (*stub_t)(void *dest, void *src, size_t n);
+
+/** Meta data about one test */
+struct test
+{
+ /** Test name */
+ const char *name;
+ /** Function to test */
+ stub_t stub;
+};
+
+/** Flush the cache by reading a chunk of memory */
+static void empty(volatile char *against)
+{
+  /* We assume a 16 k cache with 64 byte lines, i.e. 256 lines in
+     total.  Reading sequentially through a separate region at 64 byte
+     strides should evict everything */
+ int offset = (1024 - 256)*1024;
+
+ for (int i = offset; i < offset + 16*1024*3; i += 64)
+ {
+ against[i];
+ }
+}
+
+/** Stub that does nothing. Used for calibrating */
+static void xbounce(void *dest, void *src, size_t n)
+{
+ SPOIL(0);
+}
+
+/** Stub that calls memcpy */
+static void xmemcpy(void *dest, void *src, size_t n)
+{
+ SPOIL(memcpy(dest, src, n));
+}
+
+/** Stub that calls memset */
+static void xmemset(void *dest, void *src, size_t n)
+{
+ SPOIL(memset(dest, 0, n));
+}
+
+/** Stub that calls memcmp */
+static void xmemcmp(void *dest, void *src, size_t n)
+{
+ SPOIL(memcmp(dest, src, n));
+}
+
+/** Stub that calls strcpy */
+static void xstrcpy(void *dest, void *src, size_t n)
+{
+ SPOIL(strcpy(dest, src));
+}
+
+/** Stub that calls strlen */
+static void xstrlen(void *dest, void *src, size_t n)
+{
+ SPOIL(strlen(dest));
+}
+
+/** Stub that calls strcmp */
+static void xstrcmp(void *dest, void *src, size_t n)
+{
+ SPOIL(strcmp(dest, src));
+}
+
+/** Stub that calls strchr */
+static void xstrchr(void *dest, void *src, size_t n)
+{
+ /* Put the character at the end of the string and before the null */
+ ((char *)src)[n-1] = 32;
+ SPOIL(strchr(src, 32));
+}
+
+/** Stub that calls memchr */
+static void xmemchr(void *dest, void *src, size_t n)
+{
+ /* Put the character at the end of the block */
+ ((char *)src)[n-1] = 32;
+ SPOIL(memchr(src, 32, n));
+}
+
+/** All functions that can be tested */
+static const struct test tests[] =
+ {
+ { "bounce", xbounce },
+ { "memchr", xmemchr },
+ { "memcpy", xmemcpy },
+ { "memset", xmemset },
+ { "memcmp", xmemcmp },
+ { "strchr", xstrchr },
+ { "strcmp", xstrcmp },
+ { "strcpy", xstrcpy },
+ { "strlen", xstrlen },
+ { NULL }
+ };
+
+/** Show basic usage */
+static void usage(const char* name)
+{
+ printf("%s %s: run a string related benchmark.\n"
+ "usage: %s [-c block-size] [-l loop-count] [-a alignment|src_alignment:dst_alignment] [-f] [-t test-name] [-r run-id]\n"
+ , name, VERSION, name);
+
+ printf("Tests:");
+
+ for (const struct test *ptest = tests; ptest->name != NULL; ptest++)
+ {
+ printf(" %s", ptest->name);
+ }
+
+ printf("\n");
+
+ exit(-1);
+}
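
For example, assuming the binaries are built with the try- prefix used elsewhere in this tree, an invocation such as "./try-this -t memcpy -c 1024 -l 1000000 -a 8:32 -r 0" times one million memcpy calls on 1024-byte blocks with the source aligned to 8 bytes and the destination aligned to 32; adding -f also flushes the cache between calls.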
+
+/** Find the test by name */
+static const struct test *find_test(const char *name)
+{
+ if (name == NULL)
+ {
+ return tests + 0;
+ }
+ else
+ {
+ for (const struct test *p = tests; p->name != NULL; p++)
+ {
+ if (strcmp(p->name, name) == 0)
+ {
+ return p;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+#define MIN_BUFFER_SIZE 1024*1024
+#define MAX_ALIGNMENT 256
+
+/** Take a pointer and ensure that the lower bits == alignment */
+static char *realign(char *p, int alignment)
+{
+ uintptr_t pp = (uintptr_t)p;
+ pp = (pp + (MAX_ALIGNMENT - 1)) & ~(MAX_ALIGNMENT - 1);
+ pp += alignment;
+
+ return (char *)pp;
+}
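
As a worked example of the fix-up above: with MAX_ALIGNMENT of 256, a pointer of 0x100f and a requested alignment of 8 is first rounded up to 0x1100 and then offset by 8, giving 0x1108, so the low bits of the result equal the requested alignment.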
+
+static int parse_int_arg(const char *arg, const char *exe_name)
+{
+ long int ret;
+
+ errno = 0;
+ ret = strtol(arg, NULL, 0);
+
+ if (errno)
+ {
+ usage(exe_name);
+ }
+
+ return (int)ret;
+}
+
+static void parse_alignment_arg(const char *arg, const char *exe_name,
+ int *src_alignment, int *dst_alignment)
+{
+ long int ret;
+ char *endptr;
+
+ errno = 0;
+ ret = strtol(arg, &endptr, 0);
+
+ if (errno)
+ {
+ usage(exe_name);
+ }
+
+ *src_alignment = (int)ret;
+
+ if (ret > 256 || ret < 1)
+ {
+ printf("Alignment should be in the range [1, 256].\n");
+ usage(exe_name);
+ }
+
+ if (ret == 256)
+ ret = 0;
+
+ if (endptr && *endptr == ':')
+ {
+ errno = 0;
+ ret = strtol(endptr + 1, NULL, 0);
+
+ if (errno)
+ {
+ usage(exe_name);
+ }
+
+ if (ret > 256 || ret < 1)
+ {
+ printf("Alignment should be in the range [1, 256].\n");
+ usage(exe_name);
+ }
+
+ if (ret == 256)
+ ret = 0;
+ }
+
+ *dst_alignment = (int)ret;
+}
+
+/** Setup and run a test */
+int main(int argc, char **argv)
+{
+ /* Size of src and dest buffers */
+ size_t buffer_size = MIN_BUFFER_SIZE;
+
+ /* Number of bytes per call */
+ int count = 31;
+ /* Number of times to run */
+ int loops = 10000000;
+ /* True to flush the cache each time */
+ int flush = 0;
+ /* Name of the test */
+ const char *name = NULL;
+ /* Alignment of buffers */
+ int src_alignment = 8;
+ int dst_alignment = 8;
+ /* Name of the run */
+ const char *run_id = "0";
+
+ int opt;
+
+ while ((opt = getopt(argc, argv, "c:l:ft:r:hva:")) > 0)
+ {
+ switch (opt)
+ {
+ case 'c':
+ count = parse_int_arg(optarg, argv[0]);
+ break;
+ case 'l':
+ loops = parse_int_arg(optarg, argv[0]);
+ break;
+ case 'a':
+ parse_alignment_arg(optarg, argv[0], &src_alignment, &dst_alignment);
+ break;
+ case 'f':
+ flush = 1;
+ break;
+ case 't':
+ name = strdup(optarg);
+ break;
+ case 'r':
+ run_id = strdup(optarg);
+ break;
+ case 'h':
+ usage(argv[0]);
+ break;
+ default:
+ usage(argv[0]);
+ break;
+ }
+ }
+
+ /* Find the test by name */
+ const struct test *ptest = find_test(name);
+
+ if (ptest == NULL)
+ {
+ usage(argv[0]);
+ }
+
+ if (count + MAX_ALIGNMENT * 2 > MIN_BUFFER_SIZE)
+ {
+ buffer_size = count + MAX_ALIGNMENT * 2;
+ }
+
+ /* Buffers to read and write from */
+ char *src = malloc(buffer_size);
+ char *dest = malloc(buffer_size);
+
+ assert(src != NULL && dest != NULL);
+
+ src = realign(src, src_alignment);
+ dest = realign(dest, dst_alignment);
+
+  /* Fill the buffer with non-zero, reproducible random data */
+ srandom(1539);
+
+ for (int i = 0; i < buffer_size; i++)
+ {
+ src[i] = (char)random() | 1;
+ dest[i] = src[i];
+ }
+
+ /* Make sure the buffers are null terminated for any string tests */
+ src[count] = 0;
+ dest[count] = 0;
+
+ struct timespec start, end;
+ int err = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
+ assert(err == 0);
+
+ /* Preload */
+ stub_t stub = ptest->stub;
+
+ /* Run two variants to reduce the cost of testing for the flush */
+ if (flush == 0)
+ {
+ for (int i = 0; i < loops; i++)
+ {
+ (*stub)(dest, src, count);
+ }
+ }
+ else
+ {
+ for (int i = 0; i < loops; i++)
+ {
+ (*stub)(dest, src, count);
+ empty(dest);
+ }
+ }
+
+ err = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
+ assert(err == 0);
+
+ /* Drop any leading path and pull the variant name out of the executable */
+ char *variant = strrchr(argv[0], '/');
+
+ if (variant == NULL)
+ {
+ variant = argv[0];
+ }
+
+ variant = strstr(variant, "try-");
+ assert(variant != NULL);
+
+ double elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) * 1e-9;
+ /* Estimate the bounce time. Measured on a Panda. */
+ double bounced = 0.448730 * loops / 50000000;
+
+ /* Dump both machine and human readable versions */
+ printf("%s:%s:%u:%u:%d:%d:%s:%.6f: took %.6f s for %u calls to %s of %u bytes. ~%.3f MB/s corrected.\n",
+ variant + 4, ptest->name,
+ count, loops, src_alignment, dst_alignment, run_id,
+ elapsed,
+ elapsed, loops, ptest->name, count,
+ (double)loops*count/(elapsed - bounced)/(1024*1024));
+
+ return 0;
+}
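
To make the corrected figure above concrete: with the default 31-byte block and 10,000,000 loops, the estimated bounce overhead is 0.448730 * 10,000,000 / 50,000,000, roughly 0.09 s, so a run that takes 2.5 s of CPU time reports about 10,000,000 * 31 / (2.5 - 0.09) / 2^20, or roughly 123 MB/s.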
diff --git a/contrib/cortex-strings/configure.ac b/contrib/cortex-strings/configure.ac
new file mode 100644
index 000000000000..56f1ced94299
--- /dev/null
+++ b/contrib/cortex-strings/configure.ac
@@ -0,0 +1,88 @@
+# Copyright (c) 2011-2012, Linaro Limited
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the Linaro nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+AC_INIT(cortex-strings, 1.1-2012.06~dev)
+AM_INIT_AUTOMAKE(foreign subdir-objects color-tests dist-bzip2)
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_FILES(Makefile)
+AC_CANONICAL_HOST
+AM_PROG_AS
+AC_PROG_CC
+AC_PROG_LIBTOOL
+
+default_submachine=
+
+case $host in
+aarch64*-*-*)
+ arch=aarch64
+ ;;
+arm*-*-*)
+ arch=aarch32
+ default_submachine=cortex-a9
+ ;;
+x86_64-*-*-*)
+ arch=generic
+ ;;
+*)
+ AC_MSG_ERROR([unknown architecture $host])
+ ;;
+esac
+
+AM_CONDITIONAL([HOST_AARCH32], [test x$arch = xaarch32])
+AM_CONDITIONAL([HOST_AARCH64], [test x$arch = xaarch64])
+AM_CONDITIONAL([HOST_GENERIC], [test x$arch = xgeneric])
+
+AC_ARG_WITH([cpu],
+ AS_HELP_STRING([--with-cpu=CPU],
+			   [select code for CPU variant @<:@default=cortex-a9@:>@]),
+ [dnl
+ case "$withval" in
+ yes|'') AC_MSG_ERROR([--with-cpu requires an argument]) ;;
+ no) ;;
+ *) submachine="$withval" ;;
+ esac
+],
+[submachine=$default_submachine])
+
+AC_SUBST(submachine)
+AM_CONDITIONAL([WITH_SUBMACHINE], [test x$submachine != x])
+
+AC_ARG_WITH([neon],
+ AC_HELP_STRING([--with-neon],
+ [include NEON specific routines @<:@default=yes@:>@]),
+ [with_neon=$withval],
+ [with_neon=yes])
+AC_SUBST(with_neon)
+AM_CONDITIONAL(WITH_NEON, test x$with_neon = xyes)
+
+AC_ARG_WITH([vfp],
+ AC_HELP_STRING([--with-vfp],
+ [include VFP specific routines @<:@default=yes@:>@]),
+ [with_vfp=$withval],
+ [with_vfp=yes])
+AC_SUBST(with_vfp)
+AM_CONDITIONAL(WITH_VFP, test x$with_vfp = xyes)
+
+AC_OUTPUT
diff --git a/contrib/cortex-strings/scripts/add-license.sh b/contrib/cortex-strings/scripts/add-license.sh
new file mode 100755
index 000000000000..8a6c0710fbbe
--- /dev/null
+++ b/contrib/cortex-strings/scripts/add-license.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#
+# Add the modified BSD license to a file
+#
+
+f=`mktemp -d`
+trap "rm -rf $f" EXIT
+
+year=`date +%Y`
+cat > $f/original <<EOF
+Copyright (c) $year, Linaro Limited
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+EOF
+
+# Translate it to C style
+echo "/*" > $f/c
+sed -r 's/(.*)/ * \1/' $f/original | sed -r 's/ +$//' >> $f/c
+echo " */" >> $f/c
+echo >> $f/c
+
+# ...and shell style
+sed -r 's/(.*)/# \1/' $f/original | sed -r 's/ +$//' >> $f/shell
+echo '#' >> $f/shell
+echo >> $f/shell
+
+for name in $@; do
+ if grep -q Copyright $name; then
+ echo $name already has some type of copyright
+ continue
+ fi
+
+ case $name in
+ # These files don't have an explicit license
+ *autogen.sh*)
+ continue;;
+ *reference/newlib/*)
+ continue;;
+ *reference/newlib-xscale/*)
+ continue;;
+ */dhry/*)
+ continue;;
+
+ *.c)
+ src=$f/c
+ ;;
+ *.sh|*.am|*.ac)
+ src=$f/shell
+ ;;
+ *)
+	    echo Unrecognized extension on $name
+ continue
+ esac
+
+ cat $src $name > $f/next
+ mv $f/next $name
+ echo Updated $name
+done
diff --git a/contrib/cortex-strings/scripts/bench.py b/contrib/cortex-strings/scripts/bench.py
new file mode 100644
index 000000000000..476a5322a747
--- /dev/null
+++ b/contrib/cortex-strings/scripts/bench.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+
+"""Simple harness that benchmarks different variants of the routines,
+caches the results, and emits all of the records at the end.
+
+Results are generated for different values of:
+ * Source
+ * Routine
+ * Length
+ * Alignment
+"""
+
+import argparse
+import subprocess
+import math
+import sys
+
+# Prefix to the executables
+build = '../build/try-'
+
+ALL = 'memchr memcmp memcpy memset strchr strcmp strcpy strlen'
+
+HAS = {
+ 'this': 'bounce memchr memcpy memset strchr strcmp strcpy strlen',
+ 'bionic-a9': 'memcmp memcpy memset strcmp strcpy strlen',
+ 'bionic-a15': 'memcmp memcpy memset strcmp strcpy strlen',
+ 'bionic-c': ALL,
+ 'csl': 'memcpy memset',
+ 'glibc': 'memcpy memset strchr strlen',
+ 'glibc-c': ALL,
+ 'newlib': 'memcpy strcmp strcpy strlen',
+ 'newlib-c': ALL,
+ 'newlib-xscale': 'memchr memcpy memset strchr strcmp strcpy strlen',
+ 'plain': 'memset memcpy strcmp strcpy',
+}
+
+BOUNCE_ALIGNMENTS = ['1']
+SINGLE_BUFFER_ALIGNMENTS = ['1', '2', '4', '8', '16', '32']
+DUAL_BUFFER_ALIGNMENTS = ['1:32', '2:32', '4:32', '8:32', '16:32', '32:32']
+
+ALIGNMENTS = {
+ 'bounce': BOUNCE_ALIGNMENTS,
+ 'memchr': SINGLE_BUFFER_ALIGNMENTS,
+ 'memset': SINGLE_BUFFER_ALIGNMENTS,
+ 'strchr': SINGLE_BUFFER_ALIGNMENTS,
+ 'strlen': SINGLE_BUFFER_ALIGNMENTS,
+ 'memcmp': DUAL_BUFFER_ALIGNMENTS,
+ 'memcpy': DUAL_BUFFER_ALIGNMENTS,
+ 'strcmp': DUAL_BUFFER_ALIGNMENTS,
+ 'strcpy': DUAL_BUFFER_ALIGNMENTS,
+}
+
+VARIANTS = sorted(HAS.keys())
+FUNCTIONS = sorted(ALIGNMENTS.keys())
+
+NUM_RUNS = 5
+
+def run(cache, variant, function, bytes, loops, alignment, run_id, quiet=False):
+ """Perform a single run, exercising the cache as appropriate."""
+ key = ':'.join('%s' % x for x in (variant, function, bytes, loops, alignment, run_id))
+
+ if key in cache:
+ got = cache[key]
+ else:
+ xbuild = build
+ cmd = '%(xbuild)s%(variant)s -t %(function)s -c %(bytes)s -l %(loops)s -a %(alignment)s -r %(run_id)s' % locals()
+
+ try:
+ got = subprocess.check_output(cmd.split()).strip()
+ except OSError, ex:
+ assert False, 'Error %s while running %s' % (ex, cmd)
+
+ parts = got.split(':')
+ took = float(parts[7])
+
+ cache[key] = got
+
+ if not quiet:
+ print got
+ sys.stdout.flush()
+
+ return took
+
+def run_many(cache, variants, bytes, all_functions):
+ # We want the data to come out in a useful order. So fix an
+ # alignment and function, and do all sizes for a variant first
+ bytes = sorted(bytes)
+ mid = bytes[int(len(bytes)/1.5)]
+
+ if not all_functions:
+ # Use the ordering in 'this' as the default
+ all_functions = HAS['this'].split()
+
+ # Find all other functions
+ for functions in HAS.values():
+ for function in functions.split():
+ if function not in all_functions:
+ all_functions.append(function)
+
+ for function in all_functions:
+ for alignment in ALIGNMENTS[function]:
+ for variant in variants:
+ if function not in HAS[variant].split():
+ continue
+
+                # Do one timed run at a mid-sized block first and
+                # adjust the number of loops based on how long it
+                # takes.  Not great for memchr() and similar O(n)
+                # routines, but it will do
+ f = 50000000
+ want = 5.0
+
+ loops = int(f / math.sqrt(max(1, mid)))
+ took = run(cache, variant, function, mid, loops, alignment, 0,
+ quiet=True)
+ # Keep it reasonable for silly routines like bounce
+ factor = min(20, max(0.05, want/took))
+ f = f * factor
+
+ # Round f to a few significant figures
+ scale = 10**int(math.log10(f) - 1)
+ f = scale*int(f/scale)
+
+ for b in sorted(bytes):
+ # Figure out the number of loops to give a roughly consistent run
+ loops = int(f / math.sqrt(max(1, b)))
+ for run_id in range(0, NUM_RUNS):
+ run(cache, variant, function, b, loops, alignment,
+ run_id)
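
As a concrete example of the scaling above: if the tracer run at a mid-sized block of 32768 bytes takes 2.0 s against the 5.0 s target, the factor is 2.5, f grows from 50,000,000 to 125,000,000 and is rounded down to 120,000,000, and a later 1024-byte run then gets int(120,000,000 / sqrt(1024)) = 3,750,000 loops.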
+
+def run_top(cache):
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-v", "--variants", nargs="+", help="library variant to run (run all if not specified)", default = VARIANTS, choices = VARIANTS)
+ parser.add_argument("-f", "--functions", nargs="+", help="function to run (run all if not specified)", default = FUNCTIONS, choices = FUNCTIONS)
+ parser.add_argument("-l", "--limit", type=int, help="upper limit to test to (in bytes)", default = 512*1024)
+ args = parser.parse_args()
+
+ # Test all powers of 2
+ step1 = 2.0
+ # Test intermediate powers of 1.4
+ step2 = 1.4
+
+ bytes = []
+
+ for step in [step1, step2]:
+ if step:
+ # Figure out how many steps get us up to the top
+ steps = int(round(math.log(args.limit) / math.log(step)))
+ bytes.extend([int(step**x) for x in range(0, steps+1)])
+
+ run_many(cache, args.variants, bytes, args.functions)
+
+def main():
+ cachename = 'cache.txt'
+
+ cache = {}
+
+ try:
+ with open(cachename) as f:
+ for line in f:
+ line = line.strip()
+ parts = line.split(':')
+ cache[':'.join(parts[:7])] = line
+ except:
+ pass
+
+ try:
+ run_top(cache)
+ finally:
+ with open(cachename, 'w') as f:
+ for line in sorted(cache.values()):
+ print >> f, line
+
+if __name__ == '__main__':
+ main()
diff --git a/contrib/cortex-strings/scripts/fixup.py b/contrib/cortex-strings/scripts/fixup.py
new file mode 100644
index 000000000000..003783a49838
--- /dev/null
+++ b/contrib/cortex-strings/scripts/fixup.py
@@ -0,0 +1,27 @@
+"""Simple script that enables target specific blocks based on the first argument.
+
+Matches comment blocks like this:
+
+/* For Foo: abc
+def
+*/
+
+and de-comments them giving:
+abc
+def
+"""
+import re
+import sys
+
+def main():
+ key = sys.argv[1]
+ expr = re.compile(r'/\* For %s:\s([^*]+)\*/' % key, re.M)
+
+ for arg in sys.argv[2:]:
+ with open(arg) as f:
+ body = f.read()
+ with open(arg, 'w') as f:
+ f.write(expr.sub(r'\1', body))
+
+if __name__ == '__main__':
+ main()
diff --git a/contrib/cortex-strings/scripts/libplot.py b/contrib/cortex-strings/scripts/libplot.py
new file mode 100644
index 000000000000..034ffd331a59
--- /dev/null
+++ b/contrib/cortex-strings/scripts/libplot.py
@@ -0,0 +1,78 @@
+"""Shared routines for the plotters."""
+
+import fileinput
+import collections
+
+Record = collections.namedtuple('Record', 'variant function bytes loops src_alignment dst_alignment run_id elapsed rest')
+
+
+def make_colours():
+ return iter('m b g r c y k pink orange brown grey'.split())
+
+def parse_value(v):
+ """Turn text into a primitive"""
+ try:
+ if '.' in v:
+ return float(v)
+ else:
+ return int(v)
+ except ValueError:
+ return v
+
+def create_column_tuple(record, names):
+ cols = [getattr(record, name) for name in names]
+ return tuple(cols)
+
+def unique(records, name, prefer=''):
+ """Return the unique values of a column in the records"""
+ if type(name) == tuple:
+ values = list(set(create_column_tuple(x, name) for x in records))
+ else:
+ values = list(set(getattr(x, name) for x in records))
+
+ if not values:
+ return values
+ elif type(values[0]) == str:
+ return sorted(values, key=lambda x: '%-06d|%s' % (-prefer.find(x), x))
+ else:
+ return sorted(values)
+
+def alignments_equal(alignments):
+ for alignment in alignments:
+ if alignment[0] != alignment[1]:
+ return False
+ return True
+
+def parse_row(line):
+ return Record(*[parse_value(y) for y in line.split(':')])
+
+def parse():
+ """Parse a record file into named tuples, correcting for loop
+ overhead along the way.
+ """
+ records = [parse_row(x) for x in fileinput.input()]
+
+ # Pull out any bounce values
+ costs = {}
+
+ for record in [x for x in records if x.function=='bounce']:
+ costs[(record.bytes, record.loops)] = record.elapsed
+
+ # Fix up all of the records for cost
+ out = []
+
+ for record in records:
+ if record.function == 'bounce':
+ continue
+
+ cost = costs.get((record.bytes, record.loops), None)
+
+ if not cost:
+ out.append(record)
+ else:
+ # Unfortunately you can't update a namedtuple...
+ values = list(record)
+ values[-2] -= cost
+ out.append(Record(*values))
+
+ return out
diff --git a/contrib/cortex-strings/scripts/plot-align.py b/contrib/cortex-strings/scripts/plot-align.py
new file mode 100644
index 000000000000..524aa20a6c12
--- /dev/null
+++ b/contrib/cortex-strings/scripts/plot-align.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+"""Plot the performance of different variants of one routine versus alignment.
+"""
+
+import libplot
+
+import pylab
+
+
+def plot(records, bytes, function):
+ records = [x for x in records if x.bytes==bytes and x.function==function]
+
+ variants = libplot.unique(records, 'variant', prefer='this')
+ alignments = libplot.unique(records, ('src_alignment', 'dst_alignment'))
+
+ X = pylab.arange(len(alignments))
+ width = 1.0/(len(variants)+1)
+
+ colours = libplot.make_colours()
+
+ pylab.figure(1).set_size_inches((16, 12))
+ pylab.clf()
+
+ for i, variant in enumerate(variants):
+ heights = []
+
+ for alignment in alignments:
+ matches = [x for x in records if x.variant==variant and x.src_alignment==alignment[0] and x.dst_alignment==alignment[1]]
+
+ if matches:
+ vals = [match.bytes*match.loops/match.elapsed/(1024*1024) for
+ match in matches]
+ mean = sum(vals)/len(vals)
+ heights.append(mean)
+ else:
+ heights.append(0)
+
+ pylab.bar(X+i*width, heights, width, color=colours.next(), label=variant)
+
+
+ axes = pylab.axes()
+ if libplot.alignments_equal(alignments):
+ alignment_labels = ["%s" % x[0] for x in alignments]
+ else:
+ alignment_labels = ["%s:%s" % (x[0], x[1]) for x in alignments]
+ axes.set_xticklabels(alignment_labels)
+ axes.set_xticks(X + 0.5)
+
+ pylab.title('Performance of different variants of %(function)s for %(bytes)d byte blocks' % locals())
+ pylab.xlabel('Alignment')
+ pylab.ylabel('Rate (MB/s)')
+ pylab.legend(loc='lower right', ncol=3)
+ pylab.grid()
+ pylab.savefig('alignment-%(function)s-%(bytes)d.png' % locals(), dpi=72)
+
+def main():
+ records = libplot.parse()
+
+ for function in libplot.unique(records, 'function'):
+ for bytes in libplot.unique(records, 'bytes'):
+ plot(records, bytes, function)
+
+ pylab.show()
+
+if __name__ == '__main__':
+ main()
diff --git a/contrib/cortex-strings/scripts/plot-sizes.py b/contrib/cortex-strings/scripts/plot-sizes.py
new file mode 100644
index 000000000000..26a22bc4d6ef
--- /dev/null
+++ b/contrib/cortex-strings/scripts/plot-sizes.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+"""Plot the performance for different block sizes of one function across
+variants.
+"""
+
+import libplot
+
+import pylab
+import pdb
+import math
+
+def pretty_kb(v):
+ if v < 1024:
+ return '%d' % v
+ else:
+ if v % 1024 == 0:
+ return '%d k' % (v//1024)
+ else:
+            return '%.1f k' % (v/1024.0)
+
+def plot(records, function, alignment=None, scale=1):
+ variants = libplot.unique(records, 'variant', prefer='this')
+ records = [x for x in records if x.function==function]
+
+ if alignment != None:
+ records = [x for x in records if x.src_alignment==alignment[0] and
+ x.dst_alignment==alignment[1]]
+
+ alignments = libplot.unique(records, ('src_alignment', 'dst_alignment'))
+ if len(alignments) != 1:
+ return False
+ if libplot.alignments_equal(alignments):
+ aalignment = alignments[0][0]
+ else:
+ aalignment = "%s:%s" % (alignments[0][0], alignments[0][1])
+
+ bytes = libplot.unique(records, 'bytes')[0]
+
+ colours = libplot.make_colours()
+ all_x = []
+
+ pylab.figure(1).set_size_inches((6.4*scale, 4.8*scale))
+ pylab.clf()
+
+ if 'str' in function:
+ # The harness fills out to 16k. Anything past that is an
+ # early match
+ top = 16384
+ else:
+ top = 2**31
+
+ for variant in variants:
+ matches = [x for x in records if x.variant==variant and x.bytes <= top]
+ matches.sort(key=lambda x: x.bytes)
+
+ X = sorted(list(set([x.bytes for x in matches])))
+ Y = []
+ Yerr = []
+ for xbytes in X:
+ vals = [x.bytes*x.loops/x.elapsed/(1024*1024) for x in matches if x.bytes == xbytes]
+ if len(vals) > 1:
+ mean = sum(vals)/len(vals)
+ Y.append(mean)
+ if len(Yerr) == 0:
+ Yerr = [[], []]
+ err1 = max(vals) - mean
+ assert err1 >= 0
+ err2 = min(vals) - mean
+ assert err2 <= 0
+ Yerr[0].append(abs(err2))
+ Yerr[1].append(err1)
+ else:
+ Y.append(vals[0])
+
+ all_x.extend(X)
+ colour = colours.next()
+
+ if X:
+ pylab.plot(X, Y, c=colour)
+ if len(Yerr) > 0:
+ pylab.errorbar(X, Y, yerr=Yerr, c=colour, label=variant, fmt='o')
+ else:
+ pylab.scatter(X, Y, c=colour, label=variant, edgecolors='none')
+
+ pylab.legend(loc='upper left', ncol=3, prop={'size': 'small'})
+ pylab.grid()
+ pylab.title('%(function)s of %(aalignment)s byte aligned blocks' % locals())
+ pylab.xlabel('Size (B)')
+ pylab.ylabel('Rate (MB/s)')
+
+ # Figure out how high the range goes
+ top = max(all_x)
+
+ power = int(round(math.log(max(all_x)) / math.log(2)))
+
+ pylab.semilogx()
+
+ pylab.axes().set_xticks([2**x for x in range(0, power+1)])
+ pylab.axes().set_xticklabels([pretty_kb(2**x) for x in range(0, power+1)])
+ pylab.xlim(0, top)
+ pylab.ylim(0, pylab.ylim()[1])
+ return True
+
+def main():
+ records = libplot.parse()
+
+ functions = libplot.unique(records, 'function')
+ alignments = libplot.unique(records, ('src_alignment', 'dst_alignment'))
+
+ for function in functions:
+ for alignment in alignments:
+ for scale in [1, 2.5]:
+ if plot(records, function, alignment, scale):
+ pylab.savefig('sizes-%s-%02d-%02d-%.1f.png' % (function, alignment[0], alignment[1], scale), dpi=72)
+
+ pylab.show()
+
+if __name__ == '__main__':
+ main()
diff --git a/contrib/cortex-strings/scripts/plot-top.py b/contrib/cortex-strings/scripts/plot-top.py
new file mode 100644
index 000000000000..4095239ac815
--- /dev/null
+++ b/contrib/cortex-strings/scripts/plot-top.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+"""Plot the performance of different variants of the string routines
+for one size.
+"""
+
+import libplot
+
+import pylab
+
+
+def plot(records, bytes):
+ records = [x for x in records if x.bytes==bytes]
+
+ variants = libplot.unique(records, 'variant', prefer='this')
+ functions = libplot.unique(records, 'function')
+
+ X = pylab.arange(len(functions))
+ width = 1.0/(len(variants)+1)
+
+ colours = libplot.make_colours()
+
+ pylab.figure(1).set_size_inches((16, 12))
+ pylab.clf()
+
+ for i, variant in enumerate(variants):
+ heights = []
+
+ for function in functions:
+ matches = [x for x in records if x.variant==variant and x.function==function and x.src_alignment==8]
+
+ if matches:
+ vals = [match.bytes*match.loops/match.elapsed/(1024*1024) for
+ match in matches]
+ mean = sum(vals)/len(vals)
+ heights.append(mean)
+ else:
+ heights.append(0)
+
+ pylab.bar(X+i*width, heights, width, color=colours.next(), label=variant)
+
+ axes = pylab.axes()
+ axes.set_xticklabels(functions)
+ axes.set_xticks(X + 0.5)
+
+ pylab.title('Performance of different variants for %d byte blocks' % bytes)
+ pylab.ylabel('Rate (MB/s)')
+ pylab.legend(loc='upper left', ncol=3)
+ pylab.grid()
+ pylab.savefig('top-%06d.png' % bytes, dpi=72)
+
+def main():
+ records = libplot.parse()
+
+ for bytes in libplot.unique(records, 'bytes'):
+ plot(records, bytes)
+
+ pylab.show()
+
+if __name__ == '__main__':
+ main()
diff --git a/contrib/cortex-strings/scripts/plot.py b/contrib/cortex-strings/scripts/plot.py
new file mode 100644
index 000000000000..aa2bb1adb560
--- /dev/null
+++ b/contrib/cortex-strings/scripts/plot.py
@@ -0,0 +1,123 @@
+"""Plot the results for each test. Spits out a set of images into the
+current directory.
+"""
+
+import libplot
+
+import fileinput
+import collections
+import pprint
+
+import pylab
+
+Record = collections.namedtuple('Record', 'variant test size loops src_alignment dst_alignment run_id rawtime comment time bytes rate')
+
+def unique(rows, name):
+ """Takes a list of values, pulls out the named field, and returns
+ a list of the unique values of this field.
+ """
+ return sorted(set(getattr(x, name) for x in rows))
+
+def to_float(v):
+ """Convert a string into a better type.
+
+ >>> to_float('foo')
+ 'foo'
+ >>> to_float('1.23')
+ 1.23
+ >>> to_float('45')
+ 45
+ """
+ try:
+ if '.' in v:
+ return float(v)
+ else:
+ return int(v)
+ except:
+ return v
+
+def parse():
+ # Split the input up
+ rows = [x.strip().split(':') for x in fileinput.input()]
+ # Automatically turn numbers into the base type
+ rows = [[to_float(y) for y in x] for x in rows]
+
+ # Scan once to calculate the overhead
+ r = [Record(*(x + [0, 0, 0])) for x in rows]
+ bounces = pylab.array([(x.loops, x.rawtime) for x in r if x.test == 'bounce'])
+ fit = pylab.polyfit(bounces[:,0], bounces[:,1], 1)
+
+ records = []
+
+ for row in rows:
+ # Make a dummy record so we can use the names
+ r1 = Record(*(row + [0, 0, 0]))
+
+ bytes = r1.size * r1.loops
+ # Calculate the bounce time
+ delta = pylab.polyval(fit, [r1.loops])
+ time = r1.rawtime - delta
+ rate = bytes / time
+
+ records.append(Record(*(row + [time, bytes, rate])))
+
+ return records
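
In other words, the bounce records are used to fit a linear model rawtime ≈ slope * loops + intercept, and polyval evaluates that model at each record's loop count so the estimated call overhead can be subtracted before the rate is computed.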
+
+def plot(records, field, scale, ylabel):
+ variants = unique(records, 'variant')
+ tests = unique(records, 'test')
+
+ colours = libplot.make_colours()
+
+ # A little hack. We want the 'all' record to be drawn last so
+ # that it's obvious on the graph. Assume that no tests come
+ # before it alphabetically
+ variants.reverse()
+
+ for test in tests:
+ for variant in variants:
+ v = [x for x in records if x.test==test and x.variant==variant]
+ v.sort(key=lambda x: x.size)
+ V = pylab.array([(x.size, getattr(x, field)) for x in v])
+
+ # Ensure our results appear
+ order = 1 if variant == 'this' else 0
+
+ try:
+ # A little hack. We want the 'all' to be obvious on
+ # the graph
+ if variant == 'all':
+ pylab.scatter(V[:,0], V[:,1]/scale, label=variant)
+ pylab.plot(V[:,0], V[:,1]/scale)
+ else:
+ pylab.plot(V[:,0], V[:,1]/scale, label=variant,
+ zorder=order, c = colours.next())
+
+ except Exception, ex:
+ # michaelh1 likes to run this script while the test is
+ # still running which can lead to bad data
+ print ex, 'on %s of %s' % (variant, test)
+
+ pylab.legend(loc='lower right', ncol=2, prop={'size': 'small'})
+ pylab.xlabel('Block size (B)')
+ pylab.ylabel(ylabel)
+ pylab.title('%s %s' % (test, field))
+ pylab.grid()
+
+ pylab.savefig('%s-%s.png' % (test, field), dpi=100)
+ pylab.semilogx(basex=2)
+ pylab.savefig('%s-%s-semilog.png' % (test, field), dpi=100)
+ pylab.clf()
+
+def test():
+ import doctest
+ doctest.testmod()
+
+def main():
+ records = parse()
+
+ plot(records, 'rate', 1024**2, 'Rate (MB/s)')
+ plot(records, 'time', 1, 'Total time (s)')
+
+if __name__ == '__main__':
+ main()
diff --git a/contrib/cortex-strings/scripts/trim.sh b/contrib/cortex-strings/scripts/trim.sh
new file mode 100755
index 000000000000..dab1047f34f9
--- /dev/null
+++ b/contrib/cortex-strings/scripts/trim.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+#
+# Trims the whitespace from around any given images
+#
+
+for i in $@; do
+ convert $i -bordercolor white -border 1x1 -trim +repage -alpha off +dither -colors 32 PNG8:next-$i
+ mv next-$i $i
+done
diff --git a/contrib/cortex-strings/src/aarch64/memchr.S b/contrib/cortex-strings/src/aarch64/memchr.S
new file mode 100644
index 000000000000..8da65ec232ac
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/memchr.S
@@ -0,0 +1,172 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014, ARM Limited
+ * All rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the company nor the names of its contributors
+ * may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+#define cntin x2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32-bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte has matched.
+ */
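
As a rough scalar model of the scheme described above (illustrative only; the real routine does the compares with NEON and uses rbit plus clz, and this sketch of a hypothetical helper assumes the GCC/Clang builtin __builtin_ctzll):

/* Illustrative only: scalar model of the per-chunk syndrome scan. */
static const unsigned char *memchr_chunk(const unsigned char *chunk, unsigned char c)
{
    unsigned long long synd = 0;

    for (int i = 0; i < 32; i++)
        if (chunk[i] == c)
            synd |= 1ULL << (2 * i);   /* bit 0 of the byte's 2-bit lane */

    if (synd == 0)
        return 0;                      /* no match in this chunk */

    /* Trailing zeros divided by two is the index of the first match. */
    return chunk + __builtin_ctzll(synd) / 2;
}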
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn memchr
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, .Lzero_length
+ /*
+ * Magic constant 0x40100401 allows us to identify which lane matches
+ * the requested byte.
+ */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq .Lloop
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Clear the soff*2 lower bits */
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ /* The first block can also be the last */
+ b.ls .Lmasklast
+ /* Have we found something already? */
+ cbnz synd, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* If we're out of data we finish regardless of the result */
+ b.ls .Lend
+ /* Use a fast check for the termination condition */
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.2d[0]
+ /* We're not out of data, loop if we haven't found the character */
+ cbz synd, .Lloop
+
+.Lend:
+ /* Termination condition found, let's calculate the syndrome value */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
+ addp vend.16b, vend.16b, vend.16b /* 128->64 */
+ mov synd, vend.2d[0]
+ /* Only do the clear for the last possible block */
+ b.hi .Ltail
+
+.Lmasklast:
+ /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+.Ltail:
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Check that we have found a character */
+ cmp synd, #0
+ /* And count the leading zeros */
+ clz synd, synd
+ /* Compute the potential result */
+ add result, src, synd, lsr #1
+ /* Select result or NULL */
+ csel result, xzr, result, eq
+ ret
+
+.Lzero_length:
+ mov result, #0
+ ret
+
+ .size memchr, . - memchr
diff --git a/contrib/cortex-strings/src/aarch64/memcmp.S b/contrib/cortex-strings/src/aarch64/memcmp.S
new file mode 100644
index 000000000000..abba416b07a7
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/memcmp.S
@@ -0,0 +1,162 @@
+/* memcmp - compare memory
+
+ Copyright (c) 2013, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define endloop x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define pos x11
+#define limit_wd x12
+#define mask x13
+
+def_fn memcmp p2align=6
+ cbz limit, .Lret0
+ eor tmp1, src1, src2
+ tst tmp1, #7
+ b.ne .Lmisaligned8
+ ands tmp1, src1, #7
+ b.ne .Lmutual_align
+ add limit_wd, limit, #7
+ lsr limit_wd, limit_wd, #3
+ /* Start of performance-critical section -- one 64B cache line. */
+.Lloop_aligned:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned:
+ subs limit_wd, limit_wd, #1
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, ne /* Last Dword or differences. */
+ cbz endloop, .Lloop_aligned
+ /* End of performance-critical section -- one 64B cache line. */
+
+ /* Not reached the limit, must have found a diff. */
+ cbnz limit_wd, .Lnot_limit
+
+ /* Limit % 8 == 0 => all bytes significant. */
+ ands limit, limit, #7
+ b.eq .Lnot_limit
+
+ lsl limit, limit, #3 /* Bits -> bytes. */
+ mov mask, #~0
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ orr diff, diff, mask
+.Lnot_limit:
+
+#ifndef __AARCH64EB__
+ rev diff, diff
+ rev data1, data1
+ rev data2, data2
+#endif
+ /* The MS-non-zero bit of DIFF marks either the first bit
+ that is different, or the end of the significant data.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, diff
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+
+.Lmutual_align:
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ add limit, limit, tmp1 /* Adjust the limit for the extra. */
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ add limit_wd, limit, #7
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ lsr limit_wd, limit_wd, #3
+ b .Lstart_realigned
+
+.Lret0:
+ mov result, #0
+ ret
+
+ .p2align 6
+.Lmisaligned8:
+ sub limit, limit, #1
+1:
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq 1b
+ sub result, data1, data2
+ ret
+ .size memcmp, . - memcmp
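
The word-at-a-time trick used above can be summarised with a small C sketch (illustrative only, assuming a little-endian target and the GCC/Clang builtins __builtin_bswap64 and __builtin_clzll): once two 64-bit words differ, the first differing byte decides the result.

#include <stdint.h>

static int memcmp_word(uint64_t data1, uint64_t data2)
{
    uint64_t diff = data1 ^ data2;      /* non-zero if differences found */

    if (diff == 0)
        return 0;

    /* Byte-reverse so the lowest-addressed byte is most significant,
       mirroring the rev instructions on little-endian. */
    data1 = __builtin_bswap64(data1);
    data2 = __builtin_bswap64(data2);
    diff  = __builtin_bswap64(diff);

    /* Shift the first differing bit to the top, then compare the top
       (i.e. first differing) bytes as unsigned values. */
    int pos = __builtin_clzll(diff);
    data1 <<= pos;
    data2 <<= pos;
    return (int)((data1 >> 56) - (data2 >> 56));
}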
diff --git a/contrib/cortex-strings/src/aarch64/memcpy.S b/contrib/cortex-strings/src/aarch64/memcpy.S
new file mode 100644
index 000000000000..cbae37121844
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/memcpy.S
@@ -0,0 +1,225 @@
+/* Copyright (c) 2012, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define tmp1 x9
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..96 bytes which are fully unrolled. Large copies
+ of more than 96 bytes align the destination and use an unrolled loop
+ processing 64 bytes per iteration.
+ Small and medium copies read all data before writing, allowing any
+ kind of overlap, and memmove tailcalls memcpy for these cases as
+ well as non-overlapping copies.
+*/
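
For example, the 8..16 byte branch of the small-copy case reads both ends of the source before writing anything, which is what makes the overlap guarantee above hold; a minimal C sketch of just that branch (illustrative only, hypothetical helper name):

#include <stdint.h>
#include <string.h>

/* Valid for 8 <= n <= 16; the two 8-byte regions may overlap. */
static void copy_8_to_16(unsigned char *dst, const unsigned char *src, size_t n)
{
    uint64_t first, last;

    memcpy(&first, src, 8);          /* ldr A_l, [src]        */
    memcpy(&last, src + n - 8, 8);   /* ldr A_h, [srcend, -8] */
    memcpy(dst, &first, 8);          /* str A_l, [dstin]      */
    memcpy(dst + n - 8, &last, 8);   /* str A_h, [dstend, -8] */
}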
+
+def_fn memcpy p2align=6
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls 2f
+1:
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+2:
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .size memcpy, . - memcpy
diff --git a/contrib/cortex-strings/src/aarch64/memmove.S b/contrib/cortex-strings/src/aarch64/memmove.S
new file mode 100644
index 000000000000..c9fe6c1f5710
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/memmove.S
@@ -0,0 +1,150 @@
+/* Copyright (c) 2013, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Parameters and result. */
+#define dstin x0
+#define src x1
+#define count x2
+#define srcend x3
+#define dstend x4
+#define tmp1 x5
+#define A_l x6
+#define A_h x7
+#define B_l x8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l count
+#define E_h tmp1
+
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+ Larger backwards copies are also handled by memcpy. The only remaining
+ case is forward large copies. The destination is aligned, and an
+ unrolled loop processes 64 bytes per iteration.
+*/
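
A minimal C sketch of that dispatch (illustrative only; it relies on the overlap guarantees of the memcpy above, and a byte loop stands in for the unrolled 64-byte backward loop):

#include <stdint.h>
#include <string.h>

static void *memmove_sketch(void *dstin, const void *src, size_t count)
{
    /* Unsigned difference wraps when dst is below src, so this single
       test covers both backward copies and non-overlapping buffers. */
    uintptr_t diff = (uintptr_t)dstin - (uintptr_t)src;

    if (count <= 96 || diff >= count)
        return memcpy(dstin, src, count);

    /* Forward overlap: copy backwards. */
    unsigned char *d = dstin;
    const unsigned char *s = src;
    for (size_t i = count; i-- > 0; )
        d[i] = s[i];
    return dstin;
}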
+
+def_fn memmove, 6
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.hs memcpy
+
+ cbz tmp1, 3f
+ add dstend, dstin, count
+ add srcend, src, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp E_l, E_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp E_l, E_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+ .size memmove, . - memmove
diff --git a/contrib/cortex-strings/src/aarch64/memset.S b/contrib/cortex-strings/src/aarch64/memset.S
new file mode 100644
index 000000000000..2d6675ad9907
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/memset.S
@@ -0,0 +1,235 @@
+/* Copyright (c) 2012, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
+ */
+
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn memset p2align=6
+
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+ nop
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 256
+ ccmp valw, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ add dst, dst, 16
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(try_zva):
+ mrs tmp1, dczid_el0
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores.
+ */
+L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 /* ZVA size is 128 bytes. */
+ b.ne L(zva_other)
+
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+128 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
+ add tmp1, zva_len, 64 /* Max alignment bytes written. */
+ cmp count, tmp1
+ blo L(no_zva)
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst /* Actual alignment bytes to write. */
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 /* Remaining bytes to write. */
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ b L(tail64)
+
+ .size memset, . - memset
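
Aside: the DC ZVA probing at L(try_zva) can be summarised in C. This is only a sketch under the usual DCZID_EL0 encoding (zva_block_size is a made-up helper, and the register itself is only readable via the MRS instruction used above): bit 4 (DZP) prohibits DC ZVA, and bits 3:0 give log2 of the zeroing granule in 4-byte words.

#include <stdbool.h>
#include <stdint.h>

/* Sketch only: granule in bytes is 4 << BS, so BS == 4 means 64-byte
   blocks (L(zva_64)) and BS == 5 means 128-byte blocks (L(zva_128)). */
static inline unsigned zva_block_size(uint64_t dczid_el0, bool *zva_allowed)
{
    *zva_allowed = (dczid_el0 & (1u << 4)) == 0;  /* tbnz tmp1w, 4, L(no_zva) */
    return 4u << (dczid_el0 & 0xf);               /* and tmp1w, tmp1w, 15 */
}
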
diff --git a/contrib/cortex-strings/src/aarch64/strchr.S b/contrib/cortex-strings/src/aarch64/strchr.S
new file mode 100644
index 000000000000..2f94651584f3
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/strchr.S
@@ -0,0 +1,159 @@
+/*
+ strchr - find a character in a string
+
+ Copyright (c) 2014, ARM Limited
+ All rights Reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the company nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+/* Locals and temporaries. */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn strchr
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq .Lloop
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.2d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* Use a fast check for the termination condition. */
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vend1.16b, vend2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
+ mov tmp1, vend1.2d[0]
+ cbz tmp1, .Lloop
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+
+ mov tmp1, vend1.2d[0]
+.Ltail:
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* And counting the leading zeros. */
+	/* Tmp1 is even if the target character was found first.  Otherwise
+ we've found the end of string and we weren't looking for NUL. */
+ tst tmp1, #1
+ add result, src, tmp1, lsr #1
+ csel result, result, xzr, eq
+ ret
+
+ .size strchr, . - strchr
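
The two-bit-per-byte syndrome described in the core-algorithm comment above can be modelled with scalar C. This is a sketch only (syndrome32 and strchr_model are illustrative names, and the trailing-zero count uses the GCC/Clang builtin); like the assembly it consumes whole 32-byte hunks, which the real code keeps aligned so it never crosses a page.

#include <stddef.h>
#include <stdint.h>

/* Bit 0 of each 2-bit field means "matches c", bit 1 means "is NUL";
   scalar model of the NEON cmeq/and/orr sequence above. */
static uint64_t syndrome32(const unsigned char *hunk, unsigned char c)
{
    uint64_t syn = 0;
    for (unsigned i = 0; i < 32; i++) {
        if (hunk[i] == c)
            syn |= 1ull << (2 * i);
        if (hunk[i] == 0)
            syn |= 2ull << (2 * i);
    }
    return syn;
}

const char *strchr_model(const char *s, int c)
{
    for (;;) {                       /* alignment handling omitted */
        uint64_t syn = syndrome32((const unsigned char *)s, (unsigned char)c);
        if (syn) {
            unsigned tz = __builtin_ctzll(syn);  /* rbit + clz in the asm */
            /* Even bit: character found; odd bit: hit NUL first. */
            return (tz & 1) ? NULL : s + tz / 2;
        }
        s += 32;
    }
}
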
diff --git a/contrib/cortex-strings/src/aarch64/strchrnul.S b/contrib/cortex-strings/src/aarch64/strchrnul.S
new file mode 100644
index 000000000000..928f90d0869a
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/strchrnul.S
@@ -0,0 +1,144 @@
+/*
+ strchrnul - find a character or nul in a string
+
+ Copyright (c) 2014, ARM Limited
+ All rights Reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the company nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask v7
+#define vend1 v16
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character or nul. Since the
+ bits in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination. */
+
+/* Locals and temporaries. */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn strchrnul
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the termination condition. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask.4s, wtmp2
+ ands tmp1, srcin, #31
+ b.eq .Lloop
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
+ orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.2d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* Use a fast check for the termination condition. */
+ orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
+ mov tmp1, vend1.2d[0]
+ cbz tmp1, .Lloop
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+
+ mov tmp1, vend1.2d[0]
+.Ltail:
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* ... and counting the leading zeros. */
+ /* tmp1 is twice the offset into the fragment. */
+ add result, src, tmp1, lsr #1
+ ret
+
+ .size strchrnul, . - strchrnul
diff --git a/contrib/cortex-strings/src/aarch64/strcmp.S b/contrib/cortex-strings/src/aarch64/strcmp.S
new file mode 100644
index 000000000000..e5af383ca899
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/strcmp.S
@@ -0,0 +1,166 @@
+/* Copyright (c) 2012, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define result x0
+
+/* Internal variables. */
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define syndrome x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+
+ /* Start of performance-critical section -- one 64B cache line. */
+def_fn strcmp p2align=6
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ b.ne .Lmisaligned8
+ ands tmp1, src1, #7
+ b.ne .Lmutual_align
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+.Lloop_aligned:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned:
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, .Lloop_aligned
+ /* End of performance-critical section -- one 64B cache line. */
+
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+.Lmutual_align:
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+	   the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b .Lstart_realigned
+
+.Lmisaligned8:
+ /* We can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq .Lmisaligned8
+ sub result, data1, data2
+ ret
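
The aligned fast path above can be summarised in C. This is a model rather than portable code (little-endian, word-aligned inputs, word-sized reads exactly like the assembly, GCC/Clang ctz builtin; strcmp_model and nul_syndrome are illustrative names): the loop stops on the first word that differs or contains a NUL, then the lowest interesting byte decides the result.

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero: (X - 1) & ~(X | 0x7f). */
static inline uint64_t nul_syndrome(uint64_t x)
{
    return (x - REP8_01) & ~(x | REP8_7f);
}

int strcmp_model(const char *s1, const char *s2)
{
    const uint64_t *p1 = (const uint64_t *)s1;
    const uint64_t *p2 = (const uint64_t *)s2;
    uint64_t d1, d2, syndrome;

    do {
        d1 = *p1++;
        d2 = *p2++;
        /* A difference or a terminator both end the word-at-a-time loop. */
        syndrome = (d1 ^ d2) | nul_syndrome(d1);
    } while (syndrome == 0);

    /* Lowest set bit marks the deciding byte (rev/rbit + clz in the asm). */
    unsigned shift = __builtin_ctzll(syndrome) & ~7u;
    unsigned char c1 = (unsigned char)(d1 >> shift);
    unsigned char c2 = (unsigned char)(d2 >> shift);
    return (int)c1 - (int)c2;
}
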
diff --git a/contrib/cortex-strings/src/aarch64/strcpy.S b/contrib/cortex-strings/src/aarch64/strcpy.S
new file mode 100644
index 000000000000..3d0d7f5b8dc8
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/strcpy.S
@@ -0,0 +1,336 @@
+/*
+ strcpy/stpcpy - copy a string returning pointer to start/end.
+
+ Copyright (c) 2013, 2014, 2015 ARM Ltd.
+ All Rights Reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the company nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+ To test the page crossing code path more thoroughly, compile with
+ -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define dstin x0
+#define srcin x1
+
+/* Locals and temporaries. */
+#define src x2
+#define dst x3
+#define data1 x4
+#define data1w w4
+#define data2 x5
+#define data2w w5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define data1a x13
+#define data2a x14
+#define pos x15
+#define len x16
+#define to_align x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY stpcpy
+#else
+#define STRCPY strcpy
+#endif
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* AArch64 systems have a minimum page size of 4k. We can do a quick
+ page size check for crossing this boundary on entry and if we
+ do not, then we can short-circuit much of the entry code. We
+ expect early page-crossing strings to be rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+ predictable, even with random strings.
+
+ We don't bother checking for larger page sizes, the cost of setting
+ up the correct page size is just not worth the extra gain from
+ a small reduction in the cases taking the slow path. Note that
+ we only care about whether the first fetch, which may be
+ misaligned, crosses a page boundary - after that we move to aligned
+ fetches for the remainder of the string. */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+ /* Make everything that isn't Qword aligned look like a page cross. */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+
+def_fn STRCPY p2align=6
+ /* For moderately short strings, the fastest way to do the copy is to
+ calculate the length of the string in the same way as strlen, then
+ essentially do a memcpy of the result. This avoids the need for
+ multiple byte copies and further means that by the time we
+ reach the bulk copy loop we know we can always use DWord
+ accesses. We expect strcpy to rarely be called repeatedly
+ with the same source string, so branch prediction is likely to
+ always be difficult - we mitigate against this by preferring
+ conditional select operations over branches whenever this is
+ feasible. */
+ and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
+ mov zeroones, #REP8_01
+ and to_align, srcin, #15
+ cmp tmp2, #(MIN_PAGE_SIZE - 16)
+ neg tmp1, to_align
+ /* The first fetch will straddle a (possible) page boundary iff
+ srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
+ aligned string will never fail the page align check, so will
+ always take the fast path. */
+ b.gt .Lpage_cross
+
+.Lpage_cross_ok:
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* Because we expect the end to be found within 16 characters
+ (profiling shows this is the most common case), it's worth
+ swapping the bytes now to save having to recalculate the
+ termination syndrome later. We preserve data1 and data2
+ so that we can re-use the values later on. */
+ rev tmp2, data1
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne .Lfp_le8
+ rev tmp4, data2
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bics has_nul1, tmp1, tmp2
+ b.ne .Lfp_le8
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bics has_nul2, tmp3, tmp4
+ b.eq .Lbulk_entry
+
+ /* The string is short (<=16 bytes). We don't know exactly how
+ short though, yet. Work out the exact length so that we can
+ quickly select the optimal copy strategy. */
+.Lfp_gt8:
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ mov tmp2, #56
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ sub pos, tmp2, pos
+#ifdef __AARCH64EB__
+ lsr data2, data2, pos
+#else
+ lsl data2, data2, pos
+#endif
+ str data2, [dst, #1]
+ str data1, [dstin]
+#ifdef BUILD_STPCPY
+ add dstin, dst, #8
+#endif
+ ret
+
+.Lfp_le8:
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add dst, dstin, pos, lsr #3 /* Bits to bytes. */
+ subs tmp2, pos, #24 /* Pos in bits. */
+ b.lt .Lfp_lt4
+#ifdef __AARCH64EB__
+ mov tmp2, #56
+ sub pos, tmp2, pos
+ lsr data2, data1, pos
+ lsr data1, data1, #32
+#else
+ lsr data2, data1, tmp2
+#endif
+ /* 4->7 bytes to copy. */
+ str data2w, [dst, #-3]
+ str data1w, [dstin]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+.Lfp_lt4:
+ cbz pos, .Lfp_lt2
+ /* 2->3 bytes to copy. */
+#ifdef __AARCH64EB__
+ lsr data1, data1, #48
+#endif
+ strh data1w, [dstin]
+ /* Fall-through, one byte (max) to go. */
+.Lfp_lt2:
+ /* Null-terminated string. Last character must be zero! */
+ strb wzr, [dst]
+#ifdef BUILD_STPCPY
+ mov dstin, dst
+#endif
+ ret
+
+ .p2align 6
+	/* Aligning here ensures that the entry code and main loop all lie
+ within one 64-byte cache line. */
+.Lbulk_entry:
+ sub to_align, to_align, #16
+ stp data1, data2, [dstin]
+ sub src, srcin, to_align
+ sub dst, dstin, to_align
+ b .Lentry_no_page_cross
+
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+.Lmain_loop:
+ stp data1, data2, [dst], #16
+.Lentry_no_page_cross:
+ ldp data1, data2, [src], #16
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq .Lmain_loop
+
+ /* Since we know we are copying at least 16 bytes, the fastest way
+ to deal with the tail is to determine the location of the
+ trailing NUL, then (re)copy the 16 bytes leading up to that. */
+ cmp has_nul1, #0
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, ne
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, ne
+#endif
+ rev has_nul1, has_nul1
+ clz pos, has_nul1
+ add tmp1, pos, #72
+ add pos, pos, #8
+ csel pos, pos, tmp1, ne
+ add src, src, pos, lsr #3
+ add dst, dst, pos, lsr #3
+ ldp data1, data2, [src, #-32]
+ stp data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+ sub dstin, dst, #1
+#endif
+ ret
+
+.Lpage_cross:
+ bic src, srcin, #15
+ /* Start by loading two words at [srcin & ~15], then forcing the
+ bytes that precede srcin to 0xff. This means they never look
+ like termination bytes. */
+ ldp data1, data2, [src]
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ tst to_align, #7
+ csetm tmp2, ne
+#ifdef __AARCH64EB__
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ cmp to_align, #8
+ csinv data1, data1, xzr, lt
+ csel data2, data2, data2a, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq .Lpage_cross_ok
+ /* We now need to make data1 and data2 look like they've been
+ loaded directly from srcin. Do a rotate on the 128-bit value. */
+ lsl tmp1, to_align, #3 /* Bytes->bits. */
+ neg tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+ lsl data1a, data1, tmp1
+ lsr tmp4, data2, tmp2
+ lsl data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ rev tmp2, data1
+ rev tmp4, data2
+ sub tmp1, tmp2, zeroones
+ orr tmp2, tmp2, #REP8_7f
+ sub tmp3, tmp4, zeroones
+ orr tmp4, tmp4, #REP8_7f
+#else
+ lsr data1a, data1, tmp1
+ lsl tmp4, data2, tmp2
+ lsr data2, data2, tmp1
+ orr tmp4, tmp4, data1a
+ cmp to_align, #8
+ csel data1, tmp4, data2, lt
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+#endif
+ bic has_nul1, tmp1, tmp2
+ cbnz has_nul1, .Lfp_le8
+ bic has_nul2, tmp3, tmp4
+ b .Lfp_gt8
+
+ .size STRCPY, . - STRCPY
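
The page-cross entry test explained in the comments above is simple enough to state in C. A minimal sketch, assuming the 4 KiB minimum page size discussed there (first_fetch_crosses_page is an illustrative name): the unaligned 16-byte first fetch can only straddle a page if the source sits in the last 15 bytes of a page.

#include <stdbool.h>
#include <stdint.h>

#define MIN_PAGE_SIZE 4096u

/* Matches: and tmp2, srcin, #(MIN_PAGE_SIZE - 1);
            cmp tmp2, #(MIN_PAGE_SIZE - 16); b.gt .Lpage_cross */
static inline bool first_fetch_crosses_page(const void *src)
{
    return ((uintptr_t)src & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}
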
diff --git a/contrib/cortex-strings/src/aarch64/strlen.S b/contrib/cortex-strings/src/aarch64/strlen.S
new file mode 100644
index 000000000000..585064002309
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/strlen.S
@@ -0,0 +1,233 @@
+/* Copyright (c) 2013-2015, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+
+/* Locals and temporaries. */
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define L(l) .L ## l
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+	   If the string is longer than 16 bytes, we align src so we don't need
+ further page cross checks, and process 32 bytes per iteration
+ using the fast NUL check. If we encounter non-ASCII characters,
+	   fall back to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
+
+def_fn strlen p2align=6
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+ Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_nul1/2 will be correct. */
+ rev data1, data1
+ rev data2, data2
+#endif
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
+
+ /* Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+.Lpage_cross_entry:
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, cc
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, cc
+#endif
+ sub len, src, srcin
+ rev has_nul1, has_nul1
+ add tmp2, len, 8
+ clz tmp1, has_nul1
+ csel len, len, tmp2, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
+
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
+
+ .size strlen, . - strlen
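
The fast and exact NUL tests that the comments above distinguish look like this in C. A little-endian sketch under the same assumptions as the assembly (aligned word reads past the terminator, GCC/Clang ctz builtin; strlen_model is an illustrative name); the fast test is one sub+and per word but also fires on bytes 0x80..0xff, so the loop re-checks with the exact test, which corresponds to the nonascii_loop fallback.

#include <stddef.h>
#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

static inline uint64_t nul_fast(uint64_t x)  { return (x - REP8_01) & REP8_80; }
static inline uint64_t nul_exact(uint64_t x) { return (x - REP8_01) & ~(x | REP8_7f); }

size_t strlen_model(const uint64_t *p)   /* p: aligned start of the string */
{
    size_t w = 0;
    while (!nul_fast(p[w]))              /* cheap filter (main loop) */
        w++;
    while (!nul_exact(p[w]))             /* fast hit was only a 0x80..0xff byte */
        w++;                             /* (nonascii_loop fallback) */
    /* Lowest set bit of the exact syndrome marks the first NUL byte. */
    return w * 8 + __builtin_ctzll(nul_exact(p[w])) / 8;
}
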
diff --git a/contrib/cortex-strings/src/aarch64/strncmp.S b/contrib/cortex-strings/src/aarch64/strncmp.S
new file mode 100644
index 000000000000..21367877fa4d
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/strncmp.S
@@ -0,0 +1,222 @@
+/* Copyright (c) 2013, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define syndrome x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define zeroones x11
+#define pos x12
+#define limit_wd x13
+#define mask x14
+#define endloop x15
+
+ .text
+ .p2align 6
+ .rep 7
+ nop /* Pad so that the loop below fits a cache line. */
+ .endr
+def_fn strncmp
+ cbz limit, .Lret0
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ b.ne .Lmisaligned8
+ ands tmp1, src1, #7
+ b.ne .Lmutual_align
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* Start of performance-critical section -- one 64B cache line. */
+.Lloop_aligned:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned:
+ subs limit_wd, limit_wd, #1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ ccmp endloop, #0, #0, eq
+ b.eq .Lloop_aligned
+ /* End of performance-critical section -- one 64B cache line. */
+
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit_wd, #63, .Lnot_limit
+
+ /* Limit % 8 == 0 => all bytes significant. */
+ ands limit, limit, #7
+ b.eq .Lnot_limit
+
+	lsl	limit, limit, #3	/* Bytes -> bits.  */
+ mov mask, #~0
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+.Lnot_limit:
+ orr syndrome, diff, has_nul
+
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ ret
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ ret
+#endif
+
+.Lmutual_align:
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point.
+ We also need to adjust the limit calculations, but without
+ overflowing if the limit is near ULONG_MAX. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ ldr data1, [src1], #8
+ neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
+#endif
+ and tmp3, limit_wd, #7
+ lsr limit_wd, limit_wd, #3
+ /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
+ add limit, limit, tmp1
+ add tmp3, tmp3, tmp1
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ add limit_wd, limit_wd, tmp3, lsr #3
+ b .Lstart_realigned
+
+.Lret0:
+ mov result, #0
+ ret
+
+ .p2align 6
+.Lmisaligned8:
+ sub limit, limit, #1
+1:
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq 1b
+ sub result, data1, data2
+ ret
+ .size strncmp, . - strncmp
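
The end-of-limit handling above (mask off bytes past the limit, force the NUL syndrome there) can be sketched in C for the little-endian case. Illustrative only; mask_past_limit is a made-up helper and limit_in_final_word stands for the low three bits of the limit (1..7), the zero case having been handled earlier.

#include <stdint.h>

/* Clearing the out-of-limit bytes in both words and setting the syndrome
   there makes the compare behave as if both strings ended at the limit. */
static void mask_past_limit(uint64_t *d1, uint64_t *d2, uint64_t *has_nul,
                            unsigned limit_in_final_word /* 1..7 */)
{
    uint64_t mask = ~0ull << (limit_in_final_word * 8);  /* lsl mask, ... */
    *d1 &= ~mask;                                        /* bic data1, data1, mask */
    *d2 &= ~mask;                                        /* bic data2, data2, mask */
    *has_nul |= mask;                  /* guarantees termination at the limit */
}
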
diff --git a/contrib/cortex-strings/src/aarch64/strnlen.S b/contrib/cortex-strings/src/aarch64/strnlen.S
new file mode 100644
index 000000000000..c0e609871839
--- /dev/null
+++ b/contrib/cortex-strings/src/aarch64/strnlen.S
@@ -0,0 +1,181 @@
+/* strnlen - calculate the length of a string with limit.
+
+ Copyright (c) 2013, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the Linaro nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+#define limit x1
+
+/* Locals and temporaries. */
+#define src x2
+#define data1 x3
+#define data2 x4
+#define data2a x5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define pos x13
+#define limit_wd x14
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ .text
+ .p2align 6
+.Lstart:
+	/* Pre-pad to ensure the critical loop begins on an icache line.  */
+ .rep 7
+ nop
+ .endr
+ /* Put this code here to avoid wasting more space with pre-padding. */
+.Lhit_limit:
+ mov len, limit
+ ret
+
+def_fn strnlen
+ cbz limit, .Lhit_limit
+ mov zeroones, #REP8_01
+ bic src, srcin, #15
+ ands tmp1, srcin, #15
+ b.ne .Lmisaligned
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+
+	/* Start of critical section -- keep to one 64Byte cache line.  */
+.Lloop:
+ ldp data1, data2, [src], #16
+.Lrealigned:
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ subs limit_wd, limit_wd, #1
+ orr tmp1, has_nul1, has_nul2
+ ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
+ b.eq .Lloop
+ /* End of critical section -- keep to one 64Byte cache line. */
+
+ orr tmp1, has_nul1, has_nul2
+ cbz tmp1, .Lhit_limit /* No null in final Qword. */
+
+ /* We know there's a null in the final Qword. The easiest thing
+ to do now is work out the length of the string and return
+ MIN (len, limit). */
+
+ sub len, src, srcin
+ cbz has_nul1, .Lnul_in_data2
+#ifdef __AARCH64EB__
+ mov data2, data1
+#endif
+ sub len, len, #8
+ mov has_nul2, has_nul1
+.Lnul_in_data2:
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ rev data2, data2
+ sub tmp1, data2, zeroones
+ orr tmp2, data2, #REP8_7f
+ bic has_nul2, tmp1, tmp2
+#endif
+ sub len, len, #8
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ add len, len, pos, lsr #3 /* Bits to bytes. */
+ cmp len, limit
+ csel len, len, limit, ls /* Return the lower value. */
+ ret
+
+.Lmisaligned:
+ /* Deal with a partial first word.
+ We're doing two things in parallel here;
+ 1) Calculate the number of words (but avoiding overflow if
+ limit is near ULONG_MAX) - to do this we need to work out
+ limit + tmp1 - 1 as a 65-bit value before shifting it;
+ 2) Load and mask the initial data words - we force the bytes
+ before the ones we are interested in to 0xff - this ensures
+ early bytes will not hit any zero detection. */
+ sub limit_wd, limit, #1
+ neg tmp4, tmp1
+ cmp tmp1, #8
+
+ and tmp3, limit_wd, #15
+ lsr limit_wd, limit_wd, #4
+ mov tmp2, #~0
+
+ ldp data1, data2, [src], #16
+ lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
+ add tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#endif
+ add limit_wd, limit_wd, tmp3, lsr #4
+
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+
+ csinv data1, data1, xzr, le
+ csel data2, data2, data2a, le
+ b .Lrealigned
+ .size strnlen, . - .Lstart /* Include pre-padding in size. */
diff --git a/contrib/cortex-strings/src/arm/memchr.S b/contrib/cortex-strings/src/arm/memchr.S
new file mode 100644
index 000000000000..92a2d9f0967d
--- /dev/null
+++ b/contrib/cortex-strings/src/arm/memchr.S
@@ -0,0 +1,155 @@
+/* Copyright (c) 2010-2011, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This memchr routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors.  It has a fast path for short sizes, and has
+ an optimised path for large data sets; the worst case is finding the
+ match early in a large data set.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@ Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@ Removed unneeded cbz from align loop
+
+ .syntax unified
+ .arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
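+@ For reference, the masks this expands to are:
+@   little-endian: 0x00000001, 0x00000100, 0x00010000, 0x01000000 for bytes 0-3
+@   big-endian:    0x80000000, 0x00800000, 0x00008000, 0x00000080 for bytes 0-3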
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global memchr
+ .type memchr,%function
+memchr:
+ @ r0 = start of memory to scan
+ @ r1 = character to look for
+ @ r2 = length
+ @ returns r0 = pointer to character or NULL if not found
+ and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
+
+ cmp r2,#16 @ If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 @ If it's already aligned skip the next bit
+ beq 10f
+
+ @ Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r2, r2, #1
+ cmp r3, r1
+ beq 50f @ If it matches exit found
+ tst r0, #7
+ bne 5b @ If not aligned yet then do next byte
+
+10:
+	@ At this point we are aligned and know we have at least 8 bytes to work with
+ push {r4,r5,r6,r7}
+ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
+ orr r1, r1, r1, lsl #16
+ bic r4, r2, #7 @ Number of double words to work with
+ mvns r7, #0 @ all F's
+ movs r3, #0
+
+15:
+ ldmia r0!,{r5,r6}
+ subs r4, r4, #8
+ eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6,r6, r1
+ uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r5, r3, r7	@ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r6, r5, r7	@ chained... bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
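+	@ How the uadd8/sel pair above works: adding 0xff to each byte sets
+	@ the GE flag for a lane only when that byte was non-zero, and sel
+	@ then picks 0x00 for those lanes and 0xff for the zero (matching)
+	@ lanes.  E.g. (illustrative values) r5 = 0x00336600 after the eor
+	@ becomes 0xff0000ff, flagging matches in bytes 0 and 3.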
+
+ pop {r4,r5,r6,r7}
+ and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
+ and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
+
+20:
+ cbz r2, 40f @ 0 length or hit the end already then not found
+
+21: @ Post aligned section, or just a short call
+ ldrb r3,[r0],#1
+ subs r2,r2,#1
+ eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b @ on r2 flags
+
+40:
+ movs r0,#0 @ not found
+ bx lr
+
+50:
+ subs r0,r0,#1 @ found
+ bx lr
+
+60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+ @ r0 points to the start of the double word after the one that was tested
+ @ r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 @ the end is in the 2nd word
+ subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
+ subne r0,r0,#7 @ or 2nd byte of 1st word
+
+ @ r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, # CHARTSTMASK(0) @ 1st character
+ bne 61f
+ adds r0,r0,#1
+ tst r5, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r0,r0,#1
+ tsteq r5, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r0,r0,#1
+
+61:
+ pop {r4,r5,r6,r7}
+ subs r0,r0,#1
+ bx lr
diff --git a/contrib/cortex-strings/src/arm/memcpy.S b/contrib/cortex-strings/src/arm/memcpy.S
new file mode 100644
index 000000000000..dd405ec13925
--- /dev/null
+++ b/contrib/cortex-strings/src/arm/memcpy.S
@@ -0,0 +1,617 @@
+/* Copyright (c) 2013, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
+
+ Assumptions:
+
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+
+ */
+
+ .syntax unified
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef __ARM_NEON__
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
+
+/* Old versions of GAS incorrectly implement the NEON align semantics. */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
+#else
+#define ALIGN(addr, align) addr:align
+#endif
+
+#define PC_OFFSET 8 /* PC pipeline compensation. */
+#define INSN_SIZE 4
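+/* Note on the computed branches below: in ARM state a read of PC yields
+   the address of the current instruction plus PC_OFFSET, so
+   "add pc, pc, tmp1" with tmp1 = 56 - PC_OFFSET + INSN_SIZE - (count & 0x38)
+   lands on the first copy pair when 56 tail bytes remain and just past the
+   table when none do (illustrative; see the Ltail63 sequences below).  */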
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+#define D_l r8
+#define D_h r9
+#endif
+
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
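+	/* Each cpy_line_vfp invocation moves one 64-byte line using d0-d2
+	   plus \vreg; \vreg is refilled from prefetch_lines * 64 - 32 bytes
+	   ahead, so that load doubles as a software prefetch whose data is
+	   consumed on a later iteration rather than discarded.  */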
+
+ .macro cpy_tail_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+#endif
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn memcpy p2align=6
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bge .Lcpy_not_short
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+ and tmp1, count, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+
+ tst count, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
+ /* Jump directly into the sequence below at the correct offset. */
+ add pc, pc, tmp1, lsl #1
+
+ ldr tmp1, [src, #-60] /* 15 words to go. */
+ str tmp1, [dst, #-60]
+
+ ldr tmp1, [src, #-56] /* 14 words to go. */
+ str tmp1, [dst, #-56]
+ ldr tmp1, [src, #-52]
+ str tmp1, [dst, #-52]
+
+ ldr tmp1, [src, #-48] /* 12 words to go. */
+ str tmp1, [dst, #-48]
+ ldr tmp1, [src, #-44]
+ str tmp1, [dst, #-44]
+
+ ldr tmp1, [src, #-40] /* 10 words to go. */
+ str tmp1, [dst, #-40]
+ ldr tmp1, [src, #-36]
+ str tmp1, [dst, #-36]
+
+ ldr tmp1, [src, #-32] /* 8 words to go. */
+ str tmp1, [dst, #-32]
+ ldr tmp1, [src, #-28]
+ str tmp1, [dst, #-28]
+
+ ldr tmp1, [src, #-24] /* 6 words to go. */
+ str tmp1, [dst, #-24]
+ ldr tmp1, [src, #-20]
+ str tmp1, [dst, #-20]
+
+ ldr tmp1, [src, #-16] /* 4 words to go. */
+ str tmp1, [dst, #-16]
+ ldr tmp1, [src, #-12]
+ str tmp1, [dst, #-12]
+
+ ldr tmp1, [src, #-8] /* 2 words to go. */
+ str tmp1, [dst, #-8]
+ ldr tmp1, [src, #-4]
+ str tmp1, [dst, #-4]
+#endif
+
+ lsls count, count, #31
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
+ bx lr
+
+.Lcpy_not_short:
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ and tmp2, src, #7
+ and tmp1, dst, #7
+ cmp tmp1, tmp2
+ bne .Lcpy_notaligned
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
+#endif
+
+ /* SRC and DST have the same mutual 64-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring SRC and DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
+
+1:
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blt .Ltail63aligned
+
+ cmp tmp2, #512
+ bge .Lcpy_body_long
+
+.Lcpy_body_medium: /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ vldr d0, [src, #0]
+ subs tmp2, tmp2, #64
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
+ add src, src, #64
+ vstr d1, [dst, #56]
+ add dst, dst, #64
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+
+ vldr d0, [src, #-56] /* 14 words to go. */
+ vstr d0, [dst, #-56]
+ vldr d0, [src, #-48] /* 12 words to go. */
+ vstr d0, [dst, #-48]
+ vldr d0, [src, #-40] /* 10 words to go. */
+ vstr d0, [dst, #-40]
+ vldr d0, [src, #-32] /* 8 words to go. */
+ vstr d0, [dst, #-32]
+ vldr d0, [src, #-24] /* 6 words to go. */
+ vstr d0, [dst, #-24]
+ vldr d0, [src, #-16] /* 4 words to go. */
+ vstr d0, [dst, #-16]
+ vldr d0, [src, #-8] /* 2 words to go. */
+ vstr d0, [dst, #-8]
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
+ subs tmp2, tmp2, #64
+ bge 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ bx lr
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 64-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
+ strd A_l, A_h, [dst, #-56]
+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
+ strd A_l, A_h, [dst, #-48]
+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
+ strd A_l, A_h, [dst, #-40]
+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
+ strd A_l, A_h, [dst, #-32]
+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
+ strd A_l, A_h, [dst, #-24]
+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
+ strd A_l, A_h, [dst, #-16]
+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
+ strd A_l, A_h, [dst, #-8]
+
+#endif
+ tst tmp2, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
+
+.Ldone:
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+
+.Lcpy_body_long: /* Count in tmp2. */
+
+ /* Long copy. We know that there's at least (prefetch_lines * 64)
+ bytes to go. */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blt 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bge 1b
+
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
+ add src, src, #96
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b .Lcpy_body_medium
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ pld [src, #8]
+ pld [src, #72]
+ subs tmp2, tmp2, #64
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #40
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne .Ltail63aligned
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+#endif
+
+.Lcpy_notaligned:
+ pld [src]
+ pld [src, #64]
+ /* There's at least 64 bytes to copy, but there is no mutual
+ alignment. */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ pld [src, #(2 * 64)]
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
+1:
+ pld [src, #(3 * 64)]
+ subs count, count, #64
+ ldrmi tmp2, [sp], #FRAME_SIZE
+ bmi .Ltail63unaligned
+ pld [src, #(4 * 64)]
+
+#ifdef USE_NEON
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bmi 2f
+1:
+ pld [src, #(4 * 64)]
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vld1.8 {d0-d3}, [src]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bpl 1b
+2:
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #36
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ bne .Ltail63unaligned
+ bx lr
+
+ .size memcpy, . - memcpy
diff --git a/contrib/cortex-strings/src/arm/memset.S b/contrib/cortex-strings/src/arm/memset.S
new file mode 100644
index 000000000000..c0ad588ab11e
--- /dev/null
+++ b/contrib/cortex-strings/src/arm/memset.S
@@ -0,0 +1,122 @@
+/* Copyright (c) 2010-2011, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This memset routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors.
+
+ */
+
+ .syntax unified
+ .arch armv7-a
+
+@ 2011-08-30 david.gilbert@linaro.org
+@ Extracted from local git 2f11b436
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global memset
+ .type memset,%function
+memset:
+ @ r0 = address
+ @ r1 = character
+ @ r2 = count
+ @ returns original address in r0
+
+ mov r3, r0 @ Leave r0 alone
+ cbz r2, 10f @ Exit if 0 length
+
+ tst r0, #7
+ beq 2f @ Already aligned
+
+ @ Ok, so we're misaligned here
+1:
+ strb r1, [r3], #1
+ subs r2,r2,#1
+ tst r3, #7
+ cbz r2, 10f @ Exit if we hit the end
+ bne 1b @ go round again if still misaligned
+
+2:
+ @ OK, so we're aligned
+ push {r4,r5,r6,r7}
+ bics r4, r2, #15 @ if less than 16 bytes then need to finish it off
+ beq 5f
+
+3:
+	@ POSIX says that ch is cast to an unsigned char.  A uxtb is two
+	@ bytes and takes two cycles, where an AND is four bytes but one
+ @ cycle.
+ and r1, #0xFF
+ orr r1, r1, r1, lsl#8 @ Same character into all bytes
+ orr r1, r1, r1, lsl#16
+ mov r5,r1
+ mov r6,r1
+ mov r7,r1
+
+4:
+ subs r4,r4,#16
+ stmia r3!,{r1,r5,r6,r7}
+ bne 4b
+ and r2,r2,#15
+
+	@ At this point we're still aligned and we have up to align-1 bytes left to write;
+	@ we can avoid some of the byte-at-a-time work now by testing for some big chunks
+ tst r2,#8
+ itt ne
+ subne r2,r2,#8
+ stmiane r3!,{r1,r5}
+
+5:
+ pop {r4,r5,r6,r7}
+ cbz r2, 10f
+
+ @ Got to do any last < alignment bytes
+6:
+ subs r2,r2,#1
+ strb r1,[r3],#1
+ bne 6b
+
+10:
+ bx lr @ goodbye
diff --git a/contrib/cortex-strings/src/arm/strchr.S b/contrib/cortex-strings/src/arm/strchr.S
new file mode 100644
index 000000000000..8e06dd403afd
--- /dev/null
+++ b/contrib/cortex-strings/src/arm/strchr.S
@@ -0,0 +1,80 @@
+/* Copyright (c) 2010-2011, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ A very simple strchr routine; from benchmarks on A9 it's a bit faster than
+ the current version in eglibc (2.12.1-0ubuntu14 package).
+ I don't think doing a word-at-a-time version is worth it, since a lot
+ of strchr cases are very short anyway.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+
+ .syntax unified
+ .arch armv7-a
+
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global strchr
+ .type strchr,%function
+strchr:
+ @ r0 = start of string
+ @ r1 = character to match
+ @ returns NULL for no match, or a pointer to the match
+ and r1,r1, #255
+
+1:
+ ldrb r2,[r0],#1
+ cmp r2,r1
+ cbz r2,10f
+ bne 1b
+
+ @ We're here if it matched
+5:
+ subs r0,r0,#1
+ bx lr
+
+10:
+ @ We're here if we ran off the end
+	cmp	r1, #0	@ Corner case - you're allowed to search for the NUL and get a pointer to it
+	beq	5b	@ A bit messy; if this case is common we should branch to a special loop at the start
+ mov r0,#0
+ bx lr
diff --git a/contrib/cortex-strings/src/arm/strcmp.S b/contrib/cortex-strings/src/arm/strcmp.S
new file mode 100644
index 000000000000..5fad272e48a6
--- /dev/null
+++ b/contrib/cortex-strings/src/arm/strcmp.S
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2012-2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Implementation of strcmp for ARMv7 when DSP instructions are
+ available. Use ldrd to support wider loads, provided the data
+ is sufficiently aligned. Use saturating arithmetic to optimize
+ the compares. */
+
+/* Build Options:
+ STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+ byte in the string. If comparing completely random strings
+ the pre-check will save time, since there is a very high
+ probability of a mismatch in the first character: we save
+ significant overhead if this is the common case. However,
+ if strings are likely to be identical (eg because we're
+ verifying a hit in a hash table), then this check is largely
+ redundant. */
+
+#define STRCMP_NO_PRECHECK 0
+
+ /* This version uses Thumb-2 code. */
+ .thumb
+ .syntax unified
+
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#define BYTE0_OFFSET 24
+#define BYTE1_OFFSET 16
+#define BYTE2_OFFSET 8
+#define BYTE3_OFFSET 0
+#else /* not __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
+#define BYTE0_OFFSET 0
+#define BYTE1_OFFSET 8
+#define BYTE2_OFFSET 16
+#define BYTE3_OFFSET 24
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif /* not __ARM_BIG_ENDIAN */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+/* Parameters and result. */
+#define src1 r0
+#define src2 r1
+#define result r0 /* Overlaps src1. */
+
+/* Internal variables. */
+#define tmp1 r4
+#define tmp2 r5
+#define const_m1 r12
+
+/* Additional internal variables for 64-bit aligned data. */
+#define data1a r2
+#define data1b r3
+#define data2a r6
+#define data2b r7
+#define syndrome_a tmp1
+#define syndrome_b tmp2
+
+/* Additional internal variables for 32-bit aligned data. */
+#define data1 r2
+#define data2 r3
+#define syndrome tmp2
+
+
+ /* Macro to compute and return the result value for word-aligned
+ cases. */
+ .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+ /* If data1 contains a zero byte, then syndrome will contain a 1 in
+ bit 7 of that byte. Otherwise, the highest set bit in the
+ syndrome will highlight the first different bit. It is therefore
+ sufficient to extract the eight bits starting with the syndrome
+ bit. */
+ clz tmp1, \synd
+ lsl r1, \d2, tmp1
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsl \d1, \d1, tmp1
+ .cfi_remember_state
+ lsr result, \d1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1, lsr #24
+ bx lr
+#else
+	/* To use the big-endian trick we'd have to reverse all three words;
+	   that's slower than this approach.  */
+ rev \synd, \synd
+ clz tmp1, \synd
+ bic tmp1, tmp1, #7
+ lsr r1, \d2, tmp1
+ .cfi_remember_state
+ .if \restore_r6
+ ldrd r6, r7, [sp, #8]
+ .endif
+ .cfi_restore 6
+ .cfi_restore 7
+ lsr \d1, \d1, tmp1
+ and result, \d1, #255
+ and r1, r1, #255
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ sub result, result, r1
+
+ bx lr
+#endif
+ .endm
+
+ .text
+ .p2align 5
+.Lstrcmp_start_addr:
+#if STRCMP_NO_PRECHECK == 0
+.Lfastpath_exit:
+ sub r0, r2, r3
+ bx lr
+ nop
+#endif
+def_fn strcmp
+#if STRCMP_NO_PRECHECK == 0
+ ldrb r2, [src1]
+ ldrb r3, [src2]
+ cmp r2, #1
+ it cs
+ cmpcs r2, r3
+ bne .Lfastpath_exit
+#endif
+ .cfi_startproc
+ strd r4, r5, [sp, #-16]!
+ .cfi_def_cfa_offset 16
+ .cfi_offset 4, -16
+ .cfi_offset 5, -12
+ orr tmp1, src1, src2
+ strd r6, r7, [sp, #8]
+ .cfi_offset 6, -8
+ .cfi_offset 7, -4
+ mvn const_m1, #0
+ lsl r2, tmp1, #29
+ cbz r2, .Lloop_aligned8
+
+.Lnot_aligned:
+ eor tmp1, src1, src2
+ tst tmp1, #7
+ bne .Lmisaligned8
+
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ and tmp1, src1, #7
+ bic src1, src1, #7
+ and tmp2, tmp1, #3
+ bic src2, src2, #7
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ ldrd data1a, data1b, [src1], #16
+ tst tmp1, #4
+ ldrd data2a, data2b, [src2], #16
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp2
+ orn data1a, data1a, tmp1
+ orn data2a, data2a, tmp1
+ beq .Lstart_realigned8
+ orn data1b, data1b, tmp1
+ mov data1a, const_m1
+ orn data2b, data2b, tmp1
+ mov data2a, const_m1
+ b .Lstart_realigned8
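+	/* The orn with the shifted all-ones word forces every byte before
+	   the true start of the strings to 0xff in both data words, so
+	   those bytes can neither look like a NUL nor compare unequal.  */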
+
+ /* Unwind the inner loop by a factor of 2, giving 16 bytes per
+ pass. */
+ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
+ .p2align 2 /* Always word aligned. */
+.Lloop_aligned8:
+ ldrd data1a, data1b, [src1], #16
+ ldrd data2a, data2b, [src2], #16
+.Lstart_realigned8:
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ cbnz syndrome_a, .Ldiff_in_a
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ cbnz syndrome_b, .Ldiff_in_b
+
+ ldrd data1a, data1b, [src1, #-8]
+ ldrd data2a, data2b, [src2, #-8]
+ uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
+ eor syndrome_a, data1a, data2a
+ sel syndrome_a, syndrome_a, const_m1
+ uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
+ eor syndrome_b, data1b, data2b
+ sel syndrome_b, syndrome_b, const_m1
+ /* Can't use CBZ for backwards branch. */
+ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+ beq .Lloop_aligned8
+
+.Ldiff_found:
+ cbnz syndrome_a, .Ldiff_in_a
+
+.Ldiff_in_b:
+ strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+.Ldiff_in_a:
+ .cfi_restore_state
+ strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+ .cfi_restore_state
+.Lmisaligned8:
+ tst tmp1, #3
+ bne .Lmisaligned4
+ ands tmp1, src1, #3
+ bne .Lmutual_align4
+
+ /* Unrolled by a factor of 2, to reduce the number of post-increment
+ operations. */
+.Lloop_aligned4:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+.Lstart_realigned4:
+ uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cbnz syndrome, .Laligned4_done
+ ldr data1, [src1, #-4]
+ ldr data2, [src2, #-4]
+ uadd8 syndrome, data1, const_m1
+ eor syndrome, data1, data2
+ sel syndrome, syndrome, const_m1
+ cmp syndrome, #0
+ beq .Lloop_aligned4
+
+.Laligned4_done:
+ strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+.Lmutual_align4:
+ .cfi_restore_state
+ /* Deal with mutual misalignment by aligning downwards and then
+ masking off the unwanted loaded data to prevent a difference. */
+ lsl tmp1, tmp1, #3 /* Bytes -> bits. */
+ bic src1, src1, #3
+ ldr data1, [src1], #8
+ bic src2, src2, #3
+ ldr data2, [src2], #8
+
+ /* In thumb code we can't use MVN with a register shift, but
+ we do have ORN. */
+ S2HI tmp1, const_m1, tmp1
+ orn data1, data1, tmp1
+ orn data2, data2, tmp1
+ b .Lstart_realigned4
+
+.Lmisaligned4:
+ ands tmp1, src1, #3
+ beq .Lsrc1_aligned
+ sub src2, src2, tmp1
+ bic src1, src1, #3
+ lsls tmp1, tmp1, #31
+ ldr data1, [src1], #4
+ beq .Laligned_m2
+ bcs .Laligned_m1
+
+#if STRCMP_NO_PRECHECK == 1
+ ldrb data2, [src2, #1]
+ uxtb tmp1, data1, ror #BYTE1_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m2:
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m1:
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ add src2, src2, #4
+ cbnz data2, .Lsrc1_aligned
+#else /* STRCMP_NO_PRECHECK */
+ /* If we've done the pre-check, then we don't need to check the
+ first byte again here. */
+ ldrb data2, [src2, #2]
+ uxtb tmp1, data1, ror #BYTE2_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbz data2, .Lmisaligned_exit
+
+.Laligned_m2:
+ ldrb data2, [src2, #3]
+ uxtb tmp1, data1, ror #BYTE3_OFFSET
+ subs tmp1, tmp1, data2
+ bne .Lmisaligned_exit
+ cbnz data2, .Laligned_m1
+#endif
+
+.Lmisaligned_exit:
+ .cfi_remember_state
+ mov result, tmp1
+ ldr r4, [sp], #16
+ .cfi_restore 4
+ bx lr
+
+#if STRCMP_NO_PRECHECK == 0
+.Laligned_m1:
+ add src2, src2, #4
+#endif
+.Lsrc1_aligned:
+ .cfi_restore_state
+ /* src1 is word aligned, but src2 has no common alignment
+ with it. */
+ ldr data1, [src1], #4
+ lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
+
+ bic src2, src2, #3
+ ldr data2, [src2], #4
+ bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
+ bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
+
+ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
+.Loverlap3:
+ bic tmp1, data1, #MSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #8
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #24
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap3
+4:
+ S2LO data2, data2, #8
+ b .Lstrcmp_tail
+
+5:
+ bics syndrome, syndrome, #MSB
+ bne .Lstrcmp_done_equal
+
+ /* We can only get here if the MSB of data1 contains 0, so
+ fast-path the exit. */
+ ldrb result, [src2]
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 Not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ neg result, result
+ bx lr
+
+6:
+ .cfi_restore_state
+ S2LO data1, data1, #24
+ and data2, data2, #LSB
+ b .Lstrcmp_tail
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+.Loverlap2:
+ and tmp1, data1, const_m1, S2LO #16
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #16
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #16
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap2
+4:
+ S2LO data2, data2, #16
+ b .Lstrcmp_tail
+5:
+ ands syndrome, syndrome, const_m1, S2LO #16
+ bne .Lstrcmp_done_equal
+
+ ldrh data2, [src2]
+ S2LO data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+ lsl data2, data2, #16
+#endif
+ b .Lstrcmp_tail
+
+6:
+ S2LO data1, data1, #16
+ and data2, data2, const_m1, S2LO #16
+ b .Lstrcmp_tail
+
+ .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
+.Loverlap1:
+ and tmp1, data1, #LSB
+ uadd8 syndrome, data1, const_m1
+ eors syndrome, tmp1, data2, S2LO #24
+ sel syndrome, syndrome, const_m1
+ bne 4f
+ cbnz syndrome, 5f
+ ldr data2, [src2], #4
+ eor tmp1, tmp1, data1
+ cmp tmp1, data2, S2HI #8
+ bne 6f
+ ldr data1, [src1], #4
+ b .Loverlap1
+4:
+ S2LO data2, data2, #24
+ b .Lstrcmp_tail
+5:
+ tst syndrome, #LSB
+ bne .Lstrcmp_done_equal
+ ldr data2, [src2]
+6:
+ S2LO data1, data1, #8
+ bic data2, data2, #MSB
+ b .Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+ mov result, #0
+ .cfi_remember_state
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ bx lr
+
+.Lstrcmp_tail:
+ .cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+ rev data1, data1
+ rev data2, data2
+ /* Now everything looks big-endian... */
+#endif
+ uadd8 tmp1, data1, const_m1
+ eor tmp1, data1, data2
+ sel syndrome, tmp1, const_m1
+ clz tmp1, syndrome
+ lsl data1, data1, tmp1
+ lsl data2, data2, tmp1
+ lsr result, data1, #24
+ ldrd r4, r5, [sp], #16
+ .cfi_restore 4
+ .cfi_restore 5
+ /* R6/7 not used in this sequence. */
+ .cfi_restore 6
+ .cfi_restore 7
+ sub result, result, data2, lsr #24
+ bx lr
+ .cfi_endproc
+ .size strcmp, . - .Lstrcmp_start_addr
diff --git a/contrib/cortex-strings/src/thumb-2/strcpy.c b/contrib/cortex-strings/src/thumb-2/strcpy.c
new file mode 100644
index 000000000000..78195001a14c
--- /dev/null
+++ b/contrib/cortex-strings/src/thumb-2/strcpy.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2008 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* For GLIBC:
+#include <string.h>
+#include <memcopy.h>
+
+#undef strcmp
+*/
+
+#ifdef __thumb2__
+#define magic1(REG) "#0x01010101"
+#define magic2(REG) "#0x80808080"
+#else
+#define magic1(REG) #REG
+#define magic2(REG) #REG ", lsl #7"
+#endif
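+/* The magic constants implement the usual word-at-a-time NUL test: for a
+   32-bit word x, (x - 0x01010101) & ~x & 0x80808080 is non-zero exactly
+   when some byte of x is zero.  A sketch in C (illustrative only, not
+   used by the routine below):
+
+       int has_nul_byte (unsigned int x)
+       {
+         return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
+       }
+*/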
+
+char* __attribute__((naked))
+strcpy (char* dst, const char* src)
+{
+ asm (
+#if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
+ (defined (__thumb__) && !defined (__thumb2__)))
+ "pld [r1, #0]\n\t"
+ "eor r2, r0, r1\n\t"
+ "mov ip, r0\n\t"
+ "tst r2, #3\n\t"
+ "bne 4f\n\t"
+ "tst r1, #3\n\t"
+ "bne 3f\n"
+ "5:\n\t"
+#ifndef __thumb2__
+ "str r5, [sp, #-4]!\n\t"
+ "mov r5, #0x01\n\t"
+ "orr r5, r5, r5, lsl #8\n\t"
+ "orr r5, r5, r5, lsl #16\n\t"
+#endif
+
+ "str r4, [sp, #-4]!\n\t"
+ "tst r1, #4\n\t"
+ "ldr r3, [r1], #4\n\t"
+ "beq 2f\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "streq r3, [ip], #4\n\t"
+ "ldreq r3, [r1], #4\n"
+ "bne 1f\n\t"
+ /* Inner loop. We now know that r1 is 64-bit aligned, so we
+ can safely fetch up to two words. This allows us to avoid
+ load stalls. */
+ ".p2align 2\n"
+ "2:\n\t"
+ "pld [r1, #8]\n\t"
+ "ldr r4, [r1], #4\n\t"
+ "sub r2, r3, "magic1(r5)"\n\t"
+ "bics r2, r2, r3\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "sub r2, r4, "magic1(r5)"\n\t"
+ "bne 1f\n\t"
+ "str r3, [ip], #4\n\t"
+ "bics r2, r2, r4\n\t"
+ "tst r2, "magic2(r5)"\n\t"
+ "itt eq\n\t"
+ "ldreq r3, [r1], #4\n\t"
+ "streq r4, [ip], #4\n\t"
+ "beq 2b\n\t"
+ "mov r3, r4\n"
+ "1:\n\t"
+#ifdef __ARMEB__
+ "rors r3, r3, #24\n\t"
+#endif
+ "strb r3, [ip], #1\n\t"
+ "tst r3, #0xff\n\t"
+#ifdef __ARMEL__
+ "ror r3, r3, #8\n\t"
+#endif
+ "bne 1b\n\t"
+ "ldr r4, [sp], #4\n\t"
+#ifndef __thumb2__
+ "ldr r5, [sp], #4\n\t"
+#endif
+ "BX LR\n"
+
+ /* Strings have the same offset from word alignment, but it's
+ not zero. */
+ "3:\n\t"
+ "tst r1, #1\n\t"
+ "beq 1f\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "it eq\n"
+ "BXEQ LR\n"
+ "1:\n\t"
+ "tst r1, #2\n\t"
+ "beq 5b\n\t"
+ "ldrh r2, [r1], #2\n\t"
+#ifdef __ARMEB__
+ "tst r2, #0xff00\n\t"
+ "iteet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "lsreq r2, r2, #8\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff\n\t"
+#else
+ "tst r2, #0xff\n\t"
+ "itet ne\n\t"
+ "strneh r2, [ip], #2\n\t"
+ "streqb r2, [ip]\n\t"
+ "tstne r2, #0xff00\n\t"
+#endif
+ "bne 5b\n\t"
+ "BX LR\n"
+
+	 /* src and dst do not have a common word-alignment.  Fall back to
+ byte copying. */
+ "4:\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [ip], #1\n\t"
+ "cmp r2, #0\n\t"
+ "bne 4b\n\t"
+ "BX LR"
+
+#elif !defined (__thumb__) || defined (__thumb2__)
+ "mov r3, r0\n\t"
+ "1:\n\t"
+ "ldrb r2, [r1], #1\n\t"
+ "strb r2, [r3], #1\n\t"
+ "cmp r2, #0\n\t"
+ "bne 1b\n\t"
+ "BX LR"
+#else
+ "mov r3, r0\n\t"
+ "1:\n\t"
+ "ldrb r2, [r1]\n\t"
+ "add r1, r1, #1\n\t"
+ "strb r2, [r3]\n\t"
+ "add r3, r3, #1\n\t"
+ "cmp r2, #0\n\t"
+ "bne 1b\n\t"
+ "BX LR"
+#endif
+ );
+}
+/* For GLIBC: libc_hidden_builtin_def (strcpy) */
diff --git a/contrib/cortex-strings/src/thumb-2/strlen.S b/contrib/cortex-strings/src/thumb-2/strlen.S
new file mode 100644
index 000000000000..8efa2356fdd1
--- /dev/null
+++ b/contrib/cortex-strings/src/thumb-2/strlen.S
@@ -0,0 +1,150 @@
+/* Copyright (c) 2010-2011,2013 Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ Assumes:
+ ARMv6T2, AArch32
+
+ */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+#ifdef __ARMEB__
+#define S2LO lsl
+#define S2HI lsr
+#else
+#define S2LO lsr
+#define S2HI lsl
+#endif
+
+ /* This code requires Thumb. */
+ .thumb
+ .syntax unified
+
+/* Parameters and result. */
+#define srcin r0
+#define result r0
+
+/* Internal variables. */
+#define src r1
+#define data1a r2
+#define data1b r3
+#define const_m1 r12
+#define const_0 r4
+#define tmp1 r4 /* Overlaps const_0 */
+#define tmp2 r5
+
+def_fn strlen p2align=6
+ pld [srcin, #0]
+ strd r4, r5, [sp, #-8]!
+ bic src, srcin, #7
+ mvn const_m1, #0
+ ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */
+ pld [src, #32]
+ bne.w .Lmisaligned8
+ mov const_0, #0
+ mov result, #-8
+.Lloop_aligned:
+ /* Bytes 0-7. */
+ ldrd data1a, data1b, [src]
+ pld [src, #64]
+ add result, result, #8
+.Lstart_realigned:
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
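+	/* After the second sel, data1b is non-zero iff any of the eight
+	   bytes just loaded was NUL: zero lanes of data1b become 0xff
+	   directly, and NULs detected in data1a are carried through the
+	   lanes where data1b was non-zero.  */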
+
+ /* Bytes 8-15. */
+ ldrd data1a, data1b, [src, #8]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
+
+ /* Bytes 16-23. */
+ ldrd data1a, data1b, [src, #16]
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cbnz data1b, .Lnull_found
+
+ /* Bytes 24-31. */
+ ldrd data1a, data1b, [src, #24]
+ add src, src, #32
+ uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
+ add result, result, #8
+ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
+ uadd8 data1b, data1b, const_m1
+ sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
+ cmp data1b, #0
+ beq .Lloop_aligned
+
+.Lnull_found:
+ cmp data1a, #0
+ itt eq
+ addeq result, result, #4
+ moveq data1a, data1b
+#ifndef __ARMEB__
+ rev data1a, data1a
+#endif
+ clz data1a, data1a
+ ldrd r4, r5, [sp], #8
+ add result, result, data1a, lsr #3 /* Bits -> Bytes. */
+ bx lr
+
+.Lmisaligned8:
+ ldrd data1a, data1b, [src]
+ and tmp2, tmp1, #3
+ rsb result, tmp1, #0
+ lsl tmp2, tmp2, #3 /* Bytes -> bits. */
+ tst tmp1, #4
+ pld [src, #64]
+ S2HI tmp2, const_m1, tmp2
+ orn data1a, data1a, tmp2
+ itt ne
+ ornne data1b, data1b, tmp2
+ movne data1a, const_m1
+ mov const_0, #0
+ b .Lstart_realigned
+ .size strlen, . - strlen
+
diff --git a/contrib/cortex-strings/src/thumb/aeabi_idiv.S b/contrib/cortex-strings/src/thumb/aeabi_idiv.S
new file mode 100644
index 000000000000..b15a02c21932
--- /dev/null
+++ b/contrib/cortex-strings/src/thumb/aeabi_idiv.S
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* An executable stack is *not* required for these functions. */
+
+.section .note.GNU-stack,"",%progbits
+.previous
+.eabi_attribute 25, 1
+
+/* ANSI concatenation macros. */
+
+#define CONCAT1(a, b) CONCAT2(a, b)
+#define CONCAT2(a, b) a ## b
+
+/* Use the right prefix for global labels. */
+
+#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
+
+#define TYPE(x) .type SYM(x),function
+#define SIZE(x) .size SYM(x), . - SYM(x)
+#define LSYM(x) .x
+
+.macro cfi_start start_label, end_label
+ .pushsection .debug_frame
+LSYM(Lstart_frame):
+ .4byte LSYM(Lend_cie) - LSYM(Lstart_cie)
+LSYM(Lstart_cie):
+ .4byte 0xffffffff
+ .byte 0x1
+ .ascii "\0"
+ .uleb128 0x1
+ .sleb128 -4
+ .byte 0xe
+ .byte 0xc
+ .uleb128 0xd
+ .uleb128 0x0
+
+ .align 2
+LSYM(Lend_cie):
+ .4byte LSYM(Lend_fde)-LSYM(Lstart_fde)
+LSYM(Lstart_fde):
+ .4byte LSYM(Lstart_frame)
+ .4byte \start_label
+ .4byte \end_label-\start_label
+ .popsection
+.endm
+
+.macro cfi_end end_label
+ .pushsection .debug_frame
+ .align 2
+LSYM(Lend_fde):
+ .popsection
+\end_label:
+.endm
+
+.macro THUMB_LDIV0 name signed
+ push {r0, lr}
+ movs r0, #0
+ bl SYM(__aeabi_idiv0)
+ pop {r1, pc}
+.endm
+
+.macro FUNC_END name
+ SIZE (__\name)
+.endm
+
+.macro DIV_FUNC_END name signed
+ cfi_start __\name, LSYM(Lend_div0)
+LSYM(Ldiv0):
+ THUMB_LDIV0 \name \signed
+ cfi_end LSYM(Lend_div0)
+ FUNC_END \name
+.endm
+
+.macro THUMB_FUNC_START name
+ .globl SYM (\name)
+ TYPE (\name)
+ .thumb_func
+SYM (\name):
+.endm
+
+.macro FUNC_START name
+ .text
+ .globl SYM (__\name)
+ TYPE (__\name)
+ .align 0
+ .force_thumb
+ .thumb_func
+ .syntax unified
+SYM (__\name):
+.endm
+
+.macro FUNC_ALIAS new old
+ .globl SYM (__\new)
+ .thumb_set SYM (__\new), SYM (__\old)
+.endm
+
+/* Register aliases. */
+work .req r4
+dividend .req r0
+divisor .req r1
+overdone .req r2
+result .req r2
+curbit .req r3
+
+/* ------------------------------------------------------------------------ */
+/* Bodies of the division and modulo routines. */
+/* ------------------------------------------------------------------------ */
+.macro BranchToDiv n, label
+ lsrs curbit, dividend, \n
+ cmp curbit, divisor
+ bcc \label
+.endm
+
+.macro DoDiv n
+ lsrs curbit, dividend, \n
+ cmp curbit, divisor
+ bcc 1f
+ lsls curbit, divisor, \n
+ subs dividend, dividend, curbit
+
+1: adcs result, result
+.endm
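+/* Each DoDiv n step produces one quotient bit: if (dividend >> n) is at
+   least divisor, then divisor << n is subtracted from dividend and the
+   resulting carry shifts a 1 into result via the adcs; otherwise a 0 is
+   shifted in.  BranchToDiv performs the same test purely to choose a
+   starting point.  */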
+
+.macro THUMB1_Div_Positive
+ movs result, #0
+ BranchToDiv #1, LSYM(Lthumb1_div1)
+ BranchToDiv #4, LSYM(Lthumb1_div4)
+ BranchToDiv #8, LSYM(Lthumb1_div8)
+ BranchToDiv #12, LSYM(Lthumb1_div12)
+ BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+ movs result, #0xff
+ lsls divisor, divisor, #8
+ rev result, result
+ lsrs curbit, dividend, #16
+ cmp curbit, divisor
+ bcc 1f
+ asrs result, #8
+ lsls divisor, divisor, #8
+ beq LSYM(Ldivbyzero_waypoint)
+
+1: lsrs curbit, dividend, #12
+ cmp curbit, divisor
+ bcc LSYM(Lthumb1_div12)
+ b LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+ lsrs divisor, divisor, #8
+LSYM(Lthumb1_div16):
+ Dodiv #15
+ Dodiv #14
+ Dodiv #13
+ Dodiv #12
+LSYM(Lthumb1_div12):
+ Dodiv #11
+ Dodiv #10
+ Dodiv #9
+ Dodiv #8
+ bcs LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+ Dodiv #7
+ Dodiv #6
+ Dodiv #5
+LSYM(Lthumb1_div5):
+ Dodiv #4
+LSYM(Lthumb1_div4):
+ Dodiv #3
+LSYM(Lthumb1_div3):
+ Dodiv #2
+LSYM(Lthumb1_div2):
+ Dodiv #1
+LSYM(Lthumb1_div1):
+ subs divisor, dividend, divisor
+ bcs 1f
+ mov divisor, dividend
+
+1: adcs result, result
+ mov dividend, result
+ bx lr
+
+LSYM(Ldivbyzero_waypoint):
+ b LSYM(Ldiv0)
+.endm
+
+.macro THUMB1_Div_Negative
+ lsrs result, divisor, #31
+ beq 1f
+ rsbs divisor, divisor, #0
+
+1: asrs curbit, dividend, #32
+ bcc 2f
+ rsbs dividend, dividend, #0
+
+2: eors curbit, result
+ movs result, #0
+ mov ip, curbit
+ BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+ BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+ movs result, #0xfc
+ lsls divisor, divisor, #6
+ rev result, result
+ lsrs curbit, dividend, #8
+ cmp curbit, divisor
+ bcc LSYM(Lthumb1_div_negative8)
+
+ lsls divisor, divisor, #6
+ asrs result, result, #6
+ cmp curbit, divisor
+ bcc LSYM(Lthumb1_div_negative8)
+
+ lsls divisor, divisor, #6
+ asrs result, result, #6
+ cmp curbit, divisor
+ bcc LSYM(Lthumb1_div_negative8)
+
+ lsls divisor, divisor, #6
+ beq LSYM(Ldivbyzero_negative)
+ asrs result, result, #6
+ b LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+ lsrs divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+ DoDiv #7
+ DoDiv #6
+ DoDiv #5
+ DoDiv #4
+LSYM(Lthumb1_div_negative4):
+ DoDiv #3
+ DoDiv #2
+ bcs LSYM(Lthumb1_div_negative_loop)
+ DoDiv #1
+ subs divisor, dividend, divisor
+ bcs 1f
+ mov divisor, dividend
+
+1: mov curbit, ip
+ adcs result, result
+ asrs curbit, curbit, #1
+ mov dividend, result
+ bcc 2f
+ rsbs dividend, dividend, #0
+ cmp curbit, #0
+
+2: bpl 3f
+ rsbs divisor, divisor, #0
+
+3: bx lr
+
+LSYM(Ldivbyzero_negative):
+ mov curbit, ip
+ asrs curbit, curbit, #1
+ bcc LSYM(Ldiv0)
+ rsbs dividend, dividend, #0
+.endm
+
+/* ------------------------------------------------------------------------ */
+/* Start of the Real Functions */
+/* ------------------------------------------------------------------------ */
+
+ FUNC_START aeabi_idiv0
+ bx lr
+ FUNC_END aeabi_idiv0
+
+ FUNC_START divsi3
+ FUNC_ALIAS aeabi_idiv divsi3
+
+LSYM(divsi3_skip_div0_test):
+ mov curbit, dividend
+ orrs curbit, divisor
+ bmi LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+ THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+ THUMB1_Div_Negative
+
+ DIV_FUNC_END divsi3 signed
+
+ FUNC_START aeabi_idivmod
+
+ cmp r1, #0
+ beq LSYM(Ldiv0)
+ push {r0, r1, lr}
+ bl LSYM(divsi3_skip_div0_test)
+ POP {r1, r2, r3}
+ mul r2, r0
+ sub r1, r1, r2
+ bx r3
+
+ FUNC_END aeabi_idivmod
+/* ------------------------------------------------------------------------ */
diff --git a/contrib/cortex-strings/src/thumb/strcmp-armv6m.S b/contrib/cortex-strings/src/thumb/strcmp-armv6m.S
new file mode 100644
index 000000000000..d1255e0d36ed
--- /dev/null
+++ b/contrib/cortex-strings/src/thumb/strcmp-armv6m.S
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Implementation of strcmp for ARMv6-M.  This version is only used in
+   ARMv6-M when we want an efficient implementation.  Otherwise, if
+   code size is preferred, strcmp-armv4t.S will be used.  */
+
+ .thumb_func
+ .syntax unified
+ .arch armv6-m
+
+ .macro DoSub n, label
+ subs r0, r0, r1
+#ifdef __ARM_BIG_ENDIAN
+ lsrs r1, r4, \n
+#else
+ lsls r1, r4, \n
+#endif
+ orrs r1, r0
+ bne \label
+ .endm
+
+ .macro Byte_Test n, label
+ lsrs r0, r2, \n
+ lsrs r1, r3, \n
+ DoSub \n, \label
+ .endm
+
+ .text
+ .p2align 0
+ .global strcmp
+ .type strcmp, %function
+strcmp:
+ .cfi_startproc
+ mov r2, r0
+ push {r4, r5, r6, lr}
+ orrs r2, r1
+ lsls r2, r2, #30
+ bne 6f
+ ldr r5, =0x01010101
+ lsls r6, r5, #7
+1:
+ ldmia r0!, {r2}
+ ldmia r1!, {r3}
+ subs r4, r2, r5
+ bics r4, r2
+ ands r4, r6
+ beq 3f
+
+#ifdef __ARM_BIG_ENDIAN
+ Byte_Test #24, 4f
+ Byte_Test #16, 4f
+ Byte_Test #8, 4f
+
+ b 7f
+3:
+ cmp r2, r3
+ beq 1b
+ cmp r2, r3
+#else
+ uxtb r0, r2
+ uxtb r1, r3
+ DoSub #24, 2f
+
+ uxth r0, r2
+ uxth r1, r3
+ DoSub #16, 2f
+
+ lsls r0, r2, #8
+ lsls r1, r3, #8
+ lsrs r0, r0, #8
+ lsrs r1, r1, #8
+ DoSub #8, 2f
+
+ lsrs r0, r2, #24
+ lsrs r1, r3, #24
+ subs r0, r0, r1
+2:
+ pop {r4, r5, r6, pc}
+
+3:
+ cmp r2, r3
+ beq 1b
+ rev r0, r2
+ rev r1, r3
+ cmp r0, r1
+#endif
+
+ bls 5f
+ movs r0, #1
+4:
+ pop {r4, r5, r6, pc}
+5:
+ movs r0, #0
+ mvns r0, r0
+ pop {r4, r5, r6, pc}
+6:
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ bne 7f
+ ldrb r2, [r0, #0]
+ ldrb r3, [r1, #0]
+ adds r0, #1
+ adds r1, #1
+ cmp r2, #0
+ beq 7f
+ cmp r2, r3
+ beq 6b
+7:
+ subs r0, r2, r3
+ pop {r4, r5, r6, pc}
+ .cfi_endproc
+ .size strcmp, . - strcmp