47 files changed, 484 insertions, 572 deletions
diff --git a/string/Dir.mk b/string/Dir.mk
index 40ff5acc093e..dd8283ec4977 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -13,9 +13,12 @@ all-string bench-string check-string install-string clean-string:
 else
 
 string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS])
+string-lib-srcs += $(wildcard $(S)/$(ARCH)/experimental/*.[cS])
 string-test-srcs := $(wildcard $(S)/test/*.c)
 string-bench-srcs := $(wildcard $(S)/bench/*.c)
 
+string-arch-include-dir := $(wildcard $(S)/$(ARCH))
+string-arch-includes := $(wildcard $(S)/$(ARCH)/*.h)
 string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
 
 string-libs := \
@@ -43,6 +46,7 @@ string-tests := \
 
 string-benches := \
 	build/bin/bench/memcpy \
+	build/bin/bench/memset \
 	build/bin/bench/strlen
 
 string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
@@ -64,8 +68,8 @@ string-files := \
 
 all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
 
-$(string-objs): $(string-includes)
-$(string-objs): CFLAGS_ALL += $(string-cflags)
+$(string-objs): $(string-includes) $(string-arch-includes)
+$(string-objs): CFLAGS_ALL += $(string-cflags) $(addprefix -I,$(string-arch-include-dir))
 
 $(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE
 
@@ -101,6 +105,7 @@ check-string: $(string-tests-out)
 bench-string: $(string-benches)
 	$(EMULATOR) build/bin/bench/strlen
 	$(EMULATOR) build/bin/bench/memcpy
+	$(EMULATOR) build/bin/bench/memset
 
 install-string: \
  $(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
index 207e22950c6d..34b5789240da 100644
--- a/string/aarch64/__mtag_tag_region.S
+++ b/string/aarch64/__mtag_tag_region.S
@@ -27,9 +27,6 @@
 #define zva_val	x4
 
 ENTRY (__mtag_tag_region)
-	PTR_ARG (0)
-	SIZE_ARG (1)
-
 	add	dstend, dstin, count
 
 	cmp	count, 96
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
index 44b8e0114f42..2fa248e25621 100644
--- a/string/aarch64/__mtag_tag_zero_region.S
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -27,9 +27,6 @@
 #define zva_val	x4
 
 ENTRY (__mtag_tag_zero_region)
-	PTR_ARG (0)
-	SIZE_ARG (1)
-
 	add	dstend, dstin, count
 
 	cmp	count, 96
diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h
index 131b95e1fea9..90166676977a 100644
--- a/string/aarch64/asmdefs.h
+++ b/string/aarch64/asmdefs.h
@@ -21,19 +21,6 @@
 #define FEATURE_1_PAC 2
 
 /* Add a NT_GNU_PROPERTY_TYPE_0 note.  */
-#ifdef __ILP32__
-#define GNU_PROPERTY(type, value)	\
-  .section .note.gnu.property, "a";	\
-  .p2align 2;				\
-  .word 4;				\
-  .word 12;				\
-  .word 5;				\
-  .asciz "GNU";				\
-  .word type;				\
-  .word 4;				\
-  .word value;				\
-  .text
-#else
 #define GNU_PROPERTY(type, value)	\
   .section .note.gnu.property, "a";	\
   .p2align 3;				\
@@ -46,7 +33,6 @@
   .word value;				\
   .word 0;				\
   .text
-#endif
 
 /* If set then the GNU Property Note section will be added to
    mark objects to support BTI and PAC-RET.  */
@@ -80,27 +66,4 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
 
 #define L(l) .L ## l
 
-#ifdef __ILP32__
-  /* Sanitize padding bits of pointer arguments as per aapcs64 */
-#define PTR_ARG(n)  mov w##n, w##n
-#else
-#define PTR_ARG(n)
-#endif
-
-#ifdef __ILP32__
-  /* Sanitize padding bits of size arguments as per aapcs64 */
-#define SIZE_ARG(n)  mov w##n, w##n
-#else
-#define SIZE_ARG(n)
-#endif
-
-/* Compiler supports SVE instructions  */
-#ifndef HAVE_SVE
-# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
-#   define HAVE_SVE 1
-# else
-#   define HAVE_SVE 0
-# endif
-#endif
-
 #endif
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/experimental/memchr-sve.S
index b851cf31f238..b314551f3e0f 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/experimental/memchr-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,8 +16,6 @@
  */
 
 ENTRY (__memchr_aarch64_sve)
-	PTR_ARG (0)
-	SIZE_ARG (2)
 	dup	z1.b, w1			/* duplicate c to a vector */
 	setffr					/* initialize FFR */
 	mov	x3, 0				/* initialize off */
@@ -59,6 +58,3 @@ ENTRY (__memchr_aarch64_sve)
 	ret
 
 END (__memchr_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/experimental/memcmp-sve.S
index d52ce4555344..ad3534836d04 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/experimental/memcmp-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,9 +16,6 @@
  */
 
 ENTRY (__memcmp_aarch64_sve)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	mov	x3, 0			/* initialize off */
 
 0:	whilelo	p0.b, x3, x2		/* while off < max */
@@ -46,6 +44,3 @@ ENTRY (__memcmp_aarch64_sve)
 	ret
 
 END (__memcmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/experimental/stpcpy-sve.S
index 5d3f14b86026..5d3f14b86026 100644
--- a/string/aarch64/stpcpy-sve.S
+++ b/string/aarch64/experimental/stpcpy-sve.S
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/experimental/strchr-sve.S
index ff075167bfef..7d74ae9ff232 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/experimental/strchr-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -22,7 +23,6 @@
 #endif
 
 ENTRY (FUNC)
-	PTR_ARG (0)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
@@ -65,6 +65,3 @@ ENTRY (FUNC)
 	b	0b
 
 END (FUNC)
-
-#endif
-
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/experimental/strchrnul-sve.S
index 0005f9177514..0005f9177514 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/experimental/strchrnul-sve.S
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/experimental/strcmp-sve.S
index eaf909a378f1..b6c249588534 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/experimental/strcmp-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,8 +16,6 @@
  */
 
 ENTRY (__strcmp_aarch64_sve)
-	PTR_ARG (0)
-	PTR_ARG (1)
 	setffr				/* initialize FFR */
 	ptrue	p1.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
@@ -54,6 +53,3 @@ ENTRY (__strcmp_aarch64_sve)
 	b	1b
 
 END (__strcmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/experimental/strcpy-sve.S
index 00e72dce4451..57b77c8a00e7 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/experimental/strcpy-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -22,8 +23,6 @@
 #endif
 
 ENTRY (FUNC)
-	PTR_ARG (0)
-	PTR_ARG (1)
 	setffr				/* initialize FFR */
 	ptrue	p2.b, all		/* all ones; loop invariant */
 	mov	x2, 0			/* initialize offset */
@@ -66,6 +65,3 @@ ENTRY (FUNC)
 	ret
 
 END (FUNC)
-
-#endif
-
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/experimental/strlen-sve.S
index 12ebbdba5c93..c83155052c07 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/experimental/strlen-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,7 +16,6 @@
  */
 
 ENTRY (__strlen_aarch64_sve)
-	PTR_ARG (0)
 	setffr			/* initialize FFR */
 	ptrue	p2.b		/* all ones; loop invariant */
 	mov	x1, 0		/* initialize length */
@@ -50,6 +50,3 @@ ENTRY (__strlen_aarch64_sve)
 	b	0b
 
 END (__strlen_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/experimental/strncmp-sve.S
index 6a9e9f7b6437..a281e642d8aa 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/experimental/strncmp-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,9 +16,6 @@
  */
 
 ENTRY (__strncmp_aarch64_sve)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	setffr				/* initialize FFR */
 	mov	x3, 0			/* initialize off */
 
@@ -64,6 +62,3 @@ ENTRY (__strncmp_aarch64_sve)
 	ret
 
 END (__strncmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/experimental/strnlen-sve.S
index 6c43dc427da7..11d835a1b13c 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/experimental/strnlen-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,8 +16,6 @@
  */
 
 ENTRY (__strnlen_aarch64_sve)
-	PTR_ARG (0)
-	SIZE_ARG (1)
 	setffr				/* initialize FFR */
 	mov	x2, 0			/* initialize len */
 	b	1f
@@ -69,6 +68,3 @@ ENTRY (__strnlen_aarch64_sve)
 	ret
 
 END (__strnlen_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/experimental/strrchr-sve.S
index 825a7384cfc1..731edaddf156 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/experimental/strrchr-sve.S
@@ -7,7 +7,8 @@
 
 #include "asmdefs.h"
 
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
 /* Assumptions:
  *
  * ARMv8-a, AArch64
@@ -15,7 +16,6 @@
  */
 
 ENTRY (__strrchr_aarch64_sve)
-	PTR_ARG (0)
 	dup	z1.b, w1		/* replicate byte across vector */
 	setffr				/* initialize FFR */
 	ptrue	p1.b			/* all ones; loop invariant */
@@ -79,6 +79,3 @@ ENTRY (__strrchr_aarch64_sve)
 	ret
 
 END (__strrchr_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index 948c3cbc7dd4..68bd0af9a8c5 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -40,8 +40,6 @@
    exactly which byte matched.  */
 
 ENTRY (__memchr_aarch64_mte)
-	PTR_ARG (0)
-	SIZE_ARG (2)
 	bic	src, srcin, 15
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index fe6cfe2bc0e2..d12a38abbc30 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -47,8 +47,6 @@
  */
 
 ENTRY (__memchr_aarch64)
-	PTR_ARG (0)
-	SIZE_ARG (2)
 	/* Do not dereference srcin if no bytes to compare.  */
 	cbz	cntin, L(zero_length)
 	/*
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 35135e72cc8e..43439de4db69 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -30,10 +30,6 @@
 
 
 ENTRY (__memcmp_aarch64)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
 	cmp	limit, 16
 	b.lo	L(less16)
 	ldp	data1, data3, [src1]
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index 9d3027d4d3cd..cbf4c581500e 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -52,9 +52,6 @@
 
 ENTRY_ALIAS (__memmove_aarch64_simd)
 ENTRY (__memcpy_aarch64_simd)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	add	srcend, src, count
 	cmp	count, 128
 	b.hi	L(copy_long)
diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S
index b45c31418717..03ae95570c04 100644
--- a/string/aarch64/memcpy-mops.S
+++ b/string/aarch64/memcpy-mops.S
@@ -8,10 +8,6 @@
 #include "asmdefs.h"
 
 ENTRY (__memcpy_aarch64_mops)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
 	mov	x3, x0
 	.inst	0x19010443	/* cpyfp   [x3]!, [x1]!, x2!  */
 	.inst	0x19410443	/* cpyfm   [x3]!, [x1]!, x2!  */
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
index e8a946d7db37..9b05cb2a58ee 100644
--- a/string/aarch64/memcpy-sve.S
+++ b/string/aarch64/memcpy-sve.S
@@ -13,8 +13,6 @@
 
 #include "asmdefs.h"
 
-#ifdef HAVE_SVE
-
 .arch armv8-a+sve
 
 #define dstin	x0
@@ -51,10 +49,6 @@
 
 ENTRY_ALIAS (__memmove_aarch64_sve)
 ENTRY (__memcpy_aarch64_sve)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
 	cmp	count, 128
 	b.hi	L(copy_long)
 	cntb	vlen
@@ -173,5 +167,3 @@ L(return):
 	ret
 
 END (__memcpy_aarch64_sve)
-
-#endif
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 7c0606e2104a..351f1a11f097 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -55,9 +55,6 @@
 
 ENTRY_ALIAS (__memmove_aarch64)
 ENTRY (__memcpy_aarch64)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S
index 6c73017bb16f..d9839f86e9b4 100644
--- a/string/aarch64/memmove-mops.S
+++ b/string/aarch64/memmove-mops.S
@@ -8,10 +8,6 @@
 #include "asmdefs.h"
 
 ENTRY (__memmove_aarch64_mops)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
-
 	mov	x3, x0
 	.inst	0x1d010443	/* cpyp    [x3]!, [x1]!, x2!  */
 	.inst	0x1d410443	/* cpym    [x3]!, [x1]!, x2!  */
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index 6418bdf56f41..ed38478a6faa 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -42,7 +42,6 @@
    exactly which byte matched.  */
 
 ENTRY (__memrchr_aarch64)
-	PTR_ARG (0)
 	add	end, srcin, cntin
 	sub	endm1, end, 1
 	bic	src, endm1, 15
diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S
index ec791493bae9..00d8e7d2c05f 100644
--- a/string/aarch64/memset-mops.S
+++ b/string/aarch64/memset-mops.S
@@ -8,9 +8,6 @@
 #include "asmdefs.h"
 
 ENTRY (__memset_aarch64_mops)
-	PTR_ARG (0)
-	SIZE_ARG (2)
-
 	mov     x3, x0
 	.inst   0x19c10443	/* setp    [x3]!, x2!, x1  */
 	.inst   0x19c14443	/* setm    [x3]!, x2!, x1  */
diff --git a/string/aarch64/memset-sve.S b/string/aarch64/memset-sve.S
new file mode 100644
index 000000000000..efaeaece284e
--- /dev/null
+++ b/string/aarch64/memset-sve.S
@@ -0,0 +1,114 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2024-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+.arch armv8-a+sve
+
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define zva_val	x5
+#define vlen	x5
+#define off	x3
+#define dstend2 x5
+
+ENTRY (__memset_aarch64_sve)	/* Fill count bytes at dstin with the low byte of valw.  */
+	dup	v0.16B, valw		/* Replicate the fill byte into all 16 lanes of v0.  */
+	cmp	count, 16
+	b.lo	L(set_16)		/* count < 16: one predicated SVE store.  */
+
+	add	dstend, dstin, count
+	cmp	count, 64
+	b.hs	L(set_128)		/* count >= 64.  */
+
+	/* Set 16..63 bytes.  */
+	mov	off, 16
+	and	off, off, count, lsr 1	/* off = 16 if count >= 32, else 0.  */
+	sub	dstend2, dstend, off
+	str	q0, [dstin]		/* Four 16-byte stores that may overlap.  */
+	str	q0, [dstin, off]
+	str	q0, [dstend2, -16]
+	str	q0, [dstend, -16]
+	ret
+
+	.p2align 4
+L(set_16):				/* Set 0..15 bytes.  */
+	whilelo p0.b, xzr, count	/* p0 enables byte lanes 0..count-1.  */
+	st1b	z0.b, p0, [dstin]	/* z0 overlaps v0, so it holds the fill byte.  */
+	ret
+
+	.p2align 4
+L(set_128):				/* count >= 64.  */
+	bic	dst, dstin, 15		/* dst = dstin rounded down to 16; used from set_long on.  */
+	cmp	count, 128
+	b.hi	L(set_long)
+	stp	q0, q0, [dstin]		/* 64..128 bytes: 64 from the start, 64 from the end.  */
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+L(set_long):				/* count > 128.  */
+	cmp	count, 256
+	b.lo	L(no_zva)		/* DC ZVA path needs count >= 256 ...  */
+	tst	valw, 255
+	b.ne	L(no_zva)		/* ... and a zero fill byte.  */
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
+	str	q0, [dstin]		/* Head: plain stores up to the first 64-byte boundary.  */
+	str	q0, [dst, 16]
+	bic	dst, dstin, 31		/* dst = dstin rounded down to 32.  */
+	stp	q0, q0, [dst, 32]
+	bic	dst, dstin, 63		/* dst = dstin rounded down to 64.  */
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	sub	x8, dstend, 1		/* Write last bytes before ZVA loop.  */
+	bic	x8, x8, 15		/* x8 (scratch, no #define) = (dstend - 1) rounded down to 16.  */
+	stp	q0, q0, [x8, -48]
+	str	q0, [x8, -16]
+	str	q0, [dstend, -16]
+
+	.p2align 4
+L(zva64_loop):
+	add	dst, dst, 64		/* Advance by 64 bytes per iteration.  */
+	dc	zva, dst
+	subs	count, count, 64
+	b.hi	L(zva64_loop)
+	ret
+
+L(no_zva):				/* count > 128, stores only.  */
+	str	q0, [dstin]		/* Head store; the loop starts at dst + 16.  */
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
+	stp	q0, q0, [dst, 16]	/* 64 bytes per iteration.  */
+	stp	q0, q0, [dst, 48]
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	stp	q0, q0, [dstend, -64]	/* Tail: last 64 bytes, may overlap the loop stores.  */
+	stp	q0, q0, [dstend, -32]
+	ret
+
+END (__memset_aarch64_sve)
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 553b0fcaefea..906a4dcf46c6 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,7 +1,7 @@
 /*
  * memset - fill memory with a constant byte
  *
- * Copyright (c) 2012-2022, Arm Limited.
+ * Copyright (c) 2012-2024, Arm Limited.
  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */
 
@@ -20,93 +20,98 @@
 #define dst	x3
 #define dstend	x4
 #define zva_val	x5
+#define off	x3
+#define dstend2	x5
 
 ENTRY (__memset_aarch64)
-	PTR_ARG (0)
-	SIZE_ARG (2)
-
 	dup	v0.16B, valw
-	add	dstend, dstin, count
-
-	cmp	count, 96
-	b.hi	L(set_long)
 	cmp	count, 16
-	b.hs	L(set_medium)
-	mov	val, v0.D[0]
+	b.lo	L(set_small)
 
-	/* Set 0..15 bytes.  */
-	tbz	count, 3, 1f
-	str	val, [dstin]
-	str	val, [dstend, -8]
+	add	dstend, dstin, count
+	cmp	count, 64
+	b.hs	L(set_128)
+
+	/* Set 16..63 bytes.  */
+	mov	off, 16
+	and	off, off, count, lsr 1
+	sub	dstend2, dstend, off
+	str	q0, [dstin]
+	str	q0, [dstin, off]
+	str	q0, [dstend2, -16]
+	str	q0, [dstend, -16]
 	ret
+
 	.p2align 4
-1:	tbz	count, 2, 2f
-	str	valw, [dstin]
-	str	valw, [dstend, -4]
+	/* Set 0..15 bytes.  */
+L(set_small):
+	add	dstend, dstin, count
+	cmp	count, 4
+	b.lo	2f
+	lsr	off, count, 3
+	sub	dstend2, dstend, off, lsl 2
+	str	s0, [dstin]
+	str	s0, [dstin, off, lsl 2]
+	str	s0, [dstend2, -4]
+	str	s0, [dstend, -4]
 	ret
+
+	/* Set 0..3 bytes.  */
 2:	cbz	count, 3f
+	lsr	off, count, 1
 	strb	valw, [dstin]
-	tbz	count, 1, 3f
-	strh	valw, [dstend, -2]
+	strb	valw, [dstin, off]
+	strb	valw, [dstend, -1]
 3:	ret
 
-	/* Set 17..96 bytes.  */
-L(set_medium):
-	str	q0, [dstin]
-	tbnz	count, 6, L(set96)
-	str	q0, [dstend, -16]
-	tbz	count, 5, 1f
-	str	q0, [dstin, 16]
-	str	q0, [dstend, -32]
-1:	ret
-
 	.p2align 4
-	/* Set 64..96 bytes.  Write 64 bytes from the start and
-	   32 bytes from the end.  */
-L(set96):
-	str	q0, [dstin, 16]
+L(set_128):
+	bic	dst, dstin, 15
+	cmp	count, 128
+	b.hi	L(set_long)
+	stp	q0, q0, [dstin]
 	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
 	.p2align 4
 L(set_long):
-	and	valw, valw, 255
-	bic	dst, dstin, 15
 	str	q0, [dstin]
-	cmp	count, 160
-	ccmp	valw, 0, 0, hs
+	str	q0, [dst, 16]
+	tst	valw, 255
 	b.ne	L(no_zva)
-
 #ifndef SKIP_ZVA_CHECK
 	mrs	zva_val, dczid_el0
 	and	zva_val, zva_val, 31
 	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
 	b.ne	L(no_zva)
 #endif
-	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
+	bic	dst, dstin, 63
 	sub	count, dstend, dst	/* Count is now 64 too large.  */
-	sub	count, count, 128	/* Adjust count and bias for loop.  */
+	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
+
+	/* Write last bytes before ZVA loop.  */
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 
 	.p2align 4
-L(zva_loop):
+L(zva64_loop):
 	add	dst, dst, 64
 	dc	zva, dst
 	subs	count, count, 64
-	b.hi	L(zva_loop)
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
+	b.hi	L(zva64_loop)
 	ret
 
+	.p2align 3
 L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	sub	dst, dst, 16		/* Dst is biased by -32.  */
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+	sub	count, dstend, dst	/* Count is 32 too large.  */
+	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
 L(no_zva_loop):
 	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]!
+	stp	q0, q0, [dst, 64]
+	add	dst, dst, 64
 	subs	count, count, 64
 	b.hi	L(no_zva_loop)
 	stp	q0, q0, [dstend, -64]
@@ -114,4 +119,3 @@ L(no_zva_loop):
 	ret
 
 END (__memset_aarch64)
-
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 6ec08f7acc76..42b747311bc6 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -39,7 +39,6 @@
    If it is not a multiple of 4, there was no match.  */
 
 ENTRY (__strchr_aarch64_mte)
-	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 37193bd947a7..c1d01e9635b6 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -51,7 +51,6 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchr_aarch64)
-	PTR_ARG (0)
 	/* Magic constant 0xc0300c03 to allow us to identify which lane
 	   matches the requested byte.  Even bits are set if the character
 	   matches, odd bits if either the char is NUL or matches.  */
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 543ee88bb285..b3180cdf9e2c 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -38,7 +38,6 @@
    exactly which byte matched.  */
 
 ENTRY (__strchrnul_aarch64_mte)
-	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 666e8d0304c1..0a32c46c30c5 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -47,7 +47,6 @@
 /* Locals and temporaries.  */
 
 ENTRY (__strchrnul_aarch64)
-	PTR_ARG (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the termination condition.  */
 	mov	wtmp2, #0x0401
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 137a9aa06681..7c0d0485a89b 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -51,8 +51,6 @@
 
 
 ENTRY (__strcmp_aarch64)
-	PTR_ARG (0)
-	PTR_ARG (1)
 	sub	off2, src2, src1
 	mov	zeroones, REP8_01
 	and	tmp, src1, 7
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 97ae37ea4229..5852616e6024 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -52,8 +52,6 @@
    exactly which byte matched.  */
 
 ENTRY (STRCPY)
-	PTR_ARG (0)
-	PTR_ARG (1)
 	bic	src, srcin, 15
 	ld1	{vdata.16b}, [src]
 	cmeq	vhas_nul.16b, vdata.16b, 0
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 77235797f7c5..afa72eed9a43 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -33,7 +33,6 @@
    identifies the first zero byte.  */
 
 ENTRY (__strlen_aarch64_mte)
-	PTR_ARG (0)
 	bic	src, srcin, 15
 	ld1	{vdata.16b}, [src]
 	cmeq	vhas_nul.16b, vdata.16b, 0
@@ -41,37 +40,50 @@ ENTRY (__strlen_aarch64_mte)
 	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
-	cbz	synd, L(loop)
+	cbz	synd, L(next16)
 
 	rbit	synd, synd
 	clz	result, synd
 	lsr	result, result, 2
 	ret
 
+L(next16):
+	ldr	data, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
+	fmov	synd, dend
+	cbz	synd, L(loop)
+	add	src, src, 16
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	sub	result, src, srcin
+	clz	tmp, synd
+	add	result, result, tmp, lsr 2
+	ret
+
 	.p2align 5
 L(loop):
-	ldr	data, [src, 16]
+	ldr	data, [src, 32]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
 	fmov	synd, dend
 	cbnz	synd, L(loop_end)
-	ldr	data, [src, 32]!
+	ldr	data, [src, 16]
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
 	fmov	synd, dend
 	cbz	synd, L(loop)
-	sub	src, src, 16
+	add	src, src, 16
 L(loop_end):
-	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
-	sub	result, src, srcin
-	fmov	synd, dend
+	sub	result, shift, src, lsl 2	/* (srcin - src) << 2.  */
 #ifndef __AARCH64EB__
 	rbit	synd, synd
+	sub	result, result, 3
 #endif
-	add	result, result, 16
 	clz	tmp, synd
-	add	result, result, tmp, lsr 2
+	sub	result, tmp, result
+	lsr	result, result, 2
 	ret
 
 END (__strlen_aarch64_mte)
-
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 6f6f08f636b2..0ebb26be844c 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -75,7 +75,6 @@
    character, return the length, if not, continue in the main loop.  */
 
 ENTRY (__strlen_aarch64)
-	PTR_ARG (0)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
 	cmp	tmp1, MIN_PAGE_SIZE - 32
 	b.hi	L(page_cross)
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 128a10c52bb1..493a0f06ed1d 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -55,9 +55,6 @@
 #endif
 
 ENTRY (__strncmp_aarch64)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	SIZE_ARG (2)
 	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index f2090a7485a5..6a96ec268f1a 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -36,8 +36,6 @@
    identifies the first zero byte.  */
 
 ENTRY (__strnlen_aarch64)
-	PTR_ARG (0)
-	SIZE_ARG (1)
 	bic	src, srcin, 15
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index bb61ab9ad4e7..8668ce6d2916 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -42,7 +42,6 @@
    if the relevant byte matched the NUL end of string.  */
 
 ENTRY (__strrchr_aarch64_mte)
-	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	movi	vrepmask.16b, 0x33
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index bf9cb297b6cb..f5713f4260fb 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -55,7 +55,6 @@
    identify exactly which byte is causing the termination, and why.  */
 
 ENTRY (__strrchr_aarch64)
-	PTR_ARG (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the requested byte.  Magic constant 0x80200802 used
 	   similarly for NUL termination.  */
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index b628f9b60d96..583fa505db75 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -20,35 +20,18 @@
 #define MIN_SIZE 32768
 #define MAX_SIZE (1024 * 1024)
 
-static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
-static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
-
-#define F(x) {#x, x},
-
-static const struct fun
-{
-  const char *name;
-  void *(*fun)(void *, const void *, size_t);
-} funtab[] =
-{
-#if __aarch64__
-  F(__memcpy_aarch64)
-# if __ARM_NEON
-  F(__memcpy_aarch64_simd)
-# endif
-# if __ARM_FEATURE_SVE
-  F(__memcpy_aarch64_sve)
-# endif
-# if WANT_MOPS
-  F(__memcpy_aarch64_mops)
-# endif
-#elif __arm__
-  F(__memcpy_arm)
-#endif
-  F(memcpy)
-#undef F
-  {0, 0}
-};
+static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096)));
+static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096)));
+
+#define DOTEST(STR,TESTFN)			\
+  printf (STR);					\
+  RUN (TESTFN, memcpy);				\
+  RUNA64 (TESTFN, __memcpy_aarch64);		\
+  RUNA64 (TESTFN, __memcpy_aarch64_simd);	\
+  RUNSVE (TESTFN, __memcpy_aarch64_sve);	\
+  RUNMOPS (TESTFN, __memcpy_aarch64_mops);	\
+  RUNA32 (TESTFN, __memcpy_arm);		\
+  printf ("\n");
 
 typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
 typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@@ -160,183 +143,125 @@ init_copies (size_t max_size)
   return total;
 }
 
-int main (void)
+static void inline __attribute ((always_inline))
+memcpy_random (const char *name, void *(*fn)(void *, const void *, size_t))
 {
-  init_copy_distribution ();
-
-  memset (a, 1, sizeof (a));
-  memset (b, 2, sizeof (b));
-
-  printf("Random memcpy (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      size_t total = 0;
-      uint64_t tsum = 0;
-      printf ("%22s ", funtab[f].name);
-      rand32 (0x12345678);
-
-      for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
-	{
-	  size_t copy_size = init_copies (size) * ITERS;
-
-	  for (int c = 0; c < NUM_TESTS; c++)
-	    funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
-			   test_arr[c].len);
-
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS; i++)
-	    for (int c = 0; c < NUM_TESTS; c++)
-	      funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
-			     test_arr[c].len);
-	  t = clock_get_ns () - t;
-	  total += copy_size;
-	  tsum += t;
-	  printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
-	}
-      printf( "avg %.2f\n", (double)total / tsum);
-    }
-
-  size_t total = 0;
-  uint64_t tsum = 0;
-  printf ("%22s ", "memcpy_call");
-  rand32 (0x12345678);
-
+  printf ("%22s ", name);
+  uint64_t total = 0, tsum = 0;
   for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
     {
-      size_t copy_size = init_copies (size) * ITERS;
+      uint64_t copy_size = init_copies (size) * ITERS;
 
       for (int c = 0; c < NUM_TESTS; c++)
-	memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+	fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
 
       uint64_t t = clock_get_ns ();
       for (int i = 0; i < ITERS; i++)
 	for (int c = 0; c < NUM_TESTS; c++)
-	  memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+	  fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
       t = clock_get_ns () - t;
       total += copy_size;
       tsum += t;
-      printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+      printf ("%dK: %5.2f ", size / 1024, (double)copy_size / t);
     }
-  printf( "avg %.2f\n", (double)total / tsum);
-
+  printf( "avg %5.2f\n", (double)total / tsum);
+}
 
-  printf ("\nAligned medium memcpy (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 8; size <= 512; size *= 2)
-	{
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS2; i++)
-	    funtab[f].fun (b, a, size);
-	  t = clock_get_ns () - t;
-	  printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
-	}
-      printf ("\n");
-    }
+static void inline __attribute ((always_inline))
+memcpy_medium_aligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+  printf ("%22s ", name);
 
-  printf ("%22s ", "memcpy_call");
   for (int size = 8; size <= 512; size *= 2)
     {
       uint64_t t = clock_get_ns ();
       for (int i = 0; i < ITERS2; i++)
-	memcpy (b, a, size);
+	fn (b, a, size);
       t = clock_get_ns () - t;
-      printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+      printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
     }
   printf ("\n");
+}
 
+static void inline __attribute ((always_inline))
+memcpy_medium_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+  printf ("%22s ", name);
 
-  printf ("\nUnaligned medium memcpy (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 8; size <= 512; size *= 2)
-	{
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS2; i++)
-	    funtab[f].fun (b + 3, a + 1, size);
-	  t = clock_get_ns () - t;
-	  printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
-	}
-      printf ("\n");
-    }
-
-  printf ("%22s ", "memcpy_call");
   for (int size = 8; size <= 512; size *= 2)
     {
       uint64_t t = clock_get_ns ();
       for (int i = 0; i < ITERS2; i++)
-	memcpy (b + 3, a + 1, size);
+	fn (b + 3, a + 1, size);
       t = clock_get_ns () - t;
-      printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+      printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
     }
   printf ("\n");
+}
 
+static void inline __attribute ((always_inline))
+memcpy_large (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+  printf ("%22s ", name);
 
-  printf ("\nLarge memcpy (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 1024; size <= 65536; size *= 2)
-	{
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS3; i++)
-	    funtab[f].fun (b, a, size);
-	  t = clock_get_ns () - t;
-	  printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
-	}
-      printf ("\n");
-    }
-
-  printf ("%22s ", "memcpy_call");
   for (int size = 1024; size <= 65536; size *= 2)
     {
       uint64_t t = clock_get_ns ();
       for (int i = 0; i < ITERS3; i++)
-	memcpy (b, a, size);
+	fn (b, a, size);
       t = clock_get_ns () - t;
-      printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+      printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
     }
   printf ("\n");
+}
 
+static void inline __attribute ((always_inline))
+memmove_forward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+  printf ("%22s ", name);
 
-  printf ("\nUnaligned forwards memmove (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
+  for (int size = 1024; size <= 65536; size *= 2)
     {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 1024; size <= 65536; size *= 2)
-	{
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS3; i++)
-	    funtab[f].fun (a, a + 256 + (i & 31), size);
-	  t = clock_get_ns () - t;
-	  printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
-	}
-      printf ("\n");
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS3; i++)
+        fn (a, a + 256 + (i & 31), size);
+      t = clock_get_ns () - t;
+      printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
     }
 
+  printf ("\n");
+}
+
+static void inline __attribute ((always_inline))
+memmove_backward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+  printf ("%22s ", name);
 
-  printf ("\nUnaligned backwards memmove (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
+  for (int size = 1024; size <= 65536; size *= 2)
     {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 1024; size <= 65536; size *= 2)
-	{
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS3; i++)
-	    funtab[f].fun (a + 256 + (i & 31), a, size);
-	  t = clock_get_ns () - t;
-	  printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
-	}
-      printf ("\n");
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS3; i++)
+	fn (a + 256 + (i & 31), a, size);
+      t = clock_get_ns () - t;
+      printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
     }
+
   printf ("\n");
+}
+
+int main (void)
+{
+  init_copy_distribution ();
+
+  memset (a, 1, sizeof (a));
+  memset (b, 2, sizeof (b));
+
+  DOTEST ("Random memcpy (bytes/ns):\n", memcpy_random);
+  DOTEST ("Medium memcpy aligned (bytes/ns):\n", memcpy_medium_aligned);
+  DOTEST ("Medium memcpy unaligned (bytes/ns):\n", memcpy_medium_unaligned);
+  DOTEST ("Large memcpy (bytes/ns):\n", memcpy_large);
+  DOTEST ("Forwards memmove unaligned (bytes/ns):\n", memmove_forward_unaligned);
+  DOTEST ("Backwards memmove unaligned (bytes/ns):\n", memmove_backward_unaligned);
 
   return 0;
 }
diff --git a/string/bench/memset.c b/string/bench/memset.c
index 990e23ba9a36..07474e469146 100644
--- a/string/bench/memset.c
+++ b/string/bench/memset.c
@@ -20,25 +20,16 @@
 #define MIN_SIZE 32768
 #define MAX_SIZE (1024 * 1024)
 
-static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(4096)));
 
-#define F(x) {#x, x},
-
-static const struct fun
-{
-  const char *name;
-  void *(*fun)(void *, int, size_t);
-} funtab[] =
-{
-#if __aarch64__
-  F(__memset_aarch64)
-#elif __arm__
-  F(__memset_arm)
-#endif
-  F(memset)
-#undef F
-  {0, 0}
-};
+#define DOTEST(STR,TESTFN) /* Print STR; run TESTFN on each memset.  */ \
+  printf (STR);					\
+  RUN (TESTFN, memset);				\
+  RUNA64 (TESTFN, __memset_aarch64);		\
+  RUNSVE (TESTFN, __memset_aarch64_sve);	\
+  RUNMOPS (TESTFN, __memset_aarch64_mops);	\
+  RUNA32 (TESTFN, __memset_arm);		\
+  printf ("\n");
 
 typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
 static memset_test_t test_arr[NUM_TESTS];
@@ -127,117 +118,73 @@ init_memset (size_t max_size)
   return total;
 }
 
-
-int main (void)
+static void inline __attribute ((always_inline))
+memset_random (const char *name, void *(*set)(void *, int, size_t))
 {
-  init_memset_distribution ();
-
-  memset (a, 1, sizeof (a));
-
-  printf("Random memset (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      size_t total_size = 0;
-      uint64_t tsum = 0;
-      printf ("%22s ", funtab[f].name);
-      rand32 (0x12345678);
-
-      for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
-	{
-	  size_t memset_size = init_memset (size) * ITERS;
-
-	  for (int c = 0; c < NUM_TESTS; c++)
-	    funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
-
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS; i++)
-	    for (int c = 0; c < NUM_TESTS; c++)
-	      funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
-	  t = clock_get_ns () - t;
-	  total_size += memset_size;
-	  tsum += t;
-	  printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
-	}
-      printf( "avg %.2f\n", (double)total_size / tsum);
-    }
-
-  size_t total_size = 0;
+  uint64_t total_size = 0;
   uint64_t tsum = 0;
-  printf ("%22s ", "memset_call");
+  printf ("%22s ", name);
   rand32 (0x12345678);
 
   for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
     {
-      size_t memset_size = init_memset (size) * ITERS;
+      uint64_t memset_size = init_memset (size) * ITERS;
 
       for (int c = 0; c < NUM_TESTS; c++)
-	memset (a + test_arr[c].offset, 0, test_arr[c].len);
+	set (a + test_arr[c].offset, 0, test_arr[c].len);
 
       uint64_t t = clock_get_ns ();
       for (int i = 0; i < ITERS; i++)
 	for (int c = 0; c < NUM_TESTS; c++)
-	  memset (a + test_arr[c].offset, 0, test_arr[c].len);
+	  set (a + test_arr[c].offset, 0, test_arr[c].len);
       t = clock_get_ns () - t;
       total_size += memset_size;
       tsum += t;
-      printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+      printf ("%dK: %5.2f ", size / 1024, (double)memset_size / t);
     }
-  printf( "avg %.2f\n", (double)total_size / tsum);
-
+  printf( "avg %5.2f\n", (double)total_size / tsum);
+}
 
-  printf ("\nMedium memset (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 8; size <= 512; size *= 2)
-	{
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS2; i++)
-	    funtab[f].fun (a, 0, size);
-	  t = clock_get_ns () - t;
-	  printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
-	}
-      printf ("\n");
-    }
+static void inline __attribute ((always_inline))
+memset_medium (const char *name, void *(*set)(void *, int, size_t))
+{
+  printf ("%22s ", name);
 
-  printf ("%22s ", "memset_call");
   for (int size = 8; size <= 512; size *= 2)
     {
       uint64_t t = clock_get_ns ();
       for (int i = 0; i < ITERS2; i++)
-	memset (a, 0, size);
+	set (a, 0, size);
       t = clock_get_ns () - t;
-      printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+      printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
     }
+  printf ("\n");
+}
 
+static void inline __attribute ((always_inline))
+memset_large (const char *name, void *(*set)(void *, int, size_t))
+{
+  printf ("%22s ", name);
 
-  printf ("\nLarge memset (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 1024; size <= 65536; size *= 2)
-	{
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS3; i++)
-	    funtab[f].fun (a, 0, size);
-	  t = clock_get_ns () - t;
-	  printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
-	}
-      printf ("\n");
-    }
-
-  printf ("%22s ", "memset_call");
   for (int size = 1024; size <= 65536; size *= 2)
     {
       uint64_t t = clock_get_ns ();
       for (int i = 0; i < ITERS3; i++)
-	memset (a, 0, size);
+	set (a, 0, size);
       t = clock_get_ns () - t;
-      printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+      printf ("%dKB: %6.2f ", size / 1024, (double)size * ITERS3 / t);
     }
-  printf ("\n\n");
+  printf ("\n");
+}
+
+int main (void)
+{
+  init_memset_distribution ();
+
+  memset (a, 1, sizeof (a));
 
+  DOTEST ("Random memset (bytes/ns):\n", memset_random);
+  DOTEST ("Medium memset (bytes/ns):\n", memset_medium);
+  DOTEST ("Large memset (bytes/ns):\n", memset_large);
   return 0;
 }
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
index f05d0d5b89e6..a8dd55cf5fc4 100644
--- a/string/bench/strlen.c
+++ b/string/bench/strlen.c
@@ -14,40 +14,23 @@
 #include "benchlib.h"
 
 #define ITERS 5000
-#define ITERS2 20000000
-#define ITERS3 2000000
-#define NUM_TESTS 16384
+#define ITERS2 40000000
+#define ITERS3 4000000
+#define NUM_TESTS 65536
 
 #define MAX_ALIGN 32
-#define MAX_STRLEN 256
+#define MAX_STRLEN 128
 
 static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
 
-#define F(x, mte) {#x, x, mte},
-
-static const struct fun
-{
-  const char *name;
-  size_t (*fun) (const char *s);
-  int test_mte;
-} funtab[] = {
-  // clang-format off
-  F(strlen, 0)
-#if __aarch64__
-  F(__strlen_aarch64, 0)
-  F(__strlen_aarch64_mte, 1)
-# if __ARM_FEATURE_SVE
-  F(__strlen_aarch64_sve, 1)
-# endif
-#elif __arm__
-# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
-  F(__strlen_armv6t2, 0)
-# endif
-#endif
-  {0, 0, 0}
-  // clang-format on
-};
-#undef F
+#define DOTEST(STR,TESTFN) /* Print STR; run TESTFN on each strlen.  */ \
+  printf (STR);					\
+  RUN (TESTFN, strlen);				\
+  RUNA64 (TESTFN, __strlen_aarch64);		\
+  RUNA64 (TESTFN, __strlen_aarch64_mte);	\
+  RUNSVE (TESTFN, __strlen_aarch64_sve);	\
+  RUNT32 (TESTFN, __strlen_armv6t2);		\
+  printf ("\n");
 
 static uint16_t strlen_tests[NUM_TESTS];
 
@@ -124,98 +107,119 @@ init_strlen_tests (void)
 
       strlen_tests[n] =
 	index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+      assert ((strlen_tests[n] & (align - 1)) == 0);
+      assert (strlen (a + strlen_tests[n]) == exp_len);
     }
 }
 
 static volatile size_t maskv = 0;
 
-int main (void)
+static void inline __attribute ((always_inline))
+strlen_random (const char *name, size_t (*fn)(const char *))
 {
-  rand32 (0x12345678);
-  init_strlen_distribution ();
-  init_strlen_tests ();
+  size_t res = 0, mask = maskv;
+  uint64_t strlen_size = 0;
+  printf ("%22s ", name);
+
+  for (int c = 0; c < NUM_TESTS; c++)
+    strlen_size += fn (a + strlen_tests[c]) + 1;
+  strlen_size *= ITERS;
+
+  /* Throughput: no call's argument depends on an earlier result.  */
+  uint64_t t = clock_get_ns ();
+  for (int i = 0; i < ITERS; i++)
+    for (int c = 0; c < NUM_TESTS; c++)
+      res += fn (a + strlen_tests[c]);
+  t = clock_get_ns () - t;
+  printf ("tp: %.3f ", (double)strlen_size / t);
+
+  /* Latency: (res & mask) makes each argument depend on earlier results.  */
+  t = clock_get_ns ();
+  for (int i = 0; i < ITERS; i++)
+    for (int c = 0; c < NUM_TESTS; c++)
+      res += fn (a + strlen_tests[c] + (res & mask));
+  t = clock_get_ns () - t;
+  printf ("lat: %.3f\n", (double)strlen_size / t);
+  maskv = res & mask;
+}
 
-  printf ("\nRandom strlen (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      size_t res = 0, strlen_size = 0, mask = maskv;
-      printf ("%22s ", funtab[f].name);
+static void inline __attribute ((always_inline))
+strlen_small_aligned (const char *name, size_t (*fn)(const char *))
+{
+  printf ("%22s ", name);
 
-      for (int c = 0; c < NUM_TESTS; c++)
-	strlen_size += funtab[f].fun (a + strlen_tests[c]);
-      strlen_size *= ITERS;
+  size_t res = 0, mask = maskv;
+  for (int size = 1; size <= 64; size *= 2)
+    {
+      memset (a, 'x', size);
+      a[size - 1] = 0;
 
-      /* Measure latency of strlen result with (res & mask).  */
       uint64_t t = clock_get_ns ();
-      for (int i = 0; i < ITERS; i++)
-	for (int c = 0; c < NUM_TESTS; c++)
-	  res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+      for (int i = 0; i < ITERS2; i++)
+	res += fn (a + (i & mask));
       t = clock_get_ns () - t;
-      printf ("%.2f\n", (double)strlen_size / t);
+      printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+	      size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
     }
+  maskv &= res;
+  printf ("\n");
+}
 
-  printf ("\nSmall aligned strlen (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
-    {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 1; size <= 64; size *= 2)
-	{
-	  memset (a, 'x', size);
-	  a[size - 1] = 0;
-
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS2; i++)
-	    funtab[f].fun (a);
-	  t = clock_get_ns () - t;
-	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
-		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
-	}
-      printf ("\n");
-    }
+static void inline __attribute ((always_inline))
+strlen_small_unaligned (const char *name, size_t (*fn)(const char *))
+{
+  printf ("%22s ", name);
 
-  printf ("\nSmall unaligned strlen (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
+  size_t res = 0, mask = maskv;
+  int align = 9;
+  for (int size = 1; size <= 64; size *= 2)
     {
-      printf ("%22s ", funtab[f].name);
-
-      int align = 9;
-      for (int size = 1; size <= 64; size *= 2)
-	{
-	  memset (a + align, 'x', size);
-	  a[align + size - 1] = 0;
-
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS2; i++)
-	    funtab[f].fun (a + align);
-	  t = clock_get_ns () - t;
-	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
-		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
-	}
-      printf ("\n");
+      memset (a + align, 'x', size);
+      a[align + size - 1] = 0;
+
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS2; i++)
+	res += fn (a + align + (i & mask));
+      t = clock_get_ns () - t;
+      printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+	      size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
     }
+  maskv &= res;
+  printf ("\n");
+}
 
-  printf ("\nMedium strlen (bytes/ns):\n");
-  for (int f = 0; funtab[f].name != 0; f++)
+static void inline __attribute ((always_inline))
+strlen_medium (const char *name, size_t (*fn)(const char *))
+{
+  printf ("%22s ", name);
+
+  size_t res = 0, mask = maskv;
+  for (int size = 128; size <= 4096; size *= 2)
     {
-      printf ("%22s ", funtab[f].name);
-
-      for (int size = 128; size <= 4096; size *= 2)
-	{
-	  memset (a, 'x', size);
-	  a[size - 1] = 0;
-
-	  uint64_t t = clock_get_ns ();
-	  for (int i = 0; i < ITERS3; i++)
-	    funtab[f].fun (a);
-	  t = clock_get_ns () - t;
-	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
-		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
-	}
-      printf ("\n");
-    }
+      memset (a, 'x', size);
+      a[size - 1] = 0;
 
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS3; i++)
+	res += fn (a + (i & mask));
+      t = clock_get_ns () - t;
+      printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+	      size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+    }
+  maskv &= res;
   printf ("\n");
+}
+
+int main (void)
+{
+  rand32 (0x12345678);
+  init_strlen_distribution ();
+  init_strlen_tests ();
+
+  DOTEST ("Random strlen (bytes/ns):\n", strlen_random);
+  DOTEST ("Small aligned strlen (bytes/ns):\n", strlen_small_aligned);
+  DOTEST ("Small unaligned strlen (bytes/ns):\n", strlen_small_unaligned);
+  DOTEST ("Medium strlen (bytes/ns):\n", strlen_medium);
 
   return 0;
 }
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
index f1bbea388cd2..486504e99ddf 100644
--- a/string/include/benchlib.h
+++ b/string/include/benchlib.h
@@ -30,4 +30,35 @@ rand32 (uint32_t seed)
   return res;
 }
 
+/* Macros to run benchmark BENCH on string function FN via BENCH (#FN, FN).  */
+#define RUN(BENCH, FN) BENCH(#FN, FN)
 
+#if __aarch64__
+# define RUNA64(BENCH, FN) BENCH(#FN, FN)
+#else /* !__aarch64__: RUNA64 expands to nothing.  */
+# define RUNA64(BENCH, FN)
+#endif /* __aarch64__ */
+
+#if __ARM_FEATURE_SVE
+# define RUNSVE(BENCH, FN) BENCH(#FN, FN)
+#else /* !__ARM_FEATURE_SVE: RUNSVE expands to nothing.  */
+# define RUNSVE(BENCH, FN)
+#endif /* __ARM_FEATURE_SVE */
+
+#if WANT_MOPS
+# define RUNMOPS(BENCH, FN) BENCH(#FN, FN)
+#else /* !WANT_MOPS: RUNMOPS expands to nothing.  */
+# define RUNMOPS(BENCH, FN)
+#endif /* WANT_MOPS */
+
+#if __arm__
+# define RUNA32(BENCH, FN) BENCH(#FN, FN)
+#else /* !__arm__: RUNA32 expands to nothing.  */
+# define RUNA32(BENCH, FN)
+#endif /* __arm__ */
+
+#if __arm__ && __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+# define RUNT32(BENCH, FN) BENCH(#FN, FN)
+#else /* Condition above is false: RUNT32 expands to nothing.  */
+# define RUNT32(BENCH, FN)
+#endif /* __arm__ && __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 01da7ebfc18d..bb9db930f132 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -33,13 +33,12 @@ char *__strchr_aarch64_mte (const char *, int);
 char * __strchrnul_aarch64_mte (const char *, int );
 size_t __strlen_aarch64_mte (const char *);
 char *__strrchr_aarch64_mte (const char *, int);
-#if __ARM_NEON
 void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
 void *__memmove_aarch64_simd (void *, const void *, size_t);
-#endif
 # if __ARM_FEATURE_SVE
 void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
 void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
+void *__memset_aarch64_sve (void *, int, size_t);
 void *__memchr_aarch64_sve (const void *, int, size_t);
 int __memcmp_aarch64_sve (const void *, const void *, size_t);
 char *__strchr_aarch64_sve (const char *, int);
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index dc95844bd45a..98255e06f31c 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -25,9 +25,7 @@ static const struct fun
   F(memcpy, 0)
 #if __aarch64__
   F(__memcpy_aarch64, 1)
-# if __ARM_NEON
   F(__memcpy_aarch64_simd, 1)
-# endif
 # if __ARM_FEATURE_SVE
   F(__memcpy_aarch64_sve, 1)
 # endif
diff --git a/string/test/memmove.c b/string/test/memmove.c
index b85dd1e864ef..ff3f7652f763 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -25,9 +25,7 @@ static const struct fun
   F(memmove, 0)
 #if __aarch64__
   F(__memmove_aarch64, 1)
-# if __ARM_NEON
   F(__memmove_aarch64_simd, 1)
-# endif
 # if __ARM_FEATURE_SVE
   F(__memmove_aarch64_sve, 1)
 # endif
diff --git a/string/test/memset.c b/string/test/memset.c
index 7d09c267ffec..a9639f9b28b0 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -25,6 +25,9 @@ static const struct fun
   F(memset, 0)
 #if __aarch64__
   F(__memset_aarch64, 1)
+# if __ARM_FEATURE_SVE
+  F(__memset_aarch64_sve, 1)
+# endif
 # if WANT_MOPS
   F(__memset_aarch64_mops, 1)
 # endif