aboutsummaryrefslogtreecommitdiff
path: root/string
diff options
context:
space:
mode:
Diffstat (limited to 'string')
-rw-r--r--string/Dir.mk9
-rw-r--r--string/aarch64/__mtag_tag_region.S3
-rw-r--r--string/aarch64/__mtag_tag_zero_region.S3
-rw-r--r--string/aarch64/asmdefs.h37
-rw-r--r--string/aarch64/experimental/memchr-sve.S (renamed from string/aarch64/memchr-sve.S)8
-rw-r--r--string/aarch64/experimental/memcmp-sve.S (renamed from string/aarch64/memcmp-sve.S)9
-rw-r--r--string/aarch64/experimental/stpcpy-sve.S (renamed from string/aarch64/stpcpy-sve.S)0
-rw-r--r--string/aarch64/experimental/strchr-sve.S (renamed from string/aarch64/strchr-sve.S)7
-rw-r--r--string/aarch64/experimental/strchrnul-sve.S (renamed from string/aarch64/strchrnul-sve.S)0
-rw-r--r--string/aarch64/experimental/strcmp-sve.S (renamed from string/aarch64/strcmp-sve.S)8
-rw-r--r--string/aarch64/experimental/strcpy-sve.S (renamed from string/aarch64/strcpy-sve.S)8
-rw-r--r--string/aarch64/experimental/strlen-sve.S (renamed from string/aarch64/strlen-sve.S)7
-rw-r--r--string/aarch64/experimental/strncmp-sve.S (renamed from string/aarch64/strncmp-sve.S)9
-rw-r--r--string/aarch64/experimental/strnlen-sve.S (renamed from string/aarch64/strnlen-sve.S)8
-rw-r--r--string/aarch64/experimental/strrchr-sve.S (renamed from string/aarch64/strrchr-sve.S)7
-rw-r--r--string/aarch64/memchr-mte.S2
-rw-r--r--string/aarch64/memchr.S2
-rw-r--r--string/aarch64/memcmp.S4
-rw-r--r--string/aarch64/memcpy-advsimd.S3
-rw-r--r--string/aarch64/memcpy-mops.S4
-rw-r--r--string/aarch64/memcpy-sve.S8
-rw-r--r--string/aarch64/memcpy.S3
-rw-r--r--string/aarch64/memmove-mops.S4
-rw-r--r--string/aarch64/memrchr.S1
-rw-r--r--string/aarch64/memset-mops.S3
-rw-r--r--string/aarch64/memset-sve.S114
-rw-r--r--string/aarch64/memset.S104
-rw-r--r--string/aarch64/strchr-mte.S1
-rw-r--r--string/aarch64/strchr.S1
-rw-r--r--string/aarch64/strchrnul-mte.S1
-rw-r--r--string/aarch64/strchrnul.S1
-rw-r--r--string/aarch64/strcmp.S2
-rw-r--r--string/aarch64/strcpy.S2
-rw-r--r--string/aarch64/strlen-mte.S38
-rw-r--r--string/aarch64/strlen.S1
-rw-r--r--string/aarch64/strncmp.S3
-rw-r--r--string/aarch64/strnlen.S2
-rw-r--r--string/aarch64/strrchr-mte.S1
-rw-r--r--string/aarch64/strrchr.S1
-rw-r--r--string/bench/memcpy.c239
-rw-r--r--string/bench/memset.c141
-rw-r--r--string/bench/strlen.c206
-rw-r--r--string/include/benchlib.h31
-rw-r--r--string/include/stringlib.h3
-rw-r--r--string/test/memcpy.c2
-rw-r--r--string/test/memmove.c2
-rw-r--r--string/test/memset.c3
47 files changed, 484 insertions, 572 deletions
diff --git a/string/Dir.mk b/string/Dir.mk
index 40ff5acc093e..dd8283ec4977 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -13,9 +13,12 @@ all-string bench-string check-string install-string clean-string:
else
string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS])
+string-lib-srcs += $(wildcard $(S)/$(ARCH)/experimental/*.[cS])
string-test-srcs := $(wildcard $(S)/test/*.c)
string-bench-srcs := $(wildcard $(S)/bench/*.c)
+string-arch-include-dir := $(wildcard $(S)/$(ARCH))
+string-arch-includes := $(wildcard $(S)/$(ARCH)/*.h)
string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
string-libs := \
@@ -43,6 +46,7 @@ string-tests := \
string-benches := \
build/bin/bench/memcpy \
+ build/bin/bench/memset \
build/bin/bench/strlen
string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
@@ -64,8 +68,8 @@ string-files := \
all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
-$(string-objs): $(string-includes)
-$(string-objs): CFLAGS_ALL += $(string-cflags)
+$(string-objs): $(string-includes) $(string-arch-includes)
+$(string-objs): CFLAGS_ALL += $(string-cflags) -I$(string-arch-include-dir)
$(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE
@@ -101,6 +105,7 @@ check-string: $(string-tests-out)
bench-string: $(string-benches)
$(EMULATOR) build/bin/bench/strlen
$(EMULATOR) build/bin/bench/memcpy
+ $(EMULATOR) build/bin/bench/memset
install-string: \
$(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
index 207e22950c6d..34b5789240da 100644
--- a/string/aarch64/__mtag_tag_region.S
+++ b/string/aarch64/__mtag_tag_region.S
@@ -27,9 +27,6 @@
#define zva_val x4
ENTRY (__mtag_tag_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count
cmp count, 96
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
index 44b8e0114f42..2fa248e25621 100644
--- a/string/aarch64/__mtag_tag_zero_region.S
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -27,9 +27,6 @@
#define zva_val x4
ENTRY (__mtag_tag_zero_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count
cmp count, 96
diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h
index 131b95e1fea9..90166676977a 100644
--- a/string/aarch64/asmdefs.h
+++ b/string/aarch64/asmdefs.h
@@ -21,19 +21,6 @@
#define FEATURE_1_PAC 2
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
-#ifdef __ILP32__
-#define GNU_PROPERTY(type, value) \
- .section .note.gnu.property, "a"; \
- .p2align 2; \
- .word 4; \
- .word 12; \
- .word 5; \
- .asciz "GNU"; \
- .word type; \
- .word 4; \
- .word value; \
- .text
-#else
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 3; \
@@ -46,7 +33,6 @@
.word value; \
.word 0; \
.text
-#endif
/* If set then the GNU Property Note section will be added to
mark objects to support BTI and PAC-RET. */
@@ -80,27 +66,4 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
#define L(l) .L ## l
-#ifdef __ILP32__
- /* Sanitize padding bits of pointer arguments as per aapcs64 */
-#define PTR_ARG(n) mov w##n, w##n
-#else
-#define PTR_ARG(n)
-#endif
-
-#ifdef __ILP32__
- /* Sanitize padding bits of size arguments as per aapcs64 */
-#define SIZE_ARG(n) mov w##n, w##n
-#else
-#define SIZE_ARG(n)
-#endif
-
-/* Compiler supports SVE instructions */
-#ifndef HAVE_SVE
-# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
-# define HAVE_SVE 1
-# else
-# define HAVE_SVE 0
-# endif
-#endif
-
#endif
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/experimental/memchr-sve.S
index b851cf31f238..b314551f3e0f 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/experimental/memchr-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,8 +16,6 @@
*/
ENTRY (__memchr_aarch64_sve)
- PTR_ARG (0)
- SIZE_ARG (2)
dup z1.b, w1 /* duplicate c to a vector */
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
@@ -59,6 +58,3 @@ ENTRY (__memchr_aarch64_sve)
ret
END (__memchr_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/experimental/memcmp-sve.S
index d52ce4555344..ad3534836d04 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/experimental/memcmp-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,9 +16,6 @@
*/
ENTRY (__memcmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
mov x3, 0 /* initialize off */
0: whilelo p0.b, x3, x2 /* while off < max */
@@ -46,6 +44,3 @@ ENTRY (__memcmp_aarch64_sve)
ret
END (__memcmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/experimental/stpcpy-sve.S
index 5d3f14b86026..5d3f14b86026 100644
--- a/string/aarch64/stpcpy-sve.S
+++ b/string/aarch64/experimental/stpcpy-sve.S
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/experimental/strchr-sve.S
index ff075167bfef..7d74ae9ff232 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/experimental/strchr-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -22,7 +23,6 @@
#endif
ENTRY (FUNC)
- PTR_ARG (0)
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -65,6 +65,3 @@ ENTRY (FUNC)
b 0b
END (FUNC)
-
-#endif
-
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/experimental/strchrnul-sve.S
index 0005f9177514..0005f9177514 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/experimental/strchrnul-sve.S
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/experimental/strcmp-sve.S
index eaf909a378f1..b6c249588534 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/experimental/strcmp-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,8 +16,6 @@
*/
ENTRY (__strcmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
setffr /* initialize FFR */
ptrue p1.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
@@ -54,6 +53,3 @@ ENTRY (__strcmp_aarch64_sve)
b 1b
END (__strcmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/experimental/strcpy-sve.S
index 00e72dce4451..57b77c8a00e7 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/experimental/strcpy-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -22,8 +23,6 @@
#endif
ENTRY (FUNC)
- PTR_ARG (0)
- PTR_ARG (1)
setffr /* initialize FFR */
ptrue p2.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
@@ -66,6 +65,3 @@ ENTRY (FUNC)
ret
END (FUNC)
-
-#endif
-
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/experimental/strlen-sve.S
index 12ebbdba5c93..c83155052c07 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/experimental/strlen-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,7 +16,6 @@
*/
ENTRY (__strlen_aarch64_sve)
- PTR_ARG (0)
setffr /* initialize FFR */
ptrue p2.b /* all ones; loop invariant */
mov x1, 0 /* initialize length */
@@ -50,6 +50,3 @@ ENTRY (__strlen_aarch64_sve)
b 0b
END (__strlen_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/experimental/strncmp-sve.S
index 6a9e9f7b6437..a281e642d8aa 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/experimental/strncmp-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,9 +16,6 @@
*/
ENTRY (__strncmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
@@ -64,6 +62,3 @@ ENTRY (__strncmp_aarch64_sve)
ret
END (__strncmp_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/experimental/strnlen-sve.S
index 6c43dc427da7..11d835a1b13c 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/experimental/strnlen-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,8 +16,6 @@
*/
ENTRY (__strnlen_aarch64_sve)
- PTR_ARG (0)
- SIZE_ARG (1)
setffr /* initialize FFR */
mov x2, 0 /* initialize len */
b 1f
@@ -69,6 +68,3 @@ ENTRY (__strnlen_aarch64_sve)
ret
END (__strnlen_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/experimental/strrchr-sve.S
index 825a7384cfc1..731edaddf156 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/experimental/strrchr-sve.S
@@ -7,7 +7,8 @@
#include "asmdefs.h"
-#if __ARM_FEATURE_SVE
+.arch armv8-a+sve
+
/* Assumptions:
*
* ARMv8-a, AArch64
@@ -15,7 +16,6 @@
*/
ENTRY (__strrchr_aarch64_sve)
- PTR_ARG (0)
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -79,6 +79,3 @@ ENTRY (__strrchr_aarch64_sve)
ret
END (__strrchr_aarch64_sve)
-
-#endif
-
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index 948c3cbc7dd4..68bd0af9a8c5 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -40,8 +40,6 @@
exactly which byte matched. */
ENTRY (__memchr_aarch64_mte)
- PTR_ARG (0)
- SIZE_ARG (2)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index fe6cfe2bc0e2..d12a38abbc30 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -47,8 +47,6 @@
*/
ENTRY (__memchr_aarch64)
- PTR_ARG (0)
- SIZE_ARG (2)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
/*
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 35135e72cc8e..43439de4db69 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -30,10 +30,6 @@
ENTRY (__memcmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp limit, 16
b.lo L(less16)
ldp data1, data3, [src1]
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index 9d3027d4d3cd..cbf4c581500e 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -52,9 +52,6 @@
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
add srcend, src, count
cmp count, 128
b.hi L(copy_long)
diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S
index b45c31418717..03ae95570c04 100644
--- a/string/aarch64/memcpy-mops.S
+++ b/string/aarch64/memcpy-mops.S
@@ -8,10 +8,6 @@
#include "asmdefs.h"
ENTRY (__memcpy_aarch64_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */
.inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
index e8a946d7db37..9b05cb2a58ee 100644
--- a/string/aarch64/memcpy-sve.S
+++ b/string/aarch64/memcpy-sve.S
@@ -13,8 +13,6 @@
#include "asmdefs.h"
-#ifdef HAVE_SVE
-
.arch armv8-a+sve
#define dstin x0
@@ -51,10 +49,6 @@
ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp count, 128
b.hi L(copy_long)
cntb vlen
@@ -173,5 +167,3 @@ L(return):
ret
END (__memcpy_aarch64_sve)
-
-#endif
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 7c0606e2104a..351f1a11f097 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -55,9 +55,6 @@
ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S
index 6c73017bb16f..d9839f86e9b4 100644
--- a/string/aarch64/memmove-mops.S
+++ b/string/aarch64/memmove-mops.S
@@ -8,10 +8,6 @@
#include "asmdefs.h"
ENTRY (__memmove_aarch64_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */
.inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index 6418bdf56f41..ed38478a6faa 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -42,7 +42,6 @@
exactly which byte matched. */
ENTRY (__memrchr_aarch64)
- PTR_ARG (0)
add end, srcin, cntin
sub endm1, end, 1
bic src, endm1, 15
diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S
index ec791493bae9..00d8e7d2c05f 100644
--- a/string/aarch64/memset-mops.S
+++ b/string/aarch64/memset-mops.S
@@ -8,9 +8,6 @@
#include "asmdefs.h"
ENTRY (__memset_aarch64_mops)
- PTR_ARG (0)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19c10443 /* setp [x3]!, x2!, x1 */
.inst 0x19c14443 /* setm [x3]!, x2!, x1 */
diff --git a/string/aarch64/memset-sve.S b/string/aarch64/memset-sve.S
new file mode 100644
index 000000000000..efaeaece284e
--- /dev/null
+++ b/string/aarch64/memset-sve.S
@@ -0,0 +1,114 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2024-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+.arch armv8-a+sve
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define vlen x5
+#define off x3
+#define dstend2 x5
+
+ENTRY (__memset_aarch64_sve)
+ dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_16)
+
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+
+ .p2align 4
+L(set_16):
+ whilelo p0.b, xzr, count
+ st1b z0.b, p0, [dstin]
+ ret
+
+ .p2align 4
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ cmp count, 256
+ b.lo L(no_zva)
+ tst valw, 255
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ str q0, [dstin]
+ str q0, [dst, 16]
+ bic dst, dstin, 31
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
+ bic x8, x8, 15
+ stp q0, q0, [x8, -48]
+ str q0, [x8, -16]
+ str q0, [dstend, -16]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
+ ret
+
+L(no_zva):
+ str q0, [dstin]
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 16]
+ stp q0, q0, [dst, 48]
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_aarch64_sve)
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 553b0fcaefea..906a4dcf46c6 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,7 +1,7 @@
/*
* memset - fill memory with a constant byte
*
- * Copyright (c) 2012-2022, Arm Limited.
+ * Copyright (c) 2012-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -20,93 +20,98 @@
#define dst x3
#define dstend x4
#define zva_val x5
+#define off x3
+#define dstend2 x5
ENTRY (__memset_aarch64)
- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
- add dstend, dstin, count
-
- cmp count, 96
- b.hi L(set_long)
cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ b.lo L(set_small)
- /* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
ret
+
.p2align 4
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+ /* Set 0..15 bytes. */
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
.p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 160
- ccmp valw, 0, 0, hs
+ str q0, [dst, 16]
+ tst valw, 255
b.ne L(no_zva)
-
#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne L(no_zva)
#endif
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
- bic dst, dst, 63
+ bic dst, dstin, 63
sub count, dstend, dst /* Count is now 64 too large. */
- sub count, count, 128 /* Adjust count and bias for loop. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
.p2align 4
-L(zva_loop):
+L(zva64_loop):
add dst, dst, 64
dc zva, dst
subs count, count, 64
- b.hi L(zva_loop)
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
+ b.hi L(zva64_loop)
ret
+ .p2align 3
L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
L(no_zva_loop):
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
+ stp q0, q0, [dst, 64]
+ add dst, dst, 64
subs count, count, 64
b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
@@ -114,4 +119,3 @@ L(no_zva_loop):
ret
END (__memset_aarch64)
-
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 6ec08f7acc76..42b747311bc6 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -39,7 +39,6 @@
If it is not a multiple of 4, there was no match. */
ENTRY (__strchr_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 37193bd947a7..c1d01e9635b6 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -51,7 +51,6 @@
/* Locals and temporaries. */
ENTRY (__strchr_aarch64)
- PTR_ARG (0)
/* Magic constant 0xc0300c03 to allow us to identify which lane
matches the requested byte. Even bits are set if the character
matches, odd bits if either the char is NUL or matches. */
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 543ee88bb285..b3180cdf9e2c 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -38,7 +38,6 @@
exactly which byte matched. */
ENTRY (__strchrnul_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 666e8d0304c1..0a32c46c30c5 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -47,7 +47,6 @@
/* Locals and temporaries. */
ENTRY (__strchrnul_aarch64)
- PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 137a9aa06681..7c0d0485a89b 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -51,8 +51,6 @@
ENTRY (__strcmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 97ae37ea4229..5852616e6024 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -52,8 +52,6 @@
exactly which byte matched. */
ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 77235797f7c5..afa72eed9a43 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -33,7 +33,6 @@
identifies the first zero byte. */
ENTRY (__strlen_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
@@ -41,37 +40,50 @@ ENTRY (__strlen_aarch64_mte)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
+L(next16):
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
.p2align 5
L(loop):
- ldr data, [src, 16]
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
+ add src, src, 16
L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
- add result, result, 16
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (__strlen_aarch64_mte)
-
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 6f6f08f636b2..0ebb26be844c 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -75,7 +75,6 @@
character, return the length, if not, continue in the main loop. */
ENTRY (__strlen_aarch64)
- PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
cmp tmp1, MIN_PAGE_SIZE - 32
b.hi L(page_cross)
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 128a10c52bb1..493a0f06ed1d 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -55,9 +55,6 @@
#endif
ENTRY (__strncmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index f2090a7485a5..6a96ec268f1a 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -36,8 +36,6 @@
identifies the first zero byte. */
ENTRY (__strnlen_aarch64)
- PTR_ARG (0)
- SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index bb61ab9ad4e7..8668ce6d2916 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -42,7 +42,6 @@
if the relevant byte matched the NUL end of string. */
ENTRY (__strrchr_aarch64_mte)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
movi vrepmask.16b, 0x33
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index bf9cb297b6cb..f5713f4260fb 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -55,7 +55,6 @@
identify exactly which byte is causing the termination, and why. */
ENTRY (__strrchr_aarch64)
- PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index b628f9b60d96..583fa505db75 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -20,35 +20,18 @@
#define MIN_SIZE 32768
#define MAX_SIZE (1024 * 1024)
-static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
-static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
-
-#define F(x) {#x, x},
-
-static const struct fun
-{
- const char *name;
- void *(*fun)(void *, const void *, size_t);
-} funtab[] =
-{
-#if __aarch64__
- F(__memcpy_aarch64)
-# if __ARM_NEON
- F(__memcpy_aarch64_simd)
-# endif
-# if __ARM_FEATURE_SVE
- F(__memcpy_aarch64_sve)
-# endif
-# if WANT_MOPS
- F(__memcpy_aarch64_mops)
-# endif
-#elif __arm__
- F(__memcpy_arm)
-#endif
- F(memcpy)
-#undef F
- {0, 0}
-};
+static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096)));
+static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(4096)));
+
+#define DOTEST(STR,TESTFN) \
+ printf (STR); \
+ RUN (TESTFN, memcpy); \
+ RUNA64 (TESTFN, __memcpy_aarch64); \
+ RUNA64 (TESTFN, __memcpy_aarch64_simd); \
+ RUNSVE (TESTFN, __memcpy_aarch64_sve); \
+ RUNMOPS (TESTFN, __memcpy_aarch64_mops); \
+ RUNA32 (TESTFN, __memcpy_arm); \
+ printf ("\n");
typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@@ -160,183 +143,125 @@ init_copies (size_t max_size)
return total;
}
-int main (void)
+static void inline __attribute ((always_inline))
+memcpy_random (const char *name, void *(*fn)(void *, const void *, size_t))
{
- init_copy_distribution ();
-
- memset (a, 1, sizeof (a));
- memset (b, 2, sizeof (b));
-
- printf("Random memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t total = 0;
- uint64_t tsum = 0;
- printf ("%22s ", funtab[f].name);
- rand32 (0x12345678);
-
- for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
- {
- size_t copy_size = init_copies (size) * ITERS;
-
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
- test_arr[c].len);
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
- test_arr[c].len);
- t = clock_get_ns () - t;
- total += copy_size;
- tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
- }
- printf( "avg %.2f\n", (double)total / tsum);
- }
-
- size_t total = 0;
- uint64_t tsum = 0;
- printf ("%22s ", "memcpy_call");
- rand32 (0x12345678);
-
+ printf ("%22s ", name);
+ uint64_t total = 0, tsum = 0;
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
- size_t copy_size = init_copies (size) * ITERS;
+ uint64_t copy_size = init_copies (size) * ITERS;
for (int c = 0; c < NUM_TESTS; c++)
- memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+ fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
- memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
+ fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
+ printf ("%dK: %5.2f ", size / 1024, (double)copy_size / t);
}
- printf( "avg %.2f\n", (double)total / tsum);
-
+ printf( "avg %5.2f\n", (double)total / tsum);
+}
- printf ("\nAligned medium memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (b, a, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
+static void inline __attribute ((always_inline))
+memcpy_medium_aligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("%22s ", "memcpy_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
- memcpy (b, a, size);
+ fn (b, a, size);
t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
+}
+static void inline __attribute ((always_inline))
+memcpy_medium_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nUnaligned medium memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (b + 3, a + 1, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memcpy_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
- memcpy (b + 3, a + 1, size);
+ fn (b + 3, a + 1, size);
t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
+}
+static void inline __attribute ((always_inline))
+memcpy_large (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nLarge memcpy (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (b, a, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memcpy_call");
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
- memcpy (b, a, size);
+ fn (b, a, size);
t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
+}
+static void inline __attribute ((always_inline))
+memmove_forward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nUnaligned forwards memmove (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
+ for (int size = 1024; size <= 65536; size *= 2)
{
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a, a + 256 + (i & 31), size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ fn (a, a + 256 + (i & 31), size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
}
+ printf ("\n");
+}
+
+static void inline __attribute ((always_inline))
+memmove_backward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nUnaligned backwards memmove (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
+ for (int size = 1024; size <= 65536; size *= 2)
{
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a + 256 + (i & 31), a, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ fn (a + 256 + (i & 31), a, size);
+ t = clock_get_ns () - t;
+ printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
}
+
printf ("\n");
+}
+
+int main (void)
+{
+ init_copy_distribution ();
+
+ memset (a, 1, sizeof (a));
+ memset (b, 2, sizeof (b));
+
+ DOTEST ("Random memcpy (bytes/ns):\n", memcpy_random);
+ DOTEST ("Medium memcpy aligned (bytes/ns):\n", memcpy_medium_aligned);
+ DOTEST ("Medium memcpy unaligned (bytes/ns):\n", memcpy_medium_unaligned);
+ DOTEST ("Large memcpy (bytes/ns):\n", memcpy_large);
+ DOTEST ("Forwards memmove unaligned (bytes/ns):\n", memmove_forward_unaligned);
+ DOTEST ("Backwards memmove unaligned (bytes/ns):\n", memmove_backward_unaligned);
return 0;
}
diff --git a/string/bench/memset.c b/string/bench/memset.c
index 990e23ba9a36..07474e469146 100644
--- a/string/bench/memset.c
+++ b/string/bench/memset.c
@@ -20,25 +20,16 @@
#define MIN_SIZE 32768
#define MAX_SIZE (1024 * 1024)
-static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));
+static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(4096)));
-#define F(x) {#x, x},
-
-static const struct fun
-{
- const char *name;
- void *(*fun)(void *, int, size_t);
-} funtab[] =
-{
-#if __aarch64__
- F(__memset_aarch64)
-#elif __arm__
- F(__memset_arm)
-#endif
- F(memset)
-#undef F
- {0, 0}
-};
+#define DOTEST(STR,TESTFN) /* Print heading STR, then run TESTFN once per memset variant. */ \
+ printf (STR); \
+ RUN (TESTFN, memset); \
+ RUNA64 (TESTFN, __memset_aarch64); \
+ RUNSVE (TESTFN, __memset_aarch64_sve); \
+ RUNMOPS (TESTFN, __memset_aarch64_mops); \
+ RUNA32 (TESTFN, __memset_arm); \
+ printf ("\n");
typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
static memset_test_t test_arr[NUM_TESTS];
@@ -127,117 +118,73 @@ init_memset (size_t max_size)
return total;
}
-
-int main (void)
+static void inline __attribute ((always_inline))
+memset_random (const char *name, void *(*set)(void *, int, size_t))
{
- init_memset_distribution ();
-
- memset (a, 1, sizeof (a));
-
- printf("Random memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t total_size = 0;
- uint64_t tsum = 0;
- printf ("%22s ", funtab[f].name);
- rand32 (0x12345678);
-
- for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
- {
- size_t memset_size = init_memset (size) * ITERS;
-
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
- t = clock_get_ns () - t;
- total_size += memset_size;
- tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
- }
- printf( "avg %.2f\n", (double)total_size / tsum);
- }
-
- size_t total_size = 0;
+ uint64_t total_size = 0;
uint64_t tsum = 0;
- printf ("%22s ", "memset_call");
+ printf ("%22s ", name);
rand32 (0x12345678);
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
- size_t memset_size = init_memset (size) * ITERS;
+ uint64_t memset_size = (uint64_t) init_memset (size) * ITERS; /* widen before multiplying so the product cannot wrap in a narrower type */
for (int c = 0; c < NUM_TESTS; c++)
- memset (a + test_arr[c].offset, 0, test_arr[c].len);
+ set (a + test_arr[c].offset, 0, test_arr[c].len); /* untimed pass before the measured loop */
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
- memset (a + test_arr[c].offset, 0, test_arr[c].len);
+ set (a + test_arr[c].offset, 0, test_arr[c].len);
t = clock_get_ns () - t;
total_size += memset_size;
tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
+ printf ("%dK: %5.2f ", size / 1024, (double)memset_size / t);
}
- printf( "avg %.2f\n", (double)total_size / tsum);
-
+ printf( "avg %5.2f\n", (double)total_size / tsum);
+}
- printf ("\nMedium memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 8; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a, 0, size);
- t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
+static void inline __attribute ((always_inline))
+memset_medium (const char *name, void *(*set)(void *, int, size_t))
+{
+ printf ("%22s ", name);
- printf ("%22s ", "memset_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
- memset (a, 0, size);
+ set (a, 0, size);
t = clock_get_ns () - t;
- printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
+ printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
}
+ printf ("\n");
+}
+static void inline __attribute ((always_inline))
+memset_large (const char *name, void *(*set)(void *, int, size_t))
+{
+ printf ("%22s ", name);
- printf ("\nLarge memset (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1024; size <= 65536; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a, 0, size);
- t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
- printf ("%22s ", "memset_call");
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
- memset (a, 0, size);
+ set (a, 0, size);
t = clock_get_ns () - t;
- printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
+ printf ("%dKB: %6.2f ", size / 1024, (double)size * ITERS3 / t);
}
- printf ("\n\n");
+ printf ("\n");
+}
+
+int main (void)
+{
+ init_memset_distribution ();
+
+ memset (a, 1, sizeof (a));
+ DOTEST ("Random memset (bytes/ns):\n", memset_random);
+ DOTEST ("Medium memset (bytes/ns):\n", memset_medium);
+ DOTEST ("Large memset (bytes/ns):\n", memset_large);
return 0;
}
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
index f05d0d5b89e6..a8dd55cf5fc4 100644
--- a/string/bench/strlen.c
+++ b/string/bench/strlen.c
@@ -14,40 +14,23 @@
#include "benchlib.h"
#define ITERS 5000
-#define ITERS2 20000000
-#define ITERS3 2000000
-#define NUM_TESTS 16384
+#define ITERS2 40000000
+#define ITERS3 4000000
+#define NUM_TESTS 65536
#define MAX_ALIGN 32
-#define MAX_STRLEN 256
+#define MAX_STRLEN 128
static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
-#define F(x, mte) {#x, x, mte},
-
-static const struct fun
-{
- const char *name;
- size_t (*fun) (const char *s);
- int test_mte;
-} funtab[] = {
- // clang-format off
- F(strlen, 0)
-#if __aarch64__
- F(__strlen_aarch64, 0)
- F(__strlen_aarch64_mte, 1)
-# if __ARM_FEATURE_SVE
- F(__strlen_aarch64_sve, 1)
-# endif
-#elif __arm__
-# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
- F(__strlen_armv6t2, 0)
-# endif
-#endif
- {0, 0, 0}
- // clang-format on
-};
-#undef F
+#define DOTEST(STR,TESTFN) /* Print heading STR, then run TESTFN once per strlen variant. */ \
+ printf (STR); \
+ RUN (TESTFN, strlen); \
+ RUNA64 (TESTFN, __strlen_aarch64); \
+ RUNA64 (TESTFN, __strlen_aarch64_mte); \
+ RUNSVE (TESTFN, __strlen_aarch64_sve); \
+ RUNT32 (TESTFN, __strlen_armv6t2); \
+ printf ("\n");
static uint16_t strlen_tests[NUM_TESTS];
@@ -124,98 +107,119 @@ init_strlen_tests (void)
strlen_tests[n] =
index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+ assert ((strlen_tests[n] & (align - 1)) == 0);
+ assert (strlen (a + strlen_tests[n]) == exp_len);
}
}
static volatile size_t maskv = 0;
-int main (void)
+static void inline __attribute ((always_inline))
+strlen_random (const char *name, size_t (*fn)(const char *))
{
- rand32 (0x12345678);
- init_strlen_distribution ();
- init_strlen_tests ();
+ size_t res = 0, mask = maskv;
+ uint64_t strlen_size = 0;
+ printf ("%22s ", name);
+
+ for (int c = 0; c < NUM_TESTS; c++)
+ strlen_size += fn (a + strlen_tests[c]) + 1;
+ strlen_size *= ITERS;
+
+ /* Measure throughput of strlen. */
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ res += fn (a + strlen_tests[c]);
+ t = clock_get_ns () - t;
+ printf ("tp: %.3f ", (double)strlen_size / t);
+
+ /* Measure latency of strlen result with (res & mask). */
+ t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_TESTS; c++)
+ res += fn (a + strlen_tests[c] + (res & mask));
+ t = clock_get_ns () - t;
+ printf ("lat: %.3f\n", (double)strlen_size / t);
+ maskv = res & mask;
+}
- printf ("\nRandom strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t res = 0, strlen_size = 0, mask = maskv;
- printf ("%22s ", funtab[f].name);
+static void inline __attribute ((always_inline))
+strlen_small_aligned (const char *name, size_t (*fn)(const char *))
+{
+ printf ("%22s ", name);
- for (int c = 0; c < NUM_TESTS; c++)
- strlen_size += funtab[f].fun (a + strlen_tests[c]);
- strlen_size *= ITERS;
+ size_t res = 0, mask = maskv;
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
- /* Measure latency of strlen result with (res & mask). */
uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_TESTS; c++)
- res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+ for (int i = 0; i < ITERS2; i++)
+ res += fn (a + (i & mask));
t = clock_get_ns () - t;
- printf ("%.2f\n", (double)strlen_size / t);
+ printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
}
+ maskv &= res;
+ printf ("\n");
+}
- printf ("\nSmall aligned strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1; size <= 64; size *= 2)
- {
- memset (a, 'x', size);
- a[size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
+static void inline __attribute ((always_inline))
+strlen_small_unaligned (const char *name, size_t (*fn)(const char *))
+{
+ printf ("%22s ", name);
- printf ("\nSmall unaligned strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
+ size_t res = 0, mask = maskv;
+ int align = 9;
+ for (int size = 1; size <= 64; size *= 2)
{
- printf ("%22s ", funtab[f].name);
-
- int align = 9;
- for (int size = 1; size <= 64; size *= 2)
- {
- memset (a + align, 'x', size);
- a[align + size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a + align);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
- }
- printf ("\n");
+ memset (a + align, 'x', size);
+ a[align + size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ res += fn (a + align + (i & mask));
+ t = clock_get_ns () - t;
+ printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
}
+ maskv &= res;
+ printf ("\n");
+}
- printf ("\nMedium strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
+static void inline __attribute ((always_inline))
+strlen_medium (const char *name, size_t (*fn)(const char *))
+{
+ printf ("%22s ", name);
+
+ size_t res = 0, mask = maskv;
+ for (int size = 128; size <= 4096; size *= 2)
{
- printf ("%22s ", funtab[f].name);
-
- for (int size = 128; size <= 4096; size *= 2)
- {
- memset (a, 'x', size);
- a[size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ res += fn (a + (i & mask));
+ t = clock_get_ns () - t;
+ printf ("%d%c: %5.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ maskv &= res;
printf ("\n");
+}
+
+int main (void)
+{
+ rand32 (0x12345678);
+ init_strlen_distribution ();
+ init_strlen_tests ();
+
+ DOTEST ("Random strlen (bytes/ns):\n", strlen_random);
+ DOTEST ("Small aligned strlen (bytes/ns):\n", strlen_small_aligned);
+ DOTEST ("Small unaligned strlen (bytes/ns):\n", strlen_small_unaligned);
+ DOTEST ("Medium strlen (bytes/ns):\n", strlen_medium);
return 0;
}
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
index f1bbea388cd2..486504e99ddf 100644
--- a/string/include/benchlib.h
+++ b/string/include/benchlib.h
@@ -30,4 +30,35 @@ rand32 (uint32_t seed)
return res;
}
+/* Macros to run a benchmark BENCH using string function FN: BENCH is called with the stringified name of FN and FN itself. RUN always expands to the call; each other RUNxxx variant expands to nothing when its #if condition is false. */
+#define RUN(BENCH, FN) BENCH(#FN, FN)
+#if __aarch64__
+# define RUNA64(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNA64(BENCH, FN)
+#endif /* __aarch64__ */
+
+#if __ARM_FEATURE_SVE
+# define RUNSVE(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNSVE(BENCH, FN)
+#endif /* __ARM_FEATURE_SVE */
+
+#if WANT_MOPS
+# define RUNMOPS(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNMOPS(BENCH, FN)
+#endif /* WANT_MOPS */
+
+#if __arm__
+# define RUNA32(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNA32(BENCH, FN)
+#endif /* __arm__ */
+
+#if __arm__ && __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+# define RUNT32(BENCH, FN) BENCH(#FN, FN)
+#else
+# define RUNT32(BENCH, FN)
+#endif /* __arm__ && __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 01da7ebfc18d..bb9db930f132 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -33,13 +33,12 @@ char *__strchr_aarch64_mte (const char *, int);
char * __strchrnul_aarch64_mte (const char *, int );
size_t __strlen_aarch64_mte (const char *);
char *__strrchr_aarch64_mte (const char *, int);
-#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
-#endif
# if __ARM_FEATURE_SVE
void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
+void *__memset_aarch64_sve (void *, int, size_t);
void *__memchr_aarch64_sve (const void *, int, size_t);
int __memcmp_aarch64_sve (const void *, const void *, size_t);
char *__strchr_aarch64_sve (const char *, int);
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index dc95844bd45a..98255e06f31c 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -25,9 +25,7 @@ static const struct fun
F(memcpy, 0)
#if __aarch64__
F(__memcpy_aarch64, 1)
-# if __ARM_NEON
F(__memcpy_aarch64_simd, 1)
-# endif
# if __ARM_FEATURE_SVE
F(__memcpy_aarch64_sve, 1)
# endif
diff --git a/string/test/memmove.c b/string/test/memmove.c
index b85dd1e864ef..ff3f7652f763 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -25,9 +25,7 @@ static const struct fun
F(memmove, 0)
#if __aarch64__
F(__memmove_aarch64, 1)
-# if __ARM_NEON
F(__memmove_aarch64_simd, 1)
-# endif
# if __ARM_FEATURE_SVE
F(__memmove_aarch64_sve, 1)
# endif
diff --git a/string/test/memset.c b/string/test/memset.c
index 7d09c267ffec..a9639f9b28b0 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -25,6 +25,9 @@ static const struct fun
F(memset, 0)
#if __aarch64__
F(__memset_aarch64, 1)
+# if __ARM_FEATURE_SVE
+ F(__memset_aarch64_sve, 1)
+# endif
# if WANT_MOPS
F(__memset_aarch64_mops, 1)
# endif