about summary refs log tree commit diff
path: root/string/aarch64/memset.S
diff options
context:
space:
mode:
Diffstat (limited to 'string/aarch64/memset.S')
-rw-r--r--  string/aarch64/memset.S  104
1 files changed, 54 insertions, 50 deletions
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 553b0fcaefea..906a4dcf46c6 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,7 +1,7 @@
/*
* memset - fill memory with a constant byte
*
- * Copyright (c) 2012-2022, Arm Limited.
+ * Copyright (c) 2012-2024, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -20,93 +20,98 @@
#define dst x3
#define dstend x4
#define zva_val x5
+#define off x3
+#define dstend2 x5
ENTRY (__memset_aarch64)
- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
- add dstend, dstin, count
-
- cmp count, 96
- b.hi L(set_long)
cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ b.lo L(set_small)
- /* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
ret
+
.p2align 4
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+ /* Set 0..15 bytes. */
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
.p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 160
- ccmp valw, 0, 0, hs
+ str q0, [dst, 16]
+ tst valw, 255
b.ne L(no_zva)
-
#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne L(no_zva)
#endif
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
- bic dst, dst, 63
+ bic dst, dstin, 63
sub count, dstend, dst /* Count is now 64 too large. */
- sub count, count, 128 /* Adjust count and bias for loop. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
.p2align 4
-L(zva_loop):
+L(zva64_loop):
add dst, dst, 64
dc zva, dst
subs count, count, 64
- b.hi L(zva_loop)
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
+ b.hi L(zva64_loop)
ret
+ .p2align 3
L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
L(no_zva_loop):
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
+ stp q0, q0, [dst, 64]
+ add dst, dst, 64
subs count, count, 64
b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
@@ -114,4 +119,3 @@ L(no_zva_loop):
ret
END (__memset_aarch64)
-