Diffstat (limited to 'contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S')
-rw-r--r--  contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S  352
1 file changed, 352 insertions(+), 0 deletions(-)
diff --git a/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
new file mode 100644
index 000000000000..0318d9a6f1eb
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -0,0 +1,352 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Routines taken from libc/AOR_v20.02/string/aarch64
+
+#include "../assembly.h"
+
+#ifdef __aarch64__
+
+#define L(l) .L ## l
+
+//
+// __arm_sc_memcpy / __arm_sc_memmove
+//
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend1 x4
+#define dstend1 x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend1
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small and simple, and to improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software-pipelined loop that processes 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
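+
+// In rough C terms, the size dispatch below is (illustrative only; the
+// helper names are hypothetical):
+//
+//   if (count <= 32)                        copy_0_32();
+//   else if (count <= 128)                  copy_33_128();
+//   else if (dst == src)                    return;
+//   else if ((uint64_t)(dst - src) < count) copy_backwards();
+//   else                                    copy_forwards();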
+
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
+ add srcend1, src, count
+ add dstend1, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
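+ /* Copy 16..32 bytes: one 16-byte block from the start and one from the
+    end; the two blocks overlap when count < 32. */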
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend1, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend1, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend1, -8]
+ str A_l, [dstin]
+ str A_h, [dstend1, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend1, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend1, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
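+ /* With tmp1 = count / 2, the byte stores at dstin, dstin + tmp1 and
+    dstend1 - 1 cover every count in 1..3 (some stores overlap). */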
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend1, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend1, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend1, -32]
+ ldp D_l, D_h, [srcend1, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend1, -32]
+ stp D_l, D_h, [dstend1, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend1, -64]
+ ldp H_l, H_h, [srcend1, -48]
+ stp G_l, G_h, [dstend1, -64]
+ stp H_l, H_h, [dstend1, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend1, -32]
+ stp D_l, D_h, [dstend1, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
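+ /* The unsigned test (dstin - src) < count is true exactly when dstin lies
+    inside [src, src + count), where a forward copy would overwrite source
+    bytes before they are read. */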
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
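+ /* The first 16 bytes are copied from the original pointers; src is then
+    rewound by the same amount dst is rounded down, so the fixed offsets
+    below stay in step. */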
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
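+ /* Software-pipelined: each iteration stores the 64 bytes loaded on the
+    previous iteration while loading the next 64. */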
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend1, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend1, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend1, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend1, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend1, -64]
+ stp A_l, A_h, [dstend1, -48]
+ stp B_l, B_h, [dstend1, -32]
+ stp C_l, C_h, [dstend1, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend1, -16]
+ and tmp1, dstend1, 15
+ sub srcend1, srcend1, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend1, -16]
+ stp D_l, D_h, [dstend1, -16]
+ ldp B_l, B_h, [srcend1, -32]
+ ldp C_l, C_h, [srcend1, -48]
+ ldp D_l, D_h, [srcend1, -64]!
+ sub dstend1, dstend1, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend1, -16]
+ ldp A_l, A_h, [srcend1, -16]
+ stp B_l, B_h, [dstend1, -32]
+ ldp B_l, B_h, [srcend1, -32]
+ stp C_l, C_h, [dstend1, -48]
+ ldp C_l, C_h, [srcend1, -48]
+ stp D_l, D_h, [dstend1, -64]!
+ ldp D_l, D_h, [srcend1, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend1, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend1, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend1, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend1, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)
+
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
+
+
+//
+// __arm_sc_memset
+//
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend2 x4
+#define zva_val x5
+
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
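+ /* Broadcast the fill byte across v0 (z0 when SVE is available). */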
+#ifdef __ARM_FEATURE_SVE
+ mov z0.b, valw
+#else
+ bfi valw, valw, #8, #8
+ bfi valw, valw, #16, #16
+ bfi val, val, #32, #32
+ fmov d0, val
+ fmov v0.d[1], val
+#endif
+ add dstend2, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend2, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend2, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend2, -2]
+3: ret
+
+ /* Set 16..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
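+ /* Bit 6 of count set means count >= 64, so take the 64..96 byte path. */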
+ str q0, [dstend2, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend2, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend2, -32]
+ ret
+
+ .p2align 4
+L(set_long):
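+ /* DC ZVA zeroes a whole ZVA-sized block, so it is only usable for zero
+    fills; the ccmp below routes non-zero values and fills shorter than
+    160 bytes to L(no_zva). */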
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ sub count, dstend2, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
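+ /* dst is now 64-byte aligned; each DC ZVA in the loop zeroes one full
+    64-byte block, and the trailing stp pairs cover the final 64 bytes. */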
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ stp q0, q0, [dstend2, -64]
+ stp q0, q0, [dstend2, -32]
+ ret
+
+L(no_zva):
+ sub count, dstend2, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend2, -64]
+ stp q0, q0, [dstend2, -32]
+ ret
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
+
+#endif // __aarch64__