aboutsummaryrefslogtreecommitdiff
path: root/lib/libc/riscv
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libc/riscv')
-rw-r--r--lib/libc/riscv/string/Makefile.inc10
-rw-r--r--lib/libc/riscv/string/bcopy.c14
-rw-r--r--lib/libc/riscv/string/bzero.c14
-rw-r--r--lib/libc/riscv/string/memchr.S188
-rw-r--r--lib/libc/riscv/string/memcpy.S217
-rw-r--r--lib/libc/riscv/string/memset.S95
-rw-r--r--lib/libc/riscv/string/strchrnul.S116
-rw-r--r--lib/libc/riscv/string/strlen.S77
-rw-r--r--lib/libc/riscv/string/strnlen.S143
-rw-r--r--lib/libc/riscv/string/strrchr.S127
10 files changed, 1001 insertions, 0 deletions
diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
new file mode 100644
index 000000000000..6dae6b2cb62d
--- /dev/null
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -0,0 +1,10 @@
+MDSRCS+= \
+ bcopy.c \
+ bzero.c \
+ memchr.S \
+ memcpy.S \
+ memset.S \
+ strlen.S \
+ strnlen.S \
+ strchrnul.S \
+ strrchr.S
diff --git a/lib/libc/riscv/string/bcopy.c b/lib/libc/riscv/string/bcopy.c
new file mode 100644
index 000000000000..0dee529fb9df
--- /dev/null
+++ b/lib/libc/riscv/string/bcopy.c
@@ -0,0 +1,14 @@
+/*-
+ * Public domain.
+ */
+
+#include <string.h>
+
+#undef bcopy /* _FORTIFY_SOURCE */
+
+void
+bcopy(const void *src, void *dst, size_t len)
+{
+
+ memmove(dst, src, len);
+}
diff --git a/lib/libc/riscv/string/bzero.c b/lib/libc/riscv/string/bzero.c
new file mode 100644
index 000000000000..d82f3061865b
--- /dev/null
+++ b/lib/libc/riscv/string/bzero.c
@@ -0,0 +1,14 @@
+/*-
+ * Public domain.
+ */
+
+#include <string.h>
+
+#undef bzero /* _FORTIFY_SOURCE */
+
+void
+bzero(void *b, size_t len)
+{
+
+ memset(b, 0, len);
+}
diff --git a/lib/libc/riscv/string/memchr.S b/lib/libc/riscv/string/memchr.S
new file mode 100644
index 000000000000..e6e04bfae96a
--- /dev/null
+++ b/lib/libc/riscv/string/memchr.S
@@ -0,0 +1,188 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - const void *b
+ * a1 - int c
+ * a2 - size_t len
+ */
+ENTRY(memchr)
+ /*
+ * a0 - const char *ptr
+ * a1 - char cccccccc[8]
+ * a2 - char iter[8]
+ * a3 - uint8_t *end
+ * a4 - uint64_t *end_align
+ * a5 - uint64_t *end_unroll
+ */
+
+ beqz a2, .Lno_match
+
+ /* c = (uint8_t) c */
+ andi a1, a1, 0xFF
+
+ /*
+ * t0 = 0x0101010101010101
+ * t1 = 0x8080808080808080
+ * t2 = b << 3
+ * cccccccc = (uint8_t)c * t0
+ * end = b + len;
+ * ptr = b & ~0b111
+ */
+ add a3, a0, a2
+ li t0, 0x01010101
+ sltu t2, a0, a3
+ slli t1, t0, 32
+ neg t2, t2
+ or t0, t0, t1
+ and a3, a3, t2
+ slli t1, t0, 7
+ slli t2, a0, 3
+ and a0, a0, ~0b111
+ mul a1, t0, a1
+
+ ld a2, (a0)
+
+ /*
+ * mask_start = REP8_0x01 ^ (REP8_0x01 << t2)
+ * iter = iter ^ cccccccc
+ * iter = iter | mask_start
+ */
+ sll t2, t0, t2
+ xor a2, a2, a1
+ xor t2, t2, t0
+ or a2, a2, t2
+
+ /* has_zero(iter)
+ * end_align = (end + 7) & ~0b111;
+ */
+ addi a4, a3, 7
+ not t2, a2
+ sub a2, a2, t0
+ and t2, t2, t1
+ andi a4, a4, ~0b111
+ and a2, a2, t2
+
+ /* ptr = ptr + 8 */
+ addi a0, a0, 8
+
+ bnez a2, .Lfind_zero
+
+ /* if(ptr == end_align) */
+ beq a0, a4, .Lno_match
+
+ /* end_unroll = end_align & ~0b1111 */
+ andi a5, a4, ~0b1111
+
+ /*
+ * Instead of branching to check if `ptr` is 16-byte aligned:
+ * - Probe the next 8 bytes for `c`
+ * - Align `ptr` down to the nearest 16-byte boundary
+ *
+ * If `ptr` was already 16-byte aligned, those 8 bytes will be
+ * checked again inside the unrolled loop.
+ *
+ * This removes an unpredictable branch and improves performance.
+ */
+
+ ld a2, (a0)
+ xor a2, a2, a1
+
+ not t2, a2
+ sub a2, a2, t0
+ and t2, t2, t1
+ and a2, a2, t2
+
+ addi a0, a0, 8
+
+ bnez a2, .Lfind_zero
+
+ andi a0, a0, ~0b1111
+
+ /* while(ptr != end_unroll) */
+ beq a0, a5, .Lskip_loop
+.Lloop:
+ ld a2, (a0)
+ ld t3, 8(a0)
+
+ xor a2, a2, a1
+ xor t3, t3, a1
+
+ not t2, a2
+ not t4, t3
+ sub a2, a2, t0
+ sub t3, t3, t0
+ and t2, t2, t1
+ and t4, t4, t1
+ and a2, a2, t2
+ and t3, t3, t4
+
+ addi a0, a0, 8
+
+ bnez a2, .Lfind_zero
+
+ /* move into iter for find_zero */
+ mv a2, t3
+
+ addi a0, a0, 8
+
+ bnez a2, .Lfind_zero
+
+ bne a0, a5, .Lloop
+.Lskip_loop:
+
+ /* there might be one 8byte left */
+ beq a0, a4, .Lno_match
+
+ ld a2, (a0)
+ xor a2, a2, a1
+
+ not t2, a2
+ sub a2, a2, t0
+ and t2, t2, t1
+ and a2, a2, t2
+
+ addi a0, a0, 8
+
+ beqz a2, .Lno_match
+
+.Lfind_zero:
+ /*
+ * ptr = ptr - 8
+ * t1 = 0x0001020304050607
+ * iter = iter & (-iter)
+ * iter = iter >> 7
+ * iter = iter * t1
+ * iter = iter >> 56
+ */
+ li t1, 0x10203000
+ neg t0, a2
+ slli t1, t1, 4
+ and a2, a2, t0
+ addi t1, t1, 0x405
+ srli a2, a2, 7
+ slli t1, t1, 16
+ addi a0, a0, -8
+ addi t1, t1, 0x607
+ mul a2, a2, t1
+ srli a2, a2, 56
+
+ /* left = end - ptr */
+ sub t0, a3, a0
+
+ /* return iter < left ? ptr + iter : NULL */
+ sltu t1, a2, t0
+ neg t1, t1
+ add a0, a0, a2
+ and a0, a0, t1
+ ret
+
+.Lno_match:
+ li a0, 0
+ ret
+END(memchr)
diff --git a/lib/libc/riscv/string/memcpy.S b/lib/libc/riscv/string/memcpy.S
new file mode 100644
index 000000000000..7536514df777
--- /dev/null
+++ b/lib/libc/riscv/string/memcpy.S
@@ -0,0 +1,217 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - void* dst
+ * a1 - const void* src
+ * a2 - size_t len
+ */
+ENTRY(memcpy)
+ beqz a2, .Lreturn
+
+ /* diff = (dstv - srcv) & 0b111 */
+ sub t0, a0, a1
+ andi t0, t0, 0b111
+
+ sltiu t1, a2, 8
+
+ /* we never change a0, because memcpy returns the original dst */
+ mv a3, a0
+
+ /* len < 8 */
+ bnez t1, .Lend
+
+ /* t1 = (-dst) & 0b111 */
+ neg t1, a0
+ andi t1, t1, 0b111
+
+ sub a2, a2, t1
+
+ la t2, .Lduff_start
+ slli t3, t1, 3
+ sub t2, t2, t3
+ jr t2
+ lb t3, 6(a1)
+ sb t3, 6(a3)
+ lb t3, 5(a1)
+ sb t3, 5(a3)
+ lb t3, 4(a1)
+ sb t3, 4(a3)
+ lb t3, 3(a1)
+ sb t3, 3(a3)
+ lb t3, 2(a1)
+ sb t3, 2(a3)
+ lb t3, 1(a1)
+ sb t3, 1(a3)
+ lb t3, 0(a1)
+ sb t3, 0(a3)
+.Lduff_start:
+
+ add a1, a1, t1
+ add a3, a3, t1
+
+ beqz a2, .Lreturn
+
+ beqz t0, .Lmemcpy8
+
+ /*
+ * a4 - size_t right_shift
+ * a5 - size_t left_shift
+ * a6 - size_t whole (number of dword stores)
+ */
+
+ /* right_shift = (src % 0b111) * 8; */
+ andi a4, a1, 0b111
+ slli a4, a4, 3
+
+ /* left_shift = 64 - right_shift */
+ neg a5, a4
+
+ /* whole = len / 8 */
+ srli a6, a2, 3
+
+ /* len = len % 8 */
+ andi a2, a2, 0b111
+
+ /* t0 - uint64_t* ptr */
+
+ /* ptr = src & ~0b111 */
+ andi t0, a1, ~0b111
+
+ /* src += whole * 8 */
+ slli t1, a6, 3
+ add a1, a1, t1
+
+ /*
+ * t1 - uint64_t low
+ * t2 - uint64_t high
+ */
+
+ /* low = *ptr++ */
+ ld t1, (t0)
+ addi t0, t0, 8
+
+ /* low >>= right_shift */
+ srl t1, t1, a4
+
+ beqz a6, .Llmain_skip
+.Llmain:
+ /* high = *ptr++ */
+ ld t2, (t0)
+ addi t0, t0, 8
+
+ /* whole-- */
+ addi a6, a6, -1
+
+ /* temp = (high << left_shift) | low */
+ sll t3, t2, a5
+ or t3, t3, t1
+
+ /* low = high >> right_shift */
+ srl t1, t2, a4
+
+ /* *dst++ = temp */
+ sd t3, (a3)
+ addi a3, a3, 8
+
+ bnez a6, .Llmain
+
+.Llmain_skip:
+
+.Lend:
+ la t1, .Lduff_end
+ slli t2, a2, 3
+ sub t1, t1, t2
+ jr t1
+ lb t2, 6(a1)
+ sb t2, 6(a3)
+ lb t2, 5(a1)
+ sb t2, 5(a3)
+ lb t2, 4(a1)
+ sb t2, 4(a3)
+ lb t2, 3(a1)
+ sb t2, 3(a3)
+ lb t2, 2(a1)
+ sb t2, 2(a3)
+ lb t2, 1(a1)
+ sb t2, 1(a3)
+ lb t2, 0(a1)
+ sb t2, 0(a3)
+.Lduff_end:
+
+.Lreturn:
+ ret
+
+/* exectued when dst - src is multiple of 8
+ * a0 - void* dst
+ * a1 - const void* src
+ * a2 - size_t len
+ */
+.Lmemcpy8:
+
+ beqz a2, .Lreturn
+
+ slti t0, a2, 128
+ bnez t0, .Llmain8_64_skip
+
+ /* a4 - uint64_t* end_unroll */
+
+ /* end_unroll = dst + len / 64 * 64 */
+ andi t0, a2, ~0b111111
+ add a4, a3, t0
+
+ /* len = len % 64 */
+ andi a2, a2, 0b111111
+
+.Llmain8_64:
+ ld t0, 0(a1)
+ ld t1, 8(a1)
+ ld t2, 16(a1)
+ ld t3, 24(a1)
+ sd t0, 0(a3)
+ sd t1, 8(a3)
+ sd t2, 16(a3)
+ sd t3, 24(a3)
+ ld t0, 32(a1)
+ ld t1, 40(a1)
+ ld t2, 48(a1)
+ ld t3, 56(a1)
+ sd t0, 32(a3)
+ sd t1, 40(a3)
+ sd t2, 48(a3)
+ sd t3, 56(a3)
+ addi a3, a3, 64
+ addi a1, a1, 64
+ bne a3, a4, .Llmain8_64
+.Llmain8_64_skip:
+
+ beqz a2, .Lreturn
+
+ /* a4 - uint64_t* end_align */
+
+ /* end_align = (dst + len) & ~0b111 */
+ add a4, a3, a2
+ andi a4, a4, ~0b111
+
+ /* len = len % 8 */
+ andi a2, a2, 0b111
+
+ beq a3, a4, .Llmain8_skip
+.Llmain8:
+ ld t0, (a1)
+ sd t0, (a3)
+ addi a3, a3, 8
+ addi a1, a1, 8
+ bne a3, a4, .Llmain8
+.Llmain8_skip:
+
+ la t1, .Lduff_end
+ slli t2, a2, 3
+ sub t1, t1, t2
+ jr t1
+END(memcpy)
diff --git a/lib/libc/riscv/string/memset.S b/lib/libc/riscv/string/memset.S
new file mode 100644
index 000000000000..ca435dfdd5c1
--- /dev/null
+++ b/lib/libc/riscv/string/memset.S
@@ -0,0 +1,95 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * register a0 - void *dest
+ * register a1 - int c
+ * register a2 - size_t len
+ */
+ENTRY(memset)
+ andi a1, a1, 0xFF
+
+ sltiu t1, a2, 8
+ mv t0, a0
+ bnez t1, .Lend
+
+ li t1, 0x0101010101010101
+ mul a1, a1, t1
+
+ andi t1, a0, 0b111
+ andi t0, a0, ~0b111
+
+ beqz t1, .Lloop_store_64
+
+ la t2, .Lduff_start
+ slli t3, t1, 2
+ add t2, t2, t3
+ jr -4(t2)
+.Lduff_start:
+ sb a1, 1(t0)
+ sb a1, 2(t0)
+ sb a1, 3(t0)
+ sb a1, 4(t0)
+ sb a1, 5(t0)
+ sb a1, 6(t0)
+ sb a1, 7(t0)
+
+ /* a3 = a3 -(8-a) <=> a3 = a3 + (a-8) */
+ addi t1, t1, -8
+ add a2, a2, t1
+ addi t0, t0, 8
+
+.Lloop_store_64:
+ slti t1, a2, 64
+ bnez t1, .Lstore_rest
+ sd a1, 0(t0)
+ sd a1, 8(t0)
+ sd a1, 16(t0)
+ sd a1, 24(t0)
+ sd a1, 32(t0)
+ sd a1, 40(t0)
+ sd a1, 48(t0)
+ sd a1, 56(t0)
+ addi a2, a2, -64
+ addi t0, t0, 64
+ j .Lloop_store_64
+
+.Lstore_rest:
+ la t2, .Lduff_rest
+ andi t3, a2, ~0b111
+ srli t4, t3, 1
+ sub t2, t2, t4
+ jr t2
+ sd a1, 56(t0)
+ sd a1, 48(t0)
+ sd a1, 40(t0)
+ sd a1, 32(t0)
+ sd a1, 24(t0)
+ sd a1, 16(t0)
+ sd a1, 8(t0)
+ sd a1, 0(t0)
+.Lduff_rest:
+ add t0, t0, t3
+ sub a2, a2, t3
+
+.Lend:
+ slli a2, a2, 2
+ la t2, .Lduff_end
+ sub t2, t2, a2
+ jr t2
+ sb a1, 6(t0)
+ sb a1, 5(t0)
+ sb a1, 4(t0)
+ sb a1, 3(t0)
+ sb a1, 2(t0)
+ sb a1, 1(t0)
+ sb a1, (t0)
+.Lduff_end:
+ ret
+END(memset)
+
diff --git a/lib/libc/riscv/string/strchrnul.S b/lib/libc/riscv/string/strchrnul.S
new file mode 100644
index 000000000000..8abba71c4199
--- /dev/null
+++ b/lib/libc/riscv/string/strchrnul.S
@@ -0,0 +1,116 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+ .weak strchrnul
+ .set strchrnul, __strchrnul
+
+/*
+ * a0 - const char *str
+ * a1 - int c;
+ */
+ENTRY(__strchrnul)
+ /*
+ * a0 - const char *ptr;
+ * a1 - char cccccccc[8];
+ * a2 - char iter[8];
+ * a3 - char mask_end
+ */
+
+ /* int to char */
+ andi a1, a1, 0xFF
+
+ /* t0 = 0x0101010101010101 */
+ li t0, 0x01010101
+ slli t1, t0, 32
+ or t0, t0, t1
+
+ /* t1 = 0x8080808080808080 */
+ slli t1, t0, 7
+
+ /* spread char across bytes */
+ mul a1, a1, t0
+
+ /* align_offset */
+ andi t2, a0, 0b111
+
+ /* align pointer */
+ andi a0, a0, ~0b111
+
+ /* if pointer is aligned skip to loop */
+ beqz t2, .Lloop
+
+ ld a2, (a0)
+
+ /* mask_start calculation */
+ slli t2, t2, 3
+ neg t2, t2
+ srl t2, t0, t2
+
+ /* fill bytes before start with non-zero */
+ or a3, a2, t2
+
+ xor a2, a2, a1
+ or a2, a2, t2
+
+ /* has_zero for \0 */
+ not t3, a3
+ not t2, a2
+ sub a3, a3, t0
+ sub a2, a2, t0
+ and a3, a3, t3
+ and a2, a2, t2
+ and a3, a3, t1
+ and a2, a2, t1
+
+
+ /* if \0 or c was found, exit */
+ or a2, a2, a3
+ addi a0, a0, 8
+ bnez a2, .Lfind_char
+
+
+.Lloop:
+ ld a2, (a0)
+
+ /* has_zero for both \0 or c */
+ xor a3, a2, a1
+
+ not t2, a2
+ not t3, a3
+ sub a2, a2, t0
+ sub a3, a3, t0
+ and a2, a2, t2
+ and a3, a3, t3
+ and a2, a2, t1
+ and a3, a3, t1
+
+ /* if \0 or c was found, exit */
+ or a2, a2, a3
+ addi a0, a0, 8
+ beqz a2, .Lloop
+
+.Lfind_char:
+ addi a0, a0, -8
+
+ /* isolate lowest set bit */
+ neg t0, a2
+ and a2, a2, t0
+
+ li t0, 0x0001020304050607
+ srli a2, a2, 7
+
+ /* lowest set bit is 2^(8*k)
+ * multiplying by it shifts the idx array in t0 by k bytes to the left */
+ mul a2, a2, t0
+
+ /* highest byte contains idx of first zero */
+ srli a2, a2, 56
+
+ add a0, a0, a2
+ ret
+END(__strchrnul)
diff --git a/lib/libc/riscv/string/strlen.S b/lib/libc/riscv/string/strlen.S
new file mode 100644
index 000000000000..3beb160f2e6f
--- /dev/null
+++ b/lib/libc/riscv/string/strlen.S
@@ -0,0 +1,77 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+ * uses haszero(v) (((v) - 0x01010101UL) & ~(v) & 0x80808080UL)
+ * which evalutates > 0 when there is zero in v
+ *
+ * register a0 - char *s
+ */
+ENTRY(strlen)
+ /*
+ * register a0 - char *str_start
+ * register a1 - char *str_ptr
+ * register a2 - char[8] iter
+ */
+
+ /* load constants for haszero */
+ li t0, 0x0101010101010101
+ slli t1, t0, 7 # 0x8080808080808080, avoid li
+
+ /* check alignment of str_start */
+ andi a1, a0, ~0b111
+ ld a2, (a1)
+ beq a1, a0, .Lhas_zero
+
+ /* fill bytes before str_start with non-zero */
+ slli t2, a0, 3
+ addi t3, t2, -64
+ neg t3, t3
+ srl t3, t0, t3
+ or a2, a2, t3
+
+ /* unrolled iteration of haszero */
+ not t2, a2
+ sub a2, a2, t0
+ and a2, a2, t2
+ and a2, a2, t1
+
+ bnez a2, .Lfind_zero
+
+.Lloop_has_zero:
+ ld a2, 8(a1)
+ addi a1, a1, 8 # move ptr to next 8byte
+.Lhas_zero:
+ not t2, a2
+ sub a2, a2, t0
+ and a2, a2, t2
+ and a2, a2, t1
+
+ beqz a2, .Lloop_has_zero
+
+.Lfind_zero:
+ /* use (iter & -iter) to isolate lowest set bit */
+ sub a3, zero, a2 #a3 = -iter
+ and t1, a2, a3 #t1 = (iter & -iter)
+
+ li t0, 0x0001020304050607
+ srli t1, t1, 7
+ /*
+ * lowest set bit is 2^(8*k)
+ * multiplying by it shifts the idx array in t0 by k bytes to the left
+ */
+ mul t1, t1, t0
+ /* highest byte contains idx of first zero */
+ srli t1, t1, 56
+
+ add a1, a1, t1
+ sub a0, a1, a0
+ ret
+END(strlen)
+
diff --git a/lib/libc/riscv/string/strnlen.S b/lib/libc/riscv/string/strnlen.S
new file mode 100644
index 000000000000..c0fd959548ff
--- /dev/null
+++ b/lib/libc/riscv/string/strnlen.S
@@ -0,0 +1,143 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - const char *s
+ * a1 - size_t maxlen;
+ */
+ENTRY(strnlen)
+ /*
+ * a0 - const char *s;
+ * a1 - size_t maxlen;
+ * a2 - uint64_t *ptr;
+ * a3 - char iter[8];
+ * a4 - uint64_t *end_align;
+ * a5 - uint64_t *end_unroll;
+ */
+
+ beqz a1, .Lnot_found
+
+ /* ptr = s & ~0b111 */
+ /* t0 = 0x0101010101010101 */
+ /* t1 = 0x8080808080808080 */
+ /* end_align = (s + maxlen + 7) & ~0b111 */
+ /* mask_start = t0 >> ((-s.value) << 3) */
+ add a4, a0, a1
+ li t0, 0x01010101
+ addi a4, a4, 7
+ slli t1, t0, 32
+ neg t2, a0
+ andi a4, a4, ~0b111
+ or t0, t0, t1
+ slli t2, t2, 3
+ andi a2, a0, ~0b111
+ slli t1, t0, 7
+ srl t2, t0, t2
+
+ /* if pointer is aligned skip to loop */
+ beq a0, a2, .Lskip_start
+
+ /* iter = *ptr */
+ ld a3, (a2)
+
+ /* iter = iter | mask_start */
+ or a3, a3, t2
+
+ /* has_zero */
+ not t2, a3
+ sub a3, a3, t0
+ and t2, t2, t1
+ and a3, a3, t2
+
+ addi a2, a2, 8
+ bnez a3, .Lfind_zero
+
+.Lskip_start:
+ /* end_unroll */
+ sub t2, a4, a2
+ andi t2, t2, ~0b1111
+ add a5, a2, t2
+
+ /* while (ptr != end_unroll) */
+ beq a2, a5, .Lskip_loop
+.Lloop:
+ ld a3, (a2)
+ ld a6, 8(a2)
+
+ /* has_zero */
+ not t2, a3
+ not t3, a6
+ sub a3, a3, t0
+ sub a6, a6, t0
+ and t2, t2, t1
+ and t3, t3, t1
+ and a3, a3, t2
+ and a6, a6, t3
+
+ addi a2, a2, 8
+ bnez a3, .Lfind_zero
+
+ mv a3, a6
+
+ addi a2, a2, 8
+ bnez a3, .Lfind_zero
+
+ bne a2, a5, .Lloop
+
+.Lskip_loop:
+
+ beq a2, a4, .Lnot_found
+
+ ld a3, (a2)
+
+ /* has_zero */
+ not t2, a3
+ sub a3, a3, t0
+ and t2, t2, t1
+ and a3, a3, t2
+
+
+ addi a2, a2, 8
+ beqz a3, .Lnot_found
+
+.Lfind_zero:
+
+ /* move ptr back */
+ addi a2, a2, -8
+
+ /* isolate lowest set bit */
+ neg t0, a3
+ and a3, a3, t0
+
+ li t0, 0x0001020304050607
+ srli a3, a3, 7
+
+ /* lowest set bit is 2^(8*k)
+ * multiplying by it shifts the idx array in t0 by k bytes to the left */
+ mul a3, a3, t0
+
+ /* highest byte contains idx of first zero */
+ srli a3, a3, 56
+
+ /* zero_idx */
+ sub a2, a2, a0
+ add a2, a2, a3
+
+ /* min(zero_idx, maxlen) */
+ sub a2, a2, a1
+ srai t1, a2, 63
+ and a2, a2, t1
+ add a0, a1, a2
+
+ ret
+
+.Lnot_found:
+ mv a0, a1
+ ret
+
+END(strnlen)
diff --git a/lib/libc/riscv/string/strrchr.S b/lib/libc/riscv/string/strrchr.S
new file mode 100644
index 000000000000..e922a692e77f
--- /dev/null
+++ b/lib/libc/riscv/string/strrchr.S
@@ -0,0 +1,127 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+ .weak rindex
+ .set rindex, strrchr
+
+/*
+ * a0 - const char *s
+ * a1 - int c
+ */
+ENTRY(strrchr)
+ /*
+ * a0 - const char *ptr_align
+ * a1 - temporary
+ * a2 - temporary
+ * a3 - temporary
+ * a4 - temporary
+ * a5 - const char[8] cccccccc
+ * a6 - const uint64_t *save_align
+ * a7 - const uint64_t save_iter
+ * t0 - const uintr64_t REP8_0X01
+ * t1 - const uintr64_t REP8_0X80
+ */
+
+ /*
+ * save_align = 0
+ * save_iter = 0xFFFFFFFFFFFFFF00
+ * REP8_0X01 = 0x0101010101010101
+ * cccccccc = (char)c * REP8_0X01
+ * REP8_0X80 = (REP8_0X80 << 7) << ((str % 8) * 8)
+ * ptr_align = str - str % 8
+ */
+ li t0, 0x01010101
+ li a6, 0
+ slli a2, a0, 3
+ slli t1, t0, 32
+ li a7, 0xFFFFFFFFFFFFFF00
+ or t0, t0, t1
+ andi a1, a1, 0xFF
+ slli t1, t0, 7
+ andi a0, a0, ~0b111
+ mul a5, a1, t0
+ sll t1, t1, a2
+
+.Lloop: /* do { */
+ ld a1, 0(a0) /* a1 -> data = *ptr_align */
+ not a3, a1 /* a3 -> nhz = ~data */
+ xor a2, a1, a5 /* a2 -> iter = data ^ cccccccc */
+ sub a1, a1, t0 /* a1 -> hz = data - REP8_0X01 */
+ not a4, a2 /* a4 -> nhc = ~iter */
+ and a1, a1, a3 /* hz = hz & nhz */
+ sub a3, a2, t0 /* a3 -> hc = iter - REP8_0X01 */
+ and a1, a1, t1 /* hz = hz & REP8_0X80 */
+ and a3, a3, a4 /* hc = hc & nhc */
+ addi a4, a1, -1 /* a4 -> mask_end = hz - 1 */
+ and a3, a3, t1 /* hc = hc & REP8_0X80 */
+ xor a4, a4, a1 /* mask_end = mask_end ^ hz */
+ addi a0, a0, 8 /* ptr_align = ptr_align + 8 */
+ and a3, a3, a4 /* hc = hc & mask_end */
+ slli t1, t0, 7 /* REP8_0X80 = REP8_0X01 << 7 */
+ not a4, a4 /* mask_end = ~mask_end */
+
+ beqz a3, .Lskip_save /* if(!hc) goto skip_save */
+ or a2, a2, a4 /* iter = iter | mask_end */
+ addi a6, a0, -8 /* save_align = ptr_align - 8 */
+ mv a7, a2 /* save_iter = iter */
+
+.Lskip_save:
+ beqz a1, .Lloop /* } while(!hz) */
+
+.Lfind_char:
+ /*
+ * a1 -> iter = save_iter
+ * a2 -> mask_iter = 0xFF00000000000000
+ * a3 -> match_off = 7
+ */
+ li a2, 0xFF
+ mv a1, a7
+ slli a2, a2, 56
+ li a3, 7
+
+ and a0, a1, a2
+ srli a2, a2, 8
+ beqz a0, .Lret
+
+ addi a3, a3, -1
+ and a0, a1, a2
+ srli a2, a2, 8
+ beqz a0, .Lret
+
+ addi a3, a3, -1
+ and a0, a1, a2
+ srli a2, a2, 8
+ beqz a0, .Lret
+
+ addi a3, a3, -1
+ and a0, a1, a2
+ srli a2, a2, 8
+ beqz a0, .Lret
+
+ addi a3, a3, -1
+ and a0, a1, a2
+ srli a2, a2, 8
+ beqz a0, .Lret
+
+ addi a3, a3, -1
+ and a0, a1, a2
+ srli a2, a2, 8
+ beqz a0, .Lret
+
+ addi a3, a3, -1
+ and a0, a1, a2
+ srli a2, a2, 8
+ beqz a0, .Lret
+
+ addi a3, a3, -1
+
+.Lret:
+ /* return save_align + match_offset */
+ add a0, a6, a3
+ ret
+END(strrchr)