Diffstat (limited to 'lib/libc/riscv/string')
-rw-r--r--  lib/libc/riscv/string/Makefile.inc   10
-rw-r--r--  lib/libc/riscv/string/bcopy.c        14
-rw-r--r--  lib/libc/riscv/string/bzero.c        14
-rw-r--r--  lib/libc/riscv/string/memchr.S      188
-rw-r--r--  lib/libc/riscv/string/memcpy.S      217
-rw-r--r--  lib/libc/riscv/string/memset.S       95
-rw-r--r--  lib/libc/riscv/string/strchrnul.S   116
-rw-r--r--  lib/libc/riscv/string/strlen.S       77
-rw-r--r--  lib/libc/riscv/string/strnlen.S     143
-rw-r--r--  lib/libc/riscv/string/strrchr.S     127
10 files changed, 1001 insertions(+), 0 deletions(-)
diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
new file mode 100644
index 000000000000..6dae6b2cb62d
--- /dev/null
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -0,0 +1,10 @@
+MDSRCS+= \
+	bcopy.c \
+	bzero.c \
+	memchr.S \
+	memcpy.S \
+	memset.S \
+	strlen.S \
+	strnlen.S \
+	strchrnul.S \
+	strrchr.S
diff --git a/lib/libc/riscv/string/bcopy.c b/lib/libc/riscv/string/bcopy.c
new file mode 100644
index 000000000000..0dee529fb9df
--- /dev/null
+++ b/lib/libc/riscv/string/bcopy.c
@@ -0,0 +1,14 @@
+/*-
+ * Public domain.
+ */
+
+#include <string.h>
+
+#undef bcopy	/* _FORTIFY_SOURCE */
+
+void
+bcopy(const void *src, void *dst, size_t len)
+{
+
+	memmove(dst, src, len);
+}
diff --git a/lib/libc/riscv/string/bzero.c b/lib/libc/riscv/string/bzero.c
new file mode 100644
index 000000000000..d82f3061865b
--- /dev/null
+++ b/lib/libc/riscv/string/bzero.c
@@ -0,0 +1,14 @@
+/*-
+ * Public domain.
+ */
+
+#include <string.h>
+
+#undef bzero	/* _FORTIFY_SOURCE */
+
+void
+bzero(void *b, size_t len)
+{
+
+	memset(b, 0, len);
+}
diff --git a/lib/libc/riscv/string/memchr.S b/lib/libc/riscv/string/memchr.S
new file mode 100644
index 000000000000..e6e04bfae96a
--- /dev/null
+++ b/lib/libc/riscv/string/memchr.S
@@ -0,0 +1,188 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - const void *b
+ * a1 - int c
+ * a2 - size_t len
+ */
+ENTRY(memchr)
+	/*
+	 * a0 - const char *ptr
+	 * a1 - char cccccccc[8]
+	 * a2 - char iter[8]
+	 * a3 - uint8_t *end
+	 * a4 - uint64_t *end_align
+	 * a5 - uint64_t *end_unroll
+	 */
+
+	beqz a2, .Lno_match
+
+	/* c = (uint8_t)c */
+	andi a1, a1, 0xFF
+
+	/*
+	 * t0 = 0x0101010101010101
+	 * t1 = 0x8080808080808080
+	 * t2 = b << 3
+	 * cccccccc = (uint8_t)c * t0
+	 * end = b + len;
+	 * ptr = b & ~0b111
+	 */
+	add a3, a0, a2
+	li t0, 0x01010101
+	sltu t2, a0, a3
+	slli t1, t0, 32
+	neg t2, t2
+	or t0, t0, t1
+	and a3, a3, t2
+	slli t1, t0, 7
+	slli t2, a0, 3
+	and a0, a0, ~0b111
+	mul a1, t0, a1
+
+	ld a2, (a0)
+
+	/*
+	 * mask_start = REP8_0x01 ^ (REP8_0x01 << t2)
+	 * iter = iter ^ cccccccc
+	 * iter = iter | mask_start
+	 */
+	sll t2, t0, t2
+	xor a2, a2, a1
+	xor t2, t2, t0
+	or a2, a2, t2
+
+	/*
+	 * has_zero(iter)
+	 * end_align = (end + 7) & ~0b111;
+	 */
+	addi a4, a3, 7
+	not t2, a2
+	sub a2, a2, t0
+	and t2, t2, t1
+	andi a4, a4, ~0b111
+	and a2, a2, t2
+
+	/* ptr = ptr + 8 */
+	addi a0, a0, 8
+
+	bnez a2, .Lfind_zero
+
+	/* if(ptr == end_align) */
+	beq a0, a4, .Lno_match
+
+	/* end_unroll = end_align & ~0b1111 */
+	andi a5, a4, ~0b1111
+
+	/*
+	 * Instead of branching to check if `ptr` is 16-byte aligned:
+	 * - Probe the next 8 bytes for `c`
+	 * - Align `ptr` down to the nearest 16-byte boundary
+	 *
+	 * If `ptr` was already 16-byte aligned, those 8 bytes will be
+	 * checked again inside the unrolled loop.
+	 *
+	 * This removes an unpredictable branch and improves performance.
+	 */
+
+	ld a2, (a0)
+	xor a2, a2, a1
+
+	not t2, a2
+	sub a2, a2, t0
+	and t2, t2, t1
+	and a2, a2, t2
+
+	addi a0, a0, 8
+
+	bnez a2, .Lfind_zero
+
+	andi a0, a0, ~0b1111
+
+	/* while(ptr != end_unroll) */
+	beq a0, a5, .Lskip_loop
+.Lloop:
+	ld a2, (a0)
+	ld t3, 8(a0)
+
+	xor a2, a2, a1
+	xor t3, t3, a1
+
+	not t2, a2
+	not t4, t3
+	sub a2, a2, t0
+	sub t3, t3, t0
+	and t2, t2, t1
+	and t4, t4, t1
+	and a2, a2, t2
+	and t3, t3, t4
+
+	addi a0, a0, 8
+
+	bnez a2, .Lfind_zero
+
+	/* move into iter for find_zero */
+	mv a2, t3
+
+	addi a0, a0, 8
+
+	bnez a2, .Lfind_zero
+
+	bne a0, a5, .Lloop
+.Lskip_loop:
+
+	/* there might be one 8-byte word left */
+	beq a0, a4, .Lno_match
+
+	ld a2, (a0)
+	xor a2, a2, a1
+
+	not t2, a2
+	sub a2, a2, t0
+	and t2, t2, t1
+	and a2, a2, t2
+
+	addi a0, a0, 8
+
+	beqz a2, .Lno_match
+
+.Lfind_zero:
+	/*
+	 * ptr = ptr - 8
+	 * t1 = 0x0001020304050607
+	 * iter = iter & (-iter)
+	 * iter = iter >> 7
+	 * iter = iter * t1
+	 * iter = iter >> 56
+	 */
+	li t1, 0x10203000
+	neg t0, a2
+	slli t1, t1, 4
+	and a2, a2, t0
+	addi t1, t1, 0x405
+	srli a2, a2, 7
+	slli t1, t1, 16
+	addi a0, a0, -8
+	addi t1, t1, 0x607
+	mul a2, a2, t1
+	srli a2, a2, 56
+
+	/* left = end - ptr */
+	sub t0, a3, a0
+
+	/* return iter < left ? ptr + iter : NULL */
+	sltu t1, a2, t0
+	neg t1, t1
+	add a0, a0, a2
+	and a0, a0, t1
+	ret
+
+.Lno_match:
+	li a0, 0
+	ret
+END(memchr)
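All of the word-at-a-time routines in this commit, memchr included, are built on the classic SWAR zero-byte test. As a reading aid only, here is that test and the derived byte-match check in C; this sketch is not part of the commit and the helper names are illustrative:

#include <stdint.h>

#define	REP8_0X01	0x0101010101010101ULL
#define	REP8_0X80	0x8080808080808080ULL

/* Nonzero iff some byte of v is 0x00 (the exact variant of the test). */
static inline uint64_t
has_zero(uint64_t v)
{

	return ((v - REP8_0X01) & ~v & REP8_0X80);
}

/* Nonzero iff some byte of v equals c: XOR maps matching bytes to 0x00. */
static inline uint64_t
has_byte(uint64_t v, uint8_t c)
{

	return (has_zero(v ^ (REP8_0X01 * c)));
}

The `& ~v` factor is what makes the test exact rather than merely conservative (without it, bytes 0x81..0xFF would also set their 0x80 flag); the assembly keeps both constants live in t0 and t1 for the whole routine so they never have to be re-materialized.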
diff --git a/lib/libc/riscv/string/memcpy.S b/lib/libc/riscv/string/memcpy.S
new file mode 100644
index 000000000000..7536514df777
--- /dev/null
+++ b/lib/libc/riscv/string/memcpy.S
@@ -0,0 +1,217 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - void* dst
+ * a1 - const void* src
+ * a2 - size_t len
+ */
ENTRY(memcpy)
+	beqz a2, .Lreturn
+
+	/* diff = (dstv - srcv) & 0b111 */
+	sub t0, a0, a1
+	andi t0, t0, 0b111
+
+	sltiu t1, a2, 8
+
+	/* we never change a0, because memcpy returns the original dst */
+	mv a3, a0
+
+	/* len < 8 */
+	bnez t1, .Lend
+
+	/* t1 = (-dst) & 0b111 */
+	neg t1, a0
+	andi t1, t1, 0b111
+
+	sub a2, a2, t1
+
+	la t2, .Lduff_start
+	slli t3, t1, 3
+	sub t2, t2, t3
+	jr t2
+	lb t3, 6(a1)
+	sb t3, 6(a3)
+	lb t3, 5(a1)
+	sb t3, 5(a3)
+	lb t3, 4(a1)
+	sb t3, 4(a3)
+	lb t3, 3(a1)
+	sb t3, 3(a3)
+	lb t3, 2(a1)
+	sb t3, 2(a3)
+	lb t3, 1(a1)
+	sb t3, 1(a3)
+	lb t3, 0(a1)
+	sb t3, 0(a3)
+.Lduff_start:
+
+	add a1, a1, t1
+	add a3, a3, t1
+
+	beqz a2, .Lreturn
+
+	beqz t0, .Lmemcpy8
+
+	/*
+	 * a4 - size_t right_shift
+	 * a5 - size_t left_shift
+	 * a6 - size_t whole (number of dword stores)
+	 */
+
+	/* right_shift = (src & 0b111) * 8; */
+	andi a4, a1, 0b111
+	slli a4, a4, 3
+
+	/* left_shift = 64 - right_shift */
+	neg a5, a4
+
+	/* whole = len / 8 */
+	srli a6, a2, 3
+
+	/* len = len % 8 */
+	andi a2, a2, 0b111
+
+	/* t0 - uint64_t* ptr */
+
+	/* ptr = src & ~0b111 */
+	andi t0, a1, ~0b111
+
+	/* src += whole * 8 */
+	slli t1, a6, 3
+	add a1, a1, t1
+
+	/*
+	 * t1 - uint64_t low
+	 * t2 - uint64_t high
+	 */
+
+	/* low = *ptr++ */
+	ld t1, (t0)
+	addi t0, t0, 8
+
+	/* low >>= right_shift */
+	srl t1, t1, a4
+
+	beqz a6, .Llmain_skip
+.Llmain:
+	/* high = *ptr++ */
+	ld t2, (t0)
+	addi t0, t0, 8
+
+	/* whole-- */
+	addi a6, a6, -1
+
+	/* temp = (high << left_shift) | low */
+	sll t3, t2, a5
+	or t3, t3, t1
+
+	/* low = high >> right_shift */
+	srl t1, t2, a4
+
+	/* *dst++ = temp */
+	sd t3, (a3)
+	addi a3, a3, 8
+
+	bnez a6, .Llmain
+
+.Llmain_skip:
+
+.Lend:
+	la t1, .Lduff_end
+	slli t2, a2, 3
+	sub t1, t1, t2
+	jr t1
+	lb t2, 6(a1)
+	sb t2, 6(a3)
+	lb t2, 5(a1)
+	sb t2, 5(a3)
+	lb t2, 4(a1)
+	sb t2, 4(a3)
+	lb t2, 3(a1)
+	sb t2, 3(a3)
+	lb t2, 2(a1)
+	sb t2, 2(a3)
+	lb t2, 1(a1)
+	sb t2, 1(a3)
+	lb t2, 0(a1)
+	sb t2, 0(a3)
+.Lduff_end:
+
+.Lreturn:
+	ret
+
+/*
+ * executed when dst - src is a multiple of 8
+ * a0 - void* dst
+ * a1 - const void* src
+ * a2 - size_t len
+ */
+.Lmemcpy8:
+
+	beqz a2, .Lreturn
+
+	slti t0, a2, 128
+	bnez t0, .Llmain8_64_skip
+
+	/* a4 - uint64_t* end_unroll */
+
+	/* end_unroll = dst + len / 64 * 64 */
+	andi t0, a2, ~0b111111
+	add a4, a3, t0
+
+	/* len = len % 64 */
+	andi a2, a2, 0b111111
+
+.Llmain8_64:
+	ld t0, 0(a1)
+	ld t1, 8(a1)
+	ld t2, 16(a1)
+	ld t3, 24(a1)
+	sd t0, 0(a3)
+	sd t1, 8(a3)
+	sd t2, 16(a3)
+	sd t3, 24(a3)
+	ld t0, 32(a1)
+	ld t1, 40(a1)
+	ld t2, 48(a1)
+	ld t3, 56(a1)
+	sd t0, 32(a3)
+	sd t1, 40(a3)
+	sd t2, 48(a3)
+	sd t3, 56(a3)
+	addi a3, a3, 64
+	addi a1, a1, 64
+	bne a3, a4, .Llmain8_64
+.Llmain8_64_skip:
+
+	beqz a2, .Lreturn
+
+	/* a4 - uint64_t* end_align */
+
+	/* end_align = (dst + len) & ~0b111 */
+	add a4, a3, a2
+	andi a4, a4, ~0b111
+
+	/* len = len % 8 */
+	andi a2, a2, 0b111
+
+	beq a3, a4, .Llmain8_skip
+.Llmain8:
+	ld t0, (a1)
+	sd t0, (a3)
+	addi a3, a3, 8
+	addi a1, a1, 8
+	bne a3, a4, .Llmain8
+.Llmain8_skip:
+
+	la t1, .Lduff_end
+	slli t2, a2, 3
+	sub t1, t1, t2
+	jr t1
END(memcpy)
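For the misaligned case, memcpy reads only aligned dwords and stitches each output word out of two neighbouring input words with a shift pair. A minimal C sketch of that inner loop, assuming a little-endian 64-bit target (as on riscv64) and 0 < off < 8; the function and parameter names are mine, not the commit's:

#include <stddef.h>
#include <stdint.h>

/*
 * Store `whole` dwords to the aligned `dst` from the misaligned source
 * `(const char *)ptr + off`, where `ptr` is aligned and 0 < off < 8.
 */
static void
copy_shifted(uint64_t *dst, const uint64_t *ptr, size_t whole, unsigned off)
{
	unsigned right_shift = off * 8;
	unsigned left_shift = 64 - right_shift;
	uint64_t low, high;

	low = *ptr++ >> right_shift;		/* bytes of the first word */
	while (whole-- > 0) {
		high = *ptr++;
		*dst++ = (high << left_shift) | low;	/* merge two words */
		low = high >> right_shift;
	}
}

Note the assembly computes left_shift as `neg a5, a4` and relies on RV64's sll masking shift amounts to six bits, so -right_shift and 64 - right_shift behave the same; in C that shortcut would be undefined when off == 0, hence the precondition above.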
diff --git a/lib/libc/riscv/string/memset.S b/lib/libc/riscv/string/memset.S
new file mode 100644
index 000000000000..ca435dfdd5c1
--- /dev/null
+++ b/lib/libc/riscv/string/memset.S
@@ -0,0 +1,95 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * register a0 - void *dest
+ * register a1 - int c
+ * register a2 - size_t len
+ */
+ENTRY(memset)
+	andi a1, a1, 0xFF
+
+	sltiu t1, a2, 8
+	mv t0, a0
+	bnez t1, .Lend
+
+	li t1, 0x0101010101010101
+	mul a1, a1, t1
+
+	andi t1, a0, 0b111
+	andi t0, a0, ~0b111
+
+	beqz t1, .Lloop_store_64
+
+	la t2, .Lduff_start
+	slli t3, t1, 2
+	add t2, t2, t3
+	jr -4(t2)
+.Lduff_start:
+	sb a1, 1(t0)
+	sb a1, 2(t0)
+	sb a1, 3(t0)
+	sb a1, 4(t0)
+	sb a1, 5(t0)
+	sb a1, 6(t0)
+	sb a1, 7(t0)
+
+	/* len -= (8 - align) <=> len += (align - 8) */
+	addi t1, t1, -8
+	add a2, a2, t1
+	addi t0, t0, 8
+
+.Lloop_store_64:
+	slti t1, a2, 64
+	bnez t1, .Lstore_rest
+	sd a1, 0(t0)
+	sd a1, 8(t0)
+	sd a1, 16(t0)
+	sd a1, 24(t0)
+	sd a1, 32(t0)
+	sd a1, 40(t0)
+	sd a1, 48(t0)
+	sd a1, 56(t0)
+	addi a2, a2, -64
+	addi t0, t0, 64
+	j .Lloop_store_64
+
+.Lstore_rest:
+	la t2, .Lduff_rest
+	andi t3, a2, ~0b111
+	srli t4, t3, 1
+	sub t2, t2, t4
+	jr t2
+	sd a1, 56(t0)
+	sd a1, 48(t0)
+	sd a1, 40(t0)
+	sd a1, 32(t0)
+	sd a1, 24(t0)
+	sd a1, 16(t0)
+	sd a1, 8(t0)
+	sd a1, 0(t0)
+.Lduff_rest:
+	add t0, t0, t3
+	sub a2, a2, t3
+
.Lend:
+	slli a2, a2, 2
+	la t2, .Lduff_end
+	sub t2, t2, a2
+	jr t2
+	sb a1, 6(t0)
+	sb a1, 5(t0)
+	sb a1, 4(t0)
+	sb a1, 3(t0)
+	sb a1, 2(t0)
+	sb a1, 1(t0)
+	sb a1, (t0)
+.Lduff_end:
+	ret
+END(memset)
+
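memset, like memcpy, dispatches its sub-dword head and tail with a computed jump into a ladder of store instructions, a Duff's-device variant: `slli t3, t1, 2` scales the store count by 4 bytes, the size of one uncompressed RV64 instruction, so `jr` lands exactly deep enough into the ladder to execute the right number of `sb`s. The portable C equivalent of the tail ladder is a fall-through switch; a sketch only, with an illustrative helper name:

#include <stddef.h>
#include <stdint.h>

/* Store the low byte of `fill` into the first `len` (< 8) positions,
 * mirroring the computed jump into the descending sb ladder. */
static void
store_tail(unsigned char *p, uint64_t fill, size_t len)
{

	switch (len) {
	case 7: p[6] = (unsigned char)fill;	/* FALLTHROUGH */
	case 6: p[5] = (unsigned char)fill;	/* FALLTHROUGH */
	case 5: p[4] = (unsigned char)fill;	/* FALLTHROUGH */
	case 4: p[3] = (unsigned char)fill;	/* FALLTHROUGH */
	case 3: p[2] = (unsigned char)fill;	/* FALLTHROUGH */
	case 2: p[1] = (unsigned char)fill;	/* FALLTHROUGH */
	case 1: p[0] = (unsigned char)fill;	/* FALLTHROUGH */
	case 0: break;
	}
}

Because `fill` is the byte c replicated eight times (`c * 0x0101010101010101`), storing its low byte is the same as storing c.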
diff --git a/lib/libc/riscv/string/strchrnul.S b/lib/libc/riscv/string/strchrnul.S
new file mode 100644
index 000000000000..8abba71c4199
--- /dev/null
+++ b/lib/libc/riscv/string/strchrnul.S
@@ -0,0 +1,116 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+	.weak strchrnul
+	.set strchrnul, __strchrnul
+
+/*
+ * a0 - const char *str
+ * a1 - int c
+ */
+ENTRY(__strchrnul)
+	/*
+	 * a0 - const char *ptr
+	 * a1 - char cccccccc[8]
+	 * a2 - char iter[8]
+	 * a3 - char mask_end
+	 */
+
+	/* int to char */
+	andi a1, a1, 0xFF
+
+	/* t0 = 0x0101010101010101 */
+	li t0, 0x01010101
+	slli t1, t0, 32
+	or t0, t0, t1
+
+	/* t1 = 0x8080808080808080 */
+	slli t1, t0, 7
+
+	/* spread char across bytes */
+	mul a1, a1, t0
+
+	/* align_offset */
+	andi t2, a0, 0b111
+
+	/* align pointer */
+	andi a0, a0, ~0b111
+
+	/* if pointer is aligned skip to loop */
+	beqz t2, .Lloop
+
+	ld a2, (a0)
+
+	/* mask_start calculation */
+	slli t2, t2, 3
+	neg t2, t2
+	srl t2, t0, t2
+
+	/* fill bytes before start with non-zero */
+	or a3, a2, t2
+
+	xor a2, a2, a1
+	or a2, a2, t2
+
+	/* has_zero for \0 */
+	not t3, a3
+	not t2, a2
+	sub a3, a3, t0
+	sub a2, a2, t0
+	and a3, a3, t3
+	and a2, a2, t2
+	and a3, a3, t1
+	and a2, a2, t1
+
+	/* if \0 or c was found, exit */
+	or a2, a2, a3
+	addi a0, a0, 8
+	bnez a2, .Lfind_char
+
+.Lloop:
+	ld a2, (a0)
+
+	/* has_zero for both \0 or c */
+	xor a3, a2, a1
+
+	not t2, a2
+	not t3, a3
+	sub a2, a2, t0
+	sub a3, a3, t0
+	and a2, a2, t2
+	and a3, a3, t3
+	and a2, a2, t1
+	and a3, a3, t1
+
+	/* if \0 or c was found, exit */
+	or a2, a2, a3
+	addi a0, a0, 8
+	beqz a2, .Lloop
+
+.Lfind_char:
+	addi a0, a0, -8
+
+	/* isolate lowest set bit */
+	neg t0, a2
+	and a2, a2, t0
+
+	li t0, 0x0001020304050607
+	srli a2, a2, 7
+
+	/*
+	 * lowest set bit is 2^(8*k)
+	 * multiplying by it shifts the idx array in t0 by k bytes to the left
+	 */
+	mul a2, a2, t0
+
+	/* highest byte contains idx of first match */
+	srli a2, a2, 56
+
+	add a0, a0, a2
+	ret
+END(__strchrnul)
diff --git a/lib/libc/riscv/string/strlen.S b/lib/libc/riscv/string/strlen.S
new file mode 100644
index 000000000000..3beb160f2e6f
--- /dev/null
+++ b/lib/libc/riscv/string/strlen.S
@@ -0,0 +1,77 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+ * uses haszero(v) (((v) - 0x01010101UL) & ~(v) & 0x80808080UL)
+ * which evaluates to nonzero when there is a zero byte in v
+ *
+ * register a0 - char *s
+ */
+ENTRY(strlen)
+	/*
+	 * register a0 - char *str_start
+	 * register a1 - char *str_ptr
+	 * register a2 - char[8] iter
+	 */
+
+	/* load constants for haszero */
+	li t0, 0x0101010101010101
+	slli t1, t0, 7		# 0x8080808080808080, avoid li
+
+	/* check alignment of str_start */
+	andi a1, a0, ~0b111
+	ld a2, (a1)
+	beq a1, a0, .Lhas_zero
+
+	/* fill bytes before str_start with non-zero */
+	slli t2, a0, 3
+	addi t3, t2, -64
+	neg t3, t3
+	srl t3, t0, t3
+	or a2, a2, t3
+
+	/* unrolled iteration of haszero */
+	not t2, a2
+	sub a2, a2, t0
+	and a2, a2, t2
+	and a2, a2, t1
+
+	bnez a2, .Lfind_zero
+
+.Lloop_has_zero:
+	ld a2, 8(a1)
+	addi a1, a1, 8		# move ptr to the next 8 bytes
+.Lhas_zero:
+	not t2, a2
+	sub a2, a2, t0
+	and a2, a2, t2
+	and a2, a2, t1
+
+	beqz a2, .Lloop_has_zero
+
+.Lfind_zero:
+	/* use (iter & -iter) to isolate lowest set bit */
+	sub a3, zero, a2	# a3 = -iter
+	and t1, a2, a3		# t1 = (iter & -iter)
+
+	li t0, 0x0001020304050607
+	srli t1, t1, 7
+	/*
+	 * lowest set bit is 2^(8*k)
+	 * multiplying by it shifts the idx array in t0 by k bytes to the left
+	 */
+	mul t1, t1, t0
+	/* highest byte contains idx of first zero */
+	srli t1, t1, 56
+
+	add a1, a1, t1
+	sub a0, a1, a0
+	ret
+END(strlen)
+
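strlen, strchrnul, memchr, and strnlen all convert the surviving 0x80 flag bit into a byte index the same way: isolate the lowest set bit, shift it down to 1 << (8*k), and multiply by the byte table 0x0001020304050607 so that k lands in the top byte. The same step in C; the names here are mine, not the commit's:

#include <stdint.h>

/*
 * Given a nonzero has_zero() result (0x80 set in each matching byte),
 * return the index 0..7 of the lowest matching byte.
 */
static inline unsigned
first_match_index(uint64_t hz)
{
	uint64_t bit = hz & (0 - hz);	/* lowest set bit: 0x80 << (8 * k) */

	/*
	 * (bit >> 7) == 1 << (8 * k), so the multiply shifts the table
	 * left by k bytes and the top byte ends up holding k.
	 */
	return ((unsigned)(((bit >> 7) * 0x0001020304050607ULL) >> 56));
}

For k = 3, for example, the product truncates to 0x0304050607000000 and the final shift reads the 0x03 out of the top byte.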
diff --git a/lib/libc/riscv/string/strnlen.S b/lib/libc/riscv/string/strnlen.S
new file mode 100644
index 000000000000..c0fd959548ff
--- /dev/null
+++ b/lib/libc/riscv/string/strnlen.S
@@ -0,0 +1,143 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - const char *s
+ * a1 - size_t maxlen
+ */
+ENTRY(strnlen)
+	/*
+	 * a0 - const char *s
+	 * a1 - size_t maxlen
+	 * a2 - uint64_t *ptr
+	 * a3 - char iter[8]
+	 * a4 - uint64_t *end_align
+	 * a5 - uint64_t *end_unroll
+	 */
+
+	beqz a1, .Lnot_found
+
+	/* ptr = s & ~0b111 */
+	/* t0 = 0x0101010101010101 */
+	/* t1 = 0x8080808080808080 */
+	/* end_align = (s + maxlen + 7) & ~0b111 */
+	/* mask_start = t0 >> (((-s) & 0b111) << 3) */
+	add a4, a0, a1
+	li t0, 0x01010101
+	addi a4, a4, 7
+	slli t1, t0, 32
+	neg t2, a0
+	andi a4, a4, ~0b111
+	or t0, t0, t1
+	slli t2, t2, 3
+	andi a2, a0, ~0b111
+	slli t1, t0, 7
+	srl t2, t0, t2
+
+	/* if pointer is aligned skip to loop */
+	beq a0, a2, .Lskip_start
+
+	/* iter = *ptr */
+	ld a3, (a2)
+
+	/* iter = iter | mask_start */
+	or a3, a3, t2
+
+	/* has_zero */
+	not t2, a3
+	sub a3, a3, t0
+	and t2, t2, t1
+	and a3, a3, t2
+
+	addi a2, a2, 8
+	bnez a3, .Lfind_zero
+
+.Lskip_start:
+	/* end_unroll */
+	sub t2, a4, a2
+	andi t2, t2, ~0b1111
+	add a5, a2, t2
+
+	/* while (ptr != end_unroll) */
+	beq a2, a5, .Lskip_loop
+.Lloop:
+	ld a3, (a2)
+	ld a6, 8(a2)
+
+	/* has_zero */
+	not t2, a3
+	not t3, a6
+	sub a3, a3, t0
+	sub a6, a6, t0
+	and t2, t2, t1
+	and t3, t3, t1
+	and a3, a3, t2
+	and a6, a6, t3
+
+	addi a2, a2, 8
+	bnez a3, .Lfind_zero
+
+	mv a3, a6
+
+	addi a2, a2, 8
+	bnez a3, .Lfind_zero
+
+	bne a2, a5, .Lloop
+
+.Lskip_loop:
+
+	beq a2, a4, .Lnot_found
+
+	ld a3, (a2)
+
+	/* has_zero */
+	not t2, a3
+	sub a3, a3, t0
+	and t2, t2, t1
+	and a3, a3, t2
+
+	addi a2, a2, 8
+	beqz a3, .Lnot_found
+
+.Lfind_zero:
+	/* move ptr back */
+	addi a2, a2, -8
+
+	/* isolate lowest set bit */
+	neg t0, a3
+	and a3, a3, t0
+
+	li t0, 0x0001020304050607
+	srli a3, a3, 7
+
+	/*
+	 * lowest set bit is 2^(8*k)
+	 * multiplying by it shifts the idx array in t0 by k bytes to the left
+	 */
+	mul a3, a3, t0
+
+	/* highest byte contains idx of first zero */
+	srli a3, a3, 56
+
+	/* zero_idx */
+	sub a2, a2, a0
+	add a2, a2, a3
+
+	/* min(zero_idx, maxlen) */
+	sub a2, a2, a1
+	srai t1, a2, 63
+	and a2, a2, t1
+	add a0, a1, a2
+
+	ret
+
+.Lnot_found:
+	mv a0, a1
+	ret
+
+END(strnlen)
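strnlen's epilogue clamps the computed index to maxlen without a branch, in four instructions: `sub`, `srai` by 63, `and`, `add`. The same idiom in C, under the usual assumptions that signed right shift is arithmetic (true of GCC and Clang on riscv64) and that the difference fits in int64_t; the helper name is illustrative:

#include <stdint.h>

/* min(zero_idx, maxlen) without a branch: the sign bit of
 * (zero_idx - maxlen), smeared across the word, selects whether
 * the negative correction is applied to maxlen. */
static inline uint64_t
branchless_min(uint64_t zero_idx, uint64_t maxlen)
{
	int64_t d = (int64_t)(zero_idx - maxlen);

	return (maxlen + (uint64_t)(d & (d >> 63)));
}

If zero_idx < maxlen, d is negative, d >> 63 is all ones, and the result is maxlen + (zero_idx - maxlen) = zero_idx; otherwise the mask is zero and maxlen is returned unchanged.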
diff --git a/lib/libc/riscv/string/strrchr.S b/lib/libc/riscv/string/strrchr.S
new file mode 100644
index 000000000000..e922a692e77f
--- /dev/null
+++ b/lib/libc/riscv/string/strrchr.S
@@ -0,0 +1,127 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+	.weak rindex
+	.set rindex, strrchr
+
+/*
+ * a0 - const char *s
+ * a1 - int c
+ */
+ENTRY(strrchr)
+	/*
+	 * a0 - const char *ptr_align
+	 * a1 - temporary
+	 * a2 - temporary
+	 * a3 - temporary
+	 * a4 - temporary
+	 * a5 - const char[8] cccccccc
+	 * a6 - const uint64_t *save_align
+	 * a7 - const uint64_t save_iter
+	 * t0 - const uint64_t REP8_0X01
+	 * t1 - const uint64_t REP8_0X80
+	 */
+
+	/*
+	 * save_align = 0
+	 * save_iter = 0xFFFFFFFFFFFFFF00
+	 * REP8_0X01 = 0x0101010101010101
+	 * cccccccc = (char)c * REP8_0X01
+	 * REP8_0X80 = (REP8_0X01 << 7) << ((str % 8) * 8)
+	 * ptr_align = str - str % 8
+	 */
+	li t0, 0x01010101
+	li a6, 0
+	slli a2, a0, 3
+	slli t1, t0, 32
+	li a7, 0xFFFFFFFFFFFFFF00
+	or t0, t0, t1
+	andi a1, a1, 0xFF
+	slli t1, t0, 7
+	andi a0, a0, ~0b111
+	mul a5, a1, t0
+	sll t1, t1, a2
+
+.Lloop:				/* do { */
+	ld a1, 0(a0)		/* a1 -> data = *ptr_align */
+	not a3, a1		/* a3 -> nhz = ~data */
+	xor a2, a1, a5		/* a2 -> iter = data ^ cccccccc */
+	sub a1, a1, t0		/* a1 -> hz = data - REP8_0X01 */
+	not a4, a2		/* a4 -> nhc = ~iter */
+	and a1, a1, a3		/* hz = hz & nhz */
+	sub a3, a2, t0		/* a3 -> hc = iter - REP8_0X01 */
+	and a1, a1, t1		/* hz = hz & REP8_0X80 */
+	and a3, a3, a4		/* hc = hc & nhc */
+	addi a4, a1, -1		/* a4 -> mask_end = hz - 1 */
+	and a3, a3, t1		/* hc = hc & REP8_0X80 */
+	xor a4, a4, a1		/* mask_end = mask_end ^ hz */
+	addi a0, a0, 8		/* ptr_align = ptr_align + 8 */
+	and a3, a3, a4		/* hc = hc & mask_end */
+	slli t1, t0, 7		/* REP8_0X80 = REP8_0X01 << 7 */
+	not a4, a4		/* mask_end = ~mask_end */
+
+	beqz a3, .Lskip_save	/* if(!hc) goto skip_save */
+	or a2, a2, a4		/* iter = iter | mask_end */
+	addi a6, a0, -8		/* save_align = ptr_align - 8 */
+	mv a7, a2		/* save_iter = iter */
+
+.Lskip_save:
+	beqz a1, .Lloop		/* } while(!hz) */
+
+.Lfind_char:
+	/*
+	 * a1 -> iter = save_iter
+	 * a2 -> mask_iter = 0xFF00000000000000
+	 * a3 -> match_off = 7
+	 */
+	li a2, 0xFF
+	mv a1, a7
+	slli a2, a2, 56
+	li a3, 7
+
+	and a0, a1, a2
+	srli a2, a2, 8
+	beqz a0, .Lret
+
+	addi a3, a3, -1
+	and a0, a1, a2
+	srli a2, a2, 8
+	beqz a0, .Lret
+
+	addi a3, a3, -1
+	and a0, a1, a2
+	srli a2, a2, 8
+	beqz a0, .Lret
+
+	addi a3, a3, -1
+	and a0, a1, a2
+	srli a2, a2, 8
+	beqz a0, .Lret
+
+	addi a3, a3, -1
+	and a0, a1, a2
+	srli a2, a2, 8
+	beqz a0, .Lret
+
+	addi a3, a3, -1
+	and a0, a1, a2
+	srli a2, a2, 8
+	beqz a0, .Lret
+
+	addi a3, a3, -1
+	and a0, a1, a2
+	srli a2, a2, 8
+	beqz a0, .Lret
+
+	addi a3, a3, -1
+
+.Lret:
+	/* return save_align + match_offset */
+	add a0, a6, a3
+	ret
END(strrchr)
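strrchr never scans backwards: it walks the string forward one word at a time, and whenever a word contains c before the terminator it records that word (with the bytes at and past the terminator forced non-matching via mask_end) and its address in save_iter/save_align; the ladder at .Lfind_char then picks the highest matching byte of the saved word. A byte-wise C reference with the same single-pass shape, for orientation only (it does not reproduce the committed word-at-a-time form):

#include <stddef.h>
#include <stdint.h>

static char *
strrchr_ref(const char *s, int c)
{
	const char *save = NULL;

	do {
		if (*s == (char)c)
			save = s;	/* remember the latest match */
	} while (*s++ != '\0');
	return ((char *)(uintptr_t)save);
}

The initial save_iter value 0xFFFFFFFFFFFFFF00 plays the role of `save = NULL` here: its only zero byte is at offset 0, so with save_align still 0 the final add yields NULL when c never occurs. It also makes the c == '\0' case fall out naturally, matching strrchr's requirement to return a pointer to the terminator.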
