Diffstat (limited to 'lib/libc/amd64/string/stpcpy.S')
-rw-r--r-- | lib/libc/amd64/string/stpcpy.S | 237
1 file changed, 237 insertions, 0 deletions
diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S
new file mode 100644
index 000000000000..59358e3245a8
--- /dev/null
+++ b/lib/libc/amd64/string/stpcpy.S
@@ -0,0 +1,237 @@
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
+ * written by J.T. Conklin <jtc@acorntoolworks.com> and
+ * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
+ * that was originally dedicated to the public domain
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90
+
+	.weak stpcpy
+	.set stpcpy, __stpcpy
+ARCHFUNCS(__stpcpy)
+	ARCHFUNC(__stpcpy, scalar)
+	ARCHFUNC(__stpcpy, baseline)
+ENDARCHFUNCS(__stpcpy)
+
+/*
+ * This stpcpy implementation copies a byte at a time until the
+ * source pointer is aligned to a word boundary; it then copies by
+ * words until it finds a word containing a zero byte, and finally
+ * copies by bytes until the end of the string is reached.
+ *
+ * While this may result in unaligned stores if the source and
+ * destination pointers are unaligned with respect to each other,
+ * it is still faster than either byte copies or the overhead of
+ * an implementation suitable for machines with strict alignment
+ * requirements.
+ */
+
+ARCHENTRY(__stpcpy, scalar)
+	movabsq	$0x0101010101010101,%r8
+	movabsq	$0x8080808080808080,%r9
+
+	/*
+	 * Align source to a word boundary.
+	 * Consider unrolling loop?
+	 */
+.Lalign:
+	testb	$7,%sil
+	je	.Lword_aligned
+	movb	(%rsi),%dl
+	incq	%rsi
+	movb	%dl,(%rdi)
+	incq	%rdi
+	testb	%dl,%dl
+	jne	.Lalign
+	movq	%rdi,%rax
+	dec	%rax
+	ret
+
+	ALIGN_TEXT
+.Lloop:
+	movq	%rdx,(%rdi)
+	addq	$8,%rdi
+.Lword_aligned:
+	movq	(%rsi),%rdx
+	movq	%rdx,%rcx
+	addq	$8,%rsi
+	subq	%r8,%rcx
+	testq	%r9,%rcx
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word equal 0.
+	 */
+
+	movb	%dl,(%rdi)
+	testb	%dl,%dl		/* 1st byte == 0? */
+	je	.Ldone
+	incq	%rdi
+
+	shrq	$8,%rdx
+	movb	%dl,(%rdi)
+	testb	%dl,%dl		/* 2nd byte == 0? */
+	je	.Ldone
+	incq	%rdi
+
+	shrq	$8,%rdx
+	movb	%dl,(%rdi)
+	testb	%dl,%dl		/* 3rd byte == 0? */
+	je	.Ldone
+	incq	%rdi
+
+	shrq	$8,%rdx
+	movb	%dl,(%rdi)
+	testb	%dl,%dl		/* 4th byte == 0? */
+	je	.Ldone
+	incq	%rdi
+
+	shrq	$8,%rdx
+	movb	%dl,(%rdi)
+	testb	%dl,%dl		/* 5th byte == 0? */
+	je	.Ldone
+	incq	%rdi
+
+	shrq	$8,%rdx
+	movb	%dl,(%rdi)
+	testb	%dl,%dl		/* 6th byte == 0? */
+	je	.Ldone
+	incq	%rdi
+
+	shrq	$8,%rdx
+	movb	%dl,(%rdi)
+	testb	%dl,%dl		/* 7th byte == 0? */
+	je	.Ldone
+	incq	%rdi
+
+	shrq	$8,%rdx
+	movb	%dl,(%rdi)
+	incq	%rdi
+	testb	%dl,%dl		/* 8th byte == 0? */
+	jne	.Lword_aligned
+	decq	%rdi
+
+.Ldone:
+	movq	%rdi,%rax
+	ret
+ARCHEND(__stpcpy, scalar)
+
+ARCHENTRY(__stpcpy, baseline)
+	mov	%esi, %ecx
+	mov	%rdi, %rdx
+	sub	%rsi, %rdi		# express destination as distance to source
+	and	$~0xf, %rsi		# align source to 16 bytes
+	movdqa	(%rsi), %xmm0		# head of string with junk before
+	pxor	%xmm1, %xmm1
+	and	$0xf, %ecx		# misalignment in bytes
+	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
+	pmovmskb %xmm0, %eax
+	shr	%cl, %eax		# clear out matches in junk bytes
+	bsf	%eax, %eax		# find match if any
+	jnz	.Lrunt
+
+	/* first normal iteration: write head back if it succeeds */
+	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
+	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax		# find match if any
+	jnz	.Lshorty
+
+	movdqu	%xmm2, (%rdx)		# store beginning of string
+
+	/* main loop, unrolled twice */
+	ALIGN_TEXT
+0:	movdqa	32(%rsi), %xmm2		# load current iteration
+	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
+	pxor	%xmm1, %xmm1
+	add	$32, %rsi
+	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	1f
+
+	movdqa	16(%rsi), %xmm0		# load current iteration
+	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
+	pxor	%xmm1, %xmm1
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jz	0b
+
+	/* end of string after main loop has iterated */
+	add	$16, %rsi		# advance rsi to second unrolled half
+1:	tzcnt	%eax, %eax		# find location of match
+					# (behaves as bsf on pre-x86-64-v3 CPUs)
+	add	%rsi, %rax		# point to NUL byte
+	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
+	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
+	add	%rdi, %rax		# point to destination's NUL byte
+	ret
+
+	/* NUL encountered in second iteration */
+.Lshorty:
+	tzcnt	%eax, %eax
+	add	$16, %eax		# account for length of first iteration
+	sub	%ecx, %eax		# but not the parts before the string
+
+	/* NUL encountered in first iteration */
+.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
+	add	%rcx, %rsi		# point to beginning of string
+	add	%rdx, %rax		# point to NUL byte
+
+	/* transfer 16--32 bytes */
+.L1632:	cmp	$16, %edi
+	jb	.L0815
+
+	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
+	movdqu	%xmm2, (%rdx)		# store first 16 bytes
+	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
+	ret
+
+	/* transfer 8--15 bytes */
+.L0815:	cmp	$8, %edi
+	jb	.L0407
+
+	mov	(%rsi), %rcx		# load first 8 bytes
+	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
+	mov	%rcx, (%rdx)		# store to dst
+	mov	%rdi, -7(%rax)		# ditto
+	ret
+
+	/* transfer 4--7 bytes */
+.L0407:	cmp	$4, %edi
+	jb	.L0203
+
+	mov	(%rsi), %ecx
+	mov	-4(%rsi, %rdi, 1), %edi
+	mov	%ecx, (%rdx)
+	mov	%edi, -3(%rax)
+	ret
+
+	/* transfer 2--3 bytes */
+.L0203:	cmp	$2, %edi
+	jb	.L0101
+
+	movzwl	(%rsi), %ecx
+	mov	%cx, (%rdx)		# store first two bytes
+
+	/* transfer 0 bytes (last byte is always NUL) */
+.L0101:	movb	$0, (%rax)		# store terminating NUL byte
+	ret
+ARCHEND(__stpcpy, baseline)
+
+	.section .note.GNU-stack,"",%progbits
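
The scalar path hinges on the word-at-a-time NUL test: if
(x - 0x0101010101010101) & 0x8080808080808080 is zero, the word x
definitely contains no zero byte; when the test fires, it may be a
false positive (bytes with the high bit set), which is why the
unrolled byte checks jump back into the word loop if no NUL turns up.
The following C model illustrates the strategy; it is a sketch with
illustrative names, not the committed code, and like the assembly it
reads whole aligned words and may touch up to 7 bytes past the NUL,
which is fine for aligned loads within a page but not strictly
portable C.

#include <stdint.h>
#include <string.h>

static char *
stpcpy_scalar_model(char *restrict dst, const char *restrict src)
{
	const uint64_t ones = 0x0101010101010101;
	const uint64_t high = 0x8080808080808080;

	/* copy bytewise until the source is 8-byte aligned */
	while ((uintptr_t)src % 8 != 0)
		if ((*dst++ = *src++) == '\0')
			return (dst - 1);

	for (;;) {
		uint64_t word;

		memcpy(&word, src, 8);		/* the aligned movq */
		if (((word - ones) & high) == 0) {
			/* certainly no NUL byte: copy the whole word */
			memcpy(dst, &word, 8);
			src += 8;
			dst += 8;
			continue;
		}

		/* a NUL byte is possible: copy and check bytewise */
		for (int i = 0; i < 8; i++)
			if ((*dst++ = *src++) == '\0')
				return (dst - 1);
		/* false positive: re-enter the word loop */
	}
}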
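
The baseline version's head check avoids page-boundary hazards by
rounding the source down to a 16-byte boundary, so the first load is
an aligned movdqa that cannot fault, and then shifting the pcmpeqb
match mask right by the misalignment (the shr %cl, %eax) so matches in
the junk bytes before the string are discarded. Below is a sketch of
that check with SSE2 intrinsics; the helper name is made up for
illustration, and the bit scan uses a GCC/Clang builtin:

#include <emmintrin.h>
#include <stdint.h>

/* index of the NUL within s's first aligned chunk, or -1 if none */
static int
head_chunk_nul_index(const char *s)
{
	uintptr_t misalign = (uintptr_t)s & 0xf;
	/* round down: reads junk before s, but stays in one chunk */
	const __m128i *chunk =
	    (const __m128i *)((uintptr_t)s - misalign);
	__m128i zero = _mm_setzero_si128();	/* pxor */
	__m128i data = _mm_load_si128(chunk);	/* movdqa */
	int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(data, zero));

	mask >>= misalign;	/* discard matches in the junk bytes */
	if (mask == 0)
		return (-1);	/* no NUL in the head chunk */
	return (__builtin_ctz(mask));	/* the bsf/tzcnt */
}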
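
Lastly, the .L1632/.L0815/.L0407/.L0203 tails move short strings with
two overlapping loads and stores of the next-smaller power of two
rather than a byte loop. A C sketch of the idea for the 8--15 byte
case (.L0815); the helper is illustrative, not part of the library:

#include <stdint.h>
#include <string.h>

/* copy a block of len bytes, 8 <= len <= 15, without a byte loop */
static void
copy8to15(char *dst, const char *src, size_t len)
{
	uint64_t head, tail;

	memcpy(&head, src, 8);			/* first 8 bytes */
	memcpy(&tail, src + len - 8, 8);	/* last 8, overlapping */
	memcpy(dst, &head, 8);
	memcpy(dst + len - 8, &tail, 8);
}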