Diffstat (limited to 'lib/libc/amd64/string/stpcpy.S')
-rw-r--r--  lib/libc/amd64/string/stpcpy.S  |  237
1 file changed, 237 insertions(+), 0 deletions(-)
diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S
new file mode 100644
index 000000000000..59358e3245a8
--- /dev/null
+++ b/lib/libc/amd64/string/stpcpy.S
@@ -0,0 +1,237 @@
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S,
+ * written by J.T. Conklin <jtc@acorntoolworks.com> and adapted by
+ * Guillaume Morin <guillaume@morinfr.org> to implement stpcpy;
+ * the original was dedicated to the public domain
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT .p2align 4, 0x90
+
+ .weak stpcpy
+ .set stpcpy, __stpcpy
+ARCHFUNCS(__stpcpy)
+ ARCHFUNC(__stpcpy, scalar)
+ ARCHFUNC(__stpcpy, baseline)
+ENDARCHFUNCS(__stpcpy)
+
+/*
+ * This stpcpy implementation copies a byte at a time until the
+ * source pointer is aligned to a word boundary; it then copies by
+ * words until it finds a word containing a zero byte, and finally
+ * copies by bytes until the end of the string is reached.
+ *
+ * While this may result in unaligned stores if the source and
+ * destination pointers are unaligned with respect to each other,
+ * it is still faster than copying byte by byte or paying the
+ * overhead of an implementation suitable for machines with strict
+ * alignment requirements.
+ */
+
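+/*
+ * As an illustration, here is a rough C sketch of the same algorithm
+ * (a model of the idea, not the exact code below): with the constants
+ * q = 0x0101010101010101 and m = 0x8080808080808080 loaded into
+ * %r8/%r9, the test (x - q) & m is nonzero whenever the word x
+ * contains a zero byte, although it can also fire spuriously for
+ * bytes >= 0x81 (see the note after the word loop).
+ *
+ *	#include <stdint.h>
+ *	#include <string.h>
+ *
+ *	char *
+ *	stpcpy_sketch(char *restrict dst, const char *restrict src)
+ *	{
+ *		// copy bytes until the source is word aligned
+ *		while ((uintptr_t)src % 8 != 0)
+ *			if ((*dst++ = *src++) == '\0')
+ *				return (dst - 1);
+ *
+ *		// copy whole words while no byte can be zero
+ *		for (;;) {
+ *			uint64_t x;
+ *
+ *			memcpy(&x, src, 8);
+ *			if ((x - 0x0101010101010101) & 0x8080808080808080)
+ *				break;	// a NUL byte may be present
+ *			memcpy(dst, &x, 8);
+ *			src += 8;
+ *			dst += 8;
+ *		}
+ *
+ *		// copy the remaining bytes one at a time; unlike the
+ *		// assembly below, this sketch does not resume the word
+ *		// loop after a false positive
+ *		while ((*dst++ = *src++) != '\0')
+ *			;
+ *		return (dst - 1);
+ *	}
+ */
+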
+ARCHENTRY(__stpcpy, scalar)
+ movabsq $0x0101010101010101,%r8
+ movabsq $0x8080808080808080,%r9
+
+ /*
+ * Align source to a word boundary.
+ * Consider unrolling loop?
+ */
+.Lalign:
+ testb $7,%sil
+ je .Lword_aligned
+ movb (%rsi),%dl
+ incq %rsi
+ movb %dl,(%rdi)
+ incq %rdi
+ testb %dl,%dl
+ jne .Lalign
+ movq %rdi,%rax
+ dec %rax
+ ret
+
+ ALIGN_TEXT
+.Lloop:
+ movq %rdx,(%rdi)
+ addq $8,%rdi
+.Lword_aligned:
+ movq (%rsi),%rdx
+ movq %rdx,%rcx
+ addq $8,%rsi
+ subq %r8,%rcx
+ testq %r9,%rcx
+ je .Lloop
+
+ /*
+ * In rare cases, the above loop may exit prematurely. We must
+ * return to the loop if none of the bytes in the word equal 0.
+ */
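+
+/*
+ * For example, x = 0x41414141414141ff contains no zero byte, yet
+ * (x - 0x0101010101010101) & 0x8080808080808080 == 0x80, so the
+ * word loop exits and the byte checks below run to the end of the
+ * word before jumping back to .Lword_aligned.
+ */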
+
+ movb %dl,(%rdi)
+ testb %dl,%dl /* 1st byte == 0? */
+ je .Ldone
+ incq %rdi
+
+ shrq $8,%rdx
+ movb %dl,(%rdi)
+ testb %dl,%dl /* 2nd byte == 0? */
+ je .Ldone
+ incq %rdi
+
+ shrq $8,%rdx
+ movb %dl,(%rdi)
+ testb %dl,%dl /* 3rd byte == 0? */
+ je .Ldone
+ incq %rdi
+
+ shrq $8,%rdx
+ movb %dl,(%rdi)
+ testb %dl,%dl /* 4th byte == 0? */
+ je .Ldone
+ incq %rdi
+
+ shrq $8,%rdx
+ movb %dl,(%rdi)
+ testb %dl,%dl /* 5th byte == 0? */
+ je .Ldone
+ incq %rdi
+
+ shrq $8,%rdx
+ movb %dl,(%rdi)
+ testb %dl,%dl /* 6th byte == 0? */
+ je .Ldone
+ incq %rdi
+
+ shrq $8,%rdx
+ movb %dl,(%rdi)
+ testb %dl,%dl /* 7th byte == 0? */
+ je .Ldone
+ incq %rdi
+
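+ /*
+  * 8th byte: %rdi is incremented before the NUL test so that a
+  * non-NUL byte falls straight through into the next word, and
+  * decremented afterwards so that %rdi points at the NUL byte
+  * when we fall into .Ldone.
+  */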
+ shrq $8,%rdx
+ movb %dl,(%rdi)
+ incq %rdi
+ testb %dl,%dl /* 8th byte == 0? */
+ jne .Lword_aligned
+ decq %rdi
+
+.Ldone:
+ movq %rdi,%rax
+ ret
+ARCHEND(__stpcpy, scalar)
+
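+/*
+ * The baseline implementation scans the string 16 bytes at a time
+ * with SSE2.  The first load is aligned down to 16 bytes, so it may
+ * read junk before the string, but it cannot cross a page boundary
+ * and therefore cannot fault; matches within the junk are shifted
+ * out of the pcmpeqb result.  A rough intrinsics sketch of that head
+ * check (an illustration with made-up helper names, not the code
+ * below):
+ *
+ *	#include <emmintrin.h>
+ *	#include <stdint.h>
+ *
+ *	// index of the NUL byte within the 16-byte head of s, or -1
+ *	static int
+ *	nul_in_head(const char *s)
+ *	{
+ *		unsigned off = (uintptr_t)s & 0xf;	// misalignment
+ *		const __m128i *p = (const __m128i *)(s - off);
+ *		__m128i eq = _mm_cmpeq_epi8(_mm_load_si128(p),
+ *		    _mm_setzero_si128());
+ *		unsigned mask = _mm_movemask_epi8(eq) >> off;
+ *
+ *		return (mask != 0 ? __builtin_ctz(mask) : -1);
+ *	}
+ *
+ * The main loop applies the same test to each subsequent aligned
+ * chunk, with stores trailing loads by one iteration so that only
+ * chunks known to be NUL-free are stored in full.
+ */
+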
+ARCHENTRY(__stpcpy, baseline)
+ mov %esi, %ecx
+ mov %rdi, %rdx
+ sub %rsi, %rdi # express destination as distance to source
+ and $~0xf, %rsi # align source to 16 bytes
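+ /*
+  * %rdi now holds dst - src, so for any source address the
+  * matching destination byte is (%rsi, %rdi, 1):
+  * src + (dst - src) == dst.  One register then advances both
+  * source and destination through the loop.
+  */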
+ movdqa (%rsi), %xmm0 # head of string with junk before
+ pxor %xmm1, %xmm1
+ and $0xf, %ecx # misalignment in bytes
+ pcmpeqb %xmm1, %xmm0 # NUL byte present?
+ pmovmskb %xmm0, %eax
+ shr %cl, %eax # clear out matches in junk bytes
+ bsf %eax, %eax # find match if any
+ jnz .Lrunt
+
+ /* first normal iteration: write head back if it succeeds */
+ movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration
+ movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string
+ pcmpeqb %xmm0, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax # find match if any
+ jnz .Lshorty
+
+ movdqu %xmm2, (%rdx) # store beginning of string
+
+ /* main loop, unrolled twice */
+ ALIGN_TEXT
+0: movdqa 32(%rsi), %xmm2 # load current iteration
+ movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
+ pxor %xmm1, %xmm1
+ add $32, %rsi
+ pcmpeqb %xmm2, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz 1f
+
+ movdqa 16(%rsi), %xmm0 # load current iteration
+ movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteration
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1 # NUL byte present?
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jz 0b
+
+ /* end of string after main loop has iterated */
+ add $16, %rsi # advance rsi to second unrolled half
+1: tzcnt %eax, %eax # find location of match
+ # (behaves as bsf on pre-x86-64-v3 CPUs)
+ add %rsi, %rax # point to NUL byte
+ movdqu -15(%rax), %xmm0 # last 16 bytes of string
+ movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination
+ add %rdi, %rax # point to destination's NUL byte
+ ret
+
+ /* NUL encountered in second iteration */
+.Lshorty:
+ tzcnt %eax, %eax
+ add $16, %eax # account for length of first iteration
+ sub %ecx, %eax # but not the parts before the string
+
+ /* NUL encountered in first iteration */
+.Lrunt: lea 1(%rax), %edi # string length including NUL byte
+ add %rcx, %rsi # point to beginning of string
+ add %rdx, %rax # point to NUL byte
+
+ /* transfer 16--32 bytes */
+.L1632: cmp $16, %edi
+ jb .L0815
+
+ movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
+ movdqu %xmm2, (%rdx) # store first 16 bytes
+ movdqu %xmm0, -15(%rax) # store last 16 bytes
+ ret
+
+ /* transfer 8--15 bytes */
+.L0815: cmp $8, %edi
+ jb .L0407
+
+ mov (%rsi), %rcx # load first 8 bytes
+ mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes
+ mov %rcx, (%rdx) # store to dst
+ mov %rdi, -7(%rax) # ditto
+ ret
+
+ /* transfer 4--7 bytes */
+.L0407: cmp $4, %edi
+ jb .L0203
+
+ mov (%rsi), %ecx
+ mov -4(%rsi, %rdi, 1), %edi
+ mov %ecx, (%rdx)
+ mov %edi, -3(%rax)
+ ret
+
+ /* transfer 2--3 bytes */
+.L0203: cmp $2, %edi
+ jb .L0101
+
+ movzwl (%rsi), %ecx
+ mov %cx, (%rdx) # store first two bytes
+
+ /* transfer 0 bytes (last byte is always NUL) */
+.L0101: movb $0, (%rax) # store terminating NUL byte
+ ret
+ARCHEND(__stpcpy, baseline)
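+
+/*
+ * The .L1632/.L0815/.L0407/.L0203 ladder above transfers a short
+ * string with two possibly overlapping loads and stores instead of
+ * a byte loop.  A hedged C model of the 8--15 byte case (sketch
+ * only, names are made up):
+ *
+ *	#include <stdint.h>
+ *	#include <string.h>
+ *
+ *	// copy len bytes, 8 <= len <= 16, in two 8-byte accesses
+ *	static void
+ *	copy_0815(char *dst, const char *src, size_t len)
+ *	{
+ *		uint64_t head, tail;
+ *
+ *		memcpy(&head, src, 8);			// first 8 bytes
+ *		memcpy(&tail, src + len - 8, 8);	// last 8 bytes
+ *		memcpy(dst, &head, 8);			// may overlap with
+ *		memcpy(dst + len - 8, &tail, 8);	// the line above
+ *	}
+ */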
+
+ .section .note.GNU-stack,"",%progbits