Diffstat (limited to 'lib/libc/amd64/string/memcmp.S')
-rw-r--r-- | lib/libc/amd64/string/memcmp.S | 420
1 file changed, 420 insertions, 0 deletions
diff --git a/lib/libc/amd64/string/memcmp.S b/lib/libc/amd64/string/memcmp.S
new file mode 100644
index 000000000000..dc8bcff73cb9
--- /dev/null
+++ b/lib/libc/amd64/string/memcmp.S
@@ -0,0 +1,420 @@
+/*-
+ * Copyright (c) 2018, 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
+/*
+ * Note: this routine was written with kernel use in mind (read: no SIMD);
+ * it is only present in userspace as a temporary measure until something
+ * better gets imported.
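+ *
+ * The scalar kernel below avoids byte-at-a-time loops by comparing a
+ * head chunk and a (possibly overlapping) tail chunk sized to the
+ * buffer.  As a rough C sketch of the 8--16 byte case (illustration
+ * only, not part of this file; p, q and len stand for the arguments):
+ *
+ *	uint64_t a, b, c, d;
+ *	memcpy(&a, p, 8);		// head of each buffer
+ *	memcpy(&b, q, 8);
+ *	memcpy(&c, p + len - 8, 8);	// tail, may overlap the head
+ *	memcpy(&d, q + len - 8, 8);
+ *	if (a == b && c == d)
+ *		return (0);		// equal: no byte can differ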
+ */
+
+#define ALIGN_TEXT	.p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+#ifdef BCMP
+#define memcmp bcmp
+#endif
+
+ARCHFUNCS(memcmp)
+	ARCHFUNC(memcmp, scalar)
+	ARCHFUNC(memcmp, baseline)
+ENDARCHFUNCS(memcmp)
+
+ARCHENTRY(memcmp, scalar)
+	xorl	%eax,%eax
+10:
+	cmpq	$16,%rdx
+	ja	101632f
+
+	cmpb	$8,%dl
+	jg	100816f
+
+	cmpb	$4,%dl
+	jg	100408f
+
+	cmpb	$2,%dl
+	jge	100204f
+
+	cmpb	$1,%dl
+	jl	100000f
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
+	subl	%r8d,%eax
+100000:
+	ret
+
+	ALIGN_TEXT
+100816:
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	movq	-8(%rdi,%rdx),%r8
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10081608f
+	ret
+	ALIGN_TEXT
+100408:
+	movl	(%rdi),%r8d
+	movl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	80f
+	movl	-4(%rdi,%rdx),%r8d
+	movl	-4(%rsi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	10040804f
+	ret
+	ALIGN_TEXT
+100204:
+	movzwl	(%rdi),%r8d
+	movzwl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	movzwl	-2(%rdi,%rdx),%r8d
+	movzwl	-2(%rsi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	ret
+	ALIGN_TEXT
+101632:
+	cmpq	$32,%rdx
+	ja	103200f
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	movq	8(%rdi),%r8
+	movq	8(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	10163208f
+	movq	-16(%rdi,%rdx),%r8
+	movq	-16(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163216f
+	movq	-8(%rdi,%rdx),%r8
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163224f
+	ret
+	ALIGN_TEXT
+103200:
+	movq	(%rdi),%r8
+	movq	8(%rdi),%r9
+	subq	(%rsi),%r8
+	subq	8(%rsi),%r9
+	orq	%r8,%r9
+	jnz	10320000f
+
+	movq	16(%rdi),%r8
+	movq	24(%rdi),%r9
+	subq	16(%rsi),%r8
+	subq	24(%rsi),%r9
+	orq	%r8,%r9
+	jnz	10320016f
+
+	leaq	32(%rdi),%rdi
+	leaq	32(%rsi),%rsi
+	subq	$32,%rdx
+	cmpq	$32,%rdx
+	jae	103200b
+	cmpb	$0,%dl
+	jne	10b
+	ret
+
+/*
+ * Mismatch was found.
+ */
+#ifdef BCMP
+	ALIGN_TEXT
+10320016:
+10320000:
+10081608:
+10163224:
+10163216:
+10163208:
+10040804:
+80:
+1:
+	leal	1(%eax),%eax	# %eax is still zero here: return 1
+	ret
+#else
+/*
+ * We need to compute the difference between the buffers.
+ * Start by narrowing the range down (16 -> 8 -> 4 bytes).
+ */
+	ALIGN_TEXT
+10320016:
+	leaq	16(%rdi),%rdi
+	leaq	16(%rsi),%rsi
+10320000:
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	leaq	8(%rdi),%rdi
+	leaq	8(%rsi),%rsi
+	jmp	80f
+	ALIGN_TEXT
+10081608:
+10163224:
+	leaq	-8(%rdi,%rdx),%rdi
+	leaq	-8(%rsi,%rdx),%rsi
+	jmp	80f
+	ALIGN_TEXT
+10163216:
+	leaq	-16(%rdi,%rdx),%rdi
+	leaq	-16(%rsi,%rdx),%rsi
+	jmp	80f
+	ALIGN_TEXT
+10163208:
+	leaq	8(%rdi),%rdi
+	leaq	8(%rsi),%rsi
+	jmp	80f
+	ALIGN_TEXT
+10040804:
+	leaq	-4(%rdi,%rdx),%rdi
+	leaq	-4(%rsi,%rdx),%rsi
+	jmp	1f
+
+	ALIGN_TEXT
+80:
+	movl	(%rdi),%r8d
+	movl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	leaq	4(%rdi),%rdi
+	leaq	4(%rsi),%rsi
+
+/*
+ * We have up to 4 bytes to inspect.
+ */
+1:
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	1(%rdi),%eax
+	movzbl	1(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	2(%rdi),%eax
+	movzbl	2(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	3(%rdi),%eax
+	movzbl	3(%rsi),%r8d
+2:
+	subl	%r8d,%eax
+	ret
+#endif
+ARCHEND(memcmp, scalar)
+
+ARCHENTRY(memcmp, baseline)
+	cmp	$32, %rdx		# enough to permit use of the long kernel?
+	ja	.Llong
+
+	test	%rdx, %rdx		# zero-length buffer?
+	je	.L0
+
+	/*
+	 * Compare buffers of 1--32 bytes.  We want to do this by
+	 * loading each buffer into a pair of XMM registers and then
+	 * comparing.  To avoid crossing into unmapped pages, we either
+	 * load 32 bytes from the start of the buffer or 32 bytes before
+	 * its end, depending on whether a 32-byte load from the start
+	 * would cross into the page following the buffer.
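+	 *
+	 * In C terms, the page-crossing check below corresponds roughly
+	 * to the following sketch (illustration only, not part of this
+	 * file; buf and len stand for the raw arguments):
+	 *
+	 *	if ((((uintptr_t)buf + 31) ^ ((uintptr_t)buf + len - 1))
+	 *	    & PAGE_SIZE)
+	 *		// last overread byte and last buffer byte lie on
+	 *		// different pages: compare out of a stack copy
+	 *
+	 * Each of the two buffers gets this treatment independently,
+	 * with its own replacement copy on the stack.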
+	 */
+
+	/* check whether either 32-byte overread crosses a page boundary */
+	lea	31(%rdi), %eax		# end of overread
+	lea	31(%rsi), %r8d
+	lea	-1(%rdi, %rdx, 1), %ecx	# last byte in buffer
+	lea	-1(%rsi, %rdx, 1), %r9d
+	xor	%ecx, %eax
+	xor	%r9d, %r8d
+	test	$PAGE_SIZE, %eax	# are they on different pages?
+	jz	0f
+
+	/* fix up rdi */
+	movdqu	-32(%rdi, %rdx, 1), %xmm0
+	movdqu	-16(%rdi, %rdx, 1), %xmm1
+	lea	-8(%rsp), %rdi		# end of replacement buffer
+	sub	%rdx, %rdi		# start of replacement buffer
+	movdqa	%xmm0, -40(%rsp)	# copy to replacement buffer
+	movdqa	%xmm1, -24(%rsp)
+
+0:	test	$PAGE_SIZE, %r8d
+	jz	0f
+
+	/* fix up rsi */
+	movdqu	-32(%rsi, %rdx, 1), %xmm0
+	movdqu	-16(%rsi, %rdx, 1), %xmm1
+	lea	-40(%rsp), %rsi		# end of replacement buffer
+	sub	%rdx, %rsi		# start of replacement buffer
+	movdqa	%xmm0, -72(%rsp)	# copy to replacement buffer
+	movdqa	%xmm1, -56(%rsp)
+
+	/* load data and compare properly */
+0:	movdqu	16(%rdi), %xmm1
+	movdqu	16(%rsi), %xmm3
+	movdqu	(%rdi), %xmm0
+	movdqu	(%rsi), %xmm2
+	mov	%edx, %ecx
+	mov	$-1, %edx
+	shl	%cl, %rdx		# ones in positions past the buffer
+	pcmpeqb	%xmm3, %xmm1
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm1, %ecx
+	pmovmskb %xmm0, %eax
+	shl	$16, %ecx
+	or	%ecx, %eax		# ones where the buffers match
+	or	%edx, %eax		# treat positions past the buffer as matching
+	not	%eax			# ones where there is a mismatch
+#ifndef BCMP
+	bsf	%eax, %edx		# location of the first mismatch
+	cmovz	%eax, %edx		# including if there is no mismatch
+	movzbl	(%rdi, %rdx, 1), %eax	# mismatching bytes
+	movzbl	(%rsi, %rdx, 1), %edx
+	sub	%edx, %eax
+#endif
+	ret
+
+	/* empty input */
+.L0:	xor	%eax, %eax
+	ret
+
+	/* compare 33+ bytes */
+	ALIGN_TEXT
+.Llong:	movdqu	(%rdi), %xmm0		# load head
+	movdqu	(%rsi), %xmm2
+	mov	%rdi, %rcx
+	sub	%rdi, %rsi		# express rsi as distance from rdi
+	and	$~0xf, %rdi		# align rdi to 16 bytes
+	movdqu	16(%rsi, %rdi, 1), %xmm1
+	pcmpeqb	16(%rdi), %xmm1		# compare second half of this iteration
+	add	%rcx, %rdx		# end of buffer (one past the last byte)
+	jc	.Loverflow		# did this overflow?
+0:	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	xor	$0xffff, %eax		# any mismatch?
+	jne	.Lmismatch_head
+	add	$64, %rdi		# advance to next iteration
+	jmp	1f			# and get going with the loop
+
+	/*
+	 * If we got here, a buffer length was passed to memcmp(a, b, len)
+	 * such that a + len < a.  While this sort of usage is illegal,
+	 * it is plausible that a caller tries to do something like
+	 * memcmp(a, b, SIZE_MAX) if a and b are known to differ, intending
+	 * for memcmp() to stop comparing at the first mismatch.  This
+	 * behaviour is not guaranteed by any version of ISO/IEC 9899,
+	 * but usually works out in practice.  Let's try to make this
+	 * case work by comparing until the end of the address space.
+	 */
+.Loverflow:
+	mov	$-1, %rdx		# compare until the end of memory
+	jmp	0b
+
+	/* process buffer 32 bytes at a time */
+	ALIGN_TEXT
+0:	movdqu	-32(%rsi, %rdi, 1), %xmm0
+	movdqu	-16(%rsi, %rdi, 1), %xmm1
+	pcmpeqb	-32(%rdi), %xmm0
+	pcmpeqb	-16(%rdi), %xmm1
+	add	$32, %rdi		# advance to next iteration
+1:	pand	%xmm0, %xmm1		# 0xff where both halves matched
+	pmovmskb %xmm1, %eax
+	cmp	$0xffff, %eax		# all bytes matched?
+	jne	.Lmismatch
+	cmp	%rdx, %rdi		# end of buffer reached?
+	jb	0b
+
+	/* less than 32 bytes left to compare */
+	movdqu	-16(%rdx), %xmm1	# load 32-byte tail through end pointer
+	movdqu	-16(%rdx, %rsi, 1), %xmm3
+	movdqu	-32(%rdx), %xmm0
+	movdqu	-32(%rdx, %rsi, 1), %xmm2
+	pcmpeqb	%xmm3, %xmm1
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm1, %ecx
+	pmovmskb %xmm0, %eax
+	shl	$16, %ecx
+	or	%ecx, %eax		# ones where the buffers match
+	not	%eax			# ones where there is a mismatch
+#ifndef BCMP
+	bsf	%eax, %ecx		# location of the first mismatch
+	cmovz	%eax, %ecx		# including if there is no mismatch
+	add	%rcx, %rdx		# pointer to potential mismatch
+	movzbl	-32(%rdx), %eax		# mismatching bytes
+	movzbl	-32(%rdx, %rsi, 1), %edx
+	sub	%edx, %eax
+#endif
+	ret
+
+#ifdef BCMP
+.Lmismatch:
+	mov	$1, %eax
+.Lmismatch_head:
+	ret
+#else /* memcmp */
+.Lmismatch_head:
+	tzcnt	%eax, %eax		# location of mismatch
+	add	%rax, %rcx		# pointer to mismatch
+	movzbl	(%rcx), %eax		# mismatching bytes
+	movzbl	(%rcx, %rsi, 1), %ecx
+	sub	%ecx, %eax
+	ret
+
+.Lmismatch:
+	movdqu	-48(%rsi, %rdi, 1), %xmm1
+	pcmpeqb	-48(%rdi), %xmm1	# reconstruct xmm1 before PAND
+	pmovmskb %xmm0, %eax		# matches in first 16 bytes
+	pmovmskb %xmm1, %edx		# matches in second 16 bytes
+	shl	$16, %edx
+	or	%edx, %eax		# matches in both
+	not	%eax			# ones where there is a mismatch
+	tzcnt	%eax, %eax		# location of first mismatch
+	add	%rax, %rdi		# pointer to mismatch
+	movzbl	-64(%rdi), %eax		# mismatching bytes
+	movzbl	-64(%rdi, %rsi, 1), %ecx
+	sub	%ecx, %eax
+	ret
+#endif
+ARCHEND(memcmp, baseline)
+
+	.section .note.GNU-stack,"",%progbits
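Both the short and the long paths of the baseline version reduce the work to
one SSE2 idiom: pcmpeqb marks matching byte positions with 0xff, pmovmskb
compresses those marks into a 16-bit mask, and inverting the mask leaves ones
exactly at the mismatches, so tzcnt/bsf yields the offset of the first
differing byte. A minimal C rendering of that idiom using compiler intrinsics
(chunk_cmp is a hypothetical helper for illustration, not part of this file):

	#include <emmintrin.h>		/* SSE2, the amd64 baseline */

	/* compare one 16-byte chunk the way memcmp does */
	static int
	chunk_cmp(const unsigned char *p, const unsigned char *q)
	{
		__m128i	a = _mm_loadu_si128((const __m128i *)p);
		__m128i	b = _mm_loadu_si128((const __m128i *)q);
		unsigned m;

		m = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));	/* 1 = match */
		m = ~m & 0xffff;		/* 1 = mismatch */
		if (m == 0)
			return (0);		/* chunks are equal */
		m = __builtin_ctz(m);		/* first mismatch, cf. tzcnt */
		return (p[m] - q[m]);		/* difference of the bytes */
	}

The main loop improves on this sketch by ANDing the pcmpeqb results of two
16-byte halves (the PAND) so that a single pmovmskb test covers 32 bytes,
and only reconstructs which half mismatched after the loop exits
(.Lmismatch).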