Diffstat (limited to 'MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S')
-rw-r--r--  MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S  161
1 file changed, 161 insertions, 0 deletions
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
new file mode 100644
index 0000000000000..708ebb59c4238
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
@@ -0,0 +1,161 @@
+//
+// Copyright (c) 2014, ARM Limited
+// All rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the company nor the names of its contributors
+//       may be used to endorse or promote products derived from this
+//       software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+// Neon Available.
+//
+
+// Arguments and results.
+#define srcin     x0
+#define cntin     x1
+#define chrin     w2
+
+#define result    x0
+
+#define src       x3
+#define tmp       x4
+#define wtmp2     w5
+#define synd      x6
+#define soff      x9
+#define cntrem    x10
+
+#define vrepchr   v0
+#define vdata1    v1
+#define vdata2    v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask  v5
+#define vend      v6
+
+//
+// Core algorithm:
+//
+// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+// per byte. For each tuple, bit 0 is set if the relevant byte matched the
+// requested character and bit 1 is not used (faster than using a 32-bit
+// syndrome). Since the bits in the syndrome reflect exactly the order in which
+// things occur in the original string, counting trailing zeros allows us to
+// identify exactly which byte has matched.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+    // Do not dereference srcin if no bytes to compare.
+    cbz     cntin, .Lzero_length
+    //
+    // Magic constant 0x40100401 allows us to identify which lane matches
+    // the requested byte.
+    //
+    mov     wtmp2, #0x0401
+    movk    wtmp2, #0x4010, lsl #16
+    dup     vrepchr.16b, chrin
+    // Work with aligned 32-byte chunks
+    bic     src, srcin, #31
+    dup     vrepmask.4s, wtmp2
+    ands    soff, srcin, #31
+    and     cntrem, cntin, #31
+    b.eq    .Lloop
+
+    //
+    // Input string is not 32-byte aligned. We calculate the syndrome
+    // value for the aligned 32-byte block containing the first bytes
+    // and mask the irrelevant part.
+    //
+
+    ld1     {vdata1.16b, vdata2.16b}, [src], #32
+    sub     tmp, soff, #32
+    adds    cntin, cntin, tmp
+    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
+    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
+    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
+    addp    vend.16b, vend.16b, vend.16b                // 128->64
+    mov     synd, vend.d[0]
+    // Clear the soff*2 lower bits
+    lsl     tmp, soff, #1
+    lsr     synd, synd, tmp
+    lsl     synd, synd, tmp
+    // The first block can also be the last
+    b.ls    .Lmasklast
+    // Have we found something already?
+    cbnz    synd, .Ltail
+
+.Lloop:
+    ld1     {vdata1.16b, vdata2.16b}, [src], #32
+    subs    cntin, cntin, #32
+    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
+    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
+    // If we're out of data we finish regardless of the result
+    b.ls    .Lend
+    // Use a fast check for the termination condition
+    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
+    addp    vend.2d, vend.2d, vend.2d
+    mov     synd, vend.d[0]
+    // We're not out of data, loop if we haven't found the character
+    cbz     synd, .Lloop
+
+.Lend:
+    // Termination condition found, let's calculate the syndrome value
+    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
+    addp    vend.16b, vend.16b, vend.16b                // 128->64
+    mov     synd, vend.d[0]
+    // Only do the clear for the last possible block
+    b.hi    .Ltail
+
+.Lmasklast:
+    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
+    add     tmp, cntrem, soff
+    and     tmp, tmp, #31
+    sub     tmp, tmp, #32
+    neg     tmp, tmp, lsl #1
+    lsl     synd, synd, tmp
+    lsr     synd, synd, tmp
+
+.Ltail:
+    // Count the trailing zeros using bit reversing
+    rbit    synd, synd
+    // Compensate the last post-increment
+    sub     src, src, #32
+    // Check that we have found a character
+    cmp     synd, #0
+    // And count the leading zeros
+    clz     synd, synd
+    // Compute the potential result
+    add     result, src, synd, lsr #1
+    // Select result or NULL
+    csel    result, xzr, result, eq
+    ret
+
+.Lzero_length:
+    mov     result, #0
+    ret
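Note: the "two bits per byte" syndrome described in the core-algorithm
comment can be modelled in scalar C. The sketch below is illustrative
only, not the NEON code itself; the names BuildSyndrome and
FirstMatchIndex are hypothetical, and __builtin_ctzll is the GCC/Clang
builtin for counting trailing zeros.

#include <stdint.h>

// Bit 2*i of the syndrome is set iff Chunk[i] == Ch; odd bits stay
// clear, mirroring the "bit 1 is not used" layout described above.
static uint64_t
BuildSyndrome (const uint8_t Chunk[32], uint8_t Ch)
{
  uint64_t  Synd = 0;

  for (int Idx = 0; Idx < 32; Idx++) {
    if (Chunk[Idx] == Ch) {
      Synd |= 1ULL << (2 * Idx);
    }
  }
  return Synd;
}

// Byte index of the first match: count trailing zeros, then halve.
// AArch64 has no CTZ instruction, which is why the assembly uses
// RBIT followed by CLZ to the same effect.
static int
FirstMatchIndex (uint64_t Synd)
{
  return (Synd != 0) ? (__builtin_ctzll (Synd) >> 1) : -1;
}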
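The magic constant 0x40100401 is worth unpacking. In little-endian lane
order its bytes repeat as 01, 04, 10, 40, so after CMEQ (which yields
0xFF per matching byte) the AND leaves at most one set bit in each
byte. The two ADDP passes therefore never carry between bit pairs and
act as a pairwise OR, compressing the 32 match bytes into the 64-bit
syndrome. A scalar model of that narrowing, valid only under the stated
one-bit-per-byte assumption (NarrowSyndrome is a hypothetical name):

#include <stdint.h>

// Models the two ADDP passes (256 -> 128 -> 64 bits). Assumes each
// Masked[i] is either 0 or 1 << (2 * (i % 4)), i.e. the CMEQ result
// ANDed with the repeating mask bytes 01, 04, 10, 40; under that
// assumption the byte additions cannot overflow or carry.
static uint64_t
NarrowSyndrome (const uint8_t Masked[32])
{
  uint8_t   Pass1[16];
  uint8_t   Pass2[8];
  uint64_t  Synd = 0;

  for (int Idx = 0; Idx < 16; Idx++) {    // first ADDP: 32 -> 16 bytes
    Pass1[Idx] = Masked[2 * Idx] + Masked[2 * Idx + 1];
  }
  for (int Idx = 0; Idx < 8; Idx++) {     // second ADDP: 16 -> 8 bytes
    Pass2[Idx] = Pass1[2 * Idx] + Pass1[2 * Idx + 1];
  }
  for (int Idx = 0; Idx < 8; Idx++) {     // read lane d[0] little-endian
    Synd |= (uint64_t)Pass2[Idx] << (8 * Idx);
  }
  return Synd;
}

Byte i of the chunk lands at bit 2*i of the result, which is exactly
what lets FirstMatchIndex above recover the byte offset.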
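The two masking steps around the main loop also translate directly:
after the first, aligned load, the lsr/lsl pair clears the 2*soff low
syndrome bits belonging to bytes before srcin, and .Lmasklast clears
the high bits for bytes past the end of the range. A sketch with
hypothetical helper names follows; note that a C shift by 64 is
undefined behavior, so the Valid == 0 case is guarded explicitly,
whereas the assembly gets it for free because AArch64
register-specified shifts take the amount modulo 64.

#include <stdint.h>

// Head mask: drop matches before the start pointer (Soff = srcin & 31).
static uint64_t
HeadMask (uint64_t Synd, unsigned Soff)
{
  return (Synd >> (2 * Soff)) << (2 * Soff);
}

// Tail mask (.Lmasklast): keep only matches inside the scanned range.
// Valid counts the in-range bytes of the final 32-byte block; zero
// means the block is entirely in range and nothing is cleared.
static uint64_t
TailMask (uint64_t Synd, unsigned Soff, unsigned CntRem)
{
  unsigned  Valid = (CntRem + Soff) & 31;

  if (Valid == 0) {
    return Synd;
  }
  return (Synd << ((32 - Valid) * 2)) >> ((32 - Valid) * 2);
}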