summaryrefslogtreecommitdiff
path: root/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
diff options
context:
space:
mode:
Diffstat (limited to 'MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S')
-rw-r--r--MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S161
1 files changed, 161 insertions, 0 deletions
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
new file mode 100644
index 0000000000000..708ebb59c4238
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
@@ -0,0 +1,161 @@
+//
+// Copyright (c) 2014, ARM Limited
+// All rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of the company nor the names of its contributors
+// may be used to endorse or promote products derived from this
+// software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+// Neon Available.
+//
+
+// Arguments and results.
+#define srcin x0
+#define cntin x1
+#define chrin w2
+
+#define result x0
+
+#define src x3
+#define tmp x4
+#define wtmp2 w5
+#define synd x6
+#define soff x9
+#define cntrem x10
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask v5
+#define vend v6
+
+//
+// Core algorithm:
+//
+// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+// per byte. For each tuple, bit 0 is set if the relevant byte matched the
+// requested character and bit 1 is not used (faster than using a 32bit
+// syndrome). Since the bits in the syndrome reflect exactly the order in which
+// things occur in the original string, counting trailing zeros allows to
+// identify exactly which byte has matched.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+ // Do not dereference srcin if no bytes to compare.
+ cbz cntin, .Lzero_length
+ //
+ // Magic constant 0x40100401 allows us to identify which lane matches
+ // the requested byte.
+ //
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ // Work with aligned 32-byte chunks
+ bic src, srcin, #31
+ dup vrepmask.4s, wtmp2
+ ands soff, srcin, #31
+ and cntrem, cntin, #31
+ b.eq .Lloop
+
+ //
+ // Input string is not 32-byte aligned. We calculate the syndrome
+ // value for the aligned 32 bytes block containing the first bytes
+ // and mask the irrelevant part.
+ //
+
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend.16b, vend.16b, vend.16b // 128->64
+ mov synd, vend.d[0]
+ // Clear the soff*2 lower bits
+ lsl tmp, soff, #1
+ lsr synd, synd, tmp
+ lsl synd, synd, tmp
+ // The first block can also be the last
+ b.ls .Lmasklast
+ // Have we found something already?
+ cbnz synd, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ subs cntin, cntin, #32
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ // If we're out of data we finish regardless of the result
+ b.ls .Lend
+ // Use a fast check for the termination condition
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend.2d, vend.2d, vend.2d
+ mov synd, vend.d[0]
+ // We're not out of data, loop if we haven't found the character
+ cbz synd, .Lloop
+
+.Lend:
+ // Termination condition found, let's calculate the syndrome value
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend.16b, vend.16b, vend.16b // 128->64
+ mov synd, vend.d[0]
+ // Only do the clear for the last possible block
+ b.hi .Ltail
+
+.Lmasklast:
+ // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
+ add tmp, cntrem, soff
+ and tmp, tmp, #31
+ sub tmp, tmp, #32
+ neg tmp, tmp, lsl #1
+ lsl synd, synd, tmp
+ lsr synd, synd, tmp
+
+.Ltail:
+ // Count the trailing zeros using bit reversing
+ rbit synd, synd
+ // Compensate the last post-increment
+ sub src, src, #32
+ // Check that we have found a character
+ cmp synd, #0
+ // And count the leading zeros
+ clz synd, synd
+ // Compute the potential result
+ add result, src, synd, lsr #1
+ // Select result or NULL
+ csel result, xzr, result, eq
+ ret
+
+.Lzero_length:
+ mov result, #0
+ ret