Diffstat (limited to 'MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S')
-rw-r--r-- | MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S | 247 |
1 file changed, 247 insertions, 0 deletions
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
new file mode 100644
index 000000000000..384fbdc6c04e
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
@@ -0,0 +1,247 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the Linaro nor the
+//       names of its contributors may be used to endorse or promote products
+//       derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+//    products derived from this software without specific prior written
+//    permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses
+//
+//
+
+#define dstin     x0
+#define count     x1
+#define val       x2
+#define valw      w2
+#define dst       x3
+#define dstend    x4
+#define tmp1      x5
+#define tmp1w     w5
+#define tmp2      x6
+#define tmp2w     w6
+#define zva_len   x7
+#define zva_lenw  w7
+
+#define L(l) .L ## l
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+    dup     v0.8H, valw
+    lsl     count, count, #1
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+    dup     v0.4S, valw
+    lsl     count, count, #2
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+    dup     v0.2D, val
+    lsl     count, count, #3
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+    movi    v0.16B, #0
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+    dup     v0.16B, valw
+0:  add     dstend, dstin, count
+    mov     val, v0.D[0]
+
+    cmp     count, 96
+    b.hi    L(set_long)
+    cmp     count, 16
+    b.hs    L(set_medium)
+
+    // Set 0..15 bytes.
+    tbz     count, 3, 1f
+    str     val, [dstin]
+    str     val, [dstend, -8]
+    ret
+    nop
+1:  tbz     count, 2, 2f
+    str     valw, [dstin]
+    str     valw, [dstend, -4]
+    ret
+2:  cbz     count, 3f
+    strb    valw, [dstin]
+    tbz     count, 1, 3f
+    strh    valw, [dstend, -2]
+3:  ret
+
+    // Set 17..96 bytes.
+L(set_medium):
+    str     q0, [dstin]
+    tbnz    count, 6, L(set96)
+    str     q0, [dstend, -16]
+    tbz     count, 5, 1f
+    str     q0, [dstin, 16]
+    str     q0, [dstend, -32]
+1:  ret
+
+    .p2align 4
+    // Set 64..96 bytes. Write 64 bytes from the start and
+    // 32 bytes from the end.
+L(set96):
+    str     q0, [dstin, 16]
+    stp     q0, q0, [dstin, 32]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+    nop
+L(set_long):
+    bic     dst, dstin, 15
+    str     q0, [dstin]
+    cmp     count, 256
+    ccmp    val, 0, 0, cs
+    b.eq    L(try_zva)
+L(no_zva):
+    sub     count, dstend, dst        // Count is 16 too large.
+    add     dst, dst, 16
+    sub     count, count, 64 + 16     // Adjust count and bias for loop.
+1:  stp     q0, q0, [dst], 64
+    stp     q0, q0, [dst, -32]
+L(tail64):
+    subs    count, count, 64
+    b.hi    1b
+2:  stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+L(try_zva):
+    mrs     tmp1, dczid_el0
+    tbnz    tmp1w, 4, L(no_zva)
+    and     tmp1w, tmp1w, 15
+    cmp     tmp1w, 4                  // ZVA size is 64 bytes.
+    b.ne    L(zva_128)
+
+    // Write the first and last 64 byte aligned block using stp rather
+    // than using DC ZVA. This is faster on some cores.
+L(zva_64):
+    str     q0, [dst, 16]
+    stp     q0, q0, [dst, 32]
+    bic     dst, dst, 63
+    stp     q0, q0, [dst, 64]
+    stp     q0, q0, [dst, 96]
+    sub     count, dstend, dst        // Count is now 128 too large.
+    sub     count, count, 128+64+64   // Adjust count and bias for loop.
+    add     dst, dst, 128
+    nop
+1:  dc      zva, dst
+    add     dst, dst, 64
+    subs    count, count, 64
+    b.hi    1b
+    stp     q0, q0, [dst, 0]
+    stp     q0, q0, [dst, 32]
+    stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+L(zva_128):
+    cmp     tmp1w, 5                  // ZVA size is 128 bytes.
+    b.ne    L(zva_other)
+
+    str     q0, [dst, 16]
+    stp     q0, q0, [dst, 32]
+    stp     q0, q0, [dst, 64]
+    stp     q0, q0, [dst, 96]
+    bic     dst, dst, 127
+    sub     count, dstend, dst        // Count is now 128 too large.
+    sub     count, count, 128+128     // Adjust count and bias for loop.
+    add     dst, dst, 128
+1:  dc      zva, dst
+    add     dst, dst, 128
+    subs    count, count, 128
+    b.hi    1b
+    stp     q0, q0, [dstend, -128]
+    stp     q0, q0, [dstend, -96]
+    stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+L(zva_other):
+    mov     tmp2w, 4
+    lsl     zva_lenw, tmp2w, tmp1w
+    add     tmp1, zva_len, 64         // Max alignment bytes written.
+    cmp     count, tmp1
+    blo     L(no_zva)
+
+    sub     tmp2, zva_len, 1
+    add     tmp1, dst, zva_len
+    add     dst, dst, 16
+    subs    count, tmp1, dst          // Actual alignment bytes to write.
+    bic     tmp1, tmp1, tmp2          // Aligned dc zva start address.
+    beq     2f
+1:  stp     q0, q0, [dst], 64
+    stp     q0, q0, [dst, -32]
+    subs    count, count, 64
+    b.hi    1b
+2:  mov     dst, tmp1
+    sub     count, dstend, tmp1       // Remaining bytes to write.
+    subs    count, count, zva_len
+    b.lo    4f
+3:  dc      zva, dst
+    add     dst, dst, zva_len
+    subs    count, count, zva_len
+    b.hs    3b
+4:  add     count, count, zva_len
+    b       L(tail64)
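Note: the C sketch below is not part of the commit. It is only a rough model of the dispatch the assembly above performs, added here for orientation: InternalMemSetMem splits fills into tiny (0..15 bytes), medium (16..96 bytes) and long ranges, and for long zero fills of 256 bytes or more it uses the DC ZVA cache-zero instruction when DCZID_EL0 permits it. The name SetMemModel and the ZvaSize parameter are hypothetical; ZvaSize stands in for the block size 4 << (DCZID_EL0 & 0xF), with 0 modelling the case where bit 4 of DCZID_EL0 prohibits DC ZVA.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

void *
SetMemModel (void *Buffer, size_t Count, uint8_t Value, size_t ZvaSize)
{
  uint8_t    *Dst = (uint8_t *)Buffer;
  uintptr_t  Addr = (uintptr_t)Dst;

  if (Count <= 96) {
    //
    // Tiny (0..15) and medium (16..96) fills: the assembly uses a few
    // possibly overlapping stores anchored at both ends, so no loop runs.
    //
    memset (Dst, Value, Count);
    return Buffer;
  }

  if (Value == 0 && Count >= 256 && ZvaSize != 0) {
    //
    // Zero fill via the cache-zero instruction: plain stores up to the
    // first ZvaSize-aligned address, one "dc zva" (modelled by memset)
    // per aligned block, then plain stores for the unaligned tail.
    //
    uintptr_t  First = (Addr + ZvaSize - 1) & ~(uintptr_t)(ZvaSize - 1);
    uintptr_t  Last  = (Addr + Count) & ~(uintptr_t)(ZvaSize - 1);

    if (First < Last) {
      memset (Dst, 0, First - Addr);
      for (uintptr_t Block = First; Block < Last; Block += ZvaSize) {
        memset ((void *)Block, 0, ZvaSize);
      }
      memset ((void *)Last, 0, Addr + Count - Last);
      return Buffer;
    }
  }

  //
  // General long path: the assembly 16-byte-aligns the pointer, streams
  // 64 bytes of SIMD stores per iteration, and finishes with overlapping
  // stores anchored at the end of the buffer.
  //
  memset (Dst, Value, Count);
  return Buffer;
}

For example, SetMemModel (Buf, 4096, 0, 64) follows the same decision path the assembly takes on a core whose DCZID_EL0 reports a 64-byte ZVA block, while a non-zero Value or a prohibited DC ZVA falls through to the ordinary streaming-store path.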