aboutsummaryrefslogtreecommitdiff
path: root/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S')
-rw-r--r--lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S125
1 files changed, 125 insertions, 0 deletions
diff --git a/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S b/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S
new file mode 100644
index 000000000000..fbe09086cd33
--- /dev/null
+++ b/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S
@@ -0,0 +1,125 @@
+//===----------------------Hexagon builtin routine ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// An optimized version of a memcpy which is equivalent to the following loop:
+//
+// volatile unsigned *dest;
+// unsigned *src;
+//
+// for (i = 0; i < num_words; ++i)
+// *dest++ = *src++;
+//
+// The corresponding C prototype for this function would be
+// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
+// const unsigned *src,
+// unsigned num_words);
+//
+// *** Both dest and src must be aligned to 32-bit boundaries. ***
+// The code does not perform any runtime checks for this, and will fail
+// in bad ways if this requirement is not met.
+//
+// The "forward" in the name refers to the fact that the function copies
+// the words going forward in memory. It is incorrect to use this function
+// for cases where the original code copied words in any other order.
+//
+// *** This function is only for the use by the compiler. ***
+// The only indended use is for the LLVM compiler to generate calls to
+// this function, when a mem-copy loop, like the one above, is detected.
+
+ .text
+
+// Inputs:
+// r0: dest
+// r1: src
+// r2: num_words
+
+ .globl hexagon_memcpy_forward_vp4cp4n2
+ .balign 32
+ .type hexagon_memcpy_forward_vp4cp4n2,@function
+hexagon_memcpy_forward_vp4cp4n2:
+
+ // Compute r3 to be the number of words remaining in the current page.
+ // At the same time, compute r4 to be the number of 32-byte blocks
+ // remaining in the page (for prefetch).
+ {
+ r3 = sub(##4096, r1)
+ r5 = lsr(r2, #3)
+ }
+ {
+ // The word count before end-of-page is in the 12 lowest bits of r3.
+ // (If the address in r1 was already page-aligned, the bits are 0.)
+ r3 = extractu(r3, #10, #2)
+ r4 = extractu(r3, #7, #5)
+ }
+ {
+ r3 = minu(r2, r3)
+ r4 = minu(r5, r4)
+ }
+ {
+ r4 = or(r4, ##2105344) // 2105344 = 0x202000
+ p0 = cmp.eq(r3, #0)
+ if (p0.new) jump:nt .Lskipprolog
+ }
+ l2fetch(r1, r4)
+ {
+ loop0(.Lprolog, r3)
+ r2 = sub(r2, r3) // r2 = number of words left after the prolog.
+ }
+ .falign
+.Lprolog:
+ {
+ r4 = memw(r1++#4)
+ memw(r0++#4) = r4.new
+ } :endloop0
+.Lskipprolog:
+ {
+ // Let r3 = number of whole pages left (page = 1024 words).
+ r3 = lsr(r2, #10)
+ if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
+ }
+ {
+ loop1(.Lout, r3)
+ r2 = extractu(r2, #10, #0) // r2 = r2 & 1023
+ r3 = ##2105472 // r3 = 0x202080 (prefetch info)
+ }
+ // Iterate over pages.
+ .falign
+.Lout:
+ // Prefetch each individual page.
+ l2fetch(r1, r3)
+ loop0(.Lpage, #512)
+ .falign
+.Lpage:
+ r5:4 = memd(r1++#8)
+ {
+ memw(r0++#8) = r4
+ memw(r0+#4) = r5
+ } :endloop0:endloop1
+.Lskipmain:
+ {
+ r3 = ##2105344 // r3 = 0x202000 (prefetch info)
+ r4 = lsr(r2, #3) // r4 = number of 32-byte blocks remaining.
+ p0 = cmp.eq(r2, #0)
+ if (p0.new) jumpr:nt r31
+ }
+ {
+ r3 = or(r3, r4)
+ loop0(.Lepilog, r2)
+ }
+ l2fetch(r1, r3)
+ .falign
+.Lepilog:
+ {
+ r4 = memw(r1++#4)
+ memw(r0++#4) = r4.new
+ } :endloop0
+
+ jumpr r31
+
+.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2