diff options
Diffstat (limited to 'lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S')
-rw-r--r-- | lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S | 125 |
1 files changed, 125 insertions, 0 deletions
diff --git a/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S b/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S new file mode 100644 index 000000000000..fbe09086cd33 --- /dev/null +++ b/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S @@ -0,0 +1,125 @@ +//===----------------------Hexagon builtin routine ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// An optimized version of a memcpy which is equivalent to the following loop: +// +// volatile unsigned *dest; +// unsigned *src; +// +// for (i = 0; i < num_words; ++i) +// *dest++ = *src++; +// +// The corresponding C prototype for this function would be +// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest, +// const unsigned *src, +// unsigned num_words); +// +// *** Both dest and src must be aligned to 32-bit boundaries. *** +// The code does not perform any runtime checks for this, and will fail +// in bad ways if this requirement is not met. +// +// The "forward" in the name refers to the fact that the function copies +// the words going forward in memory. It is incorrect to use this function +// for cases where the original code copied words in any other order. +// +// *** This function is only for the use by the compiler. *** +// The only indended use is for the LLVM compiler to generate calls to +// this function, when a mem-copy loop, like the one above, is detected. + + .text + +// Inputs: +// r0: dest +// r1: src +// r2: num_words + + .globl hexagon_memcpy_forward_vp4cp4n2 + .balign 32 + .type hexagon_memcpy_forward_vp4cp4n2,@function +hexagon_memcpy_forward_vp4cp4n2: + + // Compute r3 to be the number of words remaining in the current page. + // At the same time, compute r4 to be the number of 32-byte blocks + // remaining in the page (for prefetch). + { + r3 = sub(##4096, r1) + r5 = lsr(r2, #3) + } + { + // The word count before end-of-page is in the 12 lowest bits of r3. + // (If the address in r1 was already page-aligned, the bits are 0.) + r3 = extractu(r3, #10, #2) + r4 = extractu(r3, #7, #5) + } + { + r3 = minu(r2, r3) + r4 = minu(r5, r4) + } + { + r4 = or(r4, ##2105344) // 2105344 = 0x202000 + p0 = cmp.eq(r3, #0) + if (p0.new) jump:nt .Lskipprolog + } + l2fetch(r1, r4) + { + loop0(.Lprolog, r3) + r2 = sub(r2, r3) // r2 = number of words left after the prolog. + } + .falign +.Lprolog: + { + r4 = memw(r1++#4) + memw(r0++#4) = r4.new + } :endloop0 +.Lskipprolog: + { + // Let r3 = number of whole pages left (page = 1024 words). + r3 = lsr(r2, #10) + if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain + } + { + loop1(.Lout, r3) + r2 = extractu(r2, #10, #0) // r2 = r2 & 1023 + r3 = ##2105472 // r3 = 0x202080 (prefetch info) + } + // Iterate over pages. + .falign +.Lout: + // Prefetch each individual page. + l2fetch(r1, r3) + loop0(.Lpage, #512) + .falign +.Lpage: + r5:4 = memd(r1++#8) + { + memw(r0++#8) = r4 + memw(r0+#4) = r5 + } :endloop0:endloop1 +.Lskipmain: + { + r3 = ##2105344 // r3 = 0x202000 (prefetch info) + r4 = lsr(r2, #3) // r4 = number of 32-byte blocks remaining. + p0 = cmp.eq(r2, #0) + if (p0.new) jumpr:nt r31 + } + { + r3 = or(r3, r4) + loop0(.Lepilog, r2) + } + l2fetch(r1, r3) + .falign +.Lepilog: + { + r4 = memw(r1++#4) + memw(r0++#4) = r4.new + } :endloop0 + + jumpr r31 + +.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2 |