path: root/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
author      Dimitry Andric <dim@FreeBSD.org>  2023-07-26 19:03:47 +0000
committer   Dimitry Andric <dim@FreeBSD.org>  2023-07-26 19:04:23 +0000
commit      7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree        27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
parent      e3b557809604d036af6e00c60f012c2025b59a5e (diff)
download    src-7fa27ce4a07f19b07799a767fc29416f3b625afb.tar.gz
            src-7fa27ce4a07f19b07799a767fc29416f3b625afb.zip
Diffstat (limited to 'llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp')
-rw-r--r--  llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp  471
1 file changed, 383 insertions, 88 deletions
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 64846484f936..68642a01b37c 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -46,13 +46,10 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
@@ -72,6 +69,7 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
+STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
namespace {
@@ -255,54 +253,6 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
// MemCpyOptLegacyPass Pass
//===----------------------------------------------------------------------===//
-namespace {
-
-class MemCpyOptLegacyPass : public FunctionPass {
- MemCpyOptPass Impl;
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- MemCpyOptLegacyPass() : FunctionPass(ID) {
- initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
-private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char MemCpyOptLegacyPass::ID = 0;
-
-/// The public interface to this file...
-FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
-
-INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
- false, false)
-
// Check that V is either not accessible by the caller, or unwinding cannot
// occur between Start and End.
static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
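With the legacy MemCpyOptLegacyPass wrapper and its INITIALIZE_PASS registration removed above, the transform is now reachable only through the new pass manager (for example via "opt -passes=memcpyopt"). The following is a minimal standalone sketch, not part of this patch, of scheduling the pass that way; runMemCpyOpt is a hypothetical helper name and the rest is the standard new-pass-manager boilerplate:

    // Sketch only: drives MemCpyOptPass through the new pass manager now that
    // the legacy wrapper no longer exists. "runMemCpyOpt" is a hypothetical
    // helper, not an LLVM API.
    #include "llvm/Analysis/CGSCCPassManager.h"
    #include "llvm/Analysis/LoopAnalysisManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
    #include <utility>

    static void runMemCpyOpt(llvm::Module &M) {
      llvm::LoopAnalysisManager LAM;
      llvm::FunctionAnalysisManager FAM;
      llvm::CGSCCAnalysisManager CGAM;
      llvm::ModuleAnalysisManager MAM;

      // Register all analyses, which covers the ones MemCpyOptPass queries
      // (alias analysis, AssumptionCache, DominatorTree, TLI, MemorySSA).
      llvm::PassBuilder PB;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      llvm::ModulePassManager MPM;
      llvm::FunctionPassManager FPM;
      FPM.addPass(llvm::MemCpyOptPass());
      MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM)));
      MPM.run(M, MAM);
    }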
@@ -463,7 +413,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// Check to see if this store is to a constant offset from the start ptr.
std::optional<int64_t> Offset =
- isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
+ NextStore->getPointerOperand()->getPointerOffsetFrom(StartPtr, DL);
if (!Offset)
break;
@@ -477,7 +427,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// Check to see if this store is to a constant offset from the start ptr.
std::optional<int64_t> Offset =
- isPointerOffset(StartPtr, MSI->getDest(), DL);
+ MSI->getDest()->getPointerOffsetFrom(StartPtr, DL);
if (!Offset)
break;
@@ -781,6 +731,23 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
return true;
}
+ // If this is a load-store pair from a stack slot to a stack slot, we
+ // might be able to perform the stack-move optimization just as we do for
+ // memcpys from an alloca to an alloca.
+ if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
+ if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
+ if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
+ DL.getTypeStoreSize(T), BAA)) {
+ // Avoid invalidating the iterator.
+ BBI = SI->getNextNonDebugInstruction()->getIterator();
+ eraseInstruction(SI);
+ eraseInstruction(LI);
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+ }
+
return false;
}
@@ -1200,8 +1167,14 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// still want to eliminate the intermediate value, but we have to generate a
// memmove instead of memcpy.
bool UseMemMove = false;
- if (isModSet(BAA.getModRefInfo(M, MemoryLocation::getForSource(MDep))))
+ if (isModSet(BAA.getModRefInfo(M, MemoryLocation::getForSource(MDep)))) {
+ // Don't convert llvm.memcpy.inline into memmove because memmove can be
+ // lowered as a call, and that is not allowed for llvm.memcpy.inline (and
+ // there is no inline version of llvm.memmove)
+ if (isa<MemCpyInlineInst>(M))
+ return false;
UseMemMove = true;
+ }
// If all checks passed, then we can transform M.
LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
@@ -1246,13 +1219,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
/// In other words, transform:
/// \code
/// memset(dst, c, dst_size);
+/// ...
/// memcpy(dst, src, src_size);
/// \endcode
/// into:
/// \code
-/// memcpy(dst, src, src_size);
+/// ...
/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+/// memcpy(dst, src, src_size);
/// \endcode
+///
+/// The memset is sunk to just before the memcpy to ensure that src_size is
+/// present when emitting the simplified memset.
bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
MemSetInst *MemSet,
BatchAAResults &BAA) {
@@ -1300,6 +1278,15 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
IRBuilder<> Builder(MemCpy);
+ // Preserve the debug location of the old memset for the code emitted here
+ // related to the new memset. This is correct according to the rules in
+ // https://llvm.org/docs/HowToUpdateDebugInfo.html about "when to preserve an
+ // instruction location", given that we move the memset within the basic
+ // block.
+ assert(MemSet->getParent() == MemCpy->getParent() &&
+ "Preserving debug location based on moving memset within BB.");
+ Builder.SetCurrentDebugLocation(MemSet->getDebugLoc());
+
// If the sizes have different types, zext the smaller one.
if (DestSize->getType() != SrcSize->getType()) {
if (DestSize->getType()->getIntegerBitWidth() >
@@ -1323,9 +1310,8 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
"MemCpy must be a MemoryDef");
- // The new memset is inserted after the memcpy, but it is known that its
- // defining access is the memset about to be removed which immediately
- // precedes the memcpy.
+ // The new memset is inserted before the memcpy, and it is known that the
+ // memcpy's defining access is the memset about to be removed.
auto *LastDef =
cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
auto *NewAccess = MSSAU->createMemoryAccessBefore(
@@ -1440,6 +1426,217 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
return true;
}
+// Attempts to optimize the pattern whereby memory is copied from an alloca to
+// another alloca, where the two allocas don't have conflicting mod/ref. If
+// successful, the two allocas can be merged into one and the transfer can be
+// deleted. This pattern is generated frequently in Rust, due to the ubiquity of
+// move operations in that language.
+//
+// Once we determine that the optimization is safe to perform, we replace all
+// uses of the destination alloca with the source alloca. We also "shrink wrap"
+// the lifetime markers of the single merged alloca to before the first use
+// and after the last use. Note that the "shrink wrapping" procedure is a safe
+// transformation only because we restrict the scope of this optimization to
+// allocas that aren't captured.
+bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
+ AllocaInst *DestAlloca,
+ AllocaInst *SrcAlloca, uint64_t Size,
+ BatchAAResults &BAA) {
+ LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
+ << *Store << "\n");
+
+ // Make sure the two allocas are in the same address space.
+ if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
+ LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
+ return false;
+ }
+
+ // 1. Check that copy is full. Calculate the static size of the allocas to be
+ // merged, bail out if we can't.
+ const DataLayout &DL = DestAlloca->getModule()->getDataLayout();
+ std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
+ if (!SrcSize || SrcSize->isScalable() || Size != SrcSize->getFixedValue()) {
+ LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
+ return false;
+ }
+ std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
+ if (!DestSize || DestSize->isScalable() ||
+ Size != DestSize->getFixedValue()) {
+ LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+ return false;
+ }
+
+ // 2-1. Check that src and dest are static allocas, which are not affected by
+ // stacksave/stackrestore.
+ if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca() ||
+ SrcAlloca->getParent() != Load->getParent() ||
+ SrcAlloca->getParent() != Store->getParent())
+ return false;
+
+ // 2-2. Check that src and dest are never captured, unescaped allocas. Also
+ // collect the lifetime markers and the first/last users so the lifetimes can
+ // be shrink-wrapped, and instructions with noalias metadata so it can be removed.
+
+ SmallVector<Instruction *, 4> LifetimeMarkers;
+ Instruction *FirstUser = nullptr, *LastUser = nullptr;
+ SmallSet<Instruction *, 4> NoAliasInstrs;
+
+ // Recursively track the uses and check whether any modifying aliasing access exists.
+ auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool {
+ bool CanBeNull, CanBeFreed;
+ return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
+ };
+
+ auto CaptureTrackingWithModRef =
+ [&](Instruction *AI,
+ function_ref<bool(Instruction *)> ModRefCallback) -> bool {
+ SmallVector<Instruction *, 8> Worklist;
+ Worklist.push_back(AI);
+ unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
+ Worklist.reserve(MaxUsesToExplore);
+ SmallSet<const Use *, 20> Visited;
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+ for (const Use &U : I->uses()) {
+ if (Visited.size() >= MaxUsesToExplore) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Stack Move: Exceeded max uses to see ModRef, bailing\n");
+ return false;
+ }
+ if (!Visited.insert(&U).second)
+ continue;
+ switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) {
+ case UseCaptureKind::MAY_CAPTURE:
+ return false;
+ case UseCaptureKind::PASSTHROUGH:
+ // Instructions cannot have non-instruction users.
+ Worklist.push_back(cast<Instruction>(U.getUser()));
+ continue;
+ case UseCaptureKind::NO_CAPTURE: {
+ auto *UI = cast<Instruction>(U.getUser());
+ if (DestAlloca->getParent() != UI->getParent())
+ return false;
+ if (!FirstUser || UI->comesBefore(FirstUser))
+ FirstUser = UI;
+ if (!LastUser || LastUser->comesBefore(UI))
+ LastUser = UI;
+ if (UI->isLifetimeStartOrEnd()) {
+ // We note the locations of these intrinsic calls so that we can
+ // delete them later if the optimization succeeds; this is safe
+ // since both llvm.lifetime.start and llvm.lifetime.end intrinsics
+ // conceptually fill all the bytes of the alloca with an undefined
+ // value.
+ int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
+ if (Size < 0 || Size == DestSize) {
+ LifetimeMarkers.push_back(UI);
+ continue;
+ }
+ }
+ if (UI->hasMetadata(LLVMContext::MD_noalias))
+ NoAliasInstrs.insert(UI);
+ if (!ModRefCallback(UI))
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+ };
+
+ // 3. Check that dest has no Mod/Ref, except full size lifetime intrinsics,
+ // from the alloca to the Store.
+ ModRefInfo DestModRef = ModRefInfo::NoModRef;
+ MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
+ auto DestModRefCallback = [&](Instruction *UI) -> bool {
+ // We don't care about the store itself.
+ if (UI == Store)
+ return true;
+ ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
+ // FIXME: For multi-BB cases, we need to see reachability from it to
+ // store.
+ // Bailout if Dest may have any ModRef before Store.
+ if (UI->comesBefore(Store) && isModOrRefSet(Res))
+ return false;
+ DestModRef |= BAA.getModRefInfo(UI, DestLoc);
+
+ return true;
+ };
+
+ if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
+ return false;
+
+ // 4. Check that, from after the Load to the end of the BB,
+ // 4-1. if the dest has any Mod, src has no Ref, and
+ // 4-2. if the dest has any Ref, src has no Mod except full-sized lifetimes.
+ MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size));
+
+ auto SrcModRefCallback = [&](Instruction *UI) -> bool {
+ // Any ModRef before the Load doesn't matter; the Load and Store themselves
+ // can also be ignored.
+ if (UI->comesBefore(Load) || UI == Load || UI == Store)
+ return true;
+ ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc);
+ if ((isModSet(DestModRef) && isRefSet(Res)) ||
+ (isRefSet(DestModRef) && isModSet(Res)))
+ return false;
+
+ return true;
+ };
+
+ if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
+ return false;
+
+ // We can do the transformation. First, align the allocas appropriately.
+ SrcAlloca->setAlignment(
+ std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
+
+ // Merge the two allocas.
+ DestAlloca->replaceAllUsesWith(SrcAlloca);
+ eraseInstruction(DestAlloca);
+
+ // Drop metadata on the source alloca.
+ SrcAlloca->dropUnknownNonDebugMetadata();
+
+ // Shrink-wrap the lifetimes, if the original lifetime intrinsics exist.
+ if (!LifetimeMarkers.empty()) {
+ LLVMContext &C = SrcAlloca->getContext();
+ IRBuilder<> Builder(C);
+
+ ConstantInt *AllocaSize = ConstantInt::get(Type::getInt64Ty(C), Size);
+ // Create a new lifetime start marker before the first user of the src
+ // alloca.
+ Builder.SetInsertPoint(FirstUser->getParent(), FirstUser->getIterator());
+ Builder.CreateLifetimeStart(SrcAlloca, AllocaSize);
+
+ // Create a new lifetime end marker after the last user of the src alloca.
+ // FIXME: If the last user is the terminator of the BB, we could insert the
+ // lifetime.end marker in the immediate post-dominator, but currently we do
+ // nothing.
+ if (!LastUser->isTerminator()) {
+ Builder.SetInsertPoint(LastUser->getParent(), ++LastUser->getIterator());
+ Builder.CreateLifetimeEnd(SrcAlloca, AllocaSize);
+ }
+
+ // Remove all other lifetime markers.
+ for (Instruction *I : LifetimeMarkers)
+ eraseInstruction(I);
+ }
+
+ // As this transformation can cause memory accesses that didn't previously
+ // alias to begin to alias one another, we remove !noalias metadata from any
+ // uses of either alloca. This is conservative, but more precision doesn't
+ // seem worthwhile right now.
+ for (Instruction *I : NoAliasInstrs)
+ I->setMetadata(LLVMContext::MD_noalias, nullptr);
+
+ LLVM_DEBUG(dbgs() << "Stack Move: Performed stack-move optimization\n");
+ NumStackMove++;
+ return true;
+}
+
/// Perform simplification of memcpy's. If we have memcpy A
/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
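For context, this is roughly the source-level shape performStackMoveOptzn targets. The comments above cite Rust moves as the common producer; the C++ analogue below is a hypothetical illustration (all names invented), not code from this patch:

    // Hypothetical example: a frontend typically lowers the aggregate copy
    // below into two stack slots (allocas) plus a memcpy between them. Once
    // the copy is recognized as a "stack move", both locals can share one
    // alloca and the memcpy is deleted.
    struct Big { char Bytes[128]; };

    Big makeBig();         // assumed producer
    void consume(Big B);   // assumed consumer

    void caller() {
      Big Tmp = makeBig(); // alloca #1
      Big Arg = Tmp;       // alloca #2, filled by a memcpy from Tmp
      consume(Arg);        // Tmp is never read again, so the slots can merge
    }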
@@ -1484,8 +1681,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc, BAA);
// Try to turn a partially redundant memset + memcpy into
- // memcpy + smaller memset. We don't need the memcpy size for this.
- // The memcpy most post-dom the memset, so limit this to the same basic
+ // smaller memset + memcpy. We don't need the memcpy size for this.
+ // The memcpy must post-dom the memset, so limit this to the same basic
// block. A non-local generalization is likely not worthwhile.
if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
@@ -1496,13 +1693,14 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
AnyClobber, MemoryLocation::getForSource(M), BAA);
- // There are four possible optimizations we can do for memcpy:
+ // There are five possible optimizations we can do for memcpy:
// a) memcpy-memcpy xform which exposes redundance for DSE.
// b) call-memcpy xform for return slot optimization.
// c) memcpy from freshly alloca'd space or space that has just started
// its lifetime copies undefined data, and we can therefore eliminate
// the memcpy in favor of the data that was already at the destination.
// d) memcpy from a just-memset'd source can be turned into memset.
+ // e) elimination of memcpy via stack-move optimization.
if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
if (Instruction *MI = MD->getMemoryInst()) {
if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
@@ -1521,7 +1719,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
}
if (auto *MDep = dyn_cast<MemCpyInst>(MI))
- return processMemCpyMemCpyDependence(M, MDep, BAA);
+ if (processMemCpyMemCpyDependence(M, MDep, BAA))
+ return true;
if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
@@ -1540,6 +1739,27 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
}
+ // If the transfer is from a stack slot to a stack slot, then we may be able
+ // to perform the stack-move optimization. See the comments in
+ // performStackMoveOptzn() for more details.
+ auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
+ if (!DestAlloca)
+ return false;
+ auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
+ if (!SrcAlloca)
+ return false;
+ ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
+ if (Len == nullptr)
+ return false;
+ if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue(),
+ BAA)) {
+ // Avoid invalidating the iterator.
+ BBI = M->getNextNonDebugInstruction()->getIterator();
+ eraseInstruction(M);
+ ++NumMemCpyInstr;
+ return true;
+ }
+
return false;
}
@@ -1623,24 +1843,110 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
// foo(*a)
// It would be invalid to transform the second memcpy into foo(*b).
if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
- MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
+ MSSA->getMemoryAccess(MDep), CallAccess))
return false;
- Value *TmpCast = MDep->getSource();
- if (MDep->getSource()->getType() != ByValArg->getType()) {
- BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
- "tmpcast", &CB);
- // Set the tmpcast's DebugLoc to MDep's
- TmpBitCast->setDebugLoc(MDep->getDebugLoc());
- TmpCast = TmpBitCast;
- }
-
LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
<< " " << *MDep << "\n"
<< " " << CB << "\n");
// Otherwise we're good! Update the byval argument.
- CB.setArgOperand(ArgNo, TmpCast);
+ CB.setArgOperand(ArgNo, MDep->getSource());
+ ++NumMemCpyInstr;
+ return true;
+}
+
+/// This is called on memcpy dest pointer arguments attributed as immutable
+/// during the call. Try to use the memcpy source directly if all of the following
+/// conditions are satisfied.
+/// 1. The memcpy dst is neither modified during the call nor captured by the
+/// call. (if readonly, noalias, nocapture attributes on call-site.)
+/// 2. The memcpy dst is an alloca with known alignment & size.
+/// 2-1. The memcpy length == the alloca size, which ensures that the new
+/// pointer is dereferenceable for the required range.
+/// 2-2. The src pointer has alignment >= the alloca alignment or can be
+/// enforced so.
+/// 3. The memcpy dst and src are not modified between the memcpy and the call.
+/// (if MSSA clobber check is safe.)
+/// 4. The memcpy src is not modified during the call. (ModRef check shows no
+/// Mod.)
+bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) {
+ // 1. Ensure passed argument is immutable during call.
+ if (!(CB.paramHasAttr(ArgNo, Attribute::NoAlias) &&
+ CB.paramHasAttr(ArgNo, Attribute::NoCapture)))
+ return false;
+ const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
+ Value *ImmutArg = CB.getArgOperand(ArgNo);
+
+ // 2. Check that arg is alloca
+ // TODO: Even if the arg gets back to branches, we can remove memcpy if all
+ // the alloca alignments can be enforced to source alignment.
+ auto *AI = dyn_cast<AllocaInst>(ImmutArg->stripPointerCasts());
+ if (!AI)
+ return false;
+
+ std::optional<TypeSize> AllocaSize = AI->getAllocationSize(DL);
+ // Can't handle unknown size alloca.
+ // (e.g. Variable Length Array, Scalable Vector)
+ if (!AllocaSize || AllocaSize->isScalable())
+ return false;
+ MemoryLocation Loc(ImmutArg, LocationSize::precise(*AllocaSize));
+ MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
+ if (!CallAccess)
+ return false;
+
+ MemCpyInst *MDep = nullptr;
+ BatchAAResults BAA(*AA);
+ MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+ CallAccess->getDefiningAccess(), Loc, BAA);
+ if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+ MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
+
+ // If the immut argument isn't fed by a memcpy, ignore it. If it is fed by
+ // a memcpy, check that the arg equals the memcpy dest.
+ if (!MDep || MDep->isVolatile() || AI != MDep->getDest())
+ return false;
+
+ // The address space of the memcpy source must match the immut argument
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ImmutArg->getType()->getPointerAddressSpace())
+ return false;
+
+ // 2-1. The length of the memcpy must be equal to the size of the alloca.
+ auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+ if (!MDepLen || AllocaSize != MDepLen->getValue())
+ return false;
+
+ // 2-2. The memcpy source alignment must be greater than or equal to the
+ // alloca's alignment. If not, we check whether we can force the source of
+ // the memcpy to the alignment we need; if we fail, we bail out.
+ Align MemDepAlign = MDep->getSourceAlign().valueOrOne();
+ Align AllocaAlign = AI->getAlign();
+ if (MemDepAlign < AllocaAlign &&
+ getOrEnforceKnownAlignment(MDep->getSource(), AllocaAlign, DL, &CB, AC,
+ DT) < AllocaAlign)
+ return false;
+
+ // 3. Verify that the source doesn't change in between the memcpy and
+ // the call.
+ // memcpy(a <- b)
+ // *b = 42;
+ // foo(*a)
+ // It would be invalid to rewrite the call to read from b (the memcpy source).
+ if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
+ MSSA->getMemoryAccess(MDep), CallAccess))
+ return false;
+
+ // 4. The memcpy src must not be modified during the call.
+ if (isModSet(AA->getModRefInfo(&CB, MemoryLocation::getForSource(MDep))))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to Immut src:\n"
+ << " " << *MDep << "\n"
+ << " " << CB << "\n");
+
+ // Otherwise we're good! Update the immut argument.
+ CB.setArgOperand(ArgNo, MDep->getSource());
++NumMemCpyInstr;
return true;
}
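Condition 1 above, together with the onlyReadsMemory guard added to iterateOnFunction below, amounts to a simple attribute predicate on the call site. A small sketch of an equivalent stand-alone check, with a hypothetical name, using the same CallBase queries as the patch:

    // Hypothetical helper (not part of the patch): true when the ArgNo-th
    // argument is readonly, noalias and nocapture at this call site, i.e. the
    // callee can neither write through the pointer nor retain it, which is the
    // precondition processImmutArgument() relies on before forwarding the
    // memcpy source into the call.
    #include "llvm/IR/InstrTypes.h"

    static bool isImmutableAtCall(const llvm::CallBase &CB, unsigned ArgNo) {
      using llvm::Attribute;
      return CB.onlyReadsMemory(ArgNo) &&
             CB.paramHasAttr(ArgNo, Attribute::NoAlias) &&
             CB.paramHasAttr(ArgNo, Attribute::NoCapture);
    }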
@@ -1673,9 +1979,12 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
else if (auto *M = dyn_cast<MemMoveInst>(I))
RepeatInstruction = processMemMove(M);
else if (auto *CB = dyn_cast<CallBase>(I)) {
- for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
+ for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) {
if (CB->isByValArgument(i))
MadeChange |= processByValArgument(*CB, i);
+ else if (CB->onlyReadsMemory(i))
+ MadeChange |= processImmutArgument(*CB, i);
+ }
}
// Reprocess the instruction if desired.
@@ -1730,17 +2039,3 @@ bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
return MadeChange;
}
-
-/// This is the main transformation entry point for a function.
-bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
-
- return Impl.runImpl(F, TLI, AA, AC, DT, MSSA);
-}