author      Dimitry Andric <dim@FreeBSD.org>    2023-07-26 19:03:47 +0000
committer   Dimitry Andric <dim@FreeBSD.org>    2023-07-26 19:04:23 +0000
commit      7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree        27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
parent      e3b557809604d036af6e00c60f012c2025b59a5e (diff)
download    src-7fa27ce4a07f19b07799a767fc29416f3b625afb.tar.gz
            src-7fa27ce4a07f19b07799a767fc29416f3b625afb.zip
Diffstat (limited to 'llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp')
-rw-r--r--   llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp   471
1 file changed, 383 insertions, 88 deletions
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 64846484f936..68642a01b37c 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -46,13 +46,10 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 #include <cassert>
@@ -72,6 +69,7 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred");
 STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
 STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
+STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
 
 namespace {
 
@@ -255,54 +253,6 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
 //                         MemCpyOptLegacyPass Pass
 //===----------------------------------------------------------------------===//
 
-namespace {
-
-class MemCpyOptLegacyPass : public FunctionPass {
-  MemCpyOptPass Impl;
-public:
-  static char ID; // Pass identification, replacement for typeid
-
-  MemCpyOptLegacyPass() : FunctionPass(ID) {
-    initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override;
-private:
-  // This transformation requires dominator postdominator info
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
-    AU.addPreserved<GlobalsAAWrapperPass>();
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addRequired<AAResultsWrapperPass>();
-    AU.addPreserved<AAResultsWrapperPass>();
-    AU.addRequired<MemorySSAWrapperPass>();
-    AU.addPreserved<MemorySSAWrapperPass>();
-  }
-};
-
-} // end anonymous namespace
-
-char MemCpyOptLegacyPass::ID = 0;
-
-/// The public interface to this file...
-FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
-
-INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
-                    false, false)
-
 // Check that V is either not accessible by the caller, or unwinding cannot
 // occur between Start and End.
 static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
@@ -463,7 +413,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
 
       // Check to see if this store is to a constant offset from the start ptr.
       std::optional<int64_t> Offset =
-          isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
+          NextStore->getPointerOperand()->getPointerOffsetFrom(StartPtr, DL);
       if (!Offset)
         break;
 
@@ -477,7 +427,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
 
      // Check to see if this store is to a constant offset from the start ptr.
      std::optional<int64_t> Offset =
-         isPointerOffset(StartPtr, MSI->getDest(), DL);
+         MSI->getDest()->getPointerOffsetFrom(StartPtr, DL);
      if (!Offset)
        break;
 
@@ -781,6 +731,23 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
     return true;
   }
 
+  // If this is a load-store pair from a stack slot to a stack slot, we
+  // might be able to perform the stack-move optimization just as we do for
+  // memcpys from an alloca to an alloca.
+  if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
+    if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
+      if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
+                                DL.getTypeStoreSize(T), BAA)) {
+        // Avoid invalidating the iterator.
+        BBI = SI->getNextNonDebugInstruction()->getIterator();
+        eraseInstruction(SI);
+        eraseInstruction(LI);
+        ++NumMemCpyInstr;
+        return true;
+      }
+    }
+  }
+
   return false;
 }
 
@@ -1200,8 +1167,14 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   // still want to eliminate the intermediate value, but we have to generate a
   // memmove instead of memcpy.
   bool UseMemMove = false;
-  if (isModSet(BAA.getModRefInfo(M, MemoryLocation::getForSource(MDep))))
+  if (isModSet(BAA.getModRefInfo(M, MemoryLocation::getForSource(MDep)))) {
+    // Don't convert llvm.memcpy.inline into memmove because memmove can be
+    // lowered as a call, and that is not allowed for llvm.memcpy.inline (and
+    // there is no inline version of llvm.memmove)
+    if (isa<MemCpyInlineInst>(M))
+      return false;
     UseMemMove = true;
+  }
 
   // If all checks passed, then we can transform M.
   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
@@ -1246,13 +1219,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
 /// In other words, transform:
 /// \code
 ///   memset(dst, c, dst_size);
+///   ...
 ///   memcpy(dst, src, src_size);
 /// \endcode
 /// into:
 /// \code
-///   memcpy(dst, src, src_size);
+///   ...
 ///   memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+///   memcpy(dst, src, src_size);
 /// \endcode
+///
+/// The memset is sunk to just before the memcpy to ensure that src_size is
+/// present when emitting the simplified memset.
 bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
                                                   MemSetInst *MemSet,
                                                   BatchAAResults &BAA) {
@@ -1300,6 +1278,15 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
 
   IRBuilder<> Builder(MemCpy);
 
+  // Preserve the debug location of the old memset for the code emitted here
+  // related to the new memset. This is correct according to the rules in
+  // https://llvm.org/docs/HowToUpdateDebugInfo.html about "when to preserve an
+  // instruction location", given that we move the memset within the basic
+  // block.
+  assert(MemSet->getParent() == MemCpy->getParent() &&
+         "Preserving debug location based on moving memset within BB.");
+  Builder.SetCurrentDebugLocation(MemSet->getDebugLoc());
+
   // If the sizes have different types, zext the smaller one.
   if (DestSize->getType() != SrcSize->getType()) {
     if (DestSize->getType()->getIntegerBitWidth() >
@@ -1323,9 +1310,8 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
   assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
          "MemCpy must be a MemoryDef");
 
-  // The new memset is inserted after the memcpy, but it is known that its
-  // defining access is the memset about to be removed which immediately
-  // precedes the memcpy.
+  // The new memset is inserted before the memcpy, and it is known that the
+  // memcpy's defining access is the memset about to be removed.
   auto *LastDef =
       cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
   auto *NewAccess = MSSAU->createMemoryAccessBefore(
@@ -1440,6 +1426,217 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
   return true;
 }
 
+// Attempts to optimize the pattern whereby memory is copied from an alloca to
+// another alloca, where the two allocas don't have conflicting mod/ref. If
+// successful, the two allocas can be merged into one and the transfer can be
+// deleted. This pattern is generated frequently in Rust, due to the ubiquity of
+// move operations in that language.
+//
+// Once we determine that the optimization is safe to perform, we replace all
+// uses of the destination alloca with the source alloca. We also "shrink wrap"
+// the lifetime markers of the single merged alloca to before the first use
+// and after the last use. Note that the "shrink wrapping" procedure is a safe
+// transformation only because we restrict the scope of this optimization to
+// allocas that aren't captured.
+bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
+                                          AllocaInst *DestAlloca,
+                                          AllocaInst *SrcAlloca, uint64_t Size,
+                                          BatchAAResults &BAA) {
+  LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
+                    << *Store << "\n");
+
+  // Make sure the two allocas are in the same address space.
+  if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
+    return false;
+  }
+
+  // 1. Check that copy is full. Calculate the static size of the allocas to be
+  // merged, bail out if we can't.
+  const DataLayout &DL = DestAlloca->getModule()->getDataLayout();
+  std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
+  if (!SrcSize || SrcSize->isScalable() || Size != SrcSize->getFixedValue()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
+    return false;
+  }
+  std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
+  if (!DestSize || DestSize->isScalable() ||
+      Size != DestSize->getFixedValue()) {
+    LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
+    return false;
+  }
+
+  // 2-1. Check that src and dest are static allocas, which are not affected by
+  // stacksave/stackrestore.
+  if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca() ||
+      SrcAlloca->getParent() != Load->getParent() ||
+      SrcAlloca->getParent() != Store->getParent())
+    return false;
+
+  // 2-2. Check that src and dest are never captured, unescaped allocas. Also
+  // collect lifetime markers first/last users in order to shrink wrap the
+  // lifetimes, and instructions with noalias metadata to remove them.
+
+  SmallVector<Instruction *, 4> LifetimeMarkers;
+  Instruction *FirstUser = nullptr, *LastUser = nullptr;
+  SmallSet<Instruction *, 4> NoAliasInstrs;
+
+  // Recursively track the user and check whether modified alias exist.
+  auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool {
+    bool CanBeNull, CanBeFreed;
+    return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
+  };
+
+  auto CaptureTrackingWithModRef =
+      [&](Instruction *AI,
+          function_ref<bool(Instruction *)> ModRefCallback) -> bool {
+    SmallVector<Instruction *, 8> Worklist;
+    Worklist.push_back(AI);
+    unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
+    Worklist.reserve(MaxUsesToExplore);
+    SmallSet<const Use *, 20> Visited;
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.back();
+      Worklist.pop_back();
+      for (const Use &U : I->uses()) {
+        if (Visited.size() >= MaxUsesToExplore) {
+          LLVM_DEBUG(
+              dbgs()
+              << "Stack Move: Exceeded max uses to see ModRef, bailing\n");
+          return false;
+        }
+        if (!Visited.insert(&U).second)
+          continue;
+        switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) {
+        case UseCaptureKind::MAY_CAPTURE:
+          return false;
+        case UseCaptureKind::PASSTHROUGH:
+          // Instructions cannot have non-instruction users.
+          Worklist.push_back(cast<Instruction>(U.getUser()));
+          continue;
+        case UseCaptureKind::NO_CAPTURE: {
+          auto *UI = cast<Instruction>(U.getUser());
+          if (DestAlloca->getParent() != UI->getParent())
+            return false;
+          if (!FirstUser || UI->comesBefore(FirstUser))
+            FirstUser = UI;
+          if (!LastUser || LastUser->comesBefore(UI))
+            LastUser = UI;
+          if (UI->isLifetimeStartOrEnd()) {
+            // We note the locations of these intrinsic calls so that we can
+            // delete them later if the optimization succeeds, this is safe
+            // since both llvm.lifetime.start and llvm.lifetime.end intrinsics
+            // conceptually fill all the bytes of the alloca with an undefined
+            // value.
+            int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
+            if (Size < 0 || Size == DestSize) {
+              LifetimeMarkers.push_back(UI);
+              continue;
+            }
+          }
+          if (UI->hasMetadata(LLVMContext::MD_noalias))
+            NoAliasInstrs.insert(UI);
+          if (!ModRefCallback(UI))
+            return false;
+        }
+        }
+      }
+    }
+    return true;
+  };
+
+  // 3. Check that dest has no Mod/Ref, except full size lifetime intrinsics,
+  // from the alloca to the Store.
+  ModRefInfo DestModRef = ModRefInfo::NoModRef;
+  MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
+  auto DestModRefCallback = [&](Instruction *UI) -> bool {
+    // We don't care about the store itself.
+    if (UI == Store)
+      return true;
+    ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
+    // FIXME: For multi-BB cases, we need to see reachability from it to
+    // store.
+    // Bailout if Dest may have any ModRef before Store.
+    if (UI->comesBefore(Store) && isModOrRefSet(Res))
+      return false;
+    DestModRef |= BAA.getModRefInfo(UI, DestLoc);
+
+    return true;
+  };
+
+  if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
+    return false;
+
+  // 3. Check that, from after the Load to the end of the BB,
+  // 3-1. if the dest has any Mod, src has no Ref, and
+  // 3-2. if the dest has any Ref, src has no Mod except full-sized lifetimes.
+  MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size));
+
+  auto SrcModRefCallback = [&](Instruction *UI) -> bool {
+    // Any ModRef before Load doesn't matter, also Load and Store can be
+    // ignored.
+    if (UI->comesBefore(Load) || UI == Load || UI == Store)
+      return true;
+    ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc);
+    if ((isModSet(DestModRef) && isRefSet(Res)) ||
+        (isRefSet(DestModRef) && isModSet(Res)))
+      return false;
+
+    return true;
+  };
+
+  if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
+    return false;
+
+  // We can do the transformation. First, align the allocas appropriately.
+  SrcAlloca->setAlignment(
+      std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
+
+  // Merge the two allocas.
+  DestAlloca->replaceAllUsesWith(SrcAlloca);
+  eraseInstruction(DestAlloca);
+
+  // Drop metadata on the source alloca.
+  SrcAlloca->dropUnknownNonDebugMetadata();
+
+  // Do "shrink wrap" the lifetimes, if the original lifetime intrinsics exists.
+  if (!LifetimeMarkers.empty()) {
+    LLVMContext &C = SrcAlloca->getContext();
+    IRBuilder<> Builder(C);
+
+    ConstantInt *AllocaSize = ConstantInt::get(Type::getInt64Ty(C), Size);
+    // Create a new lifetime start marker before the first user of src or alloca
+    // users.
+    Builder.SetInsertPoint(FirstUser->getParent(), FirstUser->getIterator());
+    Builder.CreateLifetimeStart(SrcAlloca, AllocaSize);
+
+    // Create a new lifetime end marker after the last user of src or alloca
+    // users.
+    // FIXME: If the last user is the terminator for the bb, we can insert
+    // lifetime.end marker to the immidiate post-dominator, but currently do
+    // nothing.
+    if (!LastUser->isTerminator()) {
+      Builder.SetInsertPoint(LastUser->getParent(), ++LastUser->getIterator());
+      Builder.CreateLifetimeEnd(SrcAlloca, AllocaSize);
+    }
+
+    // Remove all other lifetime markers.
+    for (Instruction *I : LifetimeMarkers)
+      eraseInstruction(I);
+  }
+
+  // As this transformation can cause memory accesses that didn't previously
+  // alias to begin to alias one another, we remove !noalias metadata from any
+  // uses of either alloca. This is conservative, but more precision doesn't
+  // seem worthwhile right now.
+  for (Instruction *I : NoAliasInstrs)
+    I->setMetadata(LLVMContext::MD_noalias, nullptr);
+
+  LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n");
+  NumStackMove++;
+  return true;
+}
+
 /// Perform simplification of memcpy's. If we have memcpy A
 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
@@ -1484,8 +1681,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
         MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc, BAA);
 
     // Try to turn a partially redundant memset + memcpy into
-    // memcpy + smaller memset. We don't need the memcpy size for this.
-    // The memcpy most post-dom the memset, so limit this to the same basic
+    // smaller memset + memcpy. We don't need the memcpy size for this.
+    // The memcpy must post-dom the memset, so limit this to the same basic
    // block. A non-local generalization is likely not worthwhile.
    if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
      if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
@@ -1496,13 +1693,14 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
   MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
       AnyClobber, MemoryLocation::getForSource(M), BAA);
 
-  // There are four possible optimizations we can do for memcpy:
+  // There are five possible optimizations we can do for memcpy:
   // a) memcpy-memcpy xform which exposes redundance for DSE.
   // b) call-memcpy xform for return slot optimization.
   // c) memcpy from freshly alloca'd space or space that has just started
   //    its lifetime copies undefined data, and we can therefore eliminate
   //    the memcpy in favor of the data that was already at the destination.
   // d) memcpy from a just-memset'd source can be turned into memset.
+  // e) elimination of memcpy via stack-move optimization.
   if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
     if (Instruction *MI = MD->getMemoryInst()) {
       if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
@@ -1521,7 +1719,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
         }
       }
       if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-        return processMemCpyMemCpyDependence(M, MDep, BAA);
+        if (processMemCpyMemCpyDependence(M, MDep, BAA))
+          return true;
       if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
         if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
           LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
@@ -1540,6 +1739,27 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
     }
   }
 
+  // If the transfer is from a stack slot to a stack slot, then we may be able
+  // to perform the stack-move optimization. See the comments in
+  // performStackMoveOptzn() for more details.
+  auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
+  if (!DestAlloca)
+    return false;
+  auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
+  if (!SrcAlloca)
+    return false;
+  ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
+  if (Len == nullptr)
+    return false;
+  if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue(),
+                            BAA)) {
+    // Avoid invalidating the iterator.
+    BBI = M->getNextNonDebugInstruction()->getIterator();
+    eraseInstruction(M);
+    ++NumMemCpyInstr;
+    return true;
+  }
+
   return false;
 }
 
@@ -1623,24 +1843,110 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
   //    foo(*a)
   // It would be invalid to transform the second memcpy into foo(*b).
   if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
-                     MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
+                     MSSA->getMemoryAccess(MDep), CallAccess))
     return false;
 
-  Value *TmpCast = MDep->getSource();
-  if (MDep->getSource()->getType() != ByValArg->getType()) {
-    BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
-                                              "tmpcast", &CB);
-    // Set the tmpcast's DebugLoc to MDep's
-    TmpBitCast->setDebugLoc(MDep->getDebugLoc());
-    TmpCast = TmpBitCast;
-  }
-
   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
                     << "  " << *MDep << "\n"
                     << "  " << CB << "\n");
 
   // Otherwise we're good! Update the byval argument.
-  CB.setArgOperand(ArgNo, TmpCast);
+  CB.setArgOperand(ArgNo, MDep->getSource());
+  ++NumMemCpyInstr;
+  return true;
+}
+
+/// This is called on memcpy dest pointer arguments attributed as immutable
+/// during call. Try to use memcpy source directly if all of the following
+/// conditions are satisfied.
+/// 1. The memcpy dst is neither modified during the call nor captured by the
+///    call. (if readonly, noalias, nocapture attributes on call-site.)
+/// 2. The memcpy dst is an alloca with known alignment & size.
+///    2-1. The memcpy length == the alloca size which ensures that the new
+///         pointer is dereferenceable for the required range
+///    2-2. The src pointer has alignment >= the alloca alignment or can be
+///         enforced so.
+/// 3. The memcpy dst and src is not modified between the memcpy and the call.
+///    (if MSSA clobber check is safe.)
+/// 4. The memcpy src is not modified during the call. (ModRef check shows no
+///    Mod.)
+bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) {
+  // 1. Ensure passed argument is immutable during call.
+  if (!(CB.paramHasAttr(ArgNo, Attribute::NoAlias) &&
+        CB.paramHasAttr(ArgNo, Attribute::NoCapture)))
+    return false;
+  const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
+  Value *ImmutArg = CB.getArgOperand(ArgNo);
+
+  // 2. Check that arg is alloca
+  // TODO: Even if the arg gets back to branches, we can remove memcpy if all
+  // the alloca alignments can be enforced to source alignment.
+  auto *AI = dyn_cast<AllocaInst>(ImmutArg->stripPointerCasts());
+  if (!AI)
+    return false;
+
+  std::optional<TypeSize> AllocaSize = AI->getAllocationSize(DL);
+  // Can't handle unknown size alloca.
+  // (e.g. Variable Length Array, Scalable Vector)
+  if (!AllocaSize || AllocaSize->isScalable())
+    return false;
+  MemoryLocation Loc(ImmutArg, LocationSize::precise(*AllocaSize));
+  MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
+  if (!CallAccess)
+    return false;
+
+  MemCpyInst *MDep = nullptr;
+  BatchAAResults BAA(*AA);
+  MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+      CallAccess->getDefiningAccess(), Loc, BAA);
+  if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+    MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
+
+  // If the immut argument isn't fed by a memcpy, ignore it. If it is fed by
+  // a memcpy, check that the arg equals the memcpy dest.
+  if (!MDep || MDep->isVolatile() || AI != MDep->getDest())
+    return false;
+
+  // The address space of the memcpy source must match the immut argument
+  if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+      ImmutArg->getType()->getPointerAddressSpace())
+    return false;
+
+  // 2-1. The length of the memcpy must be equal to the size of the alloca.
+  auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+  if (!MDepLen || AllocaSize != MDepLen->getValue())
+    return false;
+
+  // 2-2. the memcpy source align must be larger than or equal the alloca's
+  // align. If not so, we check to see if we can force the source of the memcpy
+  // to the alignment we need. If we fail, we bail out.
+  Align MemDepAlign = MDep->getSourceAlign().valueOrOne();
+  Align AllocaAlign = AI->getAlign();
+  if (MemDepAlign < AllocaAlign &&
+      getOrEnforceKnownAlignment(MDep->getSource(), AllocaAlign, DL, &CB, AC,
+                                 DT) < AllocaAlign)
+    return false;
+
+  // 3. Verify that the source doesn't change in between the memcpy and
+  // the call.
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    foo(*a)
+  // It would be invalid to transform the second memcpy into foo(*b).
+  if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
+                     MSSA->getMemoryAccess(MDep), CallAccess))
+    return false;
+
+  // 4. The memcpy src must not be modified during the call.
+  if (isModSet(AA->getModRefInfo(&CB, MemoryLocation::getForSource(MDep))))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to Immut src:\n"
+                    << "  " << *MDep << "\n"
+                    << "  " << CB << "\n");
+
+  // Otherwise we're good! Update the immut argument.
+  CB.setArgOperand(ArgNo, MDep->getSource());
   ++NumMemCpyInstr;
   return true;
 }
@@ -1673,9 +1979,12 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
     else if (auto *M = dyn_cast<MemMoveInst>(I))
       RepeatInstruction = processMemMove(M);
     else if (auto *CB = dyn_cast<CallBase>(I)) {
-      for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
+      for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) {
         if (CB->isByValArgument(i))
          MadeChange |= processByValArgument(*CB, i);
+        else if (CB->onlyReadsMemory(i))
+          MadeChange |= processImmutArgument(*CB, i);
+      }
     }
 
     // Reprocess the instruction if desired.
@@ -1730,17 +2039,3 @@ bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
 
   return MadeChange;
 }
-
-/// This is the main transformation entry point for a function.
-bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
-
-  return Impl.runImpl(F, TLI, AA, AC, DT, MSSA);
-}
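
The processMemSetMemCpyDependence() change in this patch keeps rewriting a memset that is partially overwritten by a later memcpy so that only the uncovered tail is still set, and it now sinks that smaller memset to just before the memcpy. A worked instance with concrete sizes may help; the numbers and function names below are illustrative only and are not taken from the commit.

  #include <cstring>

  // Before: the memset fills 128 bytes, but the memcpy then overwrites the
  // first 96 of them, so only the last 32 bytes really need the fill value.
  void before(char *dst, const char *src, char c) {
    std::memset(dst, c, 128);  // dst_size = 128
    std::memcpy(dst, src, 96); // src_size = 96
  }

  // After: dst_size - src_size = 32 trailing bytes keep the fill value, and
  // the memset is emitted immediately before the memcpy so that src_size is
  // still available at that point.
  void after(char *dst, const char *src, char c) {
    std::memset(dst + 96, c, 32);
    std::memcpy(dst, src, 96);
  }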
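The new performStackMoveOptzn() targets full-size copies between two static, non-captured allocas in the same basic block: the destination alloca is replaced by the source alloca, the copy disappears, and the merged slot's lifetime markers are shrink-wrapped around its first and last use. The C++ sketch below only illustrates the kind of source pattern that lowers to such an alloca-to-alloca memcpy (Rust move semantics produce it frequently, per the comment in the patch); the type and helper names are made up for this note.

  struct Big {
    long Data[32];
  };

  long consume(const Big &B); // hypothetical callee; it never captures &B

  long example(const Big &In) {
    Big Src = In;        // first stack slot
    Big Dst = Src;       // second stack slot; the copy becomes a memcpy of
                         // sizeof(Big) from one alloca to the other
    return consume(Dst); // Src is not used again, so Dst can reuse Src's slot
  }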
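processImmutArgument() extends the existing byval forwarding to ordinary pointer arguments that the call only reads and never captures (noalias plus nocapture on the call site, with the loop in iterateOnFunction() gating on CB->onlyReadsMemory(i)): if such an argument is a local alloca that was just filled by a memcpy of exactly its size, the call can be handed the memcpy source instead. The sketch below shows the shape of the pattern at the C++ level; the names are invented for this note, and whether the attributes are actually inferred depends on earlier analyses.

  #include <cstring>

  struct Config {
    int Fields[16];
  };

  // Hypothetical callee: it only reads *C and does not retain the pointer, so
  // the argument can end up marked readonly, noalias and nocapture in IR.
  int use(const Config *C);

  int caller(const Config *Orig) {
    Config Tmp;
    std::memcpy(&Tmp, Orig, sizeof(Config)); // defensive local copy
    return use(&Tmp); // after the transform this becomes use(Orig), and the
                      // local copy plus the memcpy can then be removed
  }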