Diffstat (limited to 'lib/Transforms/Scalar')
57 files changed, 9247 insertions, 8586 deletions
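Most of the churn in this change comes from porting the Scalar passes (ADCE, BDCE, DCE, ConstantHoisting, CorrelatedValuePropagation, and others) to the new pass manager: each pass gains a run(Function &, FunctionAnalysisManager &) entry point, while the legacy FunctionPass becomes a thin wrapper around a shared implementation. As an illustrative aside (not part of the patch), a minimal sketch of driving one of these ported passes directly through the new pass manager could look like the following; exact headers and analysis registration may differ between LLVM versions.

  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/Scalar/ADCE.h"

  using namespace llvm;

  // Sketch: run the new-PM ADCE pass on a single function.
  void runADCEOn(Function &F) {
    PassBuilder PB;

    // Register the standard function analyses so the pass can query them.
    FunctionAnalysisManager FAM;
    PB.registerFunctionAnalyses(FAM);

    // Build a function pass pipeline containing just ADCE and run it.
    FunctionPassManager FPM;
    FPM.addPass(ADCEPass());
    PreservedAnalyses PA = FPM.run(F, FAM);
    (void)PA; // PreservedAnalyses describes which analyses remain valid.
  }

The same ported passes can also be exercised from the command line with the new pass manager syntax, e.g. opt -passes=adce.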
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 590a52da6b192..0eed0240c7416 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -22,10 +22,12 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; @@ -33,22 +35,70 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); +static void collectLiveScopes(const DILocalScope &LS, + SmallPtrSetImpl<const Metadata *> &AliveScopes) { + if (!AliveScopes.insert(&LS).second) + return; + + if (isa<DISubprogram>(LS)) + return; + + // Tail-recurse through the scope chain. + collectLiveScopes(cast<DILocalScope>(*LS.getScope()), AliveScopes); +} + +static void collectLiveScopes(const DILocation &DL, + SmallPtrSetImpl<const Metadata *> &AliveScopes) { + // Even though DILocations are not scopes, shove them into AliveScopes so we + // don't revisit them. + if (!AliveScopes.insert(&DL).second) + return; + + // Collect live scopes from the scope chain. + collectLiveScopes(*DL.getScope(), AliveScopes); + + // Tail-recurse through the inlined-at chain. + if (const DILocation *IA = DL.getInlinedAt()) + collectLiveScopes(*IA, AliveScopes); +} + +// Check if this instruction is a runtime call for value profiling and +// if it's instrumenting a constant. +static bool isInstrumentsConstant(Instruction &I) { + if (CallInst *CI = dyn_cast<CallInst>(&I)) + if (Function *Callee = CI->getCalledFunction()) + if (Callee->getName().equals(getInstrProfValueProfFuncName())) + if (isa<Constant>(CI->getArgOperand(0))) + return true; + return false; +} + static bool aggressiveDCE(Function& F) { - SmallPtrSet<Instruction*, 128> Alive; + SmallPtrSet<Instruction*, 32> Alive; SmallVector<Instruction*, 128> Worklist; // Collect the set of "root" instructions that are known live. for (Instruction &I : instructions(F)) { - if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() || - I.mayHaveSideEffects()) { + if (isa<TerminatorInst>(I) || I.isEHPad() || I.mayHaveSideEffects()) { + // Skip any value profile instrumentation calls if they are + // instrumenting constants. + if (isInstrumentsConstant(I)) + continue; Alive.insert(&I); Worklist.push_back(&I); } } - // Propagate liveness backwards to operands. + // Propagate liveness backwards to operands. Keep track of live debug info + // scopes. + SmallPtrSet<const Metadata *, 32> AliveScopes; while (!Worklist.empty()) { Instruction *Curr = Worklist.pop_back_val(); + + // Collect the live debug info scopes attached to this instruction. + if (const DILocation *DL = Curr->getDebugLoc()) + collectLiveScopes(*DL, AliveScopes); + for (Use &OI : Curr->operands()) { if (Instruction *Inst = dyn_cast<Instruction>(OI)) if (Alive.insert(Inst).second) @@ -61,10 +111,30 @@ static bool aggressiveDCE(Function& F) { // value of the function, and may therefore be deleted safely. // NOTE: We reuse the Worklist vector here for memory efficiency. for (Instruction &I : instructions(F)) { - if (!Alive.count(&I)) { - Worklist.push_back(&I); - I.dropAllReferences(); + // Check if the instruction is alive. + if (Alive.count(&I)) + continue; + + if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) { + // Check if the scope of this variable location is alive. 
+ if (AliveScopes.count(DII->getDebugLoc()->getScope())) + continue; + + // Fallthrough and drop the intrinsic. + DEBUG({ + // If intrinsic is pointing at a live SSA value, there may be an + // earlier optimization bug: if we know the location of the variable, + // why isn't the scope of the location alive? + if (Value *V = DII->getVariableLocation()) + if (Instruction *II = dyn_cast<Instruction>(V)) + if (Alive.count(II)) + dbgs() << "Dropping debug info for " << *DII << "\n"; + }); } + + // Prepare to delete. + Worklist.push_back(&I); + I.dropAllReferences(); } for (Instruction *&I : Worklist) { @@ -75,10 +145,14 @@ static bool aggressiveDCE(Function& F) { return !Worklist.empty(); } -PreservedAnalyses ADCEPass::run(Function &F) { - if (aggressiveDCE(F)) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); +PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &) { + if (!aggressiveDCE(F)) + return PreservedAnalyses::all(); + + // FIXME: This should also 'preserve the CFG'. + auto PA = PreservedAnalyses(); + PA.preserve<GlobalsAA>(); + return PA; } namespace { @@ -89,7 +163,7 @@ struct ADCELegacyPass : public FunctionPass { } bool runOnFunction(Function& F) override { - if (skipOptnoneFunction(F)) + if (skipFunction(F)) return false; return aggressiveDCE(F); } diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 4b721d38adba7..7f8b8ce91e79a 100644 --- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -18,6 +18,7 @@ #define AA_NAME "alignment-from-assumptions" #define DEBUG_TYPE AA_NAME +#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -25,13 +26,11 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" @@ -67,18 +66,7 @@ struct AlignmentFromAssumptions : public FunctionPass { AU.addPreserved<ScalarEvolutionWrapperPass>(); } - // For memory transfers, we need a common alignment for both the source and - // destination. If we have a new alignment for only one operand of a transfer - // instruction, save it in these maps. If we reach the other operand through - // another assumption later, then we may change the alignment at that point. 
- DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments; - - ScalarEvolution *SE; - DominatorTree *DT; - - bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV, - const SCEV *&OffSCEV); - bool processAssumption(CallInst *I); + AlignmentFromAssumptionsPass Impl; }; } @@ -209,9 +197,10 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV, return 0; } -bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, - Value *&AAPtr, const SCEV *&AlignSCEV, - const SCEV *&OffSCEV) { +bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I, + Value *&AAPtr, + const SCEV *&AlignSCEV, + const SCEV *&OffSCEV) { // An alignment assume must be a statement about the least-significant // bits of the pointer being zero, possibly with some offset. ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0)); @@ -302,7 +291,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, return true; } -bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { +bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { Value *AAPtr; const SCEV *AlignSCEV, *OffSCEV; if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV)) @@ -411,14 +400,26 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { } bool AlignmentFromAssumptions::runOnFunction(Function &F) { - bool Changed = false; + if (skipFunction(F)) + return false; + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + return Impl.runImpl(F, AC, SE, DT); +} + +bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC, + ScalarEvolution *SE_, + DominatorTree *DT_) { + SE = SE_; + DT = DT_; NewDestAlignments.clear(); NewSrcAlignments.clear(); + bool Changed = false; for (auto &AssumeVH : AC.assumptions()) if (AssumeVH) Changed |= processAssumption(cast<CallInst>(AssumeVH)); @@ -426,3 +427,20 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) { return Changed; } +PreservedAnalyses +AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) { + + AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); + ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + bool Changed = runImpl(F, AC, &SE, &DT); + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<AAManager>(); + PA.preserve<ScalarEvolutionAnalysis>(); + PA.preserve<GlobalsAA>(); + PA.preserve<LoopAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp index cb9b8b6fffc84..4f6225f4c7b01 100644 --- a/lib/Transforms/Scalar/BDCE.cpp +++ b/lib/Transforms/Scalar/BDCE.cpp @@ -14,11 +14,11 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/BDCE.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -27,6 +27,7 @@ #include 
"llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "bdce" @@ -34,35 +35,7 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed (unused)"); STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)"); -namespace { -struct BDCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BDCE() : FunctionPass(ID) { - initializeBDCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function& F) override; - - void getAnalysisUsage(AnalysisUsage& AU) const override { - AU.setPreservesCFG(); - AU.addRequired<DemandedBits>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } -}; -} - -char BDCE::ID = 0; -INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", - false, false) -INITIALIZE_PASS_DEPENDENCY(DemandedBits) -INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", - false, false) - -bool BDCE::runOnFunction(Function& F) { - if (skipOptnoneFunction(F)) - return false; - DemandedBits &DB = getAnalysis<DemandedBits>(); - +static bool bitTrackingDCE(Function &F, DemandedBits &DB) { SmallVector<Instruction*, 128> Worklist; bool Changed = false; for (Instruction &I : instructions(F)) { @@ -96,7 +69,44 @@ bool BDCE::runOnFunction(Function& F) { return Changed; } -FunctionPass *llvm::createBitTrackingDCEPass() { - return new BDCE(); +PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) { + auto &DB = AM.getResult<DemandedBitsAnalysis>(F); + if (!bitTrackingDCE(F, DB)) + return PreservedAnalyses::all(); + + // FIXME: This should also 'preserve the CFG'. + auto PA = PreservedAnalyses(); + PA.preserve<GlobalsAA>(); + return PA; } +namespace { +struct BDCELegacyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BDCELegacyPass() : FunctionPass(ID) { + initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); + return bitTrackingDCE(F, DB); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DemandedBitsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} + +char BDCELegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce", + "Bit-Tracking Dead Code Elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) +INITIALIZE_PASS_END(BDCELegacyPass, "bdce", + "Bit-Tracking Dead Code Elimination", false, false) + +FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); } diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index a0ddbd0852063..9f04344b8b0a2 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -10,13 +10,16 @@ add_llvm_library(LLVMScalarOpts EarlyCSE.cpp FlattenCFGPass.cpp Float2Int.cpp + GuardWidening.cpp GVN.cpp + GVNHoist.cpp InductiveRangeCheckElimination.cpp IndVarSimplify.cpp JumpThreading.cpp LICM.cpp LoadCombine.cpp LoopDeletion.cpp + LoopDataPrefetch.cpp LoopDistribute.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp @@ -24,11 +27,14 @@ add_llvm_library(LLVMScalarOpts LoopLoadElimination.cpp LoopRerollPass.cpp LoopRotation.cpp + LoopSimplifyCFG.cpp LoopStrengthReduce.cpp LoopUnrollPass.cpp LoopUnswitch.cpp + LoopVersioningLICM.cpp 
LowerAtomic.cpp LowerExpectIntrinsic.cpp + LowerGuardIntrinsic.cpp MemCpyOptimizer.cpp MergedLoadStoreMotion.cpp NaryReassociate.cpp @@ -40,7 +46,6 @@ add_llvm_library(LLVMScalarOpts SCCP.cpp SROA.cpp Scalar.cpp - ScalarReplAggregates.cpp Scalarizer.cpp SeparateConstOffsetFromGEP.cpp SimplifyCFGPass.cpp diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 84f7f5fff5b59..913e939c2bd40 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -33,20 +33,20 @@ // %0 = load i64* inttoptr (i64 big_constant to i64*) //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/ConstantHoisting.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include <tuple> using namespace llvm; +using namespace consthoist; #define DEBUG_TYPE "consthoist" @@ -54,75 +54,12 @@ STATISTIC(NumConstantsHoisted, "Number of constants hoisted"); STATISTIC(NumConstantsRebased, "Number of constants rebased"); namespace { -struct ConstantUser; -struct RebasedConstantInfo; - -typedef SmallVector<ConstantUser, 8> ConstantUseListType; -typedef SmallVector<RebasedConstantInfo, 4> RebasedConstantListType; - -/// \brief Keeps track of the user of a constant and the operand index where the -/// constant is used. -struct ConstantUser { - Instruction *Inst; - unsigned OpndIdx; - - ConstantUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) { } -}; - -/// \brief Keeps track of a constant candidate and its uses. -struct ConstantCandidate { - ConstantUseListType Uses; - ConstantInt *ConstInt; - unsigned CumulativeCost; - - ConstantCandidate(ConstantInt *ConstInt) - : ConstInt(ConstInt), CumulativeCost(0) { } - - /// \brief Add the user to the use list and update the cost. - void addUser(Instruction *Inst, unsigned Idx, unsigned Cost) { - CumulativeCost += Cost; - Uses.push_back(ConstantUser(Inst, Idx)); - } -}; - -/// \brief This represents a constant that has been rebased with respect to a -/// base constant. The difference to the base constant is recorded in Offset. -struct RebasedConstantInfo { - ConstantUseListType Uses; - Constant *Offset; - - RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset) - : Uses(std::move(Uses)), Offset(Offset) { } -}; - -/// \brief A base constant and all its rebased constants. -struct ConstantInfo { - ConstantInt *BaseConstant; - RebasedConstantListType RebasedConstants; -}; - /// \brief The constant hoisting pass. -class ConstantHoisting : public FunctionPass { - typedef DenseMap<ConstantInt *, unsigned> ConstCandMapType; - typedef std::vector<ConstantCandidate> ConstCandVecType; - - const TargetTransformInfo *TTI; - DominatorTree *DT; - BasicBlock *Entry; - - /// Keeps track of constant candidates found in the function. - ConstCandVecType ConstCandVec; - - /// Keep track of cast instructions we already cloned. - SmallDenseMap<Instruction *, Instruction *> ClonedCastMap; - - /// These are the final constants we decided to hoist. 
- SmallVector<ConstantInfo, 8> ConstantVec; +class ConstantHoistingLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - ConstantHoisting() : FunctionPass(ID), TTI(nullptr), DT(nullptr), - Entry(nullptr) { - initializeConstantHoistingPass(*PassRegistry::getPassRegistry()); + ConstantHoistingLegacyPass() : FunctionPass(ID) { + initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &Fn) override; @@ -135,67 +72,36 @@ public: AU.addRequired<TargetTransformInfoWrapperPass>(); } -private: - /// \brief Initialize the pass. - void setup(Function &Fn) { - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn); - Entry = &Fn.getEntryBlock(); - } + void releaseMemory() override { Impl.releaseMemory(); } - /// \brief Cleanup. - void cleanup() { - ConstantVec.clear(); - ClonedCastMap.clear(); - ConstCandVec.clear(); - - TTI = nullptr; - DT = nullptr; - Entry = nullptr; - } - - Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const; - Instruction *findConstantInsertionPoint(const ConstantInfo &ConstInfo) const; - void collectConstantCandidates(ConstCandMapType &ConstCandMap, - Instruction *Inst, unsigned Idx, - ConstantInt *ConstInt); - void collectConstantCandidates(ConstCandMapType &ConstCandMap, - Instruction *Inst); - void collectConstantCandidates(Function &Fn); - void findAndMakeBaseConstant(ConstCandVecType::iterator S, - ConstCandVecType::iterator E); - void findBaseConstants(); - void emitBaseConstants(Instruction *Base, Constant *Offset, - const ConstantUser &ConstUser); - bool emitBaseConstants(); - void deleteDeadCastInst() const; - bool optimizeConstants(Function &Fn); +private: + ConstantHoistingPass Impl; }; } -char ConstantHoisting::ID = 0; -INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting", - false, false) +char ConstantHoistingLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist", + "Constant Hoisting", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting", - false, false) +INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist", + "Constant Hoisting", false, false) FunctionPass *llvm::createConstantHoistingPass() { - return new ConstantHoisting(); + return new ConstantHoistingLegacyPass(); } /// \brief Perform the constant hoisting optimization for the given function. -bool ConstantHoisting::runOnFunction(Function &Fn) { - if (skipOptnoneFunction(Fn)) +bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) { + if (skipFunction(Fn)) return false; DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); - setup(Fn); - - bool MadeChange = optimizeConstants(Fn); + bool MadeChange = Impl.runImpl( + Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn), + getAnalysis<DominatorTreeWrapperPass>().getDomTree(), Fn.getEntryBlock()); if (MadeChange) { DEBUG(dbgs() << "********** Function after Constant Hoisting: " @@ -204,15 +110,13 @@ bool ConstantHoisting::runOnFunction(Function &Fn) { } DEBUG(dbgs() << "********** End Constant Hoisting **********\n"); - cleanup(); - return MadeChange; } /// \brief Find the constant materialization insertion point. 
-Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, - unsigned Idx) const { +Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, + unsigned Idx) const { // If the operand is a cast instruction, then we have to materialize the // constant before the cast instruction. if (Idx != ~0U) { @@ -237,8 +141,8 @@ Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, } /// \brief Find an insertion point that dominates all uses. -Instruction *ConstantHoisting:: -findConstantInsertionPoint(const ConstantInfo &ConstInfo) const { +Instruction *ConstantHoistingPass::findConstantInsertionPoint( + const ConstantInfo &ConstInfo) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. SmallPtrSet<BasicBlock *, 8> BBs; @@ -272,10 +176,9 @@ findConstantInsertionPoint(const ConstantInfo &ConstInfo) const { /// The operand at index Idx is not necessarily the constant integer itself. It /// could also be a cast instruction or a constant expression that uses the // constant integer. -void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, - Instruction *Inst, - unsigned Idx, - ConstantInt *ConstInt) { +void ConstantHoistingPass::collectConstantCandidates( + ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx, + ConstantInt *ConstInt) { unsigned Cost; // Ask the target about the cost of materializing the constant for the given // instruction and operand index. @@ -309,8 +212,8 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, /// \brief Scan the instruction for expensive integer constants and record them /// in the constant candidate vector. -void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, - Instruction *Inst) { +void ConstantHoistingPass::collectConstantCandidates( + ConstCandMapType &ConstCandMap, Instruction *Inst) { // Skip all cast instructions. They are visited indirectly later on. if (Inst->isCast()) return; @@ -320,6 +223,18 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, if (isa<InlineAsm>(Call->getCalledValue())) return; + // Switch cases must remain constant, and if the value being tested is + // constant the entire thing should disappear. + if (isa<SwitchInst>(Inst)) + return; + + // Static allocas (constant size in the entry block) are handled by + // prologue/epilogue insertion so they're free anyway. We definitely don't + // want to make them non-constant. + auto AI = dyn_cast<AllocaInst>(Inst); + if (AI && AI->isStaticAlloca()) + return; + // Scan all operands. for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { Value *Opnd = Inst->getOperand(Idx); @@ -363,25 +278,116 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, /// \brief Collect all integer constants in the function that cannot be folded /// into an instruction itself. -void ConstantHoisting::collectConstantCandidates(Function &Fn) { +void ConstantHoistingPass::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; for (BasicBlock &BB : Fn) for (Instruction &Inst : BB) collectConstantCandidates(ConstCandMap, &Inst); } -/// \brief Find the base constant within the given range and rebase all other -/// constants with respect to the base constant. 
-void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S,
-                                               ConstCandVecType::iterator E) {
-  auto MaxCostItr = S;
+// This helper function is necessary to deal with values that have different
+// bit widths (APInt Operator- does not like that). If the value cannot be
+// represented in uint64 we return an "empty" result. This is then interpreted
+// as the value being out of range.
+static llvm::Optional<APInt> calculateOffsetDiff(APInt V1, APInt V2)
+{
+  llvm::Optional<APInt> Res = None;
+  unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
+                V1.getBitWidth() : V2.getBitWidth();
+  uint64_t LimVal1 = V1.getLimitedValue();
+  uint64_t LimVal2 = V2.getLimitedValue();
+
+  if (LimVal1 == ~0ULL || LimVal2 == ~0ULL)
+    return Res;
+
+  uint64_t Diff = LimVal1 - LimVal2;
+  return APInt(BW, Diff, true);
+}
+
+// From a list of constants, one needs to be picked as the base and the other
+// constants will be transformed into an offset from that base constant. The
+// question is which one is best to pick. For example, consider these constants
+// and their number of uses:
+//
+//  Constants| 2 | 4 | 12 | 42 |
+//  NumUses  | 3 | 2 |  8 |  7 |
+//
+// Selecting constant 12 because it has the most uses will generate negative
+// offsets for constants 2 and 4 (i.e. -10 and -8 respectively). If negative
+// offsets lead to less optimal code generation, then there might be better
+// solutions. Suppose immediates in the range of 0..35 are most optimally
+// supported by the architecture, then selecting constant 2 is most optimal
+// because this will generate offsets: 0, 2, 10, 40. Offsets 0, 2 and 10 are in
+// range 0..35, and thus 3 + 2 + 8 = 13 uses are in range. Selecting 12 would
+// have only 8 uses in range, so choosing 2 as a base is more optimal. Thus, in
+// selecting the base constant the range of the offsets is a very important
+// factor too that we take into account here. This algorithm calculates a total
+// cost for selecting a constant as the base and subtracts a cost if
+// immediates are out of range. It has quadratic complexity, so we only call
+// this function when we're optimising for size and there are fewer than 100
+// constants; otherwise we fall back to the straightforward algorithm, which
+// does not do all the offset calculations.
+unsigned
+ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
+                                               ConstCandVecType::iterator E,
+                                               ConstCandVecType::iterator &MaxCostItr) {
   unsigned NumUses = 0;
-  // Use the constant that has the maximum cost as base constant.
+ + if(!Entry->getParent()->optForSize() || std::distance(S,E) > 100) { + for (auto ConstCand = S; ConstCand != E; ++ConstCand) { + NumUses += ConstCand->Uses.size(); + if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost) + MaxCostItr = ConstCand; + } + return NumUses; + } + + DEBUG(dbgs() << "== Maximize constants in range ==\n"); + int MaxCost = -1; for (auto ConstCand = S; ConstCand != E; ++ConstCand) { + auto Value = ConstCand->ConstInt->getValue(); + Type *Ty = ConstCand->ConstInt->getType(); + int Cost = 0; NumUses += ConstCand->Uses.size(); - if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost) + DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue() << "\n"); + + for (auto User : ConstCand->Uses) { + unsigned Opcode = User.Inst->getOpcode(); + unsigned OpndIdx = User.OpndIdx; + Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty); + DEBUG(dbgs() << "Cost: " << Cost << "\n"); + + for (auto C2 = S; C2 != E; ++C2) { + llvm::Optional<APInt> Diff = calculateOffsetDiff( + C2->ConstInt->getValue(), + ConstCand->ConstInt->getValue()); + if (Diff) { + const int ImmCosts = + TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty); + Cost -= ImmCosts; + DEBUG(dbgs() << "Offset " << Diff.getValue() << " " + << "has penalty: " << ImmCosts << "\n" + << "Adjusted cost: " << Cost << "\n"); + } + } + } + DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n"); + if (Cost > MaxCost) { + MaxCost = Cost; MaxCostItr = ConstCand; + DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue() + << "\n"); + } } + return NumUses; +} + +/// \brief Find the base constant within the given range and rebase all other +/// constants with respect to the base constant. +void ConstantHoistingPass::findAndMakeBaseConstant( + ConstCandVecType::iterator S, ConstCandVecType::iterator E) { + auto MaxCostItr = S; + unsigned NumUses = maximizeConstantsInRange(S, E, MaxCostItr); // Don't hoist constants that have only one use. if (NumUses <= 1) @@ -404,7 +410,7 @@ void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S, /// \brief Finds and combines constant candidates that can be easily /// rematerialized with an add from a common base constant. -void ConstantHoisting::findBaseConstants() { +void ConstantHoistingPass::findBaseConstants() { // Sort the constants by value and type. This invalidates the mapping! std::sort(ConstCandVec.begin(), ConstCandVec.end(), [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) { @@ -466,8 +472,9 @@ static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) { /// \brief Emit materialization code for all rebased constants and update their /// users. -void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset, - const ConstantUser &ConstUser) { +void ConstantHoistingPass::emitBaseConstants(Instruction *Base, + Constant *Offset, + const ConstantUser &ConstUser) { Instruction *Mat = Base; if (Offset) { Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst, @@ -538,7 +545,7 @@ void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset, /// \brief Hoist and hide the base constant behind a bitcast and emit /// materialization code for derived constants. -bool ConstantHoisting::emitBaseConstants() { +bool ConstantHoistingPass::emitBaseConstants() { bool MadeChange = false; for (auto const &ConstInfo : ConstantVec) { // Hoist and hide the base constant behind a bitcast. 
@@ -572,14 +579,18 @@ bool ConstantHoisting::emitBaseConstants() { /// \brief Check all cast instructions we made a copy of and remove them if they /// have no more users. -void ConstantHoisting::deleteDeadCastInst() const { +void ConstantHoistingPass::deleteDeadCastInst() const { for (auto const &I : ClonedCastMap) if (I.first->use_empty()) I.first->eraseFromParent(); } /// \brief Optimize expensive integer constants in the given function. -bool ConstantHoisting::optimizeConstants(Function &Fn) { +bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI, + DominatorTree &DT, BasicBlock &Entry) { + this->TTI = &TTI; + this->DT = &DT; + this->Entry = &Entry; // Collect all constant candidates. collectConstantCandidates(Fn); @@ -604,3 +615,14 @@ bool ConstantHoisting::optimizeConstants(Function &Fn) { return MadeChange; } + +PreservedAnalyses ConstantHoistingPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + if (!runImpl(F, TTI, DT, F.getEntryBlock())) + return PreservedAnalyses::all(); + + // FIXME: This should also 'preserve the CFG'. + return PreservedAnalyses::none(); +} diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index c974ebb9456f8..88172d19fe5a9 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -61,11 +61,14 @@ FunctionPass *llvm::createConstantPropagationPass() { } bool ConstantPropagation::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + // Initialize the worklist to all of the instructions ready to process... std::set<Instruction*> WorkList; - for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { - WorkList.insert(&*i); - } + for (Instruction &I: instructions(&F)) + WorkList.insert(&I); + bool Changed = false; const DataLayout &DL = F.getParent()->getDataLayout(); TargetLibraryInfo *TLI = diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 686bd40711049..c0fed05333921 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -35,22 +36,11 @@ STATISTIC(NumMemAccess, "Number of memory access targets propagated"); STATISTIC(NumCmps, "Number of comparisons propagated"); STATISTIC(NumReturns, "Number of return values propagated"); STATISTIC(NumDeadCases, "Number of switch cases removed"); +STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); +STATISTIC(NumSRems, "Number of srem converted to urem"); namespace { class CorrelatedValuePropagation : public FunctionPass { - LazyValueInfo *LVI; - - bool processSelect(SelectInst *SI); - bool processPHI(PHINode *P); - bool processMemAccess(Instruction *I); - bool processCmp(CmpInst *C); - bool processSwitch(SwitchInst *SI); - bool processCallSite(CallSite CS); - - /// Return a constant value for V usable at At and everything it - /// dominates. If no such Constant can be found, return nullptr. 
- Constant *getConstantAt(Value *V, Instruction *At); - public: static char ID; CorrelatedValuePropagation(): FunctionPass(ID) { @@ -60,7 +50,7 @@ namespace { bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LazyValueInfo>(); + AU.addRequired<LazyValueInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } }; @@ -69,7 +59,7 @@ namespace { char CorrelatedValuePropagation::ID = 0; INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation", "Value Propagation", false, false) -INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation", "Value Propagation", false, false) @@ -78,7 +68,7 @@ Pass *llvm::createCorrelatedValuePropagationPass() { return new CorrelatedValuePropagation(); } -bool CorrelatedValuePropagation::processSelect(SelectInst *S) { +static bool processSelect(SelectInst *S, LazyValueInfo *LVI) { if (S->getType()->isVectorTy()) return false; if (isa<Constant>(S->getOperand(0))) return false; @@ -101,7 +91,7 @@ bool CorrelatedValuePropagation::processSelect(SelectInst *S) { return true; } -bool CorrelatedValuePropagation::processPHI(PHINode *P) { +static bool processPHI(PHINode *P, LazyValueInfo *LVI) { bool Changed = false; BasicBlock *BB = P->getParent(); @@ -169,7 +159,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { return Changed; } -bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { +static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) { Value *Pointer = nullptr; if (LoadInst *L = dyn_cast<LoadInst>(I)) Pointer = L->getPointerOperand(); @@ -186,11 +176,11 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { return true; } -/// processCmp - See if LazyValueInfo's ability to exploit edge conditions, -/// or range information is sufficient to prove this comparison. Even for -/// local conditions, this can sometimes prove conditions instcombine can't by +/// See if LazyValueInfo's ability to exploit edge conditions or range +/// information is sufficient to prove this comparison. Even for local +/// conditions, this can sometimes prove conditions instcombine can't by /// exploiting range information. -bool CorrelatedValuePropagation::processCmp(CmpInst *C) { +static bool processCmp(CmpInst *C, LazyValueInfo *LVI) { Value *Op0 = C->getOperand(0); Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); if (!Op1) return false; @@ -218,14 +208,14 @@ bool CorrelatedValuePropagation::processCmp(CmpInst *C) { return true; } -/// processSwitch - Simplify a switch instruction by removing cases which can -/// never fire. If the uselessness of a case could be determined locally then -/// constant propagation would already have figured it out. Instead, walk the -/// predecessors and statically evaluate cases based on information available -/// on that edge. Cases that cannot fire no matter what the incoming edge can -/// safely be removed. If a case fires on every incoming edge then the entire -/// switch can be removed and replaced with a branch to the case destination. -bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { +/// Simplify a switch instruction by removing cases which can never fire. If the +/// uselessness of a case could be determined locally then constant propagation +/// would already have figured it out. 
Instead, walk the predecessors and +/// statically evaluate cases based on information available on that edge. Cases +/// that cannot fire no matter what the incoming edge can safely be removed. If +/// a case fires on every incoming edge then the entire switch can be removed +/// and replaced with a branch to the case destination. +static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { Value *Cond = SI->getCondition(); BasicBlock *BB = SI->getParent(); @@ -304,16 +294,18 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { return Changed; } -/// processCallSite - Infer nonnull attributes for the arguments at the -/// specified callsite. -bool CorrelatedValuePropagation::processCallSite(CallSite CS) { +/// Infer nonnull attributes for the arguments at the specified callsite. +static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { SmallVector<unsigned, 4> Indices; unsigned ArgNo = 0; for (Value *V : CS.args()) { PointerType *Type = dyn_cast<PointerType>(V->getType()); - + // Try to mark pointer typed parameters as non-null. We skip the + // relatively expensive analysis for constants which are obviously either + // null or non-null to start with. if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && + !isa<Constant>(V) && LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, ConstantPointerNull::get(Type), CS.getInstruction()) == LazyValueInfo::False) @@ -334,7 +326,62 @@ bool CorrelatedValuePropagation::processCallSite(CallSite CS) { return true; } -Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { +// Helper function to rewrite srem and sdiv. As a policy choice, we choose not +// to waste compile time on anything where the operands are local defs. While +// LVI can sometimes reason about such cases, it's not its primary purpose. +static bool hasLocalDefs(BinaryOperator *SDI) { + for (Value *O : SDI->operands()) { + auto *I = dyn_cast<Instruction>(O); + if (I && I->getParent() == SDI->getParent()) + return true; + } + return false; +} + +static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) { + Constant *Zero = ConstantInt::get(SDI->getType(), 0); + for (Value *O : SDI->operands()) { + auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, O, Zero, SDI); + if (Result != LazyValueInfo::True) + return false; + } + return true; +} + +static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) { + if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) || + !hasPositiveOperands(SDI, LVI)) + return false; + + ++NumSRems; + auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1), + SDI->getName(), SDI); + SDI->replaceAllUsesWith(BO); + SDI->eraseFromParent(); + return true; +} + +/// See if LazyValueInfo's ability to exploit edge conditions or range +/// information is sufficient to prove the both operands of this SDiv are +/// positive. If this is the case, replace the SDiv with a UDiv. Even for local +/// conditions, this can sometimes prove conditions instcombine can't by +/// exploiting range information. 
+static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) { + if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) || + !hasPositiveOperands(SDI, LVI)) + return false; + + ++NumSDivs; + auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1), + SDI->getName(), SDI); + BO->setIsExact(SDI->isExact()); + SDI->replaceAllUsesWith(BO); + SDI->eraseFromParent(); + + return true; +} + +static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { if (Constant *C = LVI->getConstant(V, At->getParent(), At)) return C; @@ -357,44 +404,45 @@ Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { ConstantInt::getFalse(C->getContext()); } -bool CorrelatedValuePropagation::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - LVI = &getAnalysis<LazyValueInfo>(); - +static bool runImpl(Function &F, LazyValueInfo *LVI) { bool FnChanged = false; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + for (BasicBlock &BB : F) { bool BBChanged = false; - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { Instruction *II = &*BI++; switch (II->getOpcode()) { case Instruction::Select: - BBChanged |= processSelect(cast<SelectInst>(II)); + BBChanged |= processSelect(cast<SelectInst>(II), LVI); break; case Instruction::PHI: - BBChanged |= processPHI(cast<PHINode>(II)); + BBChanged |= processPHI(cast<PHINode>(II), LVI); break; case Instruction::ICmp: case Instruction::FCmp: - BBChanged |= processCmp(cast<CmpInst>(II)); + BBChanged |= processCmp(cast<CmpInst>(II), LVI); break; case Instruction::Load: case Instruction::Store: - BBChanged |= processMemAccess(II); + BBChanged |= processMemAccess(II, LVI); break; case Instruction::Call: case Instruction::Invoke: - BBChanged |= processCallSite(CallSite(II)); + BBChanged |= processCallSite(CallSite(II), LVI); + break; + case Instruction::SRem: + BBChanged |= processSRem(cast<BinaryOperator>(II), LVI); + break; + case Instruction::SDiv: + BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI); break; } } - Instruction *Term = FI->getTerminator(); + Instruction *Term = BB.getTerminator(); switch (Term->getOpcode()) { case Instruction::Switch: - BBChanged |= processSwitch(cast<SwitchInst>(Term)); + BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI); break; case Instruction::Ret: { auto *RI = cast<ReturnInst>(Term); @@ -404,7 +452,7 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { auto *RetVal = RI->getReturnValue(); if (!RetVal) break; // handle "ret void" if (isa<Constant>(RetVal)) break; // nothing to do - if (auto *C = getConstantAt(RetVal, RI)) { + if (auto *C = getConstantAt(RetVal, RI, LVI)) { ++NumReturns; RI->replaceUsesOfWith(RetVal, C); BBChanged = true; @@ -417,3 +465,28 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { return FnChanged; } + +bool CorrelatedValuePropagation::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI(); + return runImpl(F, LVI); +} + +PreservedAnalyses +CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { + + LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F); + bool Changed = runImpl(F, LVI); + + // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better + // solution? 
+ AM.invalidate<LazyValueAnalysis>(F); + + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + return PA; +} diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index b67c3c7742fd7..f73809d9f0454 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -16,13 +16,14 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/DCE.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -41,7 +42,7 @@ namespace { initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); } bool runOnBasicBlock(BasicBlock &BB) override { - if (skipOptnoneFunction(BB)) + if (skipBasicBlock(BB)) return false; auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; @@ -71,28 +72,6 @@ Pass *llvm::createDeadInstEliminationPass() { return new DeadInstElimination(); } - -namespace { - //===--------------------------------------------------------------------===// - // DeadCodeElimination pass implementation - // - struct DCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - DCE() : FunctionPass(ID) { - initializeDCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - } - }; -} - -char DCE::ID = 0; -INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) - static bool DCEInstruction(Instruction *I, SmallSetVector<Instruction *, 16> &WorkList, const TargetLibraryInfo *TLI) { @@ -121,13 +100,7 @@ static bool DCEInstruction(Instruction *I, return false; } -bool DCE::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - +static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) { bool MadeChange = false; SmallSetVector<Instruction *, 16> WorkList; // Iterate over the original function, only adding insts to the worklist @@ -150,7 +123,38 @@ bool DCE::runOnFunction(Function &F) { return MadeChange; } -FunctionPass *llvm::createDeadCodeEliminationPass() { - return new DCE(); +PreservedAnalyses DCEPass::run(Function &F, AnalysisManager<Function> &AM) { + if (eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F))) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { +struct DCELegacyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + DCELegacyPass() : FunctionPass(ID) { + initializeDCELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? 
&TLIP->getTLI() : nullptr; + + return eliminateDeadCode(F, TLI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } +}; } +char DCELegacyPass::ID = 0; +INITIALIZE_PASS(DCELegacyPass, "dce", "Dead Code Elimination", false, false) + +FunctionPass *llvm::createDeadCodeEliminationPass() { + return new DCELegacyPass(); +} diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 36ad0a5f7b91c..ed58a87ae1a8a 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -15,7 +15,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/DeadStoreElimination.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -34,9 +35,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <map> using namespace llvm; #define DEBUG_TYPE "dse" @@ -44,90 +48,35 @@ using namespace llvm; STATISTIC(NumRedundantStores, "Number of redundant stores deleted"); STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); +STATISTIC(NumCompletePartials, "Number of stores dead by later partials"); -namespace { - struct DSE : public FunctionPass { - AliasAnalysis *AA; - MemoryDependenceAnalysis *MD; - DominatorTree *DT; - const TargetLibraryInfo *TLI; - - static char ID; // Pass identification, replacement for typeid - DSE() : FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr) { - initializeDSEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - if (skipOptnoneFunction(F)) - return false; - - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - MD = &getAnalysis<MemoryDependenceAnalysis>(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); +static cl::opt<bool> +EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking", + cl::init(true), cl::Hidden, + cl::desc("Enable partial-overwrite tracking in DSE")); - bool Changed = false; - for (BasicBlock &I : F) - // Only check non-dead blocks. Dead blocks may have strange pointer - // cycles that will confuse alias analysis. 
- if (DT->isReachableFromEntry(&I)) - Changed |= runOnBasicBlock(I); - - AA = nullptr; MD = nullptr; DT = nullptr; - return Changed; - } - - bool runOnBasicBlock(BasicBlock &BB); - bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI); - bool HandleFree(CallInst *F); - bool handleEndBlock(BasicBlock &BB); - void RemoveAccessedObjects(const MemoryLocation &LoadedLoc, - SmallSetVector<Value *, 16> &DeadStackObjects, - const DataLayout &DL); - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<MemoryDependenceAnalysis>(); - } - }; -} - -char DSE::ID = 0; -INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) - -FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// -/// DeleteDeadInstruction - Delete this instruction. Before we do, go through -/// and zero out all the operands of this instruction. If any of them become -/// dead, delete them and the computation tree that feeds them. -/// +/// Delete this instruction. Before we do, go through and zero out all the +/// operands of this instruction. If any of them become dead, delete them and +/// the computation tree that feeds them. /// If ValueSet is non-null, remove any deleted instructions from it as well. -/// -static void DeleteDeadInstruction(Instruction *I, - MemoryDependenceAnalysis &MD, - const TargetLibraryInfo &TLI, - SmallSetVector<Value*, 16> *ValueSet = nullptr) { +static void +deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, + MemoryDependenceResults &MD, const TargetLibraryInfo &TLI, + SmallSetVector<Value *, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); --NumFastOther; + // Keeping the iterator straight is a pain, so we let this routine tell the + // caller what the next instruction is after we're done mucking about. + BasicBlock::iterator NewIter = *BBI; + // Before we touch this instruction, remove it from memdep! do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); @@ -150,15 +99,19 @@ static void DeleteDeadInstruction(Instruction *I, NowDeadInsts.push_back(OpI); } - DeadInst->eraseFromParent(); + + if (NewIter == DeadInst->getIterator()) + NewIter = DeadInst->eraseFromParent(); + else + DeadInst->eraseFromParent(); if (ValueSet) ValueSet->remove(DeadInst); } while (!NowDeadInsts.empty()); + *BBI = NewIter; } - -/// hasMemoryWrite - Does this instruction write some memory? This only returns -/// true for things that we can analyze with other helpers below. +/// Does this instruction write some memory? This only returns true for things +/// that we can analyze with other helpers below. 
static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) { if (isa<StoreInst>(I)) return true; @@ -176,30 +129,23 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) { } if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { - if (TLI.has(LibFunc::strcpy) && - F->getName() == TLI.getName(LibFunc::strcpy)) { + StringRef FnName = F->getName(); + if (TLI.has(LibFunc::strcpy) && FnName == TLI.getName(LibFunc::strcpy)) return true; - } - if (TLI.has(LibFunc::strncpy) && - F->getName() == TLI.getName(LibFunc::strncpy)) { + if (TLI.has(LibFunc::strncpy) && FnName == TLI.getName(LibFunc::strncpy)) return true; - } - if (TLI.has(LibFunc::strcat) && - F->getName() == TLI.getName(LibFunc::strcat)) { + if (TLI.has(LibFunc::strcat) && FnName == TLI.getName(LibFunc::strcat)) return true; - } - if (TLI.has(LibFunc::strncat) && - F->getName() == TLI.getName(LibFunc::strncat)) { + if (TLI.has(LibFunc::strncat) && FnName == TLI.getName(LibFunc::strncat)) return true; - } } } return false; } -/// getLocForWrite - Return a Location stored to by the specified instruction. -/// If isRemovable returns true, this function and getLocForRead completely -/// describe the memory operations for this instruction. +/// Return a Location stored to by the specified instruction. If isRemovable +/// returns true, this function and getLocForRead completely describe the memory +/// operations for this instruction. static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return MemoryLocation::get(SI); @@ -228,8 +174,8 @@ static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { } } -/// getLocForRead - Return the location read by the specified "hasMemoryWrite" -/// instruction if any. +/// Return the location read by the specified "hasMemoryWrite" instruction if +/// any. static MemoryLocation getLocForRead(Instruction *Inst, const TargetLibraryInfo &TLI) { assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case"); @@ -241,9 +187,8 @@ static MemoryLocation getLocForRead(Instruction *Inst, return MemoryLocation(); } - -/// isRemovable - If the value of this instruction and the memory it writes to -/// is unused, may we delete this instruction? +/// If the value of this instruction and the memory it writes to is unused, may +/// we delete this instruction? static bool isRemovable(Instruction *I) { // Don't remove volatile/atomic stores. if (StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -275,9 +220,9 @@ static bool isRemovable(Instruction *I) { } -/// isShortenable - Returns true if this instruction can be safely shortened in +/// Returns true if the end of this instruction can be safely shortened in /// length. -static bool isShortenable(Instruction *I) { +static bool isShortenableAtTheEnd(Instruction *I) { // Don't shorten stores for now if (isa<StoreInst>(I)) return false; @@ -288,6 +233,7 @@ static bool isShortenable(Instruction *I) { case Intrinsic::memset: case Intrinsic::memcpy: // Do shorten memory intrinsics. + // FIXME: Add memmove if it's also safe to transform. return true; } } @@ -297,7 +243,16 @@ static bool isShortenable(Instruction *I) { return false; } -/// getStoredPointerOperand - Return the pointer that is being written to. +/// Returns true if the beginning of this instruction can be safely shortened +/// in length. +static bool isShortenableAtTheBeginning(Instruction *I) { + // FIXME: Handle only memset for now. 
Supporting memcpy/memmove should be + // easily done by offsetting the source address. + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + return II && II->getIntrinsicID() == Intrinsic::memset; +} + +/// Return the pointer that is being written to. static Value *getStoredPointerOperand(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); @@ -327,46 +282,45 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL, } namespace { - enum OverwriteResult - { - OverwriteComplete, - OverwriteEnd, - OverwriteUnknown - }; +enum OverwriteResult { + OverwriteBegin, + OverwriteComplete, + OverwriteEnd, + OverwriteUnknown +}; } -/// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location -/// completely overwrites a store to the 'Earlier' location. -/// 'OverwriteEnd' if the end of the 'Earlier' location is completely -/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined +typedef DenseMap<Instruction *, + std::map<int64_t, int64_t>> InstOverlapIntervalsTy; + +/// Return 'OverwriteComplete' if a store to the 'Later' location completely +/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of +/// the 'Earlier' location is completely overwritten by 'Later', +/// 'OverwriteBegin' if the beginning of the 'Earlier' location is overwritten +/// by 'Later', or 'OverwriteUnknown' if nothing can be determined. static OverwriteResult isOverwrite(const MemoryLocation &Later, const MemoryLocation &Earlier, const DataLayout &DL, const TargetLibraryInfo &TLI, - int64_t &EarlierOff, int64_t &LaterOff) { + int64_t &EarlierOff, int64_t &LaterOff, + Instruction *DepWrite, + InstOverlapIntervalsTy &IOL) { + // If we don't know the sizes of either access, then we can't do a comparison. + if (Later.Size == MemoryLocation::UnknownSize || + Earlier.Size == MemoryLocation::UnknownSize) + return OverwriteUnknown; + const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); // If the start pointers are the same, we just have to compare sizes to see if // the later store was larger than the earlier store. if (P1 == P2) { - // If we don't know the sizes of either access, then we can't do a - // comparison. - if (Later.Size == MemoryLocation::UnknownSize || - Earlier.Size == MemoryLocation::UnknownSize) - return OverwriteUnknown; - // Make sure that the Later size is >= the Earlier size. if (Later.Size >= Earlier.Size) return OverwriteComplete; } - // Otherwise, we have to have size information, and the later store has to be - // larger than the earlier one. - if (Later.Size == MemoryLocation::UnknownSize || - Earlier.Size == MemoryLocation::UnknownSize) - return OverwriteUnknown; - // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval/inalloca argument). If so, then it clearly // overwrites any other store to the same object. @@ -416,8 +370,68 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size) return OverwriteComplete; - // The other interesting case is if the later store overwrites the end of - // the earlier store + // We may now overlap, although the overlap is not complete. There might also + // be other incomplete overlaps, and together, they might cover the complete + // earlier write. 
+ // Note: The correctness of this logic depends on the fact that this function + // is not even called providing DepWrite when there are any intervening reads. + if (EnablePartialOverwriteTracking && + LaterOff < int64_t(EarlierOff + Earlier.Size) && + int64_t(LaterOff + Later.Size) >= EarlierOff) { + + // Insert our part of the overlap into the map. + auto &IM = IOL[DepWrite]; + DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " << + int64_t(EarlierOff + Earlier.Size) << ") Later [" << + LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n"); + + // Make sure that we only insert non-overlapping intervals and combine + // adjacent intervals. The intervals are stored in the map with the ending + // offset as the key (in the half-open sense) and the starting offset as + // the value. + int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + Later.Size; + + // Find any intervals ending at, or after, LaterIntStart which start + // before LaterIntEnd. + auto ILI = IM.lower_bound(LaterIntStart); + if (ILI != IM.end() && ILI->second <= LaterIntEnd) { + // This existing interval is overlapped with the current store somewhere + // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing + // intervals and adjusting our start and end. + LaterIntStart = std::min(LaterIntStart, ILI->second); + LaterIntEnd = std::max(LaterIntEnd, ILI->first); + ILI = IM.erase(ILI); + + // Continue erasing and adjusting our end in case other previous + // intervals are also overlapped with the current store. + // + // |--- ealier 1 ---| |--- ealier 2 ---| + // |------- later---------| + // + while (ILI != IM.end() && ILI->second <= LaterIntEnd) { + assert(ILI->second > LaterIntStart && "Unexpected interval"); + LaterIntEnd = std::max(LaterIntEnd, ILI->first); + ILI = IM.erase(ILI); + } + } + + IM[LaterIntEnd] = LaterIntStart; + + ILI = IM.begin(); + if (ILI->second <= EarlierOff && + ILI->first >= int64_t(EarlierOff + Earlier.Size)) { + DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" << + EarlierOff << ", " << + int64_t(EarlierOff + Earlier.Size) << + ") Composite Later [" << + ILI->second << ", " << ILI->first << ")\n"); + ++NumCompletePartials; + return OverwriteComplete; + } + } + + // Another interesting case is if the later store overwrites the end of the + // earlier store. // // |--earlier--| // |-- later --| @@ -429,11 +443,25 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)) return OverwriteEnd; + // Finally, we also need to check if the later store overwrites the beginning + // of the earlier store. + // + // |--earlier--| + // |-- later --| + // + // In this case we may want to move the destination address and trim the size + // of earlier to avoid generating writes to addresses which will definitely + // be overwritten later. + if (LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff) { + assert (int64_t(LaterOff + Later.Size) < int64_t(EarlierOff + Earlier.Size) + && "Expect to be handled as OverwriteComplete" ); + return OverwriteBegin; + } // Otherwise, they don't completely overlap. return OverwriteUnknown; } -/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a +/// If 'Inst' might be a self read (i.e. a noop copy of a /// memory region into an identical pointer) then it doesn't actually make its /// input dead in the traditional sense. 
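The interval bookkeeping added above can be shown in isolation. This is a minimal sketch, not the pass's InstOverlapIntervalsTy: half-open intervals are keyed by end offset with the start offset as the mapped value, overlapping or adjacent later stores are merged, and full coverage of the earlier store is detected from the first (smallest-ending) interval, which is only sound because every recorded interval overlaps the earlier store.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>

using IntervalMap = std::map<int64_t, int64_t>; // end offset -> start offset

// Record the half-open interval [Start, End), merging any overlapping or
// adjacent intervals already present.
static void addInterval(IntervalMap &IM, int64_t Start, int64_t End) {
  auto ILI = IM.lower_bound(Start); // first interval ending at or after Start
  while (ILI != IM.end() && ILI->second <= End) {
    Start = std::min(Start, ILI->second);
    End = std::max(End, ILI->first);
    ILI = IM.erase(ILI);
  }
  IM[End] = Start;
}

// True if the recorded intervals cover all of [EarlierOff, EarlierOff+Size).
// Checking only the first interval mirrors the pass's check and is valid as
// long as every inserted interval overlaps the earlier store.
static bool coversEarlier(const IntervalMap &IM, int64_t EarlierOff,
                          int64_t Size) {
  if (IM.empty())
    return false;
  auto First = IM.begin();
  return First->second <= EarlierOff && First->first >= EarlierOff + Size;
}

int main() {
  IntervalMap IM;
  addInterval(IM, 0, 4);            // later store #1
  addInterval(IM, 4, 8);            // later store #2, adjacent: merges to [0, 8)
  assert(coversEarlier(IM, 0, 8));  // an earlier 8-byte store is fully shadowed
  assert(!coversEarlier(IM, 0, 12));
  return 0;
}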
Consider this case: /// @@ -478,192 +506,13 @@ static bool isPossibleSelfRead(Instruction *Inst, } -//===----------------------------------------------------------------------===// -// DSE Pass -//===----------------------------------------------------------------------===// - -bool DSE::runOnBasicBlock(BasicBlock &BB) { - const DataLayout &DL = BB.getModule()->getDataLayout(); - bool MadeChange = false; - - // Do a top-down walk on the BB. - for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { - Instruction *Inst = &*BBI++; - - // Handle 'free' calls specially. - if (CallInst *F = isFreeCall(Inst, TLI)) { - MadeChange |= HandleFree(F); - continue; - } - - // If we find something that writes memory, get its memory dependence. - if (!hasMemoryWrite(Inst, *TLI)) - continue; - - // If we're storing the same value back to a pointer that we just - // loaded from, then the store can be removed. - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - - auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) { - // DeleteDeadInstruction can delete the current instruction. Save BBI - // in case we need it. - WeakVH NextInst(&*BBI); - - DeleteDeadInstruction(DeadInst, *MD, *TLI); - - if (!NextInst) // Next instruction deleted. - BBI = BB.begin(); - else if (BBI != BB.begin()) // Revisit this instruction if possible. - --BBI; - ++NumRedundantStores; - MadeChange = true; - }; - - if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) { - if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - isRemovable(SI) && - MemoryIsNotModifiedBetween(DepLoad, SI)) { - - DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " - << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - - RemoveDeadInstAndUpdateBBI(SI); - continue; - } - } - - // Remove null stores into the calloc'ed objects - Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand()); - - if (StoredConstant && StoredConstant->isNullValue() && - isRemovable(SI)) { - Instruction *UnderlyingPointer = dyn_cast<Instruction>( - GetUnderlyingObject(SI->getPointerOperand(), DL)); - - if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && - MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) { - DEBUG(dbgs() - << "DSE: Remove null store to the calloc'ed object:\n DEAD: " - << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); - - RemoveDeadInstAndUpdateBBI(SI); - continue; - } - } - } - - MemDepResult InstDep = MD->getDependency(Inst); - - // Ignore any store where we can't find a local dependence. - // FIXME: cross-block DSE would be fun. :) - if (!InstDep.isDef() && !InstDep.isClobber()) - continue; - - // Figure out what location is being stored to. - MemoryLocation Loc = getLocForWrite(Inst, *AA); - - // If we didn't get a useful location, fail. - if (!Loc.Ptr) - continue; - - while (InstDep.isDef() || InstDep.isClobber()) { - // Get the memory clobbered by the instruction we depend on. MemDep will - // skip any instructions that 'Loc' clearly doesn't interact with. If we - // end up depending on a may- or must-aliased load, then we can't optimize - // away the store and we bail out. However, if we depend on on something - // that overwrites the memory location we *can* potentially optimize it. - // - // Find out what memory location the dependent instruction stores. - Instruction *DepWrite = InstDep.getInst(); - MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA); - // If we didn't get a useful location, or if it isn't a size, bail out. 
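At source level, the two store-specific cases handled above look roughly like the following sketch; whether a given instance is actually removed still depends on isRemovable() and on proving that nothing modifies the memory in between.

#include <cstdlib>

// Storing a value straight back to the pointer it was just loaded from is a
// no-op store.
int roundTrip(int *P) {
  int V = *P;
  *P = V;      // dead: nothing modifies *P between the load and the store
  return V;
}

// Storing zero into memory obtained from calloc repeats what calloc already did.
int *zeroed() {
  int *P = static_cast<int *>(std::calloc(4, sizeof(int)));
  if (P)
    P[2] = 0;  // dead: the calloc'ed object is already zero
  return P;
}

int main() {
  int X = 1;
  roundTrip(&X);
  std::free(zeroed());
  return 0;
}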
- if (!DepLoc.Ptr) - break; - - // If we find a write that is a) removable (i.e., non-volatile), b) is - // completely obliterated by the store to 'Loc', and c) which we know that - // 'Inst' doesn't load from, then we can remove it. - if (isRemovable(DepWrite) && - !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { - int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = - isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset); - if (OR == OverwriteComplete) { - DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " - << *DepWrite << "\n KILLER: " << *Inst << '\n'); - - // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepWrite, *MD, *TLI); - ++NumFastStores; - MadeChange = true; - - // DeleteDeadInstruction can delete the current instruction in loop - // cases, reset BBI. - BBI = Inst->getIterator(); - if (BBI != BB.begin()) - --BBI; - break; - } else if (OR == OverwriteEnd && isShortenable(DepWrite)) { - // TODO: base this on the target vector size so that if the earlier - // store was too small to get vector writes anyway then its likely - // a good idea to shorten it - // Power of 2 vector writes are probably always a bad idea to optimize - // as any store/memset/memcpy is likely using vector instructions so - // shortening it to not vector size is likely to be slower - MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite); - unsigned DepWriteAlign = DepIntrinsic->getAlignment(); - if (llvm::isPowerOf2_64(InstWriteOffset) || - ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) { - - DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " - << *DepWrite << "\n KILLER (offset " - << InstWriteOffset << ", " - << DepLoc.Size << ")" - << *Inst << '\n'); - - Value* DepWriteLength = DepIntrinsic->getLength(); - Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(), - InstWriteOffset - - DepWriteOffset); - DepIntrinsic->setLength(TrimmedLength); - MadeChange = true; - } - } - } - - // If this is a may-aliased store that is clobbering the store value, we - // can keep searching past it for another must-aliased pointer that stores - // to the same location. For example, in: - // store -> P - // store -> Q - // store -> P - // we can remove the first store to P even though we don't know if P and Q - // alias. - if (DepWrite == &BB.front()) break; - - // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) - break; - - InstDep = MD->getPointerDependencyFrom(Loc, false, - DepWrite->getIterator(), &BB); - } - } - - // If this block ends in a return, unwind, or unreachable, all allocas are - // dead at its end, which means stores to them are also dead. - if (BB.getTerminator()->getNumSuccessors() == 0) - MadeChange |= handleEndBlock(BB); - - return MadeChange; -} - /// Returns true if the memory which is accessed by the second instruction is not /// modified between the first and the second instruction. /// Precondition: Second instruction must be dominated by the first /// instruction. -bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI, - Instruction *SecondI) { +static bool memoryIsNotModifiedBetween(Instruction *FirstI, + Instruction *SecondI, + AliasAnalysis *AA) { SmallVector<BasicBlock *, 16> WorkList; SmallPtrSet<BasicBlock *, 8> Visited; BasicBlock::iterator FirstBBI(FirstI); @@ -718,7 +567,7 @@ bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI, /// Find all blocks that will unconditionally lead to the block BB and append /// them to F. 
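memoryIsNotModifiedBetween, shown above, is essentially a backward worklist walk over basic blocks with a visited set. The sketch below models that shape on a toy CFG; Block and ModifiesLoc are stand-ins for BasicBlock and the AA->getModRefInfo(..., MRI_Mod) query, and the per-instruction scanning inside the two end blocks is deliberately omitted.

#include <cstdio>
#include <set>
#include <vector>

struct Block {
  bool ModifiesLoc;            // stand-in for "contains a clobbering write"
  std::vector<Block *> Preds;  // predecessor blocks
};

// Precondition (as in the pass): every path to Second's block goes through
// First's block.
static bool notModifiedBetween(Block *First, Block *Second) {
  std::vector<Block *> Worklist{Second};
  std::set<Block *> Visited;
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(B).second)
      continue;                // already examined on another path
    if (B != Second && B->ModifiesLoc)
      return false;            // some path between the two points clobbers Loc
    if (B == First)
      continue;                // reached the defining block; stop this path
    for (Block *P : B->Preds)
      Worklist.push_back(P);
  }
  return true;
}

int main() {
  // Diamond CFG: A -> {B, C} -> D, where C contains a clobbering write.
  Block A{false, {}}, B{false, {&A}}, C{true, {&A}}, D{false, {&B, &C}};
  std::printf("%d\n", notModifiedBetween(&A, &D)); // 0: the A->C->D path clobbers
  C.ModifiesLoc = false;
  std::printf("%d\n", notModifiedBetween(&A, &D)); // 1: no path clobbers
  return 0;
}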
-static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, +static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, BasicBlock *BB, DominatorTree *DT) { for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { BasicBlock *Pred = *I; @@ -732,9 +581,11 @@ static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, } } -/// HandleFree - Handle frees of entire structures whose dependency is a store +/// Handle frees of entire structures whose dependency is a store /// to a field of that structure. -bool DSE::HandleFree(CallInst *F) { +static bool handleFree(CallInst *F, AliasAnalysis *AA, + MemoryDependenceResults *MD, DominatorTree *DT, + const TargetLibraryInfo *TLI) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); @@ -761,10 +612,9 @@ bool DSE::HandleFree(CallInst *F) { if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; - auto Next = ++Dependency->getIterator(); - - // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD, *TLI); + // DCE instructions only used to calculate that store. + BasicBlock::iterator BBI(Dependency); + deleteDeadInstruction(Dependency, &BBI, *MD, *TLI); ++NumFastStores; MadeChange = true; @@ -773,23 +623,53 @@ bool DSE::HandleFree(CallInst *F) { // s[0] = 0; // s[1] = 0; // This has just been deleted. // free(s); - Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB); + Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB); } if (Dep.isNonLocal()) - FindUnconditionalPreds(Blocks, BB, DT); + findUnconditionalPreds(Blocks, BB, DT); } return MadeChange; } -/// handleEndBlock - Remove dead stores to stack-allocated locations in the -/// function end block. Ex: +/// Check to see if the specified location may alias any of the stack objects in +/// the DeadStackObjects set. If so, they become live because the location is +/// being loaded. +static void removeAccessedObjects(const MemoryLocation &LoadedLoc, + SmallSetVector<Value *, 16> &DeadStackObjects, + const DataLayout &DL, AliasAnalysis *AA, + const TargetLibraryInfo *TLI) { + const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL); + + // A constant can't be in the dead pointer set. + if (isa<Constant>(UnderlyingPointer)) + return; + + // If the kill pointer can be easily reduced to an alloca, don't bother doing + // extraneous AA queries. + if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { + DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer)); + return; + } + + // Remove objects that could alias LoadedLoc. + DeadStackObjects.remove_if([&](Value *I) { + // See if the loaded location could alias the stack location. + MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI)); + return !AA->isNoAlias(StackLoc, LoadedLoc); + }); +} + +/// Remove dead stores to stack-allocated locations in the function end block. +/// Ex: /// %A = alloca i32 /// ... /// store i32 1, i32* %A /// ret void -bool DSE::handleEndBlock(BasicBlock &BB) { +static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, + MemoryDependenceResults *MD, + const TargetLibraryInfo *TLI) { bool MadeChange = false; // Keep track of all of the stack objects that are dead at the end of the @@ -828,15 +708,14 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Stores to stack values are valid candidates for removal. 
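In source terms, the two situations handleFree and handleEndBlock are built around look roughly like this sketch (an illustration of the reasoning, not a guarantee that any particular configuration performs the deletion):

#include <cstdlib>

// handleFree: stores whose only possible observer is the subsequent free()
// are dead.
void clearThenFree(char *S) {
  S[0] = 0;   // dead: only free(S) follows
  S[1] = 0;   // dead
  std::free(S);
}

// handleEndBlock: stores to stack objects that never escape are dead once
// the function returns.
int endOfFunction() {
  int Buf[4];
  Buf[0] = 1; // dead: Buf is never read and never escapes
  return 42;
}

int main() {
  char *S = static_cast<char *>(std::malloc(2));
  if (S)
    clearThenFree(S);
  return endOfFunction() == 42 ? 0 : 1;
}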
bool AllDead = true; - for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), - E = Pointers.end(); I != E; ++I) - if (!DeadStackObjects.count(*I)) { + for (Value *Pointer : Pointers) + if (!DeadStackObjects.count(Pointer)) { AllDead = false; break; } if (AllDead) { - Instruction *Dead = &*BBI++; + Instruction *Dead = &*BBI; DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Objects: "; @@ -849,7 +728,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects); + deleteDeadInstruction(Dead, &BBI, *MD, *TLI, &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -858,8 +737,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Remove any dead non-memory-mutating instructions. if (isInstructionTriviallyDead(&*BBI, TLI)) { - Instruction *Inst = &*BBI++; - DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects); + deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -873,7 +751,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } if (auto CS = CallSite(&*BBI)) { - // Remove allocation function calls from the list of dead stack objects; + // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. if (isAllocLikeFn(&*BBI, TLI)) DeadStackObjects.remove(&*BBI); @@ -900,6 +778,14 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } + // We can remove the dead stores, irrespective of the fence and its ordering + // (release/acquire/seq_cst). Fences only constraints the ordering of + // already visible stores, it does not make a store visible to other + // threads. So, skipping over a fence does not change a store from being + // dead. + if (isa<FenceInst>(*BBI)) + continue; + MemoryLocation LoadedLoc; // If we encounter a use of the pointer, it is no longer considered dead @@ -922,7 +808,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Remove any allocas from the DeadPointer set that are loaded, as this // makes any stores above the access live. - RemoveAccessedObjects(LoadedLoc, DeadStackObjects, DL); + removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI); // If all of the allocas were clobbered by the access then we're not going // to find anything else to process. @@ -933,29 +819,285 @@ bool DSE::handleEndBlock(BasicBlock &BB) { return MadeChange; } -/// RemoveAccessedObjects - Check to see if the specified location may alias any -/// of the stack objects in the DeadStackObjects set. If so, they become live -/// because the location is being loaded. -void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc, - SmallSetVector<Value *, 16> &DeadStackObjects, - const DataLayout &DL) { - const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL); +static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, + AliasAnalysis *AA, MemoryDependenceResults *MD, + const DataLayout &DL, + const TargetLibraryInfo *TLI) { + // Must be a store instruction. + StoreInst *SI = dyn_cast<StoreInst>(Inst); + if (!SI) + return false; - // A constant can't be in the dead pointer set. - if (isa<Constant>(UnderlyingPointer)) - return; + // If we're storing the same value back to a pointer that we just loaded from, + // then the store can be removed. 
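The fence comment above corresponds, at source level, to a case like the sketch below: the store never becomes visible to any other thread, so a release fence between it and the end of the function does not keep it alive. This only illustrates the reasoning in the comment.

#include <atomic>

int acrossFence() {
  int Buf[2];
  Buf[0] = 1;                                           // dead: Buf never escapes
  std::atomic_thread_fence(std::memory_order_release);  // orders visible stores only
  return 42;                                            // Buf is never read
}

int main() { return acrossFence() == 42 ? 0 : 1; }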
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) { + if (SI->getPointerOperand() == DepLoad->getPointerOperand() && + isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) { - // If the kill pointer can be easily reduced to an alloca, don't bother doing - // extraneous AA queries. - if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { - DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer)); - return; + DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: " + << *DepLoad << "\n STORE: " << *SI << '\n'); + + deleteDeadInstruction(SI, &BBI, *MD, *TLI); + ++NumRedundantStores; + return true; + } } - // Remove objects that could alias LoadedLoc. - DeadStackObjects.remove_if([&](Value *I) { - // See if the loaded location could alias the stack location. - MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI)); - return !AA->isNoAlias(StackLoc, LoadedLoc); - }); + // Remove null stores into the calloc'ed objects + Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand()); + if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) { + Instruction *UnderlyingPointer = + dyn_cast<Instruction>(GetUnderlyingObject(SI->getPointerOperand(), DL)); + + if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && + memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) { + DEBUG( + dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: " + << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); + + deleteDeadInstruction(SI, &BBI, *MD, *TLI); + ++NumRedundantStores; + return true; + } + } + return false; +} + +static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, + MemoryDependenceResults *MD, DominatorTree *DT, + const TargetLibraryInfo *TLI) { + const DataLayout &DL = BB.getModule()->getDataLayout(); + bool MadeChange = false; + + // A map of interval maps representing partially-overwritten value parts. + InstOverlapIntervalsTy IOL; + + // Do a top-down walk on the BB. + for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { + // Handle 'free' calls specially. + if (CallInst *F = isFreeCall(&*BBI, TLI)) { + MadeChange |= handleFree(F, AA, MD, DT, TLI); + // Increment BBI after handleFree has potentially deleted instructions. + // This ensures we maintain a valid iterator. + ++BBI; + continue; + } + + Instruction *Inst = &*BBI++; + + // Check to see if Inst writes to memory. If not, continue. + if (!hasMemoryWrite(Inst, *TLI)) + continue; + + // eliminateNoopStore will update in iterator, if necessary. + if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI)) { + MadeChange = true; + continue; + } + + // If we find something that writes memory, get its memory dependence. + MemDepResult InstDep = MD->getDependency(Inst); + + // Ignore any store where we can't find a local dependence. + // FIXME: cross-block DSE would be fun. :) + if (!InstDep.isDef() && !InstDep.isClobber()) + continue; + + // Figure out what location is being stored to. + MemoryLocation Loc = getLocForWrite(Inst, *AA); + + // If we didn't get a useful location, fail. + if (!Loc.Ptr) + continue; + + while (InstDep.isDef() || InstDep.isClobber()) { + // Get the memory clobbered by the instruction we depend on. MemDep will + // skip any instructions that 'Loc' clearly doesn't interact with. If we + // end up depending on a may- or must-aliased load, then we can't optimize + // away the store and we bail out. 
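The dependency walk that follows keeps searching past may-aliased stores, for the reason spelled out in the "store -> P / store -> Q / store -> P" comment; in source form the situation is simply:

void storePQP(int *P, int *Q) {
  *P = 1; // dead: killed by the final store to P, whether or not P aliases Q
  *Q = 2;
  *P = 3;
}

int main() {
  int A = 0, B = 0;
  storePQP(&A, &B);
  return A == 3 && B == 2 ? 0 : 1;
}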
However, if we depend on something + // that overwrites the memory location we *can* potentially optimize it. + // + // Find out what memory location the dependent instruction stores. + Instruction *DepWrite = InstDep.getInst(); + MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA); + // If we didn't get a useful location, or if it isn't a size, bail out. + if (!DepLoc.Ptr) + break; + + // If we find a write that is a) removable (i.e., non-volatile), b) is + // completely obliterated by the store to 'Loc', and c) which we know that + // 'Inst' doesn't load from, then we can remove it. + if (isRemovable(DepWrite) && + !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { + int64_t InstWriteOffset, DepWriteOffset; + OverwriteResult OR = + isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset, + DepWrite, IOL); + if (OR == OverwriteComplete) { + DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " + << *DepWrite << "\n KILLER: " << *Inst << '\n'); + + // Delete the store and now-dead instructions that feed it. + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI); + ++NumFastStores; + MadeChange = true; + + // We erased DepWrite; start over. + InstDep = MD->getDependency(Inst); + continue; + } else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) || + ((OR == OverwriteBegin && + isShortenableAtTheBeginning(DepWrite)))) { + // TODO: base this on the target vector size so that if the earlier + // store was too small to get vector writes anyway then its likely + // a good idea to shorten it + // Power of 2 vector writes are probably always a bad idea to optimize + // as any store/memset/memcpy is likely using vector instructions so + // shortening it to not vector size is likely to be slower + MemIntrinsic *DepIntrinsic = cast<MemIntrinsic>(DepWrite); + unsigned DepWriteAlign = DepIntrinsic->getAlignment(); + bool IsOverwriteEnd = (OR == OverwriteEnd); + if (!IsOverwriteEnd) + InstWriteOffset = int64_t(InstWriteOffset + Loc.Size); + + if ((llvm::isPowerOf2_64(InstWriteOffset) && + DepWriteAlign <= InstWriteOffset) || + ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) { + + DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW " + << (IsOverwriteEnd ? "END" : "BEGIN") << ": " + << *DepWrite << "\n KILLER (offset " + << InstWriteOffset << ", " << DepLoc.Size << ")" + << *Inst << '\n'); + + int64_t NewLength = + IsOverwriteEnd + ? InstWriteOffset - DepWriteOffset + : DepLoc.Size - (InstWriteOffset - DepWriteOffset); + + Value *DepWriteLength = DepIntrinsic->getLength(); + Value *TrimmedLength = + ConstantInt::get(DepWriteLength->getType(), NewLength); + DepIntrinsic->setLength(TrimmedLength); + + if (!IsOverwriteEnd) { + int64_t OffsetMoved = (InstWriteOffset - DepWriteOffset); + Value *Indices[1] = { + ConstantInt::get(DepWriteLength->getType(), OffsetMoved)}; + GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds( + DepIntrinsic->getRawDest(), Indices, "", DepWrite); + DepIntrinsic->setDest(NewDestGEP); + } + MadeChange = true; + } + } + } + + // If this is a may-aliased store that is clobbering the store value, we + // can keep searching past it for another must-aliased pointer that stores + // to the same location. For example, in: + // store -> P + // store -> Q + // store -> P + // we can remove the first store to P even though we don't know if P and Q + // alias. + if (DepWrite == &BB.front()) break; + + // Can't look past this instruction if it might read 'Loc'. 
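The length and destination adjustments in the shortening code above reduce to a little offset arithmetic, sketched here on its own (offsets in bytes relative to the same base object; the helper names are illustrative only):

#include <cassert>
#include <cstdint>

struct Trim {
  int64_t NewLength;  // bytes the earlier memset should still write
  int64_t DestAdjust; // bytes to advance its destination pointer
};

// OverwriteEnd: the later store covers [LaterOff, DepOff+DepSize), so keep
// only the prefix of the earlier write.
static Trim shortenEnd(int64_t DepOff, int64_t DepSize, int64_t LaterOff) {
  (void)DepSize;
  return {LaterOff - DepOff, 0};
}

// OverwriteBegin: the later store covers [DepOff, LaterOff+LaterSize), so keep
// only the suffix and move the destination forward by the covered amount.
static Trim shortenBegin(int64_t DepOff, int64_t DepSize, int64_t LaterOff,
                         int64_t LaterSize) {
  int64_t Covered = LaterOff + LaterSize - DepOff;
  return {DepSize - Covered, Covered};
}

int main() {
  // memset of 16 bytes at offset 0, later 8-byte store at offset 8:
  // keep the first 8 bytes.
  assert(shortenEnd(0, 16, 8).NewLength == 8);
  // memset of 16 bytes at offset 0, later 8-byte store at offset 0:
  // keep the last 8 bytes, starting 8 bytes in.
  Trim T = shortenBegin(0, 16, 0, 8);
  assert(T.NewLength == 8 && T.DestAdjust == 8);
  return 0;
}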
+ if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) + break; + + InstDep = MD->getPointerDependencyFrom(Loc, false, + DepWrite->getIterator(), &BB); + } + } + + // If this block ends in a return, unwind, or unreachable, all allocas are + // dead at its end, which means stores to them are also dead. + if (BB.getTerminator()->getNumSuccessors() == 0) + MadeChange |= handleEndBlock(BB, AA, MD, TLI); + + return MadeChange; +} + +static bool eliminateDeadStores(Function &F, AliasAnalysis *AA, + MemoryDependenceResults *MD, DominatorTree *DT, + const TargetLibraryInfo *TLI) { + bool MadeChange = false; + for (BasicBlock &BB : F) + // Only check non-dead blocks. Dead blocks may have strange pointer + // cycles that will confuse alias analysis. + if (DT->isReachableFromEntry(&BB)) + MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI); + return MadeChange; +} + +//===----------------------------------------------------------------------===// +// DSE Pass +//===----------------------------------------------------------------------===// +PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { + AliasAnalysis *AA = &AM.getResult<AAManager>(F); + DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); + MemoryDependenceResults *MD = &AM.getResult<MemoryDependenceAnalysis>(F); + const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F); + + if (!eliminateDeadStores(F, AA, MD, DT, TLI)) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<GlobalsAA>(); + PA.preserve<MemoryDependenceAnalysis>(); + return PA; +} + +namespace { +/// A legacy pass for the legacy pass manager that wraps \c DSEPass. +class DSELegacyPass : public FunctionPass { +public: + DSELegacyPass() : FunctionPass(ID) { + initializeDSELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + MemoryDependenceResults *MD = + &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + + return eliminateDeadStores(F, AA, MD, DT, TLI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<MemoryDependenceWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<MemoryDependenceWrapperPass>(); + } + + static char ID; // Pass identification, replacement for typeid +}; +} // end anonymous namespace + +char DSELegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false, + false) + +FunctionPass *llvm::createDeadStoreEliminationPass() { + return new DSELegacyPass(); } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 7ef062e71ff3a..9d0ef42e0396d 100644 --- 
a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,8 +16,8 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -40,6 +40,7 @@ using namespace llvm::PatternMatch; STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd"); STATISTIC(NumCSE, "Number of instructions CSE'd"); +STATISTIC(NumCSECVP, "Number of compare instructions CVP'd"); STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); STATISTIC(NumCSECall, "Number of call instructions CSE'd"); STATISTIC(NumDSE, "Number of trivial dead stores removed"); @@ -97,15 +98,6 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1)) std::swap(LHS, RHS); - if (isa<OverflowingBinaryOperator>(BinOp)) { - // Hash the overflow behavior - unsigned Overflow = - BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | - BinOp->hasNoUnsignedWrap() * - OverflowingBinaryOperator::NoUnsignedWrap; - return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS); - } - return hash_combine(BinOp->getOpcode(), LHS, RHS); } @@ -152,7 +144,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { if (LHSI->getOpcode() != RHSI->getOpcode()) return false; - if (LHSI->isIdenticalTo(RHSI)) + if (LHSI->isIdenticalToWhenDefined(RHSI)) return true; // If we're not strictly identical, we still might be a commutable instruction @@ -164,15 +156,6 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { "same opcode, but different instruction type?"); BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI); - // Check overflow attributes - if (isa<OverflowingBinaryOperator>(LHSBinOp)) { - assert(isa<OverflowingBinaryOperator>(RHSBinOp) && - "same opcode, but different operator type?"); - if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() || - LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap()) - return false; - } - // Commuted equality return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) && LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); @@ -296,16 +279,18 @@ public: /// present the table; it is the responsibility of the consumer to inspect /// the atomicity/volatility if needed. struct LoadValue { - Value *Data; + Instruction *DefInst; unsigned Generation; int MatchingId; bool IsAtomic; + bool IsInvariant; LoadValue() - : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {} - LoadValue(Value *Data, unsigned Generation, unsigned MatchingId, - bool IsAtomic) - : Data(Data), Generation(Generation), MatchingId(MatchingId), - IsAtomic(IsAtomic) {} + : DefInst(nullptr), Generation(0), MatchingId(-1), IsAtomic(false), + IsInvariant(false) {} + LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId, + bool IsAtomic, bool IsInvariant) + : DefInst(Inst), Generation(Generation), MatchingId(MatchingId), + IsAtomic(IsAtomic), IsInvariant(IsInvariant) {} }; typedef RecyclingAllocator<BumpPtrAllocator, ScopedHashTableVal<Value *, LoadValue>> @@ -318,7 +303,8 @@ public: /// values. /// /// It uses the same generation count as loads. 
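A toy model of the generation scheme that LoadValue participates in: available loads are remembered together with the memory generation at which they were seen, and anything that may write memory bumps the generation, invalidating all of them without walking the table. This only illustrates the idea; EarlyCSE uses a ScopedHashTable so entries also pop with the dominator-tree scope.

#include <cassert>
#include <map>
#include <string>

struct AvailableLoad {
  int Value;
  unsigned Generation;
};

struct LoadTable {
  unsigned CurrentGeneration = 0;
  std::map<std::string, AvailableLoad> Loads; // pointer name -> remembered load

  void recordLoad(const std::string &Ptr, int V) {
    Loads[Ptr] = {V, CurrentGeneration};
  }
  // Returns true and sets V if a load from Ptr is still valid to reuse.
  bool lookup(const std::string &Ptr, int &V) const {
    auto It = Loads.find(Ptr);
    if (It == Loads.end() || It->second.Generation != CurrentGeneration)
      return false;
    V = It->second.Value;
    return true;
  }
  void clobberMemory() { ++CurrentGeneration; } // any may-write instruction
};

int main() {
  LoadTable T;
  T.recordLoad("p", 7);
  int V;
  assert(T.lookup("p", V) && V == 7); // a second load of p can be CSE'd
  T.clobberMemory();                  // e.g. an opaque call or a store
  assert(!T.lookup("p", V));          // stale generation: must reload
  return 0;
}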
- typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType; + typedef ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>> + CallHTType; CallHTType AvailableCalls; /// \brief This is the current generation of the memory value. @@ -354,7 +340,7 @@ private: // Contains all the needed information to create a stack for doing a depth // first tranversal of the tree. This includes scopes for values, loads, and // calls as well as the generation. There is a child iterator so that the - // children do not need to be store spearately. + // children do not need to be store separately. class StackNode { public: StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, @@ -446,7 +432,12 @@ private: return true; } - + bool isInvariantLoad() const { + if (auto *LI = dyn_cast<LoadInst>(Inst)) + return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr; + return false; + } + bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { return (getPointerOperand() == Inst.getPointerOperand() && getMatchingId() == Inst.getMatchingId()); @@ -500,6 +491,7 @@ private: } bool EarlyCSE::processNode(DomTreeNode *Node) { + bool Changed = false; BasicBlock *BB = Node->getBlock(); // If this block has a single predecessor, then the predecessor is the parent @@ -513,7 +505,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If this node has a single predecessor which ends in a conditional branch, // we can infer the value of the branch condition given that we took this - // path. We need the single predeccesor to ensure there's not another path + // path. We need the single predecessor to ensure there's not another path // which reaches this block where the condition might hold a different // value. Since we're adding this to the scoped hash table (like any other // def), it will have been popped if we encounter a future merge block. @@ -530,9 +522,13 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '" << CondInst->getName() << "' as " << *ConditionalConstant << " in " << BB->getName() << "\n"); - // Replace all dominated uses with the known value - replaceDominatedUsesWith(CondInst, ConditionalConstant, DT, - BasicBlockEdge(Pred, BB)); + // Replace all dominated uses with the known value. + if (unsigned Count = + replaceDominatedUsesWith(CondInst, ConditionalConstant, DT, + BasicBlockEdge(Pred, BB))) { + Changed = true; + NumCSECVP = NumCSECVP + Count; + } } /// LastStore - Keep track of the last non-volatile store that we saw... for @@ -541,7 +537,6 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { /// stores which can occur in bitfield code among other things. Instruction *LastStore = nullptr; - bool Changed = false; const DataLayout &DL = BB->getModule()->getDataLayout(); // See if any instructions in the block can be eliminated. If so, do it. If @@ -567,15 +562,38 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) { + if (auto *CondI = + dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) { + // The condition we're on guarding here is true for all dominated + // locations. + if (SimpleValue::canHandle(CondI)) + AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext())); + } + + // Guard intrinsics read all memory, but don't write any memory. + // Accordingly, don't update the generation but consume the last store (to + // avoid an incorrect DSE). 
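In source form, the conditional-value propagation above amounts to something like the sketch below: inside a block that is only reachable when the branch condition held, dominated uses of the condition can be rewritten to the known constant. The pass does this on the IR branch condition via replaceDominatedUsesWith; this is only an illustration.

bool dominatedUse(int X) {
  bool C = (X == 42);
  if (C) {
    // Only reachable when C is true, so this use of C folds to 'true'.
    return C;
  }
  return false;
}

int main() { return dominatedUse(42) ? 0 : 1; }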
+ LastStore = nullptr; + continue; + } + // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); - Inst->replaceAllUsesWith(V); - Inst->eraseFromParent(); - Changed = true; - ++NumSimplify; - continue; + if (!Inst->use_empty()) { + Inst->replaceAllUsesWith(V); + Changed = true; + } + if (isInstructionTriviallyDead(Inst, &TLI)) { + Inst->eraseFromParent(); + Changed = true; + } + if (Changed) { + ++NumSimplify; + continue; + } } // If this is a simple instruction that we can value number, process it. @@ -583,6 +601,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // See if the instruction has an available value. If so, use it. if (Value *V = AvailableValues.lookup(Inst)) { DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n'); + if (auto *I = dyn_cast<Instruction>(V)) + I->andIRFlags(Inst); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); Changed = true; @@ -606,18 +626,25 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // If we have an available version of this load, and if it is the right - // generation, replace this instruction. + // generation or the load is known to be from an invariant location, + // replace this instruction. + // + // A dominating invariant load implies that the location loaded from is + // unchanging beginning at the point of the invariant load, so the load + // we're CSE'ing _away_ does not need to be invariant, only the available + // load we're CSE'ing _to_ does. LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); - if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && + if (InVal.DefInst != nullptr && + (InVal.Generation == CurrentGeneration || InVal.IsInvariant) && InVal.MatchingId == MemInst.getMatchingId() && // We don't yet handle removing loads with ordering of any kind. !MemInst.isVolatile() && MemInst.isUnordered() && // We can't replace an atomic load with one which isn't also atomic. InVal.IsAtomic >= MemInst.isAtomic()) { - Value *Op = getOrCreateResult(InVal.Data, Inst->getType()); + Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType()); if (Op != nullptr) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst - << " to: " << *InVal.Data << '\n'); + << " to: " << *InVal.DefInst << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(Op); Inst->eraseFromParent(); @@ -631,7 +658,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { AvailableLoads.insert( MemInst.getPointerOperand(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), - MemInst.isAtomic())); + MemInst.isAtomic(), MemInst.isInvariantLoad())); LastStore = nullptr; continue; } @@ -649,7 +676,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (CallValue::canHandle(Inst)) { // If we have an available version of this call, and if it is the right // generation, replace this instruction. - std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst); + std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst); if (InVal.first != nullptr && InVal.second == CurrentGeneration) { DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " << *InVal.first << '\n'); @@ -663,7 +690,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Otherwise, remember that we have this instruction. 
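A note on the andIRFlags call added in the CSE path above: since the hash and equality functions no longer look at wrap flags, an "add nsw" can now be CSE'd with a plain "add", so the surviving instruction presumably has to intersect its flags with those of the instruction it replaces. A toy model of that intersection, with invented names:

#include <cassert>

struct WrapFlags {
  bool NSW = false; // no signed wrap
  bool NUW = false; // no unsigned wrap
};

// The kept instruction may only claim a flag if both copies claimed it.
static WrapFlags intersect(WrapFlags Kept, WrapFlags Removed) {
  return {Kept.NSW && Removed.NSW, Kept.NUW && Removed.NUW};
}

int main() {
  WrapFlags A{true, true};   // add nuw nsw
  WrapFlags B{false, true};  // add nuw
  WrapFlags R = intersect(A, B);
  assert(!R.NSW && R.NUW);   // only nuw survives on the merged instruction
  return 0;
}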
AvailableCalls.insert( - Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration)); + Inst, std::pair<Instruction *, unsigned>(Inst, CurrentGeneration)); continue; } @@ -673,7 +700,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // to advance the generation. We do need to prevent DSE across the fence, // but that's handled above. if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) - if (FI->getOrdering() == Release) { + if (FI->getOrdering() == AtomicOrdering::Release) { assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above"); continue; } @@ -685,8 +712,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // the store originally was. if (MemInst.isValid() && MemInst.isStore()) { LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); - if (InVal.Data && - InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) && + if (InVal.DefInst && + InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) && InVal.Generation == CurrentGeneration && InVal.MatchingId == MemInst.getMatchingId() && // We don't yet handle removing stores with ordering of any kind. @@ -743,7 +770,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { AvailableLoads.insert( MemInst.getPointerOperand(), LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), - MemInst.isAtomic())); + MemInst.isAtomic(), /*IsInvariant=*/false)); // Remember that this was the last unordered store we saw for DSE. We // don't yet handle DSE on ordered or volatile stores since we don't @@ -818,11 +845,11 @@ bool EarlyCSE::run() { } PreservedAnalyses EarlyCSEPass::run(Function &F, - AnalysisManager<Function> *AM) { - auto &TLI = AM->getResult<TargetLibraryAnalysis>(F); - auto &TTI = AM->getResult<TargetIRAnalysis>(F); - auto &DT = AM->getResult<DominatorTreeAnalysis>(F); - auto &AC = AM->getResult<AssumptionAnalysis>(F); + AnalysisManager<Function> &AM) { + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); EarlyCSE CSE(TLI, TTI, DT, AC); @@ -833,6 +860,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, // FIXME: Bundle this with other CFG-preservation. 
PreservedAnalyses PA; PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<GlobalsAA>(); return PA; } @@ -853,7 +881,7 @@ public: } bool runOnFunction(Function &F) override { - if (skipOptnoneFunction(F)) + if (skipFunction(F)) return false; auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); diff --git a/lib/Transforms/Scalar/Float2Int.cpp b/lib/Transforms/Scalar/Float2Int.cpp index 7f5d78656b50b..7aa6dc6992b61 100644 --- a/lib/Transforms/Scalar/Float2Int.cpp +++ b/lib/Transforms/Scalar/Float2Int.cpp @@ -13,15 +13,13 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "float2int" + +#include "llvm/Transforms/Scalar/Float2Int.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" -#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" @@ -53,41 +51,31 @@ MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden, "(default=64)")); namespace { - struct Float2Int : public FunctionPass { + struct Float2IntLegacyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid - Float2Int() : FunctionPass(ID) { - initializeFloat2IntPass(*PassRegistry::getPassRegistry()); + Float2IntLegacyPass() : FunctionPass(ID) { + initializeFloat2IntLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + return Impl.runImpl(F); } - bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addPreserved<GlobalsAAWrapperPass>(); } - void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots); - ConstantRange seen(Instruction *I, ConstantRange R); - ConstantRange badRange(); - ConstantRange unknownRange(); - ConstantRange validateRange(ConstantRange R); - void walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots); - void walkForwards(); - bool validateAndTransform(); - Value *convert(Instruction *I, Type *ToTy); - void cleanup(); - - MapVector<Instruction*, ConstantRange > SeenInsts; - SmallPtrSet<Instruction*,8> Roots; - EquivalenceClasses<Instruction*> ECs; - MapVector<Instruction*, Value*> ConvertedInsts; - LLVMContext *Ctx; + private: + Float2IntPass Impl; }; } -char Float2Int::ID = 0; -INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false) +char Float2IntLegacyPass::ID = 0; +INITIALIZE_PASS(Float2IntLegacyPass, "float2int", "Float to int", false, false) // Given a FCmp predicate, return a matching ICmp predicate if one // exists, otherwise return BAD_ICMP_PREDICATE. @@ -129,7 +117,7 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. 
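For orientation, the shape of code Float2Int targets looks like this at source level: an integer flows into FP arithmetic and back out through a conversion root. Whether this particular snippet is actually rewritten depends on the range analysis that follows; it is only meant to illustrate the root/def-use walk described below.

int scaleViaFloat(int X) {
  // sitofp -> fmul by a constant that is exactly an integer -> fptosi:
  // a candidate for being done entirely in the integer domain when the
  // value ranges involved are small enough.
  return static_cast<int>(static_cast<float>(X) * 3.0f);
}

int main() { return scaleViaFloat(4) == 12 ? 0 : 1; }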
-void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { +void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { for (auto &I : instructions(F)) { if (isa<VectorType>(I.getType())) continue; @@ -149,7 +137,7 @@ void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { } // Helper - mark I as having been traversed, having range R. -ConstantRange Float2Int::seen(Instruction *I, ConstantRange R) { +ConstantRange Float2IntPass::seen(Instruction *I, ConstantRange R) { DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n"); if (SeenInsts.find(I) != SeenInsts.end()) SeenInsts.find(I)->second = R; @@ -159,13 +147,13 @@ ConstantRange Float2Int::seen(Instruction *I, ConstantRange R) { } // Helper - get a range representing a poison value. -ConstantRange Float2Int::badRange() { +ConstantRange Float2IntPass::badRange() { return ConstantRange(MaxIntegerBW + 1, true); } -ConstantRange Float2Int::unknownRange() { +ConstantRange Float2IntPass::unknownRange() { return ConstantRange(MaxIntegerBW + 1, false); } -ConstantRange Float2Int::validateRange(ConstantRange R) { +ConstantRange Float2IntPass::validateRange(ConstantRange R) { if (R.getBitWidth() > MaxIntegerBW + 1) return badRange(); return R; @@ -185,7 +173,7 @@ ConstantRange Float2Int::validateRange(ConstantRange R) { // Breadth-first walk of the use-def graph; determine the set of nodes // we care about and eagerly determine if some of them are poisonous. -void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { +void Float2IntPass::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { std::deque<Instruction*> Worklist(Roots.begin(), Roots.end()); while (!Worklist.empty()) { Instruction *I = Worklist.back(); @@ -246,8 +234,8 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { // Walk forwards down the list of seen instructions, so we visit defs before // uses. -void Float2Int::walkForwards() { - for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) { +void Float2IntPass::walkForwards() { + for (auto &It : reverse(SeenInsts)) { if (It.second != unknownRange()) continue; @@ -318,7 +306,7 @@ void Float2Int::walkForwards() { // Instead, we ask APFloat to round itself to an integral value - this // preserves sign-of-zero - then compare the result with the original. // - APFloat F = CF->getValueAPF(); + const APFloat &F = CF->getValueAPF(); // First, weed out obviously incorrect values. Non-finite numbers // can't be represented and neither can negative zero, unless @@ -357,7 +345,7 @@ void Float2Int::walkForwards() { } // If there is a valid transform to be done, do it. -bool Float2Int::validateAndTransform() { +bool Float2IntPass::validateAndTransform() { bool MadeChange = false; // Iterate over every disjoint partition of the def-use graph. @@ -439,7 +427,7 @@ bool Float2Int::validateAndTransform() { return MadeChange; } -Value *Float2Int::convert(Instruction *I, Type *ToTy) { +Value *Float2IntPass::convert(Instruction *I, Type *ToTy) { if (ConvertedInsts.find(I) != ConvertedInsts.end()) // Already converted this instruction. return ConvertedInsts[I]; @@ -511,15 +499,12 @@ Value *Float2Int::convert(Instruction *I, Type *ToTy) { } // Perform dead code elimination on the instructions we just modified. 
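The "round and compare" check described in the walkForwards comment above can be sketched without APFloat; std::nearbyint stands in for APFloat::roundToIntegral, and negative zero is rejected up front, which is the conservative reading of the comment.

#include <cassert>
#include <cmath>

static bool isExactInteger(double V) {
  if (!std::isfinite(V))
    return false;                 // inf/NaN cannot be represented
  if (V == 0.0 && std::signbit(V))
    return false;                 // -0.0 has no integer equivalent
  return std::nearbyint(V) == V;  // round to integral, compare with original
}

int main() {
  assert(isExactInteger(3.0));
  assert(!isExactInteger(3.5));
  assert(!isExactInteger(-0.0));
  return 0;
}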
-void Float2Int::cleanup() { - for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend())) +void Float2IntPass::cleanup() { + for (auto &I : reverse(ConvertedInsts)) I.first->eraseFromParent(); } -bool Float2Int::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - +bool Float2IntPass::runImpl(Function &F) { DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n"); // Clear out all state. ECs = EquivalenceClasses<Instruction*>(); @@ -540,4 +525,17 @@ bool Float2Int::runOnFunction(Function &F) { return Modified; } -FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); } +namespace llvm { +FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); } + +PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) { + if (!runImpl(F)) + return PreservedAnalyses::all(); + else { + // FIXME: This should also 'preserve the CFG'. + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + return PA; + } +} +} // End namespace llvm diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index a028b8c444bae..a35a1062cbcd8 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -15,7 +15,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" @@ -44,7 +44,6 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -53,6 +52,7 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> using namespace llvm; +using namespace llvm::gvn; using namespace PatternMatch; #define DEBUG_TYPE "gvn" @@ -74,106 +74,167 @@ static cl::opt<uint32_t> MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, cl::desc("Max recurse depth (default = 1000)")); -//===----------------------------------------------------------------------===// -// ValueTable Class -//===----------------------------------------------------------------------===// - -/// This class holds the mapping between values and value numbers. It is used -/// as an efficient mechanism to determine the expression-wise equivalence of -/// two values. 
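A miniature value-numbering table in the spirit of the ValueTable described above: an expression key (opcode plus the value numbers of its operands, sorted for commutative operations) maps to a small integer, so syntactically different but equivalent expressions receive the same number. The types and names here are invented for the sketch.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

struct Expr {
  std::string Opcode;
  std::vector<uint32_t> Args;
  bool operator<(const Expr &O) const {
    return std::tie(Opcode, Args) < std::tie(O.Opcode, O.Args);
  }
};

struct MiniValueTable {
  std::map<std::string, uint32_t> ValueNumbering; // leaf values by name
  std::map<Expr, uint32_t> ExprNumbering;
  uint32_t Next = 1;

  uint32_t leaf(const std::string &Name) {
    auto It = ValueNumbering.insert({Name, Next});
    if (It.second) ++Next;
    return It.first->second;
  }
  uint32_t expr(std::string Op, uint32_t A, uint32_t B, bool Commutative) {
    Expr E{std::move(Op), {A, B}};
    if (Commutative) std::sort(E.Args.begin(), E.Args.end());
    auto It = ExprNumbering.insert({E, Next});
    if (It.second) ++Next;
    return It.first->second;
  }
};

int main() {
  MiniValueTable VT;
  uint32_t X = VT.leaf("x"), Y = VT.leaf("y");
  // x + y and y + x get the same value number; x - y and y - x do not.
  assert(VT.expr("add", X, Y, true) == VT.expr("add", Y, X, true));
  assert(VT.expr("sub", X, Y, false) != VT.expr("sub", Y, X, false));
  return 0;
}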
-namespace { - struct Expression { - uint32_t opcode; - Type *type; - SmallVector<uint32_t, 4> varargs; +struct llvm::GVN::Expression { + uint32_t opcode; + Type *type; + SmallVector<uint32_t, 4> varargs; - Expression(uint32_t o = ~2U) : opcode(o) { } + Expression(uint32_t o = ~2U) : opcode(o) {} - bool operator==(const Expression &other) const { - if (opcode != other.opcode) - return false; - if (opcode == ~0U || opcode == ~1U) - return true; - if (type != other.type) - return false; - if (varargs != other.varargs) - return false; + bool operator==(const Expression &other) const { + if (opcode != other.opcode) + return false; + if (opcode == ~0U || opcode == ~1U) return true; - } - - friend hash_code hash_value(const Expression &Value) { - return hash_combine(Value.opcode, Value.type, - hash_combine_range(Value.varargs.begin(), - Value.varargs.end())); - } - }; + if (type != other.type) + return false; + if (varargs != other.varargs) + return false; + return true; + } - class ValueTable { - DenseMap<Value*, uint32_t> valueNumbering; - DenseMap<Expression, uint32_t> expressionNumbering; - AliasAnalysis *AA; - MemoryDependenceAnalysis *MD; - DominatorTree *DT; - - uint32_t nextValueNumber; - - Expression create_expression(Instruction* I); - Expression create_cmp_expression(unsigned Opcode, - CmpInst::Predicate Predicate, - Value *LHS, Value *RHS); - Expression create_extractvalue_expression(ExtractValueInst* EI); - uint32_t lookup_or_add_call(CallInst* C); - public: - ValueTable() : nextValueNumber(1) { } - uint32_t lookup_or_add(Value *V); - uint32_t lookup(Value *V) const; - uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred, - Value *LHS, Value *RHS); - bool exists(Value *V) const; - void add(Value *V, uint32_t num); - void clear(); - void erase(Value *v); - void setAliasAnalysis(AliasAnalysis* A) { AA = A; } - AliasAnalysis *getAliasAnalysis() const { return AA; } - void setMemDep(MemoryDependenceAnalysis* M) { MD = M; } - void setDomTree(DominatorTree* D) { DT = D; } - uint32_t getNextUnusedValueNumber() { return nextValueNumber; } - void verifyRemoved(const Value *) const; - }; -} + friend hash_code hash_value(const Expression &Value) { + return hash_combine( + Value.opcode, Value.type, + hash_combine_range(Value.varargs.begin(), Value.varargs.end())); + } +}; namespace llvm { -template <> struct DenseMapInfo<Expression> { - static inline Expression getEmptyKey() { - return ~0U; - } +template <> struct DenseMapInfo<GVN::Expression> { + static inline GVN::Expression getEmptyKey() { return ~0U; } - static inline Expression getTombstoneKey() { - return ~1U; - } + static inline GVN::Expression getTombstoneKey() { return ~1U; } - static unsigned getHashValue(const Expression e) { + static unsigned getHashValue(const GVN::Expression &e) { using llvm::hash_value; return static_cast<unsigned>(hash_value(e)); } - static bool isEqual(const Expression &LHS, const Expression &RHS) { + static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) { return LHS == RHS; } }; +} // End llvm namespace. + +/// Represents a particular available value that we know how to materialize. +/// Materialization of an AvailableValue never fails. An AvailableValue is +/// implicitly associated with a rematerialization point which is the +/// location of the instruction from which it was formed. +struct llvm::gvn::AvailableValue { + enum ValType { + SimpleVal, // A simple offsetted value that is accessed. + LoadVal, // A value produced by a load. 
+ MemIntrin, // A memory intrinsic which is loaded from. + UndefVal // A UndefValue representing a value from dead block (which + // is not yet physically removed from the CFG). + }; -} + /// V - The value that is live out of the block. + PointerIntPair<Value *, 2, ValType> Val; + + /// Offset - The byte offset in Val that is interesting for the load query. + unsigned Offset; + + static AvailableValue get(Value *V, unsigned Offset = 0) { + AvailableValue Res; + Res.Val.setPointer(V); + Res.Val.setInt(SimpleVal); + Res.Offset = Offset; + return Res; + } + + static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) { + AvailableValue Res; + Res.Val.setPointer(MI); + Res.Val.setInt(MemIntrin); + Res.Offset = Offset; + return Res; + } + + static AvailableValue getLoad(LoadInst *LI, unsigned Offset = 0) { + AvailableValue Res; + Res.Val.setPointer(LI); + Res.Val.setInt(LoadVal); + Res.Offset = Offset; + return Res; + } + + static AvailableValue getUndef() { + AvailableValue Res; + Res.Val.setPointer(nullptr); + Res.Val.setInt(UndefVal); + Res.Offset = 0; + return Res; + } + + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } + bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } + bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + bool isUndefValue() const { return Val.getInt() == UndefVal; } + + Value *getSimpleValue() const { + assert(isSimpleValue() && "Wrong accessor"); + return Val.getPointer(); + } + + LoadInst *getCoercedLoadValue() const { + assert(isCoercedLoadValue() && "Wrong accessor"); + return cast<LoadInst>(Val.getPointer()); + } + + MemIntrinsic *getMemIntrinValue() const { + assert(isMemIntrinValue() && "Wrong accessor"); + return cast<MemIntrinsic>(Val.getPointer()); + } + + /// Emit code at the specified insertion point to adjust the value defined + /// here to the specified type. This handles various coercion cases. + Value *MaterializeAdjustedValue(LoadInst *LI, Instruction *InsertPt, + GVN &gvn) const; +}; + +/// Represents an AvailableValue which can be rematerialized at the end of +/// the associated BasicBlock. +struct llvm::gvn::AvailableValueInBlock { + /// BB - The basic block in question. + BasicBlock *BB; + + /// AV - The actual available value + AvailableValue AV; + + static AvailableValueInBlock get(BasicBlock *BB, AvailableValue &&AV) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.AV = std::move(AV); + return Res; + } + + static AvailableValueInBlock get(BasicBlock *BB, Value *V, + unsigned Offset = 0) { + return get(BB, AvailableValue::get(V, Offset)); + } + static AvailableValueInBlock getUndef(BasicBlock *BB) { + return get(BB, AvailableValue::getUndef()); + } + + /// Emit code at the end of this block to adjust the value defined here to + /// the specified type. This handles various coercion cases. 
+ Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const { + return AV.MaterializeAdjustedValue(LI, BB->getTerminator(), gvn); + } +}; //===----------------------------------------------------------------------===// // ValueTable Internal Functions //===----------------------------------------------------------------------===// -Expression ValueTable::create_expression(Instruction *I) { +GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Expression e; e.type = I->getType(); e.opcode = I->getOpcode(); for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) - e.varargs.push_back(lookup_or_add(*OI)); + e.varargs.push_back(lookupOrAdd(*OI)); if (I->isCommutative()) { // Ensure that commutative instructions that only differ by a permutation // of their operands get the same value number by sorting the operand value @@ -201,15 +262,15 @@ Expression ValueTable::create_expression(Instruction *I) { return e; } -Expression ValueTable::create_cmp_expression(unsigned Opcode, - CmpInst::Predicate Predicate, - Value *LHS, Value *RHS) { +GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS) { assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && "Not a comparison!"); Expression e; e.type = CmpInst::makeCmpResultType(LHS->getType()); - e.varargs.push_back(lookup_or_add(LHS)); - e.varargs.push_back(lookup_or_add(RHS)); + e.varargs.push_back(lookupOrAdd(LHS)); + e.varargs.push_back(lookupOrAdd(RHS)); // Sort the operand value numbers so x<y and y>x get the same value number. if (e.varargs[0] > e.varargs[1]) { @@ -220,7 +281,7 @@ Expression ValueTable::create_cmp_expression(unsigned Opcode, return e; } -Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { +GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { assert(EI && "Not an ExtractValueInst?"); Expression e; e.type = EI->getType(); @@ -252,8 +313,8 @@ Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { // Intrinsic recognized. Grab its args to finish building the expression. 
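The comparison canonicalization used by createCmpExpr above can be sketched on its own: if the operand value numbers are out of order, swap them and swap the predicate, so that "x < y" and "y > x" produce identical expression keys. The enum below is a stand-in for CmpInst predicates.

#include <cassert>
#include <cstdint>
#include <utility>

enum Pred { LT, GT, LE, GE, EQ, NE };

static Pred swapped(Pred P) {
  switch (P) {
  case LT: return GT;
  case GT: return LT;
  case LE: return GE;
  case GE: return LE;
  default: return P; // EQ and NE are symmetric
  }
}

struct CmpKey { Pred P; uint32_t LHS, RHS; };

static CmpKey canonicalize(Pred P, uint32_t LHS, uint32_t RHS) {
  if (LHS > RHS) {
    std::swap(LHS, RHS);
    P = swapped(P);
  }
  return {P, LHS, RHS};
}

int main() {
  CmpKey A = canonicalize(LT, /*x=*/2, /*y=*/1); // "x < y", x numbered after y
  CmpKey B = canonicalize(GT, /*y=*/1, /*x=*/2); // "y > x"
  assert(A.P == B.P && A.LHS == B.LHS && A.RHS == B.RHS);
  return 0;
}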
assert(I->getNumArgOperands() == 2 && "Expect two args for recognised intrinsics."); - e.varargs.push_back(lookup_or_add(I->getArgOperand(0))); - e.varargs.push_back(lookup_or_add(I->getArgOperand(1))); + e.varargs.push_back(lookupOrAdd(I->getArgOperand(0))); + e.varargs.push_back(lookupOrAdd(I->getArgOperand(1))); return e; } } @@ -263,7 +324,7 @@ Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { e.opcode = EI->getOpcode(); for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end(); OI != OE; ++OI) - e.varargs.push_back(lookup_or_add(*OI)); + e.varargs.push_back(lookupOrAdd(*OI)); for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end(); II != IE; ++II) @@ -276,20 +337,32 @@ Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { // ValueTable External Functions //===----------------------------------------------------------------------===// +GVN::ValueTable::ValueTable() : nextValueNumber(1) {} +GVN::ValueTable::ValueTable(const ValueTable &Arg) + : valueNumbering(Arg.valueNumbering), + expressionNumbering(Arg.expressionNumbering), AA(Arg.AA), MD(Arg.MD), + DT(Arg.DT), nextValueNumber(Arg.nextValueNumber) {} +GVN::ValueTable::ValueTable(ValueTable &&Arg) + : valueNumbering(std::move(Arg.valueNumbering)), + expressionNumbering(std::move(Arg.expressionNumbering)), + AA(std::move(Arg.AA)), MD(std::move(Arg.MD)), DT(std::move(Arg.DT)), + nextValueNumber(std::move(Arg.nextValueNumber)) {} +GVN::ValueTable::~ValueTable() {} + /// add - Insert a value into the table with a specified value number. -void ValueTable::add(Value *V, uint32_t num) { +void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); } -uint32_t ValueTable::lookup_or_add_call(CallInst *C) { +uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { - Expression exp = create_expression(C); + Expression exp = createExpr(C); uint32_t &e = expressionNumbering[exp]; if (!e) e = nextValueNumber++; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { - Expression exp = create_expression(C); + Expression exp = createExpr(C); uint32_t &e = expressionNumbering[exp]; if (!e) { e = nextValueNumber++; @@ -318,21 +391,21 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { } for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) { - uint32_t c_vn = lookup_or_add(C->getArgOperand(i)); - uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i)); + uint32_t c_vn = lookupOrAdd(C->getArgOperand(i)); + uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i)); if (c_vn != cd_vn) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; } } - uint32_t v = lookup_or_add(local_cdep); + uint32_t v = lookupOrAdd(local_cdep); valueNumbering[C] = v; return v; } // Non-local case. - const MemoryDependenceAnalysis::NonLocalDepInfo &deps = + const MemoryDependenceResults::NonLocalDepInfo &deps = MD->getNonLocalCallDependency(CallSite(C)); // FIXME: Move the checking logic to MemDep! 
CallInst* cdep = nullptr; @@ -372,15 +445,15 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { return nextValueNumber++; } for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) { - uint32_t c_vn = lookup_or_add(C->getArgOperand(i)); - uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i)); + uint32_t c_vn = lookupOrAdd(C->getArgOperand(i)); + uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i)); if (c_vn != cd_vn) { valueNumbering[C] = nextValueNumber; return nextValueNumber++; } } - uint32_t v = lookup_or_add(cdep); + uint32_t v = lookupOrAdd(cdep); valueNumbering[C] = v; return v; @@ -391,11 +464,11 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { } /// Returns true if a value number exists for the specified value. -bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; } +bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; } /// lookup_or_add - Returns the value number for the specified value, assigning /// it a new number if it did not have one before. -uint32_t ValueTable::lookup_or_add(Value *V) { +uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V); if (VI != valueNumbering.end()) return VI->second; @@ -409,7 +482,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) { Expression exp; switch (I->getOpcode()) { case Instruction::Call: - return lookup_or_add_call(cast<CallInst>(I)); + return lookupOrAddCall(cast<CallInst>(I)); case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -448,10 +521,10 @@ uint32_t ValueTable::lookup_or_add(Value *V) { case Instruction::ShuffleVector: case Instruction::InsertValue: case Instruction::GetElementPtr: - exp = create_expression(I); + exp = createExpr(I); break; case Instruction::ExtractValue: - exp = create_extractvalue_expression(cast<ExtractValueInst>(I)); + exp = createExtractvalueExpr(cast<ExtractValueInst>(I)); break; default: valueNumbering[V] = nextValueNumber; @@ -466,7 +539,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) { /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); assert(VI != valueNumbering.end() && "Value not numbered?"); return VI->second; @@ -476,30 +549,30 @@ uint32_t ValueTable::lookup(Value *V) const { /// assigning it a new number if it did not have one before. Useful when /// we deduced the result of a comparison, but don't immediately have an /// instruction realizing that comparison to hand. -uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode, - CmpInst::Predicate Predicate, - Value *LHS, Value *RHS) { - Expression exp = create_cmp_expression(Opcode, Predicate, LHS, RHS); +uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS) { + Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); uint32_t& e = expressionNumbering[exp]; if (!e) e = nextValueNumber++; return e; } /// Remove all entries from the ValueTable. -void ValueTable::clear() { +void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); nextValueNumber = 1; } /// Remove a value from the value numbering. 
-void ValueTable::erase(Value *V) { +void GVN::ValueTable::erase(Value *V) { valueNumbering.erase(V); } /// verifyRemoved - Verify that the value is removed from all internal data /// structures. -void ValueTable::verifyRemoved(const Value *V) const { +void GVN::ValueTable::verifyRemoved(const Value *V) const { for (DenseMap<Value*, uint32_t>::const_iterator I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) { assert(I->first != V && "Inst still occurs in value numbering map!"); @@ -510,251 +583,26 @@ void ValueTable::verifyRemoved(const Value *V) const { // GVN Pass //===----------------------------------------------------------------------===// -namespace { - class GVN; - struct AvailableValueInBlock { - /// BB - The basic block in question. - BasicBlock *BB; - enum ValType { - SimpleVal, // A simple offsetted value that is accessed. - LoadVal, // A value produced by a load. - MemIntrin, // A memory intrinsic which is loaded from. - UndefVal // A UndefValue representing a value from dead block (which - // is not yet physically removed from the CFG). - }; - - /// V - The value that is live out of the block. - PointerIntPair<Value *, 2, ValType> Val; - - /// Offset - The byte offset in Val that is interesting for the load query. - unsigned Offset; - - static AvailableValueInBlock get(BasicBlock *BB, Value *V, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(V); - Res.Val.setInt(SimpleVal); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(MI); - Res.Val.setInt(MemIntrin); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(LI); - Res.Val.setInt(LoadVal); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getUndef(BasicBlock *BB) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(nullptr); - Res.Val.setInt(UndefVal); - Res.Offset = 0; - return Res; - } - - bool isSimpleValue() const { return Val.getInt() == SimpleVal; } - bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } - bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } - bool isUndefValue() const { return Val.getInt() == UndefVal; } - - Value *getSimpleValue() const { - assert(isSimpleValue() && "Wrong accessor"); - return Val.getPointer(); - } - - LoadInst *getCoercedLoadValue() const { - assert(isCoercedLoadValue() && "Wrong accessor"); - return cast<LoadInst>(Val.getPointer()); - } - - MemIntrinsic *getMemIntrinValue() const { - assert(isMemIntrinValue() && "Wrong accessor"); - return cast<MemIntrinsic>(Val.getPointer()); - } - - /// Emit code into this block to adjust the value defined here to the - /// specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const; - }; - - class GVN : public FunctionPass { - bool NoLoads; - MemoryDependenceAnalysis *MD; - DominatorTree *DT; - const TargetLibraryInfo *TLI; - AssumptionCache *AC; - SetVector<BasicBlock *> DeadBlocks; - - ValueTable VN; - - /// A mapping from value numbers to lists of Value*'s that - /// have that value number. Use findLeader to query it. 
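// Illustrative sketch (not part of this patch) of the leader-table shape
// described above and manipulated by addToLeaderTable/removeFromLeaderTable
// below: each value number maps to a head entry stored by value, with further
// entries chained off it; removing the head copies its successor into the head
// slot. Names are hypothetical, and plain new/delete stands in for the
// BumpPtrAllocator used by the real code.
#include <cstdint>
#include <unordered_map>

namespace leader_sketch {

struct Leader {
  const void *Val = nullptr;   // stands in for Value*
  const void *BB = nullptr;    // stands in for const BasicBlock*
  Leader *Next = nullptr;
};

struct LeaderTable {
  std::unordered_map<uint32_t, Leader> Table;

  void add(uint32_t N, const void *V, const void *BB) {
    Leader &Head = Table[N];
    if (!Head.Val) {             // first entry for this value number
      Head.Val = V;
      Head.BB = BB;
      return;
    }
    Head.Next = new Leader{V, BB, Head.Next};   // chain after the head
  }

  void remove(uint32_t N, const void *V, const void *BB) {
    Leader *Prev = nullptr, *Curr = &Table[N];
    while (Curr && (Curr->Val != V || Curr->BB != BB)) {
      Prev = Curr;
      Curr = Curr->Next;
    }
    if (!Curr)
      return;
    if (Prev) {                  // unlink an interior node
      Prev->Next = Curr->Next;
      delete Curr;
    } else if (Leader *Succ = Curr->Next) {
      *Curr = *Succ;             // overwrite the head with its successor
      delete Succ;
    } else {                     // list is now empty
      Curr->Val = nullptr;
      Curr->BB = nullptr;
    }
  }
};

} // namespace leader_sketch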
- struct LeaderTableEntry { - Value *Val; - const BasicBlock *BB; - LeaderTableEntry *Next; - }; - DenseMap<uint32_t, LeaderTableEntry> LeaderTable; - BumpPtrAllocator TableAllocator; - - // Block-local map of equivalent values to their leader, does not - // propagate to any successors. Entries added mid-block are applied - // to the remaining instructions in the block. - SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap; - SmallVector<Instruction*, 8> InstrsToErase; - - typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; - typedef SmallVector<AvailableValueInBlock, 64> AvailValInBlkVect; - typedef SmallVector<BasicBlock*, 64> UnavailBlkVect; - - public: - static char ID; // Pass identification, replacement for typeid - explicit GVN(bool noloads = false) - : FunctionPass(ID), NoLoads(noloads), MD(nullptr) { - initializeGVNPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - /// This removes the specified instruction from - /// our various maps and marks it for deletion. - void markInstructionForDeletion(Instruction *I) { - VN.erase(I); - InstrsToErase.push_back(I); - } - - DominatorTree &getDominatorTree() const { return *DT; } - AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } - MemoryDependenceAnalysis &getMemDep() const { return *MD; } - private: - /// Push a new Value to the LeaderTable onto the list for its value number. - void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) { - LeaderTableEntry &Curr = LeaderTable[N]; - if (!Curr.Val) { - Curr.Val = V; - Curr.BB = BB; - return; - } - - LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>(); - Node->Val = V; - Node->BB = BB; - Node->Next = Curr.Next; - Curr.Next = Node; - } - - /// Scan the list of values corresponding to a given - /// value number, and remove the given instruction if encountered. - void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { - LeaderTableEntry* Prev = nullptr; - LeaderTableEntry* Curr = &LeaderTable[N]; - - while (Curr && (Curr->Val != I || Curr->BB != BB)) { - Prev = Curr; - Curr = Curr->Next; - } - - if (!Curr) - return; - - if (Prev) { - Prev->Next = Curr->Next; - } else { - if (!Curr->Next) { - Curr->Val = nullptr; - Curr->BB = nullptr; - } else { - LeaderTableEntry* Next = Curr->Next; - Curr->Val = Next->Val; - Curr->BB = Next->BB; - Curr->Next = Next->Next; - } - } - } - - // List of critical edges to be split between iterations. 
- SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit; - - // This transformation requires dominator postdominator info - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - if (!NoLoads) - AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AAResultsWrapperPass>(); - - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - - - // Helper functions of redundant load elimination - bool processLoad(LoadInst *L); - bool processNonLocalLoad(LoadInst *L); - bool processAssumeIntrinsic(IntrinsicInst *II); - void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, - AvailValInBlkVect &ValuesPerBlock, - UnavailBlkVect &UnavailableBlocks); - bool PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, - UnavailBlkVect &UnavailableBlocks); - - // Other helper routines - bool processInstruction(Instruction *I); - bool processBlock(BasicBlock *BB); - void dump(DenseMap<uint32_t, Value*> &d); - bool iterateOnFunction(Function &F); - bool performPRE(Function &F); - bool performScalarPRE(Instruction *I); - bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, - unsigned int ValNo); - Value *findLeader(const BasicBlock *BB, uint32_t num); - void cleanupGlobalSets(); - void verifyRemoved(const Instruction *I) const; - bool splitCriticalEdges(); - BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); - bool replaceOperandsWithConsts(Instruction *I) const; - bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, - bool DominatesByEdge); - bool processFoldableCondBr(BranchInst *BI); - void addDeadBlock(BasicBlock *BB); - void assignValNumForDeadCode(); - }; - - char GVN::ID = 0; -} - -// The public interface to this file... -FunctionPass *llvm::createGVNPass(bool NoLoads) { - return new GVN(NoLoads); +PreservedAnalyses GVN::run(Function &F, AnalysisManager<Function> &AM) { + // FIXME: The order of evaluation of these 'getResult' calls is very + // significant! Re-ordering these variables will cause GVN when run alone to + // be less effective! We should fix memdep and basic-aa to not exhibit this + // behavior, but until then don't change the order here. 
+ auto &AC = AM.getResult<AssumptionAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &AA = AM.getResult<AAManager>(F); + auto &MemDep = AM.getResult<MemoryDependenceAnalysis>(F); + bool Changed = runImpl(F, AC, DT, TLI, AA, &MemDep); + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<GlobalsAA>(); + return PA; } -INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) { errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), @@ -764,7 +612,6 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { } errs() << "}\n"; } -#endif /// Return true if we can prove that the value /// we're analyzing is fully available in the specified block. As we go, keep @@ -875,38 +722,45 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, IRBuilder<> &IRB, const DataLayout &DL) { - if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL)) - return nullptr; + assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) && + "precondition violation - materialization can't fail"); + + if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal)) + StoredVal = ConstantFoldConstantExpression(CExpr, DL); // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - uint64_t StoreSize = DL.getTypeSizeInBits(StoredValTy); - uint64_t LoadSize = DL.getTypeSizeInBits(LoadedTy); + uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy); + uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. - if (StoreSize == LoadSize) { + if (StoredValSize == LoadedValSize) { // Pointer to Pointer -> use bitcast. if (StoredValTy->getScalarType()->isPointerTy() && - LoadedTy->getScalarType()->isPointerTy()) - return IRB.CreateBitCast(StoredVal, LoadedTy); + LoadedTy->getScalarType()->isPointerTy()) { + StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy); + } else { + // Convert source pointers to integers, which can be bitcast. + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = DL.getIntPtrType(StoredValTy); + StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy); + } - // Convert source pointers to integers, which can be bitcast. 
- if (StoredValTy->getScalarType()->isPointerTy()) { - StoredValTy = DL.getIntPtrType(StoredValTy); - StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy); - } + Type *TypeToCastTo = LoadedTy; + if (TypeToCastTo->getScalarType()->isPointerTy()) + TypeToCastTo = DL.getIntPtrType(TypeToCastTo); - Type *TypeToCastTo = LoadedTy; - if (TypeToCastTo->getScalarType()->isPointerTy()) - TypeToCastTo = DL.getIntPtrType(TypeToCastTo); + if (StoredValTy != TypeToCastTo) + StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo); - if (StoredValTy != TypeToCastTo) - StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo); + // Cast to pointer if the load needs a pointer type. + if (LoadedTy->getScalarType()->isPointerTy()) + StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy); + } - // Cast to pointer if the load needs a pointer type. - if (LoadedTy->getScalarType()->isPointerTy()) - StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy); + if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal)) + StoredVal = ConstantFoldConstantExpression(CExpr, DL); return StoredVal; } @@ -914,7 +768,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, // If the loaded value is smaller than the available value, then we can // extract out a piece from it. If the available value is too small, then we // can't do anything. - assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); + assert(StoredValSize >= LoadedValSize && + "CanCoerceMustAliasedValueToLoad fail"); // Convert source pointers to integers, which can be manipulated. if (StoredValTy->getScalarType()->isPointerTy()) { @@ -924,29 +779,35 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, // Convert vectors and fp to integer, which can be manipulated. if (!StoredValTy->isIntegerTy()) { - StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize); + StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize); StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy); } // If this is a big-endian system, we need to shift the value down to the low // bits so that a truncate will work. if (DL.isBigEndian()) { - StoredVal = IRB.CreateLShr(StoredVal, StoreSize - LoadSize, "tmp"); + uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) - + DL.getTypeStoreSizeInBits(LoadedTy); + StoredVal = IRB.CreateLShr(StoredVal, ShiftAmt, "tmp"); } // Truncate the integer to the right size now. - Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize); + Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize); StoredVal = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc"); - if (LoadedTy == NewIntTy) - return StoredVal; + if (LoadedTy != NewIntTy) { + // If the result is a pointer, inttoptr. + if (LoadedTy->getScalarType()->isPointerTy()) + StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr"); + else + // Otherwise, bitcast. + StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast"); + } - // If the result is a pointer, inttoptr. - if (LoadedTy->getScalarType()->isPointerTy()) - return IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr"); + if (auto *CExpr = dyn_cast<ConstantExpr>(StoredVal)) + StoredVal = ConstantFoldConstantExpression(CExpr, DL); - // Otherwise, bitcast. 
- return IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast"); + return StoredVal; } /// This function is called when we have a @@ -1067,10 +928,15 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL); unsigned LoadSize = DL.getTypeStoreSize(LoadTy); - unsigned Size = MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize( + unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize( LoadBase, LoadOffs, LoadSize, DepLI); if (Size == 0) return -1; + // Check non-obvious conditions enforced by MDA which we rely on for being + // able to materialize this potentially available value + assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!"); + assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load"); + return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL); } @@ -1117,7 +983,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - if (ConstantFoldLoadFromConstPtr(Src, DL)) + if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL)) return Offset; return -1; } @@ -1173,9 +1039,9 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, const DataLayout &DL = SrcVal->getModule()->getDataLayout(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to // widen SrcVal out to a larger load. - unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType()); + unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType()); unsigned LoadSize = DL.getTypeStoreSize(LoadTy); - if (Offset+LoadSize > SrcValSize) { + if (Offset+LoadSize > SrcValStoreSize) { assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); // If we have a load/load clobber an DepLI can be widened to cover this @@ -1207,8 +1073,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, // system, we need to shift down to get the relevant bits. 
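// Standalone sketch (not part of this patch) of the shift-and-truncate
// arithmetic used by the coercion helpers above when a narrow load is
// satisfied from a wider stored or loaded value: on a little-endian target the
// interesting bytes start Offset bytes from the low end, on a big-endian
// target they start from the high end. The helper below is hypothetical.
#include <cassert>
#include <cstdint>

// Extract LoadBytes bytes at byte offset Offset out of an 8-byte value.
static uint64_t extractBytes(uint64_t Stored, unsigned Offset,
                             unsigned LoadBytes, bool BigEndian) {
  const unsigned StoreBytes = 8;
  assert(Offset + LoadBytes <= StoreBytes && "load must be covered by store");
  unsigned ShiftBytes = BigEndian ? StoreBytes - LoadBytes - Offset
                                  : Offset;
  uint64_t Val = Stored >> (8 * ShiftBytes);
  if (LoadBytes < StoreBytes)
    Val &= (UINT64_C(1) << (8 * LoadBytes)) - 1;   // "truncate" to LoadBytes
  return Val;
}

// Storing 0x0807060504030201 and then loading 2 bytes at offset 2 yields
// 0x0403 on a little-endian target and 0x0605 on a big-endian one.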
Value *RV = NewLoad; if (DL.isBigEndian()) - RV = Builder.CreateLShr(RV, - NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits()); + RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8); RV = Builder.CreateTrunc(RV, SrcVal->getType()); SrcVal->replaceAllUsesWith(RV); @@ -1279,7 +1144,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - return ConstantFoldLoadFromConstPtr(Src, DL); + return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL); } @@ -1294,7 +1159,8 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, if (ValuesPerBlock.size() == 1 && gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) { - assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block"); + assert(!ValuesPerBlock[0].AV.isUndefValue() && + "Dead BB dominate this block"); return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn); } @@ -1316,15 +1182,16 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); } -Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, - GVN &gvn) const { +Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI, + Instruction *InsertPt, + GVN &gvn) const { Value *Res; Type *LoadTy = LI->getType(); const DataLayout &DL = LI->getModule()->getDataLayout(); if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), DL); + Res = GetStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' @@ -1335,16 +1202,15 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, if (Load->getType() == LoadTy && Offset == 0) { Res = Load; } else { - Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), - gvn); - + Res = GetLoadValueForLoad(Load, Offset, LoadTy, InsertPt, gvn); + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " << *getCoercedLoadValue() << '\n' << *Res << '\n' << "\n\n\n"); } } else if (isMemIntrinValue()) { Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, - BB->getTerminator(), DL); + InsertPt, DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1353,6 +1219,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";); return UndefValue::get(LoadTy); } + assert(Res && "failed to materialize?"); return Res; } @@ -1362,7 +1229,134 @@ static bool isLifetimeStart(const Instruction *Inst) { return false; } -void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, +bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, + Value *Address, AvailableValue &Res) { + + assert((DepInfo.isDef() || DepInfo.isClobber()) && + "expected a local dependence"); + assert(LI->isUnordered() && "rules below are incorrect for ordered access"); + + const DataLayout &DL = LI->getModule()->getDataLayout(); + + if (DepInfo.isClobber()) { + // If the dependence is to a store that writes to a superset of the bits + // read by the load, we can extract the bits we need for the load from the + // stored value. 
+ if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { + // Can't forward from non-atomic to atomic without violating memory model. + if (Address && LI->isAtomic() <= DepSI->isAtomic()) { + int Offset = + AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI); + if (Offset != -1) { + Res = AvailableValue::get(DepSI->getValueOperand(), Offset); + return true; + } + } + } + + // Check to see if we have something like this: + // load i32* P + // load i8* (P+1) + // if we have this, replace the later with an extraction from the former. + if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) { + // If this is a clobber and L is the first instruction in its block, then + // we have the first instruction in the entry block. + // Can't forward from non-atomic to atomic without violating memory model. + if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) { + int Offset = + AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL); + + if (Offset != -1) { + Res = AvailableValue::getLoad(DepLI, Offset); + return true; + } + } + } + + // If the clobbering value is a memset/memcpy/memmove, see if we can + // forward a value on from it. + if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { + if (Address && !LI->isAtomic()) { + int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, + DepMI, DL); + if (Offset != -1) { + Res = AvailableValue::getMI(DepMI, Offset); + return true; + } + } + } + // Nothing known about this clobber, have to be conservative + DEBUG( + // fast print dep, using operator<< on instruction is too slow. + dbgs() << "GVN: load "; + LI->printAsOperand(dbgs()); + Instruction *I = DepInfo.getInst(); + dbgs() << " is clobbered by " << *I << '\n'; + ); + return false; + } + assert(DepInfo.isDef() && "follows from above"); + + Instruction *DepInst = DepInfo.getInst(); + + // Loading the allocation -> undef. + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) || + // Loading immediately after lifetime begin -> undef. + isLifetimeStart(DepInst)) { + Res = AvailableValue::get(UndefValue::get(LI->getType())); + return true; + } + + // Loading from calloc (which zero initializes memory) -> zero + if (isCallocLikeFn(DepInst, TLI)) { + Res = AvailableValue::get(Constant::getNullValue(LI->getType())); + return true; + } + + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { + // Reject loads and stores that are to the same address but are of + // different types if we have to. If the stored value is larger or equal to + // the loaded value, we can reuse it. + if (S->getValueOperand()->getType() != LI->getType() && + !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + LI->getType(), DL)) + return false; + + // Can't forward from non-atomic to atomic without violating memory model. + if (S->isAtomic() < LI->isAtomic()) + return false; + + Res = AvailableValue::get(S->getValueOperand()); + return true; + } + + if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) { + // If the types mismatch and we can't handle it, reject reuse of the load. + // If the stored value is larger or equal to the loaded value, we can reuse + // it. + if (LD->getType() != LI->getType() && + !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) + return false; + + // Can't forward from non-atomic to atomic without violating memory model. 
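// Tiny sketch (not part of this patch) of the rule behind the isAtomic()
// comparisons above and below: a value may only be forwarded to a load that is
// no "more atomic" than its definition, so forwarding from a non-atomic def to
// an (unordered) atomic load is rejected. Hypothetical helper:
static bool canForwardAtomicity(bool DefIsAtomic, bool UseIsAtomic) {
  return !(UseIsAtomic && !DefIsAtomic);   // same as UseIsAtomic <= DefIsAtomic
}
// canForwardAtomicity(/*DefIsAtomic=*/false, /*UseIsAtomic=*/true) is false,
// matching the "LI->isAtomic() <= DepSI->isAtomic()" checks in this function.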
+ if (LD->isAtomic() < LI->isAtomic()) + return false; + + Res = AvailableValue::getLoad(LD); + return true; + } + + // Unknown def - must be conservative + DEBUG( + // fast print dep, using operator<< on instruction is too slow. + dbgs() << "GVN: load "; + LI->printAsOperand(dbgs()); + dbgs() << " has unknown def " << *DepInst << '\n'; + ); + return false; +} + +void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks) { @@ -1371,7 +1365,6 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // dependencies that produce an unknown value for the load (such as a call // that could potentially clobber the load). unsigned NumDeps = Deps.size(); - const DataLayout &DL = LI->getModule()->getDataLayout(); for (unsigned i = 0, e = NumDeps; i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1388,122 +1381,28 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, continue; } - if (DepInfo.isClobber()) { - // The address being loaded in this non-local block may not be the same as - // the pointer operand of the load if PHI translation occurs. Make sure - // to consider the right address. - Value *Address = Deps[i].getAddress(); - - // If the dependence is to a store that writes to a superset of the bits - // read by the load, we can extract the bits we need for the load from the - // stored value. - if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { - if (Address) { - int Offset = - AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI); - if (Offset != -1) { - ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, - DepSI->getValueOperand(), - Offset)); - continue; - } - } - } - - // Check to see if we have something like this: - // load i32* P - // load i8* (P+1) - // if we have this, replace the later with an extraction from the former. - if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) { - // If this is a clobber and L is the first instruction in its block, then - // we have the first instruction in the entry block. - if (DepLI != LI && Address) { - int Offset = - AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL); - - if (Offset != -1) { - ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, - Offset)); - continue; - } - } - } - - // If the clobbering value is a memset/memcpy/memmove, see if we can - // forward a value on from it. - if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { - if (Address) { - int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, - DepMI, DL); - if (Offset != -1) { - ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, - Offset)); - continue; - } - } - } - - UnavailableBlocks.push_back(DepBB); - continue; - } - - // DepInfo.isDef() here - - Instruction *DepInst = DepInfo.getInst(); - - // Loading the allocation -> undef. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) || - // Loading immediately after lifetime begin -> undef. 
- isLifetimeStart(DepInst)) { - ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, - UndefValue::get(LI->getType()))); - continue; - } - - // Loading from calloc (which zero initializes memory) -> zero - if (isCallocLikeFn(DepInst, TLI)) { - ValuesPerBlock.push_back(AvailableValueInBlock::get( - DepBB, Constant::getNullValue(LI->getType()))); - continue; - } - - if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { - // Reject loads and stores that are to the same address but are of - // different types if we have to. - if (S->getValueOperand()->getType() != LI->getType()) { - // If the stored value is larger or equal to the loaded value, we can - // reuse it. - if (!CanCoerceMustAliasedValueToLoad(S->getValueOperand(), - LI->getType(), DL)) { - UnavailableBlocks.push_back(DepBB); - continue; - } - } + // The address being loaded in this non-local block may not be the same as + // the pointer operand of the load if PHI translation occurs. Make sure + // to consider the right address. + Value *Address = Deps[i].getAddress(); + AvailableValue AV; + if (AnalyzeLoadAvailability(LI, DepInfo, Address, AV)) { + // subtlety: because we know this was a non-local dependency, we know + // it's safe to materialize anywhere between the instruction within + // DepInfo and the end of it's block. ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, - S->getValueOperand())); - continue; - } - - if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) { - // If the types mismatch and we can't handle it, reject reuse of the load. - if (LD->getType() != LI->getType()) { - // If the stored value is larger or equal to the loaded value, we can - // reuse it. - if (!CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) { - UnavailableBlocks.push_back(DepBB); - continue; - } - } - ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD)); - continue; + std::move(AV))); + } else { + UnavailableBlocks.push_back(DepBB); } - - UnavailableBlocks.push_back(DepBB); } + + assert(NumDeps == ValuesPerBlock.size() + UnavailableBlocks.size() && + "post condition violation"); } -bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, +bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks) { // Okay, we have *some* definitions of the value. This means that the value // is available in some of our (transitive) predecessors. Lets think about @@ -1661,16 +1560,17 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be // marking a value as AVAIL-IN, which isn't what we intend. - VN.lookup_or_add(I); + VN.lookupOrAdd(I); } for (const auto &PredLoad : PredLoads) { BasicBlock *UnavailablePred = PredLoad.first; Value *LoadPtr = PredLoad.second; - Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false, - LI->getAlignment(), - UnavailablePred->getTerminator()); + auto *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", + LI->isVolatile(), LI->getAlignment(), + LI->getOrdering(), LI->getSynchScope(), + UnavailablePred->getTerminator()); // Transfer the old load's AA tags to the new load. 
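// Source-level picture (not part of this patch, variable names hypothetical)
// of the transformation PerformLoadPRE is building here: when *p is available
// on one predecessor but not on the other, a copy of the load is inserted into
// the unavailable predecessor and the original load becomes a merge.
int before_pre(int *p, bool c) {
  int r = 0;
  if (c)
    r = *p;          // *p is available only on this path
  return r + *p;     // partially redundant load
}

int after_load_pre(int *p, bool c) {
  int v, r = 0;
  if (c) {
    v = *p;
    r = v;
  } else {
    v = *p;          // load inserted into the formerly unavailable predecessor
  }
  return r + v;      // the original load is replaced by the merged value
}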
AAMDNodes Tags; @@ -1682,6 +1582,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD); if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group)) NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD); + if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) + NewLoad->setMetadata(LLVMContext::MD_range, RangeMD); // Transfer DebugLoc. NewLoad->setDebugLoc(LI->getDebugLoc()); @@ -1846,30 +1748,29 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { } static void patchReplacementInstruction(Instruction *I, Value *Repl) { + auto *ReplInst = dyn_cast<Instruction>(Repl); + if (!ReplInst) + return; + // Patch the replacement so that it is not more restrictive than the value // being replaced. - BinaryOperator *Op = dyn_cast<BinaryOperator>(I); - BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl); - if (Op && ReplOp) - ReplOp->andIRFlags(Op); - - if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { - // FIXME: If both the original and replacement value are part of the - // same control-flow region (meaning that the execution of one - // guarantees the execution of the other), then we can combine the - // noalias scopes here and do better than the general conservative - // answer used in combineMetadata(). - - // In general, GVN unifies expressions over different control-flow - // regions, and so we need a conservative combination of the noalias - // scopes. - static const unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, LLVMContext::MD_range, - LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, - LLVMContext::MD_invariant_group}; - combineMetadata(ReplInst, I, KnownIDs); - } + ReplInst->andIRFlags(I); + + // FIXME: If both the original and replacement value are part of the + // same control-flow region (meaning that the execution of one + // guarantees the execution of the other), then we can combine the + // noalias scopes here and do better than the general conservative + // answer used in combineMetadata(). + + // In general, GVN unifies expressions over different control-flow + // regions, and so we need a conservative combination of the noalias + // scopes. + static const unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_invariant_group}; + combineMetadata(ReplInst, I, KnownIDs); } static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { @@ -1883,7 +1784,8 @@ bool GVN::processLoad(LoadInst *L) { if (!MD) return false; - if (!L->isSimple()) + // This code hasn't been audited for ordered or volatile memory access + if (!L->isUnordered()) return false; if (L->use_empty()) { @@ -1893,84 +1795,14 @@ bool GVN::processLoad(LoadInst *L) { // ... to a pointer that has been loaded from before... MemDepResult Dep = MD->getDependency(L); - const DataLayout &DL = L->getModule()->getDataLayout(); - - // If we have a clobber and target data is around, see if this is a clobber - // that we can fix up through code synthesis. 
- if (Dep.isClobber()) { - // Check to see if we have something like this: - // store i32 123, i32* %P - // %A = bitcast i32* %P to i8* - // %B = gep i8* %A, i32 1 - // %C = load i8* %B - // - // We could do that by recognizing if the clobber instructions are obviously - // a common base + constant offset, and if the previous store (or memset) - // completely covers this load. This sort of thing can happen in bitfield - // access code. - Value *AvailVal = nullptr; - if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) { - int Offset = AnalyzeLoadFromClobberingStore( - L->getType(), L->getPointerOperand(), DepSI); - if (Offset != -1) - AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, - L->getType(), L, DL); - } - - // Check to see if we have something like this: - // load i32* P - // load i8* (P+1) - // if we have this, replace the later with an extraction from the former. - if (LoadInst *DepLI = dyn_cast<LoadInst>(Dep.getInst())) { - // If this is a clobber and L is the first instruction in its block, then - // we have the first instruction in the entry block. - if (DepLI == L) - return false; - - int Offset = AnalyzeLoadFromClobberingLoad( - L->getType(), L->getPointerOperand(), DepLI, DL); - if (Offset != -1) - AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); - } - - // If the clobbering value is a memset/memcpy/memmove, see if we can forward - // a value on from it. - if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { - int Offset = AnalyzeLoadFromClobberingMemInst( - L->getType(), L->getPointerOperand(), DepMI, DL); - if (Offset != -1) - AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, DL); - } - - if (AvailVal) { - DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n' - << *AvailVal << '\n' << *L << "\n\n\n"); - - // Replace the load! - L->replaceAllUsesWith(AvailVal); - if (AvailVal->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(AvailVal); - markInstructionForDeletion(L); - ++NumGVNLoad; - return true; - } - - // If the value isn't available, don't do anything! - DEBUG( - // fast print dep, using operator<< on instruction is too slow. - dbgs() << "GVN: load "; - L->printAsOperand(dbgs()); - Instruction *I = Dep.getInst(); - dbgs() << " is clobbered by " << *I << '\n'; - ); - return false; - } // If it is defined in another block, try harder. if (Dep.isNonLocal()) return processNonLocalLoad(L); - if (!Dep.isDef()) { + // Only handle the local case below + if (!Dep.isDef() && !Dep.isClobber()) { + // This might be a NonFuncLocal or an Unknown DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -1980,86 +1812,18 @@ bool GVN::processLoad(LoadInst *L) { return false; } - Instruction *DepInst = Dep.getInst(); - if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { - Value *StoredVal = DepSI->getValueOperand(); - - // The store and load are to a must-aliased pointer, but they may not - // actually have the same type. See if we know how to reuse the stored - // value (depending on its type). - if (StoredVal->getType() != L->getType()) { - IRBuilder<> Builder(L); - StoredVal = - CoerceAvailableValueToLoadType(StoredVal, L->getType(), Builder, DL); - if (!StoredVal) - return false; - - DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal - << '\n' << *L << "\n\n\n"); - } - - // Remove it! 
- L->replaceAllUsesWith(StoredVal); - if (StoredVal->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(StoredVal); - markInstructionForDeletion(L); - ++NumGVNLoad; - return true; - } - - if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) { - Value *AvailableVal = DepLI; - - // The loads are of a must-aliased pointer, but they may not actually have - // the same type. See if we know how to reuse the previously loaded value - // (depending on its type). - if (DepLI->getType() != L->getType()) { - IRBuilder<> Builder(L); - AvailableVal = - CoerceAvailableValueToLoadType(DepLI, L->getType(), Builder, DL); - if (!AvailableVal) - return false; - - DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal - << "\n" << *L << "\n\n\n"); - } - - // Remove it! - patchAndReplaceAllUsesWith(L, AvailableVal); - if (DepLI->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(DepLI); - markInstructionForDeletion(L); - ++NumGVNLoad; - return true; - } - - // If this load really doesn't depend on anything, then we must be loading an - // undef value. This can happen when loading for a fresh allocation with no - // intervening stores, for example. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) { - L->replaceAllUsesWith(UndefValue::get(L->getType())); - markInstructionForDeletion(L); - ++NumGVNLoad; - return true; - } + AvailableValue AV; + if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) { + Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this); - // If this load occurs either right after a lifetime begin, - // then the loaded value is undefined. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start) { - L->replaceAllUsesWith(UndefValue::get(L->getType())); - markInstructionForDeletion(L); - ++NumGVNLoad; - return true; - } - } - - // If this load follows a calloc (which zero initializes memory), - // then the loaded value is zero - if (isCallocLikeFn(DepInst, TLI)) { - L->replaceAllUsesWith(Constant::getNullValue(L->getType())); + // Replace the load! + patchAndReplaceAllUsesWith(L, AvailableValue); markInstructionForDeletion(L); ++NumGVNLoad; + // Tell MDA to rexamine the reused pointer since we might have more + // information after forwarding it. + if (MD && AvailableValue->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(AvailableValue); return true; } @@ -2105,9 +1869,8 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, // GVN runs all such loops have preheaders, which means that Dst will have // been changed to have only one predecessor, namely Src. const BasicBlock *Pred = E.getEnd()->getSinglePredecessor(); - const BasicBlock *Src = E.getStart(); - assert((!Pred || Pred == Src) && "No edge between these basic blocks!"); - (void)Src; + assert((!Pred || Pred == E.getStart()) && + "No edge between these basic blocks!"); return Pred != nullptr; } @@ -2133,7 +1896,8 @@ bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { /// The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. -/// If DominatesByEdge is false, then it means that it is dominated by Root.End. +/// If DominatesByEdge is false, then it means that we will propagate the RHS +/// value starting from the end of Root.Start. 
bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, bool DominatesByEdge) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; @@ -2141,7 +1905,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, bool Changed = false; // For speed, compute a conservative fast approximation to // DT->dominates(Root, Root.getEnd()); - bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT); + const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT); while (!Worklist.empty()) { std::pair<Value*, Value*> Item = Worklist.pop_back_val(); @@ -2164,12 +1928,12 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, // right-hand side, ensure the longest lived term is on the right-hand side, // so the shortest lived term will be replaced by the longest lived. // This tends to expose more simplifications. - uint32_t LVN = VN.lookup_or_add(LHS); + uint32_t LVN = VN.lookupOrAdd(LHS); if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { // Move the 'oldest' value to the right-hand side, using the value number // as a proxy for age. - uint32_t RVN = VN.lookup_or_add(RHS); + uint32_t RVN = VN.lookupOrAdd(RHS); if (LVN < RVN) { std::swap(LHS, RHS); LVN = RVN; @@ -2195,7 +1959,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, unsigned NumReplacements = DominatesByEdge ? replaceDominatedUsesWith(LHS, RHS, *DT, Root) - : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd()); + : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart()); Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; @@ -2245,7 +2009,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, // Floating point -0.0 and 0.0 compare equal, so we can only // propagate values if we know that we have a constant and that // its value is non-zero. - + // FIXME: We should do this optimization if 'no signed zeros' is // applicable via an instruction-level fast-math-flag or some other // indicator that relaxed FP semantics are being used. @@ -2253,7 +2017,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero()) Worklist.push_back(std::make_pair(Op0, Op1)); } - + // If "A >= B" is known true, replace "A < B" with false everywhere. CmpInst::Predicate NotPred = Cmp->getInversePredicate(); Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); @@ -2261,7 +2025,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, // out the value number that it would have and use that to find an // appropriate instruction (if any). uint32_t NextNum = VN.getNextUnusedValueNumber(); - uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); + uint32_t Num = VN.lookupOrAddCmp(Cmp->getOpcode(), NotPred, Op0, Op1); // If the number we were assigned was brand new then there is no point in // looking for an instruction realizing it: there cannot be one! if (Num < NextNum) { @@ -2271,7 +2035,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, DominatesByEdge ? 
replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root) : replaceDominatedUsesWith(NotCmp, NotVal, *DT, - Root.getEnd()); + Root.getStart()); Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2303,12 +2067,21 @@ bool GVN::processInstruction(Instruction *I) { // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. const DataLayout &DL = I->getModule()->getDataLayout(); if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { - I->replaceAllUsesWith(V); - if (MD && V->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(V); - markInstructionForDeletion(I); - ++NumGVNSimpl; - return true; + bool Changed = false; + if (!I->use_empty()) { + I->replaceAllUsesWith(V); + Changed = true; + } + if (isInstructionTriviallyDead(I, TLI)) { + markInstructionForDeletion(I); + Changed = true; + } + if (Changed) { + if (MD && V->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + ++NumGVNSimpl; + return true; + } } if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I)) @@ -2319,7 +2092,7 @@ bool GVN::processInstruction(Instruction *I) { if (processLoad(LI)) return true; - unsigned Num = VN.lookup_or_add(LI); + unsigned Num = VN.lookupOrAdd(LI); addToLeaderTable(Num, LI, LI->getParent()); return false; } @@ -2383,7 +2156,7 @@ bool GVN::processInstruction(Instruction *I) { return false; uint32_t NextNum = VN.getNextUnusedValueNumber(); - unsigned Num = VN.lookup_or_add(I); + unsigned Num = VN.lookupOrAdd(I); // Allocations are always uniquely numbered, so we can save time and memory // by fast failing them. @@ -2422,18 +2195,16 @@ bool GVN::processInstruction(Instruction *I) { } /// runOnFunction - This is the main transformation entry point for a function. -bool GVN::runOnFunction(Function& F) { - if (skipOptnoneFunction(F)) - return false; - - if (!NoLoads) - MD = &getAnalysis<MemoryDependenceAnalysis>(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults()); - VN.setMemDep(MD); +bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, + const TargetLibraryInfo &RunTLI, AAResults &RunAA, + MemoryDependenceResults *RunMD) { + AC = &RunAC; + DT = &RunDT; VN.setDomTree(DT); + TLI = &RunTLI; + VN.setAliasAnalysis(&RunAA); + MD = RunMD; + VN.setMemDep(MD); bool Changed = false; bool ShouldContinue = true; @@ -2476,7 +2247,7 @@ bool GVN::runOnFunction(Function& F) { cleanupGlobalSets(); // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each - // iteration. + // iteration. 
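// Source-level illustration (not part of this patch, names hypothetical) of
// what propagateEquality above achieves: once "a == 7" is known to hold along
// an edge, uses of 'a' dominated by that edge are rewritten to the constant,
// and the inverse predicate ("a != 7") is known to be false there as well.
int before_prop(int a) {
  if (a == 7)
    return a * 2;    // 'a' is known to be 7 here
  return a;
}

int after_prop(int a) {
  if (a == 7)
    return 7 * 2;    // the dominated use of 'a' was replaced by 7
  return a;
}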
DeadBlocks.clear(); return Changed; @@ -2576,8 +2347,6 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, } bool GVN::performScalarPRE(Instruction *CurInst) { - SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; - if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() || CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || @@ -2608,8 +2377,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { unsigned NumWithout = 0; BasicBlock *PREPred = nullptr; BasicBlock *CurrentBlock = CurInst->getParent(); - predMap.clear(); + SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap; for (BasicBlock *P : predecessors(CurrentBlock)) { // We're not interested in PRE where the block is its // own predecessor, or in blocks with predecessors @@ -2702,7 +2471,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { DEBUG(verifyRemoved(CurInst)); CurInst->eraseFromParent(); ++NumGVNInstr; - + return true; } @@ -2825,7 +2594,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { SmallVector<BasicBlock *, 8> Dom; DT->getDescendants(D, Dom); DeadBlocks.insert(Dom.begin(), Dom.end()); - + // Figure out the dominance-frontier(D). for (BasicBlock *B : Dom) { for (BasicBlock *S : successors(B)) { @@ -2883,13 +2652,13 @@ void GVN::addDeadBlock(BasicBlock *BB) { // If the given branch is recognized as a foldable branch (i.e. conditional // branch with constant condition), it will perform following analyses and // transformation. -// 1) If the dead out-coming edge is a critical-edge, split it. Let +// 1) If the dead out-coming edge is a critical-edge, split it. Let // R be the target of the dead out-coming edge. // 1) Identify the set of dead blocks implied by the branch's dead outcoming // edge. The result of this step will be {X| X is dominated by R} // 2) Identify those blocks which haves at least one dead predecessor. The // result of this step will be dominance-frontier(R). -// 3) Update the PHIs in DF(R) by replacing the operands corresponding to +// 3) Update the PHIs in DF(R) by replacing the operands corresponding to // dead blocks with "UndefVal" in an hope these PHIs will optimized away. // // Return true iff *NEW* dead code are found. @@ -2905,8 +2674,8 @@ bool GVN::processFoldableCondBr(BranchInst *BI) { if (!Cond) return false; - BasicBlock *DeadRoot = Cond->getZExtValue() ? - BI->getSuccessor(1) : BI->getSuccessor(0); + BasicBlock *DeadRoot = + Cond->getZExtValue() ? BI->getSuccessor(1) : BI->getSuccessor(0); if (DeadBlocks.count(DeadRoot)) return false; @@ -2924,8 +2693,62 @@ bool GVN::processFoldableCondBr(BranchInst *BI) { void GVN::assignValNumForDeadCode() { for (BasicBlock *BB : DeadBlocks) { for (Instruction &Inst : *BB) { - unsigned ValNum = VN.lookup_or_add(&Inst); + unsigned ValNum = VN.lookupOrAdd(&Inst); addToLeaderTable(ValNum, &Inst, BB); } } } + +class llvm::gvn::GVNLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + explicit GVNLegacyPass(bool NoLoads = false) + : FunctionPass(ID), NoLoads(NoLoads) { + initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + return Impl.runImpl( + F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), + getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<AAResultsWrapperPass>().getAAResults(), + NoLoads ? 
nullptr + : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + if (!NoLoads) + AU.addRequired<MemoryDependenceWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + +private: + bool NoLoads; + GVN Impl; +}; + +char GVNLegacyPass::ID = 0; + +// The public interface to this file... +FunctionPass *llvm::createGVNPass(bool NoLoads) { + return new GVNLegacyPass(NoLoads); +} + +INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp new file mode 100644 index 0000000000000..cce1db3874b78 --- /dev/null +++ b/lib/Transforms/Scalar/GVNHoist.cpp @@ -0,0 +1,825 @@ +//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass hoists expressions from branches to a common dominator. It uses +// GVN (global value numbering) to discover expressions computing the same +// values. The primary goal is to reduce the code size, and in some +// cases reduce critical path (by exposing more ILP). +// Hoisting may affect the performance in some cases. To mitigate that, hoisting +// is disabled in the following cases. +// 1. Scalars across calls. +// 2. geps when corresponding load/store cannot be hoisted. 
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Utils/MemorySSA.h" + +using namespace llvm; + +#define DEBUG_TYPE "gvn-hoist" + +STATISTIC(NumHoisted, "Number of instructions hoisted"); +STATISTIC(NumRemoved, "Number of instructions removed"); +STATISTIC(NumLoadsHoisted, "Number of loads hoisted"); +STATISTIC(NumLoadsRemoved, "Number of loads removed"); +STATISTIC(NumStoresHoisted, "Number of stores hoisted"); +STATISTIC(NumStoresRemoved, "Number of stores removed"); +STATISTIC(NumCallsHoisted, "Number of calls hoisted"); +STATISTIC(NumCallsRemoved, "Number of calls removed"); + +static cl::opt<int> + MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1), + cl::desc("Max number of instructions to hoist " + "(default unlimited = -1)")); +static cl::opt<int> MaxNumberOfBBSInPath( + "gvn-hoist-max-bbs", cl::Hidden, cl::init(4), + cl::desc("Max number of basic blocks on the path between " + "hoisting locations (default = 4, unlimited = -1)")); + +namespace { + +// Provides a sorting function based on the execution order of two instructions. +struct SortByDFSIn { +private: + DenseMap<const BasicBlock *, unsigned> &DFSNumber; + +public: + SortByDFSIn(DenseMap<const BasicBlock *, unsigned> &D) : DFSNumber(D) {} + + // Returns true when A executes before B. + bool operator()(const Instruction *A, const Instruction *B) const { + // FIXME: libc++ has a std::sort() algorithm that will call the compare + // function on the same element. Once PR20837 is fixed and some more years + // pass by and all the buildbots have moved to a corrected std::sort(), + // enable the following assert: + // + // assert(A != B); + + const BasicBlock *BA = A->getParent(); + const BasicBlock *BB = B->getParent(); + unsigned NA = DFSNumber[BA]; + unsigned NB = DFSNumber[BB]; + if (NA < NB) + return true; + if (NA == NB) { + // Sort them in the order they occur in the same basic block. + BasicBlock::const_iterator AI(A), BI(B); + return std::distance(AI, BI) < 0; + } + return false; + } +}; + +// A map from a pair of VNs to all the instructions with those VNs. +typedef DenseMap<std::pair<unsigned, unsigned>, SmallVector<Instruction *, 4>> + VNtoInsns; +// An invalid value number Used when inserting a single value number into +// VNtoInsns. +enum : unsigned { InvalidVN = ~2U }; + +// Records all scalar instructions candidate for code hoisting. +class InsnInfo { + VNtoInsns VNtoScalars; + +public: + // Inserts I and its value number in VNtoScalars. + void insert(Instruction *I, GVN::ValueTable &VN) { + // Scalar instruction. + unsigned V = VN.lookupOrAdd(I); + VNtoScalars[{V, InvalidVN}].push_back(I); + } + + const VNtoInsns &getVNTable() const { return VNtoScalars; } +}; + +// Records all load instructions candidate for code hoisting. +class LoadInfo { + VNtoInsns VNtoLoads; + +public: + // Insert Load and the value number of its memory address in VNtoLoads. + void insert(LoadInst *Load, GVN::ValueTable &VN) { + if (Load->isSimple()) { + unsigned V = VN.lookupOrAdd(Load->getPointerOperand()); + VNtoLoads[{V, InvalidVN}].push_back(Load); + } + } + + const VNtoInsns &getVNTable() const { return VNtoLoads; } +}; + +// Records all store instructions candidate for code hoisting. 
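The StoreInfo class just below keys store candidates by the pair (value number of the address, value number of the stored value), so two stores land in the same bucket exactly when GVN considers both their address and their data equivalent. A minimal standalone sketch of that keyed table (editorial; std::map and the names VNKey, CandidateId and recordStore are stand-ins for the patch's DenseMap-based VNtoInsns):

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

using ValueNum = unsigned;
using VNKey = std::pair<ValueNum, ValueNum>;
using CandidateId = std::uint64_t;         // stand-in for Instruction *

// Scalars and loads put a sentinel in the second slot (InvalidVN in the
// patch); stores use the value number of the stored value.
std::map<VNKey, std::vector<CandidateId>> VNtoStores;

void recordStore(ValueNum AddrVN, ValueNum ValVN, CandidateId Store) {
  VNtoStores[{AddrVN, ValVN}].push_back(Store); // same key => same addr+data
}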
+class StoreInfo { + VNtoInsns VNtoStores; + +public: + // Insert the Store and a hash number of the store address and the stored + // value in VNtoStores. + void insert(StoreInst *Store, GVN::ValueTable &VN) { + if (!Store->isSimple()) + return; + // Hash the store address and the stored value. + Value *Ptr = Store->getPointerOperand(); + Value *Val = Store->getValueOperand(); + VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store); + } + + const VNtoInsns &getVNTable() const { return VNtoStores; } +}; + +// Records all call instructions candidate for code hoisting. +class CallInfo { + VNtoInsns VNtoCallsScalars; + VNtoInsns VNtoCallsLoads; + VNtoInsns VNtoCallsStores; + +public: + // Insert Call and its value numbering in one of the VNtoCalls* containers. + void insert(CallInst *Call, GVN::ValueTable &VN) { + // A call that doesNotAccessMemory is handled as a Scalar, + // onlyReadsMemory will be handled as a Load instruction, + // all other calls will be handled as stores. + unsigned V = VN.lookupOrAdd(Call); + auto Entry = std::make_pair(V, InvalidVN); + + if (Call->doesNotAccessMemory()) + VNtoCallsScalars[Entry].push_back(Call); + else if (Call->onlyReadsMemory()) + VNtoCallsLoads[Entry].push_back(Call); + else + VNtoCallsStores[Entry].push_back(Call); + } + + const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; } + + const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; } + + const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; } +}; + +typedef DenseMap<const BasicBlock *, bool> BBSideEffectsSet; +typedef SmallVector<Instruction *, 4> SmallVecInsn; +typedef SmallVectorImpl<Instruction *> SmallVecImplInsn; + +// This pass hoists common computations across branches sharing common +// dominator. The primary goal is to reduce the code size, and in some +// cases reduce critical path (by exposing more ILP). +class GVNHoist { +public: + GVN::ValueTable VN; + DominatorTree *DT; + AliasAnalysis *AA; + MemoryDependenceResults *MD; + const bool OptForMinSize; + DenseMap<const BasicBlock *, unsigned> DFSNumber; + BBSideEffectsSet BBSideEffects; + MemorySSA *MSSA; + int HoistedCtr; + + enum InsKind { Unknown, Scalar, Load, Store }; + + GVNHoist(DominatorTree *Dt, AliasAnalysis *Aa, MemoryDependenceResults *Md, + bool OptForMinSize) + : DT(Dt), AA(Aa), MD(Md), OptForMinSize(OptForMinSize), HoistedCtr(0) {} + + // Return true when there are exception handling in BB. + bool hasEH(const BasicBlock *BB) { + auto It = BBSideEffects.find(BB); + if (It != BBSideEffects.end()) + return It->second; + + if (BB->isEHPad() || BB->hasAddressTaken()) { + BBSideEffects[BB] = true; + return true; + } + + if (BB->getTerminator()->mayThrow()) { + BBSideEffects[BB] = true; + return true; + } + + BBSideEffects[BB] = false; + return false; + } + + // Return true when all paths from A to the end of the function pass through + // either B or C. + bool hoistingFromAllPaths(const BasicBlock *A, const BasicBlock *B, + const BasicBlock *C) { + // We fully copy the WL in order to be able to remove items from it. + SmallPtrSet<const BasicBlock *, 2> WL; + WL.insert(B); + WL.insert(C); + + for (auto It = df_begin(A), E = df_end(A); It != E;) { + // There exists a path from A to the exit of the function if we are still + // iterating in DF traversal and we removed all instructions from the work + // list. + if (WL.empty()) + return false; + + const BasicBlock *BB = *It; + if (WL.erase(BB)) { + // Stop DFS traversal when BB is in the work list. 
+ It.skipChildren(); + continue; + } + + // Check for end of function, calls that do not return, etc. + if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator())) + return false; + + // Increment DFS traversal when not skipping children. + ++It; + } + + return true; + } + + /* Return true when I1 appears before I2 in the instructions of BB. */ + bool firstInBB(BasicBlock *BB, const Instruction *I1, const Instruction *I2) { + for (Instruction &I : *BB) { + if (&I == I1) + return true; + if (&I == I2) + return false; + } + + llvm_unreachable("I1 and I2 not found in BB"); + } + // Return true when there are users of Def in BB. + bool hasMemoryUseOnPath(MemoryAccess *Def, const BasicBlock *BB, + const Instruction *OldPt) { + const BasicBlock *DefBB = Def->getBlock(); + const BasicBlock *OldBB = OldPt->getParent(); + + for (User *U : Def->users()) + if (auto *MU = dyn_cast<MemoryUse>(U)) { + BasicBlock *UBB = MU->getBlock(); + // Only analyze uses in BB. + if (BB != UBB) + continue; + + // A use in the same block as the Def is on the path. + if (UBB == DefBB) { + assert(MSSA->locallyDominates(Def, MU) && "def not dominating use"); + return true; + } + + if (UBB != OldBB) + return true; + + // It is only harmful to hoist when the use is before OldPt. + if (firstInBB(UBB, MU->getMemoryInst(), OldPt)) + return true; + } + + return false; + } + + // Return true when there are exception handling or loads of memory Def + // between OldPt and NewPt. + + // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and + // return true when the counter NBBsOnAllPaths reaces 0, except when it is + // initialized to -1 which is unlimited. + bool hasEHOrLoadsOnPath(const Instruction *NewPt, const Instruction *OldPt, + MemoryAccess *Def, int &NBBsOnAllPaths) { + const BasicBlock *NewBB = NewPt->getParent(); + const BasicBlock *OldBB = OldPt->getParent(); + assert(DT->dominates(NewBB, OldBB) && "invalid path"); + assert(DT->dominates(Def->getBlock(), NewBB) && + "def does not dominate new hoisting point"); + + // Walk all basic blocks reachable in depth-first iteration on the inverse + // CFG from OldBB to NewBB. These blocks are all the blocks that may be + // executed between the execution of NewBB and OldBB. Hoisting an expression + // from OldBB into NewBB has to be safe on all execution paths. + for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) { + if (*I == NewBB) { + // Stop traversal when reaching HoistPt. + I.skipChildren(); + continue; + } + + // Impossible to hoist with exceptions on the path. + if (hasEH(*I)) + return true; + + // Check that we do not move a store past loads. + if (hasMemoryUseOnPath(Def, *I, OldPt)) + return true; + + // Stop walk once the limit is reached. + if (NBBsOnAllPaths == 0) + return true; + + // -1 is unlimited number of blocks on all paths. + if (NBBsOnAllPaths != -1) + --NBBsOnAllPaths; + + ++I; + } + + return false; + } + + // Return true when there are exception handling between HoistPt and BB. + // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and + // return true when the counter NBBsOnAllPaths reaches 0, except when it is + // initialized to -1 which is unlimited. + bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *BB, + int &NBBsOnAllPaths) { + assert(DT->dominates(HoistPt, BB) && "Invalid path"); + + // Walk all basic blocks reachable in depth-first iteration on + // the inverse CFG from BBInsn to NewHoistPt. 
These blocks are all the + // blocks that may be executed between the execution of NewHoistPt and + // BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe + // on all execution paths. + for (auto I = idf_begin(BB), E = idf_end(BB); I != E;) { + if (*I == HoistPt) { + // Stop traversal when reaching NewHoistPt. + I.skipChildren(); + continue; + } + + // Impossible to hoist with exceptions on the path. + if (hasEH(*I)) + return true; + + // Stop walk once the limit is reached. + if (NBBsOnAllPaths == 0) + return true; + + // -1 is unlimited number of blocks on all paths. + if (NBBsOnAllPaths != -1) + --NBBsOnAllPaths; + + ++I; + } + + return false; + } + + // Return true when it is safe to hoist a memory load or store U from OldPt + // to NewPt. + bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt, + MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) { + + // In place hoisting is safe. + if (NewPt == OldPt) + return true; + + const BasicBlock *NewBB = NewPt->getParent(); + const BasicBlock *OldBB = OldPt->getParent(); + const BasicBlock *UBB = U->getBlock(); + + // Check for dependences on the Memory SSA. + MemoryAccess *D = U->getDefiningAccess(); + BasicBlock *DBB = D->getBlock(); + if (DT->properlyDominates(NewBB, DBB)) + // Cannot move the load or store to NewBB above its definition in DBB. + return false; + + if (NewBB == DBB && !MSSA->isLiveOnEntryDef(D)) + if (auto *UD = dyn_cast<MemoryUseOrDef>(D)) + if (firstInBB(DBB, NewPt, UD->getMemoryInst())) + // Cannot move the load or store to NewPt above its definition in D. + return false; + + // Check for unsafe hoistings due to side effects. + if (K == InsKind::Store) { + if (hasEHOrLoadsOnPath(NewPt, OldPt, D, NBBsOnAllPaths)) + return false; + } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths)) + return false; + + if (UBB == NewBB) { + if (DT->properlyDominates(DBB, NewBB)) + return true; + assert(UBB == DBB); + assert(MSSA->locallyDominates(D, U)); + } + + // No side effects: it is safe to hoist. + return true; + } + + // Return true when it is safe to hoist scalar instructions from BB1 and BB2 + // to HoistBB. + bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB1, + const BasicBlock *BB2, int &NBBsOnAllPaths) { + // Check that the hoisted expression is needed on all paths. When HoistBB + // already contains an instruction to be hoisted, the expression is needed + // on all paths. Enable scalar hoisting at -Oz as it is safe to hoist + // scalars to a place where they are partially needed. + if (!OptForMinSize && BB1 != HoistBB && + !hoistingFromAllPaths(HoistBB, BB1, BB2)) + return false; + + if (hasEHOnPath(HoistBB, BB1, NBBsOnAllPaths) || + hasEHOnPath(HoistBB, BB2, NBBsOnAllPaths)) + return false; + + // Safe to hoist scalars from BB1 and BB2 to HoistBB. + return true; + } + + // Each element of a hoisting list contains the basic block where to hoist and + // a list of instructions to be hoisted. + typedef std::pair<BasicBlock *, SmallVecInsn> HoistingPointInfo; + typedef SmallVector<HoistingPointInfo, 4> HoistingPointList; + + // Partition InstructionsToHoist into a set of candidates which can share a + // common hoisting point. The partitions are collected in HPL. IsScalar is + // true when the instructions in InstructionsToHoist are scalars. IsLoad is + // true when the InstructionsToHoist are loads, false when they are stores. 
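Before the definition that follows, here is a minimal standalone sketch of the shape of that partitioning (editorial; partitionRuns, Anchor and the CanExtend callback are invented stand-ins for the hoisting point and the safeToHoistScalar/safeToHoistLdSt checks, and the real code recomputes the nearest common dominator at every step rather than tracking the last accepted candidate):

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Walk an execution-ordered candidate list, extend the current run while the
// safety callback allows it, and emit a partition whenever a run of at least
// two candidates has to be cut.
template <typename T>
std::vector<std::pair<T, std::vector<T>>>
partitionRuns(const std::vector<T> &Sorted,
              const std::function<bool(const T &, const T &)> &CanExtend) {
  std::vector<std::pair<T, std::vector<T>>> Runs;
  if (Sorted.empty())
    return Runs;

  std::size_t Start = 0;
  T Anchor = Sorted[0];
  for (std::size_t I = 1; I < Sorted.size(); ++I) {
    if (CanExtend(Anchor, Sorted[I])) {
      Anchor = Sorted[I];              // keep growing the current run
      continue;
    }
    if (I - Start > 1)                 // a partition needs at least two members
      Runs.emplace_back(Anchor, std::vector<T>(Sorted.begin() + Start,
                                               Sorted.begin() + I));
    Start = I;                         // restart from the failing candidate
    Anchor = Sorted[I];
  }
  if (Sorted.size() - Start > 1)
    Runs.emplace_back(Anchor,
                      std::vector<T>(Sorted.begin() + Start, Sorted.end()));
  return Runs;
}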
+ void partitionCandidates(SmallVecImplInsn &InstructionsToHoist, + HoistingPointList &HPL, InsKind K) { + // No need to sort for two instructions. + if (InstructionsToHoist.size() > 2) { + SortByDFSIn Pred(DFSNumber); + std::sort(InstructionsToHoist.begin(), InstructionsToHoist.end(), Pred); + } + + int NBBsOnAllPaths = MaxNumberOfBBSInPath; + + SmallVecImplInsn::iterator II = InstructionsToHoist.begin(); + SmallVecImplInsn::iterator Start = II; + Instruction *HoistPt = *II; + BasicBlock *HoistBB = HoistPt->getParent(); + MemoryUseOrDef *UD; + if (K != InsKind::Scalar) + UD = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(HoistPt)); + + for (++II; II != InstructionsToHoist.end(); ++II) { + Instruction *Insn = *II; + BasicBlock *BB = Insn->getParent(); + BasicBlock *NewHoistBB; + Instruction *NewHoistPt; + + if (BB == HoistBB) { + NewHoistBB = HoistBB; + NewHoistPt = firstInBB(BB, Insn, HoistPt) ? Insn : HoistPt; + } else { + NewHoistBB = DT->findNearestCommonDominator(HoistBB, BB); + if (NewHoistBB == BB) + NewHoistPt = Insn; + else if (NewHoistBB == HoistBB) + NewHoistPt = HoistPt; + else + NewHoistPt = NewHoistBB->getTerminator(); + } + + if (K == InsKind::Scalar) { + if (safeToHoistScalar(NewHoistBB, HoistBB, BB, NBBsOnAllPaths)) { + // Extend HoistPt to NewHoistPt. + HoistPt = NewHoistPt; + HoistBB = NewHoistBB; + continue; + } + } else { + // When NewBB already contains an instruction to be hoisted, the + // expression is needed on all paths. + // Check that the hoisted expression is needed on all paths: it is + // unsafe to hoist loads to a place where there may be a path not + // loading from the same address: for instance there may be a branch on + // which the address of the load may not be initialized. + if ((HoistBB == NewHoistBB || BB == NewHoistBB || + hoistingFromAllPaths(NewHoistBB, HoistBB, BB)) && + // Also check that it is safe to move the load or store from HoistPt + // to NewHoistPt, and from Insn to NewHoistPt. + safeToHoistLdSt(NewHoistPt, HoistPt, UD, K, NBBsOnAllPaths) && + safeToHoistLdSt(NewHoistPt, Insn, + cast<MemoryUseOrDef>(MSSA->getMemoryAccess(Insn)), + K, NBBsOnAllPaths)) { + // Extend HoistPt to NewHoistPt. + HoistPt = NewHoistPt; + HoistBB = NewHoistBB; + continue; + } + } + + // At this point it is not safe to extend the current hoisting to + // NewHoistPt: save the hoisting list so far. + if (std::distance(Start, II) > 1) + HPL.push_back({HoistBB, SmallVecInsn(Start, II)}); + + // Start over from BB. + Start = II; + if (K != InsKind::Scalar) + UD = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(*Start)); + HoistPt = Insn; + HoistBB = BB; + NBBsOnAllPaths = MaxNumberOfBBSInPath; + } + + // Save the last partition. + if (std::distance(Start, II) > 1) + HPL.push_back({HoistBB, SmallVecInsn(Start, II)}); + } + + // Initialize HPL from Map. + void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL, + InsKind K) { + for (const auto &Entry : Map) { + if (MaxHoistedThreshold != -1 && ++HoistedCtr > MaxHoistedThreshold) + return; + + const SmallVecInsn &V = Entry.second; + if (V.size() < 2) + continue; + + // Compute the insertion point and the list of expressions to be hoisted. + SmallVecInsn InstructionsToHoist; + for (auto I : V) + if (!hasEH(I->getParent())) + InstructionsToHoist.push_back(I); + + if (!InstructionsToHoist.empty()) + partitionCandidates(InstructionsToHoist, HPL, K); + } + } + + // Return true when all operands of Instr are available at insertion point + // HoistPt. 
When limiting the number of hoisted expressions, one could hoist + // a load without hoisting its access function. So before hoisting any + // expression, make sure that all its operands are available at insert point. + bool allOperandsAvailable(const Instruction *I, + const BasicBlock *HoistPt) const { + for (const Use &Op : I->operands()) + if (const auto *Inst = dyn_cast<Instruction>(&Op)) + if (!DT->dominates(Inst->getParent(), HoistPt)) + return false; + + return true; + } + + Instruction *firstOfTwo(Instruction *I, Instruction *J) const { + for (Instruction &I1 : *I->getParent()) + if (&I1 == I || &I1 == J) + return &I1; + llvm_unreachable("Both I and J must be from same BB"); + } + + // Replace the use of From with To in Insn. + void replaceUseWith(Instruction *Insn, Value *From, Value *To) const { + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE;) { + Use &U = *UI++; + if (U.getUser() == Insn) { + U.set(To); + return; + } + } + llvm_unreachable("should replace exactly once"); + } + + bool makeOperandsAvailable(Instruction *Repl, BasicBlock *HoistPt) const { + // Check whether the GEP of a ld/st can be synthesized at HoistPt. + GetElementPtrInst *Gep = nullptr; + Instruction *Val = nullptr; + if (auto *Ld = dyn_cast<LoadInst>(Repl)) + Gep = dyn_cast<GetElementPtrInst>(Ld->getPointerOperand()); + if (auto *St = dyn_cast<StoreInst>(Repl)) { + Gep = dyn_cast<GetElementPtrInst>(St->getPointerOperand()); + Val = dyn_cast<Instruction>(St->getValueOperand()); + // Check that the stored value is available. + if (Val) { + if (isa<GetElementPtrInst>(Val)) { + // Check whether we can compute the GEP at HoistPt. + if (!allOperandsAvailable(Val, HoistPt)) + return false; + } else if (!DT->dominates(Val->getParent(), HoistPt)) + return false; + } + } + + // Check whether we can compute the Gep at HoistPt. + if (!Gep || !allOperandsAvailable(Gep, HoistPt)) + return false; + + // Copy the gep before moving the ld/st. + Instruction *ClonedGep = Gep->clone(); + ClonedGep->insertBefore(HoistPt->getTerminator()); + replaceUseWith(Repl, Gep, ClonedGep); + + // Also copy Val when it is a GEP. + if (Val && isa<GetElementPtrInst>(Val)) { + Instruction *ClonedVal = Val->clone(); + ClonedVal->insertBefore(HoistPt->getTerminator()); + replaceUseWith(Repl, Val, ClonedVal); + } + + return true; + } + + std::pair<unsigned, unsigned> hoist(HoistingPointList &HPL) { + unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0; + for (const HoistingPointInfo &HP : HPL) { + // Find out whether we already have one of the instructions in HoistPt, + // in which case we do not have to move it. + BasicBlock *HoistPt = HP.first; + const SmallVecInsn &InstructionsToHoist = HP.second; + Instruction *Repl = nullptr; + for (Instruction *I : InstructionsToHoist) + if (I->getParent() == HoistPt) { + // If there are two instructions in HoistPt to be hoisted in place: + // update Repl to be the first one, such that we can rename the uses + // of the second based on the first. + Repl = !Repl ? I : firstOfTwo(Repl, I); + } + + if (Repl) { + // Repl is already in HoistPt: it remains in place. + assert(allOperandsAvailable(Repl, HoistPt) && + "instruction depends on operands that are not available"); + } else { + // When we do not find Repl in HoistPt, select the first in the list + // and move it to HoistPt. + Repl = InstructionsToHoist.front(); + + // We can move Repl in HoistPt only when all operands are available. + // The order in which hoistings are done may influence the availability + // of operands. 
+ if (!allOperandsAvailable(Repl, HoistPt) && + !makeOperandsAvailable(Repl, HoistPt)) + continue; + Repl->moveBefore(HoistPt->getTerminator()); + } + + if (isa<LoadInst>(Repl)) + ++NL; + else if (isa<StoreInst>(Repl)) + ++NS; + else if (isa<CallInst>(Repl)) + ++NC; + else // Scalar + ++NI; + + // Remove and rename all other instructions. + for (Instruction *I : InstructionsToHoist) + if (I != Repl) { + ++NR; + if (isa<LoadInst>(Repl)) + ++NumLoadsRemoved; + else if (isa<StoreInst>(Repl)) + ++NumStoresRemoved; + else if (isa<CallInst>(Repl)) + ++NumCallsRemoved; + I->replaceAllUsesWith(Repl); + I->eraseFromParent(); + } + } + + NumHoisted += NL + NS + NC + NI; + NumRemoved += NR; + NumLoadsHoisted += NL; + NumStoresHoisted += NS; + NumCallsHoisted += NC; + return {NI, NL + NC + NS}; + } + + // Hoist all expressions. Returns Number of scalars hoisted + // and number of non-scalars hoisted. + std::pair<unsigned, unsigned> hoistExpressions(Function &F) { + InsnInfo II; + LoadInfo LI; + StoreInfo SI; + CallInfo CI; + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { + for (Instruction &I1 : *BB) { + if (auto *Load = dyn_cast<LoadInst>(&I1)) + LI.insert(Load, VN); + else if (auto *Store = dyn_cast<StoreInst>(&I1)) + SI.insert(Store, VN); + else if (auto *Call = dyn_cast<CallInst>(&I1)) { + if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) { + if (isa<DbgInfoIntrinsic>(Intr) || + Intr->getIntrinsicID() == Intrinsic::assume) + continue; + } + if (Call->mayHaveSideEffects()) { + if (!OptForMinSize) + break; + // We may continue hoisting across calls which write to memory. + if (Call->mayThrow()) + break; + } + CI.insert(Call, VN); + } else if (OptForMinSize || !isa<GetElementPtrInst>(&I1)) + // Do not hoist scalars past calls that may write to memory because + // that could result in spills later. geps are handled separately. + // TODO: We can relax this for targets like AArch64 as they have more + // registers than X86. + II.insert(&I1, VN); + } + } + + HoistingPointList HPL; + computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar); + computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load); + computeInsertionPoints(SI.getVNTable(), HPL, InsKind::Store); + computeInsertionPoints(CI.getScalarVNTable(), HPL, InsKind::Scalar); + computeInsertionPoints(CI.getLoadVNTable(), HPL, InsKind::Load); + computeInsertionPoints(CI.getStoreVNTable(), HPL, InsKind::Store); + return hoist(HPL); + } + + bool run(Function &F) { + VN.setDomTree(DT); + VN.setAliasAnalysis(AA); + VN.setMemDep(MD); + bool Res = false; + + unsigned I = 0; + for (const BasicBlock *BB : depth_first(&F.getEntryBlock())) + DFSNumber.insert({BB, ++I}); + + // FIXME: use lazy evaluation of VN to avoid the fix-point computation. + while (1) { + // FIXME: only compute MemorySSA once. We need to update the analysis in + // the same time as transforming the code. + MemorySSA M(F, AA, DT); + MSSA = &M; + + auto HoistStat = hoistExpressions(F); + if (HoistStat.first + HoistStat.second == 0) { + return Res; + } + if (HoistStat.second > 0) { + // To address a limitation of the current GVN, we need to rerun the + // hoisting after we hoisted loads in order to be able to hoist all + // scalars dependent on the hoisted loads. Same for stores. 
+ VN.clear(); + } + Res = true; + } + + return Res; + } +}; + +class GVNHoistLegacyPass : public FunctionPass { +public: + static char ID; + + GVNHoistLegacyPass() : FunctionPass(ID) { + initializeGVNHoistLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); + + GVNHoist G(&DT, &AA, &MD, F.optForMinSize()); + return G.run(F); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<MemoryDependenceWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } +}; +} // namespace + +PreservedAnalyses GVNHoistPass::run(Function &F, + AnalysisManager<Function> &AM) { + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + AliasAnalysis &AA = AM.getResult<AAManager>(F); + MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F); + + GVNHoist G(&DT, &AA, &MD, F.optForMinSize()); + if (!G.run(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} + +char GVNHoistLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist", + "Early GVN Hoisting of Expressions", false, false) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist", + "Early GVN Hoisting of Expressions", false, false) + +FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); } diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp new file mode 100644 index 0000000000000..7686e65efed92 --- /dev/null +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -0,0 +1,691 @@ +//===- GuardWidening.cpp - ---- Guard widening ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the guard widening pass. The semantics of the +// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails +// more often that it did before the transform. This optimization is called +// "widening" and can be used hoist and common runtime checks in situations like +// these: +// +// %cmp0 = 7 u< Length +// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ] +// call @unknown_side_effects() +// %cmp1 = 9 u< Length +// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ] +// ... +// +// => +// +// %cmp0 = 9 u< Length +// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ] +// call @unknown_side_effects() +// ... +// +// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a +// generic implementation of the same function, which will have the correct +// semantics from that point onward. It is always _legal_ to deoptimize (so +// replacing %cmp0 with false is "correct"), though it may not always be +// profitable to do so. +// +// NB! This pass is a work in progress. It hasn't been tuned to be "production +// ready" yet. 
It is known to have quadriatic running time and will not scale +// to large numbers of guards +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/GuardWidening.h" +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +#define DEBUG_TYPE "guard-widening" + +namespace { + +class GuardWideningImpl { + DominatorTree &DT; + PostDominatorTree &PDT; + LoopInfo &LI; + + /// The set of guards whose conditions have been widened into dominating + /// guards. + SmallVector<IntrinsicInst *, 16> EliminatedGuards; + + /// The set of guards which have been widened to include conditions to other + /// guards. + DenseSet<IntrinsicInst *> WidenedGuards; + + /// Try to eliminate guard \p Guard by widening it into an earlier dominating + /// guard. \p DFSI is the DFS iterator on the dominator tree that is + /// currently visiting the block containing \p Guard, and \p GuardsPerBlock + /// maps BasicBlocks to the set of guards seen in that block. + bool eliminateGuardViaWidening( + IntrinsicInst *Guard, const df_iterator<DomTreeNode *> &DFSI, + const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> & + GuardsPerBlock); + + /// Used to keep track of which widening potential is more effective. + enum WideningScore { + /// Don't widen. + WS_IllegalOrNegative, + + /// Widening is performance neutral as far as the cycles spent in check + /// conditions goes (but can still help, e.g., code layout, having less + /// deopt state). + WS_Neutral, + + /// Widening is profitable. + WS_Positive, + + /// Widening is very profitable. Not significantly different from \c + /// WS_Positive, except by the order. + WS_VeryPositive + }; + + static StringRef scoreTypeToString(WideningScore WS); + + /// Compute the score for widening the condition in \p DominatedGuard + /// (contained in \p DominatedGuardLoop) into \p DominatingGuard (contained in + /// \p DominatingGuardLoop). + WideningScore computeWideningScore(IntrinsicInst *DominatedGuard, + Loop *DominatedGuardLoop, + IntrinsicInst *DominatingGuard, + Loop *DominatingGuardLoop); + + /// Helper to check if \p V can be hoisted to \p InsertPos. + bool isAvailableAt(Value *V, Instruction *InsertPos) { + SmallPtrSet<Instruction *, 8> Visited; + return isAvailableAt(V, InsertPos, Visited); + } + + bool isAvailableAt(Value *V, Instruction *InsertPos, + SmallPtrSetImpl<Instruction *> &Visited); + + /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c + /// isAvailableAt returned true. + void makeAvailableAt(Value *V, Instruction *InsertPos); + + /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try + /// to generate an expression computing the logical AND of \p Cond0 and \p + /// Cond1. Return true if the expression computing the AND is only as + /// expensive as computing one of the two. If \p InsertPt is true then + /// actually generate the resulting expression, make it available at \p + /// InsertPt and return it in \p Result (else no change to the IR is made). 
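A standalone numeric sketch of the cheapest case handled by widenCondCommon, declared just below (editorial; mergeUgtBounds is an invented helper, and the real code uses ConstantRange so that any pair of icmps against constants on the same value can be merged, falling back to an 'and' of the two conditions otherwise):

#include <algorithm>
#include <cassert>
#include <cstdint>

// L >u C0 && L >u C1 collapses to L >u max(C0, C1), so the widened guard
// costs exactly one compare.
uint64_t mergeUgtBounds(uint64_t C0, uint64_t C1) { return std::max(C0, C1); }

int main() {
  // The example from the file header: "7 u< Length" and "9 u< Length"
  // merge to "9 u< Length".
  assert(mergeUgtBounds(7, 9) == 9);
  return 0;
}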
+ bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt, + Value *&Result); + + /// Represents a range check of the form \c Base + \c Offset u< \c Length, + /// with the constraint that \c Length is not negative. \c CheckInst is the + /// pre-existing instruction in the IR that computes the result of this range + /// check. + class RangeCheck { + Value *Base; + ConstantInt *Offset; + Value *Length; + ICmpInst *CheckInst; + + public: + explicit RangeCheck(Value *Base, ConstantInt *Offset, Value *Length, + ICmpInst *CheckInst) + : Base(Base), Offset(Offset), Length(Length), CheckInst(CheckInst) {} + + void setBase(Value *NewBase) { Base = NewBase; } + void setOffset(ConstantInt *NewOffset) { Offset = NewOffset; } + + Value *getBase() const { return Base; } + ConstantInt *getOffset() const { return Offset; } + const APInt &getOffsetValue() const { return getOffset()->getValue(); } + Value *getLength() const { return Length; }; + ICmpInst *getCheckInst() const { return CheckInst; } + + void print(raw_ostream &OS, bool PrintTypes = false) { + OS << "Base: "; + Base->printAsOperand(OS, PrintTypes); + OS << " Offset: "; + Offset->printAsOperand(OS, PrintTypes); + OS << " Length: "; + Length->printAsOperand(OS, PrintTypes); + } + + LLVM_DUMP_METHOD void dump() { + print(dbgs()); + dbgs() << "\n"; + } + }; + + /// Parse \p CheckCond into a conjunction (logical-and) of range checks; and + /// append them to \p Checks. Returns true on success, may clobber \c Checks + /// on failure. + bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks) { + SmallPtrSet<Value *, 8> Visited; + return parseRangeChecks(CheckCond, Checks, Visited); + } + + bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks, + SmallPtrSetImpl<Value *> &Visited); + + /// Combine the checks in \p Checks into a smaller set of checks and append + /// them into \p CombinedChecks. Return true on success (i.e. all of checks + /// in \p Checks were combined into \p CombinedChecks). Clobbers \p Checks + /// and \p CombinedChecks on success and on failure. + bool combineRangeChecks(SmallVectorImpl<RangeCheck> &Checks, + SmallVectorImpl<RangeCheck> &CombinedChecks); + + /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of + /// computing only one of the two expressions? + bool isWideningCondProfitable(Value *Cond0, Value *Cond1) { + Value *ResultUnused; + return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused); + } + + /// Widen \p ToWiden to fail if \p NewCondition is false (in addition to + /// whatever it is already checking). + void widenGuard(IntrinsicInst *ToWiden, Value *NewCondition) { + Value *Result; + widenCondCommon(ToWiden->getArgOperand(0), NewCondition, ToWiden, Result); + ToWiden->setArgOperand(0, Result); + } + +public: + explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT, + LoopInfo &LI) + : DT(DT), PDT(PDT), LI(LI) {} + + /// The entry point for this pass. 
+ bool run(); +}; + +struct GuardWideningLegacyPass : public FunctionPass { + static char ID; + GuardWideningPass Impl; + + GuardWideningLegacyPass() : FunctionPass(ID) { + initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + return GuardWideningImpl( + getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(), + getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + } +}; + +} + +bool GuardWideningImpl::run() { + using namespace llvm::PatternMatch; + + DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock; + bool Changed = false; + + for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode()); + DFI != DFE; ++DFI) { + auto *BB = (*DFI)->getBlock(); + auto &CurrentList = GuardsInBlock[BB]; + + for (auto &I : *BB) + if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>())) + CurrentList.push_back(cast<IntrinsicInst>(&I)); + + for (auto *II : CurrentList) + Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock); + } + + for (auto *II : EliminatedGuards) + if (!WidenedGuards.count(II)) + II->eraseFromParent(); + + return Changed; +} + +bool GuardWideningImpl::eliminateGuardViaWidening( + IntrinsicInst *GuardInst, const df_iterator<DomTreeNode *> &DFSI, + const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> & + GuardsInBlock) { + IntrinsicInst *BestSoFar = nullptr; + auto BestScoreSoFar = WS_IllegalOrNegative; + auto *GuardInstLoop = LI.getLoopFor(GuardInst->getParent()); + + // In the set of dominating guards, find the one we can merge GuardInst with + // for the most profit. + for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) { + auto *CurBB = DFSI.getPath(i)->getBlock(); + auto *CurLoop = LI.getLoopFor(CurBB); + assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!"); + const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second; + + auto I = GuardsInCurBB.begin(); + auto E = GuardsInCurBB.end(); + +#ifndef NDEBUG + { + unsigned Index = 0; + for (auto &I : *CurBB) { + if (Index == GuardsInCurBB.size()) + break; + if (GuardsInCurBB[Index] == &I) + Index++; + } + assert(Index == GuardsInCurBB.size() && + "Guards expected to be in order!"); + } +#endif + + assert((i == (e - 1)) == (GuardInst->getParent() == CurBB) && "Bad DFS?"); + + if (i == (e - 1)) { + // Corner case: make sure we're only looking at guards strictly dominating + // GuardInst when visiting GuardInst->getParent(). 
+ auto NewEnd = std::find(I, E, GuardInst); + assert(NewEnd != E && "GuardInst not in its own block?"); + E = NewEnd; + } + + for (auto *Candidate : make_range(I, E)) { + auto Score = + computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop); + DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0) + << " and " << *Candidate->getArgOperand(0) << " is " + << scoreTypeToString(Score) << "\n"); + if (Score > BestScoreSoFar) { + BestScoreSoFar = Score; + BestSoFar = Candidate; + } + } + } + + if (BestScoreSoFar == WS_IllegalOrNegative) { + DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n"); + return false; + } + + assert(BestSoFar != GuardInst && "Should have never visited same guard!"); + assert(DT.dominates(BestSoFar, GuardInst) && "Should be!"); + + DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar + << " with score " << scoreTypeToString(BestScoreSoFar) << "\n"); + widenGuard(BestSoFar, GuardInst->getArgOperand(0)); + GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext())); + EliminatedGuards.push_back(GuardInst); + WidenedGuards.insert(BestSoFar); + return true; +} + +GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore( + IntrinsicInst *DominatedGuard, Loop *DominatedGuardLoop, + IntrinsicInst *DominatingGuard, Loop *DominatingGuardLoop) { + bool HoistingOutOfLoop = false; + + if (DominatingGuardLoop != DominatedGuardLoop) { + if (DominatingGuardLoop && + !DominatingGuardLoop->contains(DominatedGuardLoop)) + return WS_IllegalOrNegative; + + HoistingOutOfLoop = true; + } + + if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard)) + return WS_IllegalOrNegative; + + bool HoistingOutOfIf = + !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent()); + + if (isWideningCondProfitable(DominatedGuard->getArgOperand(0), + DominatingGuard->getArgOperand(0))) + return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive; + + if (HoistingOutOfLoop) + return WS_Positive; + + return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral; +} + +bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc, + SmallPtrSetImpl<Instruction *> &Visited) { + auto *Inst = dyn_cast<Instruction>(V); + if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst)) + return true; + + if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) || + Inst->mayReadFromMemory()) + return false; + + Visited.insert(Inst); + + // We only want to go _up_ the dominance chain when recursing. 
+ assert(!isa<PHINode>(Loc) && + "PHIs should return false for isSafeToSpeculativelyExecute"); + assert(DT.isReachableFromEntry(Inst->getParent()) && + "We did a DFS from the block entry!"); + return all_of(Inst->operands(), + [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); }); +} + +void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) { + auto *Inst = dyn_cast<Instruction>(V); + if (!Inst || DT.dominates(Inst, Loc)) + return; + + assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) && + !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!"); + + for (Value *Op : Inst->operands()) + makeAvailableAt(Op, Loc); + + Inst->moveBefore(Loc); +} + +bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, + Instruction *InsertPt, Value *&Result) { + using namespace llvm::PatternMatch; + + { + // L >u C0 && L >u C1 -> L >u max(C0, C1) + ConstantInt *RHS0, *RHS1; + Value *LHS; + ICmpInst::Predicate Pred0, Pred1; + if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) && + match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) { + + ConstantRange CR0 = + ConstantRange::makeExactICmpRegion(Pred0, RHS0->getValue()); + ConstantRange CR1 = + ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue()); + + // SubsetIntersect is a subset of the actual mathematical intersection of + // CR0 and CR1, while SupersetIntersect is a superset of the actual + // mathematical intersection. If these two ConstantRanges are equal, then + // we know we were able to represent the actual mathematical intersection + // of CR0 and CR1, and can use the same to generate an icmp instruction. + // + // Given what we're doing here and the semantics of guards, it would + // actually be correct to just use SubsetIntersect, but that may be too + // aggressive in cases we care about. + auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse(); + auto SupersetIntersect = CR0.intersectWith(CR1); + + APInt NewRHSAP; + CmpInst::Predicate Pred; + if (SubsetIntersect == SupersetIntersect && + SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) { + if (InsertPt) { + ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP); + Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk"); + } + return true; + } + } + } + + { + SmallVector<GuardWideningImpl::RangeCheck, 4> Checks, CombinedChecks; + if (parseRangeChecks(Cond0, Checks) && parseRangeChecks(Cond1, Checks) && + combineRangeChecks(Checks, CombinedChecks)) { + if (InsertPt) { + Result = nullptr; + for (auto &RC : CombinedChecks) { + makeAvailableAt(RC.getCheckInst(), InsertPt); + if (Result) + Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "", + InsertPt); + else + Result = RC.getCheckInst(); + } + + Result->setName("wide.chk"); + } + return true; + } + } + + // Base case -- just logical-and the two conditions together. + + if (InsertPt) { + makeAvailableAt(Cond0, InsertPt); + makeAvailableAt(Cond1, InsertPt); + + Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt); + } + + // We were not able to compute Cond0 AND Cond1 for the price of one. 
+ return false; +} + +bool GuardWideningImpl::parseRangeChecks( + Value *CheckCond, SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks, + SmallPtrSetImpl<Value *> &Visited) { + if (!Visited.insert(CheckCond).second) + return true; + + using namespace llvm::PatternMatch; + + { + Value *AndLHS, *AndRHS; + if (match(CheckCond, m_And(m_Value(AndLHS), m_Value(AndRHS)))) + return parseRangeChecks(AndLHS, Checks) && + parseRangeChecks(AndRHS, Checks); + } + + auto *IC = dyn_cast<ICmpInst>(CheckCond); + if (!IC || !IC->getOperand(0)->getType()->isIntegerTy() || + (IC->getPredicate() != ICmpInst::ICMP_ULT && + IC->getPredicate() != ICmpInst::ICMP_UGT)) + return false; + + Value *CmpLHS = IC->getOperand(0), *CmpRHS = IC->getOperand(1); + if (IC->getPredicate() == ICmpInst::ICMP_UGT) + std::swap(CmpLHS, CmpRHS); + + auto &DL = IC->getModule()->getDataLayout(); + + GuardWideningImpl::RangeCheck Check( + CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())), + CmpRHS, IC); + + if (!isKnownNonNegative(Check.getLength(), DL)) + return false; + + // What we have in \c Check now is a correct interpretation of \p CheckCond. + // Try to see if we can move some constant offsets into the \c Offset field. + + bool Changed; + auto &Ctx = CheckCond->getContext(); + + do { + Value *OpLHS; + ConstantInt *OpRHS; + Changed = false; + +#ifndef NDEBUG + auto *BaseInst = dyn_cast<Instruction>(Check.getBase()); + assert((!BaseInst || DT.isReachableFromEntry(BaseInst->getParent())) && + "Unreachable instruction?"); +#endif + + if (match(Check.getBase(), m_Add(m_Value(OpLHS), m_ConstantInt(OpRHS)))) { + Check.setBase(OpLHS); + APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue(); + Check.setOffset(ConstantInt::get(Ctx, NewOffset)); + Changed = true; + } else if (match(Check.getBase(), + m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) { + unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits(); + APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); + computeKnownBits(OpLHS, KnownZero, KnownOne, DL); + if ((OpRHS->getValue() & KnownZero) == OpRHS->getValue()) { + Check.setBase(OpLHS); + APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue(); + Check.setOffset(ConstantInt::get(Ctx, NewOffset)); + Changed = true; + } + } + } while (Changed); + + Checks.push_back(Check); + return true; +} + +bool GuardWideningImpl::combineRangeChecks( + SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks, + SmallVectorImpl<GuardWideningImpl::RangeCheck> &RangeChecksOut) { + unsigned OldCount = Checks.size(); + while (!Checks.empty()) { + // Pick all of the range checks with a specific base and length, and try to + // merge them. + Value *CurrentBase = Checks.front().getBase(); + Value *CurrentLength = Checks.front().getLength(); + + SmallVector<GuardWideningImpl::RangeCheck, 3> CurrentChecks; + + auto IsCurrentCheck = [&](GuardWideningImpl::RangeCheck &RC) { + return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength; + }; + + std::copy_if(Checks.begin(), Checks.end(), + std::back_inserter(CurrentChecks), IsCurrentCheck); + Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end()); + + assert(CurrentChecks.size() != 0 && "We know we have at least one!"); + + if (CurrentChecks.size() < 3) { + RangeChecksOut.insert(RangeChecksOut.end(), CurrentChecks.begin(), + CurrentChecks.end()); + continue; + } + + // CurrentChecks.size() will typically be 3 here, but so far there has been + // no need to hard-code that fact. 
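  // Editorial worked example (not part of the patch): for the checks
  //   I+0 u< L,  I+2 u< L,  I+5 u< L
  // the sort below orders them by offset, and keeping only the first (I+0)
  // and the last (I+5) is enough: the offset span 5-0 is far below the wrap
  // limit verified after the sort, so [I+0, I+5] cannot wrap, I+2 lies inside
  // that range, and I+5 u< L therefore implies I+2 u< L.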
+ + std::sort(CurrentChecks.begin(), CurrentChecks.end(), + [&](const GuardWideningImpl::RangeCheck &LHS, + const GuardWideningImpl::RangeCheck &RHS) { + return LHS.getOffsetValue().slt(RHS.getOffsetValue()); + }); + + // Note: std::sort should not invalidate the ChecksStart iterator. + + ConstantInt *MinOffset = CurrentChecks.front().getOffset(), + *MaxOffset = CurrentChecks.back().getOffset(); + + unsigned BitWidth = MaxOffset->getValue().getBitWidth(); + if ((MaxOffset->getValue() - MinOffset->getValue()) + .ugt(APInt::getSignedMinValue(BitWidth))) + return false; + + APInt MaxDiff = MaxOffset->getValue() - MinOffset->getValue(); + const APInt &HighOffset = MaxOffset->getValue(); + auto OffsetOK = [&](const GuardWideningImpl::RangeCheck &RC) { + return (HighOffset - RC.getOffsetValue()).ult(MaxDiff); + }; + + if (MaxDiff.isMinValue() || + !std::all_of(std::next(CurrentChecks.begin()), CurrentChecks.end(), + OffsetOK)) + return false; + + // We have a series of f+1 checks as: + // + // I+k_0 u< L ... Chk_0 + // I_k_1 u< L ... Chk_1 + // ... + // I_k_f u< L ... Chk_(f+1) + // + // with forall i in [0,f): k_f-k_i u< k_f-k_0 ... Precond_0 + // k_f-k_0 u< INT_MIN+k_f ... Precond_1 + // k_f != k_0 ... Precond_2 + // + // Claim: + // Chk_0 AND Chk_(f+1) implies all the other checks + // + // Informal proof sketch: + // + // We will show that the integer range [I+k_0,I+k_f] does not unsigned-wrap + // (i.e. going from I+k_0 to I+k_f does not cross the -1,0 boundary) and + // thus I+k_f is the greatest unsigned value in that range. + // + // This combined with Ckh_(f+1) shows that everything in that range is u< L. + // Via Precond_0 we know that all of the indices in Chk_0 through Chk_(f+1) + // lie in [I+k_0,I+k_f], this proving our claim. + // + // To see that [I+k_0,I+k_f] is not a wrapping range, note that there are + // two possibilities: I+k_0 u< I+k_f or I+k_0 >u I+k_f (they can't be equal + // since k_0 != k_f). In the former case, [I+k_0,I+k_f] is not a wrapping + // range by definition, and the latter case is impossible: + // + // 0-----I+k_f---I+k_0----L---INT_MAX,INT_MIN------------------(-1) + // xxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + // + // For Chk_0 to succeed, we'd have to have k_f-k_0 (the range highlighted + // with 'x' above) to be at least >u INT_MIN. + + RangeChecksOut.emplace_back(CurrentChecks.front()); + RangeChecksOut.emplace_back(CurrentChecks.back()); + } + + assert(RangeChecksOut.size() <= OldCount && "We pessimized!"); + return RangeChecksOut.size() != OldCount; +} + +PreservedAnalyses GuardWideningPass::run(Function &F, + AnalysisManager<Function> &AM) { + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); + bool Changed = GuardWideningImpl(DT, PDT, LI).run(); + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { + switch (WS) { + case WS_IllegalOrNegative: + return "IllegalOrNegative"; + case WS_Neutral: + return "Neutral"; + case WS_Positive: + return "Positive"; + case WS_VeryPositive: + return "VeryPositive"; + } + + llvm_unreachable("Fully covered switch above!"); +} + +char GuardWideningLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards", + false, false) + +FunctionPass *llvm::createGuardWideningPass() { + return new GuardWideningLegacyPass(); +} diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index ec5e15f0b8f83..542cf38e43bbd 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -24,13 +24,14 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -69,9 +70,6 @@ static cl::opt<bool> VerifyIndvars( "verify-indvars", cl::Hidden, cl::desc("Verify the ScalarEvolution result after running indvars")); -static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden, - cl::desc("Reduce live induction variables.")); - enum ReplaceExitVal { NeverRepl, OnlyCheapRepl, AlwaysRepl }; static cl::opt<ReplaceExitVal> ReplaceExitValue( @@ -87,42 +85,16 @@ static cl::opt<ReplaceExitVal> ReplaceExitValue( namespace { struct RewritePhi; -class IndVarSimplify : public LoopPass { - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - TargetLibraryInfo *TLI; +class IndVarSimplify { + LoopInfo *LI; + ScalarEvolution *SE; + DominatorTree *DT; + const DataLayout &DL; + TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; SmallVector<WeakVH, 16> DeadInsts; - bool Changed; -public: - - static char ID; // Pass identification, replacement for typeid - IndVarSimplify() - : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { - initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreservedID(LoopSimplifyID); - AU.addPreservedID(LCSSAID); - AU.setPreservesCFG(); - } - -private: - void releaseMemory() override { - DeadInsts.clear(); - } + bool Changed = false; bool isValidRewrite(Value *FromVal, Value *ToVal); @@ -133,6 +105,7 @@ private: bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); void 
rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); + void rewriteFirstIterationLoopExitValues(Loop *L); Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter); @@ -141,22 +114,15 @@ private: Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, Instruction *InsertPt, Type *Ty); -}; -} -char IndVarSimplify::ID = 0; -INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", - "Induction Variable Simplification", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_END(IndVarSimplify, "indvars", - "Induction Variable Simplification", false, false) +public: + IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + const DataLayout &DL, TargetLibraryInfo *TLI, + TargetTransformInfo *TTI) + : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI) {} -Pass *llvm::createIndVarSimplifyPass() { - return new IndVarSimplify(); + bool run(Loop *L); +}; } /// Return true if the SCEV expansion generated by the rewriter can replace the @@ -504,10 +470,9 @@ struct RewritePhi { unsigned Ith; // Ith incoming value. Value *Val; // Exit value after expansion. bool HighCost; // High Cost when expansion. - bool SafePhi; // LCSSASafePhiForRAUW. - RewritePhi(PHINode *P, unsigned I, Value *V, bool H, bool S) - : PN(P), Ith(I), Val(V), HighCost(H), SafePhi(S) {} + RewritePhi(PHINode *P, unsigned I, Value *V, bool H) + : PN(P), Ith(I), Val(V), HighCost(H) {} }; } @@ -550,9 +515,7 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Find all values that are computed inside the loop, but used outside of it. // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan // the exit blocks of the loop to find them. - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { - BasicBlock *ExitBB = ExitBlocks[i]; - + for (BasicBlock *ExitBB : ExitBlocks) { // If there are no PHI nodes in this exit block, then no values defined // inside the loop are used on this path, skip it. PHINode *PN = dyn_cast<PHINode>(ExitBB->begin()); @@ -560,29 +523,13 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { unsigned NumPreds = PN->getNumIncomingValues(); - // We would like to be able to RAUW single-incoming value PHI nodes. We - // have to be certain this is safe even when this is an LCSSA PHI node. - // While the computed exit value is no longer varying in *this* loop, the - // exit block may be an exit block for an outer containing loop as well, - // the exit value may be varying in the outer loop, and thus it may still - // require an LCSSA PHI node. The safe case is when this is - // single-predecessor PHI node (LCSSA) and the exit block containing it is - // part of the enclosing loop, or this is the outer most loop of the nest. - // In either case the exit value could (at most) be varying in the same - // loop body as the phi node itself. Thus if it is in turn used outside of - // an enclosing loop it will only be via a separate LCSSA node. - bool LCSSASafePhiForRAUW = - NumPreds == 1 && - (!L->getParentLoop() || L->getParentLoop() == LI->getLoopFor(ExitBB)); - // Iterate over all of the PHI nodes. 
BasicBlock::iterator BBI = ExitBB->begin(); while ((PN = dyn_cast<PHINode>(BBI++))) { if (PN->use_empty()) continue; // dead use, don't replace it - // SCEV only supports integer expressions for now. - if (!PN->getType()->isIntegerTy() && !PN->getType()->isPointerTy()) + if (!SE->isSCEVable(PN->getType())) continue; // It's necessary to tell ScalarEvolution about this explicitly so that @@ -669,8 +616,7 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { } // Collect all the candidate PHINodes to be rewritten. - RewritePhiSet.push_back( - RewritePhi(PN, i, ExitVal, HighCost, LCSSASafePhiForRAUW)); + RewritePhiSet.emplace_back(PN, i, ExitVal, HighCost); } } } @@ -699,9 +645,9 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { if (isInstructionTriviallyDead(Inst, TLI)) DeadInsts.push_back(Inst); - // If we determined that this PHI is safe to replace even if an LCSSA - // PHI, do so. - if (Phi.SafePhi) { + // Replace PN with ExitVal if that is legal and does not break LCSSA. + if (PN->getNumIncomingValues() == 1 && + LI->replacementPreservesLCSSAForm(PN, ExitVal)) { PN->replaceAllUsesWith(ExitVal); PN->eraseFromParent(); } @@ -712,6 +658,80 @@ void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { Rewriter.clearInsertPoint(); } +//===---------------------------------------------------------------------===// +// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know +// they will exit at the first iteration. +//===---------------------------------------------------------------------===// + +/// Check to see if this loop has loop invariant conditions which lead to loop +/// exits. If so, we know that if the exit path is taken, it is at the first +/// loop iteration. This lets us predict exit values of PHI nodes that live in +/// loop header. +void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { + // Verify the input to the pass is already in LCSSA form. + assert(L->isLCSSAForm(*DT)); + + SmallVector<BasicBlock *, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + auto *LoopHeader = L->getHeader(); + assert(LoopHeader && "Invalid loop"); + + for (auto *ExitBB : ExitBlocks) { + BasicBlock::iterator BBI = ExitBB->begin(); + // If there are no more PHI nodes in this exit block, then no more + // values defined inside the loop are used on this path. + while (auto *PN = dyn_cast<PHINode>(BBI++)) { + for (unsigned IncomingValIdx = 0, E = PN->getNumIncomingValues(); + IncomingValIdx != E; ++IncomingValIdx) { + auto *IncomingBB = PN->getIncomingBlock(IncomingValIdx); + + // We currently only support loop exits from loop header. If the + // incoming block is not loop header, we need to recursively check + // all conditions starting from loop header are loop invariants. + // Additional support might be added in the future. + if (IncomingBB != LoopHeader) + continue; + + // Get condition that leads to the exit path. + auto *TermInst = IncomingBB->getTerminator(); + + Value *Cond = nullptr; + if (auto *BI = dyn_cast<BranchInst>(TermInst)) { + // Must be a conditional branch, otherwise the block + // should not be in the loop. + Cond = BI->getCondition(); + } else if (auto *SI = dyn_cast<SwitchInst>(TermInst)) + Cond = SI->getCondition(); + else + continue; + + if (!L->isLoopInvariant(Cond)) + continue; + + auto *ExitVal = + dyn_cast<PHINode>(PN->getIncomingValue(IncomingValIdx)); + + // Only deal with PHIs. 
+ if (!ExitVal) + continue; + + // If ExitVal is a PHI on the loop header, then we know its + // value along this exit because the exit can only be taken + // on the first iteration. + auto *LoopPreheader = L->getLoopPreheader(); + assert(LoopPreheader && "Invalid loop"); + int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader); + if (PreheaderIdx != -1) { + assert(ExitVal->getParent() == LoopHeader && + "ExitVal must be in loop header"); + PN->setIncomingValue(IncomingValIdx, + ExitVal->getIncomingValue(PreheaderIdx)); + } + } + } + } +} + /// Check whether it is possible to delete the loop after rewriting exit /// value. If it is possible, ignore ReplaceExitValue and do rewriting /// aggressively. @@ -1240,6 +1260,12 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { if (UsePhi->getNumOperands() != 1) truncateIVUse(DU, DT, LI); else { + // Widening the PHI requires us to insert a trunc. The logical place + // for this trunc is in the same BB as the PHI. This is not possible if + // the BB is terminated by a catchswitch. + if (isa<CatchSwitchInst>(UsePhi->getParent()->getTerminator())) + return nullptr; + PHINode *WidePhi = PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", UsePhi); @@ -1317,8 +1343,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Reuse the IV increment that SCEVExpander created as long as it dominates // NarrowUse. Instruction *WideUse = nullptr; - if (WideAddRec == WideIncExpr - && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) + if (WideAddRec == WideIncExpr && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { WideUse = cloneIVUser(DU, WideAddRec); @@ -1355,8 +1380,7 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { if (!Widened.insert(NarrowUser).second) continue; - NarrowIVUsers.push_back( - NarrowIVDefUse(NarrowDef, NarrowUser, WideDef, NeverNegative)); + NarrowIVUsers.emplace_back(NarrowDef, NarrowUser, WideDef, NeverNegative); } } @@ -1391,9 +1415,10 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // An AddRec must have loop-invariant operands. Since this AddRec is // materialized by a loop header phi, the expression cannot have any post-loop // operands, so they must dominate the loop header. - assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) && - SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) - && "Loop header phi recurrence inputs do not dominate the loop"); + assert( + SE->properlyDominates(AddRec->getStart(), L->getHeader()) && + SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) && + "Loop header phi recurrence inputs do not dominate the loop"); // The rewriter provides a value for the desired IV expression. This may // either find an existing phi or materialize a new one. Either way, we @@ -1463,8 +1488,6 @@ public: : SE(SCEV), TTI(TTI), IVPhi(IV) { DT = DTree; WI.NarrowIV = IVPhi; - if (ReduceLiveIVs) - setSplitOverflowIntrinsics(); } // Implement the interface used by simplifyUsersOfIV. 
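For context on the widenIVUse changes above (the new catchswitch bail-out and the emplace_back cleanup), this is the kind of loop IV widening targets. The example is a motivation sketch, not the pass's output:

// 'i' is 32-bit but every use needs a 64-bit index, so widening the induction
// variable to 64 bits removes a sign-extension from each iteration.
long long sumElements(const long long *p, int n) {
  long long s = 0;
  for (int i = 0; i < n; ++i)
    s += p[i];   // p[(long long)i]: the sext is what WidenIV eliminates
  return s;
}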
@@ -1729,6 +1752,7 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, const SCEV *BestInit = nullptr; BasicBlock *LatchBlock = L->getLoopLatch(); assert(LatchBlock && "needsLFTR should guarantee a loop latch"); + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { PHINode *Phi = cast<PHINode>(I); @@ -1747,8 +1771,7 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); - if (PhiWidth < BCWidth || - !L->getHeader()->getModule()->getDataLayout().isLegalInteger(PhiWidth)) + if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth)) continue; const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); @@ -1767,8 +1790,8 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, // the loop test. In this case we assume that performing LFTR could not // increase the number of undef users. if (ICmpInst *Cond = getLoopTest(L)) { - if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT) - && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) { + if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT) && + Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) { continue; } } @@ -1810,9 +1833,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // finds a valid pointer IV. Sign extend BECount in order to materialize a // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing // the existing GEPs whenever possible. - if (IndVar->getType()->isPointerTy() - && !IVCount->getType()->isPointerTy()) { - + if (IndVar->getType()->isPointerTy() && !IVCount->getType()->isPointerTy()) { // IVOffset will be the new GEP offset that is interpreted by GEP as a // signed value. IVCount on the other hand represents the loop trip count, // which is an unsigned value. FindLoopCounter only allows induction @@ -1833,13 +1854,13 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // We could handle pointer IVs other than i8*, but we need to compensate for // gep index scaling. See canExpandBackedgeTakenCount comments. assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()), - cast<PointerType>(GEPBase->getType())->getElementType())->isOne() - && "unit stride pointer IV must be i8*"); + cast<PointerType>(GEPBase->getType()) + ->getElementType())->isOne() && + "unit stride pointer IV must be i8*"); IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit"); - } - else { + } else { // In any other case, convert both IVInit and IVCount to integers before // comparing. This may result in SCEV expension of pointers, but in practice // SCEV will fold the pointer arithmetic away as such: @@ -1913,8 +1934,9 @@ linearFunctionTestReplace(Loop *L, } Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); - assert(ExitCnt->getType()->isPointerTy() == IndVar->getType()->isPointerTy() - && "genLoopLimit missed a cast"); + assert(ExitCnt->getType()->isPointerTy() == + IndVar->getType()->isPointerTy() && + "genLoopLimit missed a cast"); // Insert a new icmp_ne or icmp_eq instruction before the branch. 
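The genLoopLimit path above materializes the pointer exit limit once in the preheader; at source level the shape it produces looks roughly like this (illustrative, and deliberately using a byte pointer to match the i8* restriction the assert documents):

void zeroBytes(char *buf, long n) {
  char *limit = buf + n;            // "lftr.limit", computed in the preheader
  for (char *p = buf; p != limit; ++p)
    *p = 0;                         // back edge is a plain pointer inequality
}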
BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); @@ -2074,9 +2096,9 @@ void IndVarSimplify::sinkUnusedInvariants(Loop *L) { // IndVarSimplify driver. Manage several subpasses of IV simplification. //===----------------------------------------------------------------------===// -bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; +bool IndVarSimplify::run(Loop *L) { + // We need (and expect!) the incoming loop to be in LCSSA. + assert(L->isRecursivelyLCSSAForm(*DT) && "LCSSA required to run indvars!"); // If LoopSimplify form is not available, stay out of trouble. Some notes: // - LSR currently only supports LoopSimplify-form loops. Indvars' @@ -2089,18 +2111,6 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TLI = TLIP ? &TLIP->getTLI() : nullptr; - auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); - TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - - DeadInsts.clear(); - Changed = false; - // If there are any floating-point recurrences, attempt to // transform them to use integer recurrences. rewriteNonIntegerIVs(L); @@ -2172,6 +2182,11 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // loop may be sunk below the loop to reduce register pressure. sinkUnusedInvariants(L); + // rewriteFirstIterationLoopExitValues does not rely on the computation of + // trip count and therefore can further simplify exit values in addition to + // rewriteLoopExitValues. + rewriteFirstIterationLoopExitValues(L); + // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader(), TLI); @@ -2197,3 +2212,69 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; } + +PreservedAnalyses IndVarSimplifyPass::run(Loop &L, AnalysisManager<Loop> &AM) { + auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + Function *F = L.getHeader()->getParent(); + const DataLayout &DL = F->getParent()->getDataLayout(); + + auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); + auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); + auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); + + assert((LI && SE && DT) && + "Analyses required for indvarsimplify not available!"); + + // Optional analyses. + auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F); + auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F); + + IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI); + if (!IVS.run(&L)) + return PreservedAnalyses::all(); + + // FIXME: This should also 'preserve the CFG'. 
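The new-pass-manager entry point above only queries cached function-level results and asserts on the required ones; getCachedResult never computes an analysis on demand. A toy model of that contract with hypothetical names, not LLVM API:

#include <cassert>
#include <map>
#include <string>

struct ResultCache {
  std::map<std::string, int> Results;
  // Like getCachedResult: look up only, never compute.
  const int *getCached(const std::string &Name) const {
    auto It = Results.find(Name);
    return It == Results.end() ? nullptr : &It->second;
  }
};

int main() {
  ResultCache C;
  C.Results["LoopAnalysis"] = 1;
  assert(C.getCached("LoopAnalysis") && "required analyses must be cached");
  assert(!C.getCached("TargetIRAnalysis") && "optional ones may be absent");
}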
+ return getLoopPassPreservedAnalyses(); +} + +namespace { +struct IndVarSimplifyLegacyPass : public LoopPass { + static char ID; // Pass identification, replacement for typeid + IndVarSimplifyLegacyPass() : LoopPass(ID) { + initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); + auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI); + return IVS.run(L); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + getLoopAnalysisUsage(AU); + } +}; +} + +char IndVarSimplifyLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars", + "Induction Variable Simplification", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_END(IndVarSimplifyLegacyPass, "indvars", + "Induction Variable Simplification", false, false) + +Pass *llvm::createIndVarSimplifyPass() { + return new IndVarSimplifyLegacyPass(); +} diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index dea61f6ff3d7e..ec7f09a2d598f 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -67,7 +67,6 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include <array> using namespace llvm; @@ -114,24 +113,22 @@ class InductiveRangeCheck { RANGE_CHECK_UNKNOWN = (unsigned)-1 }; - static const char *rangeCheckKindToStr(RangeCheckKind); + static StringRef rangeCheckKindToStr(RangeCheckKind); - const SCEV *Offset; - const SCEV *Scale; - Value *Length; - BranchInst *Branch; - RangeCheckKind Kind; + const SCEV *Offset = nullptr; + const SCEV *Scale = nullptr; + Value *Length = nullptr; + Use *CheckUse = nullptr; + RangeCheckKind Kind = RANGE_CHECK_UNKNOWN; static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE, Value *&Index, Value *&Length); - static InductiveRangeCheck::RangeCheckKind - parseRangeCheck(Loop *L, ScalarEvolution &SE, Value *Condition, - const SCEV *&Index, Value *&UpperLimit); - - InductiveRangeCheck() : - Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { } + static void + extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse, + SmallVectorImpl<InductiveRangeCheck> &Checks, + SmallPtrSetImpl<Value *> &Visited); public: const SCEV *getOffset() const { return Offset; } @@ -150,9 +147,9 @@ public: Length->print(OS); else OS << "(null)"; - OS << "\n Branch: "; - getBranch()->print(OS); - OS << "\n"; + OS << "\n CheckUse: "; + getCheckUse()->getUser()->print(OS); + OS << " Operand: " << getCheckUse()->getOperandNo() << "\n"; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -161,7 +158,7 @@ public: } #endif - BranchInst *getBranch() const { return Branch; } + Use *getCheckUse() const { return CheckUse; } /// 
Represents an signed integer range [Range.getBegin(), Range.getEnd()). If /// R.getEnd() sle R.getBegin(), then R denotes the empty range. @@ -180,8 +177,6 @@ public: const SCEV *getEnd() const { return End; } }; - typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy; - /// This is the value the condition of the branch needs to evaluate to for the /// branch to take the hot successor (see (1) above). bool getPassingDirection() { return true; } @@ -190,19 +185,20 @@ public: /// check is redundant and can be constant-folded away. The induction /// variable is not required to be the canonical {0,+,1} induction variable. Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE, - const SCEVAddRecExpr *IndVar, - IRBuilder<> &B) const; - - /// Create an inductive range check out of BI if possible, else return - /// nullptr. - static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI, - Loop *L, ScalarEvolution &SE, - BranchProbabilityInfo &BPI); + const SCEVAddRecExpr *IndVar) const; + + /// Parse out a set of inductive range checks from \p BI and append them to \p + /// Checks. + /// + /// NB! There may be conditions feeding into \p BI that aren't inductive range + /// checks, and hence don't end up in \p Checks. + static void + extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE, + BranchProbabilityInfo &BPI, + SmallVectorImpl<InductiveRangeCheck> &Checks); }; class InductiveRangeCheckElimination : public LoopPass { - InductiveRangeCheck::AllocatorTy Allocator; - public: static char ID; InductiveRangeCheckElimination() : LoopPass(ID) { @@ -211,11 +207,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<BranchProbabilityInfoWrapperPass>(); + getLoopAnalysisUsage(AU); } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -226,15 +219,12 @@ char InductiveRangeCheckElimination::ID = 0; INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", "Inductive range check elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce", "Inductive range check elimination", false, false) -const char *InductiveRangeCheck::rangeCheckKindToStr( +StringRef InductiveRangeCheck::rangeCheckKindToStr( InductiveRangeCheck::RangeCheckKind RCK) { switch (RCK) { case InductiveRangeCheck::RANGE_CHECK_UNKNOWN: @@ -253,11 +243,9 @@ const char *InductiveRangeCheck::rangeCheckKindToStr( llvm_unreachable("unknown range check type!"); } -/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` -/// cannot +/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` cannot /// be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and set -/// `Index` and `Length` to `nullptr`. Otherwise set `Index` to the value -/// being +/// `Index` and `Length` to `nullptr`. Otherwise set `Index` to the value being /// range checked, and set `Length` to the upper limit `Index` is being range /// checked with if (and only if) the range check type is stronger or equal to /// RANGE_CHECK_UPPER. 
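For readers unfamiliar with IRCE, the checks this class models look like the following at source level. This is a hedged illustration; the kind names in the comment mirror the enum above:

#include <stdexcept>
#include <vector>

int sumChecked(const std::vector<int> &v, int n) {
  int s = 0;
  for (int i = 0; i < n; ++i) {
    // Lower + upper bound test on an affine function of the IV:
    // RANGE_CHECK_LOWER and RANGE_CHECK_UPPER, together RANGE_CHECK_BOTH.
    if (i < 0 || i >= (int)v.size())
      throw std::out_of_range("i");   // cold path, cf. the LikelyTaken filter
    s += v[i];
  }
  return s;
}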
@@ -327,106 +315,89 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, llvm_unreachable("default clause returns!"); } -/// Parses an arbitrary condition into a range check. `Length` is set only if -/// the range check is recognized to be `RANGE_CHECK_UPPER` or stronger. -InductiveRangeCheck::RangeCheckKind -InductiveRangeCheck::parseRangeCheck(Loop *L, ScalarEvolution &SE, - Value *Condition, const SCEV *&Index, - Value *&Length) { +void InductiveRangeCheck::extractRangeChecksFromCond( + Loop *L, ScalarEvolution &SE, Use &ConditionUse, + SmallVectorImpl<InductiveRangeCheck> &Checks, + SmallPtrSetImpl<Value *> &Visited) { using namespace llvm::PatternMatch; - Value *A = nullptr; - Value *B = nullptr; - - if (match(Condition, m_And(m_Value(A), m_Value(B)))) { - Value *IndexA = nullptr, *IndexB = nullptr; - Value *LengthA = nullptr, *LengthB = nullptr; - ICmpInst *ICmpA = dyn_cast<ICmpInst>(A), *ICmpB = dyn_cast<ICmpInst>(B); - - if (!ICmpA || !ICmpB) - return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - - auto RCKindA = parseRangeCheckICmp(L, ICmpA, SE, IndexA, LengthA); - auto RCKindB = parseRangeCheckICmp(L, ICmpB, SE, IndexB, LengthB); - - if (RCKindA == InductiveRangeCheck::RANGE_CHECK_UNKNOWN || - RCKindB == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) - return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - - if (IndexA != IndexB) - return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - - if (LengthA != nullptr && LengthB != nullptr && LengthA != LengthB) - return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; - - Index = SE.getSCEV(IndexA); - if (isa<SCEVCouldNotCompute>(Index)) - return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + Value *Condition = ConditionUse.get(); + if (!Visited.insert(Condition).second) + return; - Length = LengthA == nullptr ? LengthB : LengthA; + if (match(Condition, m_And(m_Value(), m_Value()))) { + SmallVector<InductiveRangeCheck, 8> SubChecks; + extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0), + SubChecks, Visited); + extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1), + SubChecks, Visited); + + if (SubChecks.size() == 2) { + // Handle a special case where we know how to merge two checks separately + // checking the upper and lower bounds into a full range check. + const auto &RChkA = SubChecks[0]; + const auto &RChkB = SubChecks[1]; + if ((RChkA.Length == RChkB.Length || !RChkA.Length || !RChkB.Length) && + RChkA.Offset == RChkB.Offset && RChkA.Scale == RChkB.Scale) { + + // If RChkA.Kind == RChkB.Kind then we just found two identical checks. + // But if one of them is a RANGE_CHECK_LOWER and the other is a + // RANGE_CHECK_UPPER (only possibility if they're different) then + // together they form a RANGE_CHECK_BOTH. + SubChecks[0].Kind = + (InductiveRangeCheck::RangeCheckKind)(RChkA.Kind | RChkB.Kind); + SubChecks[0].Length = RChkA.Length ? RChkA.Length : RChkB.Length; + SubChecks[0].CheckUse = &ConditionUse; + + // We updated one of the checks in place, now erase the other. 
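The merge above relies on the check kinds being bit flags, so OR-ing a lower-bound check with an upper-bound check yields a full range check. A self-contained sketch of that arithmetic; the concrete values are assumed to mirror the enum's intent:

#include <cassert>

enum RangeCheckKind : unsigned {
  RANGE_CHECK_LOWER = 1,
  RANGE_CHECK_UPPER = 2,
  RANGE_CHECK_BOTH  = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER,
};

int main() {
  // Two half checks on the same index combine into one full check.
  assert((RANGE_CHECK_LOWER | RANGE_CHECK_UPPER) == RANGE_CHECK_BOTH);
  // Merging two identical kinds is harmless: K | K == K.
  assert((RANGE_CHECK_UPPER | RANGE_CHECK_UPPER) == RANGE_CHECK_UPPER);
}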
+ SubChecks.pop_back(); + } + } - return (InductiveRangeCheck::RangeCheckKind)(RCKindA | RCKindB); + Checks.insert(Checks.end(), SubChecks.begin(), SubChecks.end()); + return; } - if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) { - Value *IndexVal = nullptr; - - auto RCKind = parseRangeCheckICmp(L, ICI, SE, IndexVal, Length); + ICmpInst *ICI = dyn_cast<ICmpInst>(Condition); + if (!ICI) + return; - if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) - return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + Value *Length = nullptr, *Index; + auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length); + if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) + return; - Index = SE.getSCEV(IndexVal); - if (isa<SCEVCouldNotCompute>(Index)) - return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + const auto *IndexAddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Index)); + bool IsAffineIndex = + IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine(); - return RCKind; - } + if (!IsAffineIndex) + return; - return InductiveRangeCheck::RANGE_CHECK_UNKNOWN; + InductiveRangeCheck IRC; + IRC.Length = Length; + IRC.Offset = IndexAddRec->getStart(); + IRC.Scale = IndexAddRec->getStepRecurrence(SE); + IRC.CheckUse = &ConditionUse; + IRC.Kind = RCKind; + Checks.push_back(IRC); } - -InductiveRangeCheck * -InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI, - Loop *L, ScalarEvolution &SE, - BranchProbabilityInfo &BPI) { +void InductiveRangeCheck::extractRangeChecksFromBranch( + BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI, + SmallVectorImpl<InductiveRangeCheck> &Checks) { if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) - return nullptr; + return; BranchProbability LikelyTaken(15, 16); - if (BPI.getEdgeProbability(BI->getParent(), (unsigned) 0) < LikelyTaken) - return nullptr; - - Value *Length = nullptr; - const SCEV *IndexSCEV = nullptr; - - auto RCKind = InductiveRangeCheck::parseRangeCheck(L, SE, BI->getCondition(), - IndexSCEV, Length); - - if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) - return nullptr; - - assert(IndexSCEV && "contract with SplitRangeCheckCondition!"); - assert((!(RCKind & InductiveRangeCheck::RANGE_CHECK_UPPER) || Length) && - "contract with SplitRangeCheckCondition!"); - - const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV); - bool IsAffineIndex = - IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine(); + if (BPI.getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken) + return; - if (!IsAffineIndex) - return nullptr; - - InductiveRangeCheck *IRC = new (A.Allocate()) InductiveRangeCheck; - IRC->Length = Length; - IRC->Offset = IndexAddRec->getStart(); - IRC->Scale = IndexAddRec->getStepRecurrence(SE); - IRC->Branch = BI; - IRC->Kind = RCKind; - return IRC; + SmallPtrSet<Value *, 8> Visited; + InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0), + Checks, Visited); } namespace { @@ -666,7 +637,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin()); + BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator()); if (!LatchBr || LatchBr->isUnconditional()) { FailureReason = "latch terminator not conditional branch"; return None; @@ -792,7 +763,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - IRBuilder<> B(&*Preheader->rbegin()); + IRBuilder<> 
B(Preheader->getTerminator()); RightValue = B.CreateAdd(RightValue, One); } @@ -814,7 +785,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - IRBuilder<> B(&*Preheader->rbegin()); + IRBuilder<> B(Preheader->getTerminator()); RightValue = B.CreateSub(RightValue, One); } } @@ -833,7 +804,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP const DataLayout &DL = Preheader->getModule()->getDataLayout(); Value *IndVarStartV = SCEVExpander(SE, DL, "irce") - .expandCodeFor(IndVarStart, IndVarTy, &*Preheader->rbegin()); + .expandCodeFor(IndVarStart, IndVarTy, Preheader->getTerminator()); IndVarStartV->setName("indvar.start"); LoopStructure Result; @@ -947,7 +918,7 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result, for (Instruction &I : *ClonedBB) RemapInstruction(&I, Result.Map, - RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); // Exit blocks will now have one more predecessor and their PHI nodes need // to be edited to reflect that. No phi nodes need to be introduced because @@ -1055,7 +1026,7 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, &*BBInsertLocation); - BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin()); + BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator()); bool Increasing = LS.IndVarIncreasing; IRBuilder<> B(PreheaderJump); @@ -1305,9 +1276,8 @@ bool LoopConstrainer::run() { /// in which the range check can be safely elided. If it cannot compute such a /// range, returns None. Optional<InductiveRangeCheck::Range> -InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE, - const SCEVAddRecExpr *IndVar, - IRBuilder<> &) const { +InductiveRangeCheck::computeSafeIterationSpace( + ScalarEvolution &SE, const SCEVAddRecExpr *IndVar) const { // IndVar is of the form "A + B * I" (where "I" is the canonical induction // variable, that may or may not exist as a real llvm::Value in the loop) and // this inductive range check is a range check on the "C + D * I" ("C" is @@ -1375,7 +1345,7 @@ InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE, static Optional<InductiveRangeCheck::Range> IntersectRange(ScalarEvolution &SE, const Optional<InductiveRangeCheck::Range> &R1, - const InductiveRangeCheck::Range &R2, IRBuilder<> &B) { + const InductiveRangeCheck::Range &R2) { if (!R1.hasValue()) return R2; auto &R1Value = R1.getValue(); @@ -1392,6 +1362,9 @@ IntersectRange(ScalarEvolution &SE, } bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipLoop(L)) + return false; + if (L->getBlocks().size() >= LoopSizeCutoff) { DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";); return false; @@ -1404,17 +1377,15 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { } LLVMContext &Context = Preheader->getContext(); - InductiveRangeCheck::AllocatorTy IRCAlloc; - SmallVector<InductiveRangeCheck *, 16> RangeChecks; + SmallVector<InductiveRangeCheck, 16> RangeChecks; ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); for (auto BBI : L->getBlocks()) if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) - if (InductiveRangeCheck *IRC = - InductiveRangeCheck::create(IRCAlloc, TBI, L, SE, BPI)) - 
RangeChecks.push_back(IRC); + InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI, + RangeChecks); if (RangeChecks.empty()) return false; @@ -1423,8 +1394,8 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { OS << "irce: looking at loop "; L->print(OS); OS << "irce: loop has " << RangeChecks.size() << " inductive range checks: \n"; - for (InductiveRangeCheck *IRC : RangeChecks) - IRC->print(OS); + for (InductiveRangeCheck &IRC : RangeChecks) + IRC.print(OS); }; DEBUG(PrintRecognizedRangeChecks(dbgs())); @@ -1450,14 +1421,14 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { Optional<InductiveRangeCheck::Range> SafeIterRange; Instruction *ExprInsertPt = Preheader->getTerminator(); - SmallVector<InductiveRangeCheck *, 4> RangeChecksToEliminate; + SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate; IRBuilder<> B(ExprInsertPt); - for (InductiveRangeCheck *IRC : RangeChecks) { - auto Result = IRC->computeSafeIterationSpace(SE, IndVar, B); + for (InductiveRangeCheck &IRC : RangeChecks) { + auto Result = IRC.computeSafeIterationSpace(SE, IndVar); if (Result.hasValue()) { auto MaybeSafeIterRange = - IntersectRange(SE, SafeIterRange, Result.getValue(), B); + IntersectRange(SE, SafeIterRange, Result.getValue()); if (MaybeSafeIterRange.hasValue()) { RangeChecksToEliminate.push_back(IRC); SafeIterRange = MaybeSafeIterRange.getValue(); @@ -1487,11 +1458,11 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { // Optimize away the now-redundant range checks. - for (InductiveRangeCheck *IRC : RangeChecksToEliminate) { - ConstantInt *FoldedRangeCheck = IRC->getPassingDirection() + for (InductiveRangeCheck &IRC : RangeChecksToEliminate) { + ConstantInt *FoldedRangeCheck = IRC.getPassingDirection() ? 
ConstantInt::getTrue(Context) : ConstantInt::getFalse(Context); - IRC->getBranch()->setCondition(FoldedRangeCheck); + IRC.getCheckUse()->set(FoldedRangeCheck); } } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index dcdcfed66e641..b9e717cf763e2 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -11,31 +11,25 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -46,6 +40,7 @@ #include <algorithm> #include <memory> using namespace llvm; +using namespace jumpthreading; #define DEBUG_TYPE "jump-threading" @@ -66,17 +61,6 @@ ImplicationSearchThreshold( cl::init(3), cl::Hidden); namespace { - // These are at global scope so static functions can use them too. - typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; - typedef SmallVector<std::pair<Constant*, BasicBlock*>, 8> PredValueInfoTy; - - // This is used to keep track of what kind of constant we're currently hoping - // to find. - enum ConstantPreference { - WantInteger, - WantBlockAddress - }; - /// This pass performs 'jump threading', which looks at blocks that have /// multiple predecessors and multiple successors. If one or more of the /// predecessors of the block can be proven to always jump to one of the @@ -94,89 +78,31 @@ namespace { /// revectored to the false side of the second if. /// class JumpThreading : public FunctionPass { - TargetLibraryInfo *TLI; - LazyValueInfo *LVI; - std::unique_ptr<BlockFrequencyInfo> BFI; - std::unique_ptr<BranchProbabilityInfo> BPI; - bool HasProfileData; -#ifdef NDEBUG - SmallPtrSet<const BasicBlock *, 16> LoopHeaders; -#else - SmallSet<AssertingVH<const BasicBlock>, 16> LoopHeaders; -#endif - DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet; - - unsigned BBDupThreshold; - - // RAII helper for updating the recursion stack. - struct RecursionSetRemover { - DenseSet<std::pair<Value*, BasicBlock*> > &TheSet; - std::pair<Value*, BasicBlock*> ThePair; - - RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S, - std::pair<Value*, BasicBlock*> P) - : TheSet(S), ThePair(P) { } - - ~RecursionSetRemover() { - TheSet.erase(ThePair); - } - }; + JumpThreadingPass Impl; + public: static char ID; // Pass identification - JumpThreading(int T = -1) : FunctionPass(ID) { - BBDupThreshold = (T == -1) ? 
BBDuplicateThreshold : unsigned(T); + JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) { initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LazyValueInfo>(); - AU.addPreserved<LazyValueInfo>(); + AU.addRequired<LazyValueInfoWrapperPass>(); + AU.addPreserved<LazyValueInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } - void releaseMemory() override { - BFI.reset(); - BPI.reset(); - } - - void FindLoopHeaders(Function &F); - bool ProcessBlock(BasicBlock *BB); - bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, - BasicBlock *SuccBB); - bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, - const SmallVectorImpl<BasicBlock *> &PredBBs); - - bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, - PredValueInfo &Result, - ConstantPreference Preference, - Instruction *CxtI = nullptr); - bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB, - ConstantPreference Preference, - Instruction *CxtI = nullptr); - - bool ProcessBranchOnPHI(PHINode *PN); - bool ProcessBranchOnXOR(BinaryOperator *BO); - bool ProcessImpliedCondition(BasicBlock *BB); - - bool SimplifyPartiallyRedundantLoad(LoadInst *LI); - bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); - bool TryToUnfoldSelectInCurrBB(BasicBlock *BB); - - private: - BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, - const char *Suffix); - void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB, - BasicBlock *NewBB, BasicBlock *SuccBB); + void releaseMemory() override { Impl.releaseMemory(); } }; } char JumpThreading::ID = 0; INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading", false, false) -INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) @@ -184,24 +110,72 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading", // Public interface to the Jump Threading pass FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); } +JumpThreadingPass::JumpThreadingPass(int T) { + BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); +} + /// runOnFunction - Top level algorithm. 
/// bool JumpThreading::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) + if (skipFunction(F)) return false; + auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI(); + std::unique_ptr<BlockFrequencyInfo> BFI; + std::unique_ptr<BranchProbabilityInfo> BPI; + bool HasProfileData = F.getEntryCount().hasValue(); + if (HasProfileData) { + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } + return Impl.runImpl(F, TLI, LVI, HasProfileData, std::move(BFI), + std::move(BPI)); +} + +PreservedAnalyses JumpThreadingPass::run(Function &F, + AnalysisManager<Function> &AM) { + + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &LVI = AM.getResult<LazyValueAnalysis>(F); + std::unique_ptr<BlockFrequencyInfo> BFI; + std::unique_ptr<BranchProbabilityInfo> BPI; + bool HasProfileData = F.getEntryCount().hasValue(); + if (HasProfileData) { + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } + bool Changed = + runImpl(F, &TLI, &LVI, HasProfileData, std::move(BFI), std::move(BPI)); + + // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better + // solution? + AM.invalidate<LazyValueAnalysis>(F); + + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + return PA; +} + +bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, + LazyValueInfo *LVI_, bool HasProfileData_, + std::unique_ptr<BlockFrequencyInfo> BFI_, + std::unique_ptr<BranchProbabilityInfo> BPI_) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - LVI = &getAnalysis<LazyValueInfo>(); + TLI = TLI_; + LVI = LVI_; BFI.reset(); BPI.reset(); // When profile data is available, we need to update edge weights after // successful jump threading, which requires both BPI and BFI being available. - HasProfileData = F.getEntryCount().hasValue(); + HasProfileData = HasProfileData_; if (HasProfileData) { - LoopInfo LI{DominatorTree(F)}; - BPI.reset(new BranchProbabilityInfo(F, LI)); - BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + BPI = std::move(BPI_); + BFI = std::move(BFI_); } // Remove unreachable blocks from function as they may result in infinite @@ -245,10 +219,13 @@ bool JumpThreading::runOnFunction(Function &F) { // Can't thread an unconditional jump, but if the block is "almost // empty", we can replace uses of it with uses of the successor and make // this dead. + // We should not eliminate the loop header either, because eliminating + // a loop header might later prevent LoopSimplify from transforming nested + // loops into simplified form. if (BI && BI->isUnconditional() && BB != &BB->getParent()->getEntryBlock() && // If the terminator is the only non-phi instruction, try to nuke it. - BB->getFirstNonPHIOrDbg()->isTerminator()) { + BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) { // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the // block, we have to make sure it isn't in the LoopHeaders set. We // reinsert afterward if needed. @@ -361,7 +338,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, /// enough to track all of these properties and keep it up-to-date as the CFG /// mutates, so we don't allow any of these transformations. 
/// -void JumpThreading::FindLoopHeaders(Function &F) { +void JumpThreadingPass::FindLoopHeaders(Function &F) { SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges; FindFunctionBackedges(F, Edges); @@ -395,10 +372,9 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { /// /// This returns true if there were any known values. /// -bool JumpThreading:: -ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, - ConstantPreference Preference, - Instruction *CxtI) { +bool JumpThreadingPass::ComputeValueKnownInPredecessors( + Value *V, BasicBlock *BB, PredValueInfo &Result, + ConstantPreference Preference, Instruction *CxtI) { // This method walks up use-def chains recursively. Because of this, we could // get into an infinite loop going around loops in the use-def chain. To // prevent this, keep track of what (value, block) pairs we've already visited @@ -415,7 +391,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, for (BasicBlock *Pred : predecessors(BB)) Result.push_back(std::make_pair(KC, Pred)); - return true; + return !Result.empty(); } // If V is a non-instruction value, or an instruction in a different block, @@ -465,6 +441,25 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, return !Result.empty(); } + // Handle Cast instructions. Only see through Cast when the source operand is + // PHI or Cmp and the source type is i1 to save the compilation time. + if (CastInst *CI = dyn_cast<CastInst>(I)) { + Value *Source = CI->getOperand(0); + if (!Source->getType()->isIntegerTy(1)) + return false; + if (!isa<PHINode>(Source) && !isa<CmpInst>(Source)) + return false; + ComputeValueKnownInPredecessors(Source, BB, Result, Preference, CxtI); + if (Result.empty()) + return false; + + // Convert the known values. + for (auto &R : Result) + R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType()); + + return true; + } + PredValueInfoTy LHSVals, RHSVals; // Handle some boolean conditions. @@ -705,7 +700,7 @@ static bool hasAddressTakenAndUsed(BasicBlock *BB) { /// ProcessBlock - If there are any predecessors whose control can be threaded /// through to a successor, transform them now. -bool JumpThreading::ProcessBlock(BasicBlock *BB) { +bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // If the block is trivially dead, just return and let the caller nuke it. // This simplifies other transformations. 
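ComputeValueKnownInPredecessors, refactored above (and now able to look through casts of i1 values), is what powers the classic threading case. A source-level picture with invented names:

int classicThread(bool c, int x) {
  int y;
  if (c) y = x + 1; else y = x - 1;   // 'c' is known on each incoming edge
  // ... code with no side effects on 'c' ...
  if (c)                               // value known per predecessor:
    return y * 2;                      // thread the true edge straight here
  return y;                            // and the false edge straight here
}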
if (pred_empty(BB) && @@ -889,7 +884,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return false; } -bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) { +bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) { auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); if (!BI || !BI->isConditional()) return false; @@ -903,12 +898,17 @@ bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) { while (CurrentPred && Iter++ < ImplicationSearchThreshold) { auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator()); - if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB) + if (!PBI || !PBI->isConditional()) + return false; + if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB) return false; - if (isImpliedCondition(PBI->getCondition(), Cond, DL)) { - BI->getSuccessor(1)->removePredecessor(BB); - BranchInst::Create(BI->getSuccessor(0), BI); + bool FalseDest = PBI->getSuccessor(1) == CurrentBB; + Optional<bool> Implication = + isImpliedCondition(PBI->getCondition(), Cond, DL, FalseDest); + if (Implication) { + BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB); + BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI); BI->eraseFromParent(); return true; } @@ -923,9 +923,9 @@ bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) { /// load instruction, eliminate it by replacing it with a PHI node. This is an /// important optimization that encourages jump threading, and needs to be run /// interlaced with other jump threading tasks. -bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { - // Don't hack volatile/atomic loads. - if (!LI->isSimple()) return false; +bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { + // Don't hack volatile and ordered loads. + if (!LI->isUnordered()) return false; // If the load is defined in a block with exactly one predecessor, it can't be // partially redundant. @@ -952,10 +952,9 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { BasicBlock::iterator BBIt(LI); if (Value *AvailableVal = - FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) { + FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan)) { // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. - //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n"; // If the returned value is the load itself, replace with an undef. This can // only happen in dead loops. @@ -994,7 +993,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. 
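ProcessImpliedCondition, updated earlier in this hunk, now handles reaching the block along either successor of the predecessor's branch and folds using the implication in both directions. A source-level illustration (invented function; whether a given pair of compares folds depends on what isImpliedCondition can prove):

int impliedCases(int x) {
  if (x > 10) {
    if (x > 5)     // implied true: reached on the true edge of x > 10
      return 1;
    return 2;      // dead after folding
  } else {
    if (x > 20)    // implied false: x <= 10 refutes x > 20
      return 3;    // dead after folding
  }
  return 0;
}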
BBIt = PredBB->end(); AAMDNodes ThisAATags; - Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, + Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt, DefMaxInstsToScan, nullptr, &ThisAATags); if (!PredAvailable) { @@ -1056,9 +1055,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (UnavailablePred) { assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 && "Can't handle critical edge here!"); - LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false, - LI->getAlignment(), - UnavailablePred->getTerminator()); + LoadInst *NewVal = + new LoadInst(LoadedPtr, LI->getName() + ".pr", false, + LI->getAlignment(), LI->getOrdering(), LI->getSynchScope(), + UnavailablePred->getTerminator()); NewVal->setDebugLoc(LI->getDebugLoc()); if (AATags) NewVal->setAAMetadata(AATags); @@ -1100,8 +1100,6 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { PN->addIncoming(PredV, I->first); } - //cerr << "PRE: " << *LI << *PN << "\n"; - LI->replaceAllUsesWith(PN); LI->eraseFromParent(); @@ -1171,9 +1169,9 @@ FindMostPopularDest(BasicBlock *BB, return MostPopularDest; } -bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, - ConstantPreference Preference, - Instruction *CxtI) { +bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, + ConstantPreference Preference, + Instruction *CxtI) { // If threading this would thread across a loop header, don't even try to // thread the edge. if (LoopHeaders.count(BB)) @@ -1279,7 +1277,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, /// a PHI node in the current block. See if there are any simplifications we /// can do based on inputs to the phi node. /// -bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) { +bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) { BasicBlock *BB = PN->getParent(); // TODO: We could make use of this to do it once for blocks with common PHI @@ -1309,7 +1307,7 @@ bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) { /// a xor instruction in the current block. See if there are any /// simplifications we can do based on inputs to the xor. /// -bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { +bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { BasicBlock *BB = BO->getParent(); // If either the LHS or RHS of the xor is a constant, don't do this @@ -1437,9 +1435,9 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, /// ThreadEdge - We have decided that it is safe and profitable to factor the /// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB /// across BB. Transform the IR to reflect this change. -bool JumpThreading::ThreadEdge(BasicBlock *BB, - const SmallVectorImpl<BasicBlock*> &PredBBs, - BasicBlock *SuccBB) { +bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { // If threading to the same block as we come from, we would infinite loop. if (SuccBB == BB) { DEBUG(dbgs() << " Not threading across BB '" << BB->getName() @@ -1593,9 +1591,9 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, /// Create a new basic block that will be the predecessor of BB and successor of /// all blocks in Preds. When profile data is availble, update the frequency of /// this new block. 
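SimplifyPartiallyRedundantLoad, continued above, has this source-level shape; note that the relaxed isUnordered() guard now admits unordered atomic loads as well. Names are illustrative:

int partialLoad(int *p, bool c) {
  int t = 0;
  if (c)
    t = *p + 1;    // *p already loaded on this path
  return t + *p;   // partially redundant: only missing on the '!c' path,
                   // so a load is inserted there and merged with a PHI
}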
-BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB, - ArrayRef<BasicBlock *> Preds, - const char *Suffix) { +BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB, + ArrayRef<BasicBlock *> Preds, + const char *Suffix) { // Collect the frequencies of all predecessors of BB, which will be used to // update the edge weight on BB->SuccBB. BlockFrequency PredBBFreq(0); @@ -1615,10 +1613,10 @@ BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB, /// Update the block frequency of BB and branch weight and the metadata on the /// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - /// Freq(PredBB->BB) / Freq(BB->SuccBB). -void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, - BasicBlock *BB, - BasicBlock *NewBB, - BasicBlock *SuccBB) { +void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, + BasicBlock *BB, + BasicBlock *NewBB, + BasicBlock *SuccBB) { if (!HasProfileData) return; @@ -1679,8 +1677,8 @@ void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, /// If we can duplicate the contents of BB up into PredBB do so now, this /// improves the odds that the branch will be on an analyzable instruction like /// a compare. -bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, - const SmallVectorImpl<BasicBlock *> &PredBBs) { +bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( + BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) { assert(!PredBBs.empty() && "Can't handle an empty set"); // If BB is a loop header, then duplicating this block outside the loop would @@ -1750,13 +1748,18 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // phi translation. if (Value *IV = SimplifyInstruction(New, BB->getModule()->getDataLayout())) { - delete New; ValueMapping[&*BI] = IV; + if (!New->mayHaveSideEffects()) { + delete New; + New = nullptr; + } } else { + ValueMapping[&*BI] = New; + } + if (New) { // Otherwise, insert the new instruction into the block. New->setName(BI->getName()); PredBB->getInstList().insert(OldPredBranch->getIterator(), New); - ValueMapping[&*BI] = New; } } @@ -1829,7 +1832,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, /// /// And expand the select into a branch structure if one of its arms allows %c /// to be folded. This later enables threading from bb1 over bb2. -bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { +bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0)); Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1)); @@ -1907,7 +1910,7 @@ bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { /// select if the associated PHI has at least one constant. If the unfolded /// select is not jump-threaded, it will be folded again in the later /// optimizations. -bool JumpThreading::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { +bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { // If threading this would thread across a loop header, don't thread the edge. // See the comments above FindLoopHeaders for justifications and caveats. 
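A worked version of the block-frequency comment above; the helper is a standalone sketch of the scaling rule, ignoring the saturating/overflow handling the real BlockFrequency arithmetic provides:

#include <cstdint>

// After Pred is threaded directly to SuccBB, the flow that used to go
// Pred->BB->SuccBB no longer passes through BB, so the BB->SuccBB weight is
// scaled by 1 - Freq(Pred->BB) / Freq(BB->SuccBB).
uint64_t scaledWeight(uint64_t OldWeight, uint64_t FreqPredToBB,
                      uint64_t FreqBBToSucc) {
  if (FreqBBToSucc == 0 || FreqPredToBB >= FreqBBToSucc)
    return 0;   // all of the mass was rerouted around BB
  return OldWeight - OldWeight * FreqPredToBB / FreqBBToSucc;
}
// Example: OldWeight = 80, Freq(Pred->BB) = 30, Freq(BB->SuccBB) = 120
// gives 80 * (1 - 30/120) = 60.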
if (LoopHeaders.count(BB)) diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 8923ff74253c1..2c0a70e44f574 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -30,15 +30,19 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LICM.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopPassManager.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -56,183 +60,173 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> +#include <utility> using namespace llvm; #define DEBUG_TYPE "licm" -STATISTIC(NumSunk , "Number of instructions sunk out of loop"); -STATISTIC(NumHoisted , "Number of instructions hoisted out of loop"); +STATISTIC(NumSunk, "Number of instructions sunk out of loop"); +STATISTIC(NumHoisted, "Number of instructions hoisted out of loop"); STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk"); STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk"); -STATISTIC(NumPromoted , "Number of memory locations promoted to registers"); +STATISTIC(NumPromoted, "Number of memory locations promoted to registers"); static cl::opt<bool> -DisablePromotion("disable-licm-promotion", cl::Hidden, - cl::desc("Disable memory promotion in LICM pass")); + DisablePromotion("disable-licm-promotion", cl::Hidden, + cl::desc("Disable memory promotion in LICM pass")); static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, - const LICMSafetyInfo *SafetyInfo); -static bool hoist(Instruction &I, BasicBlock *Preheader); + const LoopSafetyInfo *SafetyInfo); +static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, + const LoopSafetyInfo *SafetyInfo); static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, const Loop *CurLoop, AliasSetTracker *CurAST, - const LICMSafetyInfo *SafetyInfo); -static bool isGuaranteedToExecute(const Instruction &Inst, - const DominatorTree *DT, - const Loop *CurLoop, - const LICMSafetyInfo *SafetyInfo); + const LoopSafetyInfo *SafetyInfo); static bool isSafeToExecuteUnconditionally(const Instruction &Inst, const DominatorTree *DT, - const TargetLibraryInfo *TLI, const Loop *CurLoop, - const LICMSafetyInfo *SafetyInfo, + const LoopSafetyInfo *SafetyInfo, const Instruction *CtxI = nullptr); static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, - const AAMDNodes &AAInfo, + const AAMDNodes &AAInfo, AliasSetTracker *CurAST); static Instruction * CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI, - const LICMSafetyInfo *SafetyInfo); + const LoopSafetyInfo *SafetyInfo); static bool 
canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, - LICMSafetyInfo *SafetyInfo); + LoopSafetyInfo *SafetyInfo); namespace { - struct LICM : public LoopPass { - static char ID; // Pass identification, replacement for typeid - LICM() : LoopPass(ID) { - initializeLICMPass(*PassRegistry::getPassRegistry()); - } +struct LoopInvariantCodeMotion { + bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, + TargetLibraryInfo *TLI, ScalarEvolution *SE, bool DeleteAST); - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG... - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<AAResultsWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - } + DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() { + return LoopToAliasSetMap; + } + +private: + DenseMap<Loop *, AliasSetTracker *> LoopToAliasSetMap; - using llvm::Pass::doFinalization; + AliasSetTracker *collectAliasInfoForLoop(Loop *L, LoopInfo *LI, + AliasAnalysis *AA); +}; + +struct LegacyLICMPass : public LoopPass { + static char ID; // Pass identification, replacement for typeid + LegacyLICMPass() : LoopPass(ID) { + initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry()); + } - bool doFinalization() override { - assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets"); + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) return false; - } - private: - AliasAnalysis *AA; // Current AliasAnalysis information - LoopInfo *LI; // Current LoopInfo - DominatorTree *DT; // Dominator Tree for the current Loop. + auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + return LICM.runOnLoop(L, + &getAnalysis<AAResultsWrapperPass>().getAAResults(), + &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), + &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + SE ? &SE->getSE() : nullptr, false); + } - TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + getLoopAnalysisUsage(AU); + } - // State that is updated as we process loops. - bool Changed; // Set to true when we change anything. - BasicBlock *Preheader; // The preheader block of the current loop... - Loop *CurLoop; // The current loop we are working on... - AliasSetTracker *CurAST; // AliasSet information for the current loop... - DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; + using llvm::Pass::doFinalization; - /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. 
- void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, - Loop *L) override; + bool doFinalization() override { + assert(LICM.getLoopToAliasSetMap().empty() && + "Didn't free loop alias sets"); + return false; + } - /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias - /// set. - void deleteAnalysisValue(Value *V, Loop *L) override; +private: + LoopInvariantCodeMotion LICM; - /// Simple Analysis hook. Delete loop L from alias set map. - void deleteAnalysisLoop(Loop *L) override; - }; + /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. + void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, + Loop *L) override; + + /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias + /// set. + void deleteAnalysisValue(Value *V, Loop *L) override; + + /// Simple Analysis hook. Delete loop L from alias set map. + void deleteAnalysisLoop(Loop *L) override; +}; +} + +PreservedAnalyses LICMPass::run(Loop &L, AnalysisManager<Loop> &AM) { + const auto &FAM = + AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + Function *F = L.getHeader()->getParent(); + + auto *AA = FAM.getCachedResult<AAManager>(*F); + auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); + auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); + auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F); + auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); + assert((AA && LI && DT && TLI && SE) && "Analyses for LICM not available"); + + LoopInvariantCodeMotion LICM; + + if (!LICM.runOnLoop(&L, AA, LI, DT, TLI, SE, true)) + return PreservedAnalyses::all(); + + // FIXME: There is no setPreservesCFG in the new PM. When that becomes + // available, it should be used here. + return getLoopPassPreservedAnalyses(); } -char LICM::ID = 0; -INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +char LegacyLICMPass::ID = 0; +INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) -INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, + false) -Pass *llvm::createLICMPass() { return new LICM(); } +Pass *llvm::createLICMPass() { return new LegacyLICMPass(); } /// Hoist expressions out of the specified loop. Note, alias info for inner /// loop is not preserved so it is not a good idea to run LICM multiple /// times on one loop. +/// We should delete AST for inner loops in the new pass manager to avoid +/// memory leak. /// -bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - Changed = false; - - // Get our Loop and Alias Analysis information... 
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); +bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, + LoopInfo *LI, DominatorTree *DT, + TargetLibraryInfo *TLI, + ScalarEvolution *SE, bool DeleteAST) { + bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); - CurAST = new AliasSetTracker(*AA); - // Collect Alias info from subloops. - for (Loop *InnerL : L->getSubLoops()) { - AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL]; - assert(InnerAST && "Where is my AST?"); - - // What if InnerLoop was modified by other passes ? - CurAST->add(*InnerAST); - - // Once we've incorporated the inner loop's AST into ours, we don't need the - // subloop's anymore. - delete InnerAST; - LoopToAliasSetMap.erase(InnerL); - } - - CurLoop = L; + AliasSetTracker *CurAST = collectAliasInfoForLoop(L, LI, AA); // Get the preheader block to move instructions into... - Preheader = L->getLoopPreheader(); - - // Loop over the body of this loop, looking for calls, invokes, and stores. - // Because subloops have already been incorporated into AST, we skip blocks in - // subloops. - // - for (BasicBlock *BB : L->blocks()) { - if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops. - CurAST->add(*BB); // Incorporate the specified basic block - } + BasicBlock *Preheader = L->getLoopPreheader(); // Compute loop safety information. - LICMSafetyInfo SafetyInfo; - computeLICMSafetyInfo(&SafetyInfo, CurLoop); + LoopSafetyInfo SafetyInfo; + computeLoopSafetyInfo(&SafetyInfo, L); // We want to visit all of the instructions in this loop... that are not parts // of our subloops (they have already had their invariants hoisted out of @@ -245,11 +239,11 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // instructions, we perform another pass to hoist them out of the loop. // if (L->hasDedicatedExits()) - Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop, + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, CurAST, &SafetyInfo); if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, - CurLoop, CurAST, &SafetyInfo); + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + CurAST, &SafetyInfo); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -260,9 +254,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Loop over all of the alias sets in the tracker object. for (AliasSet &AS : *CurAST) - Changed |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, - PIC, LI, DT, CurLoop, - CurAST, &SafetyInfo); + Changed |= promoteLoopAccessesToScalars( + AS, ExitBlocks, InsertPts, PIC, LI, DT, TLI, L, CurAST, &SafetyInfo); // Once we have promoted values across the loop body we have to recursively // reform LCSSA as any nested loop may now have values defined within the @@ -271,8 +264,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // SSAUpdater strategy during promotion that was LCSSA aware and reformed // it as it went. if (Changed) { - auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); - formLCSSARecursively(*L, *DT, LI, SEWP ? 
&SEWP->getSE() : nullptr); + formLCSSARecursively(*L, *DT, LI, SE); } } @@ -283,50 +275,49 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) && "Parent loop not left in LCSSA form after LICM!"); - // Clear out loops state information for the next iteration - CurLoop = nullptr; - Preheader = nullptr; - // If this loop is nested inside of another one, save the alias information // for when we process the outer loop. - if (L->getParentLoop()) + if (L->getParentLoop() && !DeleteAST) LoopToAliasSetMap[L] = CurAST; else delete CurAST; + + if (Changed && SE) + SE->forgetLoopDispositions(L); return Changed; } /// Walk the specified region of the CFG (defined by all blocks dominated by -/// the specified block, and that are in the current loop) in reverse depth +/// the specified block, and that are in the current loop) in reverse depth /// first order w.r.t the DominatorTree. This allows us to visit uses before /// definitions, allowing us to sink a loop body in one pass without iteration. /// bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, - AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { + AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) { // Verify inputs. - assert(N != nullptr && AA != nullptr && LI != nullptr && - DT != nullptr && CurLoop != nullptr && CurAST != nullptr && - SafetyInfo != nullptr && "Unexpected input to sinkRegion"); + assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && + CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && + "Unexpected input to sinkRegion"); - // Set changed as false. - bool Changed = false; - // Get basic block BasicBlock *BB = N->getBlock(); // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) return Changed; + if (!CurLoop->contains(BB)) + return false; // We are processing blocks in reverse dfo, so process children first. - const std::vector<DomTreeNode*> &Children = N->getChildren(); + bool Changed = false; + const std::vector<DomTreeNode *> &Children = N->getChildren(); for (DomTreeNode *Child : Children) Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - if (inSubLoop(BB,CurLoop,LI)) return Changed; + if (inSubLoop(BB, CurLoop, LI)) + return Changed; - for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) { + for (BasicBlock::iterator II = BB->end(); II != BB->begin();) { Instruction &I = *--II; // If the instruction is dead, we would try to sink it because it isn't used @@ -361,21 +352,23 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, /// bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, - AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { + AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) { // Verify inputs. - assert(N != nullptr && AA != nullptr && LI != nullptr && - DT != nullptr && CurLoop != nullptr && CurAST != nullptr && - SafetyInfo != nullptr && "Unexpected input to hoistRegion"); - // Set changed as false. 
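sinkRegion above walks the dominator tree children-first ("reverse depth first order") so that uses are processed before the definitions feeding them, which lets the whole loop body be sunk in a single pass. The sketch below models only that traversal order; Node is a placeholder for DomTreeNode and nothing here calls LLVM.

// Children-first walk: every subtree is processed before its parent block.
#include <iostream>
#include <string>
#include <vector>

struct Node {
  std::string Name;
  std::vector<Node *> Children;
};

bool sinkRegion(Node *N) {
  bool Changed = false;
  for (Node *Child : N->Children)              // recurse into children first
    Changed |= sinkRegion(Child);
  std::cout << "processing " << N->Name << '\n'; // then this block
  return Changed;
}

int main() {
  Node C{"loop.latch", {}}, B{"loop.body", {&C}}, A{"loop.header", {&B}};
  sinkRegion(&A); // prints loop.latch, loop.body, loop.header
}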
- bool Changed = false; - // Get basic block + assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && + CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && + "Unexpected input to hoistRegion"); + BasicBlock *BB = N->getBlock(); + // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) return Changed; + if (!CurLoop->contains(BB)) + return false; + // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). + bool Changed = false; if (!inSubLoop(BB, CurLoop, LI)) - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) { + for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { Instruction &I = *II++; // Try constant folding this instruction. If all the operands are // constants, it is technically hoistable, but it would be better to just @@ -396,12 +389,13 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo) && - isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, - CurLoop->getLoopPreheader()->getTerminator())) - Changed |= hoist(I, CurLoop->getLoopPreheader()); + isSafeToExecuteUnconditionally( + I, DT, CurLoop, SafetyInfo, + CurLoop->getLoopPreheader()->getTerminator())) + Changed |= hoist(I, DT, CurLoop, SafetyInfo); } - const std::vector<DomTreeNode*> &Children = N->getChildren(); + const std::vector<DomTreeNode *> &Children = N->getChildren(); for (DomTreeNode *Child : Children) Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); return Changed; @@ -410,7 +404,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, /// Computes loop safety information, checks loop body & header /// for the possibility of may throw exception. /// -void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { +void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { assert(CurLoop != nullptr && "CurLoop cant be null"); BasicBlock *Header = CurLoop->getHeader(); // Setting default safety values. @@ -419,15 +413,17 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { // Iterate over header and compute safety info. for (BasicBlock::iterator I = Header->begin(), E = Header->end(); (I != E) && !SafetyInfo->HeaderMayThrow; ++I) - SafetyInfo->HeaderMayThrow |= I->mayThrow(); - + SafetyInfo->HeaderMayThrow |= + !isGuaranteedToTransferExecutionToSuccessor(&*I); + SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow; - // Iterate over loop instructions and compute safety info. - for (Loop::block_iterator BB = CurLoop->block_begin(), - BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow ; ++BB) + // Iterate over loop instructions and compute safety info. + for (Loop::block_iterator BB = CurLoop->block_begin(), + BBE = CurLoop->block_end(); + (BB != BBE) && !SafetyInfo->MayThrow; ++BB) for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); (I != E) && !SafetyInfo->MayThrow; ++I) - SafetyInfo->MayThrow |= I->mayThrow(); + SafetyInfo->MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(&*I); // Compute funclet colors if we might sink/hoist in a function with a funclet // personality routine. 
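The safety computation above replaces the old mayThrow() test with isGuaranteedToTransferExecutionToSuccessor, which also accounts for instructions that may never reach their successor at all (for example a call that does not return). Here is a simplified stand-alone model of the HeaderMayThrow / MayThrow computation; the Instruction struct and the predicate are placeholders for the real analysis.

#include <iostream>
#include <vector>

struct Instruction { bool MayThrow; bool MayNotReturn; };

static bool guaranteedToTransfer(const Instruction &I) {
  return !I.MayThrow && !I.MayNotReturn;
}

struct SafetyInfo { bool HeaderMayThrow = false; bool MayThrow = false; };

SafetyInfo computeSafety(const std::vector<Instruction> &Header,
                         const std::vector<std::vector<Instruction>> &Blocks) {
  SafetyInfo SI;
  for (const Instruction &I : Header)
    if (!guaranteedToTransfer(I)) { SI.HeaderMayThrow = true; break; }
  SI.MayThrow = SI.HeaderMayThrow;
  for (const auto &BB : Blocks) {
    if (SI.MayThrow) break;
    for (const Instruction &I : BB)
      if (!guaranteedToTransfer(I)) { SI.MayThrow = true; break; }
  }
  return SI;
}

int main() {
  // Header is clean, but one instruction in the body may throw.
  SafetyInfo SI = computeSafety({{false, false}}, {{{true, false}}});
  std::cout << SI.HeaderMayThrow << ' ' << SI.MayThrow << '\n'; // 0 1
}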
@@ -443,11 +439,11 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { /// bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, - AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) { + AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) { // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (!LI->isUnordered()) - return false; // Don't hoist volatile/atomic loads! + return false; // Don't hoist volatile/atomic loads! // Loads from constant memory are always safe to move, even if they end up // in the same alias set as something that ends up being modified. @@ -499,7 +495,8 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, break; } } - if (!FoundMod) return true; + if (!FoundMod) + return true; } // FIXME: This should use mod/ref information to see if we can hoist or @@ -518,9 +515,8 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, // TODO: Plumb the context instruction through to make hoisting and sinking // more powerful. Hoisting of loads already works due to the special casing - // above. - return isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, - nullptr); + // above. + return isSafeToExecuteUnconditionally(I, DT, CurLoop, SafetyInfo, nullptr); } /// Returns true if a PHINode is a trivially replaceable with an @@ -541,7 +537,7 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) { /// blocks of the loop. /// static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, - const LICMSafetyInfo *SafetyInfo) { + const LoopSafetyInfo *SafetyInfo) { const auto &BlockColors = SafetyInfo->BlockColors; for (const User *U : I.users()) { const Instruction *UI = cast<Instruction>(U); @@ -588,7 +584,7 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, static Instruction * CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI, - const LICMSafetyInfo *SafetyInfo) { + const LoopSafetyInfo *SafetyInfo) { Instruction *New; if (auto *CI = dyn_cast<CallInst>(&I)) { const auto &BlockColors = SafetyInfo->BlockColors; @@ -621,7 +617,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, } ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New); - if (!I.getName().empty()) New->setName(I.getName() + ".le"); + if (!I.getName().empty()) + New->setName(I.getName() + ".le"); // Build LCSSA PHI nodes for any in-loop operands. 
Note that this is // particularly cheap because we can rip off the PHI node that we're @@ -652,18 +649,20 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, /// static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, const Loop *CurLoop, AliasSetTracker *CurAST, - const LICMSafetyInfo *SafetyInfo) { + const LoopSafetyInfo *SafetyInfo) { DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); bool Changed = false; - if (isa<LoadInst>(I)) ++NumMovedLoads; - else if (isa<CallInst>(I)) ++NumMovedCalls; + if (isa<LoadInst>(I)) + ++NumMovedLoads; + else if (isa<CallInst>(I)) + ++NumMovedCalls; ++NumSunk; Changed = true; #ifndef NDEBUG SmallVector<BasicBlock *, 32> ExitBlocks; CurLoop->getUniqueExitBlocks(ExitBlocks); - SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), + SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); #endif @@ -717,18 +716,30 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, /// When an instruction is found to only use loop invariant operands that /// is safe to hoist, this instruction is called to do the dirty work. /// -static bool hoist(Instruction &I, BasicBlock *Preheader) { - DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " - << I << "\n"); +static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, + const LoopSafetyInfo *SafetyInfo) { + auto *Preheader = CurLoop->getLoopPreheader(); + DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I + << "\n"); + + // Metadata can be dependent on conditions we are hoisting above. + // Conservatively strip all metadata on the instruction unless we were + // guaranteed to execute I if we entered the loop, in which case the metadata + // is valid in the loop preheader. + if (I.hasMetadataOtherThanDebugLoc() && + // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning + // time in isGuaranteedToExecute if we don't actually have anything to + // drop. It is a compile time optimization, not required for correctness. + !isGuaranteedToExecute(I, DT, CurLoop, SafetyInfo)) + I.dropUnknownNonDebugMetadata(); + // Move the new node to the Preheader, before its terminator. I.moveBefore(Preheader->getTerminator()); - // Metadata can be dependent on the condition we are hoisting above. - // Conservatively strip all metadata on the instruction. - I.dropUnknownNonDebugMetadata(); - - if (isa<LoadInst>(I)) ++NumMovedLoads; - else if (isa<CallInst>(I)) ++NumMovedCalls; + if (isa<LoadInst>(I)) + ++NumMovedLoads; + else if (isa<CallInst>(I)) + ++NumMovedCalls; ++NumHoisted; return true; } @@ -736,134 +747,91 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) { /// Only sink or hoist an instruction if it is not a trapping instruction, /// or if the instruction is known not to trap when moved to the preheader. /// or if it is a trapping instruction and is guaranteed to execute. 
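The reworked hoist() only strips non-debug metadata when the instruction is not guaranteed to execute on entry to the loop, since control-dependent metadata (such as !nonnull or !range) may no longer hold once the instruction sits in the preheader. A compact stand-alone model of that decision, with placeholder types rather than LLVM classes:

#include <iostream>

struct Inst {
  bool HasNonDebugMetadata;
  bool GuaranteedToExecute; // result of the dominator/may-throw analysis
  void dropUnknownNonDebugMetadata() { HasNonDebugMetadata = false; }
};

void hoistToPreheader(Inst &I) {
  if (I.HasNonDebugMetadata && !I.GuaranteedToExecute)
    I.dropUnknownNonDebugMetadata(); // conservative: its guard may be gone
  // ...the instruction would then be moved before the preheader terminator.
}

int main() {
  Inst Conditional{true, false}, Unconditional{true, true};
  hoistToPreheader(Conditional);
  hoistToPreheader(Unconditional);
  std::cout << Conditional.HasNonDebugMetadata << ' '
            << Unconditional.HasNonDebugMetadata << '\n'; // 0 1
}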
-static bool isSafeToExecuteUnconditionally(const Instruction &Inst, +static bool isSafeToExecuteUnconditionally(const Instruction &Inst, const DominatorTree *DT, - const TargetLibraryInfo *TLI, const Loop *CurLoop, - const LICMSafetyInfo *SafetyInfo, + const LoopSafetyInfo *SafetyInfo, const Instruction *CtxI) { - if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) + if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT)) return true; return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo); } -static bool isGuaranteedToExecute(const Instruction &Inst, - const DominatorTree *DT, - const Loop *CurLoop, - const LICMSafetyInfo * SafetyInfo) { - - // We have to check to make sure that the instruction dominates all - // of the exit blocks. If it doesn't, then there is a path out of the loop - // which does not execute this instruction, so we can't hoist it. - - // If the instruction is in the header block for the loop (which is very - // common), it is always guaranteed to dominate the exit blocks. Since this - // is a common case, and can save some work, check it now. - if (Inst.getParent() == CurLoop->getHeader()) - // If there's a throw in the header block, we can't guarantee we'll reach - // Inst. - return !SafetyInfo->HeaderMayThrow; - - // Somewhere in this loop there is an instruction which may throw and make us - // exit the loop. - if (SafetyInfo->MayThrow) - return false; - - // Get the exit blocks for the current loop. - SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getExitBlocks(ExitBlocks); - - // Verify that the block dominates each of the exit blocks of the loop. - for (BasicBlock *ExitBlock : ExitBlocks) - if (!DT->dominates(Inst.getParent(), ExitBlock)) - return false; - - // As a degenerate case, if the loop is statically infinite then we haven't - // proven anything since there are no exit blocks. - if (ExitBlocks.empty()) - return false; - - return true; -} - namespace { - class LoopPromoter : public LoadAndStorePromoter { - Value *SomePtr; // Designated pointer to store to. - SmallPtrSetImpl<Value*> &PointerMustAliases; - SmallVectorImpl<BasicBlock*> &LoopExitBlocks; - SmallVectorImpl<Instruction*> &LoopInsertPts; - PredIteratorCache &PredCache; - AliasSetTracker &AST; - LoopInfo &LI; - DebugLoc DL; - int Alignment; - AAMDNodes AATags; - - Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const { - if (Instruction *I = dyn_cast<Instruction>(V)) - if (Loop *L = LI.getLoopFor(I->getParent())) - if (!L->contains(BB)) { - // We need to create an LCSSA PHI node for the incoming value and - // store that. - PHINode *PN = - PHINode::Create(I->getType(), PredCache.size(BB), - I->getName() + ".lcssa", &BB->front()); - for (BasicBlock *Pred : PredCache.get(BB)) - PN->addIncoming(I, Pred); - return PN; - } - return V; - } +class LoopPromoter : public LoadAndStorePromoter { + Value *SomePtr; // Designated pointer to store to. 
+ SmallPtrSetImpl<Value *> &PointerMustAliases; + SmallVectorImpl<BasicBlock *> &LoopExitBlocks; + SmallVectorImpl<Instruction *> &LoopInsertPts; + PredIteratorCache &PredCache; + AliasSetTracker &AST; + LoopInfo &LI; + DebugLoc DL; + int Alignment; + AAMDNodes AATags; - public: - LoopPromoter(Value *SP, - ArrayRef<const Instruction *> Insts, - SSAUpdater &S, SmallPtrSetImpl<Value *> &PMA, - SmallVectorImpl<BasicBlock *> &LEB, - SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, - AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment, - const AAMDNodes &AATags) - : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), - LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast), - LI(li), DL(dl), Alignment(alignment), AATags(AATags) {} - - bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &) const override { - Value *Ptr; - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - Ptr = LI->getOperand(0); - else - Ptr = cast<StoreInst>(I)->getPointerOperand(); - return PointerMustAliases.count(Ptr); - } + Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const { + if (Instruction *I = dyn_cast<Instruction>(V)) + if (Loop *L = LI.getLoopFor(I->getParent())) + if (!L->contains(BB)) { + // We need to create an LCSSA PHI node for the incoming value and + // store that. + PHINode *PN = PHINode::Create(I->getType(), PredCache.size(BB), + I->getName() + ".lcssa", &BB->front()); + for (BasicBlock *Pred : PredCache.get(BB)) + PN->addIncoming(I, Pred); + return PN; + } + return V; + } - void doExtraRewritesBeforeFinalDeletion() const override { - // Insert stores after in the loop exit blocks. Each exit block gets a - // store of the live-out values that feed them. Since we've already told - // the SSA updater about the defs in the loop and the preheader - // definition, it is all set and we can start using it. - for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { - BasicBlock *ExitBlock = LoopExitBlocks[i]; - Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); - LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock); - Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock); - Instruction *InsertPos = LoopInsertPts[i]; - StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); - NewSI->setAlignment(Alignment); - NewSI->setDebugLoc(DL); - if (AATags) NewSI->setAAMetadata(AATags); - } - } +public: + LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S, + SmallPtrSetImpl<Value *> &PMA, + SmallVectorImpl<BasicBlock *> &LEB, + SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, + AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment, + const AAMDNodes &AATags) + : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), + LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast), + LI(li), DL(std::move(dl)), Alignment(alignment), AATags(AATags) {} + + bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction *> &) const override { + Value *Ptr; + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + Ptr = LI->getOperand(0); + else + Ptr = cast<StoreInst>(I)->getPointerOperand(); + return PointerMustAliases.count(Ptr); + } - void replaceLoadWithValue(LoadInst *LI, Value *V) const override { - // Update alias analysis. - AST.copyValue(LI, V); + void doExtraRewritesBeforeFinalDeletion() const override { + // Insert stores after in the loop exit blocks. Each exit block gets a + // store of the live-out values that feed them. 
Since we've already told + // the SSA updater about the defs in the loop and the preheader + // definition, it is all set and we can start using it. + for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = LoopExitBlocks[i]; + Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock); + Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock); + Instruction *InsertPos = LoopInsertPts[i]; + StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); + NewSI->setAlignment(Alignment); + NewSI->setDebugLoc(DL); + if (AATags) + NewSI->setAAMetadata(AATags); } - void instructionDeleted(Instruction *I) const override { - AST.deleteValue(I); - } - }; + } + + void replaceLoadWithValue(LoadInst *LI, Value *V) const override { + // Update alias analysis. + AST.copyValue(LI, V); + } + void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); } +}; } // end anon namespace /// Try to promote memory values to scalars by sinking stores out of the @@ -871,32 +839,28 @@ namespace { /// the stores in the loop, looking for stores to Must pointers which are /// loop invariant. /// -bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, - SmallVectorImpl<BasicBlock*>&ExitBlocks, - SmallVectorImpl<Instruction*>&InsertPts, - PredIteratorCache &PIC, LoopInfo *LI, - DominatorTree *DT, Loop *CurLoop, - AliasSetTracker *CurAST, - LICMSafetyInfo * SafetyInfo) { +bool llvm::promoteLoopAccessesToScalars( + AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks, + SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC, + LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, + Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) { // Verify inputs. - assert(LI != nullptr && DT != nullptr && - CurLoop != nullptr && CurAST != nullptr && - SafetyInfo != nullptr && + assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && + CurAST != nullptr && SafetyInfo != nullptr && "Unexpected Input to promoteLoopAccessesToScalars"); - // Initially set Changed status to false. - bool Changed = false; + // We can promote this alias set if it has a store, if it is a "Must" alias // set, if the pointer is loop invariant, and if we are not eliminating any // volatile loads or stores. if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue())) - return Changed; + return false; assert(!AS.empty() && "Must alias set should have at least one pointer element in it!"); Value *SomePtr = AS.begin()->getValue(); - BasicBlock * Preheader = CurLoop->getLoopPreheader(); + BasicBlock *Preheader = CurLoop->getLoopPreheader(); // It isn't safe to promote a load/store from the loop if the load/store is // conditional. For example, turning: @@ -909,12 +873,27 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, // // is not safe, because *P may only be valid to access if 'c' is true. // + // The safety property divides into two parts: + // 1) The memory may not be dereferenceable on entry to the loop. In this + // case, we can't insert the required load in the preheader. + // 2) The memory model does not allow us to insert a store along any dynamic + // path which did not originally have one. + // // It is safe to promote P if all uses are direct load/stores and if at // least one is guaranteed to be executed. 
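The comments above describe a two-part legality rule for promotion: either some access to the location is guaranteed to execute, or a load can be speculated into the preheader and the underlying object can be shown to be thread-local (alloc-like and never captured), so inserting stores on new paths cannot violate the memory model. A stand-alone sketch of just that boolean combination, with placeholder predicates standing in for the real analyses:

#include <iostream>

struct PromotionFacts {
  bool GuaranteedToExecute; // some load/store of the location always runs
  bool CanSpeculateLoad;    // pointer dereferenceable at the preheader
  bool ObjectIsAllocLike;   // e.g. result of an allocation-like call
  bool ObjectIsCaptured;    // address may be visible to other threads
};

bool promotionIsLegal(const PromotionFacts &F) {
  if (F.GuaranteedToExecute)
    return true;
  return F.CanSpeculateLoad && F.ObjectIsAllocLike && !F.ObjectIsCaptured;
}

int main() {
  std::cout << promotionIsLegal({false, true, true, false}) << '\n';  // 1
  std::cout << promotionIsLegal({false, true, false, false}) << '\n'; // 0
}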
bool GuaranteedToExecute = false; - SmallVector<Instruction*, 64> LoopUses; - SmallPtrSet<Value*, 4> PointerMustAliases; + // It is also safe to promote P if we can prove that speculating a load into + // the preheader is safe (i.e. proving dereferenceability on all + // paths through the loop), and that the memory can be proven thread local + // (so that the memory model requirement doesn't apply.) We first establish + // the former, and then run a capture analysis below to establish the later. + // We can use any access within the alias set to prove dereferenceability + // since they're all must alias. + bool CanSpeculateLoad = false; + + SmallVector<Instruction *, 64> LoopUses; + SmallPtrSet<Value *, 4> PointerMustAliases; // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. @@ -922,11 +901,32 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, AAMDNodes AATags; bool HasDedicatedExits = CurLoop->hasDedicatedExits(); + // Don't sink stores from loops without dedicated block exits. Exits + // containing indirect branches are not transformed by loop simplify, + // make sure we catch that. An additional load may be generated in the + // preheader for SSA updater, so also avoid sinking when no preheader + // is available. + if (!HasDedicatedExits || !Preheader) + return false; + + const DataLayout &MDL = Preheader->getModule()->getDataLayout(); + + if (SafetyInfo->MayThrow) { + // If a loop can throw, we have to insert a store along each unwind edge. + // That said, we can't actually make the unwind edge explicit. Therefore, + // we have to prove that the store is dead along the unwind edge. + // + // Currently, this code just special-cases alloca instructions. + if (!isa<AllocaInst>(GetUnderlyingObject(SomePtr, MDL))) + return false; + } + // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in // different sizes. While we are at it, collect alignment and AA info. - for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) { - Value *ASIV = ASI->getValue(); + bool Changed = false; + for (const auto &ASI : AS) { + Value *ASIV = ASI.getValue(); PointerMustAliases.insert(ASIV); // Check that all of the pointers in the alias set have the same type. We @@ -947,6 +947,10 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, assert(!Load->isVolatile() && "AST broken"); if (!Load->isSimple()) return Changed; + + if (!GuaranteedToExecute && !CanSpeculateLoad) + CanSpeculateLoad = isSafeToExecuteUnconditionally( + *Load, DT, CurLoop, SafetyInfo, Preheader->getTerminator()); } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. @@ -955,13 +959,6 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, assert(!Store->isVolatile() && "AST broken"); if (!Store->isSimple()) return Changed; - // Don't sink stores from loops without dedicated block exits. Exits - // containing indirect branches are not transformed by loop simplify, - // make sure we catch that. An additional load may be generated in the - // preheader for SSA updater, so also avoid sinking when no preheader - // is available. 
- if (!HasDedicatedExits || !Preheader) - return Changed; // Note that we only check GuaranteedToExecute inside the store case // so that we do not introduce stores where they did not exist before @@ -972,16 +969,22 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, // instruction will be executed, update the alignment. // Larger is better, with the exception of 0 being the best alignment. unsigned InstAlignment = Store->getAlignment(); - if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0) + if ((InstAlignment > Alignment || InstAlignment == 0) && + Alignment != 0) { if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) { GuaranteedToExecute = true; Alignment = InstAlignment; } + } else if (!GuaranteedToExecute) { + GuaranteedToExecute = + isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo); + } - if (!GuaranteedToExecute) - GuaranteedToExecute = isGuaranteedToExecute(*UI, DT, - CurLoop, SafetyInfo); - + if (!GuaranteedToExecute && !CanSpeculateLoad) { + CanSpeculateLoad = isDereferenceableAndAlignedPointer( + Store->getPointerOperand(), Store->getAlignment(), MDL, + Preheader->getTerminator(), DT); + } } else return Changed; // Not a load or store. @@ -997,8 +1000,17 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, } } - // If there isn't a guaranteed-to-execute instruction, we can't promote. - if (!GuaranteedToExecute) + // Check legality per comment above. Otherwise, we can't promote. + bool PromotionIsLegal = GuaranteedToExecute; + if (!PromotionIsLegal && CanSpeculateLoad) { + // If this is a thread local location, then we can insert stores along + // paths which originally didn't have them without violating the memory + // model. + Value *Object = GetUnderlyingObject(SomePtr, MDL); + PromotionIsLegal = + isAllocLikeFn(Object, TLI) && !PointerMayBeCaptured(Object, true, true); + } + if (!PromotionIsLegal) return Changed; // Figure out the loop exits and their insertion points, if this is the @@ -1017,7 +1029,8 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, return Changed; // Otherwise, this is safe to promote, lets do it! - DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n'); + DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr + << '\n'); Changed = true; ++NumPromoted; @@ -1028,20 +1041,19 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, DebugLoc DL = LoopUses[0]->getDebugLoc(); // We use the SSAUpdater interface to insert phi nodes as required. - SmallVector<PHINode*, 16> NewPHIs; + SmallVector<PHINode *, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); - LoopPromoter Promoter(SomePtr, LoopUses, SSA, - PointerMustAliases, ExitBlocks, + LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. 
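To make the effect of promoteLoopAccessesToScalars concrete, here is a source-level before/after written as plain C++ rather than IR; the function names and values are purely illustrative. The "after" form is only legal under the conditions checked above (a guaranteed-to-execute access, or a speculatable load plus provably thread-local memory).

#include <cstdio>

int before(int *p, int n) {        // every iteration loads and stores *p
  for (int i = 0; i < n; ++i)
    *p = *p + i;
  return *p;
}

int after(int *p, int n) {         // promoted form
  int promoted = *p;               // single load in the "preheader"
  for (int i = 0; i < n; ++i)
    promoted = promoted + i;       // the loop works on a register
  *p = promoted;                   // single store in the exit block
  return promoted;
}

int main() {
  int a = 1, b = 1;
  std::printf("%d %d\n", before(&a, 4), after(&b, 4)); // identical results
}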
- LoadInst *PreheaderLoad = - new LoadInst(SomePtr, SomePtr->getName()+".promoted", - Preheader->getTerminator()); + LoadInst *PreheaderLoad = new LoadInst( + SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator()); PreheaderLoad->setAlignment(Alignment); PreheaderLoad->setDebugLoc(DL); - if (AATags) PreheaderLoad->setAAMetadata(AATags); + if (AATags) + PreheaderLoad->setAAMetadata(AATags); SSA.AddAvailableValue(Preheader, PreheaderLoad); // Rewrite all the loads in the loop and remember all the definitions from @@ -1055,10 +1067,67 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, return Changed; } +/// Returns an owning pointer to an alias set which incorporates aliasing info +/// from L and all subloops of L. +/// FIXME: In new pass manager, there is no helper functions to handle loop +/// analysis such as cloneBasicBlockAnalysis. So the AST needs to be recompute +/// from scratch for every loop. Hook up with the helper functions when +/// available in the new pass manager to avoid redundant computation. +AliasSetTracker * +LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI, + AliasAnalysis *AA) { + AliasSetTracker *CurAST = nullptr; + SmallVector<Loop *, 4> RecomputeLoops; + for (Loop *InnerL : L->getSubLoops()) { + auto MapI = LoopToAliasSetMap.find(InnerL); + // If the AST for this inner loop is missing it may have been merged into + // some other loop's AST and then that loop unrolled, and so we need to + // recompute it. + if (MapI == LoopToAliasSetMap.end()) { + RecomputeLoops.push_back(InnerL); + continue; + } + AliasSetTracker *InnerAST = MapI->second; + + if (CurAST != nullptr) { + // What if InnerLoop was modified by other passes ? + CurAST->add(*InnerAST); + + // Once we've incorporated the inner loop's AST into ours, we don't need + // the subloop's anymore. + delete InnerAST; + } else { + CurAST = InnerAST; + } + LoopToAliasSetMap.erase(MapI); + } + if (CurAST == nullptr) + CurAST = new AliasSetTracker(*AA); + + auto mergeLoop = [&](Loop *L) { + // Loop over the body of this loop, looking for calls, invokes, and stores. + // Because subloops have already been incorporated into AST, we skip blocks + // in subloops. + for (BasicBlock *BB : L->blocks()) + if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops. + CurAST->add(*BB); // Incorporate the specified basic block + }; + + // Add everything from the sub loops that are no longer directly available. + for (Loop *InnerL : RecomputeLoops) + mergeLoop(InnerL); + + // And merge in this loop. + mergeLoop(L); + + return CurAST; +} + /// Simple analysis hook. Clone alias set info. /// -void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { - AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); +void LegacyLICMPass::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, + Loop *L) { + AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L); if (!AST) return; @@ -1067,8 +1136,8 @@ void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { /// Simple Analysis hook. Delete value V from alias set /// -void LICM::deleteAnalysisValue(Value *V, Loop *L) { - AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); +void LegacyLICMPass::deleteAnalysisValue(Value *V, Loop *L) { + AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L); if (!AST) return; @@ -1077,21 +1146,20 @@ void LICM::deleteAnalysisValue(Value *V, Loop *L) { /// Simple Analysis hook. Delete value L from alias set map. 
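collectAliasInfoForLoop above reuses cached per-subloop trackers where they still exist, recomputes the ones that have gone missing, and folds everything into one tracker for the current loop. The following is a simplified stand-alone model of that merge logic; std::set stands in for AliasSetTracker and none of the names are LLVM API.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Tracker = std::set<std::string>;

Tracker collectAliasInfo(const std::vector<std::string> &SubLoops,
                         std::map<std::string, Tracker> &Cache) {
  Tracker Cur;
  std::vector<std::string> Recompute;
  for (const auto &Inner : SubLoops) {
    auto It = Cache.find(Inner);
    if (It == Cache.end()) { Recompute.push_back(Inner); continue; }
    Cur.insert(It->second.begin(), It->second.end()); // merge cached info
    Cache.erase(It);                                  // the outer loop owns it now
  }
  for (const auto &Inner : Recompute)
    Cur.insert("recomputed:" + Inner);                // rebuild from scratch
  Cur.insert("outer-loop-blocks");                    // finally add this loop's blocks
  return Cur;
}

int main() {
  std::map<std::string, Tracker> Cache{{"inner1", {"p", "q"}}};
  Tracker T = collectAliasInfo({"inner1", "inner2"}, Cache);
  std::cout << T.size() << '\n'; // p, q, recomputed:inner2, outer-loop-blocks
}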
/// -void LICM::deleteAnalysisLoop(Loop *L) { - AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); +void LegacyLICMPass::deleteAnalysisLoop(Loop *L) { + AliasSetTracker *AST = LICM.getLoopToAliasSetMap().lookup(L); if (!AST) return; delete AST; - LoopToAliasSetMap.erase(L); + LICM.getLoopToAliasSetMap().erase(L); } - /// Return true if the body of this loop may store into the memory /// location pointed to by V. /// static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, - const AAMDNodes &AAInfo, + const AAMDNodes &AAInfo, AliasSetTracker *CurAST) { // Check to see if any of the basic blocks in CurLoop invalidate *V. return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod(); @@ -1104,4 +1172,3 @@ static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) { assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); return LI->getLoopFor(BB) != CurLoop; } - diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp index 1648878b06286..dfe51a4ce44c5 100644 --- a/lib/Transforms/Scalar/LoadCombine.cpp +++ b/lib/Transforms/Scalar/LoadCombine.cpp @@ -35,10 +35,12 @@ using namespace llvm; STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); STATISTIC(NumLoadsCombined, "Number of loads combined"); +#define LDCOMBINE_NAME "Combine Adjacent Loads" + namespace { struct PointerOffsetPair { Value *Pointer; - uint64_t Offset; + APInt Offset; }; struct LoadPOPPair { @@ -63,12 +65,16 @@ public: using llvm::Pass::doInitialization; bool doInitialization(Function &) override; bool runOnBasicBlock(BasicBlock &BB) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } - const char *getPassName() const override { return "LoadCombine"; } + const char *getPassName() const override { return LDCOMBINE_NAME; } static char ID; - typedef IRBuilder<true, TargetFolder> BuilderTy; + typedef IRBuilder<TargetFolder> BuilderTy; private: BuilderTy *Builder; @@ -87,22 +93,25 @@ bool LoadCombine::doInitialization(Function &F) { } PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { + auto &DL = LI.getModule()->getDataLayout(); + PointerOffsetPair POP; POP.Pointer = LI.getPointerOperand(); - POP.Offset = 0; + unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace()); + POP.Offset = APInt(BitWidth, 0); + while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { - auto &DL = LI.getModule()->getDataLayout(); - unsigned BitWidth = DL.getPointerTypeSizeInBits(GEP->getType()); - APInt Offset(BitWidth, 0); - if (GEP->accumulateConstantOffset(DL, Offset)) - POP.Offset += Offset.getZExtValue(); - else + APInt LastOffset = POP.Offset; + if (!GEP->accumulateConstantOffset(DL, POP.Offset)) { // Can't handle GEPs with variable indices. 
+ POP.Offset = LastOffset; return POP; + } POP.Pointer = GEP->getPointerOperand(); - } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) + } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) { POP.Pointer = BC->getOperand(0); + } } return POP; } @@ -115,8 +124,8 @@ bool LoadCombine::combineLoads( continue; std::sort(Loads.second.begin(), Loads.second.end(), [](const LoadPOPPair &A, const LoadPOPPair &B) { - return A.POP.Offset < B.POP.Offset; - }); + return A.POP.Offset.slt(B.POP.Offset); + }); if (aggregateLoads(Loads.second)) Combined = true; } @@ -132,28 +141,31 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { LoadInst *BaseLoad = nullptr; SmallVector<LoadPOPPair, 8> AggregateLoads; bool Combined = false; - uint64_t PrevOffset = -1ull; + bool ValidPrevOffset = false; + APInt PrevOffset; uint64_t PrevSize = 0; for (auto &L : Loads) { - if (PrevOffset == -1ull) { + if (ValidPrevOffset == false) { BaseLoad = L.Load; PrevOffset = L.POP.Offset; PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( L.Load->getType()); AggregateLoads.push_back(L); + ValidPrevOffset = true; continue; } if (L.Load->getAlignment() > BaseLoad->getAlignment()) continue; - if (L.POP.Offset > PrevOffset + PrevSize) { + APInt PrevEnd = PrevOffset + PrevSize; + if (L.POP.Offset.sgt(PrevEnd)) { // No other load will be combinable if (combineLoads(AggregateLoads)) Combined = true; AggregateLoads.clear(); - PrevOffset = -1; + ValidPrevOffset = false; continue; } - if (L.POP.Offset != PrevOffset + PrevSize) + if (L.POP.Offset != PrevEnd) // This load is offset less than the size of the last load. // FIXME: We may want to handle this case. continue; @@ -199,7 +211,7 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { Value *Ptr = Builder->CreateConstGEP1_64( Builder->CreatePointerCast(Loads[0].POP.Pointer, Builder->getInt8PtrTy(AddressSpace)), - Loads[0].POP.Offset); + Loads[0].POP.Offset.getSExtValue()); LoadInst *NewLoad = new LoadInst( Builder->CreatePointerCast( Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), @@ -212,7 +224,7 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { Value *V = Builder->CreateExtractInteger( L.Load->getModule()->getDataLayout(), NewLoad, cast<IntegerType>(L.Load->getType()), - L.POP.Offset - Loads[0].POP.Offset, "combine.extract"); + (L.POP.Offset - Loads[0].POP.Offset).getZExtValue(), "combine.extract"); L.Load->replaceAllUsesWith(V); } @@ -221,12 +233,12 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { } bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { - if (skipOptnoneFunction(BB)) + if (skipBasicBlock(BB)) return false; AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - IRBuilder<true, TargetFolder> TheBuilder( + IRBuilder<TargetFolder> TheBuilder( BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); Builder = &TheBuilder; @@ -260,23 +272,12 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { return Combined; } -void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - - AU.addRequired<AAResultsWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); -} - char LoadCombine::ID = 0; BasicBlockPass *llvm::createLoadCombinePass() { return new LoadCombine(); } -INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads", - false, false) +INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 
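The LoadCombine changes above switch the per-load offset to an APInt and then sort loads by constant offset and greedily group runs whose offsets are exactly adjacent. The sketch below mirrors only that grouping control flow in plain C++, with int64_t standing in for APInt; it is an illustration, not the pass itself.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct LoadDesc { int64_t Offset; uint64_t Size; };

std::vector<std::vector<LoadDesc>> groupAdjacent(std::vector<LoadDesc> Loads) {
  std::sort(Loads.begin(), Loads.end(),
            [](const LoadDesc &A, const LoadDesc &B) { return A.Offset < B.Offset; });
  std::vector<std::vector<LoadDesc>> Runs;
  for (const LoadDesc &L : Loads) {
    if (Runs.empty() || L.Offset != Runs.back().back().Offset +
                                        (int64_t)Runs.back().back().Size)
      Runs.push_back({}); // gap (or overlap): start a new candidate run
    Runs.back().push_back(L);
  }
  return Runs;
}

int main() {
  auto Runs = groupAdjacent({{4, 4}, {0, 4}, {16, 4}});
  std::cout << Runs.size() << '\n'; // 2 runs: {0,4} and {16}
}

Each run of adjacent loads is what the real pass would then replace with one wide load followed by per-element integer extractions at the relative offsets.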
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads", - false, false) - +INITIALIZE_PASS_END(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) diff --git a/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp new file mode 100644 index 0000000000000..66b59d27dfdeb --- /dev/null +++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -0,0 +1,304 @@ +//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a Loop Data Prefetching Pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-data-prefetch" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +using namespace llvm; + +// By default, we limit this to creating 16 PHIs (which is a little over half +// of the allocatable register set). +static cl::opt<bool> +PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false), + cl::desc("Prefetch write addresses")); + +static cl::opt<unsigned> + PrefetchDistance("prefetch-distance", + cl::desc("Number of instructions to prefetch ahead"), + cl::Hidden); + +static cl::opt<unsigned> + MinPrefetchStride("min-prefetch-stride", + cl::desc("Min stride to add prefetches"), cl::Hidden); + +static cl::opt<unsigned> MaxPrefetchIterationsAhead( + "max-prefetch-iters-ahead", + cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden); + +STATISTIC(NumPrefetches, "Number of prefetches inserted"); + +namespace llvm { + void initializeLoopDataPrefetchPass(PassRegistry&); +} + +namespace { + + class LoopDataPrefetch : public FunctionPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopDataPrefetch() : FunctionPass(ID) { + initializeLoopDataPrefetchPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + // FIXME: For some reason, preserving SE here breaks LSR (even if + // this pass changes nothing). 
+ // AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + bool runOnFunction(Function &F) override; + + private: + bool runOnLoop(Loop *L); + + /// \brief Check if the the stride of the accesses is large enough to + /// warrant a prefetch. + bool isStrideLargeEnough(const SCEVAddRecExpr *AR); + + unsigned getMinPrefetchStride() { + if (MinPrefetchStride.getNumOccurrences() > 0) + return MinPrefetchStride; + return TTI->getMinPrefetchStride(); + } + + unsigned getPrefetchDistance() { + if (PrefetchDistance.getNumOccurrences() > 0) + return PrefetchDistance; + return TTI->getPrefetchDistance(); + } + + unsigned getMaxPrefetchIterationsAhead() { + if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0) + return MaxPrefetchIterationsAhead; + return TTI->getMaxPrefetchIterationsAhead(); + } + + AssumptionCache *AC; + LoopInfo *LI; + ScalarEvolution *SE; + const TargetTransformInfo *TTI; + const DataLayout *DL; + }; +} + +char LoopDataPrefetch::ID = 0; +INITIALIZE_PASS_BEGIN(LoopDataPrefetch, "loop-data-prefetch", + "Loop Data Prefetch", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopDataPrefetch, "loop-data-prefetch", + "Loop Data Prefetch", false, false) + +FunctionPass *llvm::createLoopDataPrefetchPass() { return new LoopDataPrefetch(); } + +bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) { + unsigned TargetMinStride = getMinPrefetchStride(); + // No need to check if any stride goes. + if (TargetMinStride <= 1) + return true; + + const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); + // If MinStride is set, don't prefetch unless we can ensure that stride is + // larger. + if (!ConstStride) + return false; + + unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue()); + return TargetMinStride <= AbsStride; +} + +bool LoopDataPrefetch::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + DL = &F.getParent()->getDataLayout(); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + // If PrefetchDistance is not set, don't run the pass. This gives an + // opportunity for targets to run this pass for selected subtargets only + // (whose TTI sets PrefetchDistance). + if (getPrefetchDistance() == 0) + return false; + assert(TTI->getCacheLineSize() && "Cache line size is not set for target"); + + bool MadeChange = false; + + for (Loop *I : *LI) + for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + MadeChange |= runOnLoop(*L); + + return MadeChange; +} + +bool LoopDataPrefetch::runOnLoop(Loop *L) { + bool MadeChange = false; + + // Only prefetch in the inner-most loop + if (!L->empty()) + return MadeChange; + + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + // Calculate the number of iterations ahead to prefetch + CodeMetrics Metrics; + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + + // If the loop already has prefetches, then assume that the user knows + // what they are doing and don't add any more. 
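The prefetch-ahead heuristic above divides the target's prefetch distance (in instructions) by the loop size, prefetches at least one iteration ahead, and gives up entirely if the result exceeds the target's maximum. A small numeric model of that computation, using made-up numbers:

#include <cstdio>

unsigned itersAhead(unsigned PrefetchDistance, unsigned LoopSize,
                    unsigned MaxItersAhead) {
  if (LoopSize == 0)
    LoopSize = 1;                        // avoid dividing by zero
  unsigned Iters = PrefetchDistance / LoopSize;
  if (Iters == 0)
    Iters = 1;                           // always look at least one iteration ahead
  return Iters > MaxItersAhead ? 0 : Iters; // 0 means "too far ahead, skip this loop"
}

int main() {
  std::printf("%u\n", itersAhead(200, 25, 16)); // 200/25 = 8 iterations ahead
  std::printf("%u\n", itersAhead(300, 25, 8));  // 12 > 8, so the loop is skipped
}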
+ for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); + J != JE; ++J) + if (CallInst *CI = dyn_cast<CallInst>(J)) + if (Function *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::prefetch) + return MadeChange; + + Metrics.analyzeBasicBlock(*I, *TTI, EphValues); + } + unsigned LoopSize = Metrics.NumInsts; + if (!LoopSize) + LoopSize = 1; + + unsigned ItersAhead = getPrefetchDistance() / LoopSize; + if (!ItersAhead) + ItersAhead = 1; + + if (ItersAhead > getMaxPrefetchIterationsAhead()) + return MadeChange; + + Function *F = L->getHeader()->getParent(); + DEBUG(dbgs() << "Prefetching " << ItersAhead + << " iterations ahead (loop size: " << LoopSize << ") in " + << F->getName() << ": " << *L); + + SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads; + for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); + I != IE; ++I) { + for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); + J != JE; ++J) { + Value *PtrValue; + Instruction *MemI; + + if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) { + MemI = LMemI; + PtrValue = LMemI->getPointerOperand(); + } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) { + if (!PrefetchWrites) continue; + MemI = SMemI; + PtrValue = SMemI->getPointerOperand(); + } else continue; + + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); + if (PtrAddrSpace) + continue; + + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE->getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LSCEVAddRec) + continue; + + // Check if the the stride of the accesses is large enough to warrant a + // prefetch. + if (!isStrideLargeEnough(LSCEVAddRec)) + continue; + + // We don't want to double prefetch individual cache lines. If this load + // is known to be within one cache line of some other load that has + // already been prefetched, then don't prefetch this one as well. + bool DupPref = false; + for (const auto &PrefLoad : PrefLoads) { + const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast<SCEVConstant>(PtrDiff)) { + int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); + if (PD < (int64_t) TTI->getCacheLineSize()) { + DupPref = true; + break; + } + } + } + if (DupPref) + continue; + + const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr( + SE->getConstant(LSCEVAddRec->getType(), ItersAhead), + LSCEVAddRec->getStepRecurrence(*SE))); + if (!isSafeToExpand(NextLSCEV, *SE)) + continue; + + PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec)); + + Type *I8Ptr = Type::getInt8PtrTy((*I)->getContext(), PtrAddrSpace); + SCEVExpander SCEVE(*SE, J->getModule()->getDataLayout(), "prefaddr"); + Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI); + + IRBuilder<> Builder(MemI); + Module *M = (*I)->getParent()->getParent(); + Type *I32 = Type::getInt32Ty((*I)->getContext()); + Value *PrefetchFunc = Intrinsic::getDeclaration(M, Intrinsic::prefetch); + Builder.CreateCall( + PrefetchFunc, + {PrefPtrValue, + ConstantInt::get(I32, MemI->mayReadFromMemory() ? 
0 : 1), + ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + ++NumPrefetches; + DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV + << "\n"); + emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, + MemI->getDebugLoc(), "prefetched memory access"); + + + MadeChange = true; + } + } + + return MadeChange; +} + diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 7b1940b48c31b..19b2f89555c2b 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -14,75 +14,28 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopDeletion.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/Dominators.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; #define DEBUG_TYPE "loop-delete" STATISTIC(NumDeleted, "Number of loops deleted"); -namespace { - class LoopDeletion : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopDeletion() : LoopPass(ID) { - initializeLoopDeletionPass(*PassRegistry::getPassRegistry()); - } - - // Possibly eliminate loop L if it is dead. - bool runOnLoop(Loop *L, LPPassManager &) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreservedID(LoopSimplifyID); - AU.addPreservedID(LCSSAID); - } - - private: - bool isLoopDead(Loop *L, SmallVectorImpl<BasicBlock *> &exitingBlocks, - SmallVectorImpl<BasicBlock *> &exitBlocks, - bool &Changed, BasicBlock *Preheader); - - }; -} - -char LoopDeletion::ID = 0; -INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", - "Delete dead loops", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", - "Delete dead loops", false, false) - -Pass *llvm::createLoopDeletionPass() { - return new LoopDeletion(); -} - /// isLoopDead - Determined if a loop is dead. This assumes that we've already /// checked for unique exit and exiting blocks, and that the code is in LCSSA /// form. -bool LoopDeletion::isLoopDead(Loop *L, - SmallVectorImpl<BasicBlock *> &exitingBlocks, - SmallVectorImpl<BasicBlock *> &exitBlocks, - bool &Changed, BasicBlock *Preheader) { +bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE, + SmallVectorImpl<BasicBlock *> &exitingBlocks, + SmallVectorImpl<BasicBlock *> &exitBlocks, + bool &Changed, BasicBlock *Preheader) { BasicBlock *exitBlock = exitBlocks[0]; // Make sure that all PHI entries coming from the loop are loop invariant. @@ -91,6 +44,8 @@ bool LoopDeletion::isLoopDead(Loop *L, // sufficient to guarantee that no loop-variant values are used outside // of the loop. 
BasicBlock::iterator BI = exitBlock->begin(); + bool AllEntriesInvariant = true; + bool AllOutgoingValuesSame = true; while (PHINode *P = dyn_cast<PHINode>(BI)) { Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]); @@ -98,27 +53,37 @@ bool LoopDeletion::isLoopDead(Loop *L, // block. If there are different incoming values for different exiting // blocks, then it is impossible to statically determine which value should // be used. - for (unsigned i = 1, e = exitingBlocks.size(); i < e; ++i) { - if (incoming != P->getIncomingValueForBlock(exitingBlocks[i])) - return false; - } + AllOutgoingValuesSame = + all_of(makeArrayRef(exitingBlocks).slice(1), [&](BasicBlock *BB) { + return incoming == P->getIncomingValueForBlock(BB); + }); + + if (!AllOutgoingValuesSame) + break; if (Instruction *I = dyn_cast<Instruction>(incoming)) - if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) - return false; + if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) { + AllEntriesInvariant = false; + break; + } ++BI; } + if (Changed) + SE.forgetLoopDispositions(L); + + if (!AllEntriesInvariant || !AllOutgoingValuesSame) + return false; + // Make sure that no instructions in the block have potential side-effects. // This includes instructions that could write to memory, and loads that are // marked volatile. This could be made more aggressive by using aliasing // information to identify readonly and readnone calls. for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) { - for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); - BI != BE; ++BI) { - if (BI->mayHaveSideEffects()) + for (Instruction &I : **LI) { + if (I.mayHaveSideEffects()) return false; } } @@ -126,15 +91,15 @@ bool LoopDeletion::isLoopDead(Loop *L, return true; } -/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the -/// observable behavior of the program other than finite running time. Note -/// we do ensure that this never remove a loop that might be infinite, as doing -/// so could change the halting/non-halting nature of a program. -/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA -/// in order to make various safety checks work. -bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { - if (skipOptnoneFunction(L)) - return false; +/// Remove dead loops, by which we mean loops that do not impact the observable +/// behavior of the program other than finite running time. Note we do ensure +/// that this never remove a loop that might be infinite, as doing so could +/// change the halting/non-halting nature of a program. NOTE: This entire +/// process relies pretty heavily on LoopSimplify and LCSSA in order to make +/// various safety checks work. +bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE, + LoopInfo &loopInfo) { + assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); // We can only remove the loop if there is a preheader that we can // branch from after removing it. @@ -151,10 +116,10 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { if (L->begin() != L->end()) return false; - SmallVector<BasicBlock*, 4> exitingBlocks; + SmallVector<BasicBlock *, 4> exitingBlocks; L->getExitingBlocks(exitingBlocks); - SmallVector<BasicBlock*, 4> exitBlocks; + SmallVector<BasicBlock *, 4> exitBlocks; L->getUniqueExitBlocks(exitBlocks); // We require that the loop only have a single exit block. 
Otherwise, we'd @@ -166,12 +131,11 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { // Finally, we have to check that the loop really is dead. bool Changed = false; - if (!isLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader)) + if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader)) return Changed; // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; @@ -208,16 +172,14 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SmallVector<DomTreeNode*, 8> ChildNodes; for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) { // Move all of the block's children to be children of the preheader, which // allows us to remove the domtree entry for the block. ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); - for (SmallVectorImpl<DomTreeNode *>::iterator DI = ChildNodes.begin(), - DE = ChildNodes.end(); DI != DE; ++DI) { - DT.changeImmediateDominator(*DI, DT[preheader]); + for (DomTreeNode *ChildNode : ChildNodes) { + DT.changeImmediateDominator(ChildNode, DT[preheader]); } ChildNodes.clear(); @@ -238,8 +200,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { // Finally, the blocks from loopinfo. This has to happen late because // otherwise our loop iterators won't work. - LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SmallPtrSet<BasicBlock*, 8> blocks; + + SmallPtrSet<BasicBlock *, 8> blocks; blocks.insert(L->block_begin(), L->block_end()); for (BasicBlock *BB : blocks) loopInfo.removeBlock(BB); @@ -252,3 +214,56 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { return Changed; } + +PreservedAnalyses LoopDeletionPass::run(Loop &L, AnalysisManager<Loop> &AM) { + auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + Function *F = L.getHeader()->getParent(); + + auto &DT = *FAM.getCachedResult<DominatorTreeAnalysis>(*F); + auto &SE = *FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); + auto &LI = *FAM.getCachedResult<LoopAnalysis>(*F); + + bool Changed = runImpl(&L, DT, SE, LI); + if (!Changed) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} + +namespace { +class LoopDeletionLegacyPass : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + LoopDeletionLegacyPass() : LoopPass(ID) { + initializeLoopDeletionLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + // Possibly eliminate loop L if it is dead. 
+ bool runOnLoop(Loop *L, LPPassManager &) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + getLoopAnalysisUsage(AU); + } +}; +} + +char LoopDeletionLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LoopDeletionLegacyPass, "loop-deletion", + "Delete dead loops", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion", + "Delete dead loops", false, false) + +Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); } + +bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) { + if (skipLoop(L)) + return false; + + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + + LoopDeletionPass Impl; + return Impl.runImpl(L, DT, SE, loopInfo); +} diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp index 3d3cf3e2890b1..7eca28ed2bb73 100644 --- a/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/lib/Transforms/Scalar/LoopDistribute.cpp @@ -22,12 +22,17 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/LoopDistribute.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPassManager.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -60,6 +65,19 @@ static cl::opt<unsigned> DistributeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution")); +static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), + cl::Hidden, + cl::desc( + "The maximum number of SCEV checks allowed for Loop " + "Distribution for loop marked with #pragma loop distribute(enable)")); + +// Note that the initial value for this depends on whether the pass is invoked +// directly or from the optimization pipeline. +static cl::opt<bool> EnableLoopDistribute( + "enable-loop-distribute", cl::Hidden, + cl::desc("Enable the new, experimental LoopDistribution Pass")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -170,7 +188,7 @@ public: // Delete the instructions backwards, as it has a reduced likelihood of // having to update as many def-use and use-def chains. - for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { + for (auto *Inst : reverse(Unused)) { if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); Inst->eraseFromParent(); @@ -571,121 +589,39 @@ private: AccessesType Accesses; }; -/// \brief The pass class. -class LoopDistribute : public FunctionPass { +/// \brief The actual class performing the per-loop work. 
+class LoopDistributeForLoop { public: - LoopDistribute() : FunctionPass(ID) { - initializeLoopDistributePass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - LAA = &getAnalysis<LoopAccessAnalysis>(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - - // Build up a worklist of inner-loops to vectorize. This is necessary as the - // act of distributing a loop creates new loops and can invalidate iterators - // across the loops. - SmallVector<Loop *, 8> Worklist; - - for (Loop *TopLevelLoop : *LI) - for (Loop *L : depth_first(TopLevelLoop)) - // We only handle inner-most loops. - if (L->empty()) - Worklist.push_back(L); - - // Now walk the identified inner loops. - bool Changed = false; - for (Loop *L : Worklist) - Changed |= processLoop(L); - - // Process each loop nest in the function. - return Changed; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<LoopAccessAnalysis>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - } - - static char ID; - -private: - /// \brief Filter out checks between pointers from the same partition. - /// - /// \p PtrToPartition contains the partition number for pointers. Partition - /// number -1 means that the pointer is used in multiple partitions. In this - /// case we can't safely omit the check. - SmallVector<RuntimePointerChecking::PointerCheck, 4> - includeOnlyCrossPartitionChecks( - const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, - const SmallVectorImpl<int> &PtrToPartition, - const RuntimePointerChecking *RtPtrChecking) { - SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; - - std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), - [&](const RuntimePointerChecking::PointerCheck &Check) { - for (unsigned PtrIdx1 : Check.first->Members) - for (unsigned PtrIdx2 : Check.second->Members) - // Only include this check if there is a pair of pointers - // that require checking and the pointers fall into - // separate partitions. - // - // (Note that we already know at this point that the two - // pointer groups need checking but it doesn't follow - // that each pair of pointers within the two groups need - // checking as well. - // - // In other words we don't want to include a check just - // because there is a pair of pointers between the two - // pointer groups that require checks and a different - // pair whose pointers fall into different partitions.) - if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && - !RuntimePointerChecking::arePointersInSamePartition( - PtrToPartition, PtrIdx1, PtrIdx2)) - return true; - return false; - }); - - return Checks; + LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, + ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) + : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) { + setForced(); } /// \brief Try to distribute an inner-most loop. 
- bool processLoop(Loop *L) { + bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) { assert(L->empty() && "Only process inner loops."); DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() << "\" checking " << *L << "\n"); BasicBlock *PH = L->getLoopPreheader(); - if (!PH) { - DEBUG(dbgs() << "Skipping; no preheader"); - return false; - } - if (!L->getExitBlock()) { - DEBUG(dbgs() << "Skipping; multiple exit blocks"); - return false; - } - // LAA will check that we only have a single exiting block. + if (!PH) + return fail("no preheader"); + if (!L->getExitBlock()) + return fail("multiple exit blocks"); - const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); + // LAA will check that we only have a single exiting block. + LAI = &GetLAA(*L); // Currently, we only distribute to isolate the part of the loop with // dependence cycles to enable partial vectorization. - if (LAI.canVectorizeMemory()) { - DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); - return false; - } - auto *Dependences = LAI.getDepChecker().getDependences(); - if (!Dependences || Dependences->empty()) { - DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); - return false; - } + if (LAI->canVectorizeMemory()) + return fail("memory operations are safe for vectorization"); + + auto *Dependences = LAI->getDepChecker().getDependences(); + if (!Dependences || Dependences->empty()) + return fail("no unsafe dependences to isolate"); InstPartitionContainer Partitions(L, LI, DT); @@ -708,7 +644,7 @@ private: // NumUnsafeDependencesActive > 0 indicates this situation and in this case // we just keep assigning to the same cyclic partition until // NumUnsafeDependencesActive reaches 0. - const MemoryDepChecker &DepChecker = LAI.getDepChecker(); + const MemoryDepChecker &DepChecker = LAI->getDepChecker(); MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), *Dependences); @@ -738,14 +674,14 @@ private: DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); if (Partitions.getSize() < 2) - return false; + return fail("cannot isolate unsafe dependencies"); // Run the merge heuristics: Merge non-cyclic adjacent partitions since we // should be able to vectorize these together. Partitions.mergeBeforePopulating(); DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); if (Partitions.getSize() < 2) - return false; + return fail("cannot isolate unsafe dependencies"); // Now, populate the partitions with non-memory operations. Partitions.populateUsedSet(); @@ -757,15 +693,15 @@ private: DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" << Partitions); if (Partitions.getSize() < 2) - return false; + return fail("cannot isolate unsafe dependencies"); } // Don't distribute the loop if we need too many SCEV run-time checks. - const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); - if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { - DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); - return false; - } + const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); + if (Pred.getComplexity() > (IsForced.getValueOr(false) + ? PragmaDistributeSCEVCheckThreshold + : DistributeSCEVCheckThreshold)) + return fail("too many SCEV run-time checks needed.\n"); DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions set up the reverse mapping from @@ -779,19 +715,20 @@ private: SplitBlock(PH, PH->getTerminator(), DT, LI); // If we need run-time checks, version the loop now. 
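Before the versioning step that follows, it may help to picture the kind of loop the partitioning above is built for. A hypothetical source-level sketch (names and bounds are illustrative, not from the patch):

    // The first statement carries a loop dependence (A[I] uses A[I - 1]),
    // so it seeds a cyclic partition; the second statement does not, so
    // distribution can split it into a separate loop that the vectorizer
    // can handle on its own.  When A, B and C may alias, the run-time
    // pointer checks discussed here guard the distributed version.
    void distributeMe(int *A, int *B, int *C, int N) {
      for (int I = 1; I < N; ++I) {
        A[I] = A[I - 1] + B[I];   // unsafe dependence: stays scalar
        C[I] = B[I] * 2;          // independent: vectorizable after split
      }
    }
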
- auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); - const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); + auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); + const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); const auto &AllChecks = RtPtrChecking->getChecks(); auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, RtPtrChecking); if (!Pred.isAlwaysTrue() || !Checks.empty()) { DEBUG(dbgs() << "\nPointers:\n"); - DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); - LoopVersioning LVer(LAI, L, LI, DT, SE, false); + DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + LoopVersioning LVer(*LAI, L, LI, DT, SE, false); LVer.setAliasChecks(std::move(Checks)); - LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate()); LVer.versionLoop(DefsUsedOutside); + LVer.annotateLoopWithNoAlias(); } // Create identical copies of the original loop for each partition and hook @@ -810,27 +747,244 @@ private: } ++NumLoopsDistributed; + // Report the success. + emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(), + "distributed loop"); return true; } + /// \brief Provide diagnostics then \return with false. + bool fail(llvm::StringRef Message) { + LLVMContext &Ctx = F->getContext(); + bool Forced = isForced().getValueOr(false); + + DEBUG(dbgs() << "Skipping; " << Message << "\n"); + + // With Rpass-missed report that distribution failed. + ORE->emitOptimizationRemarkMissed( + LDIST_NAME, L, + "loop not distributed: use -Rpass-analysis=loop-distribute for more " + "info"); + + // With Rpass-analysis report why. This is on by default if distribution + // was requested explicitly. + emitOptimizationRemarkAnalysis( + Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint + : LDIST_NAME, + *F, L->getStartLoc(), Twine("loop not distributed: ") + Message); + + // Also issue a warning if distribution was requested explicitly but it + // failed. + if (Forced) + Ctx.diagnose(DiagnosticInfoOptimizationFailure( + *F, L->getStartLoc(), "loop not distributed: failed " + "explicitly specified loop distribution")); + + return false; + } + + /// \brief Return if distribution forced to be enabled/disabled for the loop. + /// + /// If the optional has a value, it indicates whether distribution was forced + /// to be enabled (true) or disabled (false). If the optional has no value + /// distribution was not forced either way. + const Optional<bool> &isForced() const { return IsForced; } + +private: + /// \brief Filter out checks between pointers from the same partition. + /// + /// \p PtrToPartition contains the partition number for pointers. Partition + /// number -1 means that the pointer is used in multiple partitions. In this + /// case we can't safely omit the check. 
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> + includeOnlyCrossPartitionChecks( + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, + const SmallVectorImpl<int> &PtrToPartition, + const RuntimePointerChecking *RtPtrChecking) { + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (unsigned PtrIdx1 : Check.first->Members) + for (unsigned PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that require checking and the pointers fall into + // separate partitions. + // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups need + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that require checks and a different + // pair whose pointers fall into different partitions.) + if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1, PtrIdx2)) + return true; + return false; + }); + + return Checks; + } + + /// \brief Check whether the loop metadata is forcing distribution to be + /// enabled/disabled. + void setForced() { + Optional<const MDOperand *> Value = + findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); + if (!Value) + return; + + const MDOperand *Op = *Value; + assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata"); + IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue(); + } + + Loop *L; + Function *F; + // Analyses used. LoopInfo *LI; - LoopAccessAnalysis *LAA; + const LoopAccessInfo *LAI; DominatorTree *DT; ScalarEvolution *SE; + OptimizationRemarkEmitter *ORE; + + /// \brief Indicates whether distribution is forced to be enabled/disabled for + /// the loop. + /// + /// If the optional has a value, it indicates whether distribution was forced + /// to be enabled (true) or disabled (false). If the optional has no value + /// distribution was not forced either way. + Optional<bool> IsForced; +}; + +/// Shared implementation between new and old PMs. +static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, + ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, + std::function<const LoopAccessInfo &(Loop &)> &GetLAA, + bool ProcessAllLoops) { + // Build up a worklist of inner-loops to vectorize. This is necessary as the + // act of distributing a loop creates new loops and can invalidate iterators + // across the loops. + SmallVector<Loop *, 8> Worklist; + + for (Loop *TopLevelLoop : *LI) + for (Loop *L : depth_first(TopLevelLoop)) + // We only handle inner-most loops. + if (L->empty()) + Worklist.push_back(L); + + // Now walk the identified inner loops. + bool Changed = false; + for (Loop *L : Worklist) { + LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE); + + // If distribution was forced for the specific loop to be + // enabled/disabled, follow that. Otherwise use the global flag. + if (LDL.isForced().getValueOr(ProcessAllLoops)) + Changed |= LDL.processLoop(GetLAA); + } + + // Process each loop nest in the function. + return Changed; +} + +/// \brief The pass class. 
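setForced() above keys off the "llvm.loop.distribute.enable" loop metadata. As a hedged illustration of how that metadata is typically attached (assuming a clang that supports the corresponding loop pragma; the function and names are made up for the example):

    // Distribution is requested explicitly for this loop, so IsForced is
    // set and the larger PragmaDistributeSCEVCheckThreshold applies.
    void forcedDistribution(int *A, int *B, int *C, int N) {
    #pragma clang loop distribute(enable)
      for (int I = 1; I < N; ++I) {
        A[I] = A[I - 1] + B[I];
        C[I] = B[I] * 2;
      }
    }
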
+class LoopDistributeLegacy : public FunctionPass { +public: + /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be + /// performed by default. Pass -enable-loop-distribute={0,1} overrides this + /// default. We use this to keep LoopDistribution off by default when invoked + /// from the optimization pipeline but on when invoked explicitly from opt. + LoopDistributeLegacy(bool ProcessAllLoopsByDefault = true) + : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) { + // The default is set by the caller. + if (EnableLoopDistribute.getNumOccurrences() > 0) + ProcessAllLoops = EnableLoopDistribute; + initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + std::function<const LoopAccessInfo &(Loop &)> GetLAA = + [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; + + return runImpl(F, LI, DT, SE, ORE, GetLAA, ProcessAllLoops); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopAccessLegacyAnalysis>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + } + + static char ID; + +private: + /// \brief Whether distribution should be on in this function. The per-loop + /// pragma can override this. + bool ProcessAllLoops; }; } // anonymous namespace -char LoopDistribute::ID; +PreservedAnalyses LoopDistributePass::run(Function &F, + FunctionAnalysisManager &AM) { + // FIXME: This does not currently match the behavior from the old PM. + // ProcessAllLoops with the old PM defaults to true when invoked from opt and + // false when invoked from the optimization pipeline. 
+ bool ProcessAllLoops = false; + if (EnableLoopDistribute.getNumOccurrences() > 0) + ProcessAllLoops = EnableLoopDistribute; + + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); + std::function<const LoopAccessInfo &(Loop &)> GetLAA = + [&](Loop &L) -> const LoopAccessInfo & { + return LAM.getResult<LoopAccessAnalysis>(L); + }; + + bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA, ProcessAllLoops); + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<LoopAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} + +char LoopDistributeLegacy::ID; static const char ldist_name[] = "Loop Distribition"; -INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) +INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, + false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false) namespace llvm { -FunctionPass *createLoopDistributePass() { return new LoopDistribute(); } +FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) { + return new LoopDistributeLegacy(ProcessAllLoopsByDefault); +} } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 4521640e3947e..1468676a35437 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -26,22 +26,21 @@ // i64 and larger types when i64 is legal and the value has few bits set. It // would be good to enhance isel to emit a loop for ctpop in this case. // -// We should enhance the memset/memcpy recognition to handle multiple stores in -// the loop. This would handle things like: -// void foo(_Complex float *P) -// for (i) { __real__(*P) = 0; __imag__(*P) = 0; } -// // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). 
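The additions to LoopIdiomRecognize below teach the memset/memcpy recognizer to merge several adjacent stores per iteration instead of considering one store in isolation. A hypothetical sketch of a loop that now qualifies (structs and hand-unrolled loops are the cases called out later in the patch):

    struct Complex { float Re, Im; };

    // Two consecutive 4-byte stores to the same underlying object per
    // iteration; chained together they cover every byte of P[0..N), so
    // the whole loop can be replaced by a single memset in the preheader.
    void zeroAll(Complex *P, int N) {
      for (int I = 0; I < N; ++I) {
        P[I].Re = 0.0f;
        P[I].Im = 0.0f;
      }
    }
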
// //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -55,7 +54,10 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; #define DEBUG_TYPE "loop-idiom" @@ -65,7 +67,7 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); namespace { -class LoopIdiomRecognize : public LoopPass { +class LoopIdiomRecognize { Loop *CurLoop; AliasAnalysis *AA; DominatorTree *DT; @@ -76,39 +78,21 @@ class LoopIdiomRecognize : public LoopPass { const DataLayout *DL; public: - static char ID; - explicit LoopIdiomRecognize() : LoopPass(ID) { - initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - } + explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT, + LoopInfo *LI, ScalarEvolution *SE, + TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), + DL(DL) {} - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. 
- /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<AAResultsWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } + bool runOnLoop(Loop *L); private: typedef SmallVector<StoreInst *, 8> StoreList; - StoreList StoreRefsForMemset; + typedef MapVector<Value *, StoreList> StoreListMap; + StoreListMap StoreRefsForMemset; + StoreListMap StoreRefsForMemsetPattern; StoreList StoreRefsForMemcpy; bool HasMemset; bool HasMemsetPattern; @@ -122,14 +106,18 @@ private: SmallVectorImpl<BasicBlock *> &ExitBlocks); void collectStores(BasicBlock *BB); - bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy); - bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern, + bool &ForMemcpy); + bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount, + bool ForMemset); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, Value *StoredVal, - Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount, bool NegStride); + Instruction *TheStore, + SmallPtrSetImpl<Instruction *> &Stores, + const SCEVAddRecExpr *Ev, const SCEV *BECount, + bool NegStride); bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); /// @} @@ -145,38 +133,82 @@ private: /// @} }; +class LoopIdiomRecognizeLegacyPass : public LoopPass { +public: + static char ID; + explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) { + initializeLoopIdiomRecognizeLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + + AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()); + const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout(); + + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL); + return LIR.runOnLoop(L); + } + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + getLoopAnalysisUsage(AU); + } +}; } // End anonymous namespace. 
-char LoopIdiomRecognize::ID = 0; -INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", - false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, + AnalysisManager<Loop> &AM) { + const auto &FAM = + AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + Function *F = L.getHeader()->getParent(); + + // Use getCachedResult because Loop pass cannot trigger a function analysis. + auto *AA = FAM.getCachedResult<AAManager>(*F); + auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); + auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); + auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); + auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F); + const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F); + const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + assert((AA && DT && LI && SE && TLI && TTI && DL) && + "Analyses for Loop Idiom Recognition not available"); + + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL); + if (!LIR.runOnLoop(&L)) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} + +char LoopIdiomRecognizeLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", + "Recognize loop idioms", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", - false, false) +INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom", + "Recognize loop idioms", false, false) -Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } +Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); } -/// deleteDeadInstruction - Delete this instruction. Before we do, go through -/// and zero out all the operands of this instruction. If any of them become -/// dead, delete them and the computation tree that feeds them. -/// -static void deleteDeadInstruction(Instruction *I, - const TargetLibraryInfo *TLI) { - SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end()); +static void deleteDeadInstruction(Instruction *I) { I->replaceAllUsesWith(UndefValue::get(I->getType())); I->eraseFromParent(); - for (Value *Op : Operands) - RecursivelyDeleteTriviallyDeadInstructions(Op, TLI); } //===----------------------------------------------------------------------===// @@ -185,10 +217,7 @@ static void deleteDeadInstruction(Instruction *I, // //===----------------------------------------------------------------------===// -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - +bool LoopIdiomRecognize::runOnLoop(Loop *L) { CurLoop = L; // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. 
@@ -200,15 +229,6 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { if (Name == "memset" || Name == "memcpy") return false; - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *CurLoop->getHeader()->getParent()); - DL = &CurLoop->getHeader()->getModule()->getDataLayout(); - HasMemset = TLI->has(LibFunc::memset); HasMemsetPattern = TLI->has(LibFunc::memset_pattern16); HasMemcpy = TLI->has(LibFunc::memcpy); @@ -240,6 +260,14 @@ bool LoopIdiomRecognize::runOnCountableLoop() { << CurLoop->getHeader()->getName() << "\n"); bool MadeChange = false; + + // The following transforms hoist stores/memsets into the loop pre-header. + // Give up if the loop has instructions may throw. + LoopSafetyInfo SafetyInfo; + computeLoopSafetyInfo(&SafetyInfo, CurLoop); + if (SafetyInfo.MayThrow) + return MadeChange; + // Scan all the blocks in the loop that are not in subloops. for (auto *BB : CurLoop->getBlocks()) { // Ignore blocks in subloops. @@ -258,9 +286,9 @@ static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { return (unsigned)SizeInBits >> 3; } -static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { +static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) { const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); - return ConstStride->getAPInt().getZExtValue(); + return ConstStride->getAPInt(); } /// getMemSetPatternValue - If a strided store of the specified value is safe to @@ -305,11 +333,15 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { } bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, - bool &ForMemcpy) { + bool &ForMemsetPattern, bool &ForMemcpy) { // Don't touch volatile stores. if (!SI->isSimple()) return false; + // Avoid merging nontemporal stores. + if (SI->getMetadata(LLVMContext::MD_nontemporal)) + return false; + Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -353,7 +385,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, StorePtr->getType()->getPointerAddressSpace() == 0 && (PatternValue = getMemSetPatternValue(StoredVal, DL))) { // It looks like we can use PatternValue! - ForMemset = true; + ForMemsetPattern = true; return true; } @@ -361,7 +393,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, if (HasMemcpy) { // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. - unsigned Stride = getStoreStride(StoreEv); + APInt Stride = getStoreStride(StoreEv); unsigned StoreSize = getStoreSizeInBytes(SI, DL); if (StoreSize != Stride && StoreSize != -Stride) return false; @@ -393,6 +425,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, void LoopIdiomRecognize::collectStores(BasicBlock *BB) { StoreRefsForMemset.clear(); + StoreRefsForMemsetPattern.clear(); StoreRefsForMemcpy.clear(); for (Instruction &I : *BB) { StoreInst *SI = dyn_cast<StoreInst>(&I); @@ -400,15 +433,22 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { continue; bool ForMemset = false; + bool ForMemsetPattern = false; bool ForMemcpy = false; // Make sure this is a strided store with a constant stride. 
- if (!isLegalStore(SI, ForMemset, ForMemcpy)) + if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy)) continue; // Save the store locations. - if (ForMemset) - StoreRefsForMemset.push_back(SI); - else if (ForMemcpy) + if (ForMemset) { + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemset[Ptr].push_back(SI); + } else if (ForMemsetPattern) { + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemsetPattern[Ptr].push_back(SI); + } else if (ForMemcpy) StoreRefsForMemcpy.push_back(SI); } } @@ -430,9 +470,14 @@ bool LoopIdiomRecognize::runOnLoopBlock( // Look for store instructions, which may be optimized to memset/memcpy. collectStores(BB); - // Look for a single store which can be optimized into a memset. - for (auto &SI : StoreRefsForMemset) - MadeChange |= processLoopStore(SI, BECount); + // Look for a single store or sets of stores with a common base, which can be + // optimized into a memset (memset_pattern). The latter most commonly happens + // with structs and handunrolled loops. + for (auto &SL : StoreRefsForMemset) + MadeChange |= processLoopStores(SL.second, BECount, true); + + for (auto &SL : StoreRefsForMemsetPattern) + MadeChange |= processLoopStores(SL.second, BECount, false); // Optimize the store into a memcpy, if it feeds an similarly strided load. for (auto &SI : StoreRefsForMemcpy) @@ -458,26 +503,144 @@ bool LoopIdiomRecognize::runOnLoopBlock( return MadeChange; } -/// processLoopStore - See if this store can be promoted to a memset. -bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - assert(SI->isSimple() && "Expected only non-volatile stores."); +/// processLoopStores - See if this store(s) can be promoted to a memset. +bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, + const SCEV *BECount, + bool ForMemset) { + // Try to find consecutive stores that can be transformed into memsets. + SetVector<StoreInst *> Heads, Tails; + SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of stores that follow each other. + SmallVector<unsigned, 16> IndexQueue; + for (unsigned i = 0, e = SL.size(); i < e; ++i) { + assert(SL[i]->isSimple() && "Expected only non-volatile stores."); + + Value *FirstStoredVal = SL[i]->getValueOperand(); + Value *FirstStorePtr = SL[i]->getPointerOperand(); + const SCEVAddRecExpr *FirstStoreEv = + cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr)); + APInt FirstStride = getStoreStride(FirstStoreEv); + unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL); + + // See if we can optimize just this store in isolation. + if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) { + Heads.insert(SL[i]); + continue; + } - Value *StoredVal = SI->getValueOperand(); - Value *StorePtr = SI->getPointerOperand(); + Value *FirstSplatValue = nullptr; + Constant *FirstPatternValue = nullptr; - // Check to see if the stride matches the size of the store. If so, then we - // know that every byte is touched in the loop. 
- const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - unsigned Stride = getStoreStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI, DL); - if (StoreSize != Stride && StoreSize != -Stride) - return false; + if (ForMemset) + FirstSplatValue = isBytewiseValue(FirstStoredVal); + else + FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL); + + assert((FirstSplatValue || FirstPatternValue) && + "Expected either splat value or pattern value."); + + IndexQueue.clear(); + // If a store has multiple consecutive store candidates, search Stores + // array according to the sequence: from i+1 to e, then from i-1 to 0. + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find memset opportunity. + unsigned j = 0; + for (j = i + 1; j < e; ++j) + IndexQueue.push_back(j); + for (j = i; j > 0; --j) + IndexQueue.push_back(j - 1); + + for (auto &k : IndexQueue) { + assert(SL[k]->isSimple() && "Expected only non-volatile stores."); + Value *SecondStorePtr = SL[k]->getPointerOperand(); + const SCEVAddRecExpr *SecondStoreEv = + cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr)); + APInt SecondStride = getStoreStride(SecondStoreEv); + + if (FirstStride != SecondStride) + continue; - bool NegStride = StoreSize == -Stride; + Value *SecondStoredVal = SL[k]->getValueOperand(); + Value *SecondSplatValue = nullptr; + Constant *SecondPatternValue = nullptr; + + if (ForMemset) + SecondSplatValue = isBytewiseValue(SecondStoredVal); + else + SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL); + + assert((SecondSplatValue || SecondPatternValue) && + "Expected either splat value or pattern value."); + + if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) { + if (ForMemset) { + if (FirstSplatValue != SecondSplatValue) + continue; + } else { + if (FirstPatternValue != SecondPatternValue) + continue; + } + Tails.insert(SL[k]); + Heads.insert(SL[i]); + ConsecutiveChain[SL[i]] = SL[k]; + break; + } + } + } + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we transformed so that we don't visit the same store twice. + SmallPtrSet<Value *, 16> TransformedStores; + bool Changed = false; + + // For stores that start but don't end a link in the chain: + for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to transform it. + SmallPtrSet<Instruction *, 8> AdjacentStores; + StoreInst *I = *it; + + StoreInst *HeadStore = I; + unsigned StoreSize = 0; + + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (TransformedStores.count(I)) + break; + AdjacentStores.insert(I); + + StoreSize += getStoreSizeInBytes(I, DL); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } - // See if we can optimize just this store in isolation. - return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), - StoredVal, SI, StoreEv, BECount, NegStride); + Value *StoredVal = HeadStore->getValueOperand(); + Value *StorePtr = HeadStore->getPointerOperand(); + const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + APInt Stride = getStoreStride(StoreEv); + + // Check to see if the stride matches the size of the stores. If so, then + // we know that every byte is touched in the loop. 
+ if (StoreSize != Stride && StoreSize != -Stride) + continue; + + bool NegStride = StoreSize == -Stride; + + if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), + StoredVal, HeadStore, AdjacentStores, StoreEv, + BECount, NegStride)) { + TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); + Changed = true; + } + } + + return Changed; } /// processLoopMemSet - See if this memset can be promoted to a large memset. @@ -488,7 +651,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, return false; // If we're not allowed to hack on memset, we fail. - if (!TLI->has(LibFunc::memset)) + if (!HasMemset) return false; Value *Pointer = MSI->getDest(); @@ -507,11 +670,12 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, // Check to see if the stride matches the size of the memset. If so, then we // know that every byte is touched in the loop. - const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); + const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); + if (!ConstStride) + return false; - // TODO: Could also handle negative stride here someday, that will require the - // validity check in mayLoopAccessLocation to be updated though. - if (!Stride || MSI->getLength() != Stride->getValue()) + APInt Stride = ConstStride->getAPInt(); + if (SizeInBytes != Stride && SizeInBytes != -Stride) return false; // Verify that the memset value is loop invariant. If not, we can't promote @@ -520,18 +684,22 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue)) return false; + SmallPtrSet<Instruction *, 1> MSIs; + MSIs.insert(MSI); + bool NegStride = SizeInBytes == -Stride; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), SplatValue, MSI, Ev, - BECount, /*NegStride=*/false); + MSI->getAlignment(), SplatValue, MSI, MSIs, Ev, + BECount, NegStride); } /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). -static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, - const SCEV *BECount, unsigned StoreSize, - AliasAnalysis &AA, - Instruction *IgnoredStore) { +static bool +mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, + const SCEV *BECount, unsigned StoreSize, + AliasAnalysis &AA, + SmallPtrSetImpl<Instruction *> &IgnoredStores) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts // at the pointer and has infinite size. @@ -550,8 +718,9 @@ static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) - for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access)) + for (Instruction &I : **BI) + if (IgnoredStores.count(&I) == 0 && + (AA.getModRefInfo(&I, StoreLoc) & Access)) return true; return false; @@ -574,7 +743,8 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, /// transform this into a memset or memset_pattern in the loop preheader, do so. 
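When the stored value is not a single repeated byte but a repeating 16-byte pattern, the code that follows emits a call to memset_pattern16 instead of memset. A hedged sketch of the library call being targeted (memset_pattern16 is an Apple libc extension, which is why the pass gates this on TLI reporting LibFunc::memset_pattern16):

    #include <string.h>   // declares memset_pattern16 on Darwin

    // Roughly what the transformed loop performs for a 16-byte pattern:
    // fill the destination with the repeating contents of Pat.
    void fillWithPattern(double *Dst, size_t N) {
      const double Pat[2] = {1.0, 2.0};          // 16-byte pattern
      memset_pattern16(Dst, Pat, N * sizeof(double));
    }
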
bool LoopIdiomRecognize::processLoopStridedStore( Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, - Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, + Value *StoredVal, Instruction *TheStore, + SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride) { Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; @@ -609,7 +779,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *BasePtr = Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, - *AA, TheStore)) { + *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); @@ -644,13 +814,14 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr); + inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern"); - GV->setUnnamedAddr(true); // Ok to merge these. + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. GV->setAlignment(16); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); @@ -662,7 +833,8 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(TheStore, TLI); + for (auto *I : Stores) + deleteDeadInstruction(I); ++NumMemSet; return true; } @@ -676,7 +848,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *StorePtr = SI->getPointerOperand(); const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - unsigned Stride = getStoreStride(StoreEv); + APInt Stride = getStoreStride(StoreEv); unsigned StoreSize = getStoreSizeInBytes(SI, DL); bool NegStride = StoreSize == -Stride; @@ -714,8 +886,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *StoreBasePtr = Expander.expandCodeFor( StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + SmallPtrSet<Instruction *, 1> Stores; + Stores.insert(SI); if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, - StoreSize, *AA, SI)) { + StoreSize, *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); @@ -735,7 +909,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, - *AA, SI)) { + *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); @@ -769,7 +943,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. 
- deleteDeadInstruction(SI, TLI); + deleteDeadInstruction(SI); ++NumMemCpy; return true; } @@ -993,7 +1167,7 @@ bool LoopIdiomRecognize::recognizePopcount() { } static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, - DebugLoc DL) { + const DebugLoc &DL) { Value *Ops[] = {Val}; Type *Tys[] = {Val->getType()}; diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index b4102fe9ba340..629cb87d7a916 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -11,88 +11,43 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopInstSimplify.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; #define DEBUG_TYPE "loop-instsimplify" STATISTIC(NumSimplified, "Number of redundant instructions simplified"); -namespace { - class LoopInstSimplify : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopInstSimplify() : LoopPass(ID) { - initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry()); - } - - bool runOnLoop(Loop*, LPPassManager&) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - } - }; -} - -char LoopInstSimplify::ID = 0; -INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", - "Simplify instructions in loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify", - "Simplify instructions in loops", false, false) - -Pass *llvm::createLoopInstSimplifyPass() { - return new LoopInstSimplify(); -} - -bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DominatorTree *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( - *L->getHeader()->getParent()); - - SmallVector<BasicBlock*, 8> ExitBlocks; +static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI, + AssumptionCache *AC, + const TargetLibraryInfo *TLI) { + SmallVector<BasicBlock *, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); array_pod_sort(ExitBlocks.begin(), ExitBlocks.end()); - SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; + SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; // The bit we are stealing from the pointer represents whether this basic // block is the header of a subloop, in which case we only process its phis. - typedef PointerIntPair<BasicBlock*, 1> WorklistItem; + typedef PointerIntPair<BasicBlock *, 1> WorklistItem; SmallVector<WorklistItem, 16> VisitStack; - SmallPtrSet<BasicBlock*, 32> Visited; + SmallPtrSet<BasicBlock *, 32> Visited; bool Changed = false; bool LocalChanged; @@ -122,7 +77,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't bother simplifying unused instructions. if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC); + Value *V = SimplifyInstruction(I, DL, TLI, DT, AC); if (V && LI->replacementPreservesLCSSAForm(I, V)) { // Mark all uses for resimplification next time round the loop. for (User *U : I->users()) @@ -133,14 +88,13 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { ++NumSimplified; } } - bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI); - if (res) { - // RecursivelyDeleteTriviallyDeadInstruction can remove - // more than one instruction, so simply incrementing the - // iterator does not work. When instructions get deleted - // re-iterate instead. - BI = BB->begin(); BE = BB->end(); - LocalChanged |= res; + if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) { + // RecursivelyDeleteTriviallyDeadInstruction can remove more than one + // instruction, so simply incrementing the iterator does not work. + // When instructions get deleted re-iterate instead. + BI = BB->begin(); + BE = BB->end(); + LocalChanged = true; } if (IsSubloopHeader && !isa<PHINode>(I)) @@ -148,8 +102,10 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { } // Add all successors to the worklist, except for loop exit blocks and the - // bodies of subloops. We visit the headers of loops so that we can process - // their phis, but we contract the rest of the subloop body and only follow + // bodies of subloops. We visit the headers of loops so that we can + // process + // their phis, but we contract the rest of the subloop body and only + // follow // edges leading back to the original loop. 
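The worklist above re-queues the users of anything it folds, so chains of redundant in-loop operations collapse over successive sweeps. A hypothetical source-level illustration (in practice the redundancy usually comes from earlier transformations rather than the source, but the folding pattern is the same):

    // After one sweep `A` folds to `X`; that turns `B` into a
    // multiply-by-one of `X`, which the next sweep folds as well,
    // leaving only the store of `X`.
    void simplifyMe(int *Out, int X, int N) {
      for (int I = 0; I < N; ++I) {
        int A = X + 0;
        int B = A * 1;
        Out[I] = B;
      }
    }
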
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) { @@ -158,11 +114,11 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { continue; const Loop *SuccLoop = LI->getLoopFor(SuccBB); - if (SuccLoop && SuccLoop->getHeader() == SuccBB - && L->contains(SuccLoop)) { + if (SuccLoop && SuccLoop->getHeader() == SuccBB && + L->contains(SuccLoop)) { VisitStack.push_back(WorklistItem(SuccBB, true)); - SmallVector<BasicBlock*, 8> SubLoopExitBlocks; + SmallVector<BasicBlock *, 8> SubLoopExitBlocks; SuccLoop->getExitBlocks(SubLoopExitBlocks); for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) { @@ -174,8 +130,8 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { continue; } - bool IsExitBlock = std::binary_search(ExitBlocks.begin(), - ExitBlocks.end(), SuccBB); + bool IsExitBlock = + std::binary_search(ExitBlocks.begin(), ExitBlocks.end(), SuccBB); if (IsExitBlock) continue; @@ -193,3 +149,68 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; } + +namespace { +class LoopInstSimplifyLegacyPass : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + LoopInstSimplifyLegacyPass() : LoopPass(ID) { + initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + DominatorTreeWrapperPass *DTWP = + getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + AssumptionCache *AC = + &getAnalysis<AssumptionCacheTracker>().getAssumptionCache( + *L->getHeader()->getParent()); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + + return SimplifyLoopInst(L, DT, LI, AC, TLI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.setPreservesCFG(); + getLoopAnalysisUsage(AU); + } +}; +} + +PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, + AnalysisManager<Loop> &AM) { + const auto &FAM = + AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + Function *F = L.getHeader()->getParent(); + + // Use getCachedResult because Loop pass cannot trigger a function analysis. 
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); + auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); + auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F); + const auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F); + assert((LI && AC && TLI) && "Analyses for Loop Inst Simplify not available"); + + if (!SimplifyLoopInst(&L, DT, LI, AC, TLI)) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} + +char LoopInstSimplifyLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify", + "Simplify instructions in loops", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(LoopInstSimplifyLegacyPass, "loop-instsimplify", + "Simplify instructions in loops", false, false) + +Pass *llvm::createLoopInstSimplifyPass() { + return new LoopInstSimplifyLegacyPass(); +} diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 4295235a3f364..9241ec3652773 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" @@ -72,7 +71,7 @@ void printDepMatrix(CharMatrix &DepMatrix) { #endif static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, - Loop *L, DependenceAnalysis *DA) { + Loop *L, DependenceInfo *DI) { typedef SmallVector<Value *, 16> ValueVector; ValueVector MemInstr; @@ -117,7 +116,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, continue; if (isa<LoadInst>(Src) && isa<LoadInst>(Des)) continue; - if (auto D = DA->depends(Src, Des, true)) { + if (auto D = DI->depends(Src, Des, true)) { DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des << "\n"); if (D->isFlow()) { @@ -404,12 +403,9 @@ public: private: void splitInnerLoopLatch(Instruction *); - void splitOuterLoopLatch(); void splitInnerLoopHeader(); bool adjustLoopLinks(); void adjustLoopPreheaders(); - void adjustOuterLoopPreheader(); - void adjustInnerLoopPreheader(); bool adjustLoopBranches(); void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, BasicBlock *NewPred); @@ -430,11 +426,11 @@ struct LoopInterchange : public FunctionPass { static char ID; ScalarEvolution *SE; LoopInfo *LI; - DependenceAnalysis *DA; + DependenceInfo *DI; DominatorTree *DT; bool PreserveLCSSA; LoopInterchange() - : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { + : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); } @@ -443,15 +439,18 @@ struct LoopInterchange : public FunctionPass { AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<DependenceAnalysis>(); + AU.addRequired<DependenceAnalysisWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); } bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - DA = &getAnalysis<DependenceAnalysis>(); + DI = 
&getAnalysis<DependenceAnalysisWrapperPass>().getDI(); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); @@ -472,8 +471,7 @@ struct LoopInterchange : public FunctionPass { } bool isComputableLoopNest(LoopVector LoopList) { - for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) { - Loop *L = *I; + for (Loop *L : LoopList) { const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); if (ExitCountOuter == SE->getCouldNotCompute()) { DEBUG(dbgs() << "Couldn't compute Backedge count\n"); @@ -491,7 +489,7 @@ struct LoopInterchange : public FunctionPass { return true; } - unsigned selectLoopForInterchange(LoopVector LoopList) { + unsigned selectLoopForInterchange(const LoopVector &LoopList) { // TODO: Add a better heuristic to select the loop to be interchanged based // on the dependence matrix. Currently we select the innermost loop. return LoopList.size() - 1; @@ -515,7 +513,7 @@ struct LoopInterchange : public FunctionPass { << "\n"); if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(), - OuterMostLoop, DA)) { + OuterMostLoop, DI)) { DEBUG(dbgs() << "Populating Dependency matrix failed\n"); return false; } @@ -813,7 +811,6 @@ bool LoopInterchangeLegality::currentLimitations() { // A[j+1][i+2] = A[j][i]+k; // } // } - bool FoundInduction = false; Instruction *InnerIndexVarInc = nullptr; if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) InnerIndexVarInc = @@ -829,17 +826,17 @@ bool LoopInterchangeLegality::currentLimitations() { // we do not have any instruction between the induction variable and branch // instruction. - for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend(); - I != E && !FoundInduction; ++I) { - if (isa<BranchInst>(*I) || isa<CmpInst>(*I) || isa<TruncInst>(*I)) + bool FoundInduction = false; + for (const Instruction &I : reverse(*InnerLoopLatch)) { + if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I)) continue; - const Instruction &Ins = *I; // We found an instruction. If this is not induction variable then it is not // safe to split this loop latch. - if (!Ins.isIdenticalTo(InnerIndexVarInc)) + if (!I.isIdenticalTo(InnerIndexVarInc)) return true; - else - FoundInduction = true; + + FoundInduction = true; + break; } // The loop latch ended and we didn't find the induction variable return as // current limitation. @@ -903,8 +900,7 @@ int LoopInterchangeProfitability::getInstrOrderCost() { BadOrder = GoodOrder = 0; for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end(); BI != BE; ++BI) { - for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) { - const Instruction &Ins = *I; + for (Instruction &Ins : **BI) { if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { unsigned NumOp = GEP->getNumOperands(); bool FoundInnerInduction = false; @@ -1073,13 +1069,6 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); } -void LoopInterchangeTransform::splitOuterLoopLatch() { - BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); - BasicBlock *OuterLatchLcssaPhiBlock = OuterLoopLatch; - OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock, - OuterLoopLatch->getFirstNonPHI(), DT, LI); -} - void LoopInterchangeTransform::splitInnerLoopHeader() { // Split the inner loop header out. 
Here make sure that the reduction PHI's @@ -1102,8 +1091,7 @@ void LoopInterchangeTransform::splitInnerLoopHeader() { PHI->replaceAllUsesWith(V); PHIVec.push_back((PHI)); } - for (auto I = PHIVec.begin(), E = PHIVec.end(); I != E; ++I) { - PHINode *P = *I; + for (PHINode *P : PHIVec) { P->eraseFromParent(); } } else { @@ -1124,20 +1112,6 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { FromBB->getTerminator()->getIterator()); } -void LoopInterchangeTransform::adjustOuterLoopPreheader() { - BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); - BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader(); - - moveBBContents(OuterLoopPreHeader, InnerPreHeader->getTerminator()); -} - -void LoopInterchangeTransform::adjustInnerLoopPreheader() { - BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - BasicBlock *OuterHeader = OuterLoop->getHeader(); - - moveBBContents(InnerLoopPreHeader, OuterHeader->getTerminator()); -} - void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, BasicBlock *NewPred) { @@ -1234,8 +1208,7 @@ bool LoopInterchangeTransform::adjustLoopBranches() { PHINode *LcssaPhi = cast<PHINode>(I); LcssaVec.push_back(LcssaPhi); } - for (auto I = LcssaVec.begin(), E = LcssaVec.end(); I != E; ++I) { - PHINode *P = *I; + for (PHINode *P : LcssaVec) { Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch); P->replaceAllUsesWith(Incoming); P->eraseFromParent(); @@ -1294,11 +1267,11 @@ char LoopInterchange::ID = 0; INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp index 1064d088514d5..f29228c7659e2 100644 --- a/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <forward_list> @@ -61,7 +62,8 @@ struct StoreToLoadForwardingCandidate { /// \brief Return true if the dependence from the store to the load has a /// distance of one. E.g. A[i+1] = A[i] - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, + Loop *L) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadPtrType = LoadPtr->getType(); @@ -72,6 +74,13 @@ struct StoreToLoadForwardingCandidate { LoadType == StorePtr->getType()->getPointerElementType() && "Should be a known dependence"); + // Currently we only support accesses with unit stride. FIXME: we should be + // able to handle non unit stride as well as long as the stride is equal to + // the dependence distance.
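// Illustrative example (editorial sketch, not part of this patch): the
// supported pattern is a unit-stride store/load pair one element apart, e.g.
//   for (unsigned i = 0; i < N; ++i)
//     A[i+1] = A[i] + K;   // the store of iteration i feeds the load of i+1
// A stride-2 loop such as "A[i+2] = A[i]" with i += 2 has the same
// stride/distance ratio and could be handled once the FIXME above is resolved.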
+ if (getPtrStride(PSE, LoadPtr, L) != 1 || + getPtrStride(PSE, StorePtr, L) != 1) + return false; + auto &DL = Load->getParent()->getModule()->getDataLayout(); unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); @@ -83,7 +92,7 @@ struct StoreToLoadForwardingCandidate { auto *Dist = cast<SCEVConstant>( PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); const APInt &Val = Dist->getAPInt(); - return Val.abs() == TypeByteSize; + return Val == TypeByteSize; } Value *getLoadPtr() const { return Load->getPointerOperand(); } @@ -110,12 +119,17 @@ bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, }); } +/// \brief Return true if the load is not executed on all paths in the loop. +static bool isLoadConditional(LoadInst *Load, Loop *L) { + return Load->getParent() != L->getHeader(); +} + /// \brief The per-loop class that does most of the work. class LoadEliminationForLoop { public: LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, DominatorTree *DT) - : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {} + : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {} /// \brief Look through the loop-carried and loop-independent dependences in /// this loop and find store->load dependences. @@ -162,6 +176,12 @@ public: auto *Load = dyn_cast<LoadInst>(Destination); if (!Load) continue; + + // Only propagate the value if they are of the same type. + if (Store->getPointerOperand()->getType() != + Load->getPointerOperand()->getType()) + continue; + Candidates.emplace_front(Load, Store); } @@ -219,12 +239,12 @@ public: if (OtherCand == nullptr) continue; - // Handle the very basic of case when the two stores are in the same - // block so deciding which one forwards is easy. The later one forwards - // as long as they both have a dependence distance of one to the load. + // Handle the very basic case when the two stores are in the same block + // so deciding which one forwards is easy. The later one forwards as + // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE) && - OtherCand->isDependenceDistanceOfOne(PSE)) { + Cand.isDependenceDistanceOfOne(PSE, L) && + OtherCand->isDependenceDistanceOfOne(PSE, L)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -429,14 +449,21 @@ public: unsigned NumForwarding = 0; for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { DEBUG(dbgs() << "Candidate " << Cand); + // Make sure that the stored value is available everywhere in the loop in // the next iteration. if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT)) continue; + // If the load is conditional we can't hoist its 0-iteration instance to + // the preheader because that would make it unconditional. Thus we would + // access a memory location that the original loop did not access. + if (isLoadConditional(Cand.Load, L)) + continue; + // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration.
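// Editorial example (not part of this patch): for "A[i+1] = A[i]" with 32-bit
// elements, StorePtrSCEV - LoadPtrSCEV folds to the constant 4, which matches
// TypeByteSize, so isDependenceDistanceOfOne(), called just below, accepts
// the candidate.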
- if (!Cand.isDependenceDistanceOfOne(PSE)) + if (!Cand.isDependenceDistanceOfOne(PSE, L)) continue; ++NumForwarding; @@ -459,18 +486,25 @@ public: return false; } - if (LAI.PSE.getUnionPredicate().getComplexity() > + if (LAI.getPSE().getUnionPredicate().getComplexity() > LoadElimSCEVCheckThreshold) { DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); return false; } - // Point of no-return, start the transformation. First, version the loop if - // necessary. - if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) { + if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) { + if (L->getHeader()->getParent()->optForSize()) { + DEBUG(dbgs() << "Versioning is needed but not allowed when optimizing " + "for size.\n"); + return false; + } + + // Point of no-return, start the transformation. First, version the loop + // if necessary. + LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false); LV.setAliasChecks(std::move(Checks)); - LV.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LV.setSCEVChecks(LAI.getPSE().getUnionPredicate()); LV.versionLoop(); } @@ -508,8 +542,11 @@ public: } bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto *LAA = &getAnalysis<LoopAccessAnalysis>(); + auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); // Build up a worklist of inner-loops to vectorize. This is necessary as the @@ -526,7 +563,7 @@ public: // Now walk the identified inner loops. bool Changed = false; for (Loop *L : Worklist) { - const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); + const LoopAccessInfo &LAI = LAA->getInfo(L); // The actual work is performed by LoadEliminationForLoop. 
LoadEliminationForLoop LEL(L, LI, LAI, DT); Changed |= LEL.processLoop(); @@ -537,9 +574,10 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredID(LoopSimplifyID); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<LoopAccessAnalysis>(); + AU.addRequired<LoopAccessLegacyAnalysis>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); @@ -554,9 +592,10 @@ static const char LLE_name[] = "Loop Load Elimination"; INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) namespace llvm { diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index 27c2d8824df06..d2f1b66076a6c 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -14,7 +14,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -128,9 +128,8 @@ NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), namespace { enum IterationLimits { - /// The maximum number of iterations that we'll try and reroll. This - /// has to be less than 25 in order to fit into a SmallBitVector. - IL_MaxRerollIterations = 16, + /// The maximum number of iterations that we'll try and reroll. + IL_MaxRerollIterations = 32, /// The bitvector index used by loop induction variables and other /// instructions that belong to all iterations. IL_All, @@ -147,13 +146,8 @@ namespace { bool runOnLoop(Loop *L, LPPassManager &LPM) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + getLoopAnalysisUsage(AU); } protected: @@ -169,6 +163,9 @@ namespace { // Map between induction variable and its increment DenseMap<Instruction *, int64_t> IVToIncMap; + // For loop with multiple induction variable, remember the one used only to + // control the loop. + Instruction *LoopControlIV; // A chain of isomorphic instructions, identified by a single-use PHI // representing a reduction. Only the last value may be used outside the @@ -356,9 +353,11 @@ namespace { ScalarEvolution *SE, AliasAnalysis *AA, TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA, - DenseMap<Instruction *, int64_t> &IncrMap) + DenseMap<Instruction *, int64_t> &IncrMap, + Instruction *LoopCtrlIV) : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), - PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {} + PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap), + LoopControlIV(LoopCtrlIV) {} /// Stage 1: Find all the DAG roots for the induction variable. 
bool findRoots(); @@ -370,7 +369,7 @@ namespace { void replace(const SCEV *IterCount); protected: - typedef MapVector<Instruction*, SmallBitVector> UsesTy; + typedef MapVector<Instruction*, BitVector> UsesTy; bool findRootsRecursive(Instruction *IVU, SmallInstructionSet SubsumedInsts); @@ -396,6 +395,8 @@ namespace { bool instrDependsOn(Instruction *I, UsesTy::iterator Start, UsesTy::iterator End); + void replaceIV(Instruction *Inst, Instruction *IV, const SCEV *IterCount); + void updateNonLoopCtrlIncr(); LoopReroll *Parent; @@ -426,8 +427,18 @@ namespace { UsesTy Uses; // Map between induction variable and its increment DenseMap<Instruction *, int64_t> &IVToIncMap; + Instruction *LoopControlIV; }; + // Check if it is a compare-like instruction whose user is a branch + bool isCompareUsedByBranch(Instruction *I) { + auto *TI = I->getParent()->getTerminator(); + if (!isa<BranchInst>(TI) || !isa<CmpInst>(I)) + return false; + return I->hasOneUse() && TI->getOperand(0) == I; + }; + + bool isLoopControlIV(Loop *L, Instruction *IV); void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); void collectPossibleReductions(Loop *L, ReductionTracker &Reductions); @@ -438,10 +449,7 @@ namespace { char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) @@ -460,6 +468,110 @@ static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { return false; } +static const SCEVConstant *getIncrmentFactorSCEV(ScalarEvolution *SE, + const SCEV *SCEVExpr, + Instruction &IV) { + const SCEVMulExpr *MulSCEV = dyn_cast<SCEVMulExpr>(SCEVExpr); + + // If StepRecurrence of a SCEVExpr is a constant (c1 * c2, c2 = sizeof(ptr)), + // Return c1. + if (!MulSCEV && IV.getType()->isPointerTy()) + if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(SCEVExpr)) { + const PointerType *PTy = cast<PointerType>(IV.getType()); + Type *ElTy = PTy->getElementType(); + const SCEV *SizeOfExpr = + SE->getSizeOfExpr(SE->getEffectiveSCEVType(IV.getType()), ElTy); + if (IncSCEV->getValue()->getValue().isNegative()) { + const SCEV *NewSCEV = + SE->getUDivExpr(SE->getNegativeSCEV(SCEVExpr), SizeOfExpr); + return dyn_cast<SCEVConstant>(SE->getNegativeSCEV(NewSCEV)); + } else { + return dyn_cast<SCEVConstant>(SE->getUDivExpr(SCEVExpr, SizeOfExpr)); + } + } + + if (!MulSCEV) + return nullptr; + + // If StepRecurrence of a SCEVExpr is a c * sizeof(x), where c is constant, + // Return c. + const SCEVConstant *CIncSCEV = nullptr; + for (const SCEV *Operand : MulSCEV->operands()) { + if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Operand)) { + CIncSCEV = Constant; + } else if (const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Operand)) { + Type *AllocTy; + if (!Unknown->isSizeOf(AllocTy)) + break; + } else { + return nullptr; + } + } + return CIncSCEV; +} + +// Check if an IV is only used to control the loop. There are two cases: +// 1. It only has one use which is loop increment, and the increment is only +// used by comparison and the PHI (could has sext with nsw in between), and the +// comparison is only used by branch. +// 2. 
It is used by loop increment and the comparison, the loop increment is +// only used by the PHI, and the comparison is used only by the branch. +bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { + unsigned IVUses = IV->getNumUses(); + if (IVUses != 2 && IVUses != 1) + return false; + + for (auto *User : IV->users()) { + int32_t IncOrCmpUses = User->getNumUses(); + bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User)); + + // User can only have one or two uses. + if (IncOrCmpUses != 2 && IncOrCmpUses != 1) + return false; + + // Case 1 + if (IVUses == 1) { + // The only user must be the loop increment. + // The loop increment must have two uses. + if (IsCompInst || IncOrCmpUses != 2) + return false; + } + + // Case 2 + if (IVUses == 2 && IncOrCmpUses != 1) + return false; + + // The users of the IV must be a binary operation or a comparison + if (auto *BO = dyn_cast<BinaryOperator>(User)) { + if (BO->getOpcode() == Instruction::Add) { + // Loop Increment + // User of Loop Increment should be either PHI or CMP + for (auto *UU : User->users()) { + if (PHINode *PN = dyn_cast<PHINode>(UU)) { + if (PN != IV) + return false; + } + // Must be a CMP or an ext (of a value with nsw) then CMP + else { + Instruction *UUser = dyn_cast<Instruction>(UU); + // Skip SExt if we are extending an nsw value + // TODO: Allow ZExt too + if (BO->hasNoSignedWrap() && UUser && UUser->getNumUses() == 1 && + isa<SExtInst>(UUser)) + UUser = dyn_cast<Instruction>(*(UUser->user_begin())); + if (!isCompareUsedByBranch(UUser)) + return false; + } + } + } else + return false; + // Compare : can only have one use, and must be branch + } else if (!IsCompInst) + return false; + } + return true; +} + // Collect the list of loop induction variables with respect to which it might // be possible to reroll the loop. 
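// Illustrative example (editorial sketch, not part of this patch): with this
// change a pointer induction variable also qualifies, e.g.
//   for (int *p = a; p != e; p += 2) {
//     f(p);
//     f(p + 1);
//   }
// here the step of p is 2 * sizeof(int); getIncrmentFactorSCEV() divides by
// the element size to recover the increment factor of 2 used for rerolling.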
void LoopReroll::collectPossibleIVs(Loop *L, @@ -469,7 +581,7 @@ void LoopReroll::collectPossibleIVs(Loop *L, IE = Header->getFirstInsertionPt(); I != IE; ++I) { if (!isa<PHINode>(I)) continue; - if (!I->getType()->isIntegerTy()) + if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy()) continue; if (const SCEVAddRecExpr *PHISCEV = @@ -478,15 +590,27 @@ void LoopReroll::collectPossibleIVs(Loop *L, continue; if (!PHISCEV->isAffine()) continue; - if (const SCEVConstant *IncSCEV = - dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { - const APInt &AInt = IncSCEV->getAPInt().abs(); + const SCEVConstant *IncSCEV = nullptr; + if (I->getType()->isPointerTy()) + IncSCEV = + getIncrmentFactorSCEV(SE, PHISCEV->getStepRecurrence(*SE), *I); + else + IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE)); + if (IncSCEV) { + const APInt &AInt = IncSCEV->getValue()->getValue().abs(); if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc)) continue; IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue(); DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV << "\n"); - PossibleIVs.push_back(&*I); + + if (isLoopControlIV(L, &*I)) { + assert(!LoopControlIV && "Found two loop control only IV"); + LoopControlIV = &(*I); + DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I << " = " + << *PHISCEV << "\n"); + } else + PossibleIVs.push_back(&*I); } } } @@ -611,9 +735,8 @@ void LoopReroll::DAGRootTracker::collectInLoopUserSet( const SmallInstructionSet &Exclude, const SmallInstructionSet &Final, DenseSet<Instruction *> &Users) { - for (SmallInstructionVector::const_iterator I = Roots.begin(), - IE = Roots.end(); I != IE; ++I) - collectInLoopUserSet(*I, Exclude, Final, Users); + for (Instruction *Root : Roots) + collectInLoopUserSet(Root, Exclude, Final, Users); } static bool isSimpleLoadStore(Instruction *I) { @@ -651,10 +774,12 @@ static bool isSimpleArithmeticOp(User *IVU) { static bool isLoopIncrement(User *U, Instruction *IV) { BinaryOperator *BO = dyn_cast<BinaryOperator>(U); - if (!BO || BO->getOpcode() != Instruction::Add) + + if ((BO && BO->getOpcode() != Instruction::Add) || + (!BO && !isa<GetElementPtrInst>(U))) return false; - for (auto *UU : BO->users()) { + for (auto *UU : U->users()) { PHINode *PN = dyn_cast<PHINode>(UU); if (PN && PN == IV) return true; @@ -1031,6 +1156,33 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { Uses[I].set(IL_All); } + // Make sure we mark loop-control-only PHIs as used in all iterations. See + // comment above LoopReroll::isLoopControlIV for more information. + BasicBlock *Header = L->getHeader(); + if (LoopControlIV && LoopControlIV != IV) { + for (auto *U : LoopControlIV->users()) { + Instruction *IVUser = dyn_cast<Instruction>(U); + // IVUser could be loop increment or compare + Uses[IVUser].set(IL_All); + for (auto *UU : IVUser->users()) { + Instruction *UUser = dyn_cast<Instruction>(UU); + // UUser could be compare, PHI or branch + Uses[UUser].set(IL_All); + // Skip SExt + if (isa<SExtInst>(UUser)) { + UUser = dyn_cast<Instruction>(*(UUser->user_begin())); + Uses[UUser].set(IL_All); + } + // Is UUser a compare instruction? + if (UU->hasOneUse()) { + Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin()); + if (BI == cast<BranchInst>(Header->getTerminator())) + Uses[BI].set(IL_All); + } + } + } + } + // Make sure all instructions in the loop are in one and only one // set. 
for (auto &KV : Uses) { @@ -1272,61 +1424,136 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { ++J; } - bool Negative = IVToIncMap[IV] < 0; - const DataLayout &DL = Header->getModule()->getDataLayout(); - // We need to create a new induction variable for each different BaseInst. - for (auto &DRS : RootSets) { - // Insert the new induction variable. - const SCEVAddRecExpr *RealIVSCEV = - cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); - const SCEV *Start = RealIVSCEV->getStart(); - const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr( - Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L, - SCEV::FlagAnyWrap)); - { // Limit the lifetime of SCEVExpander. - SCEVExpander Expander(*SE, DL, "reroll"); - Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front()); - - for (auto &KV : Uses) { - if (KV.second.find_first() == 0) - KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV); - } + bool HasTwoIVs = LoopControlIV && LoopControlIV != IV; + + if (HasTwoIVs) { + updateNonLoopCtrlIncr(); + replaceIV(LoopControlIV, LoopControlIV, IterCount); + } else + // We need to create a new induction variable for each different BaseInst. + for (auto &DRS : RootSets) + // Insert the new induction variable. + replaceIV(DRS.BaseInst, IV, IterCount); - if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { - // FIXME: Why do we need this check? - if (Uses[BI].find_first() == IL_All) { - const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + SimplifyInstructionsInBlock(Header, TLI); + DeleteDeadPHIs(Header, TLI); +} - // Iteration count SCEV minus 1 - const SCEV *ICMinus1SCEV = SE->getMinusSCEV( - ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1)); +// For non-loop-control IVs, we only need to update the last increment +// with right amount, then we are done. 
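// Editorial example (not part of this patch): if the body was unrolled by a
// factor (Scale) of 2 and a secondary IV advanced by 8 per iteration of the
// unrolled loop, it must only advance by 8 / 2 = 4 in the rerolled loop; that
// division by Scale is what updateNonLoopCtrlIncr() performs below.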
+void LoopReroll::DAGRootTracker::updateNonLoopCtrlIncr() { + const SCEV *NewInc = nullptr; + for (auto *LoopInc : LoopIncs) { + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LoopInc); + const SCEVConstant *COp = nullptr; + if (GEP && LoopInc->getOperand(0)->getType()->isPointerTy()) { + COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1))); + } else { + COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(0))); + if (!COp) + COp = dyn_cast<SCEVConstant>(SE->getSCEV(LoopInc->getOperand(1))); + } - Value *ICMinus1; // Iteration count minus 1 - if (isa<SCEVConstant>(ICMinus1SCEV)) { - ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); - } else { - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Preheader) - Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); + assert(COp && "Didn't find constant operand of LoopInc!\n"); - ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), - Preheader->getTerminator()); - } + const APInt &AInt = COp->getValue()->getValue(); + const SCEV *ScaleSCEV = SE->getConstant(COp->getType(), Scale); + if (AInt.isNegative()) { + NewInc = SE->getNegativeSCEV(COp); + NewInc = SE->getUDivExpr(NewInc, ScaleSCEV); + NewInc = SE->getNegativeSCEV(NewInc); + } else + NewInc = SE->getUDivExpr(COp, ScaleSCEV); + + LoopInc->setOperand(1, dyn_cast<SCEVConstant>(NewInc)->getValue()); + } +} - Value *Cond = - new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond"); - BI->setCondition(Cond); +void LoopReroll::DAGRootTracker::replaceIV(Instruction *Inst, + Instruction *InstIV, + const SCEV *IterCount) { + BasicBlock *Header = L->getHeader(); + int64_t Inc = IVToIncMap[InstIV]; + bool NeedNewIV = InstIV == LoopControlIV; + bool Negative = !NeedNewIV && Inc < 0; + + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(Inst)); + const SCEV *Start = RealIVSCEV->getStart(); + + if (NeedNewIV) + Start = SE->getConstant(Start->getType(), 0); + + const SCEV *SizeOfExpr = nullptr; + const SCEV *IncrExpr = + SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1); + if (auto *PTy = dyn_cast<PointerType>(Inst->getType())) { + Type *ElTy = PTy->getElementType(); + SizeOfExpr = + SE->getSizeOfExpr(SE->getEffectiveSCEVType(Inst->getType()), ElTy); + IncrExpr = SE->getMulExpr(IncrExpr, SizeOfExpr); + } + const SCEV *NewIVSCEV = + SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap); + + { // Limit the lifetime of SCEVExpander. + const DataLayout &DL = Header->getModule()->getDataLayout(); + SCEVExpander Expander(*SE, DL, "reroll"); + Value *NewIV = + Expander.expandCodeFor(NewIVSCEV, InstIV->getType(), &Header->front()); + + for (auto &KV : Uses) + if (KV.second.find_first() == 0) + KV.first->replaceUsesOfWith(Inst, NewIV); + + if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { + // FIXME: Why do we need this check? + if (Uses[BI].find_first() == IL_All) { + const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + + if (NeedNewIV) + ICSCEV = SE->getMulExpr(IterCount, + SE->getConstant(IterCount->getType(), Scale)); + + // Iteration count SCEV minus or plus 1 + const SCEV *MinusPlus1SCEV = + SE->getConstant(ICSCEV->getType(), Negative ? 
-1 : 1); + if (Inst->getType()->isPointerTy()) { + assert(SizeOfExpr && "SizeOfExpr is not initialized"); + MinusPlus1SCEV = SE->getMulExpr(MinusPlus1SCEV, SizeOfExpr); + } - if (BI->getSuccessor(1) != Header) - BI->swapSuccessors(); + const SCEV *ICMinusPlus1SCEV = SE->getMinusSCEV(ICSCEV, MinusPlus1SCEV); + // Iteration count minus 1 + Instruction *InsertPtr = nullptr; + if (isa<SCEVConstant>(ICMinusPlus1SCEV)) { + InsertPtr = BI; + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); + InsertPtr = Preheader->getTerminator(); } + + if (!isa<PointerType>(NewIV->getType()) && NeedNewIV && + (SE->getTypeSizeInBits(NewIV->getType()) < + SE->getTypeSizeInBits(ICMinusPlus1SCEV->getType()))) { + IRBuilder<> Builder(BI); + Builder.SetCurrentDebugLocation(BI->getDebugLoc()); + NewIV = Builder.CreateSExt(NewIV, ICMinusPlus1SCEV->getType()); + } + Value *ICMinusPlus1 = Expander.expandCodeFor( + ICMinusPlus1SCEV, NewIV->getType(), InsertPtr); + + Value *Cond = + new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinusPlus1, "exitcond"); + BI->setCondition(Cond); + + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); } } } - - SimplifyInstructionsInBlock(Header, TLI); - DeleteDeadPHIs(Header, TLI); } // Validate the selected reductions. All iterations must have an isomorphic @@ -1334,9 +1561,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { // entries must appear in order. bool LoopReroll::ReductionTracker::validateSelected() { // For a non-associative reduction, the chain entries must appear in order. - for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; + for (int i : Reds) { int PrevIter = 0, BaseCount = 0, Count = 0; for (Instruction *J : PossibleReds[i]) { // Note that all instructions in the chain must have been found because @@ -1380,9 +1605,7 @@ bool LoopReroll::ReductionTracker::validateSelected() { void LoopReroll::ReductionTracker::replaceSelected() { // Fixup reductions to refer to the last instruction associated with the // first iteration (not the last). 
- for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; + for (int i : Reds) { int j = 0; for (int e = PossibleReds[i].size(); j != e; ++j) if (PossibleRedIter[PossibleReds[i][j]] != 0) { @@ -1396,9 +1619,8 @@ void LoopReroll::ReductionTracker::replaceSelected() { Users.push_back(cast<Instruction>(U)); } - for (SmallInstructionVector::iterator J = Users.begin(), - JE = Users.end(); J != JE; ++J) - (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), + for (Instruction *User : Users) + User->replaceUsesOfWith(PossibleReds[i].getReducedValue(), PossibleReds[i][j]); } } @@ -1450,7 +1672,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions) { DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, - IVToIncMap); + IVToIncMap, LoopControlIV); if (!DAGRoots.findRoots()) return false; @@ -1472,7 +1694,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, } bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) + if (skipLoop(L)) return false; AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); @@ -1487,41 +1709,46 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { "] Loop %" << Header->getName() << " (" << L->getNumBlocks() << " block(s))\n"); - bool Changed = false; - // For now, we'll handle only single BB loops. if (L->getNumBlocks() > 1) - return Changed; + return false; if (!SE->hasLoopInvariantBackedgeTakenCount(L)) - return Changed; + return false; const SCEV *LIBETC = SE->getBackedgeTakenCount(L); const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType())); + DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n"); DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); // First, we need to find the induction variable with respect to which we can // reroll (there may be several possible options). SmallInstructionVector PossibleIVs; IVToIncMap.clear(); + LoopControlIV = nullptr; collectPossibleIVs(L, PossibleIVs); if (PossibleIVs.empty()) { DEBUG(dbgs() << "LRR: No possible IVs found\n"); - return Changed; + return false; } ReductionTracker Reductions; collectPossibleReductions(L, Reductions); + bool Changed = false; // For each possible IV, collect the associated possible set of 'root' nodes // (i+1, i+2, etc.). - for (SmallInstructionVector::iterator I = PossibleIVs.begin(), - IE = PossibleIVs.end(); I != IE; ++I) - if (reroll(*I, L, Header, IterCount, Reductions)) { + for (Instruction *PossibleIV : PossibleIVs) + if (reroll(PossibleIV, L, Header, IterCount, Reductions)) { Changed = true; break; } + DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n"); + + // Trip count of L has changed so SE must be re-evaluated. 
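// Editorial example (not part of this patch): rerolling multiplies the trip
// count by Scale, so a 4x-unrolled body that ran 100 times now runs 400
// times; the backedge-taken count cached by ScalarEvolution is therefore
// stale and is dropped via SE->forgetLoop(L) below.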
+ if (Changed) + SE->forgetLoop(L); return Changed; } diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 5e6c2da08cc32..7a06a25a7073e 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -20,6 +20,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -32,20 +33,46 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; #define DEBUG_TYPE "loop-rotate" -static cl::opt<unsigned> -DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, - cl::desc("The default maximum header size for automatic loop rotation")); +static cl::opt<unsigned> DefaultRotationThreshold( + "rotation-max-header-size", cl::init(16), cl::Hidden, + cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); +namespace { +/// A simple loop rotation transformation. +class LoopRotate { + const unsigned MaxHeaderSize; + LoopInfo *LI; + const TargetTransformInfo *TTI; + AssumptionCache *AC; + DominatorTree *DT; + ScalarEvolution *SE; + +public: + LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE) + : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) { + } + bool processLoop(Loop *L); + +private: + bool rotateLoop(Loop *L, bool SimplifiedLatch); + bool simplifyLoopLatch(Loop *L); +}; +} // end anonymous namespace + /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the /// old header into the preheader. If there were uses of the values produced by /// these instruction that were outside of the loop, we have to insert PHI nodes @@ -69,7 +96,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, if (OrigHeaderVal->use_empty()) continue; - Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal]; + Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); // The value now exits in two versions: the initial value in the preheader // and the loop "next" value in the original header. @@ -79,7 +106,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // Visit each use of the OrigHeader instruction. for (Value::use_iterator UI = OrigHeaderVal->use_begin(), - UE = OrigHeaderVal->use_end(); UI != UE; ) { + UE = OrigHeaderVal->use_end(); + UI != UE;) { // Grab the use before incrementing the iterator. Use &U = *UI; @@ -108,6 +136,41 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // Anything else can be handled by SSAUpdater. 
SSA.RewriteUse(U); } + + // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug + // intrinsics. + LLVMContext &C = OrigHeader->getContext(); + if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) { + if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) { + for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) { + // Grab the use before incrementing the iterator. Otherwise, altering + // the Use will invalidate the iterator. + Use &U = *UI++; + DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser()); + if (!UserInst) + continue; + + // The original users in the OrigHeader are already using the original + // definitions. + BasicBlock *UserBB = UserInst->getParent(); + if (UserBB == OrigHeader) + continue; + + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped and anything else can be handled by + // the SSAUpdater. To avoid adding PHINodes, check if the value is + // available in UserBB, if not substitute undef. + Value *NewVal; + if (UserBB == OrigPreheader) + NewVal = OrigPreHeaderVal; + else if (SSA.HasValueForBlock(UserBB)) + NewVal = SSA.GetValueInMiddleOfBlock(UserBB); + else + NewVal = UndefValue::get(OrigHeaderVal->getType()); + U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal)); + } + } + } } } @@ -121,10 +184,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - DominatorTree *DT, ScalarEvolution *SE, - bool SimplifiedLatch) { +bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -162,7 +222,14 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); if (Metrics.notDuplicatable) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" - << " instructions: "; L->dump()); + << " instructions: "; + L->dump()); + return false; + } + if (Metrics.convergent) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); return false; } if (Metrics.NumInsts > MaxHeaderSize) @@ -225,10 +292,9 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, // executing in each iteration of the loop. This means it is safe to hoist // something that might trap, but isn't safe to hoist something that reads // memory (without proving that the loop doesn't write). - if (L->hasLoopInvariantOperands(Inst) && - !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() && - !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst) && - !isa<AllocaInst>(Inst)) { + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) && + !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) { Inst->moveBefore(LoopEntryBranch); continue; } @@ -238,7 +304,7 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, // Eagerly remap the operands of the instruction. 
RemapInstruction(C, ValueMap, - RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI @@ -248,13 +314,18 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. - delete C; ValueMap[Inst] = V; + if (!C->mayHaveSideEffects()) { + delete C; + C = nullptr; + } } else { + ValueMap[Inst] = C; + } + if (C) { // Otherwise, stick the new instruction into the new block! C->setName(Inst->getName()); C->insertBefore(LoopEntryBranch); - ValueMap[Inst] = C; } } @@ -280,7 +351,6 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, L->moveToHeader(NewHeader); assert(L->getHeader() == NewHeader && "Latch block is our new header"); - // At this point, we've finished our major CFG changes. As part of cloning // the loop into the preheader we've simplified instructions and the // duplicated conditional branch may now be branching on a constant. If it is @@ -291,8 +361,8 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); assert(PHBI->isConditional() && "Should be clone of BI condbr!"); if (!isa<ConstantInt>(PHBI->getCondition()) || - PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) - != NewHeader) { + PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) != + NewHeader) { // The conditional branch can't be folded, handle the general case. // Update DominatorTree to reflect the CFG change we just made. Then split // edges as necessary to preserve LoopSimplify form. @@ -329,18 +399,17 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, // be split. SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit)); bool SplitLatchEdge = false; - for (SmallVectorImpl<BasicBlock *>::iterator PI = ExitPreds.begin(), - PE = ExitPreds.end(); - PI != PE; ++PI) { + for (BasicBlock *ExitPred : ExitPreds) { // We only need to split loop exit edges. - Loop *PredLoop = LI->getLoopFor(*PI); + Loop *PredLoop = LI->getLoopFor(ExitPred); if (!PredLoop || PredLoop->contains(Exit)) continue; - if (isa<IndirectBrInst>((*PI)->getTerminator())) + if (isa<IndirectBrInst>(ExitPred->getTerminator())) continue; - SplitLatchEdge |= L->getLoopLatch() == *PI; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; BasicBlock *ExitSplit = SplitCriticalEdge( - *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); ExitSplit->moveBefore(Exit); } assert(SplitLatchEdge && @@ -384,8 +453,8 @@ static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, } } - // If the dominator changed, this may have an effect on other - // predecessors, continue until we reach a fixpoint. + // If the dominator changed, this may have an effect on other + // predecessors, continue until we reach a fixpoint. } while (Changed); } } @@ -432,7 +501,7 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, // GEPs are cheap if all indices are constant. 
if (!cast<GEPOperator>(I)->hasAllConstantIndices()) return false; - // fall-thru to increment case + // fall-thru to increment case case Instruction::Add: case Instruction::Sub: case Instruction::And: @@ -441,11 +510,10 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: { - Value *IVOpnd = !isa<Constant>(I->getOperand(0)) - ? I->getOperand(0) - : !isa<Constant>(I->getOperand(1)) - ? I->getOperand(1) - : nullptr; + Value *IVOpnd = + !isa<Constant>(I->getOperand(0)) + ? I->getOperand(0) + : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr; if (!IVOpnd) return false; @@ -482,7 +550,7 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, /// canonical form so downstream passes can handle it. /// /// I don't believe this invalidates SCEV. -static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) { +bool LoopRotate::simplifyLoopLatch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); if (!Latch || Latch->hasAddressTaken()) return false; @@ -503,7 +571,7 @@ static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) { return false; DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " - << LastExit->getName() << "\n"); + << LastExit->getName() << "\n"); // Hoist the instructions from Latch into LastExit. LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), @@ -527,26 +595,19 @@ static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) { return true; } -/// Rotate \c L as many times as possible. Return true if the loop is rotated -/// at least once. -static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, - const TargetTransformInfo *TTI, - AssumptionCache *AC, DominatorTree *DT, - ScalarEvolution *SE) { +/// Rotate \c L, and return true if any modification was made. +bool LoopRotate::processLoop(Loop *L) { // Save the loop metadata. MDNode *LoopMD = L->getLoopID(); // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the // loop exit. - bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT); + bool SimplifiedLatch = simplifyLoopLatch(L); - // One loop can be rotated multiple times. - bool MadeChange = false; - while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) { - MadeChange = true; - SimplifiedLatch = false; - } + bool MadeChange = rotateLoop(L, SimplifiedLatch); + assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) && + "Loop latch should be exiting after loop-rotate."); // Restore the loop metadata. // NB! We presume LoopRotation DOESN'T ADD its own metadata. @@ -556,15 +617,37 @@ static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, return MadeChange; } +LoopRotatePass::LoopRotatePass() {} + +PreservedAnalyses LoopRotatePass::run(Loop &L, AnalysisManager<Loop> &AM) { + auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + Function *F = L.getHeader()->getParent(); + + auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); + const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F); + auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F); + assert((LI && TTI && AC) && "Analyses for loop rotation not available"); + + // Optional analyses. 
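// Editorial note (assumption, not part of the patch): unlike LI, TTI and AC,
// which are asserted above, DT and SE are treated as optional here and may be
// null, mirroring the legacy pass below which fetches them with
// getAnalysisIfAvailable<DominatorTreeWrapperPass>() and
// getAnalysisIfAvailable<ScalarEvolutionWrapperPass>() and passes nullptr
// through to LoopRotate when they are absent.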
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); + auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); + LoopRotate LR(DefaultRotationThreshold, LI, TTI, AC, DT, SE); + + bool Changed = LR.processLoop(&L); + if (!Changed) + return PreservedAnalyses::all(); + return getLoopPassPreservedAnalyses(); +} + namespace { -class LoopRotate : public LoopPass { +class LoopRotateLegacyPass : public LoopPass { unsigned MaxHeaderSize; public: static char ID; // Pass ID, replacement for typeid - LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { - initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { + initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry()); if (SpecifiedMaxHeaderSize == -1) MaxHeaderSize = DefaultRotationThreshold; else @@ -573,24 +656,13 @@ public: // LCSSA form makes instruction renaming easier. void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved<AAResultsWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); + getLoopAnalysisUsage(AU); } bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipOptnoneFunction(L)) + if (skipLoop(L)) return false; Function &F = *L->getHeader()->getParent(); @@ -601,24 +673,21 @@ public: auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); auto *SE = SEWP ? 
&SEWP->getSE() : nullptr; - - return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE); + LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE); + return LR.processLoop(L); } }; } -char LoopRotate::ID = 0; -INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +char LoopRotateLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", + false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false, + false) Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { - return new LoopRotate(MaxHeaderSize); + return new LoopRotateLegacyPass(MaxHeaderSize); } diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp new file mode 100644 index 0000000000000..ec227932c09e5 --- /dev/null +++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -0,0 +1,114 @@ +//===--------- LoopSimplifyCFG.cpp - Loop CFG Simplification Pass ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Loop SimplifyCFG Pass. This pass is responsible for +// basic loop CFG cleanup, primarily to assist other loop passes. If you +// encounter a noncanonical CFG construct that causes another loop pass to +// perform suboptimally, this is the place to fix it up. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopPassManager.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +using namespace llvm; + +#define DEBUG_TYPE "loop-simplifycfg" + +static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) { + bool Changed = false; + // Copy blocks into a temporary array to avoid iterator invalidation issues + // as we remove them. + SmallVector<WeakVH, 16> Blocks(L.blocks()); + + for (auto &Block : Blocks) { + // Attempt to merge blocks in the trivial case. Don't modify blocks which + // belong to other loops. 
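// Illustrative example (editorial sketch, not part of this patch): the
// trivial case merged here is a straight-line pair inside the loop,
//   Pred:                ; single successor, same loop
//     br label %Succ
//   Succ:                ; single predecessor
//     ...
// Pred's instructions are folded into Succ and Pred is erased, moving the
// loop header to Succ if Pred happened to be the header.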
+ BasicBlock *Succ = cast_or_null<BasicBlock>(Block); + if (!Succ) + continue; + + BasicBlock *Pred = Succ->getSinglePredecessor(); + if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L) + continue; + + // Pred is going to disappear, so we need to update the loop info. + if (L.getHeader() == Pred) + L.moveToHeader(Succ); + LI.removeBlock(Pred); + MergeBasicBlockIntoOnlyPred(Succ, &DT); + Changed = true; + } + + return Changed; +} + +PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, AnalysisManager<Loop> &AM) { + const auto &FAM = + AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + Function *F = L.getHeader()->getParent(); + + auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); + auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); + assert((LI && DT) && "Analyses for LoopSimplifyCFG not available"); + + if (!simplifyLoopCFG(L, *DT, *LI)) + return PreservedAnalyses::all(); + return getLoopPassPreservedAnalyses(); +} + +namespace { +class LoopSimplifyCFGLegacyPass : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + LoopSimplifyCFGLegacyPass() : LoopPass(ID) { + initializeLoopSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &) override { + if (skipLoop(L)) + return false; + + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + return simplifyLoopCFG(*L, DT, LI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DependenceAnalysisWrapperPass>(); + getLoopAnalysisUsage(AU); + } +}; +} + +char LoopSimplifyCFGLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg", + "Simplify loop CFG", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_END(LoopSimplifyCFGLegacyPass, "loop-simplifycfg", + "Simplify loop CFG", false, false) + +Pass *llvm::createLoopSimplifyCFGPass() { + return new LoopSimplifyCFGLegacyPass(); +} diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index acfdec43d21ae..77c77eb7d798c 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -684,10 +684,6 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::prefetch: - case Intrinsic::x86_sse_storeu_ps: - case Intrinsic::x86_sse2_storeu_pd: - case Intrinsic::x86_sse2_storeu_dq: - case Intrinsic::x86_sse2_storel_dq: if (II->getArgOperand(0) == OperandVal) isAddress = true; break; @@ -704,18 +700,6 @@ static MemAccessTy getAccessType(const Instruction *Inst) { AccessTy.AddrSpace = SI->getPointerAddressSpace(); } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { AccessTy.AddrSpace = LI->getPointerAddressSpace(); - } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { - // Addressing modes can also be folded into prefetches and a variety - // of intrinsics. 
- switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::x86_sse_storeu_ps: - case Intrinsic::x86_sse2_storeu_pd: - case Intrinsic::x86_sse2_storeu_dq: - case Intrinsic::x86_sse2_storel_dq: - AccessTy.MemTy = II->getArgOperand(0)->getType(); - break; - } } // All pointers have the same requirements, so canonicalize them to an @@ -963,8 +947,8 @@ void Cost::RateRegister(const SCEV *Reg, isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart())))) ++SetupCost; - NumIVMuls += isa<SCEVMulExpr>(Reg) && - SE.hasComputableLoopEvolution(Reg, L); + NumIVMuls += isa<SCEVMulExpr>(Reg) && + SE.hasComputableLoopEvolution(Reg, L); } /// Record this register in the set. If we haven't seen it before, rate @@ -2752,34 +2736,31 @@ void LSRInstance::CollectChains() { LatchPath.push_back(LoopHeader); // Walk the instruction stream from the loop header to the loop latch. - for (SmallVectorImpl<BasicBlock *>::reverse_iterator - BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend(); - BBIter != BBEnd; ++BBIter) { - for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); - I != E; ++I) { + for (BasicBlock *BB : reverse(LatchPath)) { + for (Instruction &I : *BB) { // Skip instructions that weren't seen by IVUsers analysis. - if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I)) + if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I)) continue; // Ignore users that are part of a SCEV expression. This way we only // consider leaf IV Users. This effectively rediscovers a portion of // IVUsers analysis but in program order this time. - if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I))) + if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I))) continue; // Remove this instruction from any NearUsers set it may be in. for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); ChainIdx < NChains; ++ChainIdx) { - ChainUsersVec[ChainIdx].NearUsers.erase(&*I); + ChainUsersVec[ChainIdx].NearUsers.erase(&I); } // Search for operands that can be chained. SmallPtrSet<Instruction*, 4> UniqueOperands; - User::op_iterator IVOpEnd = I->op_end(); - User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE); + User::op_iterator IVOpEnd = I.op_end(); + User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE); while (IVOpIter != IVOpEnd) { Instruction *IVOpInst = cast<Instruction>(*IVOpIter); if (UniqueOperands.insert(IVOpInst).second) - ChainInstruction(&*I, IVOpInst, ChainUsersVec); + ChainInstruction(&I, IVOpInst, ChainUsersVec); IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } } // Continue walking down the instructions. @@ -4331,28 +4312,15 @@ BasicBlock::iterator LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, const SmallVectorImpl<Instruction *> &Inputs) const { + Instruction *Tentative = &*IP; for (;;) { - const Loop *IPLoop = LI.getLoopFor(IP->getParent()); - unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; - - BasicBlock *IDom; - for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) { - if (!Rung) return IP; - Rung = Rung->getIDom(); - if (!Rung) return IP; - IDom = Rung->getBlock(); - - // Don't climb into a loop though. - const Loop *IDomLoop = LI.getLoopFor(IDom); - unsigned IDomDepth = IDomLoop ? 
IDomLoop->getLoopDepth() : 0; - if (IDomDepth <= IPLoopDepth && - (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) - break; - } - bool AllDominate = true; Instruction *BetterPos = nullptr; - Instruction *Tentative = IDom->getTerminator(); + // Don't bother attempting to insert before a catchswitch, their basic block + // cannot have other non-PHI instructions. + if (isa<CatchSwitchInst>(Tentative)) + return IP; + for (Instruction *Inst : Inputs) { if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { AllDominate = false; @@ -4360,7 +4328,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, } // Attempt to find an insert position in the middle of the block, // instead of at the end, so that it can be used for other expansions. - if (IDom == Inst->getParent() && + if (Tentative->getParent() == Inst->getParent() && (!BetterPos || !DT.dominates(Inst, BetterPos))) BetterPos = &*std::next(BasicBlock::iterator(Inst)); } @@ -4370,6 +4338,26 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, IP = BetterPos->getIterator(); else IP = Tentative->getIterator(); + + const Loop *IPLoop = LI.getLoopFor(IP->getParent()); + unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; + + BasicBlock *IDom; + for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) { + if (!Rung) return IP; + Rung = Rung->getIDom(); + if (!Rung) return IP; + IDom = Rung->getBlock(); + + // Don't climb into a loop though. + const Loop *IDomLoop = LI.getLoopFor(IDom); + unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; + if (IDomDepth <= IPLoopDepth && + (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) + break; + } + + Tentative = IDom->getTerminator(); } return IP; @@ -4426,7 +4414,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, while (isa<PHINode>(IP)) ++IP; // Ignore landingpad instructions. - while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP; + while (IP->isEHPad()) ++IP; // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -4961,7 +4949,7 @@ INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(IVUsers) +INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", @@ -4991,16 +4979,16 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // Requiring LoopSimplify a second time here prevents IVUsers from running // twice, since LoopSimplify was invalidated by running ScalarEvolution. 
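In the reworked HoistInsertPosition above, the climb up the dominator tree now happens after the dominance checks, and a candidate immediate dominator is only accepted when hoisting to it would not move the insertion point into a deeper or sibling loop. A standalone sketch of just that acceptance test (acceptableHoistTarget is a hypothetical name; depth 0 means "outside any loop"):

// Accept the candidate block only if it is in a shallower loop, or at the
// same depth within the very same loop; otherwise keep climbing the
// dominator tree. This mirrors the "don't climb into a loop" check above.
static bool acceptableHoistTarget(unsigned CurDepth, unsigned CandidateDepth,
                                  bool SameLoop) {
  return CandidateDepth < CurDepth || (CandidateDepth == CurDepth && SameLoop);
}

// e.g. a candidate at depth 1 is rejected when the insertion point sits at
// depth 1 of a *different* loop: acceptableHoistTarget(1, 1, false) == false.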
AU.addRequiredID(LoopSimplifyID); - AU.addRequired<IVUsers>(); - AU.addPreserved<IVUsers>(); + AU.addRequired<IVUsersWrapperPass>(); + AU.addPreserved<IVUsersWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { - if (skipOptnoneFunction(L)) + if (skipLoop(L)) return false; - auto &IU = getAnalysis<IVUsers>(); + auto &IU = getAnalysis<IVUsersWrapperPass>().getIU(); auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index ecef6dbe24e64..91af4a1922ce1 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -12,13 +12,13 @@ // counts of loops easily. //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopUnrollAnalyzer.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -31,8 +31,11 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <climits> +#include <utility> using namespace llvm; @@ -43,40 +46,54 @@ static cl::opt<unsigned> cl::desc("The baseline cost threshold for loop unrolling")); static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold( - "unroll-percent-dynamic-cost-saved-threshold", cl::Hidden, + "unroll-percent-dynamic-cost-saved-threshold", cl::init(50), cl::Hidden, cl::desc("The percentage of estimated dynamic cost which must be saved by " "unrolling to allow unrolling up to the max threshold.")); static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount( - "unroll-dynamic-cost-savings-discount", cl::Hidden, + "unroll-dynamic-cost-savings-discount", cl::init(100), cl::Hidden, cl::desc("This is the amount discounted from the total unroll cost when " "the unrolled form has a high dynamic cost savings (triggered by " "the '-unroll-perecent-dynamic-cost-saved-threshold' flag).")); static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( - "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden, + "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, cl::desc("Don't allow loop unrolling to simulate more than this number of" "iterations when checking full unroll profitability")); -static cl::opt<unsigned> -UnrollCount("unroll-count", cl::Hidden, - cl::desc("Use this unroll count for all loops including those with " - "unroll_count pragma values, for testing purposes")); +static cl::opt<unsigned> UnrollCount( + "unroll-count", cl::Hidden, + cl::desc("Use this unroll count for all loops including those with " + "unroll_count pragma values, for testing purposes")); -static cl::opt<bool> -UnrollAllowPartial("unroll-allow-partial", cl::Hidden, - cl::desc("Allows loops to be partially unrolled until " - "-unroll-threshold loop size is 
reached.")); +static cl::opt<unsigned> UnrollMaxCount( + "unroll-max-count", cl::Hidden, + cl::desc("Set the max unroll count for partial and runtime unrolling, for" + "testing purposes")); + +static cl::opt<unsigned> UnrollFullMaxCount( + "unroll-full-max-count", cl::Hidden, + cl::desc( + "Set the max unroll count for full unrolling, for testing purposes")); static cl::opt<bool> -UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, - cl::desc("Unroll loops with run-time trip counts")); + UnrollAllowPartial("unroll-allow-partial", cl::Hidden, + cl::desc("Allows loops to be partially unrolled until " + "-unroll-threshold loop size is reached.")); -static cl::opt<unsigned> -PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, - cl::desc("Unrolled size limit for loops with an unroll(full) or " - "unroll_count pragma.")); +static cl::opt<bool> UnrollAllowRemainder( + "unroll-allow-remainder", cl::Hidden, + cl::desc("Allow generation of a loop remainder (extra iterations) " + "when unrolling a loop.")); +static cl::opt<bool> + UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, + cl::desc("Unroll loops with run-time trip counts")); + +static cl::opt<unsigned> PragmaUnrollThreshold( + "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, + cl::desc("Unrolled size limit for loops with an unroll(full) or " + "unroll_count pragma.")); /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much @@ -88,26 +105,28 @@ static const unsigned NoThreshold = UINT_MAX; static const unsigned DefaultUnrollRuntimeCount = 8; /// Gather the various unrolling parameters based on the defaults, compiler -/// flags, TTI overrides, pragmas, and user specified parameters. +/// flags, TTI overrides and user specified parameters. 
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, - Optional<bool> UserRuntime, unsigned PragmaCount, bool PragmaFullUnroll, - bool PragmaEnableUnroll, unsigned TripCount) { + Optional<bool> UserRuntime) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults UP.Threshold = 150; - UP.PercentDynamicCostSavedThreshold = 20; - UP.DynamicCostSavingsDiscount = 2000; - UP.OptSizeThreshold = 50; + UP.PercentDynamicCostSavedThreshold = 50; + UP.DynamicCostSavingsDiscount = 100; + UP.OptSizeThreshold = 0; UP.PartialThreshold = UP.Threshold; - UP.PartialOptSizeThreshold = UP.OptSizeThreshold; + UP.PartialOptSizeThreshold = 0; UP.Count = 0; UP.MaxCount = UINT_MAX; + UP.FullUnrollMaxCount = UINT_MAX; UP.Partial = false; UP.Runtime = false; + UP.AllowRemainder = true; UP.AllowExpensiveTripCount = false; + UP.Force = false; // Override with any target specific settings TTI.getUnrollingPreferences(L, UP); @@ -118,12 +137,6 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.PartialThreshold = UP.PartialOptSizeThreshold; } - // Apply unroll count pragmas - if (PragmaCount) - UP.Count = PragmaCount; - else if (PragmaFullUnroll) - UP.Count = TripCount; - // Apply any user values specified by cl::opt if (UnrollThreshold.getNumOccurrences() > 0) { UP.Threshold = UnrollThreshold; @@ -134,10 +147,14 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UnrollPercentDynamicCostSavedThreshold; if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0) UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount; - if (UnrollCount.getNumOccurrences() > 0) - UP.Count = UnrollCount; + if (UnrollMaxCount.getNumOccurrences() > 0) + UP.MaxCount = UnrollMaxCount; + if (UnrollFullMaxCount.getNumOccurrences() > 0) + UP.FullUnrollMaxCount = UnrollFullMaxCount; if (UnrollAllowPartial.getNumOccurrences() > 0) UP.Partial = UnrollAllowPartial; + if (UnrollAllowRemainder.getNumOccurrences() > 0) + UP.AllowRemainder = UnrollAllowRemainder; if (UnrollRuntime.getNumOccurrences() > 0) UP.Runtime = UnrollRuntime; @@ -153,259 +170,42 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( if (UserRuntime.hasValue()) UP.Runtime = *UserRuntime; - if (PragmaCount > 0 || - ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0)) { - // If the loop has an unrolling pragma, we want to be more aggressive with - // unrolling limits. Set thresholds to at least the PragmaTheshold value - // which is larger than the default limits. - if (UP.Threshold != NoThreshold) - UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); - if (UP.PartialThreshold != NoThreshold) - UP.PartialThreshold = - std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); - } - return UP; } namespace { -// This class is used to get an estimate of the optimization effects that we -// could get from complete loop unrolling. It comes from the fact that some -// loads might be replaced with concrete constant values and that could trigger -// a chain of instruction simplifications. -// -// E.g. 
we might have: -// int a[] = {0, 1, 0}; -// v = 0; -// for (i = 0; i < 3; i ++) -// v += b[i]*a[i]; -// If we completely unroll the loop, we would get: -// v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2] -// Which then will be simplified to: -// v = b[0]* 0 + b[1]* 1 + b[2]* 0 -// And finally: -// v = b[1] -class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> { - typedef InstVisitor<UnrolledInstAnalyzer, bool> Base; - friend class InstVisitor<UnrolledInstAnalyzer, bool>; - struct SimplifiedAddress { - Value *Base = nullptr; - ConstantInt *Offset = nullptr; - }; +/// A struct to densely store the state of an instruction after unrolling at +/// each iteration. +/// +/// This is designed to work like a tuple of <Instruction *, int> for the +/// purposes of hashing and lookup, but to be able to associate two boolean +/// states with each key. +struct UnrolledInstState { + Instruction *I; + int Iteration : 30; + unsigned IsFree : 1; + unsigned IsCounted : 1; +}; -public: - UnrolledInstAnalyzer(unsigned Iteration, - DenseMap<Value *, Constant *> &SimplifiedValues, - ScalarEvolution &SE) - : SimplifiedValues(SimplifiedValues), SE(SE) { - IterationNumber = SE.getConstant(APInt(64, Iteration)); +/// Hashing and equality testing for a set of the instruction states. +struct UnrolledInstStateKeyInfo { + typedef DenseMapInfo<Instruction *> PtrInfo; + typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo; + static inline UnrolledInstState getEmptyKey() { + return {PtrInfo::getEmptyKey(), 0, 0, 0}; } - - // Allow access to the initial visit method. - using Base::visit; - -private: - /// \brief A cache of pointer bases and constant-folded offsets corresponding - /// to GEP (or derived from GEP) instructions. - /// - /// In order to find the base pointer one needs to perform non-trivial - /// traversal of the corresponding SCEV expression, so it's good to have the - /// results saved. - DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses; - - /// \brief SCEV expression corresponding to number of currently simulated - /// iteration. - const SCEV *IterationNumber; - - /// \brief A Value->Constant map for keeping values that we managed to - /// constant-fold on the given iteration. - /// - /// While we walk the loop instructions, we build up and maintain a mapping - /// of simplified values specific to this iteration. The idea is to propagate - /// any special information we have about loads that can be replaced with - /// constants after complete unrolling, and account for likely simplifications - /// post-unrolling. - DenseMap<Value *, Constant *> &SimplifiedValues; - - ScalarEvolution &SE; - - /// \brief Try to simplify instruction \param I using its SCEV expression. - /// - /// The idea is that some AddRec expressions become constants, which then - /// could trigger folding of other instructions. However, that only happens - /// for expressions whose start value is also constant, which isn't always the - /// case. In another common and important case the start value is just some - /// address (i.e. SCEVUnknown) - in this case we compute the offset and save - /// it along with the base address instead. 
- bool simplifyInstWithSCEV(Instruction *I) { - if (!SE.isSCEVable(I->getType())) - return false; - - const SCEV *S = SE.getSCEV(I); - if (auto *SC = dyn_cast<SCEVConstant>(S)) { - SimplifiedValues[I] = SC->getValue(); - return true; - } - - auto *AR = dyn_cast<SCEVAddRecExpr>(S); - if (!AR) - return false; - - const SCEV *ValueAtIteration = AR->evaluateAtIteration(IterationNumber, SE); - // Check if the AddRec expression becomes a constant. - if (auto *SC = dyn_cast<SCEVConstant>(ValueAtIteration)) { - SimplifiedValues[I] = SC->getValue(); - return true; - } - - // Check if the offset from the base address becomes a constant. - auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(S)); - if (!Base) - return false; - auto *Offset = - dyn_cast<SCEVConstant>(SE.getMinusSCEV(ValueAtIteration, Base)); - if (!Offset) - return false; - SimplifiedAddress Address; - Address.Base = Base->getValue(); - Address.Offset = Offset->getValue(); - SimplifiedAddresses[I] = Address; - return true; + static inline UnrolledInstState getTombstoneKey() { + return {PtrInfo::getTombstoneKey(), 0, 0, 0}; } - - /// Base case for the instruction visitor. - bool visitInstruction(Instruction &I) { - return simplifyInstWithSCEV(&I); + static inline unsigned getHashValue(const UnrolledInstState &S) { + return PairInfo::getHashValue({S.I, S.Iteration}); } - - /// Try to simplify binary operator I. - /// - /// TODO: Probably it's worth to hoist the code for estimating the - /// simplifications effects to a separate class, since we have a very similar - /// code in InlineCost already. - bool visitBinaryOperator(BinaryOperator &I) { - Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - if (!isa<Constant>(LHS)) - if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) - LHS = SimpleLHS; - if (!isa<Constant>(RHS)) - if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) - RHS = SimpleRHS; - - Value *SimpleV = nullptr; - const DataLayout &DL = I.getModule()->getDataLayout(); - if (auto FI = dyn_cast<FPMathOperator>(&I)) - SimpleV = - SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); - else - SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL); - - if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) - SimplifiedValues[&I] = C; - - if (SimpleV) - return true; - return Base::visitBinaryOperator(I); - } - - /// Try to fold load I. - bool visitLoad(LoadInst &I) { - Value *AddrOp = I.getPointerOperand(); - - auto AddressIt = SimplifiedAddresses.find(AddrOp); - if (AddressIt == SimplifiedAddresses.end()) - return false; - ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset; - - auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base); - // We're only interested in loads that can be completely folded to a - // constant. - if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant()) - return false; - - ConstantDataSequential *CDS = - dyn_cast<ConstantDataSequential>(GV->getInitializer()); - if (!CDS) - return false; - - // We might have a vector load from an array. FIXME: for now we just bail - // out in this case, but we should be able to resolve and simplify such - // loads. 
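The UnrolledInstState and UnrolledInstStateKeyInfo additions in this hunk make a DenseSet behave like a map keyed on (Instruction*, iteration) while packing two per-iteration flags into the same entry. A rough standard-library equivalent, with the bit-fields replaced by a plain value struct (PairHash, InstState and InstCostMap are illustrative names only):

#include <cstddef>
#include <functional>
#include <unordered_map>
#include <utility>

struct PairHash {
  std::size_t operator()(const std::pair<const void *, int> &K) const {
    // Combine the pointer hash and the iteration number, much as the
    // key-info above does via DenseMapInfo<std::pair<Instruction *, int>>.
    return std::hash<const void *>()(K.first) * 31u +
           std::hash<int>()(K.second);
  }
};

struct InstState {
  bool IsFree = false;     // simplified away on this iteration
  bool IsCounted = false;  // already folded into the unrolled cost
};

// Key: (instruction, iteration); value: the two per-iteration flags.
using InstCostMap =
    std::unordered_map<std::pair<const void *, int>, InstState, PairHash>;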
- if(!CDS->isElementTypeCompatible(I.getType())) - return false; - - int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; - assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 && - "Unexpectedly large index value."); - int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize; - if (Index >= CDS->getNumElements()) { - // FIXME: For now we conservatively ignore out of bound accesses, but - // we're allowed to perform the optimization in this case. - return false; - } - - Constant *CV = CDS->getElementAsConstant(Index); - assert(CV && "Constant expected."); - SimplifiedValues[&I] = CV; - - return true; - } - - bool visitCastInst(CastInst &I) { - // Propagate constants through casts. - Constant *COp = dyn_cast<Constant>(I.getOperand(0)); - if (!COp) - COp = SimplifiedValues.lookup(I.getOperand(0)); - if (COp) - if (Constant *C = - ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { - SimplifiedValues[&I] = C; - return true; - } - - return Base::visitCastInst(I); - } - - bool visitCmpInst(CmpInst &I) { - Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - - // First try to handle simplified comparisons. - if (!isa<Constant>(LHS)) - if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) - LHS = SimpleLHS; - if (!isa<Constant>(RHS)) - if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) - RHS = SimpleRHS; - - if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) { - auto SimplifiedLHS = SimplifiedAddresses.find(LHS); - if (SimplifiedLHS != SimplifiedAddresses.end()) { - auto SimplifiedRHS = SimplifiedAddresses.find(RHS); - if (SimplifiedRHS != SimplifiedAddresses.end()) { - SimplifiedAddress &LHSAddr = SimplifiedLHS->second; - SimplifiedAddress &RHSAddr = SimplifiedRHS->second; - if (LHSAddr.Base == RHSAddr.Base) { - LHS = LHSAddr.Offset; - RHS = RHSAddr.Offset; - } - } - } - } - - if (Constant *CLHS = dyn_cast<Constant>(LHS)) { - if (Constant *CRHS = dyn_cast<Constant>(RHS)) { - if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { - SimplifiedValues[&I] = C; - return true; - } - } - } - - return Base::visitCmpInst(I); + static inline bool isEqual(const UnrolledInstState &LHS, + const UnrolledInstState &RHS) { + return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration}); } }; -} // namespace - +} namespace { struct EstimatedUnrollCost { @@ -441,18 +241,25 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) && "The unroll iterations max is too large!"); + // Only analyze inner loops. We can't properly estimate cost of nested loops + // and we won't visit inner loops again anyway. + if (!L->empty()) + return None; + // Don't simulate loops with a big or unknown tripcount if (!UnrollMaxIterationsCountToAnalyze || !TripCount || TripCount > UnrollMaxIterationsCountToAnalyze) return None; SmallSetVector<BasicBlock *, 16> BBWorklist; + SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist; DenseMap<Value *, Constant *> SimplifiedValues; SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues; // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. int UnrolledCost = 0; + // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. 
This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic @@ -460,6 +267,97 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // unrolling. int RolledDynamicCost = 0; + // We track the simplification of each instruction in each iteration. We use + // this to recursively merge costs into the unrolled cost on-demand so that + // we don't count the cost of any dead code. This is essentially a map from + // <instruction, int> to <bool, bool>, but stored as a densely packed struct. + DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap; + + // A small worklist used to accumulate cost of instructions from each + // observable and reached root in the loop. + SmallVector<Instruction *, 16> CostWorklist; + + // PHI-used worklist used between iterations while accumulating cost. + SmallVector<Instruction *, 4> PHIUsedList; + + // Helper function to accumulate cost for instructions in the loop. + auto AddCostRecursively = [&](Instruction &RootI, int Iteration) { + assert(Iteration >= 0 && "Cannot have a negative iteration!"); + assert(CostWorklist.empty() && "Must start with an empty cost list"); + assert(PHIUsedList.empty() && "Must start with an empty phi used list"); + CostWorklist.push_back(&RootI); + for (;; --Iteration) { + do { + Instruction *I = CostWorklist.pop_back_val(); + + // InstCostMap only uses I and Iteration as a key, the other two values + // don't matter here. + auto CostIter = InstCostMap.find({I, Iteration, 0, 0}); + if (CostIter == InstCostMap.end()) + // If an input to a PHI node comes from a dead path through the loop + // we may have no cost data for it here. What that actually means is + // that it is free. + continue; + auto &Cost = *CostIter; + if (Cost.IsCounted) + // Already counted this instruction. + continue; + + // Mark that we are counting the cost of this instruction now. + Cost.IsCounted = true; + + // If this is a PHI node in the loop header, just add it to the PHI set. + if (auto *PhiI = dyn_cast<PHINode>(I)) + if (PhiI->getParent() == L->getHeader()) { + assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " + "inherently simplify during unrolling."); + if (Iteration == 0) + continue; + + // Push the incoming value from the backedge into the PHI used list + // if it is an in-loop instruction. We'll use this to populate the + // cost worklist for the next iteration (as we count backwards). + if (auto *OpI = dyn_cast<Instruction>( + PhiI->getIncomingValueForBlock(L->getLoopLatch()))) + if (L->contains(OpI)) + PHIUsedList.push_back(OpI); + continue; + } + + // First accumulate the cost of this instruction. + if (!Cost.IsFree) { + UnrolledCost += TTI.getUserCost(I); + DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration + << "): "); + DEBUG(I->dump()); + } + + // We must count the cost of every operand which is not free, + // recursively. If we reach a loop PHI node, simply add it to the set + // to be considered on the next iteration (backwards!). + for (Value *Op : I->operands()) { + // Check whether this operand is free due to being a constant or + // outside the loop. + auto *OpI = dyn_cast<Instruction>(Op); + if (!OpI || !L->contains(OpI)) + continue; + + // Otherwise accumulate its cost. + CostWorklist.push_back(OpI); + } + } while (!CostWorklist.empty()); + + if (PHIUsedList.empty()) + // We've exhausted the search. 
+ break; + + assert(Iteration > 0 && + "Cannot track PHI-used values past the first iteration!"); + CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end()); + PHIUsedList.clear(); + } + }; + // Ensure that we don't violate the loop structure invariants relied on by // this analysis. assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); @@ -502,7 +400,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, while (!SimplifiedInputValues.empty()) SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE); + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -514,22 +412,32 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - int InstCost = TTI.getUserCost(&I); + // Track this instruction's expected baseline cost when executing the + // rolled loop form. + RolledDynamicCost += TTI.getUserCost(&I); // Visit the instruction to analyze its loop cost after unrolling, - // and if the visitor returns false, include this instruction in the - // unrolled cost. - if (!Analyzer.visit(I)) - UnrolledCost += InstCost; - else { - DEBUG(dbgs() << " " << I - << " would be simplified if loop is unrolled.\n"); - (void)0; - } + // and if the visitor returns true, mark the instruction as free after + // unrolling and continue. + bool IsFree = Analyzer.visit(I); + bool Inserted = InstCostMap.insert({&I, (int)Iteration, + (unsigned)IsFree, + /*IsCounted*/ false}).second; + (void)Inserted; + assert(Inserted && "Cannot have a state for an unvisited instruction!"); + + if (IsFree) + continue; - // Also track this instructions expected cost when executing the rolled - // loop form. - RolledDynamicCost += InstCost; + // If the instruction might have a side-effect recursively account for + // the cost of it and all the instructions leading up to it. + if (I.mayHaveSideEffects()) + AddCostRecursively(I, Iteration); + + // Can't properly model a cost of a call. + // FIXME: With a proper cost model we should be able to do it. + if(isa<CallInst>(&I)) + return None; // If unrolled body turns out to be too big, bail out. if (UnrolledCost > MaxUnrolledLoopSize) { @@ -545,42 +453,45 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // Add in the live successors by first checking whether we have terminator // that may be simplified based on the values simplified by this call. + BasicBlock *KnownSucc = nullptr; if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { if (BI->isConditional()) { if (Constant *SimpleCond = SimplifiedValues.lookup(BI->getCondition())) { - BasicBlock *Succ = nullptr; // Just take the first successor if condition is undef if (isa<UndefValue>(SimpleCond)) - Succ = BI->getSuccessor(0); - else - Succ = BI->getSuccessor( - cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0); - if (L->contains(Succ)) - BBWorklist.insert(Succ); - continue; + KnownSucc = BI->getSuccessor(0); + else if (ConstantInt *SimpleCondVal = + dyn_cast<ConstantInt>(SimpleCond)) + KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 
1 : 0); } } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { if (Constant *SimpleCond = SimplifiedValues.lookup(SI->getCondition())) { - BasicBlock *Succ = nullptr; // Just take the first successor if condition is undef if (isa<UndefValue>(SimpleCond)) - Succ = SI->getSuccessor(0); - else - Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond)) - .getCaseSuccessor(); - if (L->contains(Succ)) - BBWorklist.insert(Succ); - continue; + KnownSucc = SI->getSuccessor(0); + else if (ConstantInt *SimpleCondVal = + dyn_cast<ConstantInt>(SimpleCond)) + KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor(); } } + if (KnownSucc) { + if (L->contains(KnownSucc)) + BBWorklist.insert(KnownSucc); + else + ExitWorklist.insert({BB, KnownSucc}); + continue; + } // Add BB's successors to the worklist. for (BasicBlock *Succ : successors(BB)) if (L->contains(Succ)) BBWorklist.insert(Succ); + else + ExitWorklist.insert({BB, Succ}); + AddCostRecursively(*TI, Iteration); } // If we found no optimization opportunities on the first iteration, we @@ -591,6 +502,23 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, return None; } } + + while (!ExitWorklist.empty()) { + BasicBlock *ExitingBB, *ExitBB; + std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val(); + + for (Instruction &I : *ExitBB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + break; + + Value *Op = PN->getIncomingValueForBlock(ExitingBB); + if (auto *OpI = dyn_cast<Instruction>(Op)) + if (L->contains(OpI)) + AddCostRecursively(*OpI, TripCount - 1); + } + } + DEBUG(dbgs() << "Analysis finished:\n" << "UnrolledCost: " << UnrolledCost << ", " << "RolledDynamicCost: " << RolledDynamicCost << "\n"); @@ -599,18 +527,18 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, + bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, AssumptionCache *AC) { SmallPtrSet<const Value *, 32> EphValues; CodeMetrics::collectEphemeralValues(L, AC, EphValues); CodeMetrics Metrics; - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) - Metrics.analyzeBasicBlock(*I, TTI, EphValues); + for (BasicBlock *BB : L->blocks()) + Metrics.analyzeBasicBlock(BB, TTI, EphValues); NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; + Convergent = Metrics.convergent; unsigned LoopSize = Metrics.NumInsts; @@ -676,21 +604,22 @@ static unsigned UnrollCountPragmaValue(const Loop *L) { // unrolling pass is run more than once (which it generally is). static void SetLoopAlreadyUnrolled(Loop *L) { MDNode *LoopID = L->getLoopID(); - if (!LoopID) return; - // First remove any existing loop unrolling metadata. SmallVector<Metadata *, 4> MDs; // Reserve first location for self reference to the LoopID metadata node. 
MDs.push_back(nullptr); - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - bool IsUnrollMetadata = false; - MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (MD) { - const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + bool IsUnrollMetadata = false; + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + } + if (!IsUnrollMetadata) + MDs.push_back(LoopID->getOperand(i)); } - if (!IsUnrollMetadata) - MDs.push_back(LoopID->getOperand(i)); } // Add unroll(disable) metadata to disable future unrolling. @@ -737,9 +666,9 @@ static bool canUnrollCompletely(Loop *L, unsigned Threshold, (int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <= (int64_t)Threshold) { DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the " - "expected dynamic cost by " << PercentDynamicCostSaved - << "% (threshold: " << PercentDynamicCostSavedThreshold - << "%)\n" + "expected dynamic cost by " + << PercentDynamicCostSaved << "% (threshold: " + << PercentDynamicCostSavedThreshold << "%)\n" << " and the unrolled cost (" << UnrolledCost << ") is less than the max threshold (" << DynamicCostSavingsDiscount << ").\n"); @@ -758,82 +687,77 @@ static bool canUnrollCompletely(Loop *L, unsigned Threshold, return false; } -static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo &TTI, - AssumptionCache &AC, bool PreserveLCSSA, - Optional<unsigned> ProvidedCount, - Optional<unsigned> ProvidedThreshold, - Optional<bool> ProvidedAllowPartial, - Optional<bool> ProvidedRuntime) { - BasicBlock *Header = L->getHeader(); - DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() - << "] Loop %" << Header->getName() << "\n"); +// Returns true if unroll count was set explicitly. +// Calculates unroll count and writes it to UP.Count. +static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, + DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE, unsigned TripCount, + unsigned TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP) { + // BEInsns represents number of instructions optimized when "back edge" + // becomes "fall through" in unrolled loop. + // For now we count a conditional branch on a backedge and a comparison + // feeding it. + unsigned BEInsns = 2; + // Check for explicit Count. + // 1st priority is unroll count set by "unroll-count" option. + bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; + if (UserUnrollCount) { + UP.Count = UnrollCount; + UP.AllowExpensiveTripCount = true; + UP.Force = true; + if (UP.AllowRemainder && + (LoopSize - BEInsns) * UP.Count + BEInsns < UP.Threshold) + return true; + } - if (HasUnrollDisablePragma(L)) { - return false; + // 2nd priority is unroll count set by pragma. 
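The reworked SetLoopAlreadyUnrolled above now builds the metadata list even when the loop had no !llvm.loop node: it keeps every operand that is not an existing llvm.loop.unroll.* entry and then appends the disable marker. The same filtering, sketched on plain strings (rebuildLoopMD and the "<self>" placeholder are illustrative, standing in for the MDNode self-reference):

#include <string>
#include <vector>

static std::vector<std::string>
rebuildLoopMD(const std::vector<std::string> &Old) {
  std::vector<std::string> MDs;
  MDs.push_back("<self>");                      // slot 0: loop-ID self reference
  for (const std::string &Op : Old)
    if (Op.rfind("llvm.loop.unroll.", 0) != 0)  // drop prior unroll metadata
      MDs.push_back(Op);
  MDs.push_back("llvm.loop.unroll.disable");    // block any further unrolling
  return MDs;
}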
+ unsigned PragmaCount = UnrollCountPragmaValue(L); + if (PragmaCount > 0) { + UP.Count = PragmaCount; + UP.Runtime = true; + UP.AllowExpensiveTripCount = true; + UP.Force = true; + if (UP.AllowRemainder && + (LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold) + return true; } bool PragmaFullUnroll = HasUnrollFullPragma(L); - bool PragmaEnableUnroll = HasUnrollEnablePragma(L); - unsigned PragmaCount = UnrollCountPragmaValue(L); - bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0; - - // Find trip count and trip multiple if count is not available - unsigned TripCount = 0; - unsigned TripMultiple = 1; - // If there are multiple exiting blocks but one of them is the latch, use the - // latch for the trip count estimation. Otherwise insist on a single exiting - // block for the trip count estimation. - BasicBlock *ExitingBlock = L->getLoopLatch(); - if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) - ExitingBlock = L->getExitingBlock(); - if (ExitingBlock) { - TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); - TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); + if (PragmaFullUnroll && TripCount != 0) { + UP.Count = TripCount; + if ((LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold) + return false; } - TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, - ProvidedRuntime, PragmaCount, PragmaFullUnroll, PragmaEnableUnroll, - TripCount); - - unsigned Count = UP.Count; - bool CountSetExplicitly = Count != 0; - // Use a heuristic count if we didn't set anything explicitly. - if (!CountSetExplicitly) - Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount; - if (TripCount && Count > TripCount) - Count = TripCount; + bool PragmaEnableUnroll = HasUnrollEnablePragma(L); + bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || + PragmaEnableUnroll || UserUnrollCount; - unsigned NumInlineCandidates; - bool notDuplicatable; - unsigned LoopSize = - ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC); - DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + uint64_t UnrolledSize; + DebugLoc LoopLoc = L->getStartLoc(); + Function *F = L->getHeader()->getParent(); + LLVMContext &Ctx = F->getContext(); - // When computing the unrolled size, note that the conditional branch on the - // backedge and the comparison feeding it are not replicated like the rest of - // the loop body (which is why 2 is subtracted). - uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2; - if (notDuplicatable) { - DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" - << " instructions.\n"); - return false; - } - if (NumInlineCandidates != 0) { - DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); - return false; + if (ExplicitUnroll && TripCount != 0) { + // If the loop has an unrolling pragma, we want to be more aggressive with + // unrolling limits. Set thresholds to at least the PragmaThreshold value + // which is larger than the default limits. + UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); + UP.PartialThreshold = + std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); } - // Given Count, TripCount and thresholds determine the type of - // unrolling which is to be performed. 
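Throughout computeUnrollCount above, the post-unroll size is estimated as (LoopSize - BEInsns) * Count + BEInsns, because the backedge branch and its compare (BEInsns, currently 2) are not replicated. For the partial-unroll case the count is then shrunk to fit the threshold and to divide the trip count. A standalone sketch of that arithmetic (unrolledSize/partialCount are hypothetical helpers; the real code additionally clamps to UP.MaxCount and falls back to a power-of-two count when no divisor fits and a remainder loop is allowed):

#include <algorithm>
#include <cstdint>

// Assumes LoopSize > BEInsns, since every measured loop contains at least the
// backedge branch and compare being subtracted here.
static uint64_t unrolledSize(unsigned LoopSize, unsigned Count,
                             unsigned BEInsns = 2) {
  return (uint64_t)(LoopSize - BEInsns) * Count + BEInsns;
}

static unsigned partialCount(unsigned LoopSize, unsigned TripCount,
                             unsigned Threshold, unsigned BEInsns = 2) {
  unsigned Count = TripCount;
  if (unrolledSize(LoopSize, Count, BEInsns) > Threshold)
    Count = (std::max(Threshold, 3u) - BEInsns) / (LoopSize - BEInsns);
  while (Count != 0 && TripCount % Count != 0)
    --Count;                   // keep the count a divisor of the trip count
  return Count;
}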
- enum { Full = 0, Partial = 1, Runtime = 2 }; - int Unrolling; - if (TripCount && Count == TripCount) { - Unrolling = Partial; - // If the loop is really small, we don't need to run an expensive analysis. + // 3rd priority is full unroll count. + // Full unroll make sense only when TripCount could be staticaly calculated. + // Also we need to check if we exceed FullUnrollMaxCount. + if (TripCount && TripCount <= UP.FullUnrollMaxCount) { + // When computing the unrolled size, note that BEInsns are not replicated + // like the rest of the loop body. + UnrolledSize = (uint64_t)(LoopSize - BEInsns) * TripCount + BEInsns; if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount, UnrolledSize, UnrolledSize)) { - Unrolling = Full; + UP.Count = TripCount; + return ExplicitUnroll; } else { // The loop isn't that small, but we still can fully unroll it if that // helps to remove a significant number of instructions. @@ -845,99 +769,216 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, UP.PercentDynamicCostSavedThreshold, UP.DynamicCostSavingsDiscount, Cost->UnrolledCost, Cost->RolledDynamicCost)) { - Unrolling = Full; + UP.Count = TripCount; + return ExplicitUnroll; } } - } else if (TripCount && Count < TripCount) { - Unrolling = Partial; - } else { - Unrolling = Runtime; } - // Reduce count based on the type of unrolling and the threshold values. - unsigned OriginalCount = Count; - bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || UP.Runtime; - // Don't unroll a runtime trip count loop with unroll full pragma. - if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { - AllowRuntime = false; - } - if (Unrolling == Partial) { - bool AllowPartial = PragmaEnableUnroll || UP.Partial; - if (!AllowPartial && !CountSetExplicitly) { + // 4rd priority is partial unrolling. + // Try partial unroll only when TripCount could be staticaly calculated. + if (TripCount) { + if (UP.Count == 0) + UP.Count = TripCount; + UP.Partial |= ExplicitUnroll; + if (!UP.Partial) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); + UP.Count = 0; return false; } - if (UP.PartialThreshold != NoThreshold && - UnrolledSize > UP.PartialThreshold) { + if (UP.PartialThreshold != NoThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - Count = (std::max(UP.PartialThreshold, 3u) - 2) / (LoopSize - 2); - while (Count != 0 && TripCount % Count != 0) - Count--; - } - } else if (Unrolling == Runtime) { - if (!AllowRuntime && !CountSetExplicitly) { - DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " - << "-unroll-runtime not given\n"); - return false; - } - // Reduce unroll count to be the largest power-of-two factor of - // the original count which satisfies the threshold limit. - while (Count != 0 && UnrolledSize > UP.PartialThreshold) { - Count >>= 1; - UnrolledSize = (LoopSize-2) * Count + 2; + UnrolledSize = (uint64_t)(LoopSize - BEInsns) * UP.Count + BEInsns; + if (UnrolledSize > UP.PartialThreshold) + UP.Count = (std::max(UP.PartialThreshold, 3u) - BEInsns) / + (LoopSize - BEInsns); + if (UP.Count > UP.MaxCount) + UP.Count = UP.MaxCount; + while (UP.Count != 0 && TripCount % UP.Count != 0) + UP.Count--; + if (UP.AllowRemainder && UP.Count <= 1) { + // If there is no Count that is modulo of TripCount, set Count to + // largest power-of-two factor that satisfies the threshold limit. 
+ // As we'll create fixup loop, do the type of unrolling only if + // remainder loop is allowed. + UP.Count = DefaultUnrollRuntimeCount; + UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) { + UP.Count >>= 1; + UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + } + } + if (UP.Count < 2) { + if (PragmaEnableUnroll) + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop as directed by unroll(enable) pragma " + "because unrolled size is too large."); + UP.Count = 0; + } + } else { + UP.Count = TripCount; } - if (Count > UP.MaxCount) - Count = UP.MaxCount; - DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); - } - - if (HasPragma) { - if (PragmaCount != 0) - // If loop has an unroll count pragma mark loop as unrolled to prevent - // unrolling beyond that requested by the pragma. - SetLoopAlreadyUnrolled(L); - - // Emit optimization remarks if we are unable to unroll the loop - // as directed by a pragma. - DebugLoc LoopLoc = L->getStartLoc(); - Function *F = Header->getParent(); - LLVMContext &Ctx = F->getContext(); - if ((PragmaCount > 0) && Count != OriginalCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to unroll loop the number of times directed by " - "unroll_count pragma because unrolled size is too large."); - } else if (PragmaFullUnroll && !TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); - } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) { + if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && + UP.Count != TripCount) emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to unroll loop as directed by unroll(enable) pragma because " + "Unable to fully unroll loop as directed by unroll pragma because " "unrolled size is too large."); - } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && - Count != TripCount) { + return ExplicitUnroll; + } + assert(TripCount == 0 && + "All cases when TripCount is constant should be covered here."); + if (PragmaFullUnroll) + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(full) pragma " + "because loop has a runtime trip count."); + + // 5th priority is runtime unrolling. + // Don't unroll a runtime trip count loop when it is disabled. + if (HasRuntimeUnrollDisablePragma(L)) { + UP.Count = 0; + return false; + } + // Reduce count based on the type of unrolling and the threshold values. + UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount; + if (!UP.Runtime) { + DEBUG(dbgs() << " will not try to unroll loop with runtime trip count " + << "-unroll-runtime not given\n"); + UP.Count = 0; + return false; + } + if (UP.Count == 0) + UP.Count = DefaultUnrollRuntimeCount; + UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + + // Reduce unroll count to be the largest power-of-two factor of + // the original count which satisfies the threshold limit. 
+ while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) { + UP.Count >>= 1; + UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + } + +#ifndef NDEBUG + unsigned OrigCount = UP.Count; +#endif + + if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) { + while (UP.Count != 0 && TripMultiple % UP.Count != 0) + UP.Count >>= 1; + DEBUG(dbgs() << "Remainder loop is restricted (that could architecture " + "specific or because the loop contains a convergent " + "instruction), so unroll count must divide the trip " + "multiple, " + << TripMultiple << ". Reducing unroll count from " + << OrigCount << " to " << UP.Count << ".\n"); + if (PragmaCount > 0 && !UP.AllowRemainder) emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll pragma because " - "unrolled size is too large."); - } + Twine("Unable to unroll loop the number of times directed by " + "unroll_count pragma because remainder loop is restricted " + "(that could architecture specific or because the loop " + "contains a convergent instruction) and so must have an unroll " + "count that divides the loop trip multiple of ") + + Twine(TripMultiple) + ". Unrolling instead " + Twine(UP.Count) + + " time(s)."); } - if (Unrolling != Full && Count < 2) { - // Partial unrolling by 1 is a nop. For full unrolling, a factor - // of 1 makes sense because loop control can be eliminated. + if (UP.Count > UP.MaxCount) + UP.Count = UP.MaxCount; + DEBUG(dbgs() << " partially unrolling with count: " << UP.Count << "\n"); + if (UP.Count < 2) + UP.Count = 0; + return ExplicitUnroll; +} + +static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE, const TargetTransformInfo &TTI, + AssumptionCache &AC, bool PreserveLCSSA, + Optional<unsigned> ProvidedCount, + Optional<unsigned> ProvidedThreshold, + Optional<bool> ProvidedAllowPartial, + Optional<bool> ProvidedRuntime) { + DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() + << "] Loop %" << L->getHeader()->getName() << "\n"); + if (HasUnrollDisablePragma(L)) { return false; } + unsigned NumInlineCandidates; + bool NotDuplicatable; + bool Convergent; + unsigned LoopSize = ApproximateLoopSize( + L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC); + DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + if (NotDuplicatable) { + DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" + << " instructions.\n"); + return false; + } + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + return false; + } + if (!L->isLoopSimplifyForm()) { + DEBUG( + dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); + return false; + } + + // Find trip count and trip multiple if count is not available + unsigned TripCount = 0; + unsigned TripMultiple = 1; + // If there are multiple exiting blocks but one of them is the latch, use the + // latch for the trip count estimation. Otherwise insist on a single exiting + // block for the trip count estimation. 
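For runtime unrolling (in the hunk above), the count is reduced by halving: first until the estimated unrolled size fits the partial threshold, and, when a remainder loop is not allowed (for example because of a convergent operation), further until it divides the known trip multiple. A simplified standalone version of those two reductions (runtimeCount is a hypothetical helper; the real code also clamps to UP.MaxCount and emits a remark when a pragma-requested count had to be reduced):

#include <cstdint>

static unsigned runtimeCount(unsigned LoopSize, unsigned Threshold,
                             unsigned TripMultiple, bool AllowRemainder,
                             unsigned Count, unsigned BEInsns = 2) {
  // Halve until the unrolled body fits under the partial threshold.
  while (Count != 0 &&
         (uint64_t)(LoopSize - BEInsns) * Count + BEInsns > Threshold)
    Count >>= 1;
  // Without a remainder loop the count must divide the trip multiple.
  if (!AllowRemainder)
    while (Count != 0 && TripMultiple % Count != 0)
      Count >>= 1;
  return Count < 2 ? 0 : Count;  // unrolling by 1 is a no-op
}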
+ BasicBlock *ExitingBlock = L->getLoopLatch(); + if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) + ExitingBlock = L->getExitingBlock(); + if (ExitingBlock) { + TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); + } + + TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( + L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, + ProvidedRuntime); + + // If the loop contains a convergent operation, the prelude we'd add + // to do the first few instructions before we hit the unrolled loop + // is unsafe -- it adds a control-flow dependency to the convergent + // operation. Therefore restrict remainder loop (try unrollig without). + // + // TODO: This is quite conservative. In practice, convergent_op() + // is likely to be called unconditionally in the loop. In this + // case, the program would be ill-formed (on most architectures) + // unless n were the same on all threads in a thread group. + // Assuming n is the same on all threads, any kind of unrolling is + // safe. But currently llvm's notion of convergence isn't powerful + // enough to express this. + if (Convergent) + UP.AllowRemainder = false; + + bool IsCountSetExplicitly = computeUnrollCount(L, TTI, DT, LI, SE, TripCount, + TripMultiple, LoopSize, UP); + if (!UP.Count) + return false; + // Unroll factor (Count) must be less or equal to TripCount. + if (TripCount && UP.Count > TripCount) + UP.Count = TripCount; + // Unroll the loop. - if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount, - TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA)) + if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, + UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC, + PreserveLCSSA)) return false; + // If loop has an unroll count pragma or unrolled by explicitly set count + // mark loop as unrolled to prevent unrolling beyond that requested. + if (IsCountSetExplicitly) + SetLoopAlreadyUnrolled(L); return true; } @@ -948,8 +989,9 @@ public: LoopUnroll(Optional<unsigned> Threshold = None, Optional<unsigned> Count = None, Optional<bool> AllowPartial = None, Optional<bool> Runtime = None) - : LoopPass(ID), ProvidedCount(Count), ProvidedThreshold(Threshold), - ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime) { + : LoopPass(ID), ProvidedCount(std::move(Count)), + ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), + ProvidedRuntime(Runtime) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -959,7 +1001,7 @@ public: Optional<bool> ProvidedRuntime; bool runOnLoop(Loop *L, LPPassManager &) override { - if (skipOptnoneFunction(L)) + if (skipLoop(L)) return false; Function &F = *L->getHeader()->getParent(); @@ -982,35 +1024,19 @@ public: /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. - // If loop unroll does not preserve dom info then LCSSA pass on next - // loop will receive invalid dom info. - // For now, recreate dom info, if loop is unrolled. 
- AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); + // FIXME: Loop passes are required to preserve domtree, and for now we just + // recreate dom info if anything gets unrolled. + getLoopAnalysisUsage(AU); } }; } char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 95d7f8a3beda2..71980e85e8cac 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -55,6 +55,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> #include <map> #include <set> @@ -64,6 +65,7 @@ using namespace llvm; STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); +STATISTIC(NumGuards, "Number of guards unswitched"); STATISTIC(NumSelects , "Number of selects unswitched"); STATISTIC(NumTrivial , "Number of unswitches that are trivial"); STATISTIC(NumSimplify, "Number of simplifications of unswitched code"); @@ -187,6 +189,9 @@ namespace { BasicBlock *loopHeader; BasicBlock *loopPreheader; + bool SanitizeMemory; + LoopSafetyInfo SafetyInfo; + // LoopBlocks contains all of the basic blocks of the loop, including the // preheader of the loop, the body of the loop, and the exit blocks of the // loop, in that order. @@ -211,17 +216,8 @@ namespace { /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); + getLoopAnalysisUsage(AU); } private: @@ -382,11 +378,9 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, char LoopUnswitch::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) @@ -396,7 +390,11 @@ Pass *llvm::createLoopUnswitchPass(bool Os) { /// Cond is a condition that occurs in L. 
If it is invariant in the loop, or has /// an invariant piece, return the invariant. Otherwise, return null. -static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { +static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, + DenseMap<Value *, Value *> &Cache) { + auto CacheIt = Cache.find(Cond); + if (CacheIt != Cache.end()) + return CacheIt->second; // We started analyze new instruction, increment scanned instructions counter. ++TotalInsts; @@ -411,8 +409,10 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { // TODO: Handle: br (VARIANT|INVARIANT). // Hoist simple values out. - if (L->makeLoopInvariant(Cond, Changed)) + if (L->makeLoopInvariant(Cond, Changed)) { + Cache[Cond] = Cond; return Cond; + } if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond)) if (BO->getOpcode() == Instruction::And || @@ -420,17 +420,29 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { // If either the left or right side is invariant, we can unswitch on this, // which will cause the branch to go away in one loop and the condition to // simplify in the other one. - if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed)) + if (Value *LHS = + FindLIVLoopCondition(BO->getOperand(0), L, Changed, Cache)) { + Cache[Cond] = LHS; return LHS; - if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed)) + } + if (Value *RHS = + FindLIVLoopCondition(BO->getOperand(1), L, Changed, Cache)) { + Cache[Cond] = RHS; return RHS; + } } + Cache[Cond] = nullptr; return nullptr; } +static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { + DenseMap<Value *, Value *> Cache; + return FindLIVLoopCondition(Cond, L, Changed, Cache); +} + bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { - if (skipOptnoneFunction(L)) + if (skipLoop(L)) return false; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache( @@ -441,6 +453,10 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { currentLoop = L; Function *F = currentLoop->getHeader()->getParent(); + SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory); + if (SanitizeMemory) + computeLoopSafetyInfo(&SafetyInfo, L); + EnabledPGO = F->getEntryCount().hasValue(); if (LoopUnswitchWithBlockFrequency && EnabledPGO) { @@ -499,17 +515,34 @@ bool LoopUnswitch::processCurrentLoop() { return true; } - // Do not unswitch loops containing convergent operations, as we might be - // making them control dependent on the unswitch value when they were not - // before. - // FIXME: This could be refined to only bail if the convergent operation is - // not already control-dependent on the unswitch value. + // Run through the instructions in the loop, keeping track of three things: + // + // - That we do not unswitch loops containing convergent operations, as we + // might be making them control dependent on the unswitch value when they + // were not before. + // FIXME: This could be refined to only bail if the convergent operation is + // not already control-dependent on the unswitch value. + // + // - That basic blocks in the loop contain invokes whose predecessor edges we + // cannot split. + // + // - The set of guard intrinsics encountered (these are non terminator + // instructions that are also profitable to be unswitched). 
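// Illustrative sketch (editorial aside, not part of this patch): what
// unswitching on a guard with a loop-invariant condition achieves. The
// function "guard" stands in for llvm.experimental.guard; all names are
// hypothetical.
extern void guard(bool Cond); // no-op if Cond is true, deoptimizes otherwise
void beforeUnswitch(int *A, int N, bool Inv) {
  for (int i = 0; i < N; ++i) {
    guard(Inv); // the condition does not change inside the loop
    A[i] = 0;
  }
}
void afterUnswitch(int *A, int N, bool Inv) {
  if (Inv) {
    // In this version the guard condition folds to true, so the guard
    // disappears entirely.
    for (int i = 0; i < N; ++i)
      A[i] = 0;
  } else {
    // In the other version the condition folds to false, so the loop
    // deoptimizes before doing any work.
    guard(false);
  }
}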
+ + SmallVector<IntrinsicInst *, 4> Guards; + for (const auto BB : currentLoop->blocks()) { for (auto &I : *BB) { auto CS = CallSite(&I); if (!CS) continue; if (CS.hasFnAttr(Attribute::Convergent)) return false; + if (auto *II = dyn_cast<InvokeInst>(&I)) + if (!II->getUnwindDest()->canSplitPredecessors()) + return false; + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + if (II->getIntrinsicID() == Intrinsic::experimental_guard) + Guards.push_back(II); } } @@ -529,12 +562,36 @@ bool LoopUnswitch::processCurrentLoop() { return false; } + for (IntrinsicInst *Guard : Guards) { + Value *LoopCond = + FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed); + if (LoopCond && + UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { + // NB! Unswitching (if successful) could have erased some of the + // instructions in Guards leaving dangling pointers there. This is fine + // because we're returning now, and won't look at Guards again. + ++NumGuards; + return true; + } + } + // Loop over all of the basic blocks in the loop. If we find an interior // block that is branching on a loop-invariant condition, we can unswitch this // loop. for (Loop::block_iterator I = currentLoop->block_begin(), E = currentLoop->block_end(); I != E; ++I) { TerminatorInst *TI = (*I)->getTerminator(); + + // Unswitching on a potentially uninitialized predicate is not + // MSan-friendly. Limit this to the cases when the original predicate is + // guaranteed to execute, to avoid creating a use-of-uninitialized-value + // in the code that did not have one. + // This is a workaround for the discrepancy between LLVM IR and MSan + // semantics. See PR28054 for more details. + if (SanitizeMemory && + !isGuaranteedToExecute(*TI, DT, currentLoop, &SafetyInfo)) + continue; + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { // If this isn't branching on an invariant condition, we can't unswitch // it. @@ -628,8 +685,8 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, // Okay, everything after this looks good, check to make sure that this block // doesn't include any side effects. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (I->mayHaveSideEffects()) + for (Instruction &I : *BB) + if (I.mayHaveSideEffects()) return false; return true; @@ -679,8 +736,8 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); // Add all of the subloops to the new loop. - for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) - CloneLoop(*I, &New, VM, LI, LPM); + for (Loop *I : *L) + CloneLoop(I, &New, VM, LI, LPM); return &New; } @@ -1075,10 +1132,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // Rewrite the code to refer to itself. for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) - for (BasicBlock::iterator I = NewBlocks[i]->begin(), - E = NewBlocks[i]->end(); I != E; ++I) - RemapInstruction(&*I, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + for (Instruction &I : *NewBlocks[i]) + RemapInstruction(&I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); // Rewrite the original preheader to select between versions of the loop. 
BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); @@ -1180,9 +1236,8 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Worklist.push_back(UI); } - for (std::vector<Instruction*>::iterator UI = Worklist.begin(), - UE = Worklist.end(); UI != UE; ++UI) - (*UI)->replaceUsesOfWith(LIC, Replacement); + for (Instruction *UI : Worklist) + UI->replaceUsesOfWith(LIC, Replacement); SimplifyCode(Worklist, L); return; diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp new file mode 100644 index 0000000000000..0ccf0af7165b5 --- /dev/null +++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -0,0 +1,571 @@ +//===----------- LoopVersioningLICM.cpp - LICM Loop Versioning ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// When alias analysis is uncertain about the aliasing between any two accesses, +// it will return MayAlias. This uncertainty from alias analysis restricts LICM +// from proceeding further. In cases where alias analysis is uncertain we might +// use loop versioning as an alternative. +// +// Loop Versioning will create a version of the loop with aggressive aliasing +// assumptions in addition to the original with conservative (default) aliasing +// assumptions. The version of the loop making aggressive aliasing assumptions +// will have all the memory accesses marked as no-alias. These two versions of +// loop will be preceded by a memory runtime check. This runtime check consists +// of bound checks for all unique memory accessed in loop, and it ensures the +// lack of memory aliasing. The result of the runtime check determines which of +// the loop versions is executed: If the runtime check detects any memory +// aliasing, then the original loop is executed. Otherwise, the version with +// aggressive aliasing assumptions is used. +// +// Following are the top level steps: +// +// a) Perform LoopVersioningLICM's feasibility check. +// b) If loop is a candidate for versioning then create a memory bound check, +// by considering all the memory accesses in loop body. +// c) Clone original loop and set all memory accesses as no-alias in new loop. +// d) Set original loop & versioned loop as a branch target of the runtime check +// result. 
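// Illustrative sketch (editorial aside, not part of this file): the same
// transformation expressed at the C source level, with the bound check playing
// the role of the runtime memcheck. All names are hypothetical.
//
//   void before(int *A, int *B, int N) {
//     for (int i = 0; i < N; ++i)
//       A[i] = B[0] + i;      // B[0] may alias A[i], so LICM cannot hoist it
//   }
//
//   void after(int *A, int *B, int N) {
//     if (A + N <= B || B + 1 <= A) { // runtime check: ranges do not overlap
//       int T = B[0];                 // versioned loop: accesses are no-alias,
//       for (int i = 0; i < N; ++i)   // so a later LICM can hoist the load
//         A[i] = T + i;               // (shown hoisted here)
//     } else {
//       for (int i = 0; i < N; ++i)   // original loop, conservative aliasing
//         A[i] = B[0] + i;
//     }
//   }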
+// +// It transforms loop as shown below: +// +// +----------------+ +// |Runtime Memcheck| +// +----------------+ +// | +// +----------+----------------+----------+ +// | | +// +---------+----------+ +-----------+----------+ +// |Orig Loop Preheader | |Cloned Loop Preheader | +// +--------------------+ +----------------------+ +// | | +// +--------------------+ +----------------------+ +// |Orig Loop Body | |Cloned Loop Body | +// +--------------------+ +----------------------+ +// | | +// +--------------------+ +----------------------+ +// |Orig Loop Exit Block| |Cloned Loop Exit Block| +// +--------------------+ +-----------+----------+ +// | | +// +----------+--------------+-----------+ +// | +// +-----+----+ +// |Join Block| +// +----------+ +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/PredIteratorCache.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +#define DEBUG_TYPE "loop-versioning-licm" +static const char* LICMVersioningMetaData = + "llvm.loop.licm_versioning.disable"; + +using namespace llvm; + +/// Threshold minimum allowed percentage for possible +/// invariant instructions in a loop. +static cl::opt<float> + LVInvarThreshold("licm-versioning-invariant-threshold", + cl::desc("LoopVersioningLICM's minimum allowed percentage" + "of possible invariant instructions per loop"), + cl::init(25), cl::Hidden); + +/// Threshold for maximum allowed loop nest/depth +static cl::opt<unsigned> LVLoopDepthThreshold( + "licm-versioning-max-depth-threshold", + cl::desc( + "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"), + cl::init(2), cl::Hidden); + +/// \brief Create MDNode for input string. +static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = { + MDString::get(Context, Name), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); +} + +/// \brief Set input string into loop metadata by keeping other values intact. +void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString, + unsigned V) { + SmallVector<Metadata *, 4> MDs(1); + // If the loop already has metadata, retain it. 
+ MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); + MDs.push_back(Node); + } + } + // Add new metadata. + MDs.push_back(createStringMetadata(TheLoop, MDString, V)); + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + TheLoop->setLoopID(NewLoopID); +} + +namespace { +struct LoopVersioningLICM : public LoopPass { + static char ID; + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequiredID(LCSSAID); + AU.addRequired<LoopAccessLegacyAnalysis>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + + LoopVersioningLICM() + : LoopPass(ID), AA(nullptr), SE(nullptr), LI(nullptr), DT(nullptr), + TLI(nullptr), LAA(nullptr), LAI(nullptr), Changed(false), + Preheader(nullptr), CurLoop(nullptr), CurAST(nullptr), + LoopDepthThreshold(LVLoopDepthThreshold), + InvariantThreshold(LVInvarThreshold), LoadAndStoreCounter(0), + InvariantCounter(0), IsReadOnlyLoop(true) { + initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry()); + } + + AliasAnalysis *AA; // Current AliasAnalysis information + ScalarEvolution *SE; // Current ScalarEvolution + LoopInfo *LI; // Current LoopInfo + DominatorTree *DT; // Dominator Tree for the current Loop. + TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. + LoopAccessLegacyAnalysis *LAA; // Current LoopAccessAnalysis + const LoopAccessInfo *LAI; // Current Loop's LoopAccessInfo + + bool Changed; // Set to true when we change anything. + BasicBlock *Preheader; // The preheader block of the current loop. + Loop *CurLoop; // The current loop we are working on. + AliasSetTracker *CurAST; // AliasSet information for the current loop. + ValueToValueMap Strides; + + unsigned LoopDepthThreshold; // Maximum loop nest threshold + float InvariantThreshold; // Minimum invariant threshold + unsigned LoadAndStoreCounter; // Counter to track num of load & store + unsigned InvariantCounter; // Counter to track num of invariant + bool IsReadOnlyLoop; // Read only loop marker. + + bool isLegalForVersioning(); + bool legalLoopStructure(); + bool legalLoopInstructions(); + bool legalLoopMemoryAccesses(); + bool isLoopAlreadyVisited(); + void setNoAliasToLoop(Loop *); + bool instructionSafeForVersioning(Instruction *); + const char *getPassName() const override { return "Loop Versioning"; } +}; +} + +/// \brief Check loop structure and confirms it's good for LoopVersioningLICM. +bool LoopVersioningLICM::legalLoopStructure() { + // Loop must have a preheader, if not return false. + if (!CurLoop->getLoopPreheader()) { + DEBUG(dbgs() << " loop preheader is missing\n"); + return false; + } + // Loop should be innermost loop, if not return false. + if (CurLoop->getSubLoops().size()) { + DEBUG(dbgs() << " loop is not innermost\n"); + return false; + } + // Loop should have a single backedge, if not return false. 
+  if (CurLoop->getNumBackEdges() != 1) {
+    DEBUG(dbgs() << "    loop has multiple backedges\n");
+    return false;
+  }
+  // Loop must have a single exiting block, if not return false.
+  if (!CurLoop->getExitingBlock()) {
+    DEBUG(dbgs() << "    loop has multiple exiting blocks\n");
+    return false;
+  }
+  // We only handle bottom-tested loops, i.e. loops in which the condition is
+  // checked at the end of each iteration. With that we can assume that all
+  // instructions in the loop are executed the same number of times.
+  if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
+    DEBUG(dbgs() << "    loop is not bottom tested\n");
+    return false;
+  }
+  // Parallel loops must not have aliasing loop-invariant memory accesses.
+  // Hence we don't need to version anything in this case.
+  if (CurLoop->isAnnotatedParallel()) {
+    DEBUG(dbgs() << "    Parallel loop is not worth versioning\n");
+    return false;
+  }
+  // Loops nested more deeply than LoopDepthThreshold are not allowed.
+  if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
+    DEBUG(dbgs() << "    loop depth is greater than threshold\n");
+    return false;
+  }
+  // Loop should have dedicated exit blocks, if not return false.
+  if (!CurLoop->hasDedicatedExits()) {
+    DEBUG(dbgs() << "    loop does not have dedicated exit blocks\n");
+    return false;
+  }
+  // We need to be able to compute the loop trip count in order
+  // to generate the bound checks.
+  const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "    loop does not have a computable trip count\n");
+    return false;
+  }
+  return true;
+}
+
+/// \brief Check the memory accesses in the loop and confirm they are suitable
+/// for LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopMemoryAccesses() {
+  bool HasMayAlias = false;
+  bool TypeSafety = false;
+  bool HasMod = false;
+  // Memory check:
+  // The transform phase will generate a versioned loop and also a runtime
+  // check to ensure that the pointers are independent and do not alias.
+  // In the versioned variant of the loop, alias metadata asserts that all
+  // accesses are mutually independent.
+  //
+  // Aliasing pointers in the alias domain are avoided because, with multiple
+  // aliasing domains, we may not be able to hoist a potentially loop-invariant
+  // access out of the loop.
+  //
+  // Iterate over the alias tracker sets and confirm that no AliasSet is a
+  // must-alias set.
+  for (const auto &I : *CurAST) {
+    const AliasSet &AS = I;
+    // Skip forwarding alias sets, as these should be ignored as part of
+    // the AliasSetTracker object.
+    if (AS.isForwardingAliasSet())
+      continue;
+    // With MustAlias it's not worth adding a runtime bound check.
+    if (AS.isMustAlias())
+      return false;
+    Value *SomePtr = AS.begin()->getValue();
+    bool TypeCheck = true;
+    // Check for Mod & MayAlias.
+    HasMayAlias |= AS.isMayAlias();
+    HasMod |= AS.isMod();
+    for (const auto &A : AS) {
+      Value *Ptr = A.getValue();
+      // All pointers in an alias set should have the same data type.
+      TypeCheck = (TypeCheck && (SomePtr->getType() == Ptr->getType()));
+    }
+    // At least one alias set should have pointers of the same data type.
+    TypeSafety |= TypeCheck;
+  }
+  // Ensure at least one alias set has pointers of the same type.
+  if (!TypeSafety) {
+    DEBUG(dbgs() << "    Alias tracker type safety failed!\n");
+    return false;
+  }
+  // Ensure the loop body is not read-only.
+  if (!HasMod) {
+    DEBUG(dbgs() << "    No memory modified in loop body\n");
+    return false;
+  }
+  // Make sure at least one alias set has a may-alias case.
+  // If there is no memory-aliasing ambiguity, versioning is pointless.
+  if (!HasMayAlias) {
+    DEBUG(dbgs() << "    No ambiguity in memory access.\n");
+    return false;
+  }
+  return true;
+}
+
+/// \brief Check whether a loop instruction is safe for loop versioning.
+/// Returns true if it is safe, false otherwise.
+/// The checks are:
+/// 1) All loads and stores in the loop body are non-atomic and non-volatile.
+/// 2) Any function call is safe, i.e. it does not access memory.
+/// 3) The loop body has no instruction that may throw.
+bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
+  assert(I != nullptr && "Null instruction found!");
+  // Check function call safety.
+  if (isa<CallInst>(I) && !AA->doesNotAccessMemory(CallSite(I))) {
+    DEBUG(dbgs() << "    Unsafe call site found.\n");
+    return false;
+  }
+  // Avoid loops that may throw.
+  if (I->mayThrow()) {
+    DEBUG(dbgs() << "    May throw instruction found in loop body\n");
+    return false;
+  }
+  // If the current instruction reads memory, make sure it is a simple load
+  // (non-atomic and non-volatile).
+  if (I->mayReadFromMemory()) {
+    LoadInst *Ld = dyn_cast<LoadInst>(I);
+    if (!Ld || !Ld->isSimple()) {
+      DEBUG(dbgs() << "    Found a non-simple load.\n");
+      return false;
+    }
+    LoadAndStoreCounter++;
+    Value *Ptr = Ld->getPointerOperand();
+    // Check for loop invariance.
+    if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+      InvariantCounter++;
+  }
+  // If the current instruction writes memory, make sure it is a simple store
+  // (non-atomic and non-volatile).
+  else if (I->mayWriteToMemory()) {
+    StoreInst *St = dyn_cast<StoreInst>(I);
+    if (!St || !St->isSimple()) {
+      DEBUG(dbgs() << "    Found a non-simple store.\n");
+      return false;
+    }
+    LoadAndStoreCounter++;
+    Value *Ptr = St->getPointerOperand();
+    // Check for loop invariance.
+    if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+      InvariantCounter++;
+
+    IsReadOnlyLoop = false;
+  }
+  return true;
+}
+
+/// \brief Check the loop's instructions and confirm they are suitable for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopInstructions() {
+  // Reset counters.
+  LoadAndStoreCounter = 0;
+  InvariantCounter = 0;
+  IsReadOnlyLoop = true;
+  // Iterate over the loop's blocks and the instructions of each block and
+  // check instruction safety.
+  for (auto *Block : CurLoop->getBlocks())
+    for (auto &Inst : *Block) {
+      // If an instruction is unsafe just return false.
+      if (!instructionSafeForVersioning(&Inst))
+        return false;
+    }
+  // Get LoopAccessInfo for the current loop.
+  LAI = &LAA->getInfo(CurLoop);
+  // Check LoopAccessInfo for the need of a runtime check.
+  if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
+    DEBUG(dbgs() << "    LAA: Runtime check not found !!\n");
+    return false;
+  }
+  // The number of runtime checks should be less than
+  // RuntimeMemoryCheckThreshold.
+  if (LAI->getNumRuntimePointerChecks() >
+      VectorizerParams::RuntimeMemoryCheckThreshold) {
+    DEBUG(dbgs() << "    LAA: Runtime checks are more than threshold !!\n");
+    return false;
+  }
+  // The loop should have at least one invariant load or store instruction.
+  if (!InvariantCounter) {
+    DEBUG(dbgs() << "    Invariant not found !!\n");
+    return false;
+  }
+  // Read-only loops are not allowed.
+  if (IsReadOnlyLoop) {
+    DEBUG(dbgs() << "    Found a read-only loop!\n");
+    return false;
+  }
+  // Profitability check:
+  // The invariant ratio must meet the threshold.
+  if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
+    DEBUG(dbgs()
+          << "    Invariant loads & stores are below the defined threshold\n");
+    DEBUG(dbgs() << "    Invariant loads & stores: "
+                 << ((InvariantCounter * 100) / LoadAndStoreCounter) << "%\n");
+    DEBUG(dbgs() << "    Invariant loads & stores threshold: "
+                 << InvariantThreshold << "%\n");
+    return false;
+  }
+  return true;
+}
+
+/// \brief Check whether the loop has already been visited.
+/// Checks the loop metadata; returns true if the loop has been visited
+/// before, false otherwise.
+bool LoopVersioningLICM::isLoopAlreadyVisited() {
+  // Check for LoopVersioningLICM metadata on the loop.
+  if (findStringMetadataForLoop(CurLoop, LICMVersioningMetaData)) {
+    return true;
+  }
+  return false;
+}
+
+/// \brief Check legality for LoopVersioningLICM by considering:
+/// a) loop structure legality, b) loop instruction legality,
+/// c) loop memory access legality.
+/// Returns true if legal, false otherwise.
+bool LoopVersioningLICM::isLegalForVersioning() {
+  DEBUG(dbgs() << "Loop: " << *CurLoop);
+  // Make sure we are not revisiting the same loop.
+  if (isLoopAlreadyVisited()) {
+    DEBUG(
+        dbgs() << "    Revisiting loop in LoopVersioningLICM not allowed.\n\n");
+    return false;
+  }
+  // Check loop structure legality.
+  if (!legalLoopStructure()) {
+    DEBUG(
+        dbgs() << "    Loop structure not suitable for LoopVersioningLICM\n\n");
+    return false;
+  }
+  // Check loop instruction legality.
+  if (!legalLoopInstructions()) {
+    DEBUG(dbgs()
+          << "    Loop instructions not suitable for LoopVersioningLICM\n\n");
+    return false;
+  }
+  // Check loop memory access legality.
+  if (!legalLoopMemoryAccesses()) {
+    DEBUG(dbgs()
+          << "    Loop memory access not suitable for LoopVersioningLICM\n\n");
+    return false;
+  }
+  // Loop versioning is feasible, return true.
+  DEBUG(dbgs() << "    Loop Versioning found to be beneficial\n\n");
+  return true;
+}
+
+/// \brief Update the loop with aggressive aliasing assumptions.
+/// It marks the loop's memory operations as no-alias with respect to each
+/// other, relying on the legality checks above having already rejected loops
+/// with must-alias memory accesses.
+void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
+  // Get the latch terminator instruction.
+  Instruction *I = VerLoop->getLoopLatch()->getTerminator();
+  // Create an alias scope domain.
+  MDBuilder MDB(I->getContext());
+  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
+  StringRef Name = "LVAliasScope";
+  SmallVector<Metadata *, 4> Scopes, NoAliases;
+  MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+  // Iterate over each instruction of the loop and set no-alias metadata on
+  // all load & store instructions.
+  for (auto *Block : CurLoop->getBlocks()) {
+    for (auto &Inst : *Block) {
+      // Only interested in instructions that may modify or read memory.
+      if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
+        continue;
+      Scopes.push_back(NewScope);
+      NoAliases.push_back(NewScope);
+      // Set no-alias metadata for the current instruction.
+      Inst.setMetadata(
+          LLVMContext::MD_noalias,
+          MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias),
+                              MDNode::get(Inst.getContext(), NoAliases)));
+      // Set alias-scope metadata for the current instruction.
+      Inst.setMetadata(
+          LLVMContext::MD_alias_scope,
+          MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope),
+                              MDNode::get(Inst.getContext(), Scopes)));
+    }
+  }
+}
+
+bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipLoop(L))
+    return false;
+  Changed = false;
+  // Get analysis information.
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+  LAI = nullptr;
+  // Set the current loop.
+  CurLoop = L;
+  // Get the preheader block.
+  Preheader = L->getLoopPreheader();
+  // Initial allocation.
+  CurAST = new AliasSetTracker(*AA);
+
+  // Loop over the body of this loop, constructing the alias set tracker.
+  for (auto *Block : L->getBlocks()) {
+    if (LI->getLoopFor(Block) == L) // Ignore blocks in subloops.
+      CurAST->add(*Block);          // Incorporate the specified basic block.
+  }
+  // Check the feasibility of LoopVersioningLICM.
+  // If versioning is found to be feasible and beneficial then proceed,
+  // else simply return after cleaning up memory.
+  if (isLegalForVersioning()) {
+    // Do loop versioning:
+    // create a memcheck for the memory accessed inside the loop,
+    // then clone the original loop and set up its blocks properly.
+    LoopVersioning LVer(*LAI, CurLoop, LI, DT, SE, true);
+    LVer.versionLoop();
+    // Set LoopVersioningLICM metadata on the original loop.
+    addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
+    // Set LoopVersioningLICM metadata on the versioned loop.
+    addStringMetadataToLoop(LVer.getVersionedLoop(), LICMVersioningMetaData);
+    // Set "llvm.mem.parallel_loop_access" metadata on the versioned loop.
+    addStringMetadataToLoop(LVer.getVersionedLoop(),
+                            "llvm.mem.parallel_loop_access");
+    // Update the versioned loop with aggressive aliasing assumptions.
+    setNoAliasToLoop(LVer.getVersionedLoop());
+    Changed = true;
+  }
+  // Delete allocated memory.
+ delete CurAST; + return Changed; +} + +char LoopVersioningLICM::ID = 0; +INITIALIZE_PASS_BEGIN(LoopVersioningLICM, "loop-versioning-licm", + "Loop Versioning For LICM", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(LoopVersioningLICM, "loop-versioning-licm", + "Loop Versioning For LICM", false, false) + +Pass *llvm::createLoopVersioningLICMPass() { return new LoopVersioningLICM(); } diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 41511bcb7b048..08e60b16bedff 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -12,11 +12,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LowerAtomic.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "loweratomic" @@ -100,49 +101,74 @@ static bool LowerFenceInst(FenceInst *FI) { } static bool LowerLoadInst(LoadInst *LI) { - LI->setAtomic(NotAtomic); + LI->setAtomic(AtomicOrdering::NotAtomic); return true; } static bool LowerStoreInst(StoreInst *SI) { - SI->setAtomic(NotAtomic); + SI->setAtomic(AtomicOrdering::NotAtomic); return true; } -namespace { - struct LowerAtomic : public BasicBlockPass { - static char ID; - LowerAtomic() : BasicBlockPass(ID) { - initializeLowerAtomicPass(*PassRegistry::getPassRegistry()); - } - bool runOnBasicBlock(BasicBlock &BB) override { - if (skipOptnoneFunction(BB)) - return false; - bool Changed = false; - for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { - Instruction *Inst = &*DI++; - if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) - Changed |= LowerFenceInst(FI); - else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) - Changed |= LowerAtomicCmpXchgInst(CXI); - else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst)) - Changed |= LowerAtomicRMWInst(RMWI); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - if (LI->isAtomic()) - LowerLoadInst(LI); - } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (SI->isAtomic()) - LowerStoreInst(SI); - } - } - return Changed; +static bool runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE;) { + Instruction *Inst = &*DI++; + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + Changed |= LowerFenceInst(FI); + else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) + Changed |= LowerAtomicCmpXchgInst(CXI); + else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst)) + Changed |= LowerAtomicRMWInst(RMWI); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (LI->isAtomic()) + LowerLoadInst(LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->isAtomic()) + LowerStoreInst(SI); } + } + return Changed; +} + +static bool lowerAtomics(Function &F) { + bool Changed = false; + for (BasicBlock &BB : F) { + Changed |= runOnBasicBlock(BB); + } + 
return Changed; +} + +PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) { + if (lowerAtomics(F)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { +class LowerAtomicLegacyPass : public FunctionPass { +public: + static char ID; + + LowerAtomicLegacyPass() : FunctionPass(ID) { + initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + FunctionAnalysisManager DummyFAM; + auto PA = Impl.run(F, DummyFAM); + return !PA.areAllPreserved(); + } + +private: + LowerAtomicPass Impl; }; } -char LowerAtomic::ID = 0; -INITIALIZE_PASS(LowerAtomic, "loweratomic", - "Lower atomic intrinsics to non-atomic form", - false, false) +char LowerAtomicLegacyPass::ID = 0; +INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic", + "Lower atomic intrinsics to non-atomic form", false, false) -Pass *llvm::createLowerAtomicPass() { return new LowerAtomic(); } +Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); } diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 2ace902a7a1b8..79f0db1163a4c 100644 --- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -34,12 +34,24 @@ using namespace llvm; STATISTIC(ExpectIntrinsicsHandled, "Number of 'expect' intrinsic instructions handled"); -static cl::opt<uint32_t> -LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64), - cl::desc("Weight of the branch likely to be taken (default = 64)")); -static cl::opt<uint32_t> -UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4), - cl::desc("Weight of the branch unlikely to be taken (default = 4)")); +// These default values are chosen to represent an extremely skewed outcome for +// a condition, but they leave some room for interpretation by later passes. +// +// If the documentation for __builtin_expect() was made explicit that it should +// only be used in extreme cases, we could make this ratio higher. As it stands, +// programmers may be using __builtin_expect() / llvm.expect to annotate that a +// branch is likely or unlikely to be taken. +// +// There is a known dependency on this ratio in CodeGenPrepare when transforming +// 'select' instructions. It may be worthwhile to hoist these values to some +// shared space, so they can be used directly by other passes. 
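// Illustrative sketch (editorial aside, not part of this patch): how these
// weights surface in practice. With the new defaults, the successor that
// matches the __builtin_expect hint receives a weight of 2000 and the other
// successor a weight of 1 (previously 64 and 4). Names are hypothetical.
int clampNegativeToZero(int X) {
  if (__builtin_expect(X < 0, 0)) // annotated as unlikely (Clang/GCC builtin)
    X = 0;                        // this arm gets the "unlikely" weight
  return X;                       // the fall-through arm gets the "likely" weight
}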
+ +static cl::opt<uint32_t> LikelyBranchWeight( + "likely-branch-weight", cl::Hidden, cl::init(2000), + cl::desc("Weight of the branch likely to be taken (default = 2000)")); +static cl::opt<uint32_t> UnlikelyBranchWeight( + "unlikely-branch-weight", cl::Hidden, cl::init(1), + cl::desc("Weight of the branch unlikely to be taken (default = 1)")); static bool handleSwitchExpect(SwitchInst &SI) { CallInst *CI = dyn_cast<CallInst>(SI.getCondition()); @@ -158,7 +170,8 @@ static bool lowerExpectIntrinsic(Function &F) { return Changed; } -PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) { +PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F, + FunctionAnalysisManager &) { if (lowerExpectIntrinsic(F)) return PreservedAnalyses::none(); diff --git a/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp new file mode 100644 index 0000000000000..57491007d0141 --- /dev/null +++ b/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -0,0 +1,123 @@ +//===- LowerGuardIntrinsic.cpp - Lower the guard intrinsic ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers the llvm.experimental.guard intrinsic to a conditional call +// to @llvm.experimental.deoptimize. Once this happens, the guard can no longer +// be widened. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +static cl::opt<uint32_t> PredicatePassBranchWeight( + "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20), + cl::desc("The probability of a guard failing is assumed to be the " + "reciprocal of this value (default = 1 << 20)")); + +namespace { +struct LowerGuardIntrinsic : public FunctionPass { + static char ID; + LowerGuardIntrinsic() : FunctionPass(ID) { + initializeLowerGuardIntrinsicPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; +} + +static void MakeGuardControlFlowExplicit(Function *DeoptIntrinsic, + CallInst *CI) { + OperandBundleDef DeoptOB(*CI->getOperandBundle(LLVMContext::OB_deopt)); + SmallVector<Value *, 4> Args(std::next(CI->arg_begin()), CI->arg_end()); + + auto *CheckBB = CI->getParent(); + auto *DeoptBlockTerm = + SplitBlockAndInsertIfThen(CI->getArgOperand(0), CI, true); + + auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); + + // SplitBlockAndInsertIfThen inserts control flow that branches to + // DeoptBlockTerm if the condition is true. We want the opposite. 
+ CheckBI->swapSuccessors(); + + CheckBI->getSuccessor(0)->setName("guarded"); + CheckBI->getSuccessor(1)->setName("deopt"); + + if (auto *MD = CI->getMetadata(LLVMContext::MD_make_implicit)) + CheckBI->setMetadata(LLVMContext::MD_make_implicit, MD); + + MDBuilder MDB(CI->getContext()); + CheckBI->setMetadata(LLVMContext::MD_prof, + MDB.createBranchWeights(PredicatePassBranchWeight, 1)); + + IRBuilder<> B(DeoptBlockTerm); + auto *DeoptCall = B.CreateCall(DeoptIntrinsic, Args, {DeoptOB}, ""); + + if (DeoptIntrinsic->getReturnType()->isVoidTy()) { + B.CreateRetVoid(); + } else { + DeoptCall->setName("deoptcall"); + B.CreateRet(DeoptCall); + } + + DeoptCall->setCallingConv(CI->getCallingConv()); + DeoptBlockTerm->eraseFromParent(); +} + +bool LowerGuardIntrinsic::runOnFunction(Function &F) { + // Check if we can cheaply rule out the possibility of not having any work to + // do. + auto *GuardDecl = F.getParent()->getFunction( + Intrinsic::getName(Intrinsic::experimental_guard)); + if (!GuardDecl || GuardDecl->use_empty()) + return false; + + SmallVector<CallInst *, 8> ToLower; + for (auto &I : instructions(F)) + if (auto *CI = dyn_cast<CallInst>(&I)) + if (auto *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::experimental_guard) + ToLower.push_back(CI); + + if (ToLower.empty()) + return false; + + auto *DeoptIntrinsic = Intrinsic::getDeclaration( + F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()}); + DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); + + for (auto *CI : ToLower) { + MakeGuardControlFlowExplicit(DeoptIntrinsic, CI); + CI->eraseFromParent(); + } + + return true; +} + +char LowerGuardIntrinsic::ID = 0; +INITIALIZE_PASS(LowerGuardIntrinsic, "lower-guard-intrinsic", + "Lower the guard intrinsic to normal control flow", false, + false) + +Pass *llvm::createLowerGuardIntrinsicPass() { + return new LowerGuardIntrinsic(); +} diff --git a/lib/Transforms/Scalar/Makefile b/lib/Transforms/Scalar/Makefile deleted file mode 100644 index cc42fd00ac7da..0000000000000 --- a/lib/Transforms/Scalar/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Transforms/Scalar/Makefile ----------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. 
-LIBRARYNAME = LLVMScalarOpts -BUILD_ARCHIVE = 1 - -include $(LEVEL)/Makefile.common - diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 6b43b0f7a2ad8..d64c658f84369 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -12,22 +12,16 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/MemoryDependenceAnalysis.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" @@ -184,7 +178,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // size. If so, check to see whether we will end up actually reducing the // number of stores used. unsigned Bytes = unsigned(End-Start); - unsigned MaxIntSize = DL.getLargestLegalIntTypeSize(); + unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8; if (MaxIntSize == 0) MaxIntSize = 1; unsigned NumPointerStores = Bytes / MaxIntSize; @@ -301,19 +295,16 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, } //===----------------------------------------------------------------------===// -// MemCpyOpt Pass +// MemCpyOptLegacyPass Pass //===----------------------------------------------------------------------===// namespace { - class MemCpyOpt : public FunctionPass { - MemoryDependenceAnalysis *MD; - TargetLibraryInfo *TLI; + class MemCpyOptLegacyPass : public FunctionPass { + MemCpyOptPass Impl; public: static char ID; // Pass identification, replacement for typeid - MemCpyOpt() : FunctionPass(ID) { - initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); - MD = nullptr; - TLI = nullptr; + MemCpyOptLegacyPass() : FunctionPass(ID) { + initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -324,11 +315,11 @@ namespace { AU.setPreservesCFG(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<MemoryDependenceAnalysis>(); + AU.addRequired<MemoryDependenceWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<MemoryDependenceAnalysis>(); + AU.addPreserved<MemoryDependenceWrapperPass>(); } // Helper functions @@ -348,29 +339,30 @@ namespace { bool iterateOnFunction(Function &F); }; - char MemCpyOpt::ID = 0; + char MemCpyOptLegacyPass::ID = 0; } /// The public interface to this file... 
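// Illustrative sketch (editorial aside, not part of this patch): the kind of
// neighboring stores that the memset-forming logic below turns into a single
// call. Names are hypothetical.
#include <string.h>
void beforeMerge(char *P) {
  // Eight adjacent byte stores of the same value...
  P[0] = 0; P[1] = 0; P[2] = 0; P[3] = 0;
  P[4] = 0; P[5] = 0; P[6] = 0; P[7] = 0;
}
void afterMerge(char *P) {
  // ...become one memset covering the whole range.
  memset(P, 0, 8);
}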
-FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } +FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); } -INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", +INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", +INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization", false, false) /// When scanning forward over instructions, we look for some other patterns to /// fold away. In particular, this looks for stores to neighboring locations of /// memory. If it sees enough consecutive ones, it attempts to merge them /// together into a memcpy/memset. -Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, - Value *StartPtr, Value *ByteVal) { +Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, + Value *StartPtr, + Value *ByteVal) { const DataLayout &DL = StartInst->getModule()->getDataLayout(); // Okay, so we now have a single store that can be splatable. Scan to find @@ -493,7 +485,93 @@ static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, return std::min(StoreAlign, LoadAlign); } -bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { +// This method try to lift a store instruction before position P. +// It will lift the store and its argument + that anything that +// may alias with these. +// The method returns true if it was successful. +static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P) { + // If the store alias this position, early bail out. + MemoryLocation StoreLoc = MemoryLocation::get(SI); + if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef) + return false; + + // Keep track of the arguments of all instruction we plan to lift + // so we can make sure to lift them as well if apropriate. + DenseSet<Instruction*> Args; + if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) + if (Ptr->getParent() == SI->getParent()) + Args.insert(Ptr); + + // Instruction to lift before P. + SmallVector<Instruction*, 8> ToLift; + + // Memory locations of lifted instructions. + SmallVector<MemoryLocation, 8> MemLocs; + MemLocs.push_back(StoreLoc); + + // Lifted callsites. + SmallVector<ImmutableCallSite, 8> CallSites; + + for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) { + auto *C = &*I; + + bool MayAlias = AA.getModRefInfo(C) != MRI_NoModRef; + + bool NeedLift = false; + if (Args.erase(C)) + NeedLift = true; + else if (MayAlias) { + NeedLift = std::any_of(MemLocs.begin(), MemLocs.end(), + [C, &AA](const MemoryLocation &ML) { + return AA.getModRefInfo(C, ML); + }); + + if (!NeedLift) + NeedLift = std::any_of(CallSites.begin(), CallSites.end(), + [C, &AA](const ImmutableCallSite &CS) { + return AA.getModRefInfo(C, CS); + }); + } + + if (!NeedLift) + continue; + + if (MayAlias) { + if (auto CS = ImmutableCallSite(C)) { + // If we can't lift this before P, it's game over. 
+ if (AA.getModRefInfo(P, CS) != MRI_NoModRef) + return false; + + CallSites.push_back(CS); + } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) { + // If we can't lift this before P, it's game over. + auto ML = MemoryLocation::get(C); + if (AA.getModRefInfo(P, ML) != MRI_NoModRef) + return false; + + MemLocs.push_back(ML); + } else + // We don't know how to lift this instruction. + return false; + } + + ToLift.push_back(C); + for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k) + if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) + if (A->getParent() == SI->getParent()) + Args.insert(A); + } + + // We made it, we need to lift + for (auto *I : reverse(ToLift)) { + DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n"); + I->moveBefore(P); + } + + return true; +} + +bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; // Avoid merging nontemporal stores since the resulting @@ -514,7 +592,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { auto *T = LI->getType(); if (T->isAggregateType()) { - AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + AliasAnalysis &AA = LookupAliasAnalysis(); MemoryLocation LoadLoc = MemoryLocation::get(LI); // We use alias analysis to check if an instruction may store to @@ -522,26 +600,20 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // such an instruction is found, we try to promote there instead // of at the store position. Instruction *P = SI; - for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator(); - I != E; ++I) { - if (!(AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod)) - continue; - - // We found an instruction that may write to the loaded memory. - // We can try to promote at this position instead of the store - // position if nothing alias the store memory after this and the store - // destination is not in the range. - P = &*I; - for (; I != E; ++I) { - MemoryLocation StoreLoc = MemoryLocation::get(SI); - if (&*I == SI->getOperand(1) || - AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { - P = nullptr; - break; - } + for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) { + if (AA.getModRefInfo(&I, LoadLoc) & MRI_Mod) { + P = &I; + break; } + } - break; + // We found an instruction that may write to the loaded memory. + // We can try to promote at this position instead of the store + // position if nothing alias the store memory after this and the store + // destination is not in the range. + if (P && P != SI) { + if (!moveUp(AA, SI, P)) + P = nullptr; } // If a valid insertion position is found, then we can promote @@ -594,7 +666,9 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. - AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + Value *CpyDest = SI->getPointerOperand()->stripPointerCasts(); + bool CpyDestIsLocal = isa<AllocaInst>(CpyDest); + AliasAnalysis &AA = LookupAliasAnalysis(); MemoryLocation StoreLoc = MemoryLocation::get(SI); for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); I != E; --I) { @@ -602,6 +676,12 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { C = nullptr; break; } + // The store to dest may never happen if an exception can be thrown + // between the load and the store. 
+ if (I->mayThrow() && !CpyDestIsLocal) { + C = nullptr; + break; + } } } @@ -665,7 +745,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { return false; } -bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { +bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { // See if there is another memset or store neighboring this memset which // allows us to widen out the memset to do a single larger store. if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) @@ -681,10 +761,9 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { /// Takes a memcpy and a call that it depends on, /// and checks for the possibility of a call slot optimization by having /// the call write its result directly into the destination of the memcpy. -bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, - Value *cpyDest, Value *cpySrc, - uint64_t cpyLen, unsigned cpyAlign, - CallInst *C) { +bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, + Value *cpySrc, uint64_t cpyLen, + unsigned cpyAlign, CallInst *C) { // The general transformation to keep in mind is // // call @func(..., src, ...) @@ -699,6 +778,11 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // src only holds uninitialized values at the moment of the call, meaning that // the memcpy can be discarded rather than moved. + // Lifetime marks shouldn't be operated on. + if (Function *F = C->getCalledFunction()) + if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start) + return false; + // Deliberately get the source and destination with bitcasts stripped away, // because we'll need to do type comparisons based on the underlying type. CallSite CS(C); @@ -734,6 +818,10 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (destSize < srcSize) return false; } else if (Argument *A = dyn_cast<Argument>(cpyDest)) { + // The store to dest may never happen if the call can throw. + if (C->mayThrow()) + return false; + if (A->getDereferenceableBytes() < srcSize) { // If the destination is an sret parameter then only accesses that are // outside of the returned struct type can trap. @@ -805,7 +893,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Since we're changing the parameter to the callsite, we need to make sure // that what would be the new parameter dominates the callsite. - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + DominatorTree &DT = LookupDomTree(); if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest)) if (!DT.dominates(cpyDestInst, C)) return false; @@ -814,7 +902,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // unexpected manner, for example via a global, which we deduce from // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. - AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + AliasAnalysis &AA = LookupAliasAnalysis(); ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize); // If necessary, perform additional analysis. if (MR != MRI_NoModRef) @@ -867,7 +955,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, /// We've found that the (upward scanning) memory dependence of memcpy 'M' is /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can. 
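// Illustrative sketch (editorial aside, not part of this patch): the
// memcpy-forwarding pattern handled here, shown with libc calls. The rewrite
// is only valid when the intermediate buffer is not modified between the two
// copies and the second copy is no larger than the first. Names are
// hypothetical.
#include <string.h>
void beforeForwarding(char *Dst, char *Tmp, const char *Src, size_t N) {
  memcpy(Tmp, Src, N); // MDep
  memcpy(Dst, Tmp, N); // M: copies out of MDep's destination
}
void afterForwarding(char *Dst, char *Tmp, const char *Src, size_t N) {
  memcpy(Tmp, Src, N); // may now be dead and get removed by later passes
  memcpy(Dst, Src, N); // M reads directly from MDep's source
}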
-bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { +bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, + MemCpyInst *MDep) { // We can only transforms memcpy's where the dest of one is the source of the // other. if (M->getSource() != MDep->getDest() || MDep->isVolatile()) @@ -888,7 +977,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + AliasAnalysis &AA = LookupAliasAnalysis(); // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: @@ -954,8 +1043,8 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { /// memcpy(dst, src, src_size); /// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size); /// \endcode -bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy, - MemSetInst *MemSet) { +bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, + MemSetInst *MemSet) { // We can only transform memset/memcpy with the same destination. if (MemSet->getDest() != MemCpy->getDest()) return false; @@ -1019,8 +1108,8 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy, /// When dst2_size <= dst1_size. /// /// The \p MemCpy must have a Constant length. -bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, - MemSetInst *MemSet) { +bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, + MemSetInst *MemSet) { // This only makes sense on memcpy(..., memset(...), ...). if (MemSet->getRawDest() != MemCpy->getRawSource()) return false; @@ -1043,7 +1132,7 @@ bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy /// altogether. -bool MemCpyOpt::processMemCpy(MemCpyInst *M) { +bool MemCpyOptPass::processMemCpy(MemCpyInst *M) { // We can only optimize non-volatile memcpy's. if (M->isVolatile()) return false; @@ -1141,8 +1230,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed /// not to alias. -bool MemCpyOpt::processMemMove(MemMoveInst *M) { - AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); +bool MemCpyOptPass::processMemMove(MemMoveInst *M) { + AliasAnalysis &AA = LookupAliasAnalysis(); if (!TLI->has(LibFunc::memmove)) return false; @@ -1152,7 +1241,8 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { MemoryLocation::getForSource(M))) return false; - DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); + DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M + << "\n"); // If not, then we know we can transform this. Type *ArgTys[3] = { M->getRawDest()->getType(), @@ -1170,7 +1260,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { } /// This is called on every byval argument in call sites. -bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { +bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) { const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); // Find out what feeds this byval argument. 
Value *ByValArg = CS.getArgument(ArgNo); @@ -1202,10 +1292,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. - AssumptionCache &AC = - getAnalysis<AssumptionCacheTracker>().getAssumptionCache( - *CS->getParent()->getParent()); - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + AssumptionCache &AC = LookupAssumptionCache(); + DominatorTree &DT = LookupDomTree(); if (MDep->getAlignment() < ByValAlign && getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, CS.getInstruction(), &AC, &DT) < ByValAlign) @@ -1231,7 +1319,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction()); - DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" + DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n" << " " << *MDep << "\n" << " " << *CS.getInstruction() << "\n"); @@ -1241,13 +1329,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { return true; } -/// Executes one iteration of MemCpyOpt. -bool MemCpyOpt::iterateOnFunction(Function &F) { +/// Executes one iteration of MemCpyOptPass. +bool MemCpyOptPass::iterateOnFunction(Function &F) { bool MadeChange = false; // Walk all instruction in the function. - for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + for (BasicBlock &BB : F) { + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { // Avoid invalidating the iterator. Instruction *I = &*BI++; @@ -1269,7 +1357,8 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { // Reprocess the instruction if desired. if (RepeatInstruction) { - if (BI != BB->begin()) --BI; + if (BI != BB.begin()) + --BI; MadeChange = true; } } @@ -1278,14 +1367,42 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { return MadeChange; } -/// This is the main transformation entry point for a function. 
-bool MemCpyOpt::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; +PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) { + + auto &MD = AM.getResult<MemoryDependenceAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + + auto LookupAliasAnalysis = [&]() -> AliasAnalysis & { + return AM.getResult<AAManager>(F); + }; + auto LookupAssumptionCache = [&]() -> AssumptionCache & { + return AM.getResult<AssumptionAnalysis>(F); + }; + auto LookupDomTree = [&]() -> DominatorTree & { + return AM.getResult<DominatorTreeAnalysis>(F); + }; + + bool MadeChange = runImpl(F, &MD, &TLI, LookupAliasAnalysis, + LookupAssumptionCache, LookupDomTree); + if (!MadeChange) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + PA.preserve<MemoryDependenceAnalysis>(); + return PA; +} +bool MemCpyOptPass::runImpl( + Function &F, MemoryDependenceResults *MD_, TargetLibraryInfo *TLI_, + std::function<AliasAnalysis &()> LookupAliasAnalysis_, + std::function<AssumptionCache &()> LookupAssumptionCache_, + std::function<DominatorTree &()> LookupDomTree_) { bool MadeChange = false; - MD = &getAnalysis<MemoryDependenceAnalysis>(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + MD = MD_; + TLI = TLI_; + LookupAliasAnalysis = std::move(LookupAliasAnalysis_); + LookupAssumptionCache = std::move(LookupAssumptionCache_); + LookupDomTree = std::move(LookupDomTree_); // If we don't have at least memset and memcpy, there is little point of doing // anything here. These are required by a freestanding implementation, so if @@ -1302,3 +1419,25 @@ bool MemCpyOpt::runOnFunction(Function &F) { MD = nullptr; return MadeChange; } + +/// This is the main transformation entry point for a function. 
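The std::function lookups above let runImpl stay agnostic about whether the legacy wrapper passes or the new-PM analysis manager supply AA, the assumption cache, and the dominator tree, and they also keep those lookups lazy. A standalone illustration of the lazy-lookup idea (nothing below is LLVM API):

    #include <functional>
    #include <iostream>

    int main() {
      bool computed = false;
      std::function<int()> lookupAnalysis = [&]() -> int {
        computed = true;           // stands in for AM.getResult<...>(F)
        return 42;
      };
      bool needAnalysis = false;   // fast path: the transformation bails early
      int result = needAnalysis ? lookupAnalysis() : 0;
      std::cout << "computed = " << computed << ", result = " << result << "\n";
      return 0;                    // prints "computed = 0, result = 0"
    }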
+bool MemCpyOptLegacyPass::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); + auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + + auto LookupAliasAnalysis = [this]() -> AliasAnalysis & { + return getAnalysis<AAResultsWrapperPass>().getAAResults(); + }; + auto LookupAssumptionCache = [this, &F]() -> AssumptionCache & { + return getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + }; + auto LookupDomTree = [this]() -> DominatorTree & { + return getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + }; + + return Impl.runImpl(F, MD, TLI, LookupAliasAnalysis, LookupAssumptionCache, + LookupDomTree); +} diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index c812d618c16ac..30261b7550019 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -72,9 +72,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" @@ -82,51 +80,37 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" -#include <vector> using namespace llvm; #define DEBUG_TYPE "mldst-motion" +namespace { //===----------------------------------------------------------------------===// // MergedLoadStoreMotion Pass //===----------------------------------------------------------------------===// +class MergedLoadStoreMotion { + MemoryDependenceResults *MD = nullptr; + AliasAnalysis *AA = nullptr; -namespace { -class MergedLoadStoreMotion : public FunctionPass { - AliasAnalysis *AA; - MemoryDependenceAnalysis *MD; + // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, + // where Size0 and Size1 are the #instructions on the two sides of + // the diamond. The constant chosen here is arbitrary. Compiler Time + // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl. 
+ const int MagicCompileTimeControl = 250; public: - static char ID; // Pass identification, replacement for typeid - MergedLoadStoreMotion() - : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { - initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; + bool run(Function &F, MemoryDependenceResults *MD, AliasAnalysis &AA); private: - // This transformation requires dominator postdominator info - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<MemoryDependenceAnalysis>(); - } - - // Helper routines - /// /// \brief Remove instruction from parent and update memory dependence /// analysis. @@ -135,9 +119,9 @@ private: BasicBlock *getDiamondTail(BasicBlock *BB); bool isDiamondHead(BasicBlock *BB); // Routines for hoisting loads - bool isLoadHoistBarrierInRange(const Instruction& Start, - const Instruction& End, - LoadInst* LI); + bool isLoadHoistBarrierInRange(const Instruction &Start, + const Instruction &End, LoadInst *LI, + bool SafeToLoadUnconditionally); LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI); void hoistInstruction(BasicBlock *BB, Instruction *HoistCand, Instruction *ElseInst); @@ -151,31 +135,8 @@ private: const Instruction &End, MemoryLocation Loc); bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst); bool mergeStores(BasicBlock *BB); - // The mergeLoad/Store algorithms could have Size0 * Size1 complexity, - // where Size0 and Size1 are the #instructions on the two sides of - // the diamond. The constant chosen here is arbitrary. Compiler Time - // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl. - const int MagicCompileTimeControl; }; - -char MergedLoadStoreMotion::ID = 0; -} // anonymous namespace - -/// -/// \brief createMergedLoadStoreMotionPass - The public interface to this file. -/// -FunctionPass *llvm::createMergedLoadStoreMotionPass() { - return new MergedLoadStoreMotion(); -} - -INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", - "MergedLoadStoreMotion", false, false) -INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", - "MergedLoadStoreMotion", false, false) +} // end anonymous namespace /// /// \brief Remove instruction from parent and update memory dependence analysis. @@ -184,9 +145,9 @@ void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) { // Notify the memory dependence analysis. 
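As a rough worked example of the guard described in the comment above: with 20 instructions in one arm of the diamond and 15 in the other, 20 * 15 = 300 >= 250, so the quadratic merge scan for that diamond is abandoned; with 10 and 20 instructions, 10 * 20 = 200 < 250 and the scan proceeds.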
if (MD) { MD->removeInstruction(Inst); - if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + if (auto *LI = dyn_cast<LoadInst>(Inst)) MD->invalidateCachedPointerInfo(LI->getPointerOperand()); - if (Inst->getType()->getScalarType()->isPointerTy()) { + if (Inst->getType()->isPtrOrPtrVectorTy()) { MD->invalidateCachedPointerInfo(Inst); } } @@ -198,10 +159,7 @@ void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) { /// BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) { assert(isDiamondHead(BB) && "Basic block is not head of a diamond"); - BranchInst *BI = (BranchInst *)(BB->getTerminator()); - BasicBlock *Succ0 = BI->getSuccessor(0); - BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); - return Tail; + return BB->getTerminator()->getSuccessor(0)->getSingleSuccessor(); } /// @@ -210,25 +168,22 @@ BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) { bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { if (!BB) return false; - if (!isa<BranchInst>(BB->getTerminator())) - return false; - if (BB->getTerminator()->getNumSuccessors() != 2) + auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || !BI->isConditional()) return false; - BranchInst *BI = (BranchInst *)(BB->getTerminator()); BasicBlock *Succ0 = BI->getSuccessor(0); BasicBlock *Succ1 = BI->getSuccessor(1); - if (!Succ0->getSinglePredecessor() || - Succ0->getTerminator()->getNumSuccessors() != 1) + if (!Succ0->getSinglePredecessor()) return false; - if (!Succ1->getSinglePredecessor() || - Succ1->getTerminator()->getNumSuccessors() != 1) + if (!Succ1->getSinglePredecessor()) return false; - BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0); + BasicBlock *Succ0Succ = Succ0->getSingleSuccessor(); + BasicBlock *Succ1Succ = Succ1->getSingleSuccessor(); // Ignore triangles. - if (Succ1->getTerminator()->getSuccessor(0) != Tail) + if (!Succ0Succ || !Succ1Succ || Succ0Succ != Succ1Succ) return false; return true; } @@ -240,9 +195,14 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { /// being loaded or protect against the load from happening /// it is considered a hoist barrier. 
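A minimal source-level shape that produces the diamond isDiamondHead looks for (assuming earlier passes have not already collapsed it to a select):

    int diamond(bool c, int a, int b) {
      int x;
      if (c)          // head: conditional branch with two successors
        x = a + 1;    // Succ0: single predecessor, single successor
      else
        x = b + 1;    // Succ1: single predecessor, single successor
      return x;       // tail: the common single successor of both arms
    }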
/// -bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, - const Instruction& End, - LoadInst* LI) { +bool MergedLoadStoreMotion::isLoadHoistBarrierInRange( + const Instruction &Start, const Instruction &End, LoadInst *LI, + bool SafeToLoadUnconditionally) { + if (!SafeToLoadUnconditionally) + for (const Instruction &Inst : + make_range(Start.getIterator(), End.getIterator())) + if (!isGuaranteedToTransferExecutionToSuccessor(&Inst)) + return true; MemoryLocation Loc = MemoryLocation::get(LI); return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod); } @@ -256,23 +216,28 @@ bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, /// LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1, LoadInst *Load0) { - + BasicBlock *BB0 = Load0->getParent(); + BasicBlock *Head = BB0->getSinglePredecessor(); + bool SafeToLoadUnconditionally = isSafeToLoadUnconditionally( + Load0->getPointerOperand(), Load0->getAlignment(), + Load0->getModule()->getDataLayout(), + /*ScanFrom=*/Head->getTerminator()); for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE; ++BBI) { Instruction *Inst = &*BBI; // Only merge and hoist loads when their result in used only in BB - if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1)) + auto *Load1 = dyn_cast<LoadInst>(Inst); + if (!Load1 || Inst->isUsedOutsideOfBlock(BB1)) continue; - LoadInst *Load1 = dyn_cast<LoadInst>(Inst); - BasicBlock *BB0 = Load0->getParent(); - MemoryLocation Loc0 = MemoryLocation::get(Load0); MemoryLocation Loc1 = MemoryLocation::get(Load1); - if (AA->isMustAlias(Loc0, Loc1) && Load0->isSameOperationAs(Load1) && - !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1) && - !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0)) { + if (Load0->isSameOperationAs(Load1) && AA->isMustAlias(Loc0, Loc1) && + !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1, + SafeToLoadUnconditionally) && + !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0, + SafeToLoadUnconditionally)) { return Load1; } } @@ -319,11 +284,10 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, /// bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const { BasicBlock *Parent = I->getParent(); - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - Instruction *Instr = dyn_cast<Instruction>(I->getOperand(i)); - if (Instr && Instr->getParent() == Parent) - return false; - } + for (Use &U : I->operands()) + if (auto *Instr = dyn_cast<Instruction>(&U)) + if (Instr->getParent() == Parent) + return false; return true; } @@ -333,8 +297,8 @@ bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const { bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0, LoadInst *L1) { // Only one definition? 
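A sketch of the load-hoisting case that canHoistFromBlock and hoistLoad handle: both arms perform a simple, must-aliasing load whose value is used only inside its own block, so one load can be hoisted into the head. Shown as the equivalent source-level rewrite:

    // Before: each arm of the diamond loads *p.
    int beforeHoist(bool c, const int *p) {
      int x;
      if (c)
        x = *p + 1;
      else
        x = *p - 1;
      return x;
    }

    // After: the load is hoisted into the head block and both arms reuse it.
    int afterHoist(bool c, const int *p) {
      int v = *p;      // hoisted load
      int x;
      if (c)
        x = v + 1;
      else
        x = v - 1;
      return x;
    }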
- Instruction *A0 = dyn_cast<Instruction>(L0->getPointerOperand()); - Instruction *A1 = dyn_cast<Instruction>(L1->getPointerOperand()); + auto *A0 = dyn_cast<Instruction>(L0->getPointerOperand()); + auto *A1 = dyn_cast<Instruction>(L1->getPointerOperand()); if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) && A0->hasOneUse() && (A0->getParent() == L0->getParent()) && A1->hasOneUse() && (A1->getParent() == L1->getParent()) && @@ -345,8 +309,8 @@ bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0, hoistInstruction(BB, A0, A1); hoistInstruction(BB, L0, L1); return true; - } else - return false; + } + return false; } /// @@ -358,7 +322,7 @@ bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0, bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { bool MergedLoads = false; assert(isDiamondHead(BB)); - BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); + BranchInst *BI = cast<BranchInst>(BB->getTerminator()); BasicBlock *Succ0 = BI->getSuccessor(0); BasicBlock *Succ1 = BI->getSuccessor(1); // #Instructions in Succ1 for Compile Time Control @@ -369,8 +333,8 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { Instruction *I = &*BBI; ++BBI; - // Only move non-simple (atomic, volatile) loads. - LoadInst *L0 = dyn_cast<LoadInst>(I); + // Don't move non-simple (atomic, volatile) loads. + auto *L0 = dyn_cast<LoadInst>(I); if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0)) continue; @@ -399,6 +363,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start, const Instruction &End, MemoryLocation Loc) { + for (const Instruction &Inst : + make_range(Start.getIterator(), End.getIterator())) + if (Inst.mayThrow()) + return true; return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef); } @@ -411,22 +379,16 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, StoreInst *Store0) { DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n"); BasicBlock *BB0 = Store0->getParent(); - for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend(); - RBI != RBE; ++RBI) { - Instruction *Inst = &*RBI; - - if (!isa<StoreInst>(Inst)) - continue; - - StoreInst *Store1 = cast<StoreInst>(Inst); + for (Instruction &Inst : reverse(*BB1)) { + auto *Store1 = dyn_cast<StoreInst>(&Inst); + if (!Store1) + continue; MemoryLocation Loc0 = MemoryLocation::get(Store0); MemoryLocation Loc1 = MemoryLocation::get(Store1); if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) && - !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))), - BB1->back(), Loc1) && - !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))), - BB0->back(), Loc0)) { + !isStoreSinkBarrierInRange(*Store1->getNextNode(), BB1->back(), Loc1) && + !isStoreSinkBarrierInRange(*Store0->getNextNode(), BB0->back(), Loc0)) { return Store1; } } @@ -439,17 +401,17 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { // Create a phi if the values mismatch. 
- PHINode *NewPN = nullptr; Value *Opd1 = S0->getValueOperand(); Value *Opd2 = S1->getValueOperand(); - if (Opd1 != Opd2) { - NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", - &BB->front()); - NewPN->addIncoming(Opd1, S0->getParent()); - NewPN->addIncoming(Opd2, S1->getParent()); - if (MD && NewPN->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(NewPN); - } + if (Opd1 == Opd2) + return nullptr; + + auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", + &BB->front()); + NewPN->addIncoming(Opd1, S0->getParent()); + NewPN->addIncoming(Opd2, S1->getParent()); + if (MD && NewPN->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(NewPN); return NewPN; } @@ -461,8 +423,8 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { // Only one definition? - Instruction *A0 = dyn_cast<Instruction>(S0->getPointerOperand()); - Instruction *A1 = dyn_cast<Instruction>(S1->getPointerOperand()); + auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand()); + auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand()); if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() && (A0->getParent() == S0->getParent()) && A1->hasOneUse() && (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) { @@ -476,7 +438,7 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, S0->dropUnknownNonDebugMetadata(); // Create the new store to be inserted at the join point. - StoreInst *SNew = (StoreInst *)(S0->clone()); + StoreInst *SNew = cast<StoreInst>(S0->clone()); Instruction *ANew = A0->clone(); SNew->insertBefore(&*InsertPt); ANew->insertBefore(SNew); @@ -484,9 +446,8 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, assert(S0->getParent() == A0->getParent()); assert(S1->getParent() == A1->getParent()); - PHINode *NewPN = getPHIOperand(BB, S0, S1); // New PHI operand? Use it. - if (NewPN) + if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) SNew->setOperand(0, NewPN); removeInstruction(S0); removeInstruction(S1); @@ -532,11 +493,9 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { Instruction *I = &*RBI; ++RBI; - // Sink move non-simple (atomic, volatile) stores - if (!isa<StoreInst>(I)) - continue; - StoreInst *S0 = (StoreInst *)I; - if (!S0->isSimple()) + // Don't sink non-simple (atomic, volatile) stores. + auto *S0 = dyn_cast<StoreInst>(I); + if (!S0 || !S0->isSimple()) continue; ++NStores; @@ -551,22 +510,18 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { // is likely stale at this point. 
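The mirror-image store case that sinkStore and getPHIOperand handle, again as an equivalent source-level rewrite (the PHI materializes as the selected value stored in the tail):

    // Before: each arm stores a different value to the same address.
    void beforeSink(bool c, int *p, int a, int b) {
      if (c)
        *p = a;
      else
        *p = b;
    }

    // After: a single store is sunk into the tail block; the stored value is
    // a PHI of the two incoming values (written here as a conditional).
    void afterSink(bool c, int *p, int a, int b) {
      int v = c ? a : b;   // getPHIOperand's "<name>.sink" PHI
      *p = v;
    }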
if (!Res) break; - else { - RBI = Pred0->rbegin(); - RBE = Pred0->rend(); - DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); - } + RBI = Pred0->rbegin(); + RBE = Pred0->rend(); + DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); } } return MergedStores; } -/// -/// \brief Run the transformation for each function -/// -bool MergedLoadStoreMotion::runOnFunction(Function &F) { - MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>(); - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); +bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD, + AliasAnalysis &AA) { + this->MD = MD; + this->AA = &AA; bool Changed = false; DEBUG(dbgs() << "Instruction Merger\n"); @@ -585,3 +540,66 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) { } return Changed; } + +namespace { +class MergedLoadStoreMotionLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) { + initializeMergedLoadStoreMotionLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + /// + /// \brief Run the transformation for each function + /// + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + MergedLoadStoreMotion Impl; + auto *MDWP = getAnalysisIfAvailable<MemoryDependenceWrapperPass>(); + return Impl.run(F, MDWP ? &MDWP->getMemDep() : nullptr, + getAnalysis<AAResultsWrapperPass>().getAAResults()); + } + +private: + // This transformation requires dominator postdominator info + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<MemoryDependenceWrapperPass>(); + } +}; + +char MergedLoadStoreMotionLegacyPass::ID = 0; +} // anonymous namespace + +/// +/// \brief createMergedLoadStoreMotionPass - The public interface to this file. +/// +FunctionPass *llvm::createMergedLoadStoreMotionPass() { + return new MergedLoadStoreMotionLegacyPass(); +} + +INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", + "MergedLoadStoreMotion", false, false) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", + "MergedLoadStoreMotion", false, false) + +PreservedAnalyses +MergedLoadStoreMotionPass::run(Function &F, AnalysisManager<Function> &AM) { + MergedLoadStoreMotion Impl; + auto *MD = AM.getCachedResult<MemoryDependenceAnalysis>(F); + auto &AA = AM.getResult<AAManager>(F); + if (!Impl.run(F, MD, AA)) + return PreservedAnalyses::all(); + + // FIXME: This should also 'preserve the CFG'. + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + PA.preserve<MemoryDependenceAnalysis>(); + return PA; +} diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp index c8f885e7eec53..ed754fa710253 100644 --- a/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/lib/Transforms/Scalar/NaryReassociate.cpp @@ -208,7 +208,7 @@ FunctionPass *llvm::createNaryReassociatePass() { } bool NaryReassociate::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) + if (skipFunction(F)) return false; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -299,49 +299,18 @@ Instruction *NaryReassociate::tryReassociate(Instruction *I) { } } -// FIXME: extract this method into TTI->getGEPCost. 
static bool isGEPFoldable(GetElementPtrInst *GEP, - const TargetTransformInfo *TTI, - const DataLayout *DL) { - GlobalVariable *BaseGV = nullptr; - int64_t BaseOffset = 0; - bool HasBaseReg = false; - int64_t Scale = 0; - - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand())) - BaseGV = GV; - else - HasBaseReg = true; - - gep_type_iterator GTI = gep_type_begin(GEP); - for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) { - if (isa<SequentialType>(*GTI)) { - int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); - if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) { - BaseOffset += ConstIdx->getSExtValue() * ElementSize; - } else { - // Needs scale register. - if (Scale != 0) { - // No addressing mode takes two scale registers. - return false; - } - Scale = ElementSize; - } - } else { - StructType *STy = cast<StructType>(*GTI); - uint64_t Field = cast<ConstantInt>(*I)->getZExtValue(); - BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field); - } - } - - unsigned AddrSpace = GEP->getPointerAddressSpace(); - return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV, - BaseOffset, HasBaseReg, Scale, AddrSpace); + const TargetTransformInfo *TTI) { + SmallVector<const Value*, 4> Indices; + for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) + Indices.push_back(*I); + return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(), + Indices) == TargetTransformInfo::TCC_Free; } Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) { // Not worth reassociating GEP if it is foldable. - if (isGEPFoldable(GEP, TTI, DL)) + if (isGEPFoldable(GEP, TTI)) return nullptr; gep_type_iterator GTI = gep_type_begin(*GEP); @@ -434,7 +403,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType) uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType); - Type *ElementType = GEP->getType()->getElementType(); + Type *ElementType = GEP->getResultElementType(); uint64_t ElementSize = DL->getTypeAllocSize(ElementType); // Another less rare case: because I is not necessarily the last index of the // GEP, the size of the type at the I-th index (IndexedSize) is not diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 9f26f78892c65..c4b3e3464f409 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -13,12 +13,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -26,85 +24,9 @@ using namespace llvm; #define DEBUG_TYPE "partially-inline-libcalls" -namespace { - class PartiallyInlineLibCalls : public FunctionPass { - public: - static char ID; - - PartiallyInlineLibCalls() : - FunctionPass(ID) { - initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnFunction(Function &F) override; - - private: - /// Optimize calls to sqrt. 
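For tryReassociateGEPAtIndex, the comment "NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)" above amounts to the following source-level sketch (the names are illustrative, not taken from the pass):

    #include <cstdint>

    // If p1 = &a[i] is already available on a dominating path, a later
    // p2 = &a[i + j] can be rebuilt as a byte offset from p1 instead of
    // recomputing the index sum.
    int *reassociatedGEP(int *p1 /* == &a[i] */, int64_t j) {
      return reinterpret_cast<int *>(
          reinterpret_cast<char *>(p1) + j * static_cast<int64_t>(sizeof(int)));
    }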
- bool optimizeSQRT(CallInst *Call, Function *CalledFunc, - BasicBlock &CurrBB, Function::iterator &BB); - }; - - char PartiallyInlineLibCalls::ID = 0; -} - -INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls", - "Partially inline calls to library functions", false, false) - -void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - FunctionPass::getAnalysisUsage(AU); -} - -bool PartiallyInlineLibCalls::runOnFunction(Function &F) { - bool Changed = false; - Function::iterator CurrBB; - TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - const TargetTransformInfo *TTI = - &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) { - CurrBB = BB++; - - for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end(); - II != IE; ++II) { - CallInst *Call = dyn_cast<CallInst>(&*II); - Function *CalledFunc; - - if (!Call || !(CalledFunc = Call->getCalledFunction())) - continue; - - // Skip if function either has local linkage or is not a known library - // function. - LibFunc::Func LibFunc; - if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() || - !TLI->getLibFunc(CalledFunc->getName(), LibFunc)) - continue; - - switch (LibFunc) { - case LibFunc::sqrtf: - case LibFunc::sqrt: - if (TTI->haveFastSqrt(Call->getType()) && - optimizeSQRT(Call, CalledFunc, *CurrBB, BB)) - break; - continue; - default: - continue; - } - Changed = true; - break; - } - } - - return Changed; -} - -bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, - Function *CalledFunc, - BasicBlock &CurrBB, - Function::iterator &BB) { +static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, + BasicBlock &CurrBB, Function::iterator &BB) { // There is no need to change the IR, since backend will emit sqrt // instruction if the call has already been marked read-only. if (Call->onlyReadsMemory()) @@ -158,6 +80,97 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, return true; } +static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI) { + bool Changed = false; + + Function::iterator CurrBB; + for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) { + CurrBB = BB++; + + for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end(); + II != IE; ++II) { + CallInst *Call = dyn_cast<CallInst>(&*II); + Function *CalledFunc; + + if (!Call || !(CalledFunc = Call->getCalledFunction())) + continue; + + // Skip if function either has local linkage or is not a known library + // function. 
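Roughly, the effect of optimizeSQRT at source level (assuming a compiler that provides __builtin_sqrt; the pass itself duplicates the IR call as a read-only fast path and branches on a NaN check rather than using a builtin):

    #include <cmath>

    double partiallyInlinedSqrt(double x) {
      double r = __builtin_sqrt(x);   // fast path: lowers to a sqrt instruction
      if (r != r)                     // NaN means the input was invalid (negative)
        r = std::sqrt(x);             // slow path keeps the libcall/errno semantics
      return r;
    }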
+ LibFunc::Func LibFunc; + if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() || + !TLI->getLibFunc(CalledFunc->getName(), LibFunc)) + continue; + + switch (LibFunc) { + case LibFunc::sqrtf: + case LibFunc::sqrt: + if (TTI->haveFastSqrt(Call->getType()) && + optimizeSQRT(Call, CalledFunc, *CurrBB, BB)) + break; + continue; + default: + continue; + } + + Changed = true; + break; + } + } + + return Changed; +} + +PreservedAnalyses +PartiallyInlineLibCallsPass::run(Function &F, AnalysisManager<Function> &AM) { + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + if (!runPartiallyInlineLibCalls(F, &TLI, &TTI)) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} + +namespace { +class PartiallyInlineLibCallsLegacyPass : public FunctionPass { +public: + static char ID; + + PartiallyInlineLibCallsLegacyPass() : FunctionPass(ID) { + initializePartiallyInlineLibCallsLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + return runPartiallyInlineLibCalls(F, TLI, TTI); + } +}; +} + +char PartiallyInlineLibCallsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(PartiallyInlineLibCallsLegacyPass, + "partially-inline-libcalls", + "Partially inline calls to library functions", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(PartiallyInlineLibCallsLegacyPass, + "partially-inline-libcalls", + "Partially inline calls to library functions", false, false) + FunctionPass *llvm::createPartiallyInlineLibCallsPass() { - return new PartiallyInlineLibCalls(); + return new PartiallyInlineLibCallsLegacyPass(); } diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp index b56b355991200..e47b636348e33 100644 --- a/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -49,45 +49,32 @@ //===----------------------------------------------------------------------===// #include "llvm/Pass.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/ADT/SetOperations.h" + #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/IR/BasicBlock.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Statepoint.h" -#include "llvm/IR/Value.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/CommandLine.h" -#include 
"llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #define DEBUG_TYPE "safepoint-placement" + STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted"); -STATISTIC(NumCallSafepoints, "Number of call safepoints inserted"); STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted"); -STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop"); -STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution"); +STATISTIC(CallInLoop, + "Number of loops without safepoints due to calls in loop"); +STATISTIC(FiniteExecution, + "Number of loops without safepoints finite execution"); using namespace llvm; @@ -108,9 +95,6 @@ static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width", static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden, cl::init(false)); -// Print tracing output -static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false)); - namespace { /// An analysis pass whose purpose is to identify each of the backedges in @@ -138,8 +122,8 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { bool runOnLoop(Loop *); void runOnLoopAndSubLoops(Loop *L) { // Visit all the subloops - for (auto I = L->begin(), E = L->end(); I != E; I++) - runOnLoopAndSubLoops(*I); + for (Loop *I : *L) + runOnLoopAndSubLoops(I); runOnLoop(L); } @@ -147,8 +131,8 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - for (auto I = LI->begin(), E = LI->end(); I != E; I++) { - runOnLoopAndSubLoops(*I); + for (Loop *I : *LI) { + runOnLoopAndSubLoops(I); } return false; } @@ -200,13 +184,9 @@ static bool needsStatepoint(const CallSite &CS) { if (call->isInlineAsm()) return false; } - if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) { - return false; - } - return true; -} -static Value *ReplaceWithStatepoint(const CallSite &CS); + return !(isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)); +} /// Returns true if this loop is known to contain a call safepoint which /// must unconditionally execute on any iteration of the loop which returns @@ -278,43 +258,44 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, return /* not finite */ false; } -static void scanOneBB(Instruction *start, Instruction *end, - std::vector<CallInst *> &calls, - std::set<BasicBlock *> &seen, - std::vector<BasicBlock *> &worklist) { - for (BasicBlock::iterator itr(start); - itr != start->getParent()->end() && itr != BasicBlock::iterator(end); - itr++) { - if (CallInst *CI = dyn_cast<CallInst>(&*itr)) { - calls.push_back(CI); - } +static void scanOneBB(Instruction *Start, Instruction *End, + std::vector<CallInst *> &Calls, + DenseSet<BasicBlock *> &Seen, + std::vector<BasicBlock *> &Worklist) { + for (BasicBlock::iterator BBI(Start), BBE0 = Start->getParent()->end(), + BBE1 = BasicBlock::iterator(End); + BBI != BBE0 && BBI != BBE1; BBI++) { + if (CallInst *CI = dyn_cast<CallInst>(&*BBI)) + Calls.push_back(CI); + // FIXME: This code does not handle invokes - assert(!dyn_cast<InvokeInst>(&*itr) && + assert(!isa<InvokeInst>(&*BBI) && "support for invokes in poll code needed"); + // Only add the successor blocks if we reach the terminator instruction // without 
encountering end first - if (itr->isTerminator()) { - BasicBlock *BB = itr->getParent(); + if (BBI->isTerminator()) { + BasicBlock *BB = BBI->getParent(); for (BasicBlock *Succ : successors(BB)) { - if (seen.count(Succ) == 0) { - worklist.push_back(Succ); - seen.insert(Succ); + if (Seen.insert(Succ).second) { + Worklist.push_back(Succ); } } } } } -static void scanInlinedCode(Instruction *start, Instruction *end, - std::vector<CallInst *> &calls, - std::set<BasicBlock *> &seen) { - calls.clear(); - std::vector<BasicBlock *> worklist; - seen.insert(start->getParent()); - scanOneBB(start, end, calls, seen, worklist); - while (!worklist.empty()) { - BasicBlock *BB = worklist.back(); - worklist.pop_back(); - scanOneBB(&*BB->begin(), end, calls, seen, worklist); + +static void scanInlinedCode(Instruction *Start, Instruction *End, + std::vector<CallInst *> &Calls, + DenseSet<BasicBlock *> &Seen) { + Calls.clear(); + std::vector<BasicBlock *> Worklist; + Seen.insert(Start->getParent()); + scanOneBB(Start, End, Calls, Seen, Worklist); + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.back(); + Worklist.pop_back(); + scanOneBB(&*BB->begin(), End, Calls, Seen, Worklist); } } @@ -324,29 +305,27 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) { // Note: In common usage, there will be only one edge due to LoopSimplify // having run sometime earlier in the pipeline, but this code must be correct // w.r.t. loops with multiple backedges. - BasicBlock *header = L->getHeader(); + BasicBlock *Header = L->getHeader(); SmallVector<BasicBlock*, 16> LoopLatches; L->getLoopLatches(LoopLatches); - for (BasicBlock *pred : LoopLatches) { - assert(L->contains(pred)); + for (BasicBlock *Pred : LoopLatches) { + assert(L->contains(Pred)); // Make a policy decision about whether this loop needs a safepoint or // not. Note that this is about unburdening the optimizer in loops, not // avoiding the runtime cost of the actual safepoint. if (!AllBackedges) { - if (mustBeFiniteCountedLoop(L, SE, pred)) { - if (TraceLSP) - errs() << "skipping safepoint placement in finite loop\n"; + if (mustBeFiniteCountedLoop(L, SE, Pred)) { + DEBUG(dbgs() << "skipping safepoint placement in finite loop\n"); FiniteExecution++; continue; } if (CallSafepointsEnabled && - containsUnconditionalCallSafepoint(L, header, pred, *DT)) { + containsUnconditionalCallSafepoint(L, Header, Pred, *DT)) { // Note: This is only semantically legal since we won't do any further // IPO or inlining before the actual call insertion.. If we hadn't, we // might latter loose this call safepoint. 
- if (TraceLSP) - errs() << "skipping safepoint placement due to unconditional call\n"; + DEBUG(dbgs() << "skipping safepoint placement due to unconditional call\n"); CallInLoop++; continue; } @@ -360,14 +339,11 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) { // Safepoint insertion would involve creating a new basic block (as the // target of the current backedge) which does the safepoint (of all live // variables) and branches to the true header - TerminatorInst *term = pred->getTerminator(); + TerminatorInst *Term = Pred->getTerminator(); - if (TraceLSP) { - errs() << "[LSP] terminator instruction: "; - term->dump(); - } + DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term); - PollLocations.push_back(term); + PollLocations.push_back(Term); } return false; @@ -411,27 +387,26 @@ static Instruction *findLocationForEntrySafepoint(Function &F, // hasNextInstruction and nextInstruction are used to iterate // through a "straight line" execution sequence. - auto hasNextInstruction = [](Instruction *I) { - if (!I->isTerminator()) { + auto HasNextInstruction = [](Instruction *I) { + if (!I->isTerminator()) return true; - } + BasicBlock *nextBB = I->getParent()->getUniqueSuccessor(); return nextBB && (nextBB->getUniquePredecessor() != nullptr); }; - auto nextInstruction = [&hasNextInstruction](Instruction *I) { - assert(hasNextInstruction(I) && + auto NextInstruction = [&](Instruction *I) { + assert(HasNextInstruction(I) && "first check if there is a next instruction!"); - if (I->isTerminator()) { + + if (I->isTerminator()) return &I->getParent()->getUniqueSuccessor()->front(); - } else { - return &*++I->getIterator(); - } + return &*++I->getIterator(); }; - Instruction *cursor = nullptr; - for (cursor = &F.getEntryBlock().front(); hasNextInstruction(cursor); - cursor = nextInstruction(cursor)) { + Instruction *Cursor = nullptr; + for (Cursor = &F.getEntryBlock().front(); HasNextInstruction(Cursor); + Cursor = NextInstruction(Cursor)) { // We need to ensure a safepoint poll occurs before any 'real' call. The // easiest way to ensure finite execution between safepoints in the face of @@ -440,51 +415,17 @@ static Instruction *findLocationForEntrySafepoint(Function &F, // which can grow the stack by an unbounded amount. This isn't required // for GC semantics per se, but is a common requirement for languages // which detect stack overflow via guard pages and then throw exceptions. - if (auto CS = CallSite(cursor)) { + if (auto CS = CallSite(Cursor)) { if (doesNotRequireEntrySafepointBefore(CS)) continue; break; } } - assert((hasNextInstruction(cursor) || cursor->isTerminator()) && + assert((HasNextInstruction(Cursor) || Cursor->isTerminator()) && "either we stopped because of a call, or because of terminator"); - return cursor; -} - -/// Identify the list of call sites which need to be have parseable state -static void findCallSafepoints(Function &F, - std::vector<CallSite> &Found /*rval*/) { - assert(Found.empty() && "must be empty!"); - for (Instruction &I : instructions(F)) { - Instruction *inst = &I; - if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) { - CallSite CS(inst); - - // No safepoint needed or wanted - if (!needsStatepoint(CS)) { - continue; - } - - Found.push_back(CS); - } - } -} - -/// Implement a unique function which doesn't require we sort the input -/// vector. Doing so has the effect of changing the output of a couple of -/// tests in ways which make them less useful in testing fused safepoints. 
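A conceptual picture of what backedge poll placement buys; gc_safepoint_poll below is a hypothetical stand-in for inlining @gc.safepoint_poll, and, per the checks above, provably finite counted loops and loops already containing an unconditional call safepoint are skipped:

    struct Node { Node *next; };
    extern void gc_safepoint_poll();   // hypothetical stand-in for the poll

    // A loop with an unknown trip count gets a poll on its backedge so the
    // collector can always interrupt it within a bounded amount of work.
    int length(const Node *n) {
      int len = 0;
      while (n) {
        ++len;
        gc_safepoint_poll();           // inserted at the latch terminator
        n = n->next;
      }
      return len;
    }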
-template <typename T> static void unique_unsorted(std::vector<T> &vec) { - std::set<T> seen; - std::vector<T> tmp; - vec.reserve(vec.size()); - std::swap(tmp, vec); - for (auto V : tmp) { - if (seen.insert(V).second) { - vec.push_back(V); - } - } + return Cursor; } static const char *const GCSafepointPollName = "gc.safepoint_poll"; @@ -514,24 +455,6 @@ static bool enableEntrySafepoints(Function &F) { return !NoEntry; } static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; } static bool enableCallSafepoints(Function &F) { return !NoCall; } -// Normalize basic block to make it ready to be target of invoke statepoint. -// Ensure that 'BB' does not have phi nodes. It may require spliting it. -static BasicBlock *normalizeForInvokeSafepoint(BasicBlock *BB, - BasicBlock *InvokeParent) { - BasicBlock *ret = BB; - - if (!BB->getUniquePredecessor()) { - ret = SplitBlockPredecessors(BB, InvokeParent, ""); - } - - // Now that 'ret' has unique predecessor we can safely remove all phi nodes - // from it - FoldSingleEntryPHINodes(ret); - assert(!isa<PHINode>(ret->begin())); - - return ret; -} - bool PlaceSafepoints::runOnFunction(Function &F) { if (F.isDeclaration() || F.empty()) { // This is a declaration, nothing to do. Must exit early to avoid crash in @@ -549,13 +472,13 @@ bool PlaceSafepoints::runOnFunction(Function &F) { if (!shouldRewriteFunction(F)) return false; - bool modified = false; + bool Modified = false; // In various bits below, we rely on the fact that uses are reachable from // defs. When there are basic blocks unreachable from the entry, dominance // and reachablity queries return non-sensical results. Thus, we preprocess // the function to ensure these properties hold. - modified |= removeUnreachableBlocks(F); + Modified |= removeUnreachableBlocks(F); // STEP 1 - Insert the safepoint polling locations. We do not need to // actually insert parse points yet. That will be done for all polls and @@ -574,8 +497,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { // with for the moment. legacy::FunctionPassManager FPM(F.getParent()); bool CanAssumeCallSafepoints = enableCallSafepoints(F); - PlaceBackedgeSafepointsImpl *PBS = - new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints); + auto *PBS = new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints); FPM.add(PBS); FPM.run(F); @@ -603,7 +525,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { // The poll location must be the terminator of a loop latch block. for (TerminatorInst *Term : PollLocations) { // We are inserting a poll, the function is modified - modified = true; + Modified = true; if (SplitBackedge) { // Split the backedge of the loop and insert the poll within that new @@ -643,14 +565,13 @@ bool PlaceSafepoints::runOnFunction(Function &F) { } if (enableEntrySafepoints(F)) { - Instruction *Location = findLocationForEntrySafepoint(F, DT); - if (!Location) { - // policy choice not to insert? - } else { + if (Instruction *Location = findLocationForEntrySafepoint(F, DT)) { PollsNeeded.push_back(Location); - modified = true; + Modified = true; NumEntrySafepoints++; } + // TODO: else we should assert that there was, in fact, a policy choice to + // not insert a entry safepoint poll. 
} // Now that we've identified all the needed safepoint poll locations, insert @@ -661,71 +582,8 @@ bool PlaceSafepoints::runOnFunction(Function &F) { ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(), RuntimeCalls.end()); } - PollsNeeded.clear(); // make sure we don't accidentally use - // The dominator tree has been invalidated by the inlining performed in the - // above loop. TODO: Teach the inliner how to update the dom tree? - DT.recalculate(F); - - if (enableCallSafepoints(F)) { - std::vector<CallSite> Calls; - findCallSafepoints(F, Calls); - NumCallSafepoints += Calls.size(); - ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end()); - } - - // Unique the vectors since we can end up with duplicates if we scan the call - // site for call safepoints after we add it for entry or backedge. The - // only reason we need tracking at all is that some functions might have - // polls but not call safepoints and thus we might miss marking the runtime - // calls for the polls. (This is useful in test cases!) - unique_unsorted(ParsePointNeeded); - - // Any parse point (no matter what source) will be handled here - - // We're about to start modifying the function - if (!ParsePointNeeded.empty()) - modified = true; - - // Now run through and insert the safepoints, but do _NOT_ update or remove - // any existing uses. We have references to live variables that need to - // survive to the last iteration of this loop. - std::vector<Value *> Results; - Results.reserve(ParsePointNeeded.size()); - for (size_t i = 0; i < ParsePointNeeded.size(); i++) { - CallSite &CS = ParsePointNeeded[i]; - - // For invoke statepoints we need to remove all phi nodes at the normal - // destination block. - // Reason for this is that we can place gc_result only after last phi node - // in basic block. We will get malformed code after RAUW for the - // gc_result if one of this phi nodes uses result from the invoke. - if (InvokeInst *Invoke = dyn_cast<InvokeInst>(CS.getInstruction())) { - normalizeForInvokeSafepoint(Invoke->getNormalDest(), - Invoke->getParent()); - } - - Value *GCResult = ReplaceWithStatepoint(CS); - Results.push_back(GCResult); - } - assert(Results.size() == ParsePointNeeded.size()); - - // Adjust all users of the old call sites to use the new ones instead - for (size_t i = 0; i < ParsePointNeeded.size(); i++) { - CallSite &CS = ParsePointNeeded[i]; - Value *GCResult = Results[i]; - if (GCResult) { - // Can not RAUW for the invoke gc result in case of phi nodes preset. - assert(CS.isCall() || !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin())); - - // Replace all uses with the new call - CS.getInstruction()->replaceAllUsesWith(GCResult); - } - // Now that we've handled all uses, remove the original call itself - // Note: The insert point can't be the deleted instruction! 
- CS.getInstruction()->eraseFromParent(); - } - return modified; + return Modified; } char PlaceBackedgeSafepointsImpl::ID = 0; @@ -763,191 +621,60 @@ InsertSafepointPoll(Instruction *InsertBefore, auto *F = M->getFunction(GCSafepointPollName); assert(F && "gc.safepoint_poll function is missing"); - assert(F->getType()->getElementType() == + assert(F->getValueType() == FunctionType::get(Type::getVoidTy(M->getContext()), false) && "gc.safepoint_poll declared with wrong type"); assert(!F->empty() && "gc.safepoint_poll must be a non-empty function"); CallInst *PollCall = CallInst::Create(F, "", InsertBefore); // Record some information about the call site we're replacing - BasicBlock::iterator before(PollCall), after(PollCall); - bool isBegin(false); - if (before == OrigBB->begin()) { - isBegin = true; - } else { - before--; - } - after++; - assert(after != OrigBB->end() && "must have successor"); + BasicBlock::iterator Before(PollCall), After(PollCall); + bool IsBegin = false; + if (Before == OrigBB->begin()) + IsBegin = true; + else + Before--; - // do the actual inlining + After++; + assert(After != OrigBB->end() && "must have successor"); + + // Do the actual inlining InlineFunctionInfo IFI; bool InlineStatus = InlineFunction(PollCall, IFI); assert(InlineStatus && "inline must succeed"); (void)InlineStatus; // suppress warning in release-asserts - // Check post conditions + // Check post-conditions assert(IFI.StaticAllocas.empty() && "can't have allocs"); - std::vector<CallInst *> calls; // new calls - std::set<BasicBlock *> BBs; // new BBs + insertee + std::vector<CallInst *> Calls; // new calls + DenseSet<BasicBlock *> BBs; // new BBs + insertee + // Include only the newly inserted instructions, Note: begin may not be valid // if we inserted to the beginning of the basic block - BasicBlock::iterator start; - if (isBegin) { - start = OrigBB->begin(); - } else { - start = before; - start++; - } + BasicBlock::iterator Start = IsBegin ? OrigBB->begin() : std::next(Before); // If your poll function includes an unreachable at the end, that's not // valid. Bugpoint likes to create this, so check for it. - assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) && + assert(isPotentiallyReachable(&*Start, &*After) && "malformed poll function"); - scanInlinedCode(&*(start), &*(after), calls, BBs); - assert(!calls.empty() && "slow path not found for safepoint poll"); + scanInlinedCode(&*Start, &*After, Calls, BBs); + assert(!Calls.empty() && "slow path not found for safepoint poll"); // Record the fact we need a parsable state at the runtime call contained in // the poll function. This is required so that the runtime knows how to // parse the last frame when we actually take the safepoint (i.e. execute // the slow path) assert(ParsePointsNeeded.empty()); - for (size_t i = 0; i < calls.size(); i++) { - + for (auto *CI : Calls) { // No safepoint needed or wanted - if (!needsStatepoint(calls[i])) { + if (!needsStatepoint(CI)) continue; - } // These are likely runtime calls. Should we assert that via calling // convention or something? - ParsePointsNeeded.push_back(CallSite(calls[i])); - } - assert(ParsePointsNeeded.size() <= calls.size()); -} - -/// Replaces the given call site (Call or Invoke) with a gc.statepoint -/// intrinsic with an empty deoptimization arguments list. This does -/// NOT do explicit relocation for GC support. 
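InsertSafepointPoll relies on the module providing a gc.safepoint_poll implementation; a conventional shape is a cheap flag check with a runtime call on the slow path (all names below are hypothetical), and it is that slow-path call which scanInlinedCode finds and records as a parse point:

    extern volatile int safepoint_requested;   // hypothetical polling flag
    extern void runtime_do_safepoint();        // hypothetical runtime entry point

    void gc_safepoint_poll() {
      if (safepoint_requested)     // fast path: a single load and branch
        runtime_do_safepoint();    // slow path: needs a parseable statepoint
    }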
-static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) { - assert(CS.getInstruction()->getModule() && "must be set"); - - // TODO: technically, a pass is not allowed to get functions from within a - // function pass since it might trigger a new function addition. Refactor - // this logic out to the initialization of the pass. Doesn't appear to - // matter in practice. - - // Then go ahead and use the builder do actually do the inserts. We insert - // immediately before the previous instruction under the assumption that all - // arguments will be available here. We can't insert afterwards since we may - // be replacing a terminator. - IRBuilder<> Builder(CS.getInstruction()); - - // Note: The gc args are not filled in at this time, that's handled by - // RewriteStatepointsForGC (which is currently under review). - - // Create the statepoint given all the arguments - Instruction *Token = nullptr; - - uint64_t ID; - uint32_t NumPatchBytes; - - AttributeSet OriginalAttrs = CS.getAttributes(); - Attribute AttrID = - OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, "statepoint-id"); - Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute( - AttributeSet::FunctionIndex, "statepoint-num-patch-bytes"); - - AttrBuilder AttrsToRemove; - bool HasID = AttrID.isStringAttribute() && - !AttrID.getValueAsString().getAsInteger(10, ID); - - if (HasID) - AttrsToRemove.addAttribute("statepoint-id"); - else - ID = 0xABCDEF00; - - bool HasNumPatchBytes = - AttrNumPatchBytes.isStringAttribute() && - !AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes); - - if (HasNumPatchBytes) - AttrsToRemove.addAttribute("statepoint-num-patch-bytes"); - else - NumPatchBytes = 0; - - OriginalAttrs = OriginalAttrs.removeAttributes( - CS.getInstruction()->getContext(), AttributeSet::FunctionIndex, - AttrsToRemove); - - if (CS.isCall()) { - CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); - CallInst *Call = Builder.CreateGCStatepointCall( - ID, NumPatchBytes, CS.getCalledValue(), - makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, - "safepoint_token"); - Call->setTailCall(ToReplace->isTailCall()); - Call->setCallingConv(ToReplace->getCallingConv()); - - // In case if we can handle this set of attributes - set up function - // attributes directly on statepoint and return attributes later for - // gc_result intrinsic. - Call->setAttributes(OriginalAttrs.getFnAttributes()); - - Token = Call; - - // Put the following gc_result and gc_relocate calls immediately after - // the old call (which we're about to delete). - assert(ToReplace->getNextNode() && "not a terminator, must have next"); - Builder.SetInsertPoint(ToReplace->getNextNode()); - Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc()); - } else if (CS.isInvoke()) { - InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction()); - - // Insert the new invoke into the old block. We'll remove the old one in a - // moment at which point this will become the new terminator for the - // original block. - Builder.SetInsertPoint(ToReplace->getParent()); - InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( - ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(), - ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()), - None, None, "safepoint_token"); - - Invoke->setCallingConv(ToReplace->getCallingConv()); - - // In case if we can handle this set of attributes - set up function - // attributes directly on statepoint and return attributes later for - // gc_result intrinsic. 
- Invoke->setAttributes(OriginalAttrs.getFnAttributes()); - - Token = Invoke; - - // We'll insert the gc.result into the normal block - BasicBlock *NormalDest = ToReplace->getNormalDest(); - // Can not insert gc.result in case of phi nodes preset. - // Should have removed this cases prior to running this function - assert(!isa<PHINode>(NormalDest->begin())); - Instruction *IP = &*(NormalDest->getFirstInsertionPt()); - Builder.SetInsertPoint(IP); - } else { - llvm_unreachable("unexpect type of CallSite"); - } - assert(Token); - - // Handle the return value of the original call - update all uses to use a - // gc_result hanging off the statepoint node we just inserted - - // Only add the gc_result iff there is actually a used result - if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { - std::string TakenName = - CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; - CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), TakenName); - GCResult->setAttributes(OriginalAttrs.getRetAttributes()); - return GCResult; - } else { - // No return value for the call. - return nullptr; + ParsePointsNeeded.push_back(CallSite(CI)); } + assert(ParsePointsNeeded.size() <= Calls.size()); } diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index bcadd4e2bee69..b930a8fb7e999 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -20,7 +20,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/Reassociate.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" @@ -39,9 +39,11 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; +using namespace reassociate; #define DEBUG_TYPE "reassociate" @@ -49,17 +51,6 @@ STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); -namespace { - struct ValueEntry { - unsigned Rank; - Value *Op; - ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {} - }; - inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) { - return LHS.Rank > RHS.Rank; // Sort so that highest rank goes to start. - } -} - #ifndef NDEBUG /// Print out the expression identified in the Ops list. /// @@ -75,120 +66,35 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { } #endif -namespace { - /// \brief Utility class representing a base and exponent pair which form one - /// factor of some product. - struct Factor { - Value *Base; - unsigned Power; - - Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} - - /// \brief Sort factors in descending order by their power. - struct PowerDescendingSorter { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Power > RHS.Power; - } - }; - - /// \brief Compare factors for equal powers. - struct PowerEqual { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Power == RHS.Power; - } - }; - }; - - /// Utility class representing a non-constant Xor-operand. 
We classify - /// non-constant Xor-Operands into two categories: - /// C1) The operand is in the form "X & C", where C is a constant and C != ~0 - /// C2) - /// C2.1) The operand is in the form of "X | C", where C is a non-zero - /// constant. - /// C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this - /// operand as "E | 0" - class XorOpnd { - public: - XorOpnd(Value *V); - - bool isInvalid() const { return SymbolicPart == nullptr; } - bool isOrExpr() const { return isOr; } - Value *getValue() const { return OrigVal; } - Value *getSymbolicPart() const { return SymbolicPart; } - unsigned getSymbolicRank() const { return SymbolicRank; } - const APInt &getConstPart() const { return ConstPart; } - - void Invalidate() { SymbolicPart = OrigVal = nullptr; } - void setSymbolicRank(unsigned R) { SymbolicRank = R; } - - // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank. - // The purpose is twofold: - // 1) Cluster together the operands sharing the same symbolic-value. - // 2) Operand having smaller symbolic-value-rank is permuted earlier, which - // could potentially shorten crital path, and expose more loop-invariants. - // Note that values' rank are basically defined in RPO order (FIXME). - // So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier - // than Y which is defined earlier than Z. Permute "x | 1", "Y & 2", - // "z" in the order of X-Y-Z is better than any other orders. - struct PtrSortFunctor { - bool operator()(XorOpnd * const &LHS, XorOpnd * const &RHS) { - return LHS->getSymbolicRank() < RHS->getSymbolicRank(); - } - }; - private: - Value *OrigVal; - Value *SymbolicPart; - APInt ConstPart; - unsigned SymbolicRank; - bool isOr; - }; -} - -namespace { - class Reassociate : public FunctionPass { - DenseMap<BasicBlock*, unsigned> RankMap; - DenseMap<AssertingVH<Value>, unsigned> ValueRankMap; - SetVector<AssertingVH<Instruction> > RedoInsts; - bool MadeChange; - public: - static char ID; // Pass identification, replacement for typeid - Reassociate() : FunctionPass(ID) { - initializeReassociatePass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - private: - void BuildRankMap(Function &F); - unsigned getRank(Value *V); - void canonicalizeOperands(Instruction *I); - void ReassociateExpression(BinaryOperator *I); - void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); - Value *OptimizeExpression(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops); - Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); - Value *OptimizeXor(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); - bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd, - Value *&Res); - bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2, - APInt &ConstOpnd, Value *&Res); - bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, - SmallVectorImpl<Factor> &Factors); - Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder, - SmallVectorImpl<Factor> &Factors); - Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); - Value *RemoveFactorFromExpression(Value *V, Value *Factor); - void EraseInst(Instruction *I); - void RecursivelyEraseDeadInsts(Instruction *I, - SetVector<AssertingVH<Instruction>> &Insts); - void OptimizeInst(Instruction *I); - Instruction *canonicalizeNegConstExpr(Instruction *I); - }; -} +/// Utility class 
representing a non-constant Xor-operand. We classify +/// non-constant Xor-Operands into two categories: +/// C1) The operand is in the form "X & C", where C is a constant and C != ~0 +/// C2) +/// C2.1) The operand is in the form of "X | C", where C is a non-zero +/// constant. +/// C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this +/// operand as "E | 0" +class llvm::reassociate::XorOpnd { +public: + XorOpnd(Value *V); + + bool isInvalid() const { return SymbolicPart == nullptr; } + bool isOrExpr() const { return isOr; } + Value *getValue() const { return OrigVal; } + Value *getSymbolicPart() const { return SymbolicPart; } + unsigned getSymbolicRank() const { return SymbolicRank; } + const APInt &getConstPart() const { return ConstPart; } + + void Invalidate() { SymbolicPart = OrigVal = nullptr; } + void setSymbolicRank(unsigned R) { SymbolicRank = R; } + +private: + Value *OrigVal; + Value *SymbolicPart; + APInt ConstPart; + unsigned SymbolicRank; + bool isOr; +}; XorOpnd::XorOpnd(Value *V) { assert(!isa<ConstantInt>(V) && "No ConstantInt"); @@ -217,13 +123,6 @@ XorOpnd::XorOpnd(Value *V) { isOr = true; } -char Reassociate::ID = 0; -INITIALIZE_PASS(Reassociate, "reassociate", - "Reassociate expressions", false, false) - -// Public interface to the Reassociate pass -FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } - /// Return true if V is an instruction of the specified opcode and if it /// only has one use. static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { @@ -246,7 +145,8 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, return nullptr; } -void Reassociate::BuildRankMap(Function &F) { +void ReassociatePass::BuildRankMap( + Function &F, ReversePostOrderTraversal<Function *> &RPOT) { unsigned i = 2; // Assign distinct ranks to function arguments. @@ -255,22 +155,19 @@ void Reassociate::BuildRankMap(Function &F) { DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n"); } - ReversePostOrderTraversal<Function*> RPOT(&F); - for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), - E = RPOT.end(); I != E; ++I) { - BasicBlock *BB = *I; + for (BasicBlock *BB : RPOT) { unsigned BBRank = RankMap[BB] = ++i << 16; // Walk the basic block, adding precomputed ranks for any instructions that // we cannot move. This ensures that the ranks for these instructions are // all different in the block. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (mayBeMemoryDependent(*I)) - ValueRankMap[&*I] = ++BBRank; + for (Instruction &I : *BB) + if (mayBeMemoryDependent(I)) + ValueRankMap[&I] = ++BBRank; } } -unsigned Reassociate::getRank(Value *V) { +unsigned ReassociatePass::getRank(Value *V) { Instruction *I = dyn_cast<Instruction>(V); if (!I) { if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument. @@ -301,7 +198,7 @@ unsigned Reassociate::getRank(Value *V) { } // Canonicalize constants to RHS. Otherwise, sort the operands by rank. -void Reassociate::canonicalizeOperands(Instruction *I) { +void ReassociatePass::canonicalizeOperands(Instruction *I) { assert(isa<BinaryOperator>(I) && "Expected binary operator."); assert(I->isCommutative() && "Expected commutative operator."); @@ -711,8 +608,8 @@ static bool LinearizeExprTree(BinaryOperator *I, /// Now that the operands for this expression tree are /// linearized and optimized, emit them in-order. 
-void Reassociate::RewriteExprTree(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops) { +void ReassociatePass::RewriteExprTree(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { assert(Ops.size() > 1 && "Single values should be used directly!"); // Since our optimizations should never increase the number of operations, the @@ -1095,7 +992,7 @@ static Value *EmitAddTreeOfValues(Instruction *I, /// If V is an expression tree that is a multiplication sequence, /// and if this sequence contains a multiply by Factor, /// remove Factor from the tree and return the new tree. -Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { +Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul); if (!BO) return nullptr; @@ -1129,7 +1026,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { } } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) { if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) { - APFloat F1(FC1->getValueAPF()); + const APFloat &F1 = FC1->getValueAPF(); APFloat F2(FC2->getValueAPF()); F2.changeSign(); if (F1.compare(F2) == APFloat::cmpEqual) { @@ -1258,9 +1155,9 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, // If it was successful, true is returned, and the "R" and "C" is returned // via "Res" and "ConstOpnd", respectively; otherwise, false is returned, // and both "Res" and "ConstOpnd" remain unchanged. -// -bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, - APInt &ConstOpnd, Value *&Res) { +// +bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, + APInt &ConstOpnd, Value *&Res) { // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 // = ((x | c1) ^ c1) ^ (c1 ^ c2) // = (x & ~c1) ^ (c1 ^ c2) @@ -1294,8 +1191,9 @@ bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, // via "Res" and "ConstOpnd", respectively (If the entire expression is // evaluated to a constant, the Res is set to NULL); otherwise, false is // returned, and both "Res" and "ConstOpnd" remain unchanged. -bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2, - APInt &ConstOpnd, Value *&Res) { +bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, + XorOpnd *Opnd2, APInt &ConstOpnd, + Value *&Res) { Value *X = Opnd1->getSymbolicPart(); if (X != Opnd2->getSymbolicPart()) return false; @@ -1369,8 +1267,8 @@ bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2, /// Optimize a series of operands to an 'xor' instruction. If it can be reduced /// to a single Value, it is returned, otherwise the Ops list is mutated as /// necessary. -Value *Reassociate::OptimizeXor(Instruction *I, - SmallVectorImpl<ValueEntry> &Ops) { +Value *ReassociatePass::OptimizeXor(Instruction *I, + SmallVectorImpl<ValueEntry> &Ops) { if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops)) return V; @@ -1405,7 +1303,19 @@ Value *Reassociate::OptimizeXor(Instruction *I, // the same symbolic value cluster together. For instance, the input operand // sequence ("x | 123", "y & 456", "x & 789") will be sorted into: // ("x | 123", "x & 789", "y & 456"). - std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor()); + // + // The purpose is twofold: + // 1) Cluster together the operands sharing the same symbolic-value. 
+ // 2) Operand having smaller symbolic-value-rank is permuted earlier, which + // could potentially shorten crital path, and expose more loop-invariants. + // Note that values' rank are basically defined in RPO order (FIXME). + // So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier + // than Y which is defined earlier than Z. Permute "x | 1", "Y & 2", + // "z" in the order of X-Y-Z is better than any other orders. + std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), + [](XorOpnd *LHS, XorOpnd *RHS) { + return LHS->getSymbolicRank() < RHS->getSymbolicRank(); + }); // Step 3: Combine adjacent operands XorOpnd *PrevOpnd = nullptr; @@ -1478,8 +1388,8 @@ Value *Reassociate::OptimizeXor(Instruction *I, /// Optimize a series of operands to an 'add' instruction. This /// optimizes based on identities. If it can be reduced to a single Value, it /// is returned, otherwise the Ops list is mutated as necessary. -Value *Reassociate::OptimizeAdd(Instruction *I, - SmallVectorImpl<ValueEntry> &Ops) { +Value *ReassociatePass::OptimizeAdd(Instruction *I, + SmallVectorImpl<ValueEntry> &Ops) { // Scan the operand lists looking for X and -X pairs. If we find any, we // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it, // scan for any @@ -1716,8 +1626,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I, /// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] /// /// \returns Whether any factors have a power greater than one. -bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, - SmallVectorImpl<Factor> &Factors) { +bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors) { // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. // Compute the sum of powers of simplifiable factors. unsigned FactorPowerSum = 0; @@ -1763,7 +1673,10 @@ bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, // below our mininum of '4'. assert(FactorPowerSum >= 4); - std::stable_sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter()); + std::stable_sort(Factors.begin(), Factors.end(), + [](const Factor &LHS, const Factor &RHS) { + return LHS.Power > RHS.Power; + }); return true; } @@ -1790,8 +1703,9 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder, /// equal and the powers are sorted in decreasing order, compute the minimal /// DAG of multiplies to compute the final product, and return that product /// value. -Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder, - SmallVectorImpl<Factor> &Factors) { +Value * +ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors) { assert(Factors[0].Power); SmallVector<Value *, 4> OuterProduct; for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size(); @@ -1822,7 +1736,9 @@ Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder, // Unique factors with equal powers -- we've folded them into the first one's // base. 
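The CombineXorOpnd rewrite above leans on the algebraic identity labelled Xor-Rule 1, (x | c1) ^ c2 == (x & ~c1) ^ (c1 ^ c2), which holds bit-for-bit for any operands. The following standalone sketch (plain C++, independent of LLVM) brute-forces the identity over all 8-bit combinations as a sanity check.

// Sanity check for Xor-Rule 1 from ReassociatePass::CombineXorOpnd:
//   (x | c1) ^ c2 == (x & ~c1) ^ (c1 ^ c2)
// Exhaustively verified over all 8-bit operand combinations.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C1 = 0; C1 < 256; ++C1)
      for (unsigned C2 = 0; C2 < 256; ++C2) {
        std::uint8_t x = X, c1 = C1, c2 = C2;
        std::uint8_t LHS = (x | c1) ^ c2;
        std::uint8_t RHS = (x & static_cast<std::uint8_t>(~c1)) ^ (c1 ^ c2);
        assert(LHS == RHS && "Xor-Rule 1 violated");
      }
  std::puts("Xor-Rule 1 holds for all 8-bit values");
  return 0;
}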
Factors.erase(std::unique(Factors.begin(), Factors.end(), - Factor::PowerEqual()), + [](const Factor &LHS, const Factor &RHS) { + return LHS.Power == RHS.Power; + }), Factors.end()); // Iteratively collect the base of each factor with an add power into the @@ -1845,8 +1761,8 @@ Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder, return V; } -Value *Reassociate::OptimizeMul(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops) { +Value *ReassociatePass::OptimizeMul(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { // We can only optimize the multiplies when there is a chain of more than // three, such that a balanced tree might require fewer total multiplies. if (Ops.size() < 4) @@ -1869,8 +1785,8 @@ Value *Reassociate::OptimizeMul(BinaryOperator *I, return nullptr; } -Value *Reassociate::OptimizeExpression(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops) { +Value *ReassociatePass::OptimizeExpression(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. Constant *Cst = nullptr; @@ -1930,7 +1846,7 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, // Remove dead instructions and if any operands are trivially dead add them to // Insts so they will be removed as well. -void Reassociate::RecursivelyEraseDeadInsts( +void ReassociatePass::RecursivelyEraseDeadInsts( Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end()); @@ -1945,7 +1861,7 @@ void Reassociate::RecursivelyEraseDeadInsts( } /// Zap the given instruction, adding interesting operands to the work list. -void Reassociate::EraseInst(Instruction *I) { +void ReassociatePass::EraseInst(Instruction *I) { assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end()); // Erase the dead instruction. @@ -1969,7 +1885,7 @@ void Reassociate::EraseInst(Instruction *I) { // Canonicalize expressions of the following form: // x + (-Constant * y) -> x - (Constant * y) // x - (-Constant * y) -> x + (Constant * y) -Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) { +Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) { if (!I->hasOneUse() || I->getType()->isVectorTy()) return nullptr; @@ -2046,7 +1962,7 @@ Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) { /// Inspect and optimize the given instruction. Note that erasing /// instructions is not allowed. -void Reassociate::OptimizeInst(Instruction *I) { +void ReassociatePass::OptimizeInst(Instruction *I) { // Only consider operations that we understand. if (!isa<BinaryOperator>(I)) return; @@ -2173,7 +2089,7 @@ void Reassociate::OptimizeInst(Instruction *I) { ReassociateExpression(BO); } -void Reassociate::ReassociateExpression(BinaryOperator *I) { +void ReassociatePass::ReassociateExpression(BinaryOperator *I) { // First, walk the expression tree, linearizing the tree, collecting the // operand information. 
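The lambda-based std::stable_sort and std::unique calls above replace the old PowerDescendingSorter and PowerEqual functor structs without changing behaviour. As a rough illustration of the surrounding (base, power) machinery -- plain C++ with strings standing in for Values, not the LLVM code -- the sketch below collects factors from a flattened multiply operand list the way collectMultiplyFactors describes (e.g. ((((x*y)*x)*y)*x) becomes [(x, 3), (y, 2)]) and then applies the same sort/unique idiom.

// Illustrative only: gather (base, power) factors from a flattened multiply
// operand list, then use the lambda-based sort/unique idiom the patch adopts.
#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Factor {
  std::string Base;
  unsigned Power;
};

int main() {
  // Flattened operands of ((((x*y)*x)*y)*x).
  std::vector<std::string> Ops = {"x", "y", "x", "y", "x"};

  // Count how often each base occurs; that count is its power in the product.
  std::map<std::string, unsigned> Count;
  for (const std::string &Op : Ops)
    ++Count[Op];

  std::vector<Factor> Factors;
  for (const auto &KV : Count)
    Factors.push_back({KV.first, KV.second});

  // Sort factors by descending power, as collectMultiplyFactors does.
  std::stable_sort(Factors.begin(), Factors.end(),
                   [](const Factor &LHS, const Factor &RHS) {
                     return LHS.Power > RHS.Power;
                   });

  // Drop adjacent factors with equal powers, mirroring the std::unique call
  // in buildMinimalMultiplyDAG (no duplicates exist in this small example).
  Factors.erase(std::unique(Factors.begin(), Factors.end(),
                            [](const Factor &LHS, const Factor &RHS) {
                              return LHS.Power == RHS.Power;
                            }),
                Factors.end());

  for (const Factor &F : Factors)
    std::printf("(%s, %u)\n", F.Base.c_str(), F.Power); // (x, 3) then (y, 2)
  return 0;
}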
SmallVector<RepeatedValue, 8> Tree; @@ -2255,46 +2171,53 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { RewriteExprTree(I, Ops); } -bool Reassociate::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - // Calculate the rank map for F - BuildRankMap(F); +PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { + // Reassociate needs for each instruction to have its operands already + // processed, so we first perform a RPOT of the basic blocks so that + // when we process a basic block, all its dominators have been processed + // before. + ReversePostOrderTraversal<Function *> RPOT(&F); + BuildRankMap(F, RPOT); MadeChange = false; - for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + for (BasicBlock *BI : RPOT) { + // Use a worklist to keep track of which instructions have been processed + // (and which insts won't be optimized again) so when redoing insts, + // optimize insts rightaway which won't be processed later. + SmallSet<Instruction *, 8> Worklist; + + // Insert all instructions in the BB + for (Instruction &I : *BI) + Worklist.insert(&I); + // Optimize every instruction in the basic block. - for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;) { + // This instruction has been processed. + Worklist.erase(&*II); if (isInstructionTriviallyDead(&*II)) { EraseInst(&*II++); } else { OptimizeInst(&*II); - assert(II->getParent() == BI && "Moved to a different block!"); + assert(II->getParent() == &*BI && "Moved to a different block!"); ++II; } - // Make a copy of all the instructions to be redone so we can remove dead - // instructions. - SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts); - // Iterate over all instructions to be reevaluated and remove trivially dead - // instructions. If any operand of the trivially dead instruction becomes - // dead mark it for deletion as well. Continue this process until all - // trivially dead instructions have been removed. - while (!ToRedo.empty()) { - Instruction *I = ToRedo.pop_back_val(); - if (isInstructionTriviallyDead(I)) - RecursivelyEraseDeadInsts(I, ToRedo); - } - - // Now that we have removed dead instructions, we can reoptimize the - // remaining instructions. - while (!RedoInsts.empty()) { - Instruction *I = RedoInsts.pop_back_val(); - if (isInstructionTriviallyDead(I)) - EraseInst(I); - else - OptimizeInst(I); + // If the above optimizations produced new instructions to optimize or + // made modifications which need to be redone, do them now if they won't + // be handled later. + while (!RedoInsts.empty()) { + Instruction *I = RedoInsts.pop_back_val(); + // Process instructions that won't be processed later, either + // inside the block itself or in another basic block (based on rank), + // since these will be processed later. + if ((I->getParent() != BI || !Worklist.count(I)) && + RankMap[I->getParent()] <= RankMap[BI]) { + if (isInstructionTriviallyDead(I)) + EraseInst(I); + else + OptimizeInst(I); + } + } } } @@ -2302,5 +2225,46 @@ bool Reassociate::runOnFunction(Function &F) { RankMap.clear(); ValueRankMap.clear(); - return MadeChange; + if (MadeChange) { + // FIXME: This should also 'preserve the CFG'. 
+ auto PA = PreservedAnalyses(); + PA.preserve<GlobalsAA>(); + return PA; + } + + return PreservedAnalyses::all(); +} + +namespace { + class ReassociateLegacyPass : public FunctionPass { + ReassociatePass Impl; + public: + static char ID; // Pass identification, replacement for typeid + ReassociateLegacyPass() : FunctionPass(ID) { + initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + FunctionAnalysisManager DummyFAM; + auto PA = Impl.run(F, DummyFAM); + return !PA.areAllPreserved(); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + }; +} + +char ReassociateLegacyPass::ID = 0; +INITIALIZE_PASS(ReassociateLegacyPass, "reassociate", + "Reassociate expressions", false, false) + +// Public interface to the Reassociate pass +FunctionPass *llvm::createReassociatePass() { + return new ReassociateLegacyPass(); } diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 915f89780c080..615029dd161bb 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -68,7 +68,7 @@ INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots", false, false) bool RegToMem::runOnFunction(Function &F) { - if (F.isDeclaration()) + if (F.isDeclaration() || skipFunction(F)) return false; // Insert all new allocas into entry block. @@ -89,10 +89,9 @@ bool RegToMem::runOnFunction(Function &F) { // Find the escaped instructions. But don't create stack slots for // allocas in entry block. std::list<Instruction*> WorkList; - for (Function::iterator ibb = F.begin(), ibe = F.end(); - ibb != ibe; ++ibb) - for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); - iib != iie; ++iib) { + for (BasicBlock &ibb : F) + for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie; + ++iib) { if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) && valueEscapes(&*iib)) { WorkList.push_front(&*iib); @@ -101,25 +100,22 @@ bool RegToMem::runOnFunction(Function &F) { // Demote escaped instructions NumRegsDemoted += WorkList.size(); - for (std::list<Instruction*>::iterator ilb = WorkList.begin(), - ile = WorkList.end(); ilb != ile; ++ilb) - DemoteRegToStack(**ilb, false, AllocaInsertionPoint); + for (Instruction *ilb : WorkList) + DemoteRegToStack(*ilb, false, AllocaInsertionPoint); WorkList.clear(); // Find all phi's - for (Function::iterator ibb = F.begin(), ibe = F.end(); - ibb != ibe; ++ibb) - for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); - iib != iie; ++iib) + for (BasicBlock &ibb : F) + for (BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie; + ++iib) if (isa<PHINode>(iib)) WorkList.push_front(&*iib); // Demote phi nodes NumPhisDemoted += WorkList.size(); - for (std::list<Instruction*>::iterator ilb = WorkList.begin(), - ile = WorkList.end(); ilb != ile; ++ilb) - DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint); + for (Instruction *ilb : WorkList) + DemotePHIToStack(cast<PHINode>(ilb), AllocaInsertionPoint); return true; } diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index d77d5745e60cc..bab39a32677ff 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -14,7 +14,6 @@ #include "llvm/Pass.h" #include "llvm/Analysis/CFG.h" -#include 
"llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" @@ -63,7 +62,7 @@ static cl::opt<unsigned> RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden, cl::init(6)); -#ifdef XDEBUG +#ifdef EXPENSIVE_CHECKS static bool ClobberNonLive = true; #else static bool ClobberNonLive = false; @@ -72,19 +71,10 @@ static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live", cl::location(ClobberNonLive), cl::Hidden); -static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden, - cl::init(false)); static cl::opt<bool> AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", cl::Hidden, cl::init(true)); -/// Should we split vectors of pointers into their individual elements? This -/// is known to be buggy, but the alternate implementation isn't yet ready. -/// This is purely to provide a debugging and dianostic hook until the vector -/// split is replaced with vector relocations. -static cl::opt<bool> UseVectorSplit("rs4gc-split-vector-values", cl::Hidden, - cl::init(true)); - namespace { struct RewriteStatepointsForGC : public ModulePass { static char ID; // Pass identification, replacement for typeid @@ -141,24 +131,25 @@ ModulePass *llvm::createRewriteStatepointsForGCPass() { INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", "Make relocations explicit at statepoints", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", "Make relocations explicit at statepoints", false, false) namespace { struct GCPtrLivenessData { /// Values defined in this block. - DenseMap<BasicBlock *, DenseSet<Value *>> KillSet; + MapVector<BasicBlock *, SetVector<Value *>> KillSet; /// Values used in this block (and thus live); does not included values /// killed within this block. - DenseMap<BasicBlock *, DenseSet<Value *>> LiveSet; + MapVector<BasicBlock *, SetVector<Value *>> LiveSet; /// Values live into this basic block (i.e. used by any /// instruction in this basic block or ones reachable from here) - DenseMap<BasicBlock *, DenseSet<Value *>> LiveIn; + MapVector<BasicBlock *, SetVector<Value *>> LiveIn; /// Values live out of this basic block (i.e. live into /// any successor block) - DenseMap<BasicBlock *, DenseSet<Value *>> LiveOut; + MapVector<BasicBlock *, SetVector<Value *>> LiveOut; }; // The type of the internal cache used inside the findBasePointers family @@ -171,9 +162,9 @@ struct GCPtrLivenessData { // Generally, after the execution of a full findBasePointer call, only the // base relation will remain. 
Internally, we add a mixture of the two // types, then update all the second type to the first type -typedef DenseMap<Value *, Value *> DefiningValueMapTy; -typedef DenseSet<Value *> StatepointLiveSetTy; -typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>> +typedef MapVector<Value *, Value *> DefiningValueMapTy; +typedef SetVector<Value *> StatepointLiveSetTy; +typedef MapVector<AssertingVH<Instruction>, AssertingVH<Value>> RematerializedValueMapTy; struct PartiallyConstructedSafepointRecord { @@ -181,7 +172,7 @@ struct PartiallyConstructedSafepointRecord { StatepointLiveSetTy LiveSet; /// Mapping from live pointers to a base-defining-value - DenseMap<Value *, Value *> PointerToBase; + MapVector<Value *, Value *> PointerToBase; /// The *new* gc.statepoint instruction itself. This produces the token /// that normal path gc.relocates and the gc.result are tied to. @@ -199,9 +190,8 @@ struct PartiallyConstructedSafepointRecord { } static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) { - assert(UseDeoptBundles && "Should not be called otherwise!"); - - Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt"); + Optional<OperandBundleUse> DeoptBundle = + CS.getOperandBundle(LLVMContext::OB_deopt); if (!DeoptBundle.hasValue()) { assert(AllowStatepointWithNoDeoptInfo && @@ -229,7 +219,7 @@ static bool isGCPointerType(Type *T) { // For the sake of this example GC, we arbitrarily pick addrspace(1) as our // GC managed heap. We know that a pointer into this heap needs to be // updated and that no other pointer does. - return (1 == PT->getAddressSpace()); + return PT->getAddressSpace() == 1; return false; } @@ -260,8 +250,7 @@ static bool containsGCPtrType(Type *Ty) { if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) return containsGCPtrType(AT->getElementType()); if (StructType *ST = dyn_cast<StructType>(Ty)) - return std::any_of(ST->subtypes().begin(), ST->subtypes().end(), - containsGCPtrType); + return any_of(ST->subtypes(), containsGCPtrType); return false; } @@ -273,19 +262,6 @@ static bool isUnhandledGCPointerType(Type *Ty) { } #endif -static bool order_by_name(Value *a, Value *b) { - if (a->hasName() && b->hasName()) { - return -1 == a->getName().compare(b->getName()); - } else if (a->hasName() && !b->hasName()) { - return true; - } else if (!a->hasName() && b->hasName()) { - return false; - } else { - // Better than nothing, but not stable - return a < b; - } -} - // Return the name of the value suffixed with the provided value, or if the // value didn't have a name, the default value specified. static std::string suffixed_name_or(Value *V, StringRef Suffix, @@ -297,30 +273,25 @@ static std::string suffixed_name_or(Value *V, StringRef Suffix, // given instruction. The analysis is performed immediately before the // given instruction. Values defined by that instruction are not considered // live. Values used by that instruction are considered live. 
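A note on the container changes above: replacing DenseMap/DenseSet with MapVector/SetVector, and dropping the order_by_name sorting helper, is about deterministic iteration order, so printed live sets and the names of newly inserted values stay stable across runs without re-sorting. The sketch below is a rough standalone illustration of the idea -- a minimal insertion-ordered map in the spirit of llvm::MapVector, which pairs a vector of entries with a key-to-index map; it is not the LLVM ADT itself.

// Illustrative only: a tiny insertion-ordered map. Iteration visits keys in
// first-insertion order, so diagnostic output is deterministic, unlike
// iteration over a hash-based map.
#include <cstddef>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

template <typename K, typename V> class OrderedMap {
  std::vector<std::pair<K, V>> Storage;      // entries in insertion order
  std::unordered_map<K, std::size_t> Index;  // key -> position in Storage

public:
  V &operator[](const K &Key) {
    auto It = Index.find(Key);
    if (It != Index.end())
      return Storage[It->second].second;
    Index.emplace(Key, Storage.size());
    Storage.emplace_back(Key, V());
    return Storage.back().second;
  }
  typename std::vector<std::pair<K, V>>::iterator begin() {
    return Storage.begin();
  }
  typename std::vector<std::pair<K, V>>::iterator end() {
    return Storage.end();
  }
};

int main() {
  OrderedMap<std::string, std::string> PointerToBase;
  PointerToBase["derived2"] = "base2";
  PointerToBase["derived1"] = "base1";
  PointerToBase["derived3"] = "base1";

  // Always prints derived2, derived1, derived3 -- the insertion order --
  // which is why the printing code no longer needs an explicit sort.
  for (auto &Pair : PointerToBase)
    std::printf("  derived %s base %s\n", Pair.first.c_str(),
                Pair.second.c_str());
  return 0;
}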
-static void analyzeParsePointLiveness( - DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData, - const CallSite &CS, PartiallyConstructedSafepointRecord &result) { - Instruction *inst = CS.getInstruction(); +static void +analyzeParsePointLiveness(DominatorTree &DT, + GCPtrLivenessData &OriginalLivenessData, CallSite CS, + PartiallyConstructedSafepointRecord &Result) { + Instruction *Inst = CS.getInstruction(); StatepointLiveSetTy LiveSet; - findLiveSetAtInst(inst, OriginalLivenessData, LiveSet); + findLiveSetAtInst(Inst, OriginalLivenessData, LiveSet); if (PrintLiveSet) { - // Note: This output is used by several of the test cases - // The order of elements in a set is not stable, put them in a vec and sort - // by name - SmallVector<Value *, 64> Temp; - Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end()); - std::sort(Temp.begin(), Temp.end(), order_by_name); - errs() << "Live Variables:\n"; - for (Value *V : Temp) + dbgs() << "Live Variables:\n"; + for (Value *V : LiveSet) dbgs() << " " << V->getName() << " " << *V << "\n"; } if (PrintLiveSetSize) { - errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n"; - errs() << "Number live values: " << LiveSet.size() << "\n"; + dbgs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n"; + dbgs() << "Number live values: " << LiveSet.size() << "\n"; } - result.LiveSet = LiveSet; + Result.LiveSet = LiveSet; } static bool isKnownBaseResult(Value *V); @@ -372,8 +343,10 @@ findBaseDefiningValueOfVector(Value *I) { return BaseDefiningValueResult(I, true); if (isa<Constant>(I)) - // Constant vectors consist only of constant pointers. - return BaseDefiningValueResult(I, true); + // Base of constant vector consists only of constant null pointers. + // For reasoning see similar case inside 'findBaseDefiningValue' function. + return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()), + true); if (isa<LoadInst>(I)) return BaseDefiningValueResult(I, true); @@ -415,14 +388,20 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { // We should have never reached here if this argument isn't an gc value return BaseDefiningValueResult(I, true); - if (isa<Constant>(I)) + if (isa<Constant>(I)) { // We assume that objects with a constant base (e.g. a global) can't move // and don't need to be reported to the collector because they are always - // live. All constants have constant bases. Besides global references, all - // kinds of constants (e.g. undef, constant expressions, null pointers) can - // be introduced by the inliner or the optimizer, especially on dynamically - // dead paths. See e.g. test4 in constants.ll. - return BaseDefiningValueResult(I, true); + // live. Besides global references, all kinds of constants (e.g. undef, + // constant expressions, null pointers) can be introduced by the inliner or + // the optimizer, especially on dynamically dead paths. + // Here we treat all of them as having single null base. By doing this we + // trying to avoid problems reporting various conflicts in a form of + // "phi (const1, const2)" or "phi (const, regular gc ptr)". + // See constant.ll file for relevant test cases. 
+ + return BaseDefiningValueResult( + ConstantPointerNull::get(cast<PointerType>(I->getType())), true); + } if (CastInst *CI = dyn_cast<CastInst>(I)) { Value *Def = CI->stripPointerCasts(); @@ -570,30 +549,36 @@ class BDVState { public: enum Status { Unknown, Base, Conflict }; - BDVState(Status s, Value *b = nullptr) : status(s), base(b) { - assert(status != Base || b); + BDVState() : Status(Unknown), BaseValue(nullptr) {} + + explicit BDVState(Status Status, Value *BaseValue = nullptr) + : Status(Status), BaseValue(BaseValue) { + assert(Status != Base || BaseValue); } - explicit BDVState(Value *b) : status(Base), base(b) {} - BDVState() : status(Unknown), base(nullptr) {} - Status getStatus() const { return status; } - Value *getBase() const { return base; } + explicit BDVState(Value *BaseValue) : Status(Base), BaseValue(BaseValue) {} + + Status getStatus() const { return Status; } + Value *getBaseValue() const { return BaseValue; } bool isBase() const { return getStatus() == Base; } bool isUnknown() const { return getStatus() == Unknown; } bool isConflict() const { return getStatus() == Conflict; } - bool operator==(const BDVState &other) const { - return base == other.base && status == other.status; + bool operator==(const BDVState &Other) const { + return BaseValue == Other.BaseValue && Status == Other.Status; } bool operator!=(const BDVState &other) const { return !(*this == other); } LLVM_DUMP_METHOD - void dump() const { print(dbgs()); dbgs() << '\n'; } - + void dump() const { + print(dbgs()); + dbgs() << '\n'; + } + void print(raw_ostream &OS) const { - switch (status) { + switch (getStatus()) { case Unknown: OS << "U"; break; @@ -604,13 +589,13 @@ public: OS << "C"; break; }; - OS << " (" << base << " - " - << (base ? base->getName() : "nullptr") << "): "; + OS << " (" << getBaseValue() << " - " + << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): "; } private: - Status status; - AssertingVH<Value> base; // non null only if status == base + Status Status; + AssertingVH<Value> BaseValue; // Non-null only if Status == Base. }; } @@ -621,75 +606,50 @@ static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { } #endif -namespace { -// Values of type BDVState form a lattice, and this is a helper -// class that implementes the meet operation. The meat of the meet -// operation is implemented in MeetBDVStates::pureMeet -class MeetBDVStates { -public: - /// Initializes the currentResult to the TOP state so that if can be met with - /// any other state to produce that state. - MeetBDVStates() {} - - // Destructively meet the current result with the given BDVState - void meetWith(BDVState otherState) { - currentResult = meet(otherState, currentResult); - } +static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) { + switch (LHS.getStatus()) { + case BDVState::Unknown: + return RHS; - BDVState getResult() const { return currentResult; } + case BDVState::Base: + assert(LHS.getBaseValue() && "can't be null"); + if (RHS.isUnknown()) + return LHS; -private: - BDVState currentResult; - - /// Perform a meet operation on two elements of the BDVState lattice. 
- static BDVState meet(BDVState LHS, BDVState RHS) { - assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) && - "math is wrong: meet does not commute!"); - BDVState Result = pureMeet(LHS, RHS); - DEBUG(dbgs() << "meet of " << LHS << " with " << RHS - << " produced " << Result << "\n"); - return Result; - } - - static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) { - switch (stateA.getStatus()) { - case BDVState::Unknown: - return stateB; - - case BDVState::Base: - assert(stateA.getBase() && "can't be null"); - if (stateB.isUnknown()) - return stateA; - - if (stateB.isBase()) { - if (stateA.getBase() == stateB.getBase()) { - assert(stateA == stateB && "equality broken!"); - return stateA; - } - return BDVState(BDVState::Conflict); + if (RHS.isBase()) { + if (LHS.getBaseValue() == RHS.getBaseValue()) { + assert(LHS == RHS && "equality broken!"); + return LHS; } - assert(stateB.isConflict() && "only three states!"); return BDVState(BDVState::Conflict); - - case BDVState::Conflict: - return stateA; } - llvm_unreachable("only three states!"); + assert(RHS.isConflict() && "only three states!"); + return BDVState(BDVState::Conflict); + + case BDVState::Conflict: + return LHS; } -}; + llvm_unreachable("only three states!"); } +// Values of type BDVState form a lattice, and this function implements the meet +// operation. +static BDVState meetBDVState(BDVState LHS, BDVState RHS) { + BDVState Result = meetBDVStateImpl(LHS, RHS); + assert(Result == meetBDVStateImpl(RHS, LHS) && + "Math is wrong: meet does not commute!"); + return Result; +} -/// For a given value or instruction, figure out what base ptr it's derived -/// from. For gc objects, this is simply itself. On success, returns a value -/// which is the base pointer. (This is reliable and can be used for -/// relocation.) On failure, returns nullptr. -static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { - Value *def = findBaseOrBDV(I, cache); +/// For a given value or instruction, figure out what base ptr its derived from. +/// For gc objects, this is simply itself. On success, returns a value which is +/// the base pointer. (This is reliable and can be used for relocation.) On +/// failure, returns nullptr. 
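The meetBDVStateImpl/meetBDVState pair above keeps the same three-state lattice the removed MeetBDVStates helper implemented: Unknown is the identity of the meet, Conflict absorbs everything, and two different base values meet to Conflict. The standalone sketch below mirrors that behaviour (plain C++, with std::string standing in for the base Value*; illustrative only, not the LLVM code), including the commutativity assertion the patch keeps.

// Illustrative only: the three-state base-defining-value lattice used by
// findBasePointer, with std::string standing in for the base Value*.
#include <cassert>
#include <string>

struct BDVState {
  enum Status { Unknown, Base, Conflict };
  Status S;
  std::string BaseValue; // meaningful only when S == Base

  bool operator==(const BDVState &O) const {
    return S == O.S && BaseValue == O.BaseValue;
  }
};

static BDVState meetImpl(const BDVState &LHS, const BDVState &RHS) {
  switch (LHS.S) {
  case BDVState::Unknown:
    return RHS; // Unknown is the identity of the meet
  case BDVState::Base:
    if (RHS.S == BDVState::Unknown)
      return LHS;
    if (RHS.S == BDVState::Base && LHS.BaseValue == RHS.BaseValue)
      return LHS; // agreeing bases stay a base
    return BDVState{BDVState::Conflict, ""};
  case BDVState::Conflict:
    return LHS; // Conflict absorbs everything
  }
  return LHS; // not reached; keeps compilers quiet
}

static BDVState meet(const BDVState &LHS, const BDVState &RHS) {
  BDVState Result = meetImpl(LHS, RHS);
  // The same sanity check the patch asserts: the meet must commute.
  assert(Result == meetImpl(RHS, LHS) && "meet does not commute");
  return Result;
}

int main() {
  BDVState U{BDVState::Unknown, ""};
  BDVState A{BDVState::Base, "a"};
  BDVState B{BDVState::Base, "b"};

  assert(meet(U, A) == A);                    // identity
  assert(meet(A, A) == A);                    // equal bases survive
  assert(meet(A, B).S == BDVState::Conflict); // different bases conflict
  assert(meet(meet(A, B), U).S == BDVState::Conflict);
  return 0;
}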
+static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { + Value *Def = findBaseOrBDV(I, Cache); - if (isKnownBaseResult(def)) { - return def; - } + if (isKnownBaseResult(Def)) + return Def; // Here's the rough algorithm: // - For every SSA value, construct a mapping to either an actual base @@ -731,14 +691,14 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // one for which we don't already know a definite base value for /* scope */ { SmallVector<Value*, 16> Worklist; - Worklist.push_back(def); - States.insert(std::make_pair(def, BDVState())); + Worklist.push_back(Def); + States.insert({Def, BDVState()}); while (!Worklist.empty()) { Value *Current = Worklist.pop_back_val(); assert(!isKnownBaseResult(Current) && "why did it get added?"); auto visitIncomingValue = [&](Value *InVal) { - Value *Base = findBaseOrBDV(InVal, cache); + Value *Base = findBaseOrBDV(InVal, Cache); if (isKnownBaseResult(Base)) // Known bases won't need new instructions introduced and can be // ignored safely @@ -748,12 +708,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { if (States.insert(std::make_pair(Base, BDVState())).second) Worklist.push_back(Base); }; - if (PHINode *Phi = dyn_cast<PHINode>(Current)) { - for (Value *InVal : Phi->incoming_values()) + if (PHINode *PN = dyn_cast<PHINode>(Current)) { + for (Value *InVal : PN->incoming_values()) visitIncomingValue(InVal); - } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) { - visitIncomingValue(Sel->getTrueValue()); - visitIncomingValue(Sel->getFalseValue()); + } else if (SelectInst *SI = dyn_cast<SelectInst>(Current)) { + visitIncomingValue(SI->getTrueValue()); + visitIncomingValue(SI->getFalseValue()); } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) { visitIncomingValue(EE->getVectorOperand()); } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) { @@ -762,7 +722,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { } else { // There is one known class of instructions we know we don't handle. assert(isa<ShuffleVectorInst>(Current)); - llvm_unreachable("unimplemented instruction case"); + llvm_unreachable("Unimplemented instruction case"); } } } @@ -784,12 +744,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { return I->second; }; - bool progress = true; - while (progress) { + bool Progress = true; + while (Progress) { #ifndef NDEBUG - const size_t oldSize = States.size(); + const size_t OldSize = States.size(); #endif - progress = false; + Progress = false; // We're only changing values in this loop, thus safe to keep iterators. // Since this is computing a fixed point, the order of visit does not // effect the result. TODO: We could use a worklist here and make this run @@ -801,38 +761,39 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // Given an input value for the current instruction, return a BDVState // instance which represents the BDV of that value. 
auto getStateForInput = [&](Value *V) mutable { - Value *BDV = findBaseOrBDV(V, cache); + Value *BDV = findBaseOrBDV(V, Cache); return getStateForBDV(BDV); }; - MeetBDVStates calculateMeet; - if (SelectInst *select = dyn_cast<SelectInst>(BDV)) { - calculateMeet.meetWith(getStateForInput(select->getTrueValue())); - calculateMeet.meetWith(getStateForInput(select->getFalseValue())); - } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) { - for (Value *Val : Phi->incoming_values()) - calculateMeet.meetWith(getStateForInput(Val)); + BDVState NewState; + if (SelectInst *SI = dyn_cast<SelectInst>(BDV)) { + NewState = meetBDVState(NewState, getStateForInput(SI->getTrueValue())); + NewState = + meetBDVState(NewState, getStateForInput(SI->getFalseValue())); + } else if (PHINode *PN = dyn_cast<PHINode>(BDV)) { + for (Value *Val : PN->incoming_values()) + NewState = meetBDVState(NewState, getStateForInput(Val)); } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) { // The 'meet' for an extractelement is slightly trivial, but it's still // useful in that it drives us to conflict if our input is. - calculateMeet.meetWith(getStateForInput(EE->getVectorOperand())); + NewState = + meetBDVState(NewState, getStateForInput(EE->getVectorOperand())); } else { // Given there's a inherent type mismatch between the operands, will // *always* produce Conflict. auto *IE = cast<InsertElementInst>(BDV); - calculateMeet.meetWith(getStateForInput(IE->getOperand(0))); - calculateMeet.meetWith(getStateForInput(IE->getOperand(1))); + NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0))); + NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1))); } - BDVState oldState = States[BDV]; - BDVState newState = calculateMeet.getResult(); - if (oldState != newState) { - progress = true; - States[BDV] = newState; + BDVState OldState = States[BDV]; + if (OldState != NewState) { + Progress = true; + States[BDV] = NewState; } } - assert(oldSize == States.size() && + assert(OldSize == States.size() && "fixed point shouldn't be adding any new nodes to state"); } @@ -842,7 +803,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } #endif - + // Insert Phis for all conflicts // TODO: adjust naming patterns to avoid this order of iteration dependency for (auto Pair : States) { @@ -856,14 +817,13 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // The problem is that we need to convert from a vector base to a scalar // base for the particular indice we're interested in. if (State.isBase() && isa<ExtractElementInst>(I) && - isa<VectorType>(State.getBase()->getType())) { + isa<VectorType>(State.getBaseValue()->getType())) { auto *EE = cast<ExtractElementInst>(I); // TODO: In many cases, the new instruction is just EE itself. We should // exploit this, but can't do it here since it would break the invariant // about the BDV not being known to be a base. - auto *BaseInst = ExtractElementInst::Create(State.getBase(), - EE->getIndexOperand(), - "base_ee", EE); + auto *BaseInst = ExtractElementInst::Create( + State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE); BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); States[I] = BDVState(BDVState::Base, BaseInst); } @@ -871,10 +831,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // Since we're joining a vector and scalar base, they can never be the // same. 
As a result, we should always see insert element having reached // the conflict state. - if (isa<InsertElementInst>(I)) { - assert(State.isConflict()); - } - + assert(!isa<InsertElementInst>(I) || State.isConflict()); + if (!State.isConflict()) continue; @@ -887,12 +845,11 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { assert(NumPreds > 0 && "how did we reach here"); std::string Name = suffixed_name_or(I, ".base", "base_phi"); return PHINode::Create(I->getType(), NumPreds, Name, I); - } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) { + } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { // The undef will be replaced later - UndefValue *Undef = UndefValue::get(Sel->getType()); + UndefValue *Undef = UndefValue::get(SI->getType()); std::string Name = suffixed_name_or(I, ".base", "base_select"); - return SelectInst::Create(Sel->getCondition(), Undef, - Undef, Name, Sel); + return SelectInst::Create(SI->getCondition(), Undef, Undef, Name, SI); } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType()); std::string Name = suffixed_name_or(I, ".base", "base_ee"); @@ -906,7 +863,6 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { return InsertElementInst::Create(VecUndef, ScalarUndef, IE->getOperand(2), Name, IE); } - }; Instruction *BaseInst = MakeBaseInstPlaceholder(I); // Add metadata marking this as a base value @@ -921,24 +877,21 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // instruction to propagate the base of it's BDV and have entered that newly // introduced instruction into the state table. In either case, we are // assured to be able to determine an instruction which produces it's base - // pointer. + // pointer. auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) { - Value *BDV = findBaseOrBDV(Input, cache); + Value *BDV = findBaseOrBDV(Input, Cache); Value *Base = nullptr; if (isKnownBaseResult(BDV)) { Base = BDV; } else { // Either conflict or base. assert(States.count(BDV)); - Base = States[BDV].getBase(); + Base = States[BDV].getBaseValue(); } - assert(Base && "can't be null"); + assert(Base && "Can't be null"); // The cast is needed since base traversal may strip away bitcasts - if (Base->getType() != Input->getType() && - InsertPt) { - Base = new BitCastInst(Base, Input->getType(), "cast", - InsertPt); - } + if (Base->getType() != Input->getType() && InsertPt) + Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt); return Base; }; @@ -954,12 +907,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { if (!State.isConflict()) continue; - if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) { - PHINode *phi = cast<PHINode>(BDV); - unsigned NumPHIValues = phi->getNumIncomingValues(); + if (PHINode *BasePHI = dyn_cast<PHINode>(State.getBaseValue())) { + PHINode *PN = cast<PHINode>(BDV); + unsigned NumPHIValues = PN->getNumIncomingValues(); for (unsigned i = 0; i < NumPHIValues; i++) { - Value *InVal = phi->getIncomingValue(i); - BasicBlock *InBB = phi->getIncomingBlock(i); + Value *InVal = PN->getIncomingValue(i); + BasicBlock *InBB = PN->getIncomingBlock(i); // If we've already seen InBB, add the same incoming value // we added for it earlier. The IR verifier requires phi @@ -970,22 +923,21 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // bitcasts (and hence two distinct values) as incoming // values for the same basic block. 
- int blockIndex = basephi->getBasicBlockIndex(InBB); - if (blockIndex != -1) { - Value *oldBase = basephi->getIncomingValue(blockIndex); - basephi->addIncoming(oldBase, InBB); - + int BlockIndex = BasePHI->getBasicBlockIndex(InBB); + if (BlockIndex != -1) { + Value *OldBase = BasePHI->getIncomingValue(BlockIndex); + BasePHI->addIncoming(OldBase, InBB); + #ifndef NDEBUG Value *Base = getBaseForInput(InVal, nullptr); - // In essence this assert states: the only way two - // values incoming from the same basic block may be - // different is by being different bitcasts of the same - // value. A cleanup that remains TODO is changing - // findBaseOrBDV to return an llvm::Value of the correct - // type (and still remain pure). This will remove the - // need to add bitcasts. - assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() && - "sanity -- findBaseOrBDV should be pure!"); + // In essence this assert states: the only way two values + // incoming from the same basic block may be different is by + // being different bitcasts of the same value. A cleanup + // that remains TODO is changing findBaseOrBDV to return an + // llvm::Value of the correct type (and still remain pure). + // This will remove the need to add bitcasts. + assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() && + "Sanity -- findBaseOrBDV should be pure!"); #endif continue; } @@ -994,28 +946,25 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // need to insert a bitcast in the incoming block. // TODO: Need to split critical edges if insertion is needed Value *Base = getBaseForInput(InVal, InBB->getTerminator()); - basephi->addIncoming(Base, InBB); + BasePHI->addIncoming(Base, InBB); } - assert(basephi->getNumIncomingValues() == NumPHIValues); - } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) { - SelectInst *Sel = cast<SelectInst>(BDV); - // Operand 1 & 2 are true, false path respectively. TODO: refactor to - // something more safe and less hacky. - for (int i = 1; i <= 2; i++) { - Value *InVal = Sel->getOperand(i); - // Find the instruction which produces the base for each input. We may - // need to insert a bitcast. - Value *Base = getBaseForInput(InVal, BaseSel); - BaseSel->setOperand(i, Base); - } - } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) { + assert(BasePHI->getNumIncomingValues() == NumPHIValues); + } else if (SelectInst *BaseSI = + dyn_cast<SelectInst>(State.getBaseValue())) { + SelectInst *SI = cast<SelectInst>(BDV); + + // Find the instruction which produces the base for each input. + // We may need to insert a bitcast. + BaseSI->setTrueValue(getBaseForInput(SI->getTrueValue(), BaseSI)); + BaseSI->setFalseValue(getBaseForInput(SI->getFalseValue(), BaseSI)); + } else if (auto *BaseEE = + dyn_cast<ExtractElementInst>(State.getBaseValue())) { Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand(); // Find the instruction which produces the base for each input. We may // need to insert a bitcast. 
- Value *Base = getBaseForInput(InVal, BaseEE); - BaseEE->setOperand(0, Base); + BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE)); } else { - auto *BaseIE = cast<InsertElementInst>(State.getBase()); + auto *BaseIE = cast<InsertElementInst>(State.getBaseValue()); auto *BdvIE = cast<InsertElementInst>(BDV); auto UpdateOperand = [&](int OperandIdx) { Value *InVal = BdvIE->getOperand(OperandIdx); @@ -1025,69 +974,6 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { UpdateOperand(0); // vector operand UpdateOperand(1); // scalar operand } - - } - - // Now that we're done with the algorithm, see if we can optimize the - // results slightly by reducing the number of new instructions needed. - // Arguably, this should be integrated into the algorithm above, but - // doing as a post process step is easier to reason about for the moment. - DenseMap<Value *, Value *> ReverseMap; - SmallPtrSet<Instruction *, 16> NewInsts; - SmallSetVector<AssertingVH<Instruction>, 16> Worklist; - // Note: We need to visit the states in a deterministic order. We uses the - // Keys we sorted above for this purpose. Note that we are papering over a - // bigger problem with the algorithm above - it's visit order is not - // deterministic. A larger change is needed to fix this. - for (auto Pair : States) { - auto *BDV = Pair.first; - auto State = Pair.second; - Value *Base = State.getBase(); - assert(BDV && Base); - assert(!isKnownBaseResult(BDV) && "why did it get added?"); - assert(isKnownBaseResult(Base) && - "must be something we 'know' is a base pointer"); - if (!State.isConflict()) - continue; - - ReverseMap[Base] = BDV; - if (auto *BaseI = dyn_cast<Instruction>(Base)) { - NewInsts.insert(BaseI); - Worklist.insert(BaseI); - } - } - auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI, - Value *Replacement) { - // Add users which are new instructions (excluding self references) - for (User *U : BaseI->users()) - if (auto *UI = dyn_cast<Instruction>(U)) - if (NewInsts.count(UI) && UI != BaseI) - Worklist.insert(UI); - // Then do the actual replacement - NewInsts.erase(BaseI); - ReverseMap.erase(BaseI); - BaseI->replaceAllUsesWith(Replacement); - assert(States.count(BDV)); - assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI); - States[BDV] = BDVState(BDVState::Conflict, Replacement); - BaseI->eraseFromParent(); - }; - const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout(); - while (!Worklist.empty()) { - Instruction *BaseI = Worklist.pop_back_val(); - assert(NewInsts.count(BaseI)); - Value *Bdv = ReverseMap[BaseI]; - if (auto *BdvI = dyn_cast<Instruction>(Bdv)) - if (BaseI->isIdenticalTo(BdvI)) { - DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n"); - ReplaceBaseInstWith(Bdv, BaseI, Bdv); - continue; - } - if (Value *V = SimplifyInstruction(BaseI, DL)) { - DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n"); - ReplaceBaseInstWith(Bdv, BaseI, V); - continue; - } } // Cache all of our results so we can cheaply reuse them @@ -1095,25 +981,27 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // relation and one of the base pointer relation! FIXME for (auto Pair : States) { auto *BDV = Pair.first; - Value *base = Pair.second.getBase(); - assert(BDV && base); + Value *Base = Pair.second.getBaseValue(); + assert(BDV && Base); + assert(!isKnownBaseResult(BDV) && "why did it get added?"); - std::string fromstr = cache.count(BDV) ? 
cache[BDV]->getName() : "none"; DEBUG(dbgs() << "Updating base value cache" - << " for: " << BDV->getName() - << " from: " << fromstr - << " to: " << base->getName() << "\n"); - - if (cache.count(BDV)) { - // Once we transition from the BDV relation being store in the cache to + << " for: " << BDV->getName() << " from: " + << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none") + << " to: " << Base->getName() << "\n"); + + if (Cache.count(BDV)) { + assert(isKnownBaseResult(Base) && + "must be something we 'know' is a base pointer"); + // Once we transition from the BDV relation being store in the Cache to // the base relation being stored, it must be stable - assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) && + assert((!isKnownBaseResult(Cache[BDV]) || Cache[BDV] == Base) && "base relation should be stable"); } - cache[BDV] = base; + Cache[BDV] = Base; } - assert(cache.count(def)); - return cache[def]; + assert(Cache.count(Def)); + return Cache[Def]; } // For a set of live pointers (base and/or derived), identify the base @@ -1133,15 +1021,9 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // pointer was a base pointer. static void findBasePointers(const StatepointLiveSetTy &live, - DenseMap<Value *, Value *> &PointerToBase, + MapVector<Value *, Value *> &PointerToBase, DominatorTree *DT, DefiningValueMapTy &DVCache) { - // For the naming of values inserted to be deterministic - which makes for - // much cleaner and more stable tests - we need to assign an order to the - // live values. DenseSets do not provide a deterministic order across runs. - SmallVector<Value *, 64> Temp; - Temp.insert(Temp.end(), live.begin(), live.end()); - std::sort(Temp.begin(), Temp.end(), order_by_name); - for (Value *ptr : Temp) { + for (Value *ptr : live) { Value *base = findBasePointer(ptr, DVCache); assert(base && "failed to find base pointer"); PointerToBase[ptr] = base; @@ -1149,41 +1031,24 @@ findBasePointers(const StatepointLiveSetTy &live, DT->dominates(cast<Instruction>(base)->getParent(), cast<Instruction>(ptr)->getParent())) && "The base we found better dominate the derived pointer"); - - // If you see this trip and like to live really dangerously, the code should - // be correct, just with idioms the verifier can't handle. You can try - // disabling the verifier at your own substantial risk. - assert(!isa<ConstantPointerNull>(base) && - "the relocation code needs adjustment to handle the relocation of " - "a null pointer constant without causing false positives in the " - "safepoint ir verifier."); } } /// Find the required based pointers (and adjust the live set) for the given /// parse point. static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, - const CallSite &CS, + CallSite CS, PartiallyConstructedSafepointRecord &result) { - DenseMap<Value *, Value *> PointerToBase; + MapVector<Value *, Value *> PointerToBase; findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache); if (PrintBasePointers) { - // Note: Need to print these in a stable order since this is checked in - // some tests. 
errs() << "Base Pairs (w/o Relocation):\n"; - SmallVector<Value *, 64> Temp; - Temp.reserve(PointerToBase.size()); - for (auto Pair : PointerToBase) { - Temp.push_back(Pair.first); - } - std::sort(Temp.begin(), Temp.end(), order_by_name); - for (Value *Ptr : Temp) { - Value *Base = PointerToBase[Ptr]; + for (auto &Pair : PointerToBase) { errs() << " derived "; - Ptr->printAsOperand(errs(), false); + Pair.first->printAsOperand(errs(), false); errs() << " base "; - Base->printAsOperand(errs(), false); + Pair.second->printAsOperand(errs(), false); errs() << "\n";; } } @@ -1194,7 +1059,7 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, /// Given an updated version of the dataflow liveness results, update the /// liveset and base pointer maps for the call site CS. static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, - const CallSite &CS, + CallSite CS, PartiallyConstructedSafepointRecord &result); static void recomputeLiveInValues( @@ -1206,8 +1071,7 @@ static void recomputeLiveInValues( computeLiveInValues(DT, F, RevisedLivenessData); for (size_t i = 0; i < records.size(); i++) { struct PartiallyConstructedSafepointRecord &info = records[i]; - const CallSite &CS = toUpdate[i]; - recomputeLiveInValues(RevisedLivenessData, CS, info); + recomputeLiveInValues(RevisedLivenessData, toUpdate[i], info); } } @@ -1257,8 +1121,7 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) { // These attributes control the generation of the gc.statepoint call / // invoke itself; and once the gc.statepoint is in place, they're of no // use. - if (Attr.hasAttribute("statepoint-num-patch-bytes") || - Attr.hasAttribute("statepoint-id")) + if (isStatepointDirectiveAttr(Attr)) continue; Ret = Ret.addAttributes( @@ -1349,11 +1212,37 @@ namespace { class DeferredReplacement { AssertingVH<Instruction> Old; AssertingVH<Instruction> New; + bool IsDeoptimize = false; + + DeferredReplacement() {} public: - explicit DeferredReplacement(Instruction *Old, Instruction *New) : - Old(Old), New(New) { - assert(Old != New && "Not allowed!"); + static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) { + assert(Old != New && Old && New && + "Cannot RAUW equal values or to / from null!"); + + DeferredReplacement D; + D.Old = Old; + D.New = New; + return D; + } + + static DeferredReplacement createDelete(Instruction *ToErase) { + DeferredReplacement D; + D.Old = ToErase; + return D; + } + + static DeferredReplacement createDeoptimizeReplacement(Instruction *Old) { +#ifndef NDEBUG + auto *F = cast<CallInst>(Old)->getCalledFunction(); + assert(F && F->getIntrinsicID() == Intrinsic::experimental_deoptimize && + "Only way to construct a deoptimize deferred replacement"); +#endif + DeferredReplacement D; + D.Old = Old; + D.IsDeoptimize = true; + return D; } /// Does the task represented by this instance. @@ -1362,12 +1251,23 @@ public: Instruction *NewI = New; assert(OldI != NewI && "Disallowed at construction?!"); + assert((!IsDeoptimize || !New) && + "Deoptimize instrinsics are not replaced!"); Old = nullptr; New = nullptr; if (NewI) OldI->replaceAllUsesWith(NewI); + + if (IsDeoptimize) { + // Note: we've inserted instructions, so the call to llvm.deoptimize may + // not necessarilly be followed by the matching return. 
+ auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator()); + new UnreachableInst(RI->getContext(), RI); + RI->eraseFromParent(); + } + OldI->eraseFromParent(); } }; @@ -1380,8 +1280,6 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ PartiallyConstructedSafepointRecord &Result, std::vector<DeferredReplacement> &Replacements) { assert(BasePtrs.size() == LiveVariables.size()); - assert((UseDeoptBundles || isStatepoint(CS)) && - "This method expects to be rewriting a statepoint"); // Then go ahead and use the builder do actually do the inserts. We insert // immediately before the previous instruction under the assumption that all @@ -1391,47 +1289,53 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ IRBuilder<> Builder(InsertBefore); ArrayRef<Value *> GCArgs(LiveVariables); - uint64_t StatepointID = 0xABCDEF00; + uint64_t StatepointID = StatepointDirectives::DefaultStatepointID; uint32_t NumPatchBytes = 0; uint32_t Flags = uint32_t(StatepointFlags::None); - ArrayRef<Use> CallArgs; - ArrayRef<Use> DeoptArgs; + ArrayRef<Use> CallArgs(CS.arg_begin(), CS.arg_end()); + ArrayRef<Use> DeoptArgs = GetDeoptBundleOperands(CS); ArrayRef<Use> TransitionArgs; - - Value *CallTarget = nullptr; - - if (UseDeoptBundles) { - CallArgs = {CS.arg_begin(), CS.arg_end()}; - DeoptArgs = GetDeoptBundleOperands(CS); - // TODO: we don't fill in TransitionArgs or Flags in this branch, but we - // could have an operand bundle for that too. - AttributeSet OriginalAttrs = CS.getAttributes(); - - Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, - "statepoint-id"); - if (AttrID.isStringAttribute()) - AttrID.getValueAsString().getAsInteger(10, StatepointID); - - Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute( - AttributeSet::FunctionIndex, "statepoint-num-patch-bytes"); - if (AttrNumPatchBytes.isStringAttribute()) - AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes); - - CallTarget = CS.getCalledValue(); - } else { - // This branch will be gone soon, and we will soon only support the - // UseDeoptBundles == true configuration. - Statepoint OldSP(CS); - StatepointID = OldSP.getID(); - NumPatchBytes = OldSP.getNumPatchBytes(); - Flags = OldSP.getFlags(); - - CallArgs = {OldSP.arg_begin(), OldSP.arg_end()}; - DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()}; - TransitionArgs = {OldSP.gc_transition_args_begin(), - OldSP.gc_transition_args_end()}; - CallTarget = OldSP.getCalledValue(); + if (auto TransitionBundle = + CS.getOperandBundle(LLVMContext::OB_gc_transition)) { + Flags |= uint32_t(StatepointFlags::GCTransition); + TransitionArgs = TransitionBundle->Inputs; + } + + // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls + // with a return value, we lower then as never returning calls to + // __llvm_deoptimize that are followed by unreachable to get better codegen. + bool IsDeoptimize = false; + + StatepointDirectives SD = + parseStatepointDirectivesFromAttrs(CS.getAttributes()); + if (SD.NumPatchBytes) + NumPatchBytes = *SD.NumPatchBytes; + if (SD.StatepointID) + StatepointID = *SD.StatepointID; + + Value *CallTarget = CS.getCalledValue(); + if (Function *F = dyn_cast<Function>(CallTarget)) { + if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize) { + // Calls to llvm.experimental.deoptimize are lowered to calls to the + // __llvm_deoptimize symbol. We want to resolve this now, since the + // verifier does not allow taking the address of an intrinsic function. 
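The statepoint-id / statepoint-num-patch-bytes handling above now goes through parseStatepointDirectivesFromAttrs, which yields optional values instead of parsing the string attributes inline at each call site. A standalone sketch of that shape in plain C++; AttrMap, parseInt and parseDirectives are invented stand-ins rather than the pass's real helpers, and the 0xABCDEF00 fallback mirrors the default ID visible in the old code.

#include <charconv>
#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <string>

// Hypothetical stand-in for the function attribute list: name -> string value.
using AttrMap = std::map<std::string, std::string>;

struct StatepointDirectives {
  std::optional<uint64_t> StatepointID;
  std::optional<uint32_t> NumPatchBytes;
};

template <typename Int>
static std::optional<Int> parseInt(const AttrMap &Attrs, const std::string &Key) {
  auto It = Attrs.find(Key);
  if (It == Attrs.end())
    return std::nullopt;
  Int Value = 0;
  const std::string &S = It->second;
  auto [Ptr, Ec] = std::from_chars(S.data(), S.data() + S.size(), Value);
  if (Ec != std::errc() || Ptr != S.data() + S.size())
    return std::nullopt; // malformed value: fall back to the default
  return Value;
}

static StatepointDirectives parseDirectives(const AttrMap &Attrs) {
  StatepointDirectives SD;
  SD.StatepointID = parseInt<uint64_t>(Attrs, "statepoint-id");
  SD.NumPatchBytes = parseInt<uint32_t>(Attrs, "statepoint-num-patch-bytes");
  return SD;
}

int main() {
  AttrMap Attrs = {{"statepoint-id", "7"}, {"statepoint-num-patch-bytes", "4"}};
  StatepointDirectives SD = parseDirectives(Attrs);
  uint64_t ID = SD.StatepointID.value_or(0xABCDEF00); // default seen in the old code
  uint32_t Patch = SD.NumPatchBytes.value_or(0);
  std::cout << "id=" << ID << " patch-bytes=" << Patch << "\n";
}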
+ + SmallVector<Type *, 8> DomainTy; + for (Value *Arg : CallArgs) + DomainTy.push_back(Arg->getType()); + auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy, + /* isVarArg = */ false); + + // Note: CallTarget can be a bitcast instruction of a symbol if there are + // calls to @llvm.experimental.deoptimize with different argument types in + // the same module. This is fine -- we assume the frontend knew what it + // was doing when generating this kind of IR. + CallTarget = + F->getParent()->getOrInsertFunction("__llvm_deoptimize", FTy); + + IsDeoptimize = true; + } } // Create the statepoint given all the arguments @@ -1514,7 +1418,13 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ } assert(Token && "Should be set in one of the above branches!"); - if (UseDeoptBundles) { + if (IsDeoptimize) { + // If we're wrapping an @llvm.experimental.deoptimize in a statepoint, we + // transform the tail-call like structure to a call to a void function + // followed by unreachable to get better codegen. + Replacements.push_back( + DeferredReplacement::createDeoptimizeReplacement(CS.getInstruction())); + } else { Token->setName("statepoint_token"); if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { StringRef Name = @@ -1528,24 +1438,12 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ // llvm::Instruction. Instead, we defer the replacement and deletion to // after the live sets have been made explicit in the IR, and we no longer // have raw pointers to worry about. - Replacements.emplace_back(CS.getInstruction(), GCResult); + Replacements.emplace_back( + DeferredReplacement::createRAUW(CS.getInstruction(), GCResult)); } else { - Replacements.emplace_back(CS.getInstruction(), nullptr); + Replacements.emplace_back( + DeferredReplacement::createDelete(CS.getInstruction())); } - } else { - assert(!CS.getInstruction()->hasNUsesOrMore(2) && - "only valid use before rewrite is gc.result"); - assert(!CS.getInstruction()->hasOneUse() || - isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin()))); - - // Take the name of the original statepoint token if there was one. - Token->takeName(CS.getInstruction()); - - // Update the gc.result of the original statepoint (if any) to use the newly - // inserted statepoint. This is safe to do here since the token can't be - // considered a live reference. - CS.getInstruction()->replaceAllUsesWith(Token); - CS.getInstruction()->eraseFromParent(); } Result.StatepointToken = Token; @@ -1555,43 +1453,13 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder); } -namespace { -struct NameOrdering { - Value *Base; - Value *Derived; - - bool operator()(NameOrdering const &a, NameOrdering const &b) { - return -1 == a.Derived->getName().compare(b.Derived->getName()); - } -}; -} - -static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec, - SmallVectorImpl<Value *> &LiveVec) { - assert(BaseVec.size() == LiveVec.size()); - - SmallVector<NameOrdering, 64> Temp; - for (size_t i = 0; i < BaseVec.size(); i++) { - NameOrdering v; - v.Base = BaseVec[i]; - v.Derived = LiveVec[i]; - Temp.push_back(v); - } - - std::sort(Temp.begin(), Temp.end(), NameOrdering()); - for (size_t i = 0; i < BaseVec.size(); i++) { - BaseVec[i] = Temp[i].Base; - LiveVec[i] = Temp[i].Derived; - } -} - // Replace an existing gc.statepoint with a new one and a set of gc.relocates // which make the relocations happening at this safepoint explicit. 
// // WARNING: Does not do any fixup to adjust users of the original live // values. That's the callers responsibility. static void -makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, +makeStatepointExplicit(DominatorTree &DT, CallSite CS, PartiallyConstructedSafepointRecord &Result, std::vector<DeferredReplacement> &Replacements) { const auto &LiveSet = Result.LiveSet; @@ -1609,11 +1477,6 @@ makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, } assert(LiveVec.size() == BaseVec.size()); - // To make the output IR slightly more stable (for use in diffs), ensure a - // fixed order of the values in the safepoint (by sorting the value name). - // The order is otherwise meaningless. - StabilizeOrder(BaseVec, LiveVec); - // Do the actual rewriting and delete the old statepoint makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements); } @@ -1634,7 +1497,7 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, if (!Relocate) continue; - Value *OriginalValue = const_cast<Value *>(Relocate->getDerivedPtr()); + Value *OriginalValue = Relocate->getDerivedPtr(); assert(AllocaMap.count(OriginalValue)); Value *Alloca = AllocaMap[OriginalValue]; @@ -1660,11 +1523,10 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, // Helper function for the "relocationViaAlloca". Similar to the // "insertRelocationStores" but works for rematerialized values. -static void -insertRematerializationStores( - RematerializedValueMapTy RematerializedValues, - DenseMap<Value *, Value *> &AllocaMap, - DenseSet<Value *> &VisitedLiveValues) { +static void insertRematerializationStores( + const RematerializedValueMapTy &RematerializedValues, + DenseMap<Value *, Value *> &AllocaMap, + DenseSet<Value *> &VisitedLiveValues) { for (auto RematerializedValuePair: RematerializedValues) { Instruction *RematerializedValue = RematerializedValuePair.first; @@ -1691,9 +1553,8 @@ static void relocationViaAlloca( // record initial number of (static) allocas; we'll check we have the same // number when we get done. int InitialAllocaNum = 0; - for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E; - I++) - if (isa<AllocaInst>(*I)) + for (Instruction &I : F.getEntryBlock()) + if (isa<AllocaInst>(I)) InitialAllocaNum++; #endif @@ -1777,8 +1638,7 @@ static void relocationViaAlloca( auto InsertClobbersAt = [&](Instruction *IP) { for (auto *AI : ToClobber) { - auto AIType = cast<PointerType>(AI->getType()); - auto PT = cast<PointerType>(AIType->getElementType()); + auto PT = cast<PointerType>(AI->getAllocatedType()); Constant *CPN = ConstantPointerNull::get(PT); StoreInst *Store = new StoreInst(CPN, AI); Store->insertBefore(IP); @@ -1919,141 +1779,7 @@ static void findLiveReferences( computeLiveInValues(DT, F, OriginalLivenessData); for (size_t i = 0; i < records.size(); i++) { struct PartiallyConstructedSafepointRecord &info = records[i]; - const CallSite &CS = toUpdate[i]; - analyzeParsePointLiveness(DT, OriginalLivenessData, CS, info); - } -} - -/// Remove any vector of pointers from the live set by scalarizing them over the -/// statepoint instruction. Adds the scalarized pieces to the live set. It -/// would be preferable to include the vector in the statepoint itself, but -/// the lowering code currently does not handle that. Extending it would be -/// slightly non-trivial since it requires a format change. Given how rare -/// such cases are (for the moment?) scalarizing is an acceptable compromise. 
-static void splitVectorValues(Instruction *StatepointInst, - StatepointLiveSetTy &LiveSet, - DenseMap<Value *, Value *>& PointerToBase, - DominatorTree &DT) { - SmallVector<Value *, 16> ToSplit; - for (Value *V : LiveSet) - if (isa<VectorType>(V->getType())) - ToSplit.push_back(V); - - if (ToSplit.empty()) - return; - - DenseMap<Value *, SmallVector<Value *, 16>> ElementMapping; - - Function &F = *(StatepointInst->getParent()->getParent()); - - DenseMap<Value *, AllocaInst *> AllocaMap; - // First is normal return, second is exceptional return (invoke only) - DenseMap<Value *, std::pair<Value *, Value *>> Replacements; - for (Value *V : ToSplit) { - AllocaInst *Alloca = - new AllocaInst(V->getType(), "", F.getEntryBlock().getFirstNonPHI()); - AllocaMap[V] = Alloca; - - VectorType *VT = cast<VectorType>(V->getType()); - IRBuilder<> Builder(StatepointInst); - SmallVector<Value *, 16> Elements; - for (unsigned i = 0; i < VT->getNumElements(); i++) - Elements.push_back(Builder.CreateExtractElement(V, Builder.getInt32(i))); - ElementMapping[V] = Elements; - - auto InsertVectorReform = [&](Instruction *IP) { - Builder.SetInsertPoint(IP); - Builder.SetCurrentDebugLocation(IP->getDebugLoc()); - Value *ResultVec = UndefValue::get(VT); - for (unsigned i = 0; i < VT->getNumElements(); i++) - ResultVec = Builder.CreateInsertElement(ResultVec, Elements[i], - Builder.getInt32(i)); - return ResultVec; - }; - - if (isa<CallInst>(StatepointInst)) { - BasicBlock::iterator Next(StatepointInst); - Next++; - Instruction *IP = &*(Next); - Replacements[V].first = InsertVectorReform(IP); - Replacements[V].second = nullptr; - } else { - InvokeInst *Invoke = cast<InvokeInst>(StatepointInst); - // We've already normalized - check that we don't have shared destination - // blocks - BasicBlock *NormalDest = Invoke->getNormalDest(); - assert(!isa<PHINode>(NormalDest->begin())); - BasicBlock *UnwindDest = Invoke->getUnwindDest(); - assert(!isa<PHINode>(UnwindDest->begin())); - // Insert insert element sequences in both successors - Instruction *IP = &*(NormalDest->getFirstInsertionPt()); - Replacements[V].first = InsertVectorReform(IP); - IP = &*(UnwindDest->getFirstInsertionPt()); - Replacements[V].second = InsertVectorReform(IP); - } - } - - for (Value *V : ToSplit) { - AllocaInst *Alloca = AllocaMap[V]; - - // Capture all users before we start mutating use lists - SmallVector<Instruction *, 16> Users; - for (User *U : V->users()) - Users.push_back(cast<Instruction>(U)); - - for (Instruction *I : Users) { - if (auto Phi = dyn_cast<PHINode>(I)) { - for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) - if (V == Phi->getIncomingValue(i)) { - LoadInst *Load = new LoadInst( - Alloca, "", Phi->getIncomingBlock(i)->getTerminator()); - Phi->setIncomingValue(i, Load); - } - } else { - LoadInst *Load = new LoadInst(Alloca, "", I); - I->replaceUsesOfWith(V, Load); - } - } - - // Store the original value and the replacement value into the alloca - StoreInst *Store = new StoreInst(V, Alloca); - if (auto I = dyn_cast<Instruction>(V)) - Store->insertAfter(I); - else - Store->insertAfter(Alloca); - - // Normal return for invoke, or call return - Instruction *Replacement = cast<Instruction>(Replacements[V].first); - (new StoreInst(Replacement, Alloca))->insertAfter(Replacement); - // Unwind return for invoke only - Replacement = cast_or_null<Instruction>(Replacements[V].second); - if (Replacement) - (new StoreInst(Replacement, Alloca))->insertAfter(Replacement); - } - - // apply mem2reg to promote alloca to SSA - 
SmallVector<AllocaInst *, 16> Allocas; - for (Value *V : ToSplit) - Allocas.push_back(AllocaMap[V]); - PromoteMemToReg(Allocas, DT); - - // Update our tracking of live pointers and base mappings to account for the - // changes we just made. - for (Value *V : ToSplit) { - auto &Elements = ElementMapping[V]; - - LiveSet.erase(V); - LiveSet.insert(Elements.begin(), Elements.end()); - // We need to update the base mapping as well. - assert(PointerToBase.count(V)); - Value *OldBase = PointerToBase[V]; - auto &BaseElements = ElementMapping[OldBase]; - PointerToBase.erase(V); - assert(Elements.size() == BaseElements.size()); - for (unsigned i = 0; i < Elements.size(); i++) { - Value *Elem = Elements[i]; - PointerToBase[Elem] = BaseElements[i]; - } + analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[i], info); } } @@ -2109,7 +1835,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { // Cost of the address calculation - Type *ValTy = GEP->getPointerOperandType()->getPointerElementType(); + Type *ValTy = GEP->getSourceElementType(); Cost += TTI.getAddressComputationCost(ValTy); // And cost of the GEP itself @@ -2244,7 +1970,7 @@ static void rematerializeLiveValues(CallSite CS, // Remove rematerializaed values from the live set for (auto LiveValue: LiveValuesToBeDeleted) { - Info.LiveSet.erase(LiveValue); + Info.LiveSet.remove(LiveValue); } } @@ -2257,11 +1983,8 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Uniqued.insert(ToUpdate.begin(), ToUpdate.end()); assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!"); - for (CallSite CS : ToUpdate) { - assert(CS.getInstruction()->getParent()->getParent() == &F); - assert((UseDeoptBundles || isStatepoint(CS)) && - "expected to already be a deopt statepoint"); - } + for (CallSite CS : ToUpdate) + assert(CS.getInstruction()->getFunction() == &F); #endif // When inserting gc.relocates for invokes, we need to be able to insert at @@ -2287,12 +2010,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, for (CallSite CS : ToUpdate) { SmallVector<Value *, 64> DeoptValues; - iterator_range<const Use *> DeoptStateRange = - UseDeoptBundles - ? iterator_range<const Use *>(GetDeoptBundleOperands(CS)) - : iterator_range<const Use *>(Statepoint(CS).vm_state_args()); - - for (Value *Arg : DeoptStateRange) { + for (Value *Arg : GetDeoptBundleOperands(CS)) { assert(!isUnhandledGCPointerType(Arg->getType()) && "support for FCA unimplemented"); if (isHandledGCPointerType(Arg->getType())) @@ -2374,29 +2092,13 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, for (auto &Info : Records) for (auto &BasePair : Info.PointerToBase) if (isa<Constant>(BasePair.second)) - Info.LiveSet.erase(BasePair.first); + Info.LiveSet.remove(BasePair.first); for (CallInst *CI : Holders) CI->eraseFromParent(); Holders.clear(); - // Do a limited scalarization of any live at safepoint vector values which - // contain pointers. This enables this pass to run after vectorization at - // the cost of some possible performance loss. Note: This is known to not - // handle updating of the side tables correctly which can lead to relocation - // bugs when the same vector is live at multiple statepoints. We're in the - // process of implementing the alternate lowering - relocating the - // vector-of-pointers as first class item and updating the backend to - // understand that - but that's not yet complete. 
- if (UseVectorSplit) - for (size_t i = 0; i < Records.size(); i++) { - PartiallyConstructedSafepointRecord &Info = Records[i]; - Instruction *Statepoint = ToUpdate[i].getInstruction(); - splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet, - Info.PointerToBase, DT); - } - // In order to reduce live set of statepoint we might choose to rematerialize // some values instead of relocating them. This is purely an optimization and // does not influence correctness. @@ -2592,13 +2294,9 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) { getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto NeedsRewrite = [](Instruction &I) { - if (UseDeoptBundles) { - if (ImmutableCallSite CS = ImmutableCallSite(&I)) - return !callsGCLeafFunction(CS); - return false; - } - - return isStatepoint(I); + if (ImmutableCallSite CS = ImmutableCallSite(&I)) + return !callsGCLeafFunction(CS) && !isStatepoint(CS); + return false; }; // Gather all the statepoints which need rewritten. Be careful to only @@ -2682,15 +2380,12 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) { /// Compute the live-in set for the location rbegin starting from /// the live-out set of the basic block -static void computeLiveInValues(BasicBlock::reverse_iterator rbegin, - BasicBlock::reverse_iterator rend, - DenseSet<Value *> &LiveTmp) { - - for (BasicBlock::reverse_iterator ritr = rbegin; ritr != rend; ritr++) { - Instruction *I = &*ritr; - +static void computeLiveInValues(BasicBlock::reverse_iterator Begin, + BasicBlock::reverse_iterator End, + SetVector<Value *> &LiveTmp) { + for (auto &I : make_range(Begin, End)) { // KILL/Def - Remove this definition from LiveIn - LiveTmp.erase(I); + LiveTmp.remove(&I); // Don't consider *uses* in PHI nodes, we handle their contribution to // predecessor blocks when we seed the LiveOut sets @@ -2698,7 +2393,7 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin, continue; // USE - Add to the LiveIn set for this instruction - for (Value *V : I->operands()) { + for (Value *V : I.operands()) { assert(!isUnhandledGCPointerType(V->getType()) && "support for FCA unimplemented"); if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) { @@ -2718,24 +2413,24 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin, } } -static void computeLiveOutSeed(BasicBlock *BB, DenseSet<Value *> &LiveTmp) { - +static void computeLiveOutSeed(BasicBlock *BB, SetVector<Value *> &LiveTmp) { for (BasicBlock *Succ : successors(BB)) { - const BasicBlock::iterator E(Succ->getFirstNonPHI()); - for (BasicBlock::iterator I = Succ->begin(); I != E; I++) { - PHINode *Phi = cast<PHINode>(&*I); - Value *V = Phi->getIncomingValueForBlock(BB); + for (auto &I : *Succ) { + PHINode *PN = dyn_cast<PHINode>(&I); + if (!PN) + break; + + Value *V = PN->getIncomingValueForBlock(BB); assert(!isUnhandledGCPointerType(V->getType()) && "support for FCA unimplemented"); - if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) { + if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) LiveTmp.insert(V); - } } } } -static DenseSet<Value *> computeKillSet(BasicBlock *BB) { - DenseSet<Value *> KillSet; +static SetVector<Value *> computeKillSet(BasicBlock *BB) { + SetVector<Value *> KillSet; for (Instruction &I : *BB) if (isHandledGCPointerType(I.getType())) KillSet.insert(&I); @@ -2745,7 +2440,7 @@ static DenseSet<Value *> computeKillSet(BasicBlock *BB) { #ifndef NDEBUG /// Check that the items in 'Live' dominate 'TI'. 
This is used as a basic /// sanity check for the liveness computation. -static void checkBasicSSA(DominatorTree &DT, DenseSet<Value *> &Live, +static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live, TerminatorInst *TI, bool TermOkay = false) { for (Value *V : Live) { if (auto *I = dyn_cast<Instruction>(V)) { @@ -2773,17 +2468,7 @@ static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data, static void computeLiveInValues(DominatorTree &DT, Function &F, GCPtrLivenessData &Data) { - - SmallSetVector<BasicBlock *, 200> Worklist; - auto AddPredsToWorklist = [&](BasicBlock *BB) { - // We use a SetVector so that we don't have duplicates in the worklist. - Worklist.insert(pred_begin(BB), pred_end(BB)); - }; - auto NextItem = [&]() { - BasicBlock *BB = Worklist.back(); - Worklist.pop_back(); - return BB; - }; + SmallSetVector<BasicBlock *, 32> Worklist; // Seed the liveness for each individual block for (BasicBlock &BB : F) { @@ -2796,56 +2481,55 @@ static void computeLiveInValues(DominatorTree &DT, Function &F, assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill"); #endif - Data.LiveOut[&BB] = DenseSet<Value *>(); + Data.LiveOut[&BB] = SetVector<Value *>(); computeLiveOutSeed(&BB, Data.LiveOut[&BB]); Data.LiveIn[&BB] = Data.LiveSet[&BB]; - set_union(Data.LiveIn[&BB], Data.LiveOut[&BB]); - set_subtract(Data.LiveIn[&BB], Data.KillSet[&BB]); + Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]); + Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]); if (!Data.LiveIn[&BB].empty()) - AddPredsToWorklist(&BB); + Worklist.insert(pred_begin(&BB), pred_end(&BB)); } // Propagate that liveness until stable while (!Worklist.empty()) { - BasicBlock *BB = NextItem(); + BasicBlock *BB = Worklist.pop_back_val(); - // Compute our new liveout set, then exit early if it hasn't changed - // despite the contribution of our successor. - DenseSet<Value *> LiveOut = Data.LiveOut[BB]; + // Compute our new liveout set, then exit early if it hasn't changed despite + // the contribution of our successor. + SetVector<Value *> LiveOut = Data.LiveOut[BB]; const auto OldLiveOutSize = LiveOut.size(); for (BasicBlock *Succ : successors(BB)) { assert(Data.LiveIn.count(Succ)); - set_union(LiveOut, Data.LiveIn[Succ]); + LiveOut.set_union(Data.LiveIn[Succ]); } // assert OutLiveOut is a subset of LiveOut if (OldLiveOutSize == LiveOut.size()) { // If the sets are the same size, then we didn't actually add anything - // when unioning our successors LiveIn Thus, the LiveIn of this block + // when unioning our successors LiveIn. Thus, the LiveIn of this block // hasn't changed. continue; } Data.LiveOut[BB] = LiveOut; // Apply the effects of this basic block - DenseSet<Value *> LiveTmp = LiveOut; - set_union(LiveTmp, Data.LiveSet[BB]); - set_subtract(LiveTmp, Data.KillSet[BB]); + SetVector<Value *> LiveTmp = LiveOut; + LiveTmp.set_union(Data.LiveSet[BB]); + LiveTmp.set_subtract(Data.KillSet[BB]); assert(Data.LiveIn.count(BB)); - const DenseSet<Value *> &OldLiveIn = Data.LiveIn[BB]; + const SetVector<Value *> &OldLiveIn = Data.LiveIn[BB]; // assert: OldLiveIn is a subset of LiveTmp if (OldLiveIn.size() != LiveTmp.size()) { Data.LiveIn[BB] = LiveTmp; - AddPredsToWorklist(BB); + Worklist.insert(pred_begin(BB), pred_end(BB)); } - } // while( !worklist.empty() ) + } // while (!Worklist.empty()) #ifndef NDEBUG // Sanity check our output against SSA properties. This helps catch any // missing kills during the above iteration. 
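The computeLiveInValues worklist above is a standard backward liveness fixed point: LiveOut(BB) is the union of the successors' LiveIn, LiveIn(BB) = (LiveOut u LiveSet) \ KillSet, and a block's predecessors are revisited whenever its LiveIn grows. A simplified standalone sketch over a three-block toy CFG (plain std::set instead of SetVector, and every block is seeded onto the worklist rather than only those with a non-empty live-in):

#include <iostream>
#include <set>
#include <string>
#include <vector>

using ValueSet = std::set<std::string>;

struct Block {
  ValueSet Gen;                  // pointers used before being (re)defined here
  ValueSet Kill;                 // pointers defined in this block
  std::vector<int> Succs, Preds;
};

int main() {
  // Toy CFG: BB0 -> BB1 -> BB2, with %p defined in BB0 and used in BB2.
  std::vector<Block> CFG(3);
  CFG[0].Kill = {"%p"};
  CFG[0].Succs = {1};
  CFG[1].Succs = {2};
  CFG[1].Preds = {0};
  CFG[2].Gen = {"%p"};
  CFG[2].Preds = {1};

  std::vector<ValueSet> LiveIn(CFG.size()), LiveOut(CFG.size());

  // Seed every block, then iterate until nothing changes.
  std::set<int> Worklist;
  for (int BB = 0; BB < (int)CFG.size(); ++BB)
    Worklist.insert(BB);

  while (!Worklist.empty()) {
    int BB = *Worklist.begin();
    Worklist.erase(Worklist.begin());

    // LiveOut(BB) = union of LiveIn(Succ).
    ValueSet Out;
    for (int S : CFG[BB].Succs)
      Out.insert(LiveIn[S].begin(), LiveIn[S].end());
    LiveOut[BB] = Out;

    // LiveIn(BB) = (LiveOut(BB) u Gen(BB)) \ Kill(BB).
    ValueSet In = Out;
    In.insert(CFG[BB].Gen.begin(), CFG[BB].Gen.end());
    for (const std::string &K : CFG[BB].Kill)
      In.erase(K);

    if (In != LiveIn[BB]) {          // grew: predecessors must be revisited
      LiveIn[BB] = In;
      for (int P : CFG[BB].Preds)
        Worklist.insert(P);
    }
  }

  for (int BB = 0; BB < (int)CFG.size(); ++BB) {
    std::cout << "BB" << BB << " live-in:";
    for (const std::string &V : LiveIn[BB])
      std::cout << " " << V;
    std::cout << "\n";
  }
}

The size comparison in the real code plays the same role as the In != LiveIn[BB] test here: once no block's live-in changes, the worklist drains and the solution is stable.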
- for (BasicBlock &BB : F) { + for (BasicBlock &BB : F) checkBasicSSA(DT, Data, BB); - } #endif } @@ -2856,7 +2540,7 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data, // Note: The copy is intentional and required assert(Data.LiveOut.count(BB)); - DenseSet<Value *> LiveOut = Data.LiveOut[BB]; + SetVector<Value *> LiveOut = Data.LiveOut[BB]; // We want to handle the statepoint itself oddly. It's // call result is not live (normal), nor are it's arguments @@ -2864,12 +2548,12 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data, // specifically what we need to relocate BasicBlock::reverse_iterator rend(Inst->getIterator()); computeLiveInValues(BB->rbegin(), rend, LiveOut); - LiveOut.erase(Inst); + LiveOut.remove(Inst); Out.insert(LiveOut.begin(), LiveOut.end()); } static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, - const CallSite &CS, + CallSite CS, PartiallyConstructedSafepointRecord &Info) { Instruction *Inst = CS.getInstruction(); StatepointLiveSetTy Updated; @@ -2877,33 +2561,32 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, #ifndef NDEBUG DenseSet<Value *> Bases; - for (auto KVPair : Info.PointerToBase) { + for (auto KVPair : Info.PointerToBase) Bases.insert(KVPair.second); - } #endif + // We may have base pointers which are now live that weren't before. We need // to update the PointerToBase structure to reflect this. for (auto V : Updated) - if (!Info.PointerToBase.count(V)) { - assert(Bases.count(V) && "can't find base for unexpected live value"); - Info.PointerToBase[V] = V; + if (Info.PointerToBase.insert({V, V}).second) { + assert(Bases.count(V) && "Can't find base for unexpected live value!"); continue; } #ifndef NDEBUG - for (auto V : Updated) { + for (auto V : Updated) assert(Info.PointerToBase.count(V) && - "must be able to find base for live value"); - } + "Must be able to find base for live value!"); #endif // Remove any stale base mappings - this can happen since our liveness is - // more precise then the one inherent in the base pointer analysis + // more precise then the one inherent in the base pointer analysis. 
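One small idiom from the recomputeLiveInValues hunk above: Info.PointerToBase.insert({V, V}).second both installs the self-base mapping and reports whether V was previously unmapped, replacing the older count-then-assign sequence. A minimal illustration with std::map (the sample keys are made up):

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> PointerToBase = {{"%d1", "%b1"}};

  const std::string NowLive[] = {"%d1", "%b2"};
  for (const std::string &V : NowLive) {
    // insert() is a no-op when the key already exists; the returned bool says
    // whether an element was actually added, so a separate lookup is not needed.
    auto [It, Inserted] = PointerToBase.insert({V, V});
    if (Inserted)
      std::cout << V << " is newly live, mapped to itself\n";
    else
      std::cout << V << " already had base " << It->second << "\n";
  }
}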
DenseSet<Value *> ToErase; for (auto KVPair : Info.PointerToBase) if (!Updated.count(KVPair.first)) ToErase.insert(KVPair.first); - for (auto V : ToErase) + + for (auto *V : ToErase) Info.PointerToBase.erase(V); #ifndef NDEBUG diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 8569e080873c9..da700f18cdafb 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -17,15 +17,15 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/IPO/SCCP.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -38,6 +38,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; @@ -57,8 +59,8 @@ namespace { /// class LatticeVal { enum LatticeValueTy { - /// undefined - This LLVM Value has no known value yet. - undefined, + /// unknown - This LLVM Value has no known value yet. + unknown, /// constant - This LLVM Value has a specific constant value. constant, @@ -83,9 +85,9 @@ class LatticeVal { } public: - LatticeVal() : Val(nullptr, undefined) {} + LatticeVal() : Val(nullptr, unknown) {} - bool isUndefined() const { return getLatticeValue() == undefined; } + bool isUnknown() const { return getLatticeValue() == unknown; } bool isConstant() const { return getLatticeValue() == constant || getLatticeValue() == forcedconstant; } @@ -112,7 +114,7 @@ public: return false; } - if (isUndefined()) { + if (isUnknown()) { Val.setInt(constant); assert(V && "Marking constant with NULL"); Val.setPointer(V); @@ -139,7 +141,7 @@ public: } void markForcedConstant(Constant *V) { - assert(isUndefined() && "Can't force a defined value!"); + assert(isUnknown() && "Can't force a defined value!"); Val.setInt(forcedconstant); Val.setPointer(V); } @@ -228,7 +230,7 @@ public: /// performing Interprocedural SCCP. void TrackValueOfGlobalVariable(GlobalVariable *GV) { // We only track the contents of scalar globals. 
- if (GV->getType()->getElementType()->isSingleValueType()) { + if (GV->getValueType()->isSingleValueType()) { LatticeVal &IV = TrackedGlobals[GV]; if (!isa<UndefValue>(GV->getInitializer())) IV.markConstant(GV->getInitializer()); @@ -268,6 +270,18 @@ public: return BBExecutable.count(BB); } + std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const { + std::vector<LatticeVal> StructValues; + StructType *STy = dyn_cast<StructType>(V->getType()); + assert(STy && "getStructLatticeValueFor() can be called only on structs"); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + auto I = StructValueState.find(std::make_pair(V, i)); + assert(I != StructValueState.end() && "Value not in valuemap!"); + StructValues.push_back(I->second); + } + return StructValues; + } + LatticeVal getLatticeValueFor(Value *V) const { DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V); assert(I != ValueState.end() && "V is not in valuemap!"); @@ -302,6 +316,13 @@ public: } private: + // pushToWorkList - Helper for markConstant/markForcedConstant + void pushToWorkList(LatticeVal &IV, Value *V) { + if (IV.isOverdefined()) + return OverdefinedInstWorkList.push_back(V); + InstWorkList.push_back(V); + } + // markConstant - Make a value be marked as "constant". If the value // is not already a constant, add it to the instruction work list so that // the users of the instruction are updated later. @@ -309,10 +330,7 @@ private: void markConstant(LatticeVal &IV, Value *V, Constant *C) { if (!IV.markConstant(C)) return; DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); - if (IV.isOverdefined()) - OverdefinedInstWorkList.push_back(V); - else - InstWorkList.push_back(V); + pushToWorkList(IV, V); } void markConstant(Value *V, Constant *C) { @@ -325,10 +343,7 @@ private: LatticeVal &IV = ValueState[V]; IV.markForcedConstant(C); DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n'); - if (IV.isOverdefined()) - OverdefinedInstWorkList.push_back(V); - else - InstWorkList.push_back(V); + pushToWorkList(IV, V); } @@ -348,14 +363,14 @@ private: } void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) { - if (IV.isOverdefined() || MergeWithV.isUndefined()) + if (IV.isOverdefined() || MergeWithV.isUnknown()) return; // Noop. if (MergeWithV.isOverdefined()) - markOverdefined(IV, V); - else if (IV.isUndefined()) - markConstant(IV, V, MergeWithV.getConstant()); - else if (IV.getConstant() != MergeWithV.getConstant()) - markOverdefined(IV, V); + return markOverdefined(IV, V); + if (IV.isUnknown()) + return markConstant(IV, V, MergeWithV.getConstant()); + if (IV.getConstant() != MergeWithV.getConstant()) + return markOverdefined(IV, V); } void mergeInValue(Value *V, LatticeVal MergeWithV) { @@ -378,7 +393,7 @@ private: return LV; // Common case, already in the map. if (Constant *C = dyn_cast<Constant>(V)) { - // Undef values remain undefined. + // Undef values remain unknown. if (!isa<UndefValue>(V)) LV.markConstant(C); // Constants are constant } @@ -409,7 +424,7 @@ private: if (!Elt) LV.markOverdefined(); // Unknown sort of constant. else if (isa<UndefValue>(Elt)) - ; // Undef values remain undefined. + ; // Undef values remain unknown. else LV.markConstant(Elt); // Constants are constant. } @@ -537,7 +552,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, if (!CI) { // Overdefined condition variables, and branches on unfoldable constant // conditions, mean the branch could go either way. 
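The LatticeVal changes above rename the bottom state from "undefined" to "unknown" and make mergeInValue return through markOverdefined/markConstant, but the lattice itself is the usual three-level one for sparse conditional constant propagation: unknown below any single constant, and two disagreeing constants meeting at overdefined. A standalone sketch of just that merge logic (no forcedconstant state, and plain ints instead of llvm::Constant*):

#include <cassert>
#include <iostream>

// unknown < any single constant < overdefined
struct LatticeVal {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  int Const = 0; // meaningful only when K == Constant

  bool markOverdefined() {
    if (K == Overdefined)
      return false;
    K = Overdefined;
    return true;
  }
  bool markConstant(int C) {
    if (K == Overdefined)
      return false;
    if (K == Constant)
      return Const == C ? false : markOverdefined();
    K = Constant;
    Const = C;
    return true; // state changed, so users would be pushed onto a worklist
  }
  // Merge another value into this one; mirrors the early-return structure of
  // mergeInValue in the patch.
  bool mergeIn(const LatticeVal &Other) {
    if (K == Overdefined || Other.K == Unknown)
      return false; // no-op
    if (Other.K == Overdefined)
      return markOverdefined();
    if (K == Unknown)
      return markConstant(Other.Const);
    return Const == Other.Const ? false : markOverdefined();
  }
};

int main() {
  LatticeVal Phi; // e.g. the state of a PHI node
  LatticeVal A, B, C;
  A.markConstant(42);
  B.markConstant(42);
  C.markConstant(7);

  Phi.mergeIn(A); // unknown merged with 42 -> constant 42
  Phi.mergeIn(B); // 42 merged with 42      -> still constant 42
  std::cout << "after A, B: kind=" << Phi.K << " const=" << Phi.Const << "\n";
  Phi.mergeIn(C); // 42 merged with 7       -> overdefined
  std::cout << "after C:    kind=" << Phi.K << "\n";
  assert(Phi.K == LatticeVal::Overdefined);
}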
- if (!BCValue.isUndefined()) + if (!BCValue.isUnknown()) Succs[0] = Succs[1] = true; return; } @@ -561,9 +576,9 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - if (!CI) { // Overdefined or undefined condition? + if (!CI) { // Overdefined or unknown condition? // All destinations are executable! - if (!SCValue.isUndefined()) + if (!SCValue.isUnknown()) Succs.assign(TI.getNumSuccessors(), true); return; } @@ -607,7 +622,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { // undef conditions mean that neither edge is feasible yet. ConstantInt *CI = BCValue.getConstantInt(); if (!CI) - return !BCValue.isUndefined(); + return !BCValue.isUnknown(); // Constant condition variables mean the branch can only go a single way. return BI->getSuccessor(CI->isZero()) == To; @@ -625,7 +640,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { ConstantInt *CI = SCValue.getConstantInt(); if (!CI) - return !SCValue.isUndefined(); + return !SCValue.isUnknown(); return SI->findCaseValue(CI).getCaseSuccessor() == To; } @@ -677,12 +692,12 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // are overdefined, the PHI becomes overdefined as well. If they are all // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is overdefined. - // If there are no executable operands, the PHI remains undefined. + // If there are no executable operands, the PHI remains unknown. // Constant *OperandVal = nullptr; for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { LatticeVal IV = getValueState(PN.getIncomingValue(i)); - if (IV.isUndefined()) continue; // Doesn't influence PHI node. + if (IV.isUnknown()) continue; // Doesn't influence PHI node. if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) continue; @@ -708,7 +723,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // If we exited the loop, this means that the PHI node only has constant // arguments that agree with each other(and OperandVal is the constant) or // OperandVal is null because there are no defined incoming arguments. If - // this is the case, the PHI remains undefined. + // this is the case, the PHI remains unknown. // if (OperandVal) markConstant(&PN, OperandVal); // Acquire operand value @@ -758,8 +773,9 @@ void SCCPSolver::visitCastInst(CastInst &I) { if (OpSt.isOverdefined()) // Inherit overdefinedness of operand markOverdefined(&I); else if (OpSt.isConstant()) { - Constant *C = - ConstantExpr::getCast(I.getOpcode(), OpSt.getConstant(), I.getType()); + // Fold the constant as we build. + Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpSt.getConstant(), + I.getType(), DL); if (isa<UndefValue>(C)) return; // Propagate constant value @@ -829,7 +845,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) { return markAnythingOverdefined(&I); LatticeVal CondValue = getValueState(I.getCondition()); - if (CondValue.isUndefined()) + if (CondValue.isUnknown()) return; if (ConstantInt *CondCB = CondValue.getConstantInt()) { @@ -849,9 +865,9 @@ void SCCPSolver::visitSelectInst(SelectInst &I) { TVal.getConstant() == FVal.getConstant()) return markConstant(&I, FVal.getConstant()); - if (TVal.isUndefined()) // select ?, undef, X -> X. + if (TVal.isUnknown()) // select ?, undef, X -> X. return mergeInValue(&I, FVal); - if (FVal.isUndefined()) // select ?, X, undef -> X. 
+ if (FVal.isUnknown()) // select ?, X, undef -> X. return mergeInValue(&I, TVal); markOverdefined(&I); } @@ -890,7 +906,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { NonOverdefVal = &V2State; if (NonOverdefVal) { - if (NonOverdefVal->isUndefined()) { + if (NonOverdefVal->isUnknown()) { // Could annihilate value. if (I.getOpcode() == Instruction::And) markConstant(IV, &I, Constant::getNullValue(I.getType())); @@ -934,7 +950,7 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { return markConstant(IV, &I, C); } - // If operands are still undefined, wait for it to resolve. + // If operands are still unknown, wait for it to resolve. if (!V1State.isOverdefined() && !V2State.isOverdefined()) return; @@ -944,69 +960,16 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) { // TODO : SCCP does not handle vectors properly. return markOverdefined(&I); - -#if 0 - LatticeVal &ValState = getValueState(I.getOperand(0)); - LatticeVal &IdxState = getValueState(I.getOperand(1)); - - if (ValState.isOverdefined() || IdxState.isOverdefined()) - markOverdefined(&I); - else if(ValState.isConstant() && IdxState.isConstant()) - markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(), - IdxState.getConstant())); -#endif } void SCCPSolver::visitInsertElementInst(InsertElementInst &I) { // TODO : SCCP does not handle vectors properly. return markOverdefined(&I); -#if 0 - LatticeVal &ValState = getValueState(I.getOperand(0)); - LatticeVal &EltState = getValueState(I.getOperand(1)); - LatticeVal &IdxState = getValueState(I.getOperand(2)); - - if (ValState.isOverdefined() || EltState.isOverdefined() || - IdxState.isOverdefined()) - markOverdefined(&I); - else if(ValState.isConstant() && EltState.isConstant() && - IdxState.isConstant()) - markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(), - EltState.getConstant(), - IdxState.getConstant())); - else if (ValState.isUndefined() && EltState.isConstant() && - IdxState.isConstant()) - markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()), - EltState.getConstant(), - IdxState.getConstant())); -#endif } void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) { // TODO : SCCP does not handle vectors properly. return markOverdefined(&I); -#if 0 - LatticeVal &V1State = getValueState(I.getOperand(0)); - LatticeVal &V2State = getValueState(I.getOperand(1)); - LatticeVal &MaskState = getValueState(I.getOperand(2)); - - if (MaskState.isUndefined() || - (V1State.isUndefined() && V2State.isUndefined())) - return; // Undefined output if mask or both inputs undefined. - - if (V1State.isOverdefined() || V2State.isOverdefined() || - MaskState.isOverdefined()) { - markOverdefined(&I); - } else { - // A mix of constant/undef inputs. - Constant *V1 = V1State.isConstant() ? - V1State.getConstant() : UndefValue::get(I.getType()); - Constant *V2 = V2State.isConstant() ? - V2State.getConstant() : UndefValue::get(I.getType()); - Constant *Mask = MaskState.isConstant() ? - MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType()); - markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask)); - } -#endif } // Handle getelementptr instructions. 
If all operands are constants then we @@ -1020,7 +983,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { LatticeVal State = getValueState(I.getOperand(i)); - if (State.isUndefined()) + if (State.isUnknown()) return; // Operands are not resolved yet. if (State.isOverdefined()) @@ -1066,7 +1029,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { return markAnythingOverdefined(&I); LatticeVal PtrVal = getValueState(I.getOperand(0)); - if (PtrVal.isUndefined()) return; // The pointer is not resolved yet! + if (PtrVal.isUnknown()) return; // The pointer is not resolved yet! LatticeVal &IV = ValueState[&I]; if (IV.isOverdefined()) return; @@ -1094,7 +1057,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { } // Transform load from a constant into a constant if possible. - if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) { + if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) { if (isa<UndefValue>(C)) return; return markConstant(IV, &I, C); @@ -1127,7 +1090,7 @@ CallOverdefined: AI != E; ++AI) { LatticeVal State = getValueState(*AI); - if (State.isUndefined()) + if (State.isUnknown()) return; // Operands are not resolved yet. if (State.isOverdefined()) return markOverdefined(I); @@ -1275,11 +1238,11 @@ void SCCPSolver::Solve() { /// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero, /// even if X isn't defined. bool SCCPSolver::ResolvedUndefsIn(Function &F) { - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!BBExecutable.count(&*BB)) + for (BasicBlock &BB : F) { + if (!BBExecutable.count(&BB)) continue; - for (Instruction &I : *BB) { + for (Instruction &I : BB) { // Look for instructions which produce undef values. if (I.getType()->isVoidTy()) continue; @@ -1301,14 +1264,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // more precise than this but it isn't worth bothering. for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { LatticeVal &LV = getStructValueState(&I, i); - if (LV.isUndefined()) + if (LV.isUnknown()) markOverdefined(LV, &I); } continue; } LatticeVal &LV = getValueState(&I); - if (!LV.isUndefined()) continue; + if (!LV.isUnknown()) continue; // extractvalue is safe; check here because the argument is a struct. if (isa<ExtractValueInst>(I)) @@ -1347,7 +1310,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::FDiv: case Instruction::FRem: // Floating-point binary operation: be conservative. - if (Op0LV.isUndefined() && Op1LV.isUndefined()) + if (Op0LV.isUnknown() && Op1LV.isUnknown()) markForcedConstant(&I, Constant::getNullValue(ITy)); else markOverdefined(&I); @@ -1367,7 +1330,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::Mul: case Instruction::And: // Both operands undef -> undef - if (Op0LV.isUndefined() && Op1LV.isUndefined()) + if (Op0LV.isUnknown() && Op1LV.isUnknown()) break; // undef * X -> 0. X could be zero. // undef & X -> 0. X could be zero. @@ -1376,7 +1339,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::Or: // Both operands undef -> undef - if (Op0LV.isUndefined() && Op1LV.isUndefined()) + if (Op0LV.isUnknown() && Op1LV.isUnknown()) break; // undef | X -> -1. X could be -1. 
markForcedConstant(&I, Constant::getAllOnesValue(ITy)); @@ -1386,7 +1349,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef ^ undef -> 0; strictly speaking, this is not strictly // necessary, but we try to be nice to people who expect this // behavior in simple cases - if (Op0LV.isUndefined() && Op1LV.isUndefined()) { + if (Op0LV.isUnknown() && Op1LV.isUnknown()) { markForcedConstant(&I, Constant::getNullValue(ITy)); return true; } @@ -1399,7 +1362,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::URem: // X / undef -> undef. No change. // X % undef -> undef. No change. - if (Op1LV.isUndefined()) break; + if (Op1LV.isUnknown()) break; // X / 0 -> undef. No change. // X % 0 -> undef. No change. @@ -1413,7 +1376,15 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::AShr: // X >>a undef -> undef. - if (Op1LV.isUndefined()) break; + if (Op1LV.isUnknown()) break; + + // Shifting by the bitwidth or more is undefined. + if (Op1LV.isConstant()) { + if (auto *ShiftAmt = Op1LV.getConstantInt()) + if (ShiftAmt->getLimitedValue() >= + ShiftAmt->getType()->getScalarSizeInBits()) + break; + } // undef >>a X -> all ones markForcedConstant(&I, Constant::getAllOnesValue(ITy)); @@ -1422,7 +1393,15 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::Shl: // X << undef -> undef. // X >> undef -> undef. - if (Op1LV.isUndefined()) break; + if (Op1LV.isUnknown()) break; + + // Shifting by the bitwidth or more is undefined. + if (Op1LV.isConstant()) { + if (auto *ShiftAmt = Op1LV.getConstantInt()) + if (ShiftAmt->getLimitedValue() >= + ShiftAmt->getType()->getScalarSizeInBits()) + break; + } // undef << X -> 0 // undef >> X -> 0 @@ -1431,13 +1410,13 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { case Instruction::Select: Op1LV = getValueState(I.getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. - if (Op0LV.isUndefined()) { + if (Op0LV.isUnknown()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. Op1LV = getValueState(I.getOperand(2)); - } else if (Op1LV.isUndefined()) { + } else if (Op1LV.isUnknown()) { // c ? undef : undef -> undef. No change. Op1LV = getValueState(I.getOperand(2)); - if (Op1LV.isUndefined()) + if (Op1LV.isUnknown()) break; // Otherwise, c ? undef : x -> x. } else { @@ -1487,17 +1466,17 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Check to see if we have a branch or switch on an undefined value. If so // we force the branch to go one way or the other to make the successor // values live. It doesn't really matter which way we force it. - TerminatorInst *TI = BB->getTerminator(); + TerminatorInst *TI = BB.getTerminator(); if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { if (!BI->isConditional()) continue; - if (!getValueState(BI->getCondition()).isUndefined()) + if (!getValueState(BI->getCondition()).isUnknown()) continue; // If the input to SCCP is actually branch on undef, fix the undef to // false. 
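The new shift cases in ResolvedUndefsIn above refuse to force a constant when the constant shift amount is at least the bit width, since such a shift is itself undefined and there is nothing meaningful to fold to. The guard reduces to a single comparison; a tiny standalone illustration using a 32-bit width:

#include <cstdint>
#include <iostream>
#include <limits>

// Folding "undef >>a C" to all-ones (or "undef << C" to zero) only makes sense
// when C is a valid shift amount; shifting a W-bit value by W or more bits is
// undefined. This mirrors the new getLimitedValue() >= getScalarSizeInBits()
// bail-out in the patch.
static bool isValidShiftAmount(uint64_t ShiftAmt, unsigned BitWidth) {
  return ShiftAmt < BitWidth;
}

int main() {
  const unsigned BitWidth = std::numeric_limits<uint32_t>::digits; // 32
  const uint64_t Amounts[] = {1, 31, 32, 40};
  for (uint64_t Amt : Amounts)
    std::cout << "shift by " << Amt << ": "
              << (isValidShiftAmount(Amt, BitWidth) ? "fold" : "skip, undefined")
              << "\n";
}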
if (isa<UndefValue>(BI->getCondition())) { BI->setCondition(ConstantInt::getFalse(BI->getContext())); - markEdgeExecutable(&*BB, TI->getSuccessor(1)); + markEdgeExecutable(&BB, TI->getSuccessor(1)); return true; } @@ -1510,16 +1489,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - if (!SI->getNumCases()) - continue; - if (!getValueState(SI->getCondition()).isUndefined()) + if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown()) continue; // If the input to SCCP is actually switch on undef, fix the undef to // the first constant. if (isa<UndefValue>(SI->getCondition())) { SI->setCondition(SI->case_begin().getCaseValue()); - markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor()); + markEdgeExecutable(&BB, SI->case_begin().getCaseSuccessor()); return true; } @@ -1531,75 +1508,53 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return false; } - -namespace { - //===--------------------------------------------------------------------===// - // - /// SCCP Class - This class uses the SCCPSolver to implement a per-function - /// Sparse Conditional Constant Propagator. - /// - struct SCCP : public FunctionPass { - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - static char ID; // Pass identification, replacement for typeid - SCCP() : FunctionPass(ID) { - initializeSCCPPass(*PassRegistry::getPassRegistry()); +static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { + Constant *Const = nullptr; + if (V->getType()->isStructTy()) { + std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V); + if (std::any_of(IVs.begin(), IVs.end(), + [](LatticeVal &LV) { return LV.isOverdefined(); })) + return false; + std::vector<Constant *> ConstVals; + StructType *ST = dyn_cast<StructType>(V->getType()); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + LatticeVal V = IVs[i]; + ConstVals.push_back(V.isConstant() + ? V.getConstant() + : UndefValue::get(ST->getElementType(i))); } + Const = ConstantStruct::get(ST, ConstVals); + } else { + LatticeVal IV = Solver.getLatticeValueFor(V); + if (IV.isOverdefined()) + return false; + Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType()); + } + assert(Const && "Constant is nullptr here!"); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n'); - // runOnFunction - Run the Sparse Conditional Constant Propagation - // algorithm, and return true if the function was modified. - // - bool runOnFunction(Function &F) override; - }; -} // end anonymous namespace - -char SCCP::ID = 0; -INITIALIZE_PASS(SCCP, "sccp", - "Sparse Conditional Constant Propagation", false, false) - -// createSCCPPass - This is the public interface to this file. -FunctionPass *llvm::createSCCPPass() { - return new SCCP(); + // Replaces all of the uses of a variable with uses of the constant. + V->replaceAllUsesWith(Const); + return true; } -static void DeleteInstructionInBlock(BasicBlock *BB) { - DEBUG(dbgs() << " BasicBlock Dead:" << *BB); - ++NumDeadBlocks; - - // Check to see if there are non-terminating instructions to delete. 
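tryToReplaceWithConstant above extends constant replacement to struct-typed values: every element's lattice state is inspected, any overdefined element blocks the replacement, and otherwise constants (with undef filling the still-unknown slots) are packed into a single ConstantStruct. A standalone sketch of that per-element decision, where nullopt plays the role of an undef element:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

enum class Kind { Unknown, Constant, Overdefined };

struct ElementState {
  Kind K = Kind::Unknown;
  int Value = 0; // meaningful only when K == Kind::Constant
};

// Returns the folded struct elements, or nullopt if any element is
// overdefined (in which case the whole value must be left alone).
static std::optional<std::vector<std::optional<int>>>
foldStruct(const std::vector<ElementState> &Elements) {
  std::vector<std::optional<int>> Folded;
  for (const ElementState &E : Elements) {
    if (E.K == Kind::Overdefined)
      return std::nullopt;            // cannot replace anything
    if (E.K == Kind::Constant)
      Folded.push_back(E.Value);      // known constant element
    else
      Folded.push_back(std::nullopt); // stays undef
  }
  return Folded;
}

int main() {
  std::vector<ElementState> S = {{Kind::Constant, 1}, {Kind::Unknown, 0}};
  if (auto Folded = foldStruct(S)) {
    std::cout << "replace with {";
    for (const auto &E : *Folded)
      std::cout << " " << (E ? std::to_string(*E) : std::string("undef"));
    std::cout << " }\n";
  } else {
    std::cout << "leave as is\n";
  }
}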
- if (isa<TerminatorInst>(BB->begin())) - return; +static bool tryToReplaceInstWithConstant(SCCPSolver &Solver, Instruction *Inst, + bool shouldEraseFromParent) { + if (!tryToReplaceWithConstant(Solver, Inst)) + return false; - // Delete the instructions backwards, as it has a reduced likelihood of having - // to update as many def-use and use-def chains. - Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. - while (EndInst != BB->begin()) { - // Delete the next to last instruction. - Instruction *Inst = &*--EndInst->getIterator(); - if (!Inst->use_empty()) - Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (Inst->isEHPad()) { - EndInst = Inst; - continue; - } - BB->getInstList().erase(Inst); - ++NumInstRemoved; - } + // Delete the instruction. + if (shouldEraseFromParent) + Inst->eraseFromParent(); + return true; } -// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm, +// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm, // and return true if the function was modified. // -bool SCCP::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - +static bool runSCCP(Function &F, const DataLayout &DL, + const TargetLibraryInfo *TLI) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); - const DataLayout &DL = F.getParent()->getDataLayout(); - const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. @@ -1623,9 +1578,13 @@ bool SCCP::runOnFunction(Function &F) { // delete their contents now. Note that we cannot actually delete the blocks, // as we cannot modify the CFG of the function. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(&*BB)) { - DeleteInstructionInBlock(&*BB); + for (BasicBlock &BB : F) { + if (!Solver.isBlockExecutable(&BB)) { + DEBUG(dbgs() << " BasicBlock Dead:" << BB); + + ++NumDeadBlocks; + NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB); + MadeChanges = true; continue; } @@ -1633,70 +1592,74 @@ bool SCCP::runOnFunction(Function &F) { // Iterate over all of the instructions in a function, replacing them with // constants if we have found them to be of constant values. // - for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { + for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) continue; - // TODO: Reconstruct structs from their elements. - if (Inst->getType()->isStructTy()) - continue; - - LatticeVal IV = Solver.getLatticeValueFor(Inst); - if (IV.isOverdefined()) - continue; - - Constant *Const = IV.isConstant() - ? IV.getConstant() : UndefValue::get(Inst->getType()); - DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n'); - - // Replaces all of the uses of a variable with uses of the constant. - Inst->replaceAllUsesWith(Const); - - // Delete the instruction. - Inst->eraseFromParent(); - - // Hey, we just changed something! - MadeChanges = true; - ++NumInstRemoved; + if (tryToReplaceInstWithConstant(Solver, Inst, + true /* shouldEraseFromParent */)) { + // Hey, we just changed something! 
+ MadeChanges = true; + ++NumInstRemoved; + } } } return MadeChanges; } +PreservedAnalyses SCCPPass::run(Function &F, AnalysisManager<Function> &AM) { + const DataLayout &DL = F.getParent()->getDataLayout(); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + if (!runSCCP(F, DL, &TLI)) + return PreservedAnalyses::all(); + + auto PA = PreservedAnalyses(); + PA.preserve<GlobalsAA>(); + return PA; +} + namespace { - //===--------------------------------------------------------------------===// +//===--------------------------------------------------------------------===// +// +/// SCCP Class - This class uses the SCCPSolver to implement a per-function +/// Sparse Conditional Constant Propagator. +/// +class SCCPLegacyPass : public FunctionPass { +public: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + static char ID; // Pass identification, replacement for typeid + SCCPLegacyPass() : FunctionPass(ID) { + initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + // runOnFunction - Run the Sparse Conditional Constant Propagation + // algorithm, and return true if the function was modified. // - /// IPSCCP Class - This class implements interprocedural Sparse Conditional - /// Constant Propagation. - /// - struct IPSCCP : public ModulePass { - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfoWrapperPass>(); - } - static char ID; - IPSCCP() : ModulePass(ID) { - initializeIPSCCPPass(*PassRegistry::getPassRegistry()); - } - bool runOnModule(Module &M) override; - }; + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + const DataLayout &DL = F.getParent()->getDataLayout(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + return runSCCP(F, DL, TLI); + } +}; } // end anonymous namespace -char IPSCCP::ID = 0; -INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp", - "Interprocedural Sparse Conditional Constant Propagation", - false, false) +char SCCPLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp", + "Sparse Conditional Constant Propagation", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(IPSCCP, "ipsccp", - "Interprocedural Sparse Conditional Constant Propagation", - false, false) - -// createIPSCCPPass - This is the public interface to this file. -ModulePass *llvm::createIPSCCPPass() { - return new IPSCCP(); -} +INITIALIZE_PASS_END(SCCPLegacyPass, "sccp", + "Sparse Conditional Constant Propagation", false, false) +// createSCCPPass - This is the public interface to this file. +FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); } static bool AddressIsTaken(const GlobalValue *GV) { // Delete any dead constantexpr klingons. @@ -1725,10 +1688,8 @@ static bool AddressIsTaken(const GlobalValue *GV) { return false; } -bool IPSCCP::runOnModule(Module &M) { - const DataLayout &DL = M.getDataLayout(); - const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); +static bool runIPSCCP(Module &M, const DataLayout &DL, + const TargetLibraryInfo *TLI) { SCCPSolver Solver(DL, TLI); // AddressTakenFunctions - This set keeps track of the address-taken functions @@ -1741,32 +1702,32 @@ bool IPSCCP::runOnModule(Module &M) { // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. 
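For context on the IPSCCP driver changes above: the setup loop decides per function how much the solver may assume. Declarations are skipped, exact definitions get their return values tracked, and local-linkage functions whose address is never taken get their arguments tracked; everything else starts with all arguments overdefined. A rough standalone sketch of that classification with a hypothetical FunctionInfo record (not LLVM's Function API), under the assumption that this summary reads the loop correctly:

#include <iostream>
#include <string>
#include <vector>

struct FunctionInfo { // hypothetical stand-in for what the pass queries
  std::string Name;
  bool IsDeclaration;
  bool HasExactDefinition; // safe to propagate its return value to callers
  bool HasLocalLinkage;
  bool AddressTaken;
};

static std::string classify(const FunctionInfo &F) {
  if (F.IsDeclaration)
    return "skip (no body)";
  std::string Result;
  if (F.HasExactDefinition)
    Result += "track return value; ";
  if (F.HasLocalLinkage && !F.AddressTaken)
    return Result + "track arguments (only direct calls are possible)";
  return Result + "assume called with overdefined arguments";
}

int main() {
  std::vector<FunctionInfo> Fns = {
      {"ext_decl", true, false, false, false},
      {"public_fn", false, true, false, false},
      {"static_helper", false, true, true, false},
      {"static_addr_taken", false, true, true, true},
  };
  for (const auto &F : Fns)
    std::cout << F.Name << ": " << classify(F) << "\n";
}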
// - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) + for (Function &F : M) { + if (F.isDeclaration()) continue; - // If this is a strong or ODR definition of this function, then we can - // propagate information about its result into callsites of it. - if (!F->mayBeOverridden()) - Solver.AddTrackedFunction(&*F); + // If this is an exact definition of this function, then we can propagate + // information about its result into callsites of it. + if (F.hasExactDefinition()) + Solver.AddTrackedFunction(&F); // If this function only has direct calls that we can see, we can track its // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. - if (F->hasLocalLinkage()) { - if (AddressIsTaken(&*F)) - AddressTakenFunctions.insert(&*F); + if (F.hasLocalLinkage()) { + if (AddressIsTaken(&F)) + AddressTakenFunctions.insert(&F); else { - Solver.AddArgumentTrackedFunction(&*F); + Solver.AddArgumentTrackedFunction(&F); continue; } } // Assume the function is called. - Solver.MarkBlockExecutable(&F->front()); + Solver.MarkBlockExecutable(&F.front()); // Assume nothing about the incoming arguments. - for (Argument &AI : F->args()) + for (Argument &AI : F.args()) Solver.markAnythingOverdefined(&AI); } @@ -1784,8 +1745,8 @@ bool IPSCCP::runOnModule(Module &M) { DEBUG(dbgs() << "RESOLVING UNDEFS\n"); ResolvedUndefs = false; - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) - ResolvedUndefs |= Solver.ResolvedUndefsIn(*F); + for (Function &F : M) + ResolvedUndefs |= Solver.ResolvedUndefsIn(F); } bool MadeChanges = false; @@ -1795,79 +1756,47 @@ bool IPSCCP::runOnModule(Module &M) { // SmallVector<BasicBlock*, 512> BlocksToErase; - for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) + for (Function &F : M) { + if (F.isDeclaration()) continue; - if (Solver.isBlockExecutable(&F->front())) { - for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); - AI != E; ++AI) { - if (AI->use_empty() || AI->getType()->isStructTy()) continue; - - // TODO: Could use getStructLatticeValueFor to find out if the entire - // result is a constant and replace it entirely if so. - - LatticeVal IV = Solver.getLatticeValueFor(&*AI); - if (IV.isOverdefined()) continue; - - Constant *CST = IV.isConstant() ? - IV.getConstant() : UndefValue::get(AI->getType()); - DEBUG(dbgs() << "*** Arg " << *AI << " = " << *CST <<"\n"); - - // Replaces all of the uses of a variable with uses of the - // constant. 
- AI->replaceAllUsesWith(CST); - ++IPNumArgsElimed; + if (Solver.isBlockExecutable(&F.front())) { + for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; + ++AI) { + if (AI->use_empty()) + continue; + if (tryToReplaceWithConstant(Solver, &*AI)) + ++IPNumArgsElimed; } } - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!Solver.isBlockExecutable(&*BB)) { - DeleteInstructionInBlock(&*BB); - MadeChanges = true; + DEBUG(dbgs() << " BasicBlock Dead:" << *BB); - TerminatorInst *TI = BB->getTerminator(); - for (BasicBlock *Succ : TI->successors()) { - if (!Succ->empty() && isa<PHINode>(Succ->begin())) - Succ->removePredecessor(&*BB); - } - if (!TI->use_empty()) - TI->replaceAllUsesWith(UndefValue::get(TI->getType())); - TI->eraseFromParent(); - new UnreachableInst(M.getContext(), &*BB); + ++NumDeadBlocks; + NumInstRemoved += + changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false); + + MadeChanges = true; - if (&*BB != &F->front()) + if (&*BB != &F.front()) BlocksToErase.push_back(&*BB); continue; } for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { Instruction *Inst = &*BI++; - if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy()) + if (Inst->getType()->isVoidTy()) continue; - - // TODO: Could use getStructLatticeValueFor to find out if the entire - // result is a constant and replace it entirely if so. - - LatticeVal IV = Solver.getLatticeValueFor(Inst); - if (IV.isOverdefined()) - continue; - - Constant *Const = IV.isConstant() - ? IV.getConstant() : UndefValue::get(Inst->getType()); - DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n'); - - // Replaces all of the uses of a variable with uses of the - // constant. - Inst->replaceAllUsesWith(Const); - - // Delete the instruction. - if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst)) - Inst->eraseFromParent(); - - // Hey, we just changed something! - MadeChanges = true; - ++IPNumInstRemoved; + if (tryToReplaceInstWithConstant( + Solver, Inst, + !isa<CallInst>(Inst) && + !isa<TerminatorInst>(Inst) /* shouldEraseFromParent */)) { + // Hey, we just changed something! + MadeChanges = true; + ++IPNumInstRemoved; + } } } @@ -1918,7 +1847,7 @@ bool IPSCCP::runOnModule(Module &M) { } // Finally, delete the basic block. - F->getBasicBlockList().erase(DeadBB); + F.getBasicBlockList().erase(DeadBB); } BlocksToErase.clear(); } @@ -1937,18 +1866,17 @@ bool IPSCCP::runOnModule(Module &M) { // TODO: Process multiple value ret instructions also. const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals(); - for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(), - E = RV.end(); I != E; ++I) { - Function *F = I->first; - if (I->second.isOverdefined() || F->getReturnType()->isVoidTy()) + for (const auto &I : RV) { + Function *F = I.first; + if (I.second.isOverdefined() || F->getReturnType()->isVoidTy()) continue; // We can only do this if we know that nothing else can call the function. 
if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F)) continue; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) + for (BasicBlock &BB : *F) + if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) if (!isa<UndefValue>(RI->getOperand(0))) ReturnsToZap.push_back(RI); } @@ -1978,3 +1906,52 @@ bool IPSCCP::runOnModule(Module &M) { return MadeChanges; } + +PreservedAnalyses IPSCCPPass::run(Module &M, AnalysisManager<Module> &AM) { + const DataLayout &DL = M.getDataLayout(); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); + if (!runIPSCCP(M, DL, &TLI)) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); +} + +namespace { +//===--------------------------------------------------------------------===// +// +/// IPSCCP Class - This class implements interprocedural Sparse Conditional +/// Constant Propagation. +/// +class IPSCCPLegacyPass : public ModulePass { +public: + static char ID; + + IPSCCPLegacyPass() : ModulePass(ID) { + initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + const DataLayout &DL = M.getDataLayout(); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + return runIPSCCP(M, DL, TLI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } +}; +} // end anonymous namespace + +char IPSCCPLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp", + "Interprocedural Sparse Conditional Constant Propagation", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp", + "Interprocedural Sparse Conditional Constant Propagation", + false, false) + +// createIPSCCPPass - This is the public interface to this file. +ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); } diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index a7361b5fe0839..7d33259c030b7 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -55,8 +55,8 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#if __cplusplus >= 201103L && !defined(NDEBUG) -// We only use this for a debug check in C++11 +#ifndef NDEBUG +// We only use this for a debug check. #include <random> #endif @@ -87,12 +87,13 @@ static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), cl::Hidden); namespace { -/// \brief A custom IRBuilder inserter which prefixes all names if they are -/// preserved. -template <bool preserveNames = true> -class IRBuilderPrefixedInserter - : public IRBuilderDefaultInserter<preserveNames> { +/// \brief A custom IRBuilder inserter which prefixes all names, but only in +/// Assert builds. +class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter { std::string Prefix; + const Twine getNameWithPrefix(const Twine &Name) const { + return Name.isTriviallyEmpty() ? Name : Prefix + Name; + } public: void SetNamePrefix(const Twine &P) { Prefix = P.str(); } @@ -100,27 +101,13 @@ public: protected: void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, BasicBlock::iterator InsertPt) const { - IRBuilderDefaultInserter<preserveNames>::InsertHelper( - I, Name.isTriviallyEmpty() ? 
Name : Prefix + Name, BB, InsertPt); + IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB, + InsertPt); } }; -// Specialization for not preserving the name is trivial. -template <> -class IRBuilderPrefixedInserter<false> - : public IRBuilderDefaultInserter<false> { -public: - void SetNamePrefix(const Twine &P) {} -}; - /// \brief Provide a typedef for IRBuilder that drops names in release builds. -#ifndef NDEBUG -typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>> - IRBuilderTy; -#else -typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>> - IRBuilderTy; -#endif +using IRBuilderTy = llvm::IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>; } namespace { @@ -694,7 +681,7 @@ private: // langref in a very strict sense. If we ever want to enable // SROAStrictInbounds, this code should be factored cleanly into // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds - // by writing out the code here where we have tho underlying allocation + // by writing out the code here where we have the underlying allocation // size readily available. APInt GEPOffset = Offset; const DataLayout &DL = GEPI.getModule()->getDataLayout(); @@ -1015,7 +1002,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) }), Slices.end()); -#if __cplusplus >= 201103L && !defined(NDEBUG) +#ifndef NDEBUG if (SROARandomShuffleSlices) { std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec())); std::shuffle(Slices.begin(), Slices.end(), MT); @@ -1192,8 +1179,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // If this pointer is always safe to load, or if we can prove that there // is already a load in the block, then we can move the load to the pred // block. - if (isDereferenceablePointer(InVal, DL) || - isSafeToLoadUnconditionally(InVal, TI, MaxAlign)) + if (isSafeToLoadUnconditionally(InVal, MaxAlign, DL, TI)) continue; return false; @@ -1262,8 +1248,6 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); const DataLayout &DL = SI.getModule()->getDataLayout(); - bool TDerefable = isDereferenceablePointer(TValue, DL); - bool FDerefable = isDereferenceablePointer(FValue, DL); for (User *U : SI.users()) { LoadInst *LI = dyn_cast<LoadInst>(U); @@ -1273,11 +1257,9 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { // Both operands to the select need to be dereferencable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. 
- if (!TDerefable && - !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment())) + if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI)) return false; - if (!FDerefable && - !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment())) + if (!isSafeToLoadUnconditionally(FValue, LI->getAlignment(), DL, LI)) return false; } @@ -1570,7 +1552,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, if (Operator::getOpcode(Ptr) == Instruction::BitCast) { Ptr = cast<Operator>(Ptr)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { - if (GA->mayBeOverridden()) + if (GA->isInterposable()) break; Ptr = GA->getAliasee(); } else { @@ -1653,8 +1635,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { OldTy = OldTy->getScalarType(); NewTy = NewTy->getScalarType(); if (NewTy->isPointerTy() || OldTy->isPointerTy()) { - if (NewTy->isPointerTy() && OldTy->isPointerTy()) - return true; + if (NewTy->isPointerTy() && OldTy->isPointerTy()) { + return cast<PointerType>(NewTy)->getPointerAddressSpace() == + cast<PointerType>(OldTy)->getPointerAddressSpace(); + } if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) return true; return false; @@ -3123,9 +3107,14 @@ private: void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. - Value *Store = IRB.CreateStore( - IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), - IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep")); + // + // The gep and extractvalue values are factored out of the CreateStore + // call to make the output independent of the argument evaluation order. + Value *ExtractValue = + IRB.CreateExtractValue(Agg, Indices, Name + ".extract"); + Value *InBoundsGEP = + IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"); + Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); } @@ -3380,11 +3369,15 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { for (auto &P : AS.partitions()) { for (Slice &S : P) { Instruction *I = cast<Instruction>(S.getUse()->getUser()); - if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) { - // If this was a load we have to track that it can't participate in any - // pre-splitting! + if (!S.isSplittable() || S.endOffset() <= P.endOffset()) { + // If this is a load we have to track that it can't participate in any + // pre-splitting. If this is a store of a load we have to track that + // that load also can't participate in any pre-splitting. if (auto *LI = dyn_cast<LoadInst>(I)) UnsplittableLoads.insert(LI); + else if (auto *SI = dyn_cast<StoreInst>(I)) + if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand())) + UnsplittableLoads.insert(LI); continue; } assert(P.endOffset() > S.beginOffset() && @@ -3411,9 +3404,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } Loads.push_back(LI); - } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) { - if (!SI || - S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex())) + } else if (auto *SI = dyn_cast<StoreInst>(I)) { + if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex())) + // Skip stores *of* pointers. FIXME: This shouldn't even be possible! 
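One small change in the SROA hunks above is worth calling out: emitFunc now creates the extractvalue and the GEP in named locals before passing them to CreateStore, because C++ does not specify the evaluation order of function arguments, so the order of the two generated instructions in the output would otherwise be compiler-dependent. A standalone illustration of the refactor, with a made-up builder type rather than IRBuilder:

#include <iostream>
#include <string>
#include <vector>

// Made-up stand-in for an instruction builder: each call appends an
// "instruction" to the output in whatever order the call happens to run.
struct Builder {
  std::vector<std::string> Out;
  std::string create(const std::string &Name) {
    Out.push_back(Name);
    return Name;
  }
  void createStore(const std::string &V, const std::string &P) {
    Out.push_back("store " + V + ", " + P);
  }
};

int main() {
  Builder B;
  // Order-dependent form: B.createStore(B.create("extract"), B.create("gep"))
  // may emit "extract" and "gep" in either order, since argument evaluation
  // order is unspecified.
  //
  // Order-independent form, as in the patch: sequence the calls explicitly.
  std::string Extract = B.create("extract");
  std::string Gep = B.create("gep");
  B.createStore(Extract, Gep);
  for (const std::string &I : B.Out)
    std::cout << I << "\n";   // always: extract, gep, store extract, gep
}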
continue; auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand()); if (!StoredLoad || !StoredLoad->isSimple()) @@ -3937,15 +3930,19 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Worklist.insert(NewAI); } } else { - // If we can't promote the alloca, iterate on it to check for new - // refinements exposed by splitting the current alloca. Don't iterate on an - // alloca which didn't actually change and didn't get promoted. - if (NewAI != &AI) - Worklist.insert(NewAI); - // Drop any post-promotion work items if promotion didn't happen. while (PostPromotionWorklist.size() > PPWOldSize) PostPromotionWorklist.pop_back(); + + // We couldn't promote and we didn't create a new partition, nothing + // happened. + if (NewAI == &AI) + return nullptr; + + // If we can't promote the alloca, iterate on it to check for new + // refinements exposed by splitting the current alloca. Don't iterate on an + // alloca which didn't actually change and didn't get promoted. + Worklist.insert(NewAI); } return NewAI; @@ -4024,12 +4021,12 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { auto *Var = DbgDecl->getVariable(); auto *Expr = DbgDecl->getExpression(); DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); - bool IsSplit = Pieces.size() > 1; + uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType()); for (auto Piece : Pieces) { // Create a piece expression describing the new partition or reuse AI's // expression if there is only one partition. auto *PieceExpr = Expr; - if (IsSplit || Expr->isBitPiece()) { + if (Piece.Size < AllocaSize || Expr->isBitPiece()) { // If this alloca is already a scalar replacement of a larger aggregate, // Piece.Offset describes the offset inside the scalar. uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0; @@ -4043,6 +4040,9 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { Size = std::min(Size, AbsEnd - Start); } PieceExpr = DIB.createBitPieceExpression(Start, Size); + } else { + assert(Pieces.size() == 1 && + "partition is as large as original alloca"); } // Remove any existing dbg.declare intrinsic describing the same alloca. @@ -4237,14 +4237,19 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, PostPromotionWorklist.clear(); } while (!Worklist.empty()); + if (!Changed) + return PreservedAnalyses::all(); + // FIXME: Even when promoting allocas we should preserve some abstract set of // CFG-specific analyses. - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + return PA; } -PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) { - return runImpl(F, AM->getResult<DominatorTreeAnalysis>(F), - AM->getResult<AssumptionAnalysis>(F)); +PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> &AM) { + return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F), + AM.getResult<AssumptionAnalysis>(F)); } /// A legacy pass for the legacy pass manager that wraps the \c SROA pass. 
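In the splitAlloca hunk above, a dbg.declare only gets a new bit-piece expression when the partition is smaller than the alloca, or when the alloca is itself already a bit piece of a larger variable; in that case the piece's offset is rebased against the enclosing piece and its size clamped to the variable's extent, which is what the Start/AbsEnd/std::min computation does. A small numeric model of that rebase-and-clamp step, with made-up names rather than the DIExpression API:

#include <algorithm>
#include <cstdint>
#include <iostream>

struct Piece { uint64_t Offset, Size; };   // in bits, like the partition pieces

// Rebase a partition piece against an enclosing (ParentOffset, ParentSize)
// bit piece and clamp it so it never runs past the parent's end.
static Piece rebase(Piece P, uint64_t ParentOffset, uint64_t ParentSize) {
  uint64_t Start = ParentOffset + P.Offset;
  uint64_t AbsEnd = ParentOffset + ParentSize;
  uint64_t Size = std::min(P.Size, AbsEnd > Start ? AbsEnd - Start : 0);
  return {Start, Size};
}

int main() {
  // A 64-bit variable that is itself bits [32, 96) of some larger aggregate:
  // a partition covering bits [48, 80) of the alloca becomes the piece
  // (offset 80, size 16) after rebasing and clamping.
  Piece P = rebase({48, 32}, /*ParentOffset=*/32, /*ParentSize=*/64);
  std::cout << P.Offset << " " << P.Size << "\n";  // 80 16
}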
@@ -4260,7 +4265,7 @@ public: initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override { - if (skipOptnoneFunction(F)) + if (skipFunction(F)) return false; auto PA = Impl.runImpl( diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 52d477cc95736..f235b12e49cc9 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" +#include "llvm/Transforms/Scalar/GVN.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" @@ -31,49 +32,52 @@ using namespace llvm; /// ScalarOpts library. void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCELegacyPassPass(Registry); - initializeBDCEPass(Registry); + initializeBDCELegacyPassPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); - initializeConstantHoistingPass(Registry); + initializeConstantHoistingLegacyPassPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); - initializeDCEPass(Registry); + initializeDCELegacyPassPass(Registry); initializeDeadInstEliminationPass(Registry); initializeScalarizerPass(Registry); - initializeDSEPass(Registry); - initializeGVNPass(Registry); + initializeDSELegacyPassPass(Registry); + initializeGuardWideningLegacyPassPass(Registry); + initializeGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); + initializeGVNHoistLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); initializeInductiveRangeCheckEliminationPass(Registry); - initializeIndVarSimplifyPass(Registry); + initializeIndVarSimplifyLegacyPassPass(Registry); initializeJumpThreadingPass(Registry); - initializeLICMPass(Registry); - initializeLoopDeletionPass(Registry); - initializeLoopAccessAnalysisPass(Registry); - initializeLoopInstSimplifyPass(Registry); + initializeLegacyLICMPassPass(Registry); + initializeLoopDataPrefetchPass(Registry); + initializeLoopDeletionLegacyPassPass(Registry); + initializeLoopAccessLegacyAnalysisPass(Registry); + initializeLoopInstSimplifyLegacyPassPass(Registry); initializeLoopInterchangePass(Registry); - initializeLoopRotatePass(Registry); + initializeLoopRotateLegacyPassPass(Registry); initializeLoopStrengthReducePass(Registry); initializeLoopRerollPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnswitchPass(Registry); - initializeLoopIdiomRecognizePass(Registry); - initializeLowerAtomicPass(Registry); + initializeLoopVersioningLICMPass(Registry); + initializeLoopIdiomRecognizeLegacyPassPass(Registry); + initializeLowerAtomicLegacyPassPass(Registry); initializeLowerExpectIntrinsicPass(Registry); - initializeMemCpyOptPass(Registry); - initializeMergedLoadStoreMotionPass(Registry); + initializeLowerGuardIntrinsicPass(Registry); + initializeMemCpyOptLegacyPassPass(Registry); + initializeMergedLoadStoreMotionLegacyPassPass(Registry); initializeNaryReassociatePass(Registry); - initializePartiallyInlineLibCallsPass(Registry); - initializeReassociatePass(Registry); + initializePartiallyInlineLibCallsLegacyPassPass(Registry); + initializeReassociateLegacyPassPass(Registry); initializeRegToMemPass(Registry); initializeRewriteStatepointsForGCPass(Registry); - initializeSCCPPass(Registry); - initializeIPSCCPPass(Registry); + initializeSCCPLegacyPassPass(Registry); + initializeIPSCCPLegacyPassPass(Registry); 
initializeSROALegacyPassPass(Registry); - initializeSROA_DTPass(Registry); - initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); initializeStructurizeCFGPass(Registry); - initializeSinkingPass(Registry); + initializeSinkingLegacyPassPass(Registry); initializeTailCallElimPass(Registry); initializeSeparateConstOffsetFromGEPPass(Registry); initializeSpeculativeExecutionPass(Registry); @@ -81,9 +85,11 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoadCombinePass(Registry); initializePlaceBackedgeSafepointsImplPass(Registry); initializePlaceSafepointsPass(Registry); - initializeFloat2IntPass(Registry); - initializeLoopDistributePass(Registry); + initializeFloat2IntLegacyPassPass(Registry); + initializeLoopDistributeLegacyPass(Registry); initializeLoopLoadEliminationPass(Registry); + initializeLoopSimplifyCFGLegacyPassPass(Registry); + initializeLoopVersioningPassPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -154,6 +160,10 @@ void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopRerollPass()); } +void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopSimplifyCFGPass()); +} + void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnrollPass()); } @@ -187,16 +197,16 @@ void LLVMAddSCCPPass(LLVMPassManagerRef PM) { } void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createScalarReplAggregatesPass()); + unwrap(PM)->add(createSROAPass()); } void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) { - unwrap(PM)->add(createScalarReplAggregatesPass(-1, false)); + unwrap(PM)->add(createSROAPass()); } void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM, int Threshold) { - unwrap(PM)->add(createScalarReplAggregatesPass(Threshold)); + unwrap(PM)->add(createSROAPass()); } void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) { @@ -227,6 +237,10 @@ void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createEarlyCSEPass()); } +void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createGVNHoistPass()); +} + void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createTypeBasedAAWrapperPass()); } diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp deleted file mode 100644 index 114d22ddf2e44..0000000000000 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ /dev/null @@ -1,2630 +0,0 @@ -//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This transformation implements the well known scalar replacement of -// aggregates transformation. This xform breaks up alloca instructions of -// aggregate type (structure or array) into individual alloca instructions for -// each member (if possible). Then, if possible, it transforms the individual -// alloca instructions into nice clean scalar SSA form. -// -// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they -// often interact, especially for C++ programs. As such, iterating between -// SRoA, then Mem2Reg until we run out of things to promote works well. 
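The file being deleted below is the old scalarrepl pass. Its driver (the runOnFunction further down in this diff) alternated promotion and scalar replacement until neither made progress, which is the iteration the banner comment describes; stripped of the LLVM machinery it is just a fixed-point loop. A standalone sketch with stand-in step functions:

#include <iostream>

// Stand-ins for performPromotion()/performScalarRepl(): each pretends to make
// progress a fixed number of times before reaching its fixed point.
static int PromoteBudget = 2, ReplBudget = 1;
static bool performPromotion()  { return PromoteBudget-- > 0; }
static bool performScalarRepl() { return ReplBudget-- > 0; }

int main() {
  bool Changed = performPromotion();
  while (true) {
    bool LocalChange = performScalarRepl();
    if (!LocalChange) break;          // nothing left to replace
    Changed = true;
    LocalChange = performPromotion();
    if (!LocalChange) break;          // nothing left to promote
  }
  std::cout << (Changed ? "modified\n" : "unchanged\n");
}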
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/CallSite.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DIBuilder.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -using namespace llvm; - -#define DEBUG_TYPE "scalarrepl" - -STATISTIC(NumReplaced, "Number of allocas broken up"); -STATISTIC(NumPromoted, "Number of allocas promoted"); -STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); -STATISTIC(NumConverted, "Number of aggregates converted to scalar"); - -namespace { -#define SROA SROA_ - struct SROA : public FunctionPass { - SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) - : FunctionPass(ID), HasDomTree(hasDT) { - if (T == -1) - SRThreshold = 128; - else - SRThreshold = T; - if (ST == -1) - StructMemberThreshold = 32; - else - StructMemberThreshold = ST; - if (AT == -1) - ArrayElementThreshold = 8; - else - ArrayElementThreshold = AT; - if (SLT == -1) - // Do not limit the scalar integer load size if no threshold is given. - ScalarLoadThreshold = -1; - else - ScalarLoadThreshold = SLT; - } - - bool runOnFunction(Function &F) override; - - bool performScalarRepl(Function &F); - bool performPromotion(Function &F); - - private: - bool HasDomTree; - - /// DeadInsts - Keep track of instructions we have made dead, so that - /// we can remove them after we are done working. - SmallVector<Value*, 32> DeadInsts; - - /// AllocaInfo - When analyzing uses of an alloca instruction, this captures - /// information about the uses. All these fields are initialized to false - /// and set to true when something is learned. - struct AllocaInfo { - /// The alloca to promote. - AllocaInst *AI; - - /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite - /// looping and avoid redundant work. - SmallPtrSet<PHINode*, 8> CheckedPHIs; - - /// isUnsafe - This is set to true if the alloca cannot be SROA'd. - bool isUnsafe : 1; - - /// isMemCpySrc - This is true if this aggregate is memcpy'd from. - bool isMemCpySrc : 1; - - /// isMemCpyDst - This is true if this aggregate is memcpy'd into. - bool isMemCpyDst : 1; - - /// hasSubelementAccess - This is true if a subelement of the alloca is - /// ever accessed, or false if the alloca is only accessed with mem - /// intrinsics or load/store that only access the entire alloca at once. - bool hasSubelementAccess : 1; - - /// hasALoadOrStore - This is true if there are any loads or stores to it. - /// The alloca may just be accessed with memcpy, for example, which would - /// not set this. 
- bool hasALoadOrStore : 1; - - explicit AllocaInfo(AllocaInst *ai) - : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false), - hasSubelementAccess(false), hasALoadOrStore(false) {} - }; - - /// SRThreshold - The maximum alloca size to considered for SROA. - unsigned SRThreshold; - - /// StructMemberThreshold - The maximum number of members a struct can - /// contain to be considered for SROA. - unsigned StructMemberThreshold; - - /// ArrayElementThreshold - The maximum number of elements an array can - /// have to be considered for SROA. - unsigned ArrayElementThreshold; - - /// ScalarLoadThreshold - The maximum size in bits of scalars to load when - /// converting to scalar - unsigned ScalarLoadThreshold; - - void MarkUnsafe(AllocaInfo &I, Instruction *User) { - I.isUnsafe = true; - DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n'); - } - - bool isSafeAllocaToScalarRepl(AllocaInst *AI); - - void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info); - void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset, - AllocaInfo &Info); - void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info); - void isSafeMemAccess(uint64_t Offset, uint64_t MemSize, - Type *MemOpType, bool isStore, AllocaInfo &Info, - Instruction *TheAccess, bool AllowWholeAccess); - bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size, - const DataLayout &DL); - uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, - const DataLayout &DL); - - void DoScalarReplacement(AllocaInst *AI, - std::vector<AllocaInst*> &WorkList); - void DeleteDeadInstructions(); - - void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - SmallVectorImpl<AllocaInst *> &NewElts); - void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, - SmallVectorImpl<AllocaInst *> &NewElts); - void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, - SmallVectorImpl<AllocaInst *> &NewElts); - void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, - uint64_t Offset, - SmallVectorImpl<AllocaInst *> &NewElts); - void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, - AllocaInst *AI, - SmallVectorImpl<AllocaInst *> &NewElts); - void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, - SmallVectorImpl<AllocaInst *> &NewElts); - void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, - SmallVectorImpl<AllocaInst *> &NewElts); - bool ShouldAttemptScalarRepl(AllocaInst *AI); - }; - - // SROA_DT - SROA that uses DominatorTree. - struct SROA_DT : public SROA { - static char ID; - public: - SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : - SROA(T, true, ID, ST, AT, SLT) { - initializeSROA_DTPass(*PassRegistry::getPassRegistry()); - } - - // getAnalysisUsage - This pass does not require any passes, but we know it - // will not alter the CFG, so say so. - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.setPreservesCFG(); - } - }; - - // SROA_SSAUp - SROA that uses SSAUpdater. - struct SROA_SSAUp : public SROA { - static char ID; - public: - SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : - SROA(T, false, ID, ST, AT, SLT) { - initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry()); - } - - // getAnalysisUsage - This pass does not require any passes, but we know it - // will not alter the CFG, so say so. 
- void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.setPreservesCFG(); - } - }; - -} - -char SROA_DT::ID = 0; -char SROA_SSAUp::ID = 0; - -INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", - "Scalar Replacement of Aggregates (DT)", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA_DT, "scalarrepl", - "Scalar Replacement of Aggregates (DT)", false, false) - -INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa", - "Scalar Replacement of Aggregates (SSAUp)", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", - "Scalar Replacement of Aggregates (SSAUp)", false, false) - -// Public interface to the ScalarReplAggregates pass -FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold, - bool UseDomTree, - int StructMemberThreshold, - int ArrayElementThreshold, - int ScalarLoadThreshold) { - if (UseDomTree) - return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold, - ScalarLoadThreshold); - return new SROA_SSAUp(Threshold, StructMemberThreshold, - ArrayElementThreshold, ScalarLoadThreshold); -} - - -//===----------------------------------------------------------------------===// -// Convert To Scalar Optimization. -//===----------------------------------------------------------------------===// - -namespace { -/// ConvertToScalarInfo - This class implements the "Convert To Scalar" -/// optimization, which scans the uses of an alloca and determines if it can -/// rewrite it in terms of a single new alloca that can be mem2reg'd. -class ConvertToScalarInfo { - /// AllocaSize - The size of the alloca being considered in bytes. - unsigned AllocaSize; - const DataLayout &DL; - unsigned ScalarLoadThreshold; - - /// IsNotTrivial - This is set to true if there is some access to the object - /// which means that mem2reg can't promote it. - bool IsNotTrivial; - - /// ScalarKind - Tracks the kind of alloca being considered for promotion, - /// computed based on the uses of the alloca rather than the LLVM type system. - enum { - Unknown, - - // Accesses via GEPs that are consistent with element access of a vector - // type. This will not be converted into a vector unless there is a later - // access using an actual vector type. - ImplicitVector, - - // Accesses via vector operations and GEPs that are consistent with the - // layout of a vector type. - Vector, - - // An integer bag-of-bits with bitwise operations for insertion and - // extraction. Any combination of types can be converted into this kind - // of scalar. - Integer - } ScalarKind; - - /// VectorTy - This tracks the type that we should promote the vector to if - /// it is possible to turn it into a vector. This starts out null, and if it - /// isn't possible to turn into a vector type, it gets set to VoidTy. - VectorType *VectorTy; - - /// HadNonMemTransferAccess - True if there is at least one access to the - /// alloca that is not a MemTransferInst. We don't want to turn structs into - /// large integers unless there is some potential for optimization. - bool HadNonMemTransferAccess; - - /// HadDynamicAccess - True if some element of this alloca was dynamic. - /// We don't yet have support for turning a dynamic access into a large - /// integer. 
- bool HadDynamicAccess; - -public: - explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL, - unsigned SLT) - : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false), - ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false), - HadDynamicAccess(false) { } - - AllocaInst *TryConvert(AllocaInst *AI); - -private: - bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx); - void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); - bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); - void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, - Value *NonConstantIdx); - - Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType, - uint64_t Offset, Value* NonConstantIdx, - IRBuilder<> &Builder); - Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, - uint64_t Offset, Value* NonConstantIdx, - IRBuilder<> &Builder); -}; -} // end anonymous namespace. - - -/// TryConvert - Analyze the specified alloca, and if it is safe to do so, -/// rewrite it to be a new alloca which is mem2reg'able. This returns the new -/// alloca if possible or null if not. -AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { - // If we can't convert this scalar, or if mem2reg can trivially do it, bail - // out. - if (!CanConvertToScalar(AI, 0, nullptr) || !IsNotTrivial) - return nullptr; - - // If an alloca has only memset / memcpy uses, it may still have an Unknown - // ScalarKind. Treat it as an Integer below. - if (ScalarKind == Unknown) - ScalarKind = Integer; - - if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8) - ScalarKind = Integer; - - // If we were able to find a vector type that can handle this with - // insert/extract elements, and if there was at least one use that had - // a vector type, promote this to a vector. We don't want to promote - // random stuff that doesn't use vectors (e.g. <9 x double>) because then - // we just get a lot of insert/extracts. If at least one vector is - // involved, then we probably really do have a union of vector/array. - Type *NewTy; - if (ScalarKind == Vector) { - assert(VectorTy && "Missing type for vector scalar."); - DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " - << *VectorTy << '\n'); - NewTy = VectorTy; // Use the vector type. - } else { - unsigned BitWidth = AllocaSize * 8; - - // Do not convert to scalar integer if the alloca size exceeds the - // scalar load threshold. - if (BitWidth > ScalarLoadThreshold) - return nullptr; - - if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && - !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth)) - return nullptr; - // Dynamic accesses on integers aren't yet supported. They need us to shift - // by a dynamic amount which could be difficult to work out as we might not - // know whether to use a left or right shift. - if (ScalarKind == Integer && HadDynamicAccess) - return nullptr; - - DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); - // Create and insert the integer alloca. - NewTy = IntegerType::get(AI->getContext(), BitWidth); - } - AllocaInst *NewAI = - new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front()); - ConvertUsesToScalar(AI, NewAI, 0, nullptr); - return NewAI; -} - -/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type -/// (VectorTy) so far at the offset specified by Offset (which is specified in -/// bytes). 
-/// -/// There are two cases we handle here: -/// 1) A union of vector types of the same size and potentially its elements. -/// Here we turn element accesses into insert/extract element operations. -/// This promotes a <4 x float> with a store of float to the third element -/// into a <4 x float> that uses insert element. -/// 2) A fully general blob of memory, which we turn into some (potentially -/// large) integer type with extract and insert operations where the loads -/// and stores would mutate the memory. We mark this by setting VectorTy -/// to VoidTy. -void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In, - uint64_t Offset) { - // If we already decided to turn this into a blob of integer memory, there is - // nothing to be done. - if (ScalarKind == Integer) - return; - - // If this could be contributing to a vector, analyze it. - - // If the In type is a vector that is the same size as the alloca, see if it - // matches the existing VecTy. - if (VectorType *VInTy = dyn_cast<VectorType>(In)) { - if (MergeInVectorType(VInTy, Offset)) - return; - } else if (In->isFloatTy() || In->isDoubleTy() || - (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 && - isPowerOf2_32(In->getPrimitiveSizeInBits()))) { - // Full width accesses can be ignored, because they can always be turned - // into bitcasts. - unsigned EltSize = In->getPrimitiveSizeInBits()/8; - if (EltSize == AllocaSize) - return; - - // If we're accessing something that could be an element of a vector, see - // if the implied vector agrees with what we already have and if Offset is - // compatible with it. - if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && - (!VectorTy || EltSize == VectorTy->getElementType() - ->getPrimitiveSizeInBits()/8)) { - if (!VectorTy) { - ScalarKind = ImplicitVector; - VectorTy = VectorType::get(In, AllocaSize/EltSize); - } - return; - } - } - - // Otherwise, we have a case that we can't handle with an optimized vector - // form. We can still turn this into a large integer. - ScalarKind = Integer; -} - -/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore, -/// returning true if the type was successfully merged and false otherwise. -bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, - uint64_t Offset) { - if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { - // If we're storing/loading a vector of the right size, allow it as a - // vector. If this the first vector we see, remember the type so that - // we know the element size. If this is a subsequent access, ignore it - // even if it is a differing type but the same size. Worst case we can - // bitcast the resultant vectors. - if (!VectorTy) - VectorTy = VInTy; - ScalarKind = Vector; - return true; - } - - return false; -} - -/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all -/// its accesses to a single vector type, return true and set VecTy to -/// the new type. If we could convert the alloca into a single promotable -/// integer, return true but set VecTy to VoidTy. Further, if the use is not a -/// completely trivial use that mem2reg could promote, set IsNotTrivial. Offset -/// is the current offset from the base of the alloca being analyzed. -/// -/// If we see at least one access to the value that is as a vector type, set the -/// SawVec flag. 
-bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, - Value* NonConstantIdx) { - for (User *U : V->users()) { - Instruction *UI = cast<Instruction>(U); - - if (LoadInst *LI = dyn_cast<LoadInst>(UI)) { - // Don't break volatile loads. - if (!LI->isSimple()) - return false; - // Don't touch MMX operations. - if (LI->getType()->isX86_MMXTy()) - return false; - HadNonMemTransferAccess = true; - MergeInTypeForLoadOrStore(LI->getType(), Offset); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { - // Storing the pointer, not into the value? - if (SI->getOperand(0) == V || !SI->isSimple()) return false; - // Don't touch MMX operations. - if (SI->getOperand(0)->getType()->isX86_MMXTy()) - return false; - HadNonMemTransferAccess = true; - MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset); - continue; - } - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(UI)) { - if (!onlyUsedByLifetimeMarkers(BCI)) - IsNotTrivial = true; // Can't be mem2reg'd. - if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) - return false; - continue; - } - - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UI)) { - // If this is a GEP with a variable indices, we can't handle it. - PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType()); - if (!PtrTy) - return false; - - // Compute the offset that this GEP adds to the pointer. - SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - Value *GEPNonConstantIdx = nullptr; - if (!GEP->hasAllConstantIndices()) { - if (!isa<VectorType>(PtrTy->getElementType())) - return false; - if (NonConstantIdx) - return false; - GEPNonConstantIdx = Indices.pop_back_val(); - if (!GEPNonConstantIdx->getType()->isIntegerTy(32)) - return false; - HadDynamicAccess = true; - } else - GEPNonConstantIdx = NonConstantIdx; - uint64_t GEPOffset = DL.getIndexedOffset(PtrTy, - Indices); - // See if all uses can be converted. - if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx)) - return false; - IsNotTrivial = true; // Can't be mem2reg'd. - HadNonMemTransferAccess = true; - continue; - } - - // If this is a constant sized memset of a constant value (e.g. 0) we can - // handle it. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(UI)) { - // Store to dynamic index. - if (NonConstantIdx) - return false; - // Store of constant value. - if (!isa<ConstantInt>(MSI->getValue())) - return false; - - // Store of constant size. - ConstantInt *Len = dyn_cast<ConstantInt>(MSI->getLength()); - if (!Len) - return false; - - // If the size differs from the alloca, we can only convert the alloca to - // an integer bag-of-bits. - // FIXME: This should handle all of the cases that are currently accepted - // as vector element insertions. - if (Len->getZExtValue() != AllocaSize || Offset != 0) - ScalarKind = Integer; - - IsNotTrivial = true; // Can't be mem2reg'd. - HadNonMemTransferAccess = true; - continue; - } - - // If this is a memcpy or memmove into or out of the whole allocation, we - // can handle it like a load or store of the scalar type. - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(UI)) { - // Store to dynamic index. - if (NonConstantIdx) - return false; - ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); - if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0) - return false; - - IsNotTrivial = true; // Can't be mem2reg'd. - continue; - } - - // If this is a lifetime intrinsic, we can handle it. 
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(UI)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - continue; - } - } - - // Otherwise, we cannot handle this! - return false; - } - - return true; -} - -/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca -/// directly. This happens when we are converting an "integer union" to a -/// single integer scalar, or when we are converting a "vector union" to a -/// vector with insert/extractelement instructions. -/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. By the end of this, there should be no uses of Ptr. -void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, - uint64_t Offset, - Value* NonConstantIdx) { - while (!Ptr->use_empty()) { - Instruction *User = cast<Instruction>(Ptr->user_back()); - - if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { - ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); - CI->eraseFromParent(); - continue; - } - - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { - // Compute the offset that this GEP adds to the pointer. - SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - Value* GEPNonConstantIdx = nullptr; - if (!GEP->hasAllConstantIndices()) { - assert(!NonConstantIdx && - "Dynamic GEP reading from dynamic GEP unsupported"); - GEPNonConstantIdx = Indices.pop_back_val(); - } else - GEPNonConstantIdx = NonConstantIdx; - uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(), - Indices); - ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx); - GEP->eraseFromParent(); - continue; - } - - IRBuilder<> Builder(User); - - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - // The load is a bit extract from NewAI shifted right by Offset bits. - Value *LoadedVal = Builder.CreateLoad(NewAI); - Value *NewLoadVal - = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, - NonConstantIdx, Builder); - LI->replaceAllUsesWith(NewLoadVal); - LI->eraseFromParent(); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - assert(SI->getOperand(0) != Ptr && "Consistency error!"); - Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); - Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, - NonConstantIdx, Builder); - Builder.CreateStore(New, NewAI); - SI->eraseFromParent(); - - // If the load we just inserted is now dead, then the inserted store - // overwrote the entire thing. - if (Old->use_empty()) - Old->eraseFromParent(); - continue; - } - - // If this is a constant sized memset of a constant value (e.g. 0) we can - // transform it into a store of the expanded constant value. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { - assert(MSI->getRawDest() == Ptr && "Consistency error!"); - assert(!NonConstantIdx && "Cannot replace dynamic memset with insert"); - int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); - if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { - unsigned NumBytes = static_cast<unsigned>(SNumBytes); - unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); - - // Compute the value replicated the right number of times. - APInt APVal(NumBytes*8, Val); - - // Splat the value if non-zero. 
- if (Val) - for (unsigned i = 1; i != NumBytes; ++i) - APVal |= APVal << 8; - - Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); - Value *New = ConvertScalar_InsertValue( - ConstantInt::get(User->getContext(), APVal), - Old, Offset, nullptr, Builder); - Builder.CreateStore(New, NewAI); - - // If the load we just inserted is now dead, then the memset overwrote - // the entire thing. - if (Old->use_empty()) - Old->eraseFromParent(); - } - MSI->eraseFromParent(); - continue; - } - - // If this is a memcpy or memmove into or out of the whole allocation, we - // can handle it like a load or store of the scalar type. - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { - assert(Offset == 0 && "must be store to start of alloca"); - assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert"); - - // If the source and destination are both to the same alloca, then this is - // a noop copy-to-self, just delete it. Otherwise, emit a load and store - // as appropriate. - AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, DL, 0)); - - if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) { - // Dest must be OrigAI, change this to be a load from the original - // pointer (bitcasted), then a store to our new alloca. - assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); - Value *SrcPtr = MTI->getSource(); - PointerType* SPTy = cast<PointerType>(SrcPtr->getType()); - PointerType* AIPTy = cast<PointerType>(NewAI->getType()); - if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) { - AIPTy = PointerType::get(AIPTy->getElementType(), - SPTy->getAddressSpace()); - } - SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy); - - LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); - SrcVal->setAlignment(MTI->getAlignment()); - Builder.CreateStore(SrcVal, NewAI); - } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) { - // Src must be OrigAI, change this to be a load from NewAI then a store - // through the original dest pointer (bitcasted). - assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); - LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); - - PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType()); - PointerType* AIPTy = cast<PointerType>(NewAI->getType()); - if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) { - AIPTy = PointerType::get(AIPTy->getElementType(), - DPTy->getAddressSpace()); - } - Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy); - - StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr); - NewStore->setAlignment(MTI->getAlignment()); - } else { - // Noop transfer. Src == Dst - } - - MTI->eraseFromParent(); - continue; - } - - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - // There's no need to preserve these, as the resulting alloca will be - // converted to a register anyways. - II->eraseFromParent(); - continue; - } - } - - llvm_unreachable("Unsupported operation!"); - } -} - -/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer -/// or vector value FromVal, extracting the bits from the offset specified by -/// Offset. This returns the value, which is of type ToType. -/// -/// This happens when we are converting an "integer union" to a single -/// integer scalar, or when we are converting a "vector union" to a vector with -/// insert/extractelement instructions. 
-/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. -Value *ConvertToScalarInfo:: -ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, - uint64_t Offset, Value* NonConstantIdx, - IRBuilder<> &Builder) { - // If the load is of the whole new alloca, no conversion is needed. - Type *FromType = FromVal->getType(); - if (FromType == ToType && Offset == 0) - return FromVal; - - // If the result alloca is a vector type, this is either an element - // access or a bitcast to another vector type of the same size. - if (VectorType *VTy = dyn_cast<VectorType>(FromType)) { - unsigned FromTypeSize = DL.getTypeAllocSize(FromType); - unsigned ToTypeSize = DL.getTypeAllocSize(ToType); - if (FromTypeSize == ToTypeSize) - return Builder.CreateBitCast(FromVal, ToType); - - // Otherwise it must be an element access. - unsigned Elt = 0; - if (Offset) { - unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType()); - Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - } - // Return the element extracted out of it. - Value *Idx; - if (NonConstantIdx) { - if (Elt) - Idx = Builder.CreateAdd(NonConstantIdx, - Builder.getInt32(Elt), - "dyn.offset"); - else - Idx = NonConstantIdx; - } else - Idx = Builder.getInt32(Elt); - Value *V = Builder.CreateExtractElement(FromVal, Idx); - if (V->getType() != ToType) - V = Builder.CreateBitCast(V, ToType); - return V; - } - - // If ToType is a first class aggregate, extract out each of the pieces and - // use insertvalue's to form the FCA. - if (StructType *ST = dyn_cast<StructType>(ToType)) { - assert(!NonConstantIdx && - "Dynamic indexing into struct types not supported"); - const StructLayout &Layout = *DL.getStructLayout(ST); - Value *Res = UndefValue::get(ST); - for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), - Offset+Layout.getElementOffsetInBits(i), - nullptr, Builder); - Res = Builder.CreateInsertValue(Res, Elt, i); - } - return Res; - } - - if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { - assert(!NonConstantIdx && - "Dynamic indexing into array types not supported"); - uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); - Value *Res = UndefValue::get(AT); - for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, nullptr, - Builder); - Res = Builder.CreateInsertValue(Res, Elt, i); - } - return Res; - } - - // Otherwise, this must be a union that was converted to an integer value. - IntegerType *NTy = cast<IntegerType>(FromVal->getType()); - - // If this is a big-endian system and the load is narrower than the - // full alloca type, we need to do a shift to get the right bits. - int ShAmt = 0; - if (DL.isBigEndian()) { - // On big-endian machines, the lowest bit is stored at the bit offset - // from the pointer given by getTypeStoreSizeInBits. This matters for - // integers with a bitwidth that is not a multiple of 8. - ShAmt = DL.getTypeStoreSizeInBits(NTy) - - DL.getTypeStoreSizeInBits(ToType) - Offset; - } else { - ShAmt = Offset; - } - - // Note: we support negative bitwidths (with shl) which are not defined. - // We do this to support (f.e.) loads off the end of a structure where - // only some bits are used. 
- if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) - FromVal = Builder.CreateLShr(FromVal, - ConstantInt::get(FromVal->getType(), ShAmt)); - else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) - FromVal = Builder.CreateShl(FromVal, - ConstantInt::get(FromVal->getType(), -ShAmt)); - - // Finally, unconditionally truncate the integer to the right width. - unsigned LIBitWidth = DL.getTypeSizeInBits(ToType); - if (LIBitWidth < NTy->getBitWidth()) - FromVal = - Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth)); - else if (LIBitWidth > NTy->getBitWidth()) - FromVal = - Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth)); - - // If the result is an integer, this is a trunc or bitcast. - if (ToType->isIntegerTy()) { - // Should be done. - } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { - // Just do a bitcast, we know the sizes match up. - FromVal = Builder.CreateBitCast(FromVal, ToType); - } else { - // Otherwise must be a pointer. - FromVal = Builder.CreateIntToPtr(FromVal, ToType); - } - assert(FromVal->getType() == ToType && "Didn't convert right?"); - return FromVal; -} - -/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer -/// or vector value "Old" at the offset specified by Offset. -/// -/// This happens when we are converting an "integer union" to a -/// single integer scalar, or when we are converting a "vector union" to a -/// vector with insert/extractelement instructions. -/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. -/// -/// NonConstantIdx is an index value if there was a GEP with a non-constant -/// index value. If this is 0 then all GEPs used to find this insert address -/// are constant. -Value *ConvertToScalarInfo:: -ConvertScalar_InsertValue(Value *SV, Value *Old, - uint64_t Offset, Value* NonConstantIdx, - IRBuilder<> &Builder) { - // Convert the stored type to the actual type, shift it left to insert - // then 'or' into place. - Type *AllocaType = Old->getType(); - LLVMContext &Context = Old->getContext(); - - if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { - uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy); - uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType()); - - // Changing the whole vector with memset or with an access of a different - // vector type? - if (ValSize == VecSize) - return Builder.CreateBitCast(SV, AllocaType); - - // Must be an element insertion. - Type *EltTy = VTy->getElementType(); - if (SV->getType() != EltTy) - SV = Builder.CreateBitCast(SV, EltTy); - uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy); - unsigned Elt = Offset/EltSize; - Value *Idx; - if (NonConstantIdx) { - if (Elt) - Idx = Builder.CreateAdd(NonConstantIdx, - Builder.getInt32(Elt), - "dyn.offset"); - else - Idx = NonConstantIdx; - } else - Idx = Builder.getInt32(Elt); - return Builder.CreateInsertElement(Old, SV, Idx); - } - - // If SV is a first-class aggregate value, insert each value recursively. 
- if (StructType *ST = dyn_cast<StructType>(SV->getType())) { - assert(!NonConstantIdx && - "Dynamic indexing into struct types not supported"); - const StructLayout &Layout = *DL.getStructLayout(ST); - for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i); - Old = ConvertScalar_InsertValue(Elt, Old, - Offset+Layout.getElementOffsetInBits(i), - nullptr, Builder); - } - return Old; - } - - if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { - assert(!NonConstantIdx && - "Dynamic indexing into array types not supported"); - uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); - for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr, - Builder); - } - return Old; - } - - // If SV is a float, convert it to the appropriate integer type. - // If it is a pointer, do the same. - unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType()); - unsigned DestWidth = DL.getTypeSizeInBits(AllocaType); - unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType()); - unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType); - if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) - SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); - else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType())); - - // Zero extend or truncate the value if needed. - if (SV->getType() != AllocaType) { - if (SV->getType()->getPrimitiveSizeInBits() < - AllocaType->getPrimitiveSizeInBits()) - SV = Builder.CreateZExt(SV, AllocaType); - else { - // Truncation may be needed if storing more than the alloca can hold - // (undefined behavior). - SV = Builder.CreateTrunc(SV, AllocaType); - SrcWidth = DestWidth; - SrcStoreWidth = DestStoreWidth; - } - } - - // If this is a big-endian system and the store is narrower than the - // full alloca type, we need to do a shift to get the right bits. - int ShAmt = 0; - if (DL.isBigEndian()) { - // On big-endian machines, the lowest bit is stored at the bit offset - // from the pointer given by getTypeStoreSizeInBits. This matters for - // integers with a bitwidth that is not a multiple of 8. - ShAmt = DestStoreWidth - SrcStoreWidth - Offset; - } else { - ShAmt = Offset; - } - - // Note: we support negative bitwidths (with shr) which are not defined. - // We do this to support (f.e.) stores off the end of a structure where - // only some bits in the structure are set. - APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); - if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { - SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt)); - Mask <<= ShAmt; - } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { - SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt)); - Mask = Mask.lshr(-ShAmt); - } - - // Mask out the bits we are about to insert from the old value, and or - // in the new bits. 
- if (SrcWidth != DestWidth) { - assert(DestWidth > SrcWidth); - Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask"); - SV = Builder.CreateOr(Old, SV, "ins"); - } - return SV; -} - - -//===----------------------------------------------------------------------===// -// SRoA Driver -//===----------------------------------------------------------------------===// - - -bool SROA::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - bool Changed = performPromotion(F); - - while (1) { - bool LocalChange = performScalarRepl(F); - if (!LocalChange) break; // No need to repromote if no scalarrepl - Changed = true; - LocalChange = performPromotion(F); - if (!LocalChange) break; // No need to re-scalarrepl if no promotion - } - - return Changed; -} - -namespace { -class AllocaPromoter : public LoadAndStorePromoter { - AllocaInst *AI; - DIBuilder *DIB; - SmallVector<DbgDeclareInst *, 4> DDIs; - SmallVector<DbgValueInst *, 4> DVIs; -public: - AllocaPromoter(ArrayRef<Instruction*> Insts, SSAUpdater &S, - DIBuilder *DB) - : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {} - - void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { - // Remember which alloca we're promoting (for isInstInList). - this->AI = AI; - if (auto *L = LocalAsMetadata::getIfExists(AI)) { - if (auto *DINode = MetadataAsValue::getIfExists(AI->getContext(), L)) { - for (User *U : DINode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); - } - } - - LoadAndStorePromoter::run(Insts); - AI->eraseFromParent(); - for (SmallVectorImpl<DbgDeclareInst *>::iterator I = DDIs.begin(), - E = DDIs.end(); I != E; ++I) { - DbgDeclareInst *DDI = *I; - DDI->eraseFromParent(); - } - for (SmallVectorImpl<DbgValueInst *>::iterator I = DVIs.begin(), - E = DVIs.end(); I != E; ++I) { - DbgValueInst *DVI = *I; - DVI->eraseFromParent(); - } - } - - bool isInstInList(Instruction *I, - const SmallVectorImpl<Instruction*> &Insts) const override { - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return LI->getOperand(0) == AI; - return cast<StoreInst>(I)->getPointerOperand() == AI; - } - - void updateDebugInfo(Instruction *Inst) const override { - for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), - E = DDIs.end(); I != E; ++I) { - DbgDeclareInst *DDI = *I; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, SI, *DIB); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); - } - for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), - E = DVIs.end(); I != E; ++I) { - DbgValueInst *DVI = *I; - Value *Arg = nullptr; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // If an argument is zero extended then use argument directly. The ZExt - // may be zapped by an optimization pass in future. 
- if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(ZExt->getOperand(0)); - if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(SExt->getOperand(0)); - if (!Arg) - Arg = SI->getOperand(0); - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Arg = LI->getOperand(0); - } else { - continue; - } - DIB->insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), - DVI->getExpression(), DVI->getDebugLoc(), - Inst); - } - } -}; -} // end anon namespace - -/// isSafeSelectToSpeculate - Select instructions that use an alloca and are -/// subsequently loaded can be rewritten to load both input pointers and then -/// select between the result, allowing the load of the alloca to be promoted. -/// From this: -/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other -/// %V = load i32* %P2 -/// to: -/// %V1 = load i32* %Alloca -> will be mem2reg'd -/// %V2 = load i32* %Other -/// %V = select i1 %cond, i32 %V1, i32 %V2 -/// -/// We can do this to a select if its only uses are loads and if the operand to -/// the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst *SI) { - const DataLayout &DL = SI->getModule()->getDataLayout(); - bool TDerefable = isDereferenceablePointer(SI->getTrueValue(), DL); - bool FDerefable = isDereferenceablePointer(SI->getFalseValue(), DL); - - for (User *U : SI->users()) { - LoadInst *LI = dyn_cast<LoadInst>(U); - if (!LI || !LI->isSimple()) return false; - - // Both operands to the select need to be dereferencable, either absolutely - // (e.g. allocas) or at this point because we can see other accesses to it. - if (!TDerefable && - !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, - LI->getAlignment())) - return false; - if (!FDerefable && - !isSafeToLoadUnconditionally(SI->getFalseValue(), LI, - LI->getAlignment())) - return false; - } - - return true; -} - -/// isSafePHIToSpeculate - PHI instructions that use an alloca and are -/// subsequently loaded can be rewritten to load both input pointers in the pred -/// blocks and then PHI the results, allowing the load of the alloca to be -/// promoted. -/// From this: -/// %P2 = phi [i32* %Alloca, i32* %Other] -/// %V = load i32* %P2 -/// to: -/// %V1 = load i32* %Alloca -> will be mem2reg'd -/// ... -/// %V2 = load i32* %Other -/// ... -/// %V = phi [i32 %V1, i32 %V2] -/// -/// We can do this to a select if its only uses are loads and if the operand to -/// the select can be loaded unconditionally. -static bool isSafePHIToSpeculate(PHINode *PN) { - // For now, we can only do this promotion if the load is in the same block as - // the PHI, and if there are no stores between the phi and load. - // TODO: Allow recursive phi users. - // TODO: Allow stores. - BasicBlock *BB = PN->getParent(); - unsigned MaxAlign = 0; - for (User *U : PN->users()) { - LoadInst *LI = dyn_cast<LoadInst>(U); - if (!LI || !LI->isSimple()) return false; - - // For now we only allow loads in the same block as the PHI. This is a - // common case that happens when instcombine merges two loads through a PHI. - if (LI->getParent() != BB) return false; - - // Ensure that there are no instructions between the PHI and the load that - // could store. - for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) - if (BBI->mayWriteToMemory()) - return false; - - MaxAlign = std::max(MaxAlign, LI->getAlignment()); - } - - const DataLayout &DL = PN->getModule()->getDataLayout(); - - // Okay, we know that we have one or more loads in the same block as the PHI. 
- // We can transform this if it is safe to push the loads into the predecessor - // blocks. The only thing to watch out for is that we can't put a possibly - // trapping load in the predecessor if it is a critical edge. - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - BasicBlock *Pred = PN->getIncomingBlock(i); - Value *InVal = PN->getIncomingValue(i); - - // If the terminator of the predecessor has side-effects (an invoke), - // there is no safe place to put a load in the predecessor. - if (Pred->getTerminator()->mayHaveSideEffects()) - return false; - - // If the value is produced by the terminator of the predecessor - // (an invoke), there is no valid place to put a load in the predecessor. - if (Pred->getTerminator() == InVal) - return false; - - // If the predecessor has a single successor, then the edge isn't critical. - if (Pred->getTerminator()->getNumSuccessors() == 1) - continue; - - // If this pointer is always safe to load, or if we can prove that there is - // already a load in the block, then we can move the load to the pred block. - if (isDereferenceablePointer(InVal, DL) || - isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign)) - continue; - - return false; - } - - return true; -} - - -/// tryToMakeAllocaBePromotable - This returns true if the alloca only has -/// direct (non-volatile) loads and stores to it. If the alloca is close but -/// not quite there, this will transform the code to allow promotion. As such, -/// it is a non-pure predicate. -static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) { - SetVector<Instruction*, SmallVector<Instruction*, 4>, - SmallPtrSet<Instruction*, 4> > InstsToRewrite; - for (User *U : AI->users()) { - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - if (!LI->isSimple()) - return false; - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (SI->getOperand(0) == AI || !SI->isSimple()) - return false; // Don't allow a store OF the AI, only INTO the AI. - continue; - } - - if (SelectInst *SI = dyn_cast<SelectInst>(U)) { - // If the condition being selected on is a constant, fold the select, yes - // this does (rarely) happen early on. - if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) { - Value *Result = SI->getOperand(1+CI->isZero()); - SI->replaceAllUsesWith(Result); - SI->eraseFromParent(); - - // This is very rare and we just scrambled the use list of AI, start - // over completely. - return tryToMakeAllocaBePromotable(AI, DL); - } - - // If it is safe to turn "load (select c, AI, ptr)" into a select of two - // loads, then we can transform this by rewriting the select. - if (!isSafeSelectToSpeculate(SI)) - return false; - - InstsToRewrite.insert(SI); - continue; - } - - if (PHINode *PN = dyn_cast<PHINode>(U)) { - if (PN->use_empty()) { // Dead PHIs can be stripped. - InstsToRewrite.insert(PN); - continue; - } - - // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads - // in the pred blocks, then we can transform this by rewriting the PHI. - if (!isSafePHIToSpeculate(PN)) - return false; - - InstsToRewrite.insert(PN); - continue; - } - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - if (onlyUsedByLifetimeMarkers(BCI)) { - InstsToRewrite.insert(BCI); - continue; - } - } - - return false; - } - - // If there are no instructions to rewrite, then all uses are load/stores and - // we're done! 
- if (InstsToRewrite.empty()) - return true; - - // If we have instructions that need to be rewritten for this to be promotable - // take care of it now. - for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { - if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) { - // This could only be a bitcast used by nothing but lifetime intrinsics. - for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end(); - I != E;) - cast<Instruction>(*I++)->eraseFromParent(); - BCI->eraseFromParent(); - continue; - } - - if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) { - // Selects in InstsToRewrite only have load uses. Rewrite each as two - // loads with a new select. - while (!SI->use_empty()) { - LoadInst *LI = cast<LoadInst>(SI->user_back()); - - IRBuilder<> Builder(LI); - LoadInst *TrueLoad = - Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t"); - LoadInst *FalseLoad = - Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f"); - - // Transfer alignment and AA info if present. - TrueLoad->setAlignment(LI->getAlignment()); - FalseLoad->setAlignment(LI->getAlignment()); - - AAMDNodes Tags; - LI->getAAMetadata(Tags); - if (Tags) { - TrueLoad->setAAMetadata(Tags); - FalseLoad->setAAMetadata(Tags); - } - - Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); - V->takeName(LI); - LI->replaceAllUsesWith(V); - LI->eraseFromParent(); - } - - // Now that all the loads are gone, the select is gone too. - SI->eraseFromParent(); - continue; - } - - // Otherwise, we have a PHI node which allows us to push the loads into the - // predecessors. - PHINode *PN = cast<PHINode>(InstsToRewrite[i]); - if (PN->use_empty()) { - PN->eraseFromParent(); - continue; - } - - Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); - PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), - PN->getName()+".ld", PN); - - // Get the AA tags and alignment to use from one of the loads. It doesn't - // matter which one we get and if any differ, it doesn't matter. - LoadInst *SomeLoad = cast<LoadInst>(PN->user_back()); - - AAMDNodes AATags; - SomeLoad->getAAMetadata(AATags); - unsigned Align = SomeLoad->getAlignment(); - - // Rewrite all loads of the PN to use the new PHI. - while (!PN->use_empty()) { - LoadInst *LI = cast<LoadInst>(PN->user_back()); - LI->replaceAllUsesWith(NewPN); - LI->eraseFromParent(); - } - - // Inject loads into all of the pred blocks. Keep track of which blocks we - // insert them into in case we have multiple edges from the same block. - DenseMap<BasicBlock*, LoadInst*> InsertedLoads; - - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - BasicBlock *Pred = PN->getIncomingBlock(i); - LoadInst *&Load = InsertedLoads[Pred]; - if (!Load) { - Load = new LoadInst(PN->getIncomingValue(i), - PN->getName() + "." 
+ Pred->getName(), - Pred->getTerminator()); - Load->setAlignment(Align); - if (AATags) Load->setAAMetadata(AATags); - } - - NewPN->addIncoming(Load, Pred); - } - - PN->eraseFromParent(); - } - - ++NumAdjusted; - return true; -} - -bool SROA::performPromotion(Function &F) { - std::vector<AllocaInst*> Allocas; - const DataLayout &DL = F.getParent()->getDataLayout(); - DominatorTree *DT = nullptr; - if (HasDomTree) - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AssumptionCache &AC = - getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - - BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function - DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); - bool Changed = false; - SmallVector<Instruction*, 64> Insts; - while (1) { - Allocas.clear(); - - // Find allocas that are safe to promote, by looking at all instructions in - // the entry node - for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? - if (tryToMakeAllocaBePromotable(AI, DL)) - Allocas.push_back(AI); - - if (Allocas.empty()) break; - - if (HasDomTree) - PromoteMemToReg(Allocas, *DT, nullptr, &AC); - else { - SSAUpdater SSA; - for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { - AllocaInst *AI = Allocas[i]; - - // Build list of instructions to promote. - for (User *U : AI->users()) - Insts.push_back(cast<Instruction>(U)); - AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts); - Insts.clear(); - } - } - NumPromoted += Allocas.size(); - Changed = true; - } - - return Changed; -} - - -/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for -/// SROA. It must be a struct or array type with a small number of elements. -bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { - Type *T = AI->getAllocatedType(); - // Do not promote any struct that has too many members. - if (StructType *ST = dyn_cast<StructType>(T)) - return ST->getNumElements() <= StructMemberThreshold; - // Do not promote any array that has too many elements. - if (ArrayType *AT = dyn_cast<ArrayType>(T)) - return AT->getNumElements() <= ArrayElementThreshold; - return false; -} - -// performScalarRepl - This algorithm is a simple worklist driven algorithm, -// which runs on all of the alloca instructions in the entry block, removing -// them if they are only used by getelementptr instructions. -// -bool SROA::performScalarRepl(Function &F) { - std::vector<AllocaInst*> WorkList; - const DataLayout &DL = F.getParent()->getDataLayout(); - - // Scan the entry basic block, adding allocas to the worklist. - BasicBlock &BB = F.getEntryBlock(); - for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) - if (AllocaInst *A = dyn_cast<AllocaInst>(I)) - WorkList.push_back(A); - - // Process the worklist - bool Changed = false; - while (!WorkList.empty()) { - AllocaInst *AI = WorkList.back(); - WorkList.pop_back(); - - // Handle dead allocas trivially. These can be formed by SROA'ing arrays - // with unused elements. - if (AI->use_empty()) { - AI->eraseFromParent(); - Changed = true; - continue; - } - - // If this alloca is impossible for us to promote, reject it early. - if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized()) - continue; - - // Check to see if we can perform the core SROA transformation. We cannot - // transform the allocation instruction if it is an array allocation - // (allocations OF arrays are ok though), and an allocation of a scalar - // value cannot be decomposed at all. 
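(Aside, not part of the patch: an "array allocation" here means the alloca itself carries a runtime element count, e.g. "%p = alloca i32, i32 %n", which the isArrayAllocation() test just above rejects; an alloca whose allocated type merely happens to be an array, e.g. "%p = alloca [4 x i32]", is fine and is exactly what gets decomposed below.)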
- uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType()); - - // Do not promote [0 x %struct]. - if (AllocaSize == 0) continue; - - // Do not promote any struct whose size is too big. - if (AllocaSize > SRThreshold) continue; - - // If the alloca looks like a good candidate for scalar replacement, and if - // all its users can be transformed, then split up the aggregate into its - // separate elements. - if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) { - DoScalarReplacement(AI, WorkList); - Changed = true; - continue; - } - - // If we can turn this aggregate value (potentially with casts) into a - // simple scalar value that can be mem2reg'd into a register value. - // IsNotTrivial tracks whether this is something that mem2reg could have - // promoted itself. If so, we don't want to transform it needlessly. Note - // that we can't just check based on the type: the alloca may be of an i32 - // but that has pointer arithmetic to set byte 3 of it or something. - if (AllocaInst *NewAI = - ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold) - .TryConvert(AI)) { - NewAI->takeName(AI); - AI->eraseFromParent(); - ++NumConverted; - Changed = true; - continue; - } - - // Otherwise, couldn't process this alloca. - } - - return Changed; -} - -/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl -/// predicate, do SROA now. -void SROA::DoScalarReplacement(AllocaInst *AI, - std::vector<AllocaInst*> &WorkList) { - DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n'); - SmallVector<AllocaInst*, 32> ElementAllocas; - if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { - ElementAllocas.reserve(ST->getNumContainedTypes()); - for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ST->getContainedType(i), nullptr, - AI->getAlignment(), - AI->getName() + "." + Twine(i), AI); - ElementAllocas.push_back(NA); - WorkList.push_back(NA); // Add to worklist for recursive processing - } - } else { - ArrayType *AT = cast<ArrayType>(AI->getAllocatedType()); - ElementAllocas.reserve(AT->getNumElements()); - Type *ElTy = AT->getElementType(); - for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ElTy, nullptr, AI->getAlignment(), - AI->getName() + "." + Twine(i), AI); - ElementAllocas.push_back(NA); - WorkList.push_back(NA); // Add to worklist for recursive processing - } - } - - // Now that we have created the new alloca instructions, rewrite all the - // uses of the old alloca. - RewriteForScalarRepl(AI, AI, 0, ElementAllocas); - - // Now erase any instructions that were made dead while rewriting the alloca. - DeleteDeadInstructions(); - AI->eraseFromParent(); - - ++NumReplaced; -} - -/// DeleteDeadInstructions - Erase instructions on the DeadInstrs list, -/// recursively including all their operands that become trivially dead. -void SROA::DeleteDeadInstructions() { - while (!DeadInsts.empty()) { - Instruction *I = cast<Instruction>(DeadInsts.pop_back_val()); - - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *U = dyn_cast<Instruction>(*OI)) { - // Zero out the operand and see if it becomes trivially dead. - // (But, don't add allocas to the dead instruction list -- they are - // already on the worklist and will be deleted separately.) 
- *OI = nullptr; - if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U)) - DeadInsts.push_back(U); - } - - I->eraseFromParent(); - } -} - -/// isSafeForScalarRepl - Check if instruction I is a safe use with regard to -/// performing scalar replacement of alloca AI. The results are flagged in -/// the Info parameter. Offset indicates the position within AI that is -/// referenced by this instruction. -void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, - AllocaInfo &Info) { - const DataLayout &DL = I->getModule()->getDataLayout(); - for (Use &U : I->uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - - if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { - isSafeForScalarRepl(BC, Offset, Info); - } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { - uint64_t GEPOffset = Offset; - isSafeGEP(GEPI, GEPOffset, Info); - if (!Info.isUnsafe) - isSafeForScalarRepl(GEPI, GEPOffset, Info); - } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { - ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); - if (!Length || Length->isNegative()) - return MarkUnsafe(Info, User); - - isSafeMemAccess(Offset, Length->getZExtValue(), nullptr, - U.getOperandNo() == 0, Info, MI, - true /*AllowWholeAccess*/); - } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (!LI->isSimple()) - return MarkUnsafe(Info, User); - Type *LIType = LI->getType(); - isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, - LI, true /*AllowWholeAccess*/); - Info.hasALoadOrStore = true; - - } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - // Store is ok if storing INTO the pointer, not storing the pointer - if (!SI->isSimple() || SI->getOperand(0) == I) - return MarkUnsafe(Info, User); - - Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, - SI, true /*AllowWholeAccess*/); - Info.hasALoadOrStore = true; - } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { - if (II->getIntrinsicID() != Intrinsic::lifetime_start && - II->getIntrinsicID() != Intrinsic::lifetime_end) - return MarkUnsafe(Info, User); - } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { - isSafePHISelectUseForScalarRepl(User, Offset, Info); - } else { - return MarkUnsafe(Info, User); - } - if (Info.isUnsafe) return; - } -} - - -/// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer -/// derived from the alloca, we can often still split the alloca into elements. -/// This is useful if we have a large alloca where one element is phi'd -/// together somewhere: we can SRoA and promote all the other elements even if -/// we end up not being able to promote this one. -/// -/// All we require is that the uses of the PHI do not index into other parts of -/// the alloca. The most important use case for this is single load and stores -/// that are PHI'd together, which can happen due to code sinking. -void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, - AllocaInfo &Info) { - // If we've already checked this PHI, don't do it again. 
- if (PHINode *PN = dyn_cast<PHINode>(I)) - if (!Info.CheckedPHIs.insert(PN).second) - return; - - const DataLayout &DL = I->getModule()->getDataLayout(); - for (User *U : I->users()) { - Instruction *UI = cast<Instruction>(U); - - if (BitCastInst *BC = dyn_cast<BitCastInst>(UI)) { - isSafePHISelectUseForScalarRepl(BC, Offset, Info); - } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) { - // Only allow "bitcast" GEPs for simplicity. We could generalize this, - // but would have to prove that we're staying inside of an element being - // promoted. - if (!GEPI->hasAllZeroIndices()) - return MarkUnsafe(Info, UI); - isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); - } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) { - if (!LI->isSimple()) - return MarkUnsafe(Info, UI); - Type *LIType = LI->getType(); - isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, - LI, false /*AllowWholeAccess*/); - Info.hasALoadOrStore = true; - - } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { - // Store is ok if storing INTO the pointer, not storing the pointer - if (!SI->isSimple() || SI->getOperand(0) == I) - return MarkUnsafe(Info, UI); - - Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, - SI, false /*AllowWholeAccess*/); - Info.hasALoadOrStore = true; - } else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) { - isSafePHISelectUseForScalarRepl(UI, Offset, Info); - } else { - return MarkUnsafe(Info, UI); - } - if (Info.isUnsafe) return; - } -} - -/// isSafeGEP - Check if a GEP instruction can be handled for scalar -/// replacement. It is safe when all the indices are constant, in-bounds -/// references, and when the resulting offset corresponds to an element within -/// the alloca type. The results are flagged in the Info parameter. Upon -/// return, Offset is adjusted as specified by the GEP indices. -void SROA::isSafeGEP(GetElementPtrInst *GEPI, - uint64_t &Offset, AllocaInfo &Info) { - gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); - if (GEPIt == E) - return; - bool NonConstant = false; - unsigned NonConstantIdxSize = 0; - - // Walk through the GEP type indices, checking the types that this indexes - // into. - for (; GEPIt != E; ++GEPIt) { - // Ignore struct elements, no extra checking needed for these. - if ((*GEPIt)->isStructTy()) - continue; - - ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); - if (!IdxVal) - return MarkUnsafe(Info, GEPI); - } - - // Compute the offset due to this GEP and check if the alloca has a - // component element at that offset. - SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); - // If this GEP is non-constant then the last operand must have been a - // dynamic index into a vector. Pop this now as it has no impact on the - // constant part of the offset. - if (NonConstant) - Indices.pop_back(); - - const DataLayout &DL = GEPI->getModule()->getDataLayout(); - Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); - if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize, - DL)) - MarkUnsafe(Info, GEPI); -} - -/// isHomogeneousAggregate - Check if type T is a struct or array containing -/// elements of the same type (which is always true for arrays). If so, -/// return true with NumElts and EltTy set to the number of elements and the -/// element type, respectively. 
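(Concrete examples, added for illustration rather than taken from the patch: [4 x i8] is homogeneous with NumElts = 4 and EltTy = i8; { i32, i32, i32 } is homogeneous with EltTy = i32; { i32, float } is not. A zero-element aggregate still reports true, with EltTy left null.)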
-static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, - Type *&EltTy) { - if (ArrayType *AT = dyn_cast<ArrayType>(T)) { - NumElts = AT->getNumElements(); - EltTy = (NumElts == 0 ? nullptr : AT->getElementType()); - return true; - } - if (StructType *ST = dyn_cast<StructType>(T)) { - NumElts = ST->getNumContainedTypes(); - EltTy = (NumElts == 0 ? nullptr : ST->getContainedType(0)); - for (unsigned n = 1; n < NumElts; ++n) { - if (ST->getContainedType(n) != EltTy) - return false; - } - return true; - } - return false; -} - -/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are -/// "homogeneous" aggregates with the same element type and number of elements. -static bool isCompatibleAggregate(Type *T1, Type *T2) { - if (T1 == T2) - return true; - - unsigned NumElts1, NumElts2; - Type *EltTy1, *EltTy2; - if (isHomogeneousAggregate(T1, NumElts1, EltTy1) && - isHomogeneousAggregate(T2, NumElts2, EltTy2) && - NumElts1 == NumElts2 && - EltTy1 == EltTy2) - return true; - - return false; -} - -/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI -/// alloca or has an offset and size that corresponds to a component element -/// within it. The offset checked here may have been formed from a GEP with a -/// pointer bitcasted to a different type. -/// -/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a -/// unit. If false, it only allows accesses known to be in a single element. -void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, - Type *MemOpType, bool isStore, - AllocaInfo &Info, Instruction *TheAccess, - bool AllowWholeAccess) { - const DataLayout &DL = TheAccess->getModule()->getDataLayout(); - // Check if this is a load/store of the entire alloca. - if (Offset == 0 && AllowWholeAccess && - MemSize == DL.getTypeAllocSize(Info.AI->getAllocatedType())) { - // This can be safe for MemIntrinsics (where MemOpType is 0) and integer - // loads/stores (which are essentially the same as the MemIntrinsics with - // regard to copying padding between elements). But, if an alloca is - // flagged as both a source and destination of such operations, we'll need - // to check later for padding between elements. - if (!MemOpType || MemOpType->isIntegerTy()) { - if (isStore) - Info.isMemCpyDst = true; - else - Info.isMemCpySrc = true; - return; - } - // This is also safe for references using a type that is compatible with - // the type of the alloca, so that loads/stores can be rewritten using - // insertvalue/extractvalue. - if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) { - Info.hasSubelementAccess = true; - return; - } - } - // Check if the offset/size correspond to a component within the alloca type. - Type *T = Info.AI->getAllocatedType(); - if (TypeHasComponent(T, Offset, MemSize, DL)) { - Info.hasSubelementAccess = true; - return; - } - - return MarkUnsafe(Info, TheAccess); -} - -/// TypeHasComponent - Return true if T has a component type with the -/// specified offset and size. If Size is zero, do not check the size. 
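(A concrete example, assuming a conventional layout and not taken from the patch: in { i32, [2 x i16] }, Offset 4 or 6 with Size 2 names one of the i16 elements and yields true, whereas Offset 2 with Size 2 lands in the middle of the leading i32 and yields false.)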
-bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size, - const DataLayout &DL) { - Type *EltTy; - uint64_t EltSize; - if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = DL.getStructLayout(ST); - unsigned EltIdx = Layout->getElementContainingOffset(Offset); - EltTy = ST->getContainedType(EltIdx); - EltSize = DL.getTypeAllocSize(EltTy); - Offset -= Layout->getElementOffset(EltIdx); - } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { - EltTy = AT->getElementType(); - EltSize = DL.getTypeAllocSize(EltTy); - if (Offset >= AT->getNumElements() * EltSize) - return false; - Offset %= EltSize; - } else if (VectorType *VT = dyn_cast<VectorType>(T)) { - EltTy = VT->getElementType(); - EltSize = DL.getTypeAllocSize(EltTy); - if (Offset >= VT->getNumElements() * EltSize) - return false; - Offset %= EltSize; - } else { - return false; - } - if (Offset == 0 && (Size == 0 || EltSize == Size)) - return true; - // Check if the component spans multiple elements. - if (Offset + Size > EltSize) - return false; - return TypeHasComponent(EltTy, Offset, Size, DL); -} - -/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite -/// the instruction I, which references it, to use the separate elements. -/// Offset indicates the position within AI that is referenced by this -/// instruction. -void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - SmallVectorImpl<AllocaInst *> &NewElts) { - const DataLayout &DL = I->getModule()->getDataLayout(); - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { - Use &TheUse = *UI++; - Instruction *User = cast<Instruction>(TheUse.getUser()); - - if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { - RewriteBitCast(BC, AI, Offset, NewElts); - continue; - } - - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { - RewriteGEP(GEPI, AI, Offset, NewElts); - continue; - } - - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { - ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); - uint64_t MemSize = Length->getZExtValue(); - if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType())) - RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); - // Otherwise the intrinsic can only touch a single element and the - // address operand will be updated, so nothing else needs to be done. 
- continue; - } - - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - RewriteLifetimeIntrinsic(II, AI, Offset, NewElts); - } - continue; - } - - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - Type *LIType = LI->getType(); - - if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { - // Replace: - // %res = load { i32, i32 }* %alloc - // with: - // %load.0 = load i32* %alloc.0 - // %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0 - // %load.1 = load i32* %alloc.1 - // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 - // (Also works for arrays instead of structs) - Value *Insert = UndefValue::get(LIType); - IRBuilder<> Builder(LI); - for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { - Value *Load = Builder.CreateLoad(NewElts[i], "load"); - Insert = Builder.CreateInsertValue(Insert, Load, i, "insert"); - } - LI->replaceAllUsesWith(Insert); - DeadInsts.push_back(LI); - } else if (LIType->isIntegerTy() && - DL.getTypeAllocSize(LIType) == - DL.getTypeAllocSize(AI->getAllocatedType())) { - // If this is a load of the entire alloca to an integer, rewrite it. - RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); - } - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - Value *Val = SI->getOperand(0); - Type *SIType = Val->getType(); - if (isCompatibleAggregate(SIType, AI->getAllocatedType())) { - // Replace: - // store { i32, i32 } %val, { i32, i32 }* %alloc - // with: - // %val.0 = extractvalue { i32, i32 } %val, 0 - // store i32 %val.0, i32* %alloc.0 - // %val.1 = extractvalue { i32, i32 } %val, 1 - // store i32 %val.1, i32* %alloc.1 - // (Also works for arrays instead of structs) - IRBuilder<> Builder(SI); - for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { - Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName()); - Builder.CreateStore(Extract, NewElts[i]); - } - DeadInsts.push_back(SI); - } else if (SIType->isIntegerTy() && - DL.getTypeAllocSize(SIType) == - DL.getTypeAllocSize(AI->getAllocatedType())) { - // If this is a store of the entire alloca from an integer, rewrite it. - RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); - } - continue; - } - - if (isa<SelectInst>(User) || isa<PHINode>(User)) { - // If we have a PHI user of the alloca itself (as opposed to a GEP or - // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to - // the new pointer. - if (!isa<AllocaInst>(I)) continue; - - assert(Offset == 0 && NewElts[0] && - "Direct alloca use should have a zero offset"); - - // If we have a use of the alloca, we know the derived uses will be - // utilizing just the first element of the scalarized result. Insert a - // bitcast of the first alloca before the user as required. - AllocaInst *NewAI = NewElts[0]; - BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI); - NewAI->moveBefore(BCI); - TheUse = BCI; - continue; - } - } -} - -/// RewriteBitCast - Update a bitcast reference to the alloca being replaced -/// and recursively continue updating all of its uses. -void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, - SmallVectorImpl<AllocaInst *> &NewElts) { - RewriteForScalarRepl(BC, AI, Offset, NewElts); - if (BC->getOperand(0) != AI) - return; - - // The bitcast references the original alloca. 
Replace its uses with - // references to the alloca containing offset zero (which is normally at - // index zero, but might not be in cases involving structs with elements - // of size zero). - Type *T = AI->getAllocatedType(); - uint64_t EltOffset = 0; - Type *IdxTy; - uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, - BC->getModule()->getDataLayout()); - Instruction *Val = NewElts[Idx]; - if (Val->getType() != BC->getDestTy()) { - Val = new BitCastInst(Val, BC->getDestTy(), "", BC); - Val->takeName(BC); - } - BC->replaceAllUsesWith(Val); - DeadInsts.push_back(BC); -} - -/// FindElementAndOffset - Return the index of the element containing Offset -/// within the specified type, which must be either a struct or an array. -/// Sets T to the type of the element and Offset to the offset within that -/// element. IdxTy is set to the type of the index result to be used in a -/// GEP instruction. -uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, - const DataLayout &DL) { - uint64_t Idx = 0; - - if (StructType *ST = dyn_cast<StructType>(T)) { - const StructLayout *Layout = DL.getStructLayout(ST); - Idx = Layout->getElementContainingOffset(Offset); - T = ST->getContainedType(Idx); - Offset -= Layout->getElementOffset(Idx); - IdxTy = Type::getInt32Ty(T->getContext()); - return Idx; - } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { - T = AT->getElementType(); - uint64_t EltSize = DL.getTypeAllocSize(T); - Idx = Offset / EltSize; - Offset -= Idx * EltSize; - IdxTy = Type::getInt64Ty(T->getContext()); - return Idx; - } - VectorType *VT = cast<VectorType>(T); - T = VT->getElementType(); - uint64_t EltSize = DL.getTypeAllocSize(T); - Idx = Offset / EltSize; - Offset -= Idx * EltSize; - IdxTy = Type::getInt64Ty(T->getContext()); - return Idx; -} - -/// RewriteGEP - Check if this GEP instruction moves the pointer across -/// elements of the alloca that are being split apart, and if so, rewrite -/// the GEP to be relative to the new element. -void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, - SmallVectorImpl<AllocaInst *> &NewElts) { - uint64_t OldOffset = Offset; - const DataLayout &DL = GEPI->getModule()->getDataLayout(); - SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); - // If the GEP was dynamic then it must have been a dynamic vector lookup. - // In this case, it must be the last GEP operand which is dynamic so keep that - // aside until we've found the constant GEP offset then add it back in at the - // end. - Value* NonConstantIdx = nullptr; - if (!GEPI->hasAllConstantIndices()) - NonConstantIdx = Indices.pop_back_val(); - Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); - - RewriteForScalarRepl(GEPI, AI, Offset, NewElts); - - Type *T = AI->getAllocatedType(); - Type *IdxTy; - uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL); - if (GEPI->getOperand(0) == AI) - OldIdx = ~0ULL; // Force the GEP to be rewritten. - - T = AI->getAllocatedType(); - uint64_t EltOffset = Offset; - uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL); - - // If this GEP does not move the pointer across elements of the alloca - // being split, then it does not needs to be rewritten. 
- if (Idx == OldIdx) - return; - - Type *i32Ty = Type::getInt32Ty(AI->getContext()); - SmallVector<Value*, 8> NewArgs; - NewArgs.push_back(Constant::getNullValue(i32Ty)); - while (EltOffset != 0) { - uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL); - NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); - } - if (NonConstantIdx) { - Type* GepTy = T; - // This GEP has a dynamic index. We need to add "i32 0" to index through - // any structs or arrays in the original type until we get to the vector - // to index. - while (!isa<VectorType>(GepTy)) { - NewArgs.push_back(Constant::getNullValue(i32Ty)); - GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U); - } - NewArgs.push_back(NonConstantIdx); - } - Instruction *Val = NewElts[Idx]; - if (NewArgs.size() > 1) { - Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); - Val->takeName(GEPI); - } - if (Val->getType() != GEPI->getType()) - Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI); - GEPI->replaceAllUsesWith(Val); - DeadInsts.push_back(GEPI); -} - -/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it -/// to mark the lifetime of the scalarized memory. -void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, - uint64_t Offset, - SmallVectorImpl<AllocaInst *> &NewElts) { - ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); - // Put matching lifetime markers on everything from Offset up to - // Offset+OldSize. - Type *AIType = AI->getAllocatedType(); - const DataLayout &DL = II->getModule()->getDataLayout(); - uint64_t NewOffset = Offset; - Type *IdxTy; - uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL); - - IRBuilder<> Builder(II); - uint64_t Size = OldSize->getLimitedValue(); - - if (NewOffset) { - // Splice the first element and index 'NewOffset' bytes in. SROA will - // split the alloca again later. - unsigned AS = AI->getType()->getAddressSpace(); - Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS)); - V = Builder.CreateGEP(Builder.getInt8Ty(), V, Builder.getInt64(NewOffset)); - - IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = DL.getTypeAllocSize(IdxTy) - NewOffset; - if (EltSize > Size) { - EltSize = Size; - Size = 0; - } else { - Size -= EltSize; - } - if (II->getIntrinsicID() == Intrinsic::lifetime_start) - Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize)); - else - Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize)); - ++Idx; - } - - for (; Idx != NewElts.size() && Size; ++Idx) { - IdxTy = NewElts[Idx]->getAllocatedType(); - uint64_t EltSize = DL.getTypeAllocSize(IdxTy); - if (EltSize > Size) { - EltSize = Size; - Size = 0; - } else { - Size -= EltSize; - } - if (II->getIntrinsicID() == Intrinsic::lifetime_start) - Builder.CreateLifetimeStart(NewElts[Idx], - Builder.getInt64(EltSize)); - else - Builder.CreateLifetimeEnd(NewElts[Idx], - Builder.getInt64(EltSize)); - } - DeadInsts.push_back(II); -} - -/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. -/// Rewrite it to copy or set the elements of the scalarized memory. -void -SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, - AllocaInst *AI, - SmallVectorImpl<AllocaInst *> &NewElts) { - // If this is a memcpy/memmove, construct the other pointer as the - // appropriate type. The "Other" pointer is the pointer that goes to memory - // that doesn't have anything to do with the alloca that we are promoting. For - // memset, this Value* stays null. 
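(Sketch of the intended result, not verbatim output: for an alloca of { i32, i32 }, a whole-aggregate memcpy is rewritten below into one copy per element, a load/store pair for scalar elements or a narrower memcpy/memset for aggregate ones, with each element's alignment lowered to what that element's offset still guarantees.)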
- Value *OtherPtr = nullptr; - unsigned MemAlignment = MI->getAlignment(); - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy - if (Inst == MTI->getRawDest()) - OtherPtr = MTI->getRawSource(); - else { - assert(Inst == MTI->getRawSource()); - OtherPtr = MTI->getRawDest(); - } - } - - // If there is an other pointer, we want to convert it to the same pointer - // type as AI has, so we can GEP through it safely. - if (OtherPtr) { - unsigned AddrSpace = - cast<PointerType>(OtherPtr->getType())->getAddressSpace(); - - // Remove bitcasts and all-zero GEPs from OtherPtr. This is an - // optimization, but it's also required to detect the corner case where - // both pointer operands are referencing the same memory, and where - // OtherPtr may be a bitcast or GEP that currently being rewritten. (This - // function is only called for mem intrinsics that access the whole - // aggregate, so non-zero GEPs are not an issue here.) - OtherPtr = OtherPtr->stripPointerCasts(); - - // Copying the alloca to itself is a no-op: just delete it. - if (OtherPtr == AI || OtherPtr == NewElts[0]) { - // This code will run twice for a no-op memcpy -- once for each operand. - // Put only one reference to MI on the DeadInsts list. - for (SmallVectorImpl<Value *>::const_iterator I = DeadInsts.begin(), - E = DeadInsts.end(); I != E; ++I) - if (*I == MI) return; - DeadInsts.push_back(MI); - return; - } - - // If the pointer is not the right type, insert a bitcast to the right - // type. - Type *NewTy = - PointerType::get(AI->getType()->getElementType(), AddrSpace); - - if (OtherPtr->getType() != NewTy) - OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI); - } - - // Process each element of the aggregate. - bool SROADest = MI->getRawDest() == Inst; - - Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); - const DataLayout &DL = MI->getModule()->getDataLayout(); - - for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { - // If this is a memcpy/memmove, emit a GEP of the other element address. - Value *OtherElt = nullptr; - unsigned OtherEltAlign = MemAlignment; - - if (OtherPtr) { - Value *Idx[2] = { Zero, - ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; - OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, - OtherPtr->getName()+"."+Twine(i), - MI); - uint64_t EltOffset; - PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); - Type *OtherTy = OtherPtrTy->getElementType(); - if (StructType *ST = dyn_cast<StructType>(OtherTy)) { - EltOffset = DL.getStructLayout(ST)->getElementOffset(i); - } else { - Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); - EltOffset = DL.getTypeAllocSize(EltTy) * i; - } - - // The alignment of the other pointer is the guaranteed alignment of the - // element, which is affected by both the known alignment of the whole - // mem intrinsic and the alignment of the element. If the alignment of - // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the - // known alignment is just 4 bytes. - OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset); - } - - Value *EltPtr = NewElts[i]; - Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType(); - - // If we got down to a scalar, insert a load or store as appropriate. - if (EltTy->isSingleValueType()) { - if (isa<MemTransferInst>(MI)) { - if (SROADest) { - // From Other to Alloca. 
- Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI); - new StoreInst(Elt, EltPtr, MI); - } else { - // From Alloca to Other. - Value *Elt = new LoadInst(EltPtr, "tmp", MI); - new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI); - } - continue; - } - assert(isa<MemSetInst>(MI)); - - // If the stored element is zero (common case), just store a null - // constant. - Constant *StoreVal; - if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) { - if (CI->isZero()) { - StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0> - } else { - // If EltTy is a vector type, get the element type. - Type *ValTy = EltTy->getScalarType(); - - // Construct an integer with the right value. - unsigned EltSize = DL.getTypeSizeInBits(ValTy); - APInt OneVal(EltSize, CI->getZExtValue()); - APInt TotalVal(OneVal); - // Set each byte. - for (unsigned i = 0; 8*i < EltSize; ++i) { - TotalVal = TotalVal.shl(8); - TotalVal |= OneVal; - } - - // Convert the integer value to the appropriate type. - StoreVal = ConstantInt::get(CI->getContext(), TotalVal); - if (ValTy->isPointerTy()) - StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy); - else if (ValTy->isFloatingPointTy()) - StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy); - assert(StoreVal->getType() == ValTy && "Type mismatch!"); - - // If the requested value was a vector constant, create it. - if (EltTy->isVectorTy()) { - unsigned NumElts = cast<VectorType>(EltTy)->getNumElements(); - StoreVal = ConstantVector::getSplat(NumElts, StoreVal); - } - } - new StoreInst(StoreVal, EltPtr, MI); - continue; - } - // Otherwise, if we're storing a byte variable, use a memset call for - // this element. - } - - unsigned EltSize = DL.getTypeAllocSize(EltTy); - if (!EltSize) - continue; - - IRBuilder<> Builder(MI); - - // Finally, insert the meminst for this element. - if (isa<MemSetInst>(MI)) { - Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize, - MI->isVolatile()); - } else { - assert(isa<MemTransferInst>(MI)); - Value *Dst = SROADest ? EltPtr : OtherElt; // Dest ptr - Value *Src = SROADest ? OtherElt : EltPtr; // Src ptr - - if (isa<MemCpyInst>(MI)) - Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile()); - else - Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile()); - } - } - DeadInsts.push_back(MI); -} - -/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that -/// overwrites the entire allocation. Extract out the pieces of the stored -/// integer and store them individually. -void -SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, - SmallVectorImpl<AllocaInst *> &NewElts) { - // Extract each element out of the integer according to its structure offset - // and store the element value to the individual alloca. - Value *SrcVal = SI->getOperand(0); - Type *AllocaEltTy = AI->getAllocatedType(); - const DataLayout &DL = SI->getModule()->getDataLayout(); - uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); - - IRBuilder<> Builder(SI); - - // Handle tail padding by extending the operand - if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) - SrcVal = Builder.CreateZExt(SrcVal, - IntegerType::get(SI->getContext(), AllocaSizeBits)); - - DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI - << '\n'); - - // There are two forms here: AI could be an array or struct. Both cases - // have different ways to compute the element offset. 
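(Aside, not part of the patch: the per-field shift used by both branches below reduces to this standalone sketch, with every size in bits and the alloc-size vs. bit-size distinction ignored; the names are illustrative.)

#include <cassert>
#include <cstdint>

// Shift that moves the field at bit offset 'FieldOff' (width 'FieldBits') of
// a struct stored as one 'TotalBits'-wide integer down to bit 0: little-endian
// uses the layout offset directly, big-endian counts from the opposite end.
uint64_t fieldShift(uint64_t FieldOff, uint64_t FieldBits, uint64_t TotalBits,
                    bool BigEndian) {
  return BigEndian ? TotalBits - FieldOff - FieldBits : FieldOff;
}

int main() {
  // The second i16 of a hypothetical { i16, i16 } stored as a single i32.
  assert(fieldShift(16, 16, 32, /*BigEndian=*/false) == 16);
  assert(fieldShift(16, 16, 32, /*BigEndian=*/true) == 0);
  return 0;
}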
- if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - const StructLayout *Layout = DL.getStructLayout(EltSTy); - - for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { - // Get the number of bits to shift SrcVal to get the value. - Type *FieldTy = EltSTy->getElementType(i); - uint64_t Shift = Layout->getElementOffsetInBits(i); - - if (DL.isBigEndian()) - Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy); - - Value *EltVal = SrcVal; - if (Shift) { - Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); - EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); - } - - // Truncate down to an integer of the right size. - uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); - - // Ignore zero sized fields like {}, they obviously contain no data. - if (FieldSizeBits == 0) continue; - - if (FieldSizeBits != AllocaSizeBits) - EltVal = Builder.CreateTrunc(EltVal, - IntegerType::get(SI->getContext(), FieldSizeBits)); - Value *DestField = NewElts[i]; - if (EltVal->getType() == FieldTy) { - // Storing to an integer field of this size, just do it. - } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) { - // Bitcast to the right element type (for fp/vector values). - EltVal = Builder.CreateBitCast(EltVal, FieldTy); - } else { - // Otherwise, bitcast the dest pointer (for aggregates). - DestField = Builder.CreateBitCast(DestField, - PointerType::getUnqual(EltVal->getType())); - } - new StoreInst(EltVal, DestField, SI); - } - - } else { - ArrayType *ATy = cast<ArrayType>(AllocaEltTy); - Type *ArrayEltTy = ATy->getElementType(); - uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); - uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy); - - uint64_t Shift; - - if (DL.isBigEndian()) - Shift = AllocaSizeBits-ElementOffset; - else - Shift = 0; - - for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { - // Ignore zero sized fields like {}, they obviously contain no data. - if (ElementSizeBits == 0) continue; - - Value *EltVal = SrcVal; - if (Shift) { - Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); - EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); - } - - // Truncate down to an integer of the right size. - if (ElementSizeBits != AllocaSizeBits) - EltVal = Builder.CreateTrunc(EltVal, - IntegerType::get(SI->getContext(), - ElementSizeBits)); - Value *DestField = NewElts[i]; - if (EltVal->getType() == ArrayEltTy) { - // Storing to an integer field of this size, just do it. - } else if (ArrayEltTy->isFloatingPointTy() || - ArrayEltTy->isVectorTy()) { - // Bitcast to the right element type (for fp/vector values). - EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy); - } else { - // Otherwise, bitcast the dest pointer (for aggregates). - DestField = Builder.CreateBitCast(DestField, - PointerType::getUnqual(EltVal->getType())); - } - new StoreInst(EltVal, DestField, SI); - - if (DL.isBigEndian()) - Shift -= ElementOffset; - else - Shift += ElementOffset; - } - } - - DeadInsts.push_back(SI); -} - -/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to -/// an integer. Load the individual pieces to form the aggregate value. -void -SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, - SmallVectorImpl<AllocaInst *> &NewElts) { - // Extract each element out of the NewElts according to its structure offset - // and form the result value. 
- Type *AllocaEltTy = AI->getAllocatedType(); - const DataLayout &DL = LI->getModule()->getDataLayout(); - uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); - - DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI - << '\n'); - - // There are two forms here: AI could be an array or struct. Both cases - // have different ways to compute the element offset. - const StructLayout *Layout = nullptr; - uint64_t ArrayEltBitOffset = 0; - if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { - Layout = DL.getStructLayout(EltSTy); - } else { - Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); - ArrayEltBitOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); - } - - Value *ResultVal = - Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits)); - - for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { - // Load the value from the alloca. If the NewElt is an aggregate, cast - // the pointer to an integer of the same size before doing the load. - Value *SrcField = NewElts[i]; - Type *FieldTy = - cast<PointerType>(SrcField->getType())->getElementType(); - uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); - - // Ignore zero sized fields like {}, they obviously contain no data. - if (FieldSizeBits == 0) continue; - - IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), - FieldSizeBits); - if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() && - !FieldTy->isVectorTy()) - SrcField = new BitCastInst(SrcField, - PointerType::getUnqual(FieldIntTy), - "", LI); - SrcField = new LoadInst(SrcField, "sroa.load.elt", LI); - - // If SrcField is a fp or vector of the right size but that isn't an - // integer type, bitcast to an integer so we can shift it. - if (SrcField->getType() != FieldIntTy) - SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI); - - // Zero extend the field to be the same size as the final alloca so that - // we can shift and insert it. - if (SrcField->getType() != ResultVal->getType()) - SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI); - - // Determine the number of bits to shift SrcField. - uint64_t Shift; - if (Layout) // Struct case. - Shift = Layout->getElementOffsetInBits(i); - else // Array case. - Shift = i*ArrayEltBitOffset; - - if (DL.isBigEndian()) - Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth(); - - if (Shift) { - Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift); - SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI); - } - - // Don't create an 'or x, 0' on the first iteration. - if (!isa<Constant>(ResultVal) || - !cast<Constant>(ResultVal)->isNullValue()) - ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI); - else - ResultVal = SrcField; - } - - // Handle tail padding by truncating the result - if (DL.getTypeSizeInBits(LI->getType()) != AllocaSizeBits) - ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI); - - LI->replaceAllUsesWith(ResultVal); - DeadInsts.push_back(LI); -} - -/// HasPadding - Return true if the specified type has any structure or -/// alignment padding in between the elements that would be split apart -/// by SROA; return false otherwise. -static bool HasPadding(Type *Ty, const DataLayout &DL) { - if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { - Ty = ATy->getElementType(); - return DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty); - } - - // SROA currently handles only Arrays and Structs. 
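(Aside, not part of the patch: the struct case that follows amounts to scanning the field offsets for gaps, roughly as in this standalone sketch; field positions and sizes are in bits and the names are illustrative.)

#include <cassert>
#include <cstdint>
#include <vector>

struct Field { uint64_t OffsetBits, SizeBits; };

// True if consecutive fields leave a gap between them or the last field ends
// before the struct does (tail padding), essentially the checks made below.
bool hasPadding(const std::vector<Field> &Fields, uint64_t StructSizeBits) {
  uint64_t PrevEnd = 0;
  for (const Field &F : Fields) {
    if (PrevEnd < F.OffsetBits)
      return true;                          // gap before this field
    PrevEnd = F.OffsetBits + F.SizeBits;
  }
  return !Fields.empty() && PrevEnd < StructSizeBits;  // tail padding
}

int main() {
  // { i8, i32 } with the i32 aligned to 32 bits: bits 8..31 are padding.
  assert(hasPadding({{0, 8}, {32, 32}}, 64));
  // { i32, i32 }: tightly packed, no padding.
  assert(!hasPadding({{0, 32}, {32, 32}}, 64));
  return 0;
}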
- StructType *STy = cast<StructType>(Ty); - const StructLayout *SL = DL.getStructLayout(STy); - unsigned PrevFieldBitOffset = 0; - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - unsigned FieldBitOffset = SL->getElementOffsetInBits(i); - - // Check to see if there is any padding between this element and the - // previous one. - if (i) { - unsigned PrevFieldEnd = - PrevFieldBitOffset+DL.getTypeSizeInBits(STy->getElementType(i-1)); - if (PrevFieldEnd < FieldBitOffset) - return true; - } - PrevFieldBitOffset = FieldBitOffset; - } - // Check for tail padding. - if (unsigned EltCount = STy->getNumElements()) { - unsigned PrevFieldEnd = PrevFieldBitOffset + - DL.getTypeSizeInBits(STy->getElementType(EltCount-1)); - if (PrevFieldEnd < SL->getSizeInBits()) - return true; - } - return false; -} - -/// isSafeStructAllocaToScalarRepl - Check to see if the specified allocation of -/// an aggregate can be broken down into elements. Return 0 if not, 3 if safe, -/// or 1 if safe after canonicalization has been performed. -bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { - // Loop over the use list of the alloca. We can only transform it if all of - // the users are safe to transform. - AllocaInfo Info(AI); - - isSafeForScalarRepl(AI, 0, Info); - if (Info.isUnsafe) { - DEBUG(dbgs() << "Cannot transform: " << *AI << '\n'); - return false; - } - - const DataLayout &DL = AI->getModule()->getDataLayout(); - - // Okay, we know all the users are promotable. If the aggregate is a memcpy - // source and destination, we have to be careful. In particular, the memcpy - // could be moving around elements that live in structure padding of the LLVM - // types, but may actually be used. In these cases, we refuse to promote the - // struct. - if (Info.isMemCpySrc && Info.isMemCpyDst && - HasPadding(AI->getAllocatedType(), DL)) - return false; - - // If the alloca never has an access to just *part* of it, but is accessed - // via loads and stores, then we should use ConvertToScalarInfo to promote - // the alloca instead of promoting each piece at a time and inserting fission - // and fusion code. - if (!Info.hasSubelementAccess && Info.hasALoadOrStore) { - // If the struct/array just has one element, use basic SRoA. 
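The removed HasPadding reports padding either between consecutive struct fields or after the last field (tail padding); that is what blocks promotion when the alloca is both a memcpy source and destination. The same test on plain offset/size arrays, as an illustrative sketch (the arrays stand in for StructLayout and DataLayout queries):

    #include <cstdint>
    #include <vector>

    bool hasPaddingBits(const std::vector<uint64_t> &FieldOffsetBits,
                        const std::vector<uint64_t> &FieldSizeBits,
                        uint64_t StructSizeBits) {
      uint64_t PrevFieldEnd = 0;
      for (size_t i = 0, e = FieldOffsetBits.size(); i != e; ++i) {
        // A gap between the end of the previous field and the start of this one.
        if (i && PrevFieldEnd < FieldOffsetBits[i])
          return true;
        PrevFieldEnd = FieldOffsetBits[i] + FieldSizeBits[i];
      }
      // Tail padding: the last field ends before the struct does.
      return !FieldOffsetBits.empty() && PrevFieldEnd < StructSizeBits;
    }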
- if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { - if (ST->getNumElements() > 1) return false; - } else { - if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1) - return false; - } - } - - return true; -} diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp index 054bacdc706ba..aed4a4ad4d26a 100644 --- a/lib/Transforms/Scalar/Scalarizer.cpp +++ b/lib/Transforms/Scalar/Scalarizer.cpp @@ -14,12 +14,11 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -253,6 +252,8 @@ bool Scalarizer::doInitialization(Module &M) { } bool Scalarizer::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; assert(Gathered.empty() && Scattered.empty()); for (BasicBlock &BB : F) { for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { @@ -305,7 +306,11 @@ void Scalarizer::gather(Instruction *Op, const ValueVector &CV) { ValueVector &SV = Scattered[Op]; if (!SV.empty()) { for (unsigned I = 0, E = SV.size(); I != E; ++I) { - Instruction *Old = cast<Instruction>(SV[I]); + Value *V = SV[I]; + if (V == nullptr) + continue; + + Instruction *Old = cast<Instruction>(V); CV[I]->takeName(Old); Old->replaceAllUsesWith(CV[I]); Old->eraseFromParent(); @@ -334,13 +339,11 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) { Op->getAllMetadataOtherThanDebugLoc(MDs); for (unsigned I = 0, E = CV.size(); I != E; ++I) { if (Instruction *New = dyn_cast<Instruction>(CV[I])) { - for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator - MI = MDs.begin(), - ME = MDs.end(); - MI != ME; ++MI) - if (canTransferMetadata(MI->first)) - New->setMetadata(MI->first, MI->second); - New->setDebugLoc(Op->getDebugLoc()); + for (const auto &MD : MDs) + if (canTransferMetadata(MD.first)) + New->setMetadata(MD.first, MD.second); + if (Op->getDebugLoc() && !New->getDebugLoc()) + New->setDebugLoc(Op->getDebugLoc()); } } } @@ -646,10 +649,9 @@ bool Scalarizer::finish() { // made to the Function. if (Gathered.empty() && Scattered.empty()) return false; - for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end(); - GMI != GME; ++GMI) { - Instruction *Op = GMI->first; - ValueVector &CV = *GMI->second; + for (const auto &GMI : Gathered) { + Instruction *Op = GMI.first; + ValueVector &CV = *GMI.second; if (!Op->use_empty()) { // The value is still needed, so recreate it using a series of // InsertElements. diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 86a10d2a16122..d6ae186698c7a 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -590,9 +590,9 @@ Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { distributeExtsAndCloneChain(UserChain.size() - 1); // Remove all nullptrs (used to be s/zext) from UserChain. 
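The SeparateConstOffsetFromGEP hunk continuing below rewrites the UserChain cleanup with a range-based for; the underlying idiom is a single-pass, order-preserving compaction with a write cursor. A generic form for illustration (template and names are mine, not the pass code):

    #include <vector>

    template <typename T>
    void compactNonNull(std::vector<T *> &Vec) {
      unsigned NewSize = 0;
      for (T *Elem : Vec)
        if (Elem != nullptr)
          Vec[NewSize++] = Elem;   // copy down, preserving relative order
      Vec.resize(NewSize);
    }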
unsigned NewSize = 0; - for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) { - if (*I != nullptr) { - UserChain[NewSize] = *I; + for (User *I : UserChain) { + if (I != nullptr) { + UserChain[NewSize] = I; NewSize++; } } @@ -824,8 +824,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( // If we created a GEP with constant index, and the base is loop invariant, // then we swap the first one with it, so LICM can move constant GEP out // later. - GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult); - GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr); + GetElementPtrInst *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult); + GetElementPtrInst *SecondGEP = dyn_cast_or_null<GetElementPtrInst>(ResultPtr); if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L)) swapGEPOperand(FirstGEP, SecondGEP); @@ -911,7 +911,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *GEP->getParent()->getParent()); unsigned AddrSpace = GEP->getPointerAddressSpace(); - if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), + if (!TTI.isLegalAddressingMode(GEP->getResultElementType(), /*BaseGV=*/nullptr, AccumulativeByteOffset, /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace)) { @@ -1018,7 +1018,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // unsigned.. Therefore, we cast ElementTypeSizeOfGEP to signed because it is // used with unsigned integers later. int64_t ElementTypeSizeOfGEP = static_cast<int64_t>( - DL->getTypeAllocSize(GEP->getType()->getElementType())); + DL->getTypeAllocSize(GEP->getResultElementType())); Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { // Very likely. 
As long as %gep is natually aligned, the byte offset we @@ -1064,7 +1064,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { } bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) + if (skipFunction(F)) return false; if (DisableSeparateConstOffsetFromGEP) @@ -1075,8 +1075,8 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; - for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) + for (BasicBlock &B : F) { + for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;) if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) Changed |= splitGEP(GEP); // No need to split GEP ConstantExprs because all its indices are constant @@ -1162,8 +1162,8 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { } void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { - for (auto &B : F) { - for (auto &I : B) { + for (BasicBlock &B : F) { + for (Instruction &I : B) { if (isInstructionTriviallyDead(&I)) { std::string ErrMessage; raw_string_ostream RSO(ErrMessage); diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 63c8836bf3810..2d0a21d2c518a 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -21,12 +21,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" @@ -37,8 +37,10 @@ #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Utils/Local.h" +#include <utility> using namespace llvm; #define DEBUG_TYPE "simplifycfg" @@ -131,12 +133,19 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, unsigned BonusInstThreshold) { bool Changed = false; bool LocalChange = true; + + SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges; + FindFunctionBackedges(F, Edges); + SmallPtrSet<BasicBlock *, 16> LoopHeaders; + for (unsigned i = 0, e = Edges.size(); i != e; ++i) + LoopHeaders.insert(const_cast<BasicBlock *>(Edges[i].second)); + while (LocalChange) { LocalChange = false; // Loop over all of the basic blocks and remove them if they are unneeded. 
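In the SimplifyCFGPass change above, iterativelySimplifyCFG now collects the destinations of function back edges into LoopHeaders so SimplifyCFG can avoid folding away canonical loop headers. A self-contained sketch of what a back-edge scan in the spirit of FindFunctionBackedges computes, using a plain adjacency-list digraph (hypothetical helper, not the LLVM implementation):

    #include <set>
    #include <vector>

    // DFS; an edge whose target is still on the DFS stack is a back edge, and
    // its destination is recorded as a loop header.
    static void visit(unsigned Node,
                      const std::vector<std::vector<unsigned>> &Succ,
                      std::vector<int> &State, std::set<unsigned> &Headers) {
      State[Node] = 1;                        // on the DFS stack
      for (unsigned S : Succ[Node]) {
        if (State[S] == 1)
          Headers.insert(S);                  // Node -> S is a back edge
        else if (State[S] == 0)
          visit(S, Succ, State, Headers);
      }
      State[Node] = 2;                        // finished
    }

    std::set<unsigned>
    findLoopHeaders(const std::vector<std::vector<unsigned>> &Succ,
                    unsigned Entry) {
      std::vector<int> State(Succ.size(), 0);
      std::set<unsigned> Headers;
      visit(Entry, Succ, State, Headers);
      return Headers;
    }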
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC)) { + if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders)) { LocalChange = true; ++NumSimpl; } @@ -178,14 +187,15 @@ SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold) : BonusInstThreshold(BonusInstThreshold) {} PreservedAnalyses SimplifyCFGPass::run(Function &F, - AnalysisManager<Function> *AM) { - auto &TTI = AM->getResult<TargetIRAnalysis>(F); - auto &AC = AM->getResult<AssumptionAnalysis>(F); + AnalysisManager<Function> &AM) { + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold)) - return PreservedAnalyses::none(); - - return PreservedAnalyses::all(); + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + return PA; } namespace { @@ -196,15 +206,12 @@ struct CFGSimplifyPass : public FunctionPass { CFGSimplifyPass(int T = -1, std::function<bool(const Function &)> Ftor = nullptr) - : FunctionPass(ID), PredicateFtor(Ftor) { + : FunctionPass(ID), PredicateFtor(std::move(Ftor)) { BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T); initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override { - if (PredicateFtor && !PredicateFtor(F)) - return false; - - if (skipOptnoneFunction(F)) + if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F))) return false; AssumptionCache *AC = @@ -234,6 +241,5 @@ INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, FunctionPass * llvm::createCFGSimplificationPass(int Threshold, std::function<bool(const Function &)> Ftor) { - return new CFGSimplifyPass(Threshold, Ftor); + return new CFGSimplifyPass(Threshold, std::move(Ftor)); } - diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 64109b2df1173..d9a296c631221 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/Sink.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -24,6 +24,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "sink" @@ -31,50 +32,10 @@ using namespace llvm; STATISTIC(NumSunk, "Number of instructions sunk"); STATISTIC(NumSinkIter, "Number of sinking iterations"); -namespace { - class Sinking : public FunctionPass { - DominatorTree *DT; - LoopInfo *LI; - AliasAnalysis *AA; - - public: - static char ID; // Pass identification - Sinking() : FunctionPass(ID) { - initializeSinkingPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - FunctionPass::getAnalysisUsage(AU); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - } - private: - bool ProcessBlock(BasicBlock &BB); - bool SinkInstruction(Instruction *I, SmallPtrSetImpl<Instruction*> &Stores); - bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const; - bool 
IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const; - }; -} // end anonymous namespace - -char Sinking::ID = 0; -INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) - -FunctionPass *llvm::createSinkingPass() { return new Sinking(); } - /// AllUsesDominatedByBlock - Return true if all uses of the specified value /// occur in blocks dominated by the specified block. -bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, - BasicBlock *BB) const { +static bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB, + DominatorTree &DT) { // Ignoring debug uses is necessary so debug info doesn't affect the code. // This may leave a referencing dbg_value in the original block, before // the definition of the vreg. Dwarf generator handles this although the @@ -90,71 +51,13 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, UseBlock = PN->getIncomingBlock(Num); } // Check that it dominates. - if (!DT->dominates(BB, UseBlock)) + if (!DT.dominates(BB, UseBlock)) return false; } return true; } -bool Sinking::runOnFunction(Function &F) { - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - - bool MadeChange, EverMadeChange = false; - - do { - MadeChange = false; - DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n"); - // Process all basic blocks. - for (Function::iterator I = F.begin(), E = F.end(); - I != E; ++I) - MadeChange |= ProcessBlock(*I); - EverMadeChange |= MadeChange; - NumSinkIter++; - } while (MadeChange); - - return EverMadeChange; -} - -bool Sinking::ProcessBlock(BasicBlock &BB) { - // Can't sink anything out of a block that has less than two successors. - if (BB.getTerminator()->getNumSuccessors() <= 1) return false; - - // Don't bother sinking code out of unreachable blocks. In addition to being - // unprofitable, it can also lead to infinite looping, because in an - // unreachable loop there may be nowhere to stop. - if (!DT->isReachableFromEntry(&BB)) return false; - - bool MadeChange = false; - - // Walk the basic block bottom-up. Remember if we saw a store. - BasicBlock::iterator I = BB.end(); - --I; - bool ProcessedBegin = false; - SmallPtrSet<Instruction *, 8> Stores; - do { - Instruction *Inst = &*I; // The instruction to sink. - - // Predecrement I (if it's not begin) so that it isn't invalidated by - // sinking. - ProcessedBegin = I == BB.begin(); - if (!ProcessedBegin) - --I; - - if (isa<DbgInfoIntrinsic>(Inst)) - continue; - - if (SinkInstruction(Inst, Stores)) - ++NumSunk, MadeChange = true; - - // If we just processed the first instruction in the block, we're done. 
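Both the removed Sinking::ProcessBlock above and the static ProcessBlock re-added later in this file walk each block bottom-up and pre-decrement the iterator before acting on the current instruction, so that sinking (which moves the instruction) cannot invalidate the traversal. The idiom stripped down to a std::list, purely for illustration:

    #include <cstdio>
    #include <list>

    void walkBottomUp(std::list<int> &Block) {
      if (Block.empty())
        return;
      std::list<int>::iterator I = Block.end();
      --I;
      bool ProcessedBegin = false;
      do {
        std::list<int>::iterator Current = I;  // the element to "sink"
        ProcessedBegin = (I == Block.begin());
        if (!ProcessedBegin)
          --I;                                 // step away before touching Current
        std::printf("visiting %d\n", *Current);
        // *Current may now be moved or erased without breaking the walk.
      } while (!ProcessedBegin);
    }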
- } while (!ProcessedBegin); - - return MadeChange; -} - -static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, +static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA, SmallPtrSetImpl<Instruction *> &Stores) { if (Inst->mayWriteToMemory()) { @@ -165,7 +68,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { MemoryLocation Loc = MemoryLocation::get(L); for (Instruction *S : Stores) - if (AA->getModRefInfo(S, Loc) & MRI_Mod) + if (AA.getModRefInfo(S, Loc) & MRI_Mod) return false; } @@ -173,11 +76,15 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, Inst->mayThrow()) return false; - // Convergent operations cannot be made control-dependent on additional - // values. if (auto CS = CallSite(Inst)) { + // Convergent operations cannot be made control-dependent on additional + // values. if (CS.hasFnAttr(Attribute::Convergent)) return false; + + for (Instruction *S : Stores) + if (AA.getModRefInfo(S, CS) & MRI_Mod) + return false; } return true; @@ -185,8 +92,8 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, /// IsAcceptableTarget - Return true if it is possible to sink the instruction /// in the specified basic block. -bool Sinking::IsAcceptableTarget(Instruction *Inst, - BasicBlock *SuccToSinkTo) const { +static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo, + DominatorTree &DT, LoopInfo &LI) { assert(Inst && "Instruction to be sunk is null"); assert(SuccToSinkTo && "Candidate sink target is null"); @@ -212,25 +119,26 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, // We don't want to sink across a critical edge if we don't dominate the // successor. We could be introducing calculations to new code paths. - if (!DT->dominates(Inst->getParent(), SuccToSinkTo)) + if (!DT.dominates(Inst->getParent(), SuccToSinkTo)) return false; // Don't sink instructions into a loop. - Loop *succ = LI->getLoopFor(SuccToSinkTo); - Loop *cur = LI->getLoopFor(Inst->getParent()); + Loop *succ = LI.getLoopFor(SuccToSinkTo); + Loop *cur = LI.getLoopFor(Inst->getParent()); if (succ != nullptr && succ != cur) return false; } // Finally, check that all the uses of the instruction are actually // dominated by the candidate - return AllUsesDominatedByBlock(Inst, SuccToSinkTo); + return AllUsesDominatedByBlock(Inst, SuccToSinkTo, DT); } /// SinkInstruction - Determine whether it is safe to sink the specified machine /// instruction out of its current block into a successor. -bool Sinking::SinkInstruction(Instruction *Inst, - SmallPtrSetImpl<Instruction *> &Stores) { +static bool SinkInstruction(Instruction *Inst, + SmallPtrSetImpl<Instruction *> &Stores, + DominatorTree &DT, LoopInfo &LI, AAResults &AA) { // Don't sink static alloca instructions. CodeGen assumes allocas outside the // entry block are dynamically sized stack objects. @@ -257,12 +165,12 @@ bool Sinking::SinkInstruction(Instruction *Inst, // Instructions can only be sunk if all their uses are in blocks // dominated by one of the successors. // Look at all the postdominators and see if we can sink it in one. 
- DomTreeNode *DTN = DT->getNode(Inst->getParent()); + DomTreeNode *DTN = DT.getNode(Inst->getParent()); for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end(); I != E && SuccToSinkTo == nullptr; ++I) { BasicBlock *Candidate = (*I)->getBlock(); if ((*I)->getIDom()->getBlock() == Inst->getParent() && - IsAcceptableTarget(Inst, Candidate)) + IsAcceptableTarget(Inst, Candidate, DT, LI)) SuccToSinkTo = Candidate; } @@ -270,7 +178,7 @@ bool Sinking::SinkInstruction(Instruction *Inst, // decide which one we should sink to, if any. for (succ_iterator I = succ_begin(Inst->getParent()), E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) { - if (IsAcceptableTarget(Inst, *I)) + if (IsAcceptableTarget(Inst, *I, DT, LI)) SuccToSinkTo = *I; } @@ -288,3 +196,111 @@ bool Sinking::SinkInstruction(Instruction *Inst, Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt()); return true; } + +static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI, + AAResults &AA) { + // Can't sink anything out of a block that has less than two successors. + if (BB.getTerminator()->getNumSuccessors() <= 1) return false; + + // Don't bother sinking code out of unreachable blocks. In addition to being + // unprofitable, it can also lead to infinite looping, because in an + // unreachable loop there may be nowhere to stop. + if (!DT.isReachableFromEntry(&BB)) return false; + + bool MadeChange = false; + + // Walk the basic block bottom-up. Remember if we saw a store. + BasicBlock::iterator I = BB.end(); + --I; + bool ProcessedBegin = false; + SmallPtrSet<Instruction *, 8> Stores; + do { + Instruction *Inst = &*I; // The instruction to sink. + + // Predecrement I (if it's not begin) so that it isn't invalidated by + // sinking. + ProcessedBegin = I == BB.begin(); + if (!ProcessedBegin) + --I; + + if (isa<DbgInfoIntrinsic>(Inst)) + continue; + + if (SinkInstruction(Inst, Stores, DT, LI, AA)) { + ++NumSunk; + MadeChange = true; + } + + // If we just processed the first instruction in the block, we're done. + } while (!ProcessedBegin); + + return MadeChange; +} + +static bool iterativelySinkInstructions(Function &F, DominatorTree &DT, + LoopInfo &LI, AAResults &AA) { + bool MadeChange, EverMadeChange = false; + + do { + MadeChange = false; + DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n"); + // Process all basic blocks. 
+ for (BasicBlock &I : F) + MadeChange |= ProcessBlock(I, DT, LI, AA); + EverMadeChange |= MadeChange; + NumSinkIter++; + } while (MadeChange); + + return EverMadeChange; +} + +PreservedAnalyses SinkingPass::run(Function &F, AnalysisManager<Function> &AM) { + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &AA = AM.getResult<AAManager>(F); + + if (!iterativelySinkInstructions(F, DT, LI, AA)) + return PreservedAnalyses::all(); + + auto PA = PreservedAnalyses(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + +namespace { + class SinkingLegacyPass : public FunctionPass { + public: + static char ID; // Pass identification + SinkingLegacyPass() : FunctionPass(ID) { + initializeSinkingLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + + return iterativelySinkInstructions(F, DT, LI, AA); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + } + }; +} // end anonymous namespace + +char SinkingLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false) + +FunctionPass *llvm::createSinkingPass() { return new SinkingLegacyPass(); } diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp index 147d615488ffe..9bf2d62068194 100644 --- a/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -50,9 +50,19 @@ // aggressive speculation while counting on later passes to either capitalize on // that or clean it up. // +// If the pass was created by calling +// createSpeculativeExecutionIfHasBranchDivergencePass or the +// -spec-exec-only-if-divergent-target option is present, this pass only has an +// effect on targets where TargetTransformInfo::hasBranchDivergence() is true; +// on other targets, it is a nop. +// +// This lets you include this pass unconditionally in the IR pass pipeline, but +// only enable it for relevant targets. 
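The header comment added above describes a pass that is either always active or active only when the target reports branch divergence. A generic sketch of that gating pattern with illustrative names (the real pass additionally ORs the constructor flag with the -spec-exec-only-if-divergent-target option, as the next hunk shows):

    struct DivergenceGatedTransform {
      explicit DivergenceGatedTransform(bool OnlyIfDivergentTarget)
          : OnlyIfDivergentTarget(OnlyIfDivergentTarget) {}

      // Returns true if anything was changed.
      bool run(bool TargetHasBranchDivergence) {
        if (OnlyIfDivergentTarget && !TargetHasBranchDivergence)
          return false;                // a no-op on uniform-control-flow targets
        return doTransform();
      }

      bool doTransform() { return false; /* real work elided */ }

      const bool OnlyIfDivergentTarget;
    };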
+// //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Instructions.h" @@ -83,19 +93,39 @@ static cl::opt<unsigned> SpecExecMaxNotHoisted( "number of instructions that would not be speculatively executed " "exceeds this limit.")); +static cl::opt<bool> SpecExecOnlyIfDivergentTarget( + "spec-exec-only-if-divergent-target", cl::init(false), cl::Hidden, + cl::desc("Speculative execution is applied only to targets with divergent " + "branches, even if the pass was configured to apply only to all " + "targets.")); + namespace { + class SpeculativeExecution : public FunctionPass { public: - static char ID; - SpeculativeExecution(): FunctionPass(ID) {} + static char ID; + explicit SpeculativeExecution(bool OnlyIfDivergentTarget = false) + : FunctionPass(ID), + OnlyIfDivergentTarget(OnlyIfDivergentTarget || + SpecExecOnlyIfDivergentTarget) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnFunction(Function &F) override; + const char *getPassName() const override { + if (OnlyIfDivergentTarget) + return "Speculatively execute instructions if target has divergent " + "branches"; + return "Speculatively execute instructions"; + } private: bool runOnBasicBlock(BasicBlock &B); bool considerHoistingFromTo(BasicBlock &FromBlock, BasicBlock &ToBlock); + // If true, this pass is a nop unless the target architecture has branch + // divergence. + const bool OnlyIfDivergentTarget; const TargetTransformInfo *TTI = nullptr; }; } // namespace @@ -105,17 +135,23 @@ INITIALIZE_PASS_BEGIN(SpeculativeExecution, "speculative-execution", "Speculatively execute instructions", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(SpeculativeExecution, "speculative-execution", - "Speculatively execute instructions", false, false) + "Speculatively execute instructions", false, false) void SpeculativeExecution::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } bool SpeculativeExecution::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) + if (skipFunction(F)) return false; TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) { + DEBUG(dbgs() << "Not running SpeculativeExecution because " + "TTI->hasBranchDivergence() is false.\n"); + return false; + } bool Changed = false; for (auto& B : F) { @@ -240,4 +276,8 @@ FunctionPass *createSpeculativeExecutionPass() { return new SpeculativeExecution(); } +FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() { + return new SpeculativeExecution(/* OnlyIfDivergentTarget = */ true); +} + } // namespace llvm diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 1faa65eb34175..292d0400a516b 100644 --- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -57,8 +57,6 @@ // SLSR. 
#include <vector> -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/FoldingSet.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -76,6 +74,8 @@ using namespace PatternMatch; namespace { +static const unsigned UnknownAddressSpace = ~0u; + class StraightLineStrengthReduce : public FunctionPass { public: // SLSR candidate. Such a candidate must be in one of the forms described in @@ -234,51 +234,22 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, Basis.CandidateKind == C.CandidateKind); } -// TODO: use TTI->getGEPCost. static bool isGEPFoldable(GetElementPtrInst *GEP, - const TargetTransformInfo *TTI, - const DataLayout *DL) { - GlobalVariable *BaseGV = nullptr; - int64_t BaseOffset = 0; - bool HasBaseReg = false; - int64_t Scale = 0; - - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand())) - BaseGV = GV; - else - HasBaseReg = true; - - gep_type_iterator GTI = gep_type_begin(GEP); - for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) { - if (isa<SequentialType>(*GTI)) { - int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); - if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) { - BaseOffset += ConstIdx->getSExtValue() * ElementSize; - } else { - // Needs scale register. - if (Scale != 0) { - // No addressing mode takes two scale registers. - return false; - } - Scale = ElementSize; - } - } else { - StructType *STy = cast<StructType>(*GTI); - uint64_t Field = cast<ConstantInt>(*I)->getZExtValue(); - BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field); - } - } - - unsigned AddrSpace = GEP->getPointerAddressSpace(); - return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV, - BaseOffset, HasBaseReg, Scale, AddrSpace); + const TargetTransformInfo *TTI) { + SmallVector<const Value*, 4> Indices; + for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) + Indices.push_back(*I); + return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(), + Indices) == TargetTransformInfo::TCC_Free; } // Returns whether (Base + Index * Stride) can be folded to an addressing mode. static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride, TargetTransformInfo *TTI) { - return TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true, - Index->getSExtValue()); + // Index->getSExtValue() may crash if Index is wider than 64-bit. 
+ return Index->getBitWidth() <= 64 && + TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true, + Index->getSExtValue(), UnknownAddressSpace); } bool StraightLineStrengthReduce::isFoldable(const Candidate &C, @@ -287,7 +258,7 @@ bool StraightLineStrengthReduce::isFoldable(const Candidate &C, if (C.CandidateKind == Candidate::Add) return isAddFoldable(C.Base, C.Index, C.Stride, TTI); if (C.CandidateKind == Candidate::GEP) - return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI, DL); + return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI); return false; } @@ -533,13 +504,23 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( IndexExprs, GEP->isInBounds()); Value *ArrayIdx = GEP->getOperand(I); uint64_t ElementSize = DL->getTypeAllocSize(*GTI); - factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP); + if (ArrayIdx->getType()->getIntegerBitWidth() <= + DL->getPointerSizeInBits(GEP->getAddressSpace())) { + // Skip factoring if ArrayIdx is wider than the pointer size, because + // ArrayIdx is implicitly truncated to the pointer size. + factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP); + } // When ArrayIdx is the sext of a value, we try to factor that value as // well. Handling this case is important because array indices are // typically sign-extended to the pointer size. Value *TruncatedArrayIdx = nullptr; - if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx)))) + if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))) && + TruncatedArrayIdx->getType()->getIntegerBitWidth() <= + DL->getPointerSizeInBits(GEP->getAddressSpace())) { + // Skip factoring if TruncatedArrayIdx is wider than the pointer size, + // because TruncatedArrayIdx is implicitly truncated to the pointer size. factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP); + } IndexExprs[I - 1] = OrigIndexExpr; } @@ -567,10 +548,10 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis, APInt ElementSize( IndexOffset.getBitWidth(), DL->getTypeAllocSize( - cast<GetElementPtrInst>(Basis.Ins)->getType()->getElementType())); + cast<GetElementPtrInst>(Basis.Ins)->getResultElementType())); APInt Q, R; APInt::sdivrem(IndexOffset, ElementSize, Q, R); - if (R.getSExtValue() == 0) + if (R == 0) IndexOffset = Q; else BumpWithUglyGEP = true; @@ -578,10 +559,10 @@ Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis, // Compute Bump = C - Basis = (i' - i) * S. // Common case 1: if (i' - i) is 1, Bump = S. - if (IndexOffset.getSExtValue() == 1) + if (IndexOffset == 1) return C.Stride; // Common case 2: if (i' - i) is -1, Bump = -S. - if (IndexOffset.getSExtValue() == -1) + if (IndexOffset.isAllOnesValue()) return Builder.CreateNeg(C.Stride); // Otherwise, Bump = (i' - i) * sext/trunc(S). 
Note that (i' - i) and S may @@ -685,7 +666,7 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis( } bool StraightLineStrengthReduce::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) + if (skipFunction(F)) return false; TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index 662513c7d8ae0..e9ac39beae5a7 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" @@ -161,6 +162,9 @@ public: /// consist of a network of PHI nodes where the true incoming values expresses /// breaks and the false values expresses continue states. class StructurizeCFG : public RegionPass { + bool SkipUniformRegions; + DivergenceAnalysis *DA; + Type *Boolean; ConstantInt *BoolTrue; ConstantInt *BoolFalse; @@ -232,11 +236,18 @@ class StructurizeCFG : public RegionPass { void rebuildSSA(); + bool hasOnlyUniformBranches(const Region *R); + public: static char ID; StructurizeCFG() : - RegionPass(ID) { + RegionPass(ID), SkipUniformRegions(false) { + initializeStructurizeCFGPass(*PassRegistry::getPassRegistry()); + } + + StructurizeCFG(bool SkipUniformRegions) : + RegionPass(ID), SkipUniformRegions(SkipUniformRegions) { initializeStructurizeCFGPass(*PassRegistry::getPassRegistry()); } @@ -250,6 +261,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + if (SkipUniformRegions) + AU.addRequired<DivergenceAnalysis>(); AU.addRequiredID(LowerSwitchID); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); @@ -264,6 +277,7 @@ char StructurizeCFG::ID = 0; INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_DEPENDENCY(LowerSwitch) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) @@ -297,11 +311,7 @@ void StructurizeCFG::orderNodes() { for (RegionNode *RN : TempOrder) { BasicBlock *BB = RN->getEntry(); Loop *Loop = LI->getLoopFor(BB); - if (!LoopBlocks.count(Loop)) { - LoopBlocks[Loop] = 1; - continue; - } - LoopBlocks[Loop]++; + ++LoopBlocks[Loop]; } unsigned CurrentLoopDepth = 0; @@ -319,11 +329,11 @@ void StructurizeCFG::orderNodes() { // the outer loop. 
RNVector::iterator LoopI = I; - while(LoopBlocks[CurrentLoop]) { + while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) { LoopI++; BasicBlock *LoopBB = (*LoopI)->getEntry(); if (LI->getLoopFor(LoopBB) == CurrentLoop) { - LoopBlocks[CurrentLoop]--; + --BlockCount; Order.push_back(*LoopI); } } @@ -367,14 +377,8 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { /// \brief Invert the given condition Value *StructurizeCFG::invert(Value *Condition) { // First: Check if it's a constant - if (Condition == BoolTrue) - return BoolFalse; - - if (Condition == BoolFalse) - return BoolTrue; - - if (Condition == BoolUndef) - return BoolUndef; + if (Constant *C = dyn_cast<Constant>(Condition)) + return ConstantExpr::getNot(C); // Second: If the condition is already inverted, return the original value if (match(Condition, m_Not(m_Value(Condition)))) @@ -491,21 +495,21 @@ void StructurizeCFG::collectInfos() { // Reset the visited nodes Visited.clear(); - for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); - OI != OE; ++OI) { + for (RegionNode *RN : reverse(Order)) { - DEBUG(dbgs() << "Visiting: " << - ((*OI)->isSubRegion() ? "SubRegion with entry: " : "") << - (*OI)->getEntry()->getName() << " Loop Depth: " << LI->getLoopDepth((*OI)->getEntry()) << "\n"); + DEBUG(dbgs() << "Visiting: " + << (RN->isSubRegion() ? "SubRegion with entry: " : "") + << RN->getEntry()->getName() << " Loop Depth: " + << LI->getLoopDepth(RN->getEntry()) << "\n"); // Analyze all the conditions leading to a node - gatherPredicates(*OI); + gatherPredicates(RN); // Remember that we've seen this node - Visited.insert((*OI)->getEntry()); + Visited.insert(RN->getEntry()); // Find the last back edges - analyzeLoops(*OI); + analyzeLoops(RN); } } @@ -584,20 +588,18 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { /// \brief Add the real PHI value as soon as everything is set up void StructurizeCFG::setPhiValues() { SSAUpdater Updater; - for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end(); - AI != AE; ++AI) { + for (const auto &AddedPhi : AddedPhis) { - BasicBlock *To = AI->first; - BBVector &From = AI->second; + BasicBlock *To = AddedPhi.first; + const BBVector &From = AddedPhi.second; if (!DeletedPhis.count(To)) continue; PhiMap &Map = DeletedPhis[To]; - for (PhiMap::iterator PI = Map.begin(), PE = Map.end(); - PI != PE; ++PI) { + for (const auto &PI : Map) { - PHINode *Phi = PI->first; + PHINode *Phi = PI.first; Value *Undef = UndefValue::get(Phi->getType()); Updater.Initialize(Phi->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); @@ -605,22 +607,20 @@ void StructurizeCFG::setPhiValues() { NearestCommonDominator Dominator(DT); Dominator.addBlock(To, false); - for (BBValueVector::iterator VI = PI->second.begin(), - VE = PI->second.end(); VI != VE; ++VI) { + for (const auto &VI : PI.second) { - Updater.AddAvailableValue(VI->first, VI->second); - Dominator.addBlock(VI->first); + Updater.AddAvailableValue(VI.first, VI.second); + Dominator.addBlock(VI.first); } if (!Dominator.wasResultExplicitMentioned()) Updater.AddAvailableValue(Dominator.getResult(), Undef); - for (BBVector::iterator FI = From.begin(), FE = From.end(); - FI != FE; ++FI) { + for (BasicBlock *FI : From) { - int Idx = Phi->getBasicBlockIndex(*FI); + int Idx = Phi->getBasicBlockIndex(FI); assert(Idx != -1); - Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI)); + Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(FI)); } } @@ -914,11 +914,48 @@ void 
StructurizeCFG::rebuildSSA() { } } +bool StructurizeCFG::hasOnlyUniformBranches(const Region *R) { + for (const BasicBlock *BB : R->blocks()) { + const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator()); + if (!Br || !Br->isConditional()) + continue; + + if (!DA->isUniform(Br->getCondition())) + return false; + DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n"); + } + return true; +} + /// \brief Run the transformation for each region found bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { if (R->isTopLevelRegion()) return false; + if (SkipUniformRegions) { + DA = &getAnalysis<DivergenceAnalysis>(); + // TODO: We could probably be smarter here with how we handle sub-regions. + if (hasOnlyUniformBranches(R)) { + DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n'); + + // Mark all direct child block terminators as having been treated as + // uniform. To account for a possible future in which non-uniform + // sub-regions are treated more cleverly, indirect children are not + // marked as uniform. + MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {}); + Region::element_iterator E = R->element_end(); + for (Region::element_iterator I = R->element_begin(); I != E; ++I) { + if (I->isSubRegion()) + continue; + + if (Instruction *Term = I->getEntry()->getTerminator()) + Term->setMetadata("structurizecfg.uniform", MD); + } + + return false; + } + } + Func = R->getEntry()->getParent(); ParentRegion = R; @@ -947,7 +984,6 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { return true; } -/// \brief Create the pass -Pass *llvm::createStructurizeCFGPass() { - return new StructurizeCFG(); +Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) { + return new StructurizeCFG(SkipUniformRegions); } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 4e84d72ae7bdd..d5ff997503703 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -50,6 +50,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/TailRecursionElimination.h" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -85,64 +86,9 @@ STATISTIC(NumEliminated, "Number of tail calls removed"); STATISTIC(NumRetDuped, "Number of return duplicated"); STATISTIC(NumAccumAdded, "Number of accumulators introduced"); -namespace { - struct TailCallElim : public FunctionPass { - const TargetTransformInfo *TTI; - - static char ID; // Pass identification, replacement for typeid - TailCallElim() : FunctionPass(ID) { - initializeTailCallElimPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override; - - bool runOnFunction(Function &F) override; - - private: - bool runTRE(Function &F); - bool markTails(Function &F, bool &AllCallsAreTailCalls); - - CallInst *FindTRECandidate(Instruction *I, - bool CannotTailCallElimCallsMarkedTail); - bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, - BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail); - bool FoldReturnAndProcessPred(BasicBlock *BB, - ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail); - bool 
ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail); - bool CanMoveAboveCall(Instruction *I, CallInst *CI); - Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI); - }; -} - -char TailCallElim::ID = 0; -INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", - "Tail Call Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(TailCallElim, "tailcallelim", - "Tail Call Elimination", false, false) - -// Public interface to the TailCallElimination pass -FunctionPass *llvm::createTailCallEliminationPass() { - return new TailCallElim(); -} - -void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); -} - /// \brief Scan the specified function for alloca instructions. /// If it contains any dynamic allocas, returns false. -static bool CanTRE(Function &F) { +static bool canTRE(Function &F) { // Because of PR962, we don't TRE dynamic allocas. for (auto &BB : F) { for (auto &I : BB) { @@ -156,20 +102,6 @@ static bool CanTRE(Function &F) { return true; } -bool TailCallElim::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true") - return false; - - bool AllCallsAreTailCalls = false; - bool Modified = markTails(F, AllCallsAreTailCalls); - if (AllCallsAreTailCalls) - Modified |= runTRE(F); - return Modified; -} - namespace { struct AllocaDerivedValueTracker { // Start at a root value and walk its use-def chain to mark calls that use the @@ -250,7 +182,7 @@ struct AllocaDerivedValueTracker { }; } -bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { +static bool markTails(Function &F, bool &AllCallsAreTailCalls) { if (F.callsFunctionThatReturnsTwice()) return false; AllCallsAreTailCalls = true; @@ -385,63 +317,11 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { return Modified; } -bool TailCallElim::runTRE(Function &F) { - // If this function is a varargs function, we won't be able to PHI the args - // right, so don't even try to convert it... - if (F.getFunctionType()->isVarArg()) return false; - - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - BasicBlock *OldEntry = nullptr; - bool TailCallsAreMarkedTail = false; - SmallVector<PHINode*, 8> ArgumentPHIs; - bool MadeChange = false; - - // If false, we cannot perform TRE on tail calls marked with the 'tail' - // attribute, because doing so would cause the stack size to increase (real - // TRE would deallocate variable sized allocas, TRE doesn't). - bool CanTRETailMarkedCall = CanTRE(F); - - // Change any tail recursive calls to loops. - // - // FIXME: The code generator produces really bad code when an 'escaping - // alloca' is changed from being a static alloca to being a dynamic alloca. - // Until this is resolved, disable this transformation if that would ever - // happen. This bug is PR962. - for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { - BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB. 
- if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { - bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall); - if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, - TailCallsAreMarkedTail, ArgumentPHIs, - !CanTRETailMarkedCall); - MadeChange |= Change; - } - } - - // If we eliminated any tail recursions, it's possible that we inserted some - // silly PHI nodes which just merge an initial value (the incoming operand) - // with themselves. Check to see if we did and clean up our mess if so. This - // occurs when a function passes an argument straight through to its tail - // call. - for (PHINode *PN : ArgumentPHIs) { - // If the PHI Node is a dynamic constant, replace it with the value it is. - if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) { - PN->replaceAllUsesWith(PNV); - PN->eraseFromParent(); - } - } - - return MadeChange; -} - - /// Return true if it is safe to move the specified /// instruction from after the call to before the call, assuming that all /// instructions between the call and this instruction are movable. /// -bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { +static bool canMoveAboveCall(Instruction *I, CallInst *CI) { // FIXME: We can move load/store/call/free instructions above the call if the // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. @@ -454,9 +334,10 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { // does not write to memory and the load provably won't trap. // FIXME: Writes to memory only matter if they may alias the pointer // being loaded from. + const DataLayout &DL = L->getModule()->getDataLayout(); if (CI->mayWriteToMemory() || - !isSafeToLoadUnconditionally(L->getPointerOperand(), L, - L->getAlignment())) + !isSafeToLoadUnconditionally(L->getPointerOperand(), + L->getAlignment(), DL, L)) return false; } } @@ -512,8 +393,8 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { Function *F = CI->getParent()->getParent(); Value *ReturnedValue = nullptr; - for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) { - ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()); + for (BasicBlock &BBI : *F) { + ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator()); if (RI == nullptr || RI == IgnoreRI) continue; // We can only perform this transformation if the value returned is @@ -534,8 +415,7 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { /// If the specified instruction can be transformed using accumulator recursion /// elimination, return the constant which is the start of the accumulator /// value. Otherwise return null. 
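The renamed canTransformAccumulatorRecursion helper defined just below recognizes an associative, commutative operation applied to the recursive call's result; such a function can still be turned into a loop by carrying the pending work in an accumulator. A conceptual before/after in source-level C++ (the pass performs the equivalent rewrite on IR):

    // Not a tail call as written: the multiply happens after the recursion.
    int facRecursive(int N) {
      if (N <= 1)
        return 1;
      return N * facRecursive(N - 1);
    }

    // After accumulator recursion elimination, conceptually.
    int facAccumulator(int N) {
      int Acc = 1;            // the accumulator's start value
      for (; N > 1; --N)
        Acc *= N;
      return Acc;
    }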
-Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, - CallInst *CI) { +static Value *canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) { if (!I->isAssociative() || !I->isCommutative()) return nullptr; assert(I->getNumOperands() == 2 && "Associative/commutative operations should have 2 args!"); @@ -555,15 +435,15 @@ Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI); } -static Instruction *FirstNonDbg(BasicBlock::iterator I) { +static Instruction *firstNonDbg(BasicBlock::iterator I) { while (isa<DbgInfoIntrinsic>(I)) ++I; return &*I; } -CallInst* -TailCallElim::FindTRECandidate(Instruction *TI, - bool CannotTailCallElimCallsMarkedTail) { +static CallInst *findTRECandidate(Instruction *TI, + bool CannotTailCallElimCallsMarkedTail, + const TargetTransformInfo *TTI) { BasicBlock *BB = TI->getParent(); Function *F = BB->getParent(); @@ -594,8 +474,8 @@ TailCallElim::FindTRECandidate(Instruction *TI, // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. if (BB == &F->getEntryBlock() && - FirstNonDbg(BB->front().getIterator()) == CI && - FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && + firstNonDbg(BB->front().getIterator()) == CI && + firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that // the arguments match. @@ -612,7 +492,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, return CI; } -bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, +static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, @@ -636,14 +516,14 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // Check that this is the case now. BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (CanMoveAboveCall(&*BBI, CI)) continue; + if (canMoveAboveCall(&*BBI, CI)) continue; // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed // using accumulator recursion elimination. Check to see if this is the // case, and if so, remember the initial accumulator value for later. if ((AccumulatorRecursionEliminationInitVal = - CanTransformAccumulatorRecursion(&*BBI, CI))) { + canTransformAccumulatorRecursion(&*BBI, CI))) { // Yes, this is accumulator recursion. Remember which instruction // accumulates. AccumulatorRecursionInstr = &*BBI; @@ -773,8 +653,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // Finally, rewrite any return instructions in the program to return the PHI // node instead of the "initval" that they do currently. This loop will // actually rewrite the return value we are destroying, but that's ok. 
- for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) - if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator())) + for (BasicBlock &BBI : *F) + if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator())) RI->setOperand(0, AccPN); ++NumAccumAdded; } @@ -790,11 +670,12 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, return true; } -bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, - ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail) { +static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVectorImpl<PHINode *> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail, + const TargetTransformInfo *TTI) { bool Change = false; // If the return block contains nothing but the return and PHI's, @@ -813,7 +694,7 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, while (!UncondBranchPreds.empty()) { BranchInst *BI = UncondBranchPreds.pop_back_val(); BasicBlock *Pred = BI->getParent(); - if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){ + if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){ DEBUG(dbgs() << "FOLDING: " << *BB << "INTO UNCOND BRANCH PRED: " << *Pred); ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred); @@ -821,11 +702,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, // Cleanup: if all predecessors of BB have been eliminated by // FoldReturnIntoUncondBranch, delete it. It is important to empty it, // because the ret instruction in there is still using a value which - // EliminateRecursiveTailCall will attempt to remove. + // eliminateRecursiveTailCall will attempt to remove. if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) BB->eraseFromParent(); - EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, + eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, CannotTailCallElimCallsMarkedTail); ++NumRetDuped; @@ -836,16 +717,124 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, return Change; } -bool -TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail) { - CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); +static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVectorImpl<PHINode *> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail, + const TargetTransformInfo *TTI) { + CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI); if (!CI) return false; - return EliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, + return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, CannotTailCallElimCallsMarkedTail); } + +static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) { + if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true") + return false; + + bool MadeChange = false; + bool AllCallsAreTailCalls = false; + MadeChange |= markTails(F, AllCallsAreTailCalls); + if (!AllCallsAreTailCalls) + return MadeChange; + + // If this function is a varargs function, we won't be able to PHI the args + // right, so don't even try to convert it... 
+ if (F.getFunctionType()->isVarArg()) + return false; + + BasicBlock *OldEntry = nullptr; + bool TailCallsAreMarkedTail = false; + SmallVector<PHINode*, 8> ArgumentPHIs; + + // If false, we cannot perform TRE on tail calls marked with the 'tail' + // attribute, because doing so would cause the stack size to increase (real + // TRE would deallocate variable sized allocas, TRE doesn't). + bool CanTRETailMarkedCall = canTRE(F); + + // Change any tail recursive calls to loops. + // + // FIXME: The code generator produces really bad code when an 'escaping + // alloca' is changed from being a static alloca to being a dynamic alloca. + // Until this is resolved, disable this transformation if that would ever + // happen. This bug is PR962. + for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { + BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB. + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = + processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, !CanTRETailMarkedCall, TTI); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = + foldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, !CanTRETailMarkedCall, TTI); + MadeChange |= Change; + } + } + + // If we eliminated any tail recursions, it's possible that we inserted some + // silly PHI nodes which just merge an initial value (the incoming operand) + // with themselves. Check to see if we did and clean up our mess if so. This + // occurs when a function passes an argument straight through to its tail + // call. + for (PHINode *PN : ArgumentPHIs) { + // If the PHI Node is a dynamic constant, replace it with the value it is. + if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) { + PN->replaceAllUsesWith(PNV); + PN->eraseFromParent(); + } + } + + return MadeChange; +} + +namespace { +struct TailCallElim : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + TailCallElim() : FunctionPass(ID) { + initializeTailCallElimPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + return eliminateTailRecursion( + F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F)); + } +}; +} + +char TailCallElim::ID = 0; +INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination", + false, false) + +// Public interface to the TailCallElimination pass +FunctionPass *llvm::createTailCallEliminationPass() { + return new TailCallElim(); +} + +PreservedAnalyses TailCallElimPass::run(Function &F, + FunctionAnalysisManager &AM) { + + TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + + bool Changed = eliminateTailRecursion(F, &TTI); + + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + return PA; +} |
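For reference, the core rewrite that the refactored helpers above implement: a recursive call in tail position becomes a branch back to the old entry block, with argument values carried by PHI nodes (trivial PHIs are cleaned up afterwards by the ArgumentPHIs loop). Expressed as source-level C++ rather than IR, the effect is roughly:

    // Tail-recursive form.
    int sumDownRecursive(int N, int Total) {
      if (N == 0)
        return Total;
      return sumDownRecursive(N - 1, Total + N);   // call in tail position
    }

    // After tail recursion elimination, conceptually.
    int sumDownLoop(int N, int Total) {
      for (;;) {                 // the old entry block, now a loop header
        if (N == 0)
          return Total;
        int NextN = N - 1;       // values the argument PHIs would carry
        int NextTotal = Total + N;
        N = NextN;
        Total = NextTotal;
      }
    }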