Diffstat (limited to 'llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp')
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 206
1 file changed, 137 insertions, 69 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index e9f368628a08..cf02ef1e83f3 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -65,12 +65,14 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ScalarEvolutionNormalization.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -109,6 +111,7 @@
 #include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -807,9 +810,14 @@ static bool isAddressUse(const TargetTransformInfo &TTI,
     switch (II->getIntrinsicID()) {
     case Intrinsic::memset:
     case Intrinsic::prefetch:
+    case Intrinsic::masked_load:
       if (II->getArgOperand(0) == OperandVal)
         isAddress = true;
       break;
+    case Intrinsic::masked_store:
+      if (II->getArgOperand(1) == OperandVal)
+        isAddress = true;
+      break;
     case Intrinsic::memmove:
     case Intrinsic::memcpy:
       if (II->getArgOperand(0) == OperandVal ||
@@ -859,6 +867,15 @@ static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
       AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
       AccessTy.MemTy = OperandVal->getType();
       break;
+    case Intrinsic::masked_load:
+      AccessTy.AddrSpace =
+          II->getArgOperand(0)->getType()->getPointerAddressSpace();
+      break;
+    case Intrinsic::masked_store:
+      AccessTy.MemTy = II->getOperand(0)->getType();
+      AccessTy.AddrSpace =
+          II->getArgOperand(1)->getType()->getPointerAddressSpace();
+      break;
     default: {
       MemIntrinsicInfo IntrInfo;
       if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
@@ -962,33 +979,6 @@ static bool isHighCostExpansion(const SCEV *S,
   return true;
 }
 
-/// If any of the instructions in the specified set are trivially dead, delete
-/// them and see if this makes any of their operands subsequently dead.
-static bool
-DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
-  bool Changed = false;
-
-  while (!DeadInsts.empty()) {
-    Value *V = DeadInsts.pop_back_val();
-    Instruction *I = dyn_cast_or_null<Instruction>(V);
-
-    if (!I || !isInstructionTriviallyDead(I))
-      continue;
-
-    for (Use &O : I->operands())
-      if (Instruction *U = dyn_cast<Instruction>(O)) {
-        O = nullptr;
-        if (U->use_empty())
-          DeadInsts.emplace_back(U);
-      }
-
-    I->eraseFromParent();
-    Changed = true;
-  }
-
-  return Changed;
-}
-
 namespace {
 
 class LSRUse;
@@ -1242,7 +1232,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
       // for now LSR only handles innermost loops).
       if (AR->getLoop() != L) {
         // If the AddRec exists, consider it's register free and leave it alone.
-        if (isExistingPhi(AR, *SE))
+        if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc())
           return;
 
         // It is bad to allow LSR for current loop to add induction variables
@@ -1913,9 +1903,10 @@ class LSRInstance {
   DominatorTree &DT;
   LoopInfo &LI;
   AssumptionCache &AC;
-  TargetLibraryInfo &LibInfo;
+  TargetLibraryInfo &TLI;
   const TargetTransformInfo &TTI;
   Loop *const L;
+  MemorySSAUpdater *MSSAU;
   bool FavorBackedgeIndex = false;
   bool Changed = false;
 
@@ -2018,6 +2009,7 @@ class LSRInstance {
   void NarrowSearchSpaceByCollapsingUnrolledCode();
   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
   void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+  void NarrowSearchSpaceByFilterPostInc();
   void NarrowSearchSpaceByDeletingCostlyFormulas();
   void NarrowSearchSpaceByPickingWinnerRegs();
   void NarrowSearchSpaceUsingHeuristics();
@@ -2053,7 +2045,7 @@ class LSRInstance {
 public:
   LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
               LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
-              TargetLibraryInfo &LibInfo);
+              TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
 
   bool getChanged() const { return Changed; }
 
@@ -2830,9 +2822,10 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
 /// increments can be computed in fewer registers when chained.
 ///
 /// TODO: Consider IVInc free if it's already used in another chains.
-static bool
-isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
-                  ScalarEvolution &SE) {
+static bool isProfitableChain(IVChain &Chain,
+                              SmallPtrSetImpl<Instruction *> &Users,
+                              ScalarEvolution &SE,
+                              const TargetTransformInfo &TTI) {
   if (StressIVChain)
     return true;
 
@@ -2861,7 +2854,14 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
   unsigned NumConstIncrements = 0;
   unsigned NumVarIncrements = 0;
   unsigned NumReusedIncrements = 0;
+
+  if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
+    return true;
+
   for (const IVInc &Inc : Chain) {
+    if (TTI.isProfitableLSRChainElement(Inc.UserInst))
+      return true;
+
     if (Inc.IncExpr->isZero())
       continue;
 
@@ -3092,7 +3092,7 @@ void LSRInstance::CollectChains() {
   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
        UsersIdx < NChains; ++UsersIdx) {
     if (!isProfitableChain(IVChainVec[UsersIdx],
-                           ChainUsersVec[UsersIdx].FarUsers, SE))
+                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
       continue;
     // Preserve the chain at UsesIdx.
     if (ChainIdx != UsersIdx)
@@ -3212,7 +3212,8 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
       IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
     }
     Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
-    DeadInsts.emplace_back(Inc.IVOperand);
+    if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
+      DeadInsts.emplace_back(OperandIsInstr);
   }
   // If LSR created a new, wider phi, we may also replace its postinc. We only
   // do this if we also found a wide value for the head of the chain.
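Note on the isProfitableChain() hunks above: chain profitability is now routed through a new TTI hook, isProfitableLSRChainElement(), and a chain is kept as soon as the chain head or any increment's user instruction satisfies it, before the generic register-counting heuristic runs. Below is a minimal sketch of the predicate a target could supply; it is written as a free function so the example stands alone (in-tree this would be an override in the target's TargetTransformInfo subclass), and the choice of intrinsics is an assumption for illustration, not any in-tree backend's actual heuristic.

#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// Sketch of a target-side profitability predicate. Returning true forces
// LSR to keep the IV chain containing I.
static bool isProfitableLSRChainElement(Instruction *I) {
  if (auto *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
    // Assumption: on this hypothetical target these lower to
    // post-increment friendly memory instructions.
    case Intrinsic::masked_load:
    case Intrinsic::masked_store:
      return true;
    default:
      break;
    }
  }
  return false;
}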
@@ -3240,7 +3241,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
 
 void LSRInstance::CollectFixupsAndInitialFormulae() {
   BranchInst *ExitBranch = nullptr;
-  bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &LibInfo);
+  bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
 
   for (const IVStrideUse &U : IU) {
     Instruction *UserInst = U.getUser();
@@ -3553,9 +3554,6 @@ static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
   const SCEV *LoopStep = AR->getStepRecurrence(SE);
   if (!isa<SCEVConstant>(LoopStep))
     return false;
-  if (LU.AccessTy.getType()->getScalarSizeInBits() !=
-      LoopStep->getType()->getScalarSizeInBits())
-    return false;
   // Check if a post-indexed load/store can be used.
   if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
       TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
@@ -4673,6 +4671,54 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
   });
 }
 
+/// If we are over the complexity limit, filter out any post-inc preferring
+/// variables to only post-inc values.
+void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
+  if (!TTI.shouldFavorPostInc())
+    return;
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+
+  LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
+                       "Narrowing the search space by choosing the lowest "
+                       "register Formula for PostInc Uses.\n");
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+
+    if (LU.Kind != LSRUse::Address)
+      continue;
+    if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
+        !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
+      continue;
+
+    size_t MinRegs = std::numeric_limits<size_t>::max();
+    for (const Formula &F : LU.Formulae)
+      MinRegs = std::min(F.getNumRegs(), MinRegs);
+
+    bool Any = false;
+    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+         ++FIdx) {
+      Formula &F = LU.Formulae[FIdx];
+      if (F.getNumRegs() > MinRegs) {
+        LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
+                   dbgs() << "\n");
+        LU.DeleteFormula(F);
+        --FIdx;
+        --NumForms;
+        Any = true;
+      }
+    }
+    if (Any)
+      LU.RecomputeRegs(LUIdx, RegUses);
+
+    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+      break;
+  }
+
+  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
 /// The function delete formulas with high registers number expectation.
 /// Assuming we don't know the value of each formula (already delete
 /// all inefficient), generate probability of not selecting for each
@@ -4883,6 +4929,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
   if (FilterSameScaledReg)
     NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+  NarrowSearchSpaceByFilterPostInc();
   if (LSRExpNarrow)
     NarrowSearchSpaceByDeletingCostlyFormulas();
   else
@@ -4923,19 +4970,24 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
     // Ignore formulae which may not be ideal in terms of register reuse of
     // ReqRegs.  The formula should use all required registers before
     // introducing new ones.
-    int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
-    for (const SCEV *Reg : ReqRegs) {
-      if ((F.ScaledReg && F.ScaledReg == Reg) ||
-          is_contained(F.BaseRegs, Reg)) {
-        --NumReqRegsToFind;
-        if (NumReqRegsToFind == 0)
-          break;
+    // This can sometimes (notably when trying to favour postinc) lead to
+    // sub-optimal decisions, which are best left to the cost modelling to
+    // get correct.
+    if (!TTI.shouldFavorPostInc() || LU.Kind != LSRUse::Address) {
+      int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+      for (const SCEV *Reg : ReqRegs) {
+        if ((F.ScaledReg && F.ScaledReg == Reg) ||
+            is_contained(F.BaseRegs, Reg)) {
+          --NumReqRegsToFind;
+          if (NumReqRegsToFind == 0)
+            break;
+        }
+      }
+      if (NumReqRegsToFind != 0) {
+        // If none of the formulae satisfied the required registers, then we could
+        // clear ReqRegs and try again. Currently, we simply give up in this case.
+        continue;
       }
-    }
-    if (NumReqRegsToFind != 0) {
-      // If none of the formulae satisfied the required registers, then we could
-      // clear ReqRegs and try again. Currently, we simply give up in this case.
-      continue;
     }
 
     // Evaluate the cost of the current formula. If it's already worse than
@@ -5268,7 +5320,8 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
   // form, update the ICmp's other operand.
   if (LU.Kind == LSRUse::ICmpZero) {
     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
-    DeadInsts.emplace_back(CI->getOperand(1));
+    if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
+      DeadInsts.emplace_back(OperandIsInstr);
     assert(!F.BaseGV && "ICmp does not support folding a global value and "
                         "a scale at the same time!");
     if (F.Scale == -1) {
@@ -5449,7 +5502,8 @@ void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
     LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
   }
 
-  DeadInsts.emplace_back(LF.OperandValToReplace);
+  if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
+    DeadInsts.emplace_back(OperandIsInstr);
 }
 
 /// Rewrite all the fixup locations with new values, following the chosen
@@ -5490,16 +5544,17 @@ void LSRInstance::ImplementSolution(
   // instructions.
   Rewriter.clear();
 
-  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
+  Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
+                                                                  &TLI, MSSAU);
 }
 
 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                          DominatorTree &DT, LoopInfo &LI,
                          const TargetTransformInfo &TTI, AssumptionCache &AC,
-                         TargetLibraryInfo &LibInfo)
-    : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), LibInfo(LibInfo), TTI(TTI), L(L),
-      FavorBackedgeIndex(EnableBackedgeIndexing &&
-                         TTI.shouldFavorBackedgeIndex(L)) {
+                         TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
    : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
+      MSSAU(MSSAU), FavorBackedgeIndex(EnableBackedgeIndexing &&
+                                       TTI.shouldFavorBackedgeIndex(L)) {
   // If LoopSimplify form is not available, stay out of trouble.
   if (!L->isLoopSimplifyForm())
     return;
@@ -5702,21 +5757,26 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<IVUsersWrapperPass>();
   AU.addPreserved<IVUsersWrapperPass>();
   AU.addRequired<TargetTransformInfoWrapperPass>();
+  AU.addPreserved<MemorySSAWrapperPass>();
 }
 
 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                                DominatorTree &DT, LoopInfo &LI,
                                const TargetTransformInfo &TTI,
-                               AssumptionCache &AC,
-                               TargetLibraryInfo &LibInfo) {
+                               AssumptionCache &AC, TargetLibraryInfo &TLI,
+                               MemorySSA *MSSA) {
 
   bool Changed = false;
+  std::unique_ptr<MemorySSAUpdater> MSSAU;
+  if (MSSA)
+    MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
 
   // Run the main LSR transformation.
-  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI, AC, LibInfo).getChanged();
+  Changed |=
+      LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
 
   // Remove any extra phis created by processing inner loops.
-  Changed |= DeleteDeadPHIs(L->getHeader());
+  Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
   if (EnablePhiElim && L->isLoopSimplifyForm()) {
     SmallVector<WeakTrackingVH, 16> DeadInsts;
     const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
@@ -5727,8 +5787,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
     unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
     if (numFolded) {
       Changed = true;
-      DeleteTriviallyDeadInstructions(DeadInsts);
-      DeleteDeadPHIs(L->getHeader());
+      RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
+                                                           MSSAU.get());
+      DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
     }
   }
   return Changed;
@@ -5746,19 +5807,26 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
       *L->getHeader()->getParent());
   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
       *L->getHeader()->getParent());
-  auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+  auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
      *L->getHeader()->getParent());
-  return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo);
+  auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+  MemorySSA *MSSA = nullptr;
+  if (MSSAAnalysis)
+    MSSA = &MSSAAnalysis->getMSSA();
+  return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
 }
 
 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
                                               LoopStandardAnalysisResults &AR,
                                               LPMUpdater &) {
   if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
-                          AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI))
+                          AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
     return PreservedAnalyses::all();
 
-  return getLoopPassPreservedAnalyses();
+  auto PA = getLoopPassPreservedAnalyses();
+  if (AR.MSSA)
+    PA.preserve<MemorySSAAnalysis>();
+  return PA;
 }
 
 char LoopStrengthReduce::ID = 0;
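For reference, the isAddressUse()/getAccessType() hunks at the top of this diff rely on the operand layout of the masked intrinsics: llvm.masked.load takes (pointer, alignment, mask, passthru), while llvm.masked.store takes (value, pointer, alignment, mask), which is why the store's pointer is argument 1 and its stored value, used for AccessTy.MemTy, is argument 0. A standalone sketch of the same operand selection follows; the helper name is hypothetical:

#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// Returns the address operand of a masked load/store, mirroring the operand
// indices the patch teaches to isAddressUse() and getAccessType().
static Value *getMaskedPointerOperand(IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  case Intrinsic::masked_load:
    return II->getArgOperand(0); // (ptr, align, mask, passthru)
  case Intrinsic::masked_store:
    return II->getArgOperand(1); // (value, ptr, align, mask)
  default:
    return nullptr;
  }
}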
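The remaining plumbing threads a MemorySSAUpdater through LSR and replaces the pass-local DeleteTriviallyDeadInstructions() with the shared RecursivelyDeleteTriviallyDeadInstructionsPermissive() utility, so that MemorySSA, when available, stays in sync as dead IV computations are erased. A minimal sketch of that usage pattern, with a hypothetical wrapper name:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"
#include <memory>

using namespace llvm;

// Erase queued trivially-dead instructions, updating MemorySSA when present.
// The "permissive" variant skips entries that are null, not instructions, or
// no longer trivially dead, so callers may queue candidates speculatively.
static bool cleanUpDeadInsts(SmallVectorImpl<WeakTrackingVH> &DeadInsts,
                             const TargetLibraryInfo *TLI, MemorySSA *MSSA) {
  std::unique_ptr<MemorySSAUpdater> MSSAU;
  if (MSSA)
    MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  return RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, TLI,
                                                              MSSAU.get());
}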