author     Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
commit     e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree       599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Transforms/Scalar/LoopStrengthReduce.cpp
parent     1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Transforms/Scalar/LoopStrengthReduce.cpp')
-rw-r--r--   lib/Transforms/Scalar/LoopStrengthReduce.cpp   344
1 file changed, 242 insertions(+), 102 deletions(-)
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 773ffb9df0a2..59a387a186b8 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1,9 +1,8 @@
 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -116,6 +115,7 @@
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <numeric>
 #include <map>
 #include <utility>
 
@@ -155,11 +155,19 @@ static cl::opt<bool> FilterSameScaledReg(
     cl::desc("Narrow LSR search space by filtering non-optimal formulae"
              " with the same ScaledReg and Scale"));
 
+static cl::opt<bool> EnableBackedgeIndexing(
+  "lsr-backedge-indexing", cl::Hidden, cl::init(true),
+  cl::desc("Enable the generation of cross iteration indexed memops"));
+
 static cl::opt<unsigned> ComplexityLimit(
   "lsr-complexity-limit", cl::Hidden,
   cl::init(std::numeric_limits<uint16_t>::max()),
   cl::desc("LSR search space complexity limit"));
 
+static cl::opt<unsigned> SetupCostDepthLimit(
+    "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
+    cl::desc("The limit on recursion depth for LSRs setup cost"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -1007,10 +1015,15 @@ namespace {
 
 /// This class is used to measure and compare candidate formulae.
 class Cost {
+  const Loop *L = nullptr;
+  ScalarEvolution *SE = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
   TargetTransformInfo::LSRCost C;
 
 public:
-  Cost() {
+  Cost() = delete;
+  Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI) :
+    L(L), SE(&SE), TTI(&TTI) {
     C.Insns = 0;
     C.NumRegs = 0;
     C.AddRecCost = 0;
@@ -1021,7 +1034,7 @@ public:
     C.ScaleCost = 0;
   }
 
-  bool isLess(Cost &Other, const TargetTransformInfo &TTI);
+  bool isLess(Cost &Other);
 
   void Lose();
 
@@ -1040,12 +1053,9 @@ public:
     return C.NumRegs == ~0u;
   }
 
-  void RateFormula(const TargetTransformInfo &TTI,
-                   const Formula &F,
+  void RateFormula(const Formula &F,
                    SmallPtrSetImpl<const SCEV *> &Regs,
                    const DenseSet<const SCEV *> &VisitedRegs,
-                   const Loop *L,
-                   ScalarEvolution &SE, DominatorTree &DT,
                    const LSRUse &LU,
                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
 
@@ -1053,17 +1063,11 @@ public:
   void dump() const;
 
 private:
-  void RateRegister(const SCEV *Reg,
-                    SmallPtrSetImpl<const SCEV *> &Regs,
-                    const Loop *L,
-                    ScalarEvolution &SE, DominatorTree &DT,
-                    const TargetTransformInfo &TTI);
-  void RatePrimaryRegister(const SCEV *Reg,
+  void RateRegister(const Formula &F, const SCEV *Reg,
+                    SmallPtrSetImpl<const SCEV *> &Regs);
+  void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                     SmallPtrSetImpl<const SCEV *> &Regs,
-                    const Loop *L,
-                    ScalarEvolution &SE, DominatorTree &DT,
-                    SmallPtrSetImpl<const SCEV *> *LoserRegs,
-                    const TargetTransformInfo &TTI);
+                    SmallPtrSetImpl<const SCEV *> *LoserRegs);
 };
 
 /// An operand value in an instruction which is to be replaced with some
@@ -1208,19 +1212,36 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  bool HasBaseReg, int64_t Scale,
                                  Instruction *Fixup = nullptr);
 
+static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
+  if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
+    return 1;
+  if (Depth == 0)
+    return 0;
+  if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
+    return getSetupCost(S->getStart(), Depth - 1);
+  if (auto S = dyn_cast<SCEVCastExpr>(Reg))
+    return getSetupCost(S->getOperand(), Depth - 1);
+  if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
+    return std::accumulate(S->op_begin(), S->op_end(), 0,
+                           [&](unsigned i, const SCEV *Reg) {
+                             return i + getSetupCost(Reg, Depth - 1);
+                           });
+  if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
+    return getSetupCost(S->getLHS(), Depth - 1) +
+           getSetupCost(S->getRHS(), Depth - 1);
+  return 0;
+}
+
 /// Tally up interesting quantities from the given register.
-void Cost::RateRegister(const SCEV *Reg,
-                        SmallPtrSetImpl<const SCEV *> &Regs,
-                        const Loop *L,
-                        ScalarEvolution &SE, DominatorTree &DT,
-                        const TargetTransformInfo &TTI) {
+void Cost::RateRegister(const Formula &F, const SCEV *Reg,
+                        SmallPtrSetImpl<const SCEV *> &Regs) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
     // If this is an addrec for another loop, it should be an invariant
     // with respect to L since L is the innermost loop (at least
     // for now LSR only handles innermost loops).
     if (AR->getLoop() != L) {
       // If the AddRec exists, consider it's register free and leave it alone.
-      if (isExistingPhi(AR, SE))
+      if (isExistingPhi(AR, *SE))
         return;
 
       // It is bad to allow LSR for current loop to add induction variables
@@ -1236,16 +1257,24 @@ void Cost::RateRegister(const SCEV *Reg,
     }
 
     unsigned LoopCost = 1;
-    if (TTI.shouldFavorPostInc()) {
-      const SCEV *LoopStep = AR->getStepRecurrence(SE);
-      if (isa<SCEVConstant>(LoopStep)) {
-        // Check if a post-indexed load/store can be used.
-        if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
-            TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+    if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
+        TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
+
+      // If the step size matches the base offset, we could use pre-indexed
+      // addressing.
+      if (TTI->shouldFavorBackedgeIndex(L)) {
+        if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
+          if (Step->getAPInt() == F.BaseOffset)
+            LoopCost = 0;
+      }
+
+      if (TTI->shouldFavorPostInc()) {
+        const SCEV *LoopStep = AR->getStepRecurrence(*SE);
+        if (isa<SCEVConstant>(LoopStep)) {
           const SCEV *LoopStart = AR->getStart();
           if (!isa<SCEVConstant>(LoopStart) &&
-              SE.isLoopInvariant(LoopStart, L))
-            LoopCost = 0;
+              SE->isLoopInvariant(LoopStart, L))
+            LoopCost = 0;
         }
       }
     }
@@ -1255,7 +1284,7 @@ void Cost::RateRegister(const SCEV *Reg,
     // TODO: The non-affine case isn't precisely modeled here.
     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
       if (!Regs.count(AR->getOperand(1))) {
-        RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
+        RateRegister(F, AR->getOperand(1), Regs);
         if (isLoser())
           return;
       }
@@ -1265,43 +1294,34 @@ void Cost::RateRegister(const SCEV *Reg,
 
   // Rough heuristic; favor registers which don't require extra setup
   // instructions in the preheader.
-  if (!isa<SCEVUnknown>(Reg) &&
-      !isa<SCEVConstant>(Reg) &&
-      !(isa<SCEVAddRecExpr>(Reg) &&
-        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
-         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
-    ++C.SetupCost;
+  C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
+  // Ensure we don't, even with the recusion limit, produce invalid costs.
+  C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
 
   C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
-               SE.hasComputableLoopEvolution(Reg, L);
+                 SE->hasComputableLoopEvolution(Reg, L);
 }
 
 /// Record this register in the set. If we haven't seen it before, rate
 /// it. Optional LoserRegs provides a way to declare any formula that refers to
 /// one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const SCEV *Reg,
+void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                                SmallPtrSetImpl<const SCEV *> &Regs,
-                               const Loop *L,
-                               ScalarEvolution &SE, DominatorTree &DT,
-                               SmallPtrSetImpl<const SCEV *> *LoserRegs,
-                               const TargetTransformInfo &TTI) {
+                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   if (LoserRegs && LoserRegs->count(Reg)) {
     Lose();
     return;
   }
   if (Regs.insert(Reg).second) {
-    RateRegister(Reg, Regs, L, SE, DT, TTI);
+    RateRegister(F, Reg, Regs);
    if (LoserRegs && isLoser())
       LoserRegs->insert(Reg);
   }
 }
 
-void Cost::RateFormula(const TargetTransformInfo &TTI,
-                       const Formula &F,
+void Cost::RateFormula(const Formula &F,
                        SmallPtrSetImpl<const SCEV *> &Regs,
                        const DenseSet<const SCEV *> &VisitedRegs,
-                       const Loop *L,
-                       ScalarEvolution &SE, DominatorTree &DT,
                        const LSRUse &LU,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
@@ -1314,7 +1334,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       Lose();
       return;
     }
-    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
+    RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
     if (isLoser())
       return;
   }
@@ -1323,7 +1343,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       Lose();
       return;
     }
-    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
+    RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
     if (isLoser())
       return;
   }
@@ -1334,11 +1354,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
   // Do not count the base and a possible second register if the target
   // allows to fold 2 registers.
   C.NumBaseAdds +=
-      NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+      NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
   C.NumBaseAdds += (F.UnfoldedOffset != 0);
 
   // Accumulate non-free scaling amounts.
-  C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
+  C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L);
 
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
@@ -1353,7 +1373,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
     // Check with target if this offset with this instruction is
     // specifically not supported.
     if (LU.Kind == LSRUse::Address && Offset != 0 &&
-        !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+        !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                               Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
       C.NumBaseAdds++;
   }
@@ -1366,7 +1386,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
 
     // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
     // additional instruction (at least fill).
-    unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+    unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1;
     if (C.NumRegs > TTIRegNum) {
       // Cost already exceeded TTIRegNum, then only newly added register can add
       // new instructions.
@@ -1386,7 +1406,8 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
     //
     // For {-10, +, 1}:
     // i = i + 1;
-    if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
+    if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
+        !TTI->canMacroFuseCmp())
       C.Insns++;
     // Each new AddRec adds 1 instruction to calculation.
     C.Insns += (C.AddRecCost - PrevAddRecCost);
@@ -1410,11 +1431,11 @@ void Cost::Lose() {
 }
 
 /// Choose the lower cost.
-bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
+bool Cost::isLess(Cost &Other) {
   if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
       C.Insns != Other.C.Insns)
     return C.Insns < Other.C.Insns;
-  return TTI.isLSRCostLess(C, Other.C);
+  return TTI->isLSRCostLess(C, Other.C);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1888,8 +1909,11 @@ class LSRInstance {
   ScalarEvolution &SE;
   DominatorTree &DT;
   LoopInfo &LI;
+  AssumptionCache &AC;
+  TargetLibraryInfo &LibInfo;
   const TargetTransformInfo &TTI;
   Loop *const L;
+  bool FavorBackedgeIndex = false;
   bool Changed = false;
 
   /// This is the insert position that the current loop's induction variable
@@ -1910,7 +1934,7 @@ class LSRInstance {
   SmallSetVector<Type *, 4> Types;
 
   /// The list of interesting uses.
-  SmallVector<LSRUse, 16> Uses;
+  mutable SmallVector<LSRUse, 16> Uses;
 
   /// Track which uses use which register candidates.
   RegUseTracker RegUses;
@@ -2025,7 +2049,8 @@ class LSRInstance {
 
 public:
   LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
-              LoopInfo &LI, const TargetTransformInfo &TTI);
+              LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
+              TargetLibraryInfo &LibInfo);
 
   bool getChanged() const { return Changed; }
 
@@ -2804,7 +2829,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
 /// TODO: Consider IVInc free if it's already used in another chains.
 static bool isProfitableChain(IVChain &Chain,
                               SmallPtrSetImpl<Instruction*> &Users,
-                              ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+                              ScalarEvolution &SE) {
   if (StressIVChain)
     return true;
 
@@ -3064,7 +3089,7 @@ void LSRInstance::CollectChains() {
   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
        UsersIdx < NChains; ++UsersIdx) {
     if (!isProfitableChain(IVChainVec[UsersIdx],
-                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+                           ChainUsersVec[UsersIdx].FarUsers, SE))
       continue;
     // Preserve the chain at UsesIdx.
     if (ChainIdx != UsersIdx)
@@ -3078,7 +3103,7 @@ void LSRInstance::CollectChains() {
 void LSRInstance::FinalizeChain(IVChain &Chain) {
   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
   LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
-  
+
   for (const IVInc &Inc : Chain) {
     LLVM_DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
     auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
@@ -3100,7 +3125,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
   MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
   int64_t IncOffset = IncConst->getValue()->getSExtValue();
   if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
-                        IncOffset, /*HaseBaseReg=*/false))
+                        IncOffset, /*HasBaseReg=*/false))
     return false;
 
   return true;
@@ -3210,6 +3235,9 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
 }
 
 void LSRInstance::CollectFixupsAndInitialFormulae() {
+  BranchInst *ExitBranch = nullptr;
+  bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &LibInfo);
+
   for (const IVStrideUse &U : IU) {
     Instruction *UserInst = U.getUser();
     // Skip IV users that are part of profitable IV Chains.
@@ -3239,6 +3267,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
       // equality icmps, thanks to IndVarSimplify.
       if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
         if (CI->isEquality()) {
+          // If CI can be saved in some target, like replaced inside hardware loop
+          // in PowerPC, no need to generate initial formulae for it.
+          if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
+            continue;
           // Swap the operands if needed to put the OperandValToReplace on the
           // left, for consistency.
           Value *NV = CI->getOperand(1);
@@ -3738,10 +3770,11 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
 void LSRInstance::GenerateConstantOffsetsImpl(
     LSRUse &LU, unsigned LUIdx, const Formula &Base,
     const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
-  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
-  for (int64_t Offset : Worklist) {
+
+  auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
     Formula F = Base;
     F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+
     if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
                    LU.AccessTy, F)) {
       // Add the offset to the base register.
@@ -3761,7 +3794,35 @@ void LSRInstance::GenerateConstantOffsetsImpl(
 
       (void)InsertFormula(LU, LUIdx, F);
     }
+  };
+
+  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+  // With constant offsets and constant steps, we can generate pre-inc
+  // accesses by having the offset equal the step. So, for access #0 with a
+  // step of 8, we generate a G - 8 base which would require the first access
+  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
+  // for itself and hopefully becomes the base for other accesses. This means
+  // means that a single pre-indexed access can be generated to become the new
+  // base pointer for each iteration of the loop, resulting in no extra add/sub
+  // instructions for pointer updating.
+  if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
+    if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
+      if (auto *StepRec =
+          dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
+        const APInt &StepInt = StepRec->getAPInt();
+        int64_t Step = StepInt.isNegative() ?
+          StepInt.getSExtValue() : StepInt.getZExtValue();
+
+        for (int64_t Offset : Worklist) {
+          Offset -= Step;
+          GenerateOffset(G, Offset);
+        }
+      }
+    }
   }
+  for (int64_t Offset : Worklist)
+    GenerateOffset(G, Offset);
 
   int64_t Imm = ExtractImmediate(G, SE);
   if (G->isZero() || Imm == 0)
@@ -3968,9 +4029,27 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
     if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
       Formula F = Base;
 
-      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
-      for (const SCEV *&BaseReg : F.BaseRegs)
-        BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+      // Sometimes SCEV is able to prove zero during ext transform. It may
+      // happen if SCEV did not do all possible transforms while creating the
+      // initial node (maybe due to depth limitations), but it can do them while
+      // taking ext.
+      if (F.ScaledReg) {
+        const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
+        if (NewScaledReg->isZero())
+          continue;
+        F.ScaledReg = NewScaledReg;
+      }
+      bool HasZeroBaseReg = false;
+      for (const SCEV *&BaseReg : F.BaseRegs) {
+        const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+        if (NewBaseReg->isZero()) {
+          HasZeroBaseReg = true;
+          break;
+        }
+        BaseReg = NewBaseReg;
+      }
+      if (HasZeroBaseReg)
+        continue;
 
       // TODO: This assumes we've done basic processing on all uses and
       // have an idea what the register usage is.
@@ -4067,11 +4146,17 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       // Conservatively examine offsets between this orig reg a few selected
       // other orig regs.
+      int64_t First = Imms.begin()->first;
+      int64_t Last = std::prev(Imms.end())->first;
+      // Compute (First + Last) / 2 without overflow using the fact that
+      // First + Last = 2 * (First & Last) + (First ^ Last).
+      int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
+      // If the result is negative and First is odd and Last even (or vice versa),
+      // we rounded towards -inf. Add 1 in that case, to round towards 0.
+      Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
       ImmMapTy::const_iterator OtherImms[] = {
-          Imms.begin(), std::prev(Imms.end()),
-          Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
-                           2)
-      };
+          Imms.begin(), std::prev(Imms.end()),
+          Imms.lower_bound(Avg)};
       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
         ImmMapTy::const_iterator M = OtherImms[i];
         if (M == J || M == JE) continue;
@@ -4249,9 +4334,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
       // avoids the need to recompute this information across formulae using the
       // same bad AddRec. Passing LoserRegs is also essential unless we remove
      // the corresponding bad register from the Regs set.
-      Cost CostF;
+      Cost CostF(L, SE, TTI);
       Regs.clear();
-      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
+      CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
       if (CostF.isLoser()) {
         // During initial formula generation, undesirable formulae are generated
         // by uses within other loops that have some non-trivial address mode or
@@ -4282,10 +4367,10 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
 
       Formula &Best = LU.Formulae[P.first->second];
 
-      Cost CostBest;
+      Cost CostBest(L, SE, TTI);
       Regs.clear();
-      CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
-      if (CostF.isLess(CostBest, TTI))
+      CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
+      if (CostF.isLess(CostBest))
         std::swap(F, Best);
       LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
                  dbgs() << "\n"
@@ -4357,7 +4442,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
                I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
             Formula NewF = F;
-            NewF.BaseOffset += C->getValue()->getSExtValue();
+            //FIXME: Formulas should store bitwidth to do wrapping properly.
+            // See PR41034.
+            NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
                                 (I - F.BaseRegs.begin()));
             if (LU.HasFormulaWithSameRegs(NewF)) {
@@ -4400,7 +4487,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
 /// When there are many registers for expressions like A, A+1, A+2, etc.,
 /// allocate a single register for them.
 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
-  if (EstimateSearchSpaceComplexity() < ComplexityLimit) 
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
     return;
 
   LLVM_DEBUG(
@@ -4533,12 +4620,13 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
     // If the new register numbers are the same, choose the Formula with
     // less Cost.
-    Cost CostFA, CostFB;
+    Cost CostFA(L, SE, TTI);
+    Cost CostFB(L, SE, TTI);
     Regs.clear();
-    CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+    CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
     Regs.clear();
-    CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
-    return CostFA.isLess(CostFB, TTI);
+    CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
+    return CostFA.isLess(CostFB);
   };
 
   bool Any = false;
@@ -4824,7 +4912,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
       ReqRegs.insert(S);
 
   SmallPtrSet<const SCEV *, 16> NewRegs;
-  Cost NewCost;
+  Cost NewCost(L, SE, TTI);
   for (const Formula &F : LU.Formulae) {
     // Ignore formulae which may not be ideal in terms of register reuse of
     // ReqRegs.  The formula should use all required registers before
@@ -4848,8 +4936,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
     // the current best, prune the search at that point.
     NewCost = CurCost;
     NewRegs = CurRegs;
-    NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
-    if (NewCost.isLess(SolutionCost, TTI)) {
+    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+    if (NewCost.isLess(SolutionCost)) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
@@ -4858,9 +4946,9 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
         VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
       } else {
         LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
-                   dbgs() << ".\n Regs:"; for (const SCEV *S
-                                               : NewRegs) dbgs()
-                                          << ' ' << *S;
+                   dbgs() << ".\nRegs:\n";
+                   for (const SCEV *S : NewRegs) dbgs()
+                      << "- " << *S << "\n";
                    dbgs() << '\n');
 
         SolutionCost = NewCost;
@@ -4875,9 +4963,9 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
 /// vector.
 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
   SmallVector<const Formula *, 8> Workspace;
-  Cost SolutionCost;
+  Cost SolutionCost(L, SE, TTI);
   SolutionCost.Lose();
-  Cost CurCost;
+  Cost CurCost(L, SE, TTI);
   SmallPtrSet<const SCEV *, 16> CurRegs;
   DenseSet<const SCEV *> VisitedRegs;
   Workspace.reserve(Uses.size());
@@ -5215,6 +5303,7 @@ void LSRInstance::RewriteForPHI(
   DenseMap<BasicBlock *, Value *> Inserted;
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+      bool needUpdateFixups = false;
       BasicBlock *BB = PN->getIncomingBlock(i);
 
       // If this is a critical edge, split the edge so that we do not insert
@@ -5233,7 +5322,7 @@ void LSRInstance::RewriteForPHI(
           NewBB = SplitCriticalEdge(BB, Parent,
                                     CriticalEdgeSplittingOptions(&DT, &LI)
                                         .setMergeIdenticalEdges()
-                                        .setDontDeleteUselessPHIs());
+                                        .setKeepOneInputPHIs());
         } else {
           SmallVector<BasicBlock*, 2> NewBBs;
           SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
@@ -5253,6 +5342,8 @@ void LSRInstance::RewriteForPHI(
           e = PN->getNumIncomingValues();
           BB = NewBB;
           i = PN->getBasicBlockIndex(BB);
+
+          needUpdateFixups = true;
         }
       }
 
@@ -5277,6 +5368,44 @@ void LSRInstance::RewriteForPHI(
         PN->setIncomingValue(i, FullV);
         Pair.first->second = FullV;
       }
+
+      // If LSR splits critical edge and phi node has other pending
+      // fixup operands, we need to update those pending fixups. Otherwise
+      // formulae will not be implemented completely and some instructions
+      // will not be eliminated.
+      if (needUpdateFixups) {
+        for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+          for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
+            // If fixup is supposed to rewrite some operand in the phi
+            // that was just updated, it may be already moved to
+            // another phi node. Such fixup requires update.
+            if (Fixup.UserInst == PN) {
+              // Check if the operand we try to replace still exists in the
+              // original phi.
+              bool foundInOriginalPHI = false;
+              for (const auto &val : PN->incoming_values())
+                if (val == Fixup.OperandValToReplace) {
+                  foundInOriginalPHI = true;
+                  break;
+                }
+
+              // If fixup operand found in original PHI - nothing to do.
+              if (foundInOriginalPHI)
+                continue;
+
+              // Otherwise it might be moved to another PHI and requires update.
+              // If fixup operand not found in any of the incoming blocks that
+              // means we have already rewritten it - nothing to do.
+              for (const auto &Block : PN->blocks())
+                for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
+                     ++I) {
+                  PHINode *NewPN = cast<PHINode>(I);
+                  for (const auto &val : NewPN->incoming_values())
+                    if (val == Fixup.OperandValToReplace)
+                      Fixup.UserInst = NewPN;
+                }
+            }
+      }
     }
 }
 
@@ -5360,8 +5489,11 @@ void LSRInstance::ImplementSolution(
 
 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                          DominatorTree &DT, LoopInfo &LI,
-                         const TargetTransformInfo &TTI)
-    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
+                         const TargetTransformInfo &TTI, AssumptionCache &AC,
+                         TargetLibraryInfo &LibInfo)
+    : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), LibInfo(LibInfo), TTI(TTI), L(L),
+      FavorBackedgeIndex(EnableBackedgeIndexing &&
+                         TTI.shouldFavorBackedgeIndex(L)) {
   // If LoopSimplify form is not available, stay out of trouble.
   if (!L->isLoopSimplifyForm())
     return;
@@ -5556,6 +5688,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<DominatorTreeWrapperPass>();
   AU.addRequired<ScalarEvolutionWrapperPass>();
   AU.addPreserved<ScalarEvolutionWrapperPass>();
+  AU.addRequired<AssumptionCacheTracker>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
   // Requiring LoopSimplify a second time here prevents IVUsers from running
   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
   AU.addRequiredID(LoopSimplifyID);
@@ -5566,11 +5700,14 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
 
 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                                DominatorTree &DT, LoopInfo &LI,
-                               const TargetTransformInfo &TTI) {
+                               const TargetTransformInfo &TTI,
+                               AssumptionCache &AC,
+                               TargetLibraryInfo &LibInfo) {
+
   bool Changed = false;
 
   // Run the main LSR transformation.
-  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
+  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI, AC, LibInfo).getChanged();
 
   // Remove any extra phis created by processing inner loops.
   Changed |= DeleteDeadPHIs(L->getHeader());
 
@@ -5601,14 +5738,17 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
       *L->getHeader()->getParent());
-  return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+      *L->getHeader()->getParent());
+  auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo);
 }
 
 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
                                               LoopStandardAnalysisResults &AR,
                                               LPMUpdater &) {
   if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
-                          AR.DT, AR.LI, AR.TTI))
+                          AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI))
     return PreservedAnalyses::all();
 
   return getLoopPassPreservedAnalyses();
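
A note on the overflow-safe midpoint added in GenerateCrossUseConstantOffsets() above: computing (First + Last) / 2 directly can overflow when both immediates sit near the int64 limits, so the patch decomposes the sum into the bits the two values share (counted at full weight) and the bits where they differ (counted at half weight, which the shift cannot overflow). A minimal standalone sketch of the same trick — the function name is ours, not LSR's:

#include <cassert>
#include <cstdint>

// Overflow-free signed midpoint: First + Last == 2 * (First & Last)
// + (First ^ Last), so shared bits contribute at full weight and
// differing bits at half weight via the shift.
int64_t midpoint(int64_t First, int64_t Last) {
  int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
  // The arithmetic shift rounds towards -inf; when the result is negative
  // and the true sum is odd, add 1 to round towards zero instead, matching
  // C++ signed division.
  Avg += (First ^ Last) & ((uint64_t)Avg >> 63);
  return Avg;
}

int main() {
  assert(midpoint(4, 8) == 6);
  assert(midpoint(-7, 2) == -2);                               // rounds towards zero
  assert(midpoint(INT64_MAX, INT64_MAX - 2) == INT64_MAX - 1); // no overflow
}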
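The new getSetupCost() likewise deserves a gloss: it replaces the old one-shot heuristic with a depth-limited walk over the SCEV tree, where leaves (SCEVUnknown/SCEVConstant) cost 1, n-ary nodes fold their operands' costs with std::accumulate, and anything past the recursion budget is optimistically treated as free; RateRegister() then clamps the running total to 1 << 16. A hedged sketch of the same fold over a toy expression type — Expr and setupCost are illustrative stand-ins, not LLVM's SCEV API:

#include <memory>
#include <numeric>
#include <vector>

// Toy stand-in for a SCEV expression: no operands means a leaf.
struct Expr {
  std::vector<std::shared_ptr<Expr>> Ops;
};

// Depth-limited cost fold in the shape of getSetupCost() above: a leaf
// costs one setup instruction in the preheader, an n-ary node accumulates
// its operands' costs, and an exhausted recursion budget is treated as
// free — the caller is expected to clamp the total, as RateRegister() does.
unsigned setupCost(const Expr &E, unsigned Depth) {
  if (E.Ops.empty())
    return 1;
  if (Depth == 0)
    return 0;
  return std::accumulate(E.Ops.begin(), E.Ops.end(), 0u,
                         [&](unsigned Acc, const std::shared_ptr<Expr> &Op) {
                           return Acc + setupCost(*Op, Depth - 1);
                         });
}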
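Finally, the backedge-indexing comment in GenerateConstantOffsetsImpl() is easiest to see on a concrete stride: biasing the base down by one step turns every access into "bump the pointer by the step, then load through it", which targets with pre-indexed addressing (e.g. ARM's ldr r0, [r1, #8]!) fold into a single memop, leaving no separate pointer-update add in the loop. A hypothetical plain-C++ illustration — our names, not LSR output, with the bias kept as an index to stay within defined pointer arithmetic:

#include <cstddef>
#include <cstdint>

int64_t sumStride(const int64_t *G, size_t N) {
  // Base conceptually rewritten to (G - step); the first bump lands on
  // element 0, i.e. the ((G - 8) + 8),+,8 recurrence from the comment above.
  ptrdiff_t Biased = -1;
  int64_t Sum = 0;
  for (size_t I = 0; I != N; ++I) {
    Biased += 1;       // pre-index: the pointer update...
    Sum += G[Biased];  // ...and the access it is folded into
  }
  return Sum;
}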