Diffstat (limited to 'lib/Transforms/Scalar/LoopStrengthReduce.cpp')
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp  344
1 file changed, 242 insertions(+), 102 deletions(-)
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 773ffb9df0a2..59a387a186b8 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1,9 +1,8 @@
//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
@@ -116,6 +115,7 @@
#include <cstdlib>
#include <iterator>
#include <limits>
+#include <numeric>
#include <map>
#include <utility>
@@ -155,11 +155,19 @@ static cl::opt<bool> FilterSameScaledReg(
cl::desc("Narrow LSR search space by filtering non-optimal formulae"
" with the same ScaledReg and Scale"));
+static cl::opt<bool> EnableBackedgeIndexing(
+ "lsr-backedge-indexing", cl::Hidden, cl::init(true),
+ cl::desc("Enable the generation of cross iteration indexed memops"));
+
static cl::opt<unsigned> ComplexityLimit(
"lsr-complexity-limit", cl::Hidden,
cl::init(std::numeric_limits<uint16_t>::max()),
cl::desc("LSR search space complexity limit"));
+static cl::opt<unsigned> SetupCostDepthLimit(
+ "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
+ cl::desc("The limit on recursion depth for LSRs setup cost"));
+
#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
@@ -1007,10 +1015,15 @@ namespace {
/// This class is used to measure and compare candidate formulae.
class Cost {
+ const Loop *L = nullptr;
+ ScalarEvolution *SE = nullptr;
+ const TargetTransformInfo *TTI = nullptr;
TargetTransformInfo::LSRCost C;
public:
- Cost() {
+ Cost() = delete;
+ Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI) :
+ L(L), SE(&SE), TTI(&TTI) {
C.Insns = 0;
C.NumRegs = 0;
C.AddRecCost = 0;
@@ -1021,7 +1034,7 @@ public:
C.ScaleCost = 0;
}
- bool isLess(Cost &Other, const TargetTransformInfo &TTI);
+ bool isLess(Cost &Other);
void Lose();
@@ -1040,12 +1053,9 @@ public:
return C.NumRegs == ~0u;
}
- void RateFormula(const TargetTransformInfo &TTI,
- const Formula &F,
+ void RateFormula(const Formula &F,
SmallPtrSetImpl<const SCEV *> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
- const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
@@ -1053,17 +1063,11 @@ public:
void dump() const;
private:
- void RateRegister(const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs,
- const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT,
- const TargetTransformInfo &TTI);
- void RatePrimaryRegister(const SCEV *Reg,
+ void RateRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs);
+ void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
- const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSetImpl<const SCEV *> *LoserRegs,
- const TargetTransformInfo &TTI);
+ SmallPtrSetImpl<const SCEV *> *LoserRegs);
};
/// An operand value in an instruction which is to be replaced with some
@@ -1208,19 +1212,36 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
bool HasBaseReg, int64_t Scale,
Instruction *Fixup = nullptr);
+static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
+ if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
+ return 1;
+ if (Depth == 0)
+ return 0;
+ if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
+ return getSetupCost(S->getStart(), Depth - 1);
+ if (auto S = dyn_cast<SCEVCastExpr>(Reg))
+ return getSetupCost(S->getOperand(), Depth - 1);
+ if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
+ return std::accumulate(S->op_begin(), S->op_end(), 0,
+ [&](unsigned i, const SCEV *Reg) {
+ return i + getSetupCost(Reg, Depth - 1);
+ });
+ if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
+ return getSetupCost(S->getLHS(), Depth - 1) +
+ getSetupCost(S->getRHS(), Depth - 1);
+ return 0;
+}
+
/// Tally up interesting quantities from the given register.
-void Cost::RateRegister(const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs,
- const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT,
- const TargetTransformInfo &TTI) {
+void Cost::RateRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
// If this is an addrec for another loop, it should be an invariant
// with respect to L since L is the innermost loop (at least
// for now LSR only handles innermost loops).
if (AR->getLoop() != L) {
// If the AddRec exists, consider it's register free and leave it alone.
- if (isExistingPhi(AR, SE))
+ if (isExistingPhi(AR, *SE))
return;
// It is bad to allow LSR for current loop to add induction variables
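
As an aside to the getSetupCost helper added above: the recursion simply counts the SCEVUnknown and SCEVConstant leaves reachable from the expression (following only the start of an add-recurrence, never its step), cutting off at the -lsr-setupcost-depth-limit. A minimal standalone sketch of the same counting scheme over a hypothetical toy expression type, not real SCEV nodes:

#include <numeric>
#include <vector>

// Toy stand-in for SCEV nodes; Leaf plays the role of SCEVUnknown/SCEVConstant.
struct Expr {
  enum Kind { Leaf, Add, AddRec } K;
  std::vector<const Expr *> Ops; // Add: operands; AddRec: {start, step}
};

static unsigned setupCost(const Expr *E, unsigned Depth) {
  if (E->K == Expr::Leaf)
    return 1;                 // one (potential) setup instruction per leaf
  if (Depth == 0)
    return 0;                 // depth limit reached: optimistically free
  if (E->K == Expr::AddRec)
    return setupCost(E->Ops[0], Depth - 1); // only the start contributes
  return std::accumulate(E->Ops.begin(), E->Ops.end(), 0u,
                         [&](unsigned Sum, const Expr *Op) {
                           return Sum + setupCost(Op, Depth - 1);
                         });
}

int main() {
  // {(%a + %b),+,4}: the start (%a + %b) has two leaves, so the cost is 2;
  // the constant step is never visited.
  Expr A{Expr::Leaf, {}}, B{Expr::Leaf, {}}, Step{Expr::Leaf, {}};
  Expr Start{Expr::Add, {&A, &B}};
  Expr Rec{Expr::AddRec, {&Start, &Step}};
  return setupCost(&Rec, /*Depth=*/7) == 2 ? 0 : 1;
}
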
@@ -1236,16 +1257,24 @@ void Cost::RateRegister(const SCEV *Reg,
}
unsigned LoopCost = 1;
- if (TTI.shouldFavorPostInc()) {
- const SCEV *LoopStep = AR->getStepRecurrence(SE);
- if (isa<SCEVConstant>(LoopStep)) {
- // Check if a post-indexed load/store can be used.
- if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
- TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
+ TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
+
+ // If the step size matches the base offset, we could use pre-indexed
+ // addressing.
+ if (TTI->shouldFavorBackedgeIndex(L)) {
+ if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
+ if (Step->getAPInt() == F.BaseOffset)
+ LoopCost = 0;
+ }
+
+ if (TTI->shouldFavorPostInc()) {
+ const SCEV *LoopStep = AR->getStepRecurrence(*SE);
+ if (isa<SCEVConstant>(LoopStep)) {
const SCEV *LoopStart = AR->getStart();
if (!isa<SCEVConstant>(LoopStart) &&
- SE.isLoopInvariant(LoopStart, L))
- LoopCost = 0;
+ SE->isLoopInvariant(LoopStart, L))
+ LoopCost = 0;
}
}
}
@@ -1255,7 +1284,7 @@ void Cost::RateRegister(const SCEV *Reg,
// TODO: The non-affine case isn't precisely modeled here.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
if (!Regs.count(AR->getOperand(1))) {
- RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
+ RateRegister(F, AR->getOperand(1), Regs);
if (isLoser())
return;
}
@@ -1265,43 +1294,34 @@ void Cost::RateRegister(const SCEV *Reg,
// Rough heuristic; favor registers which don't require extra setup
// instructions in the preheader.
- if (!isa<SCEVUnknown>(Reg) &&
- !isa<SCEVConstant>(Reg) &&
- !(isa<SCEVAddRecExpr>(Reg) &&
- (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
- isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
- ++C.SetupCost;
+ C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
+ // Ensure we don't, even with the recursion limit, produce invalid costs.
+ C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
- SE.hasComputableLoopEvolution(Reg, L);
+ SE->hasComputableLoopEvolution(Reg, L);
}
/// Record this register in the set. If we haven't seen it before, rate
/// it. Optional LoserRegs provides a way to declare any formula that refers to
/// one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const SCEV *Reg,
+void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
- const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT,
- SmallPtrSetImpl<const SCEV *> *LoserRegs,
- const TargetTransformInfo &TTI) {
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
if (LoserRegs && LoserRegs->count(Reg)) {
Lose();
return;
}
if (Regs.insert(Reg).second) {
- RateRegister(Reg, Regs, L, SE, DT, TTI);
+ RateRegister(F, Reg, Regs);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
}
-void Cost::RateFormula(const TargetTransformInfo &TTI,
- const Formula &F,
+void Cost::RateFormula(const Formula &F,
SmallPtrSetImpl<const SCEV *> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
- const Loop *L,
- ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
@@ -1314,7 +1334,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
- RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
+ RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
if (isLoser())
return;
}
@@ -1323,7 +1343,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
Lose();
return;
}
- RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
+ RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
if (isLoser())
return;
}
@@ -1334,11 +1354,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
// Do not count the base and a possible second register if the target
// allows to fold 2 registers.
C.NumBaseAdds +=
- NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+ NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
C.NumBaseAdds += (F.UnfoldedOffset != 0);
// Accumulate non-free scaling amounts.
- C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
+ C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L);
// Tally up the non-zero immediates.
for (const LSRFixup &Fixup : LU.Fixups) {
@@ -1353,7 +1373,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
// Check with target if this offset with this instruction is
// specifically not supported.
if (LU.Kind == LSRUse::Address && Offset != 0 &&
- !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
C.NumBaseAdds++;
}
@@ -1366,7 +1386,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
// Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
// additional instruction (at least fill).
- unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+ unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1;
if (C.NumRegs > TTIRegNum) {
// Cost already exceeded TTIRegNum, then only newly added register can add
// new instructions.
@@ -1386,7 +1406,8 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
//
// For {-10, +, 1}:
// i = i + 1;
- if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp())
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
+ !TTI->canMacroFuseCmp())
C.Insns++;
// Each new AddRec adds 1 instruction to calculation.
C.Insns += (C.AddRecCost - PrevAddRecCost);
@@ -1410,11 +1431,11 @@ void Cost::Lose() {
}
/// Choose the lower cost.
-bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
+bool Cost::isLess(Cost &Other) {
if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
C.Insns != Other.C.Insns)
return C.Insns < Other.C.Insns;
- return TTI.isLSRCostLess(C, Other.C);
+ return TTI->isLSRCostLess(C, Other.C);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1888,8 +1909,11 @@ class LSRInstance {
ScalarEvolution &SE;
DominatorTree &DT;
LoopInfo &LI;
+ AssumptionCache &AC;
+ TargetLibraryInfo &LibInfo;
const TargetTransformInfo &TTI;
Loop *const L;
+ bool FavorBackedgeIndex = false;
bool Changed = false;
/// This is the insert position that the current loop's induction variable
@@ -1910,7 +1934,7 @@ class LSRInstance {
SmallSetVector<Type *, 4> Types;
/// The list of interesting uses.
- SmallVector<LSRUse, 16> Uses;
+ mutable SmallVector<LSRUse, 16> Uses;
/// Track which uses use which register candidates.
RegUseTracker RegUses;
@@ -2025,7 +2049,8 @@ class LSRInstance {
public:
LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
- LoopInfo &LI, const TargetTransformInfo &TTI);
+ LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
+ TargetLibraryInfo &LibInfo);
bool getChanged() const { return Changed; }
@@ -2804,7 +2829,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
/// TODO: Consider IVInc free if it's already used in another chains.
static bool
isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
- ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+ ScalarEvolution &SE) {
if (StressIVChain)
return true;
@@ -3064,7 +3089,7 @@ void LSRInstance::CollectChains() {
for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
UsersIdx < NChains; ++UsersIdx) {
if (!isProfitableChain(IVChainVec[UsersIdx],
- ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+ ChainUsersVec[UsersIdx].FarUsers, SE))
continue;
// Preserve the chain at UsesIdx.
if (ChainIdx != UsersIdx)
@@ -3078,7 +3103,7 @@ void LSRInstance::CollectChains() {
void LSRInstance::FinalizeChain(IVChain &Chain) {
assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
-
+
for (const IVInc &Inc : Chain) {
LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
@@ -3100,7 +3125,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
int64_t IncOffset = IncConst->getValue()->getSExtValue();
if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
- IncOffset, /*HaseBaseReg=*/false))
+ IncOffset, /*HasBaseReg=*/false))
return false;
return true;
@@ -3210,6 +3235,9 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
}
void LSRInstance::CollectFixupsAndInitialFormulae() {
+ BranchInst *ExitBranch = nullptr;
+ bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &LibInfo);
+
for (const IVStrideUse &U : IU) {
Instruction *UserInst = U.getUser();
// Skip IV users that are part of profitable IV Chains.
@@ -3239,6 +3267,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
// equality icmps, thanks to IndVarSimplify.
if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
if (CI->isEquality()) {
+ // If CI can be saved on some targets, e.g. replaced inside a hardware
+ // loop on PowerPC, there is no need to generate initial formulae for it.
+ if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
+ continue;
// Swap the operands if needed to put the OperandValToReplace on the
// left, for consistency.
Value *NV = CI->getOperand(1);
@@ -3738,10 +3770,11 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
void LSRInstance::GenerateConstantOffsetsImpl(
LSRUse &LU, unsigned LUIdx, const Formula &Base,
const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
- const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
- for (int64_t Offset : Worklist) {
+
+ auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
Formula F = Base;
F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+
if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
LU.AccessTy, F)) {
// Add the offset to the base register.
@@ -3761,7 +3794,35 @@ void LSRInstance::GenerateConstantOffsetsImpl(
(void)InsertFormula(LU, LUIdx, F);
}
+ };
+
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+ // With constant offsets and constant steps, we can generate pre-inc
+ // accesses by having the offset equal the step. So, for access #0 with a
+ // step of 8, we generate a G - 8 base which would require the first access
+ // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
+ // for itself and hopefully becomes the base for other accesses. This means
+ // that a single pre-indexed access can be generated to become the new
+ // base pointer for each iteration of the loop, resulting in no extra add/sub
+ // instructions for pointer updating.
+ if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
+ if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
+ if (auto *StepRec =
+ dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
+ const APInt &StepInt = StepRec->getAPInt();
+ int64_t Step = StepInt.isNegative() ?
+ StepInt.getSExtValue() : StepInt.getZExtValue();
+
+ for (int64_t Offset : Worklist) {
+ Offset -= Step;
+ GenerateOffset(G, Offset);
+ }
+ }
+ }
}
+ for (int64_t Offset : Worklist)
+ GenerateOffset(G, Offset);
int64_t Imm = ExtractImmediate(G, SE);
if (G->isZero() || Imm == 0)
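
To make the intent of the comment above concrete, here is a hedged sketch, not part of the patch, of the loop shape backedge indexing is aiming for, assuming a hypothetical int64_t array (so an 8-byte step): once the base is rebased by one step, the only per-iteration address arithmetic is a single pointer bump, which a target with pre-indexed loads can fold into the memory access itself.

#include <cstddef>
#include <cstdint>

int64_t sumWithPreIndexedPattern(const int64_t *G, size_t N) {
  int64_t Sum = 0;
  if (N == 0)
    return Sum;
  // Conceptually P starts at the rebased "G - 8" base, so iteration 0 is
  // also a "bump then load"; written here as a peeled first access to stay
  // within well-defined pointer arithmetic.
  const int64_t *P = G;
  Sum += *P;
  for (size_t I = 1; I != N; ++I) {
    P += 1;    // the single pointer update per iteration...
    Sum += *P; // ...which folds into the load as a pre-indexed access
  }
  return Sum;
}
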
@@ -3968,9 +4029,27 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
Formula F = Base;
- if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
- for (const SCEV *&BaseReg : F.BaseRegs)
- BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+ // Sometimes SCEV is able to prove zero during ext transform. It may
+ // happen if SCEV did not do all possible transforms while creating the
+ // initial node (maybe due to depth limitations), but it can do them while
+ // taking ext.
+ if (F.ScaledReg) {
+ const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
+ if (NewScaledReg->isZero())
+ continue;
+ F.ScaledReg = NewScaledReg;
+ }
+ bool HasZeroBaseReg = false;
+ for (const SCEV *&BaseReg : F.BaseRegs) {
+ const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+ if (NewBaseReg->isZero()) {
+ HasZeroBaseReg = true;
+ break;
+ }
+ BaseReg = NewBaseReg;
+ }
+ if (HasZeroBaseReg)
+ continue;
// TODO: This assumes we've done basic processing on all uses and
// have an idea what the register usage is.
@@ -4067,11 +4146,17 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// Conservatively examine offsets between this orig reg a few selected
// other orig regs.
+ int64_t First = Imms.begin()->first;
+ int64_t Last = std::prev(Imms.end())->first;
+ // Compute (First + Last) / 2 without overflow using the fact that
+ // First + Last = 2 * (First & Last) + (First ^ Last).
+ int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
+ // If the result is negative and First is odd and Last even (or vice versa),
+ // we rounded towards -inf. Add 1 in that case, to round towards 0.
+ Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
ImmMapTy::const_iterator OtherImms[] = {
- Imms.begin(), std::prev(Imms.end()),
- Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
- 2)
- };
+ Imms.begin(), std::prev(Imms.end()),
+ Imms.lower_bound(Avg)};
for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
ImmMapTy::const_iterator M = OtherImms[i];
if (M == J || M == JE) continue;
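
The overflow-free midpoint above is worth spelling out. A small self-contained check, with an illustrative function name and test values that are not from the patch, showing that (First & Last) + ((First ^ Last) >> 1) plus the sign-based correction matches ordinary truncating division without ever forming First + Last:

#include <cassert>
#include <cstdint>

int64_t midpoint(int64_t First, int64_t Last) {
  // (First & Last) + ((First ^ Last) >> 1) is (First + Last) / 2 rounded
  // towards -inf; the second line nudges negative results up by one when the
  // operands have different parities, matching C++'s truncating division.
  int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
  Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
  return Avg;
}

int main() {
  assert(midpoint(INT64_MAX - 2, INT64_MAX) == INT64_MAX - 1); // no overflow
  assert(midpoint(-7, 4) == (-7 + 4) / 2);                     // rounds to -1
  return 0;
}
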
@@ -4249,9 +4334,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// avoids the need to recompute this information across formulae using the
// same bad AddRec. Passing LoserRegs is also essential unless we remove
// the corresponding bad register from the Regs set.
- Cost CostF;
+ Cost CostF(L, SE, TTI);
Regs.clear();
- CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
+ CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
if (CostF.isLoser()) {
// During initial formula generation, undesirable formulae are generated
// by uses within other loops that have some non-trivial address mode or
@@ -4282,10 +4367,10 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Formula &Best = LU.Formulae[P.first->second];
- Cost CostBest;
+ Cost CostBest(L, SE, TTI);
Regs.clear();
- CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
- if (CostF.isLess(CostBest, TTI))
+ CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
+ if (CostF.isLess(CostBest))
std::swap(F, Best);
LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
dbgs() << "\n"
@@ -4357,7 +4442,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
Formula NewF = F;
- NewF.BaseOffset += C->getValue()->getSExtValue();
+ // FIXME: Formulas should store bitwidth to do wrapping properly.
+ // See PR41034.
+ NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
@@ -4400,7 +4487,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
/// When there are many registers for expressions like A, A+1, A+2, etc.,
/// allocate a single register for them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
return;
LLVM_DEBUG(
@@ -4533,12 +4620,13 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
// If the new register numbers are the same, choose the Formula with
// less Cost.
- Cost CostFA, CostFB;
+ Cost CostFA(L, SE, TTI);
+ Cost CostFB(L, SE, TTI);
Regs.clear();
- CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+ CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
Regs.clear();
- CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
- return CostFA.isLess(CostFB, TTI);
+ CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
+ return CostFA.isLess(CostFB);
};
bool Any = false;
@@ -4824,7 +4912,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
ReqRegs.insert(S);
SmallPtrSet<const SCEV *, 16> NewRegs;
- Cost NewCost;
+ Cost NewCost(L, SE, TTI);
for (const Formula &F : LU.Formulae) {
// Ignore formulae which may not be ideal in terms of register reuse of
// ReqRegs. The formula should use all required registers before
@@ -4848,8 +4936,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// the current best, prune the search at that point.
NewCost = CurCost;
NewRegs = CurRegs;
- NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
- if (NewCost.isLess(SolutionCost, TTI)) {
+ NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+ if (NewCost.isLess(SolutionCost)) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
@@ -4858,9 +4946,9 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
} else {
LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
- dbgs() << ".\n Regs:"; for (const SCEV *S
- : NewRegs) dbgs()
- << ' ' << *S;
+ dbgs() << ".\nRegs:\n";
+ for (const SCEV *S : NewRegs) dbgs()
+ << "- " << *S << "\n";
dbgs() << '\n');
SolutionCost = NewCost;
@@ -4875,9 +4963,9 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
/// vector.
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
SmallVector<const Formula *, 8> Workspace;
- Cost SolutionCost;
+ Cost SolutionCost(L, SE, TTI);
SolutionCost.Lose();
- Cost CurCost;
+ Cost CurCost(L, SE, TTI);
SmallPtrSet<const SCEV *, 16> CurRegs;
DenseSet<const SCEV *> VisitedRegs;
Workspace.reserve(Uses.size());
@@ -5215,6 +5303,7 @@ void LSRInstance::RewriteForPHI(
DenseMap<BasicBlock *, Value *> Inserted;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+ bool needUpdateFixups = false;
BasicBlock *BB = PN->getIncomingBlock(i);
// If this is a critical edge, split the edge so that we do not insert
@@ -5233,7 +5322,7 @@ void LSRInstance::RewriteForPHI(
NewBB = SplitCriticalEdge(BB, Parent,
CriticalEdgeSplittingOptions(&DT, &LI)
.setMergeIdenticalEdges()
- .setDontDeleteUselessPHIs());
+ .setKeepOneInputPHIs());
} else {
SmallVector<BasicBlock*, 2> NewBBs;
SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
@@ -5253,6 +5342,8 @@ void LSRInstance::RewriteForPHI(
e = PN->getNumIncomingValues();
BB = NewBB;
i = PN->getBasicBlockIndex(BB);
+
+ needUpdateFixups = true;
}
}
}
@@ -5277,6 +5368,44 @@ void LSRInstance::RewriteForPHI(
PN->setIncomingValue(i, FullV);
Pair.first->second = FullV;
}
+
+ // If LSR splits a critical edge and the phi node has other pending
+ // fixup operands, we need to update those pending fixups. Otherwise
+ // formulae will not be implemented completely and some instructions
+ // will not be eliminated.
+ if (needUpdateFixups) {
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+ for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
+ // If fixup is supposed to rewrite some operand in the phi
+ // that was just updated, it may be already moved to
+ // another phi node. Such fixup requires update.
+ if (Fixup.UserInst == PN) {
+ // Check if the operand we try to replace still exists in the
+ // original phi.
+ bool foundInOriginalPHI = false;
+ for (const auto &val : PN->incoming_values())
+ if (val == Fixup.OperandValToReplace) {
+ foundInOriginalPHI = true;
+ break;
+ }
+
+ // If fixup operand found in original PHI - nothing to do.
+ if (foundInOriginalPHI)
+ continue;
+
+ // Otherwise it might be moved to another PHI and requires update.
+ // If fixup operand not found in any of the incoming blocks that
+ // means we have already rewritten it - nothing to do.
+ for (const auto &Block : PN->blocks())
+ for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
+ ++I) {
+ PHINode *NewPN = cast<PHINode>(I);
+ for (const auto &val : NewPN->incoming_values())
+ if (val == Fixup.OperandValToReplace)
+ Fixup.UserInst = NewPN;
+ }
+ }
+ }
}
}
@@ -5360,8 +5489,11 @@ void LSRInstance::ImplementSolution(
LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DominatorTree &DT, LoopInfo &LI,
- const TargetTransformInfo &TTI)
- : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
+ const TargetTransformInfo &TTI, AssumptionCache &AC,
+ TargetLibraryInfo &LibInfo)
+ : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), LibInfo(LibInfo), TTI(TTI), L(L),
+ FavorBackedgeIndex(EnableBackedgeIndexing &&
+ TTI.shouldFavorBackedgeIndex(L)) {
// If LoopSimplify form is not available, stay out of trouble.
if (!L->isLoopSimplifyForm())
return;
@@ -5556,6 +5688,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
// Requiring LoopSimplify a second time here prevents IVUsers from running
// twice, since LoopSimplify was invalidated by running ScalarEvolution.
AU.addRequiredID(LoopSimplifyID);
@@ -5566,11 +5700,14 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DominatorTree &DT, LoopInfo &LI,
- const TargetTransformInfo &TTI) {
+ const TargetTransformInfo &TTI,
+ AssumptionCache &AC,
+ TargetLibraryInfo &LibInfo) {
+
bool Changed = false;
// Run the main LSR transformation.
- Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
+ Changed |= LSRInstance(L, IU, SE, DT, LI, TTI, AC, LibInfo).getChanged();
// Remove any extra phis created by processing inner loops.
Changed |= DeleteDeadPHIs(L->getHeader());
@@ -5601,14 +5738,17 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent());
- return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo);
}
PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
- AR.DT, AR.LI, AR.TTI))
+ AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();