author     Dimitry Andric <dim@FreeBSD.org>   2023-02-11 12:38:04 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2023-02-11 12:38:11 +0000
commit     e3b557809604d036af6e00c60f012c2025b59a5e (patch)
tree       8a11ba2269a3b669601e2fd41145b174008f4da8 /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent     08e8dd7b9db7bb4a9de26d44c1cbfd24e869c014 (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
 -rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1683
 1 file changed, 916 insertions(+), 767 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 238b074089aa..a28099d8ba7d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -65,8 +65,6 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
@@ -142,6 +140,7 @@
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
+#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
@@ -362,10 +361,15 @@ cl::opt<bool> llvm::EnableLoopVectorization(
"vectorize-loops", cl::init(true), cl::Hidden,
cl::desc("Run the Loop vectorization passes"));
-cl::opt<bool> PrintVPlansInDotFormat(
- "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
+static cl::opt<bool> PrintVPlansInDotFormat(
+ "vplan-print-in-dot-format", cl::Hidden,
cl::desc("Use dot format instead of plain text when dumping VPlans"));
+static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
+ "force-widen-divrem-via-safe-divisor", cl::Hidden,
+ cl::desc(
+ "Override cost based safe divisor widening for div/rem instructions"));
+
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
@@ -396,8 +400,9 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if it is known.
-/// 4) Returns None if all of the above failed.
-static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+/// 4) Returns std::nullopt if all of the above failed.
+static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
+ Loop *L) {
// Check if exact trip count is known.
if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
return ExpectedTC;
@@ -405,17 +410,19 @@ static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
// Check if there is an expected trip count available from profile data.
if (LoopVectorizeWithBlockFrequency)
if (auto EstimatedTC = getLoopEstimatedTripCount(L))
- return EstimatedTC;
+ return *EstimatedTC;
// Check if upper bound estimate is known.
if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
return ExpectedTC;
- return None;
+ return std::nullopt;
}
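With the return type now std::optional<unsigned>, callers test the result directly and dereference it. A minimal stand-alone sketch of that calling pattern (bestKnownTripCount is an illustrative stand-in, not the LLVM function):

#include <cstdio>
#include <optional>

// Illustrative stand-in for the patched getSmallBestKnownTC(): return the
// exact count if known, otherwise an upper-bound estimate, otherwise nothing.
static std::optional<unsigned> bestKnownTripCount(unsigned Exact, unsigned MaxBound) {
  if (Exact)
    return Exact;
  if (MaxBound)
    return MaxBound;
  return std::nullopt;
}

int main() {
  if (std::optional<unsigned> TC = bestKnownTripCount(0, 16))
    std::printf("best known trip count: %u\n", *TC); // prints 16
}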
+namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
+} // namespace
namespace llvm {
@@ -473,10 +480,6 @@ public:
/// complex control flow around the loops.
virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
- /// Widen a single call instruction within the innermost loop.
- void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
- VPTransformState &State);
-
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
@@ -493,7 +496,8 @@ public:
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
/// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
/// Instr's operands.
- void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
+ void scalarizeInstruction(const Instruction *Instr,
+ VPReplicateRecipe *RepRecipe,
const VPIteration &Instance, bool IfPredicateInstr,
VPTransformState &State);
@@ -529,6 +533,17 @@ public:
// generated by fixReduction.
PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
+ /// Create a new phi node for the induction variable \p OrigPhi to resume
+ /// iteration count in the scalar epilogue, from where the vectorized loop
+ /// left off. In cases where the loop skeleton is more complicated (eg.
+ /// epilogue vectorization) and the resume values can come from an additional
+ /// bypass block, the \p AdditionalBypass pair provides information about the
+ /// bypass block and the end value on the edge from bypass to this loop.
+ PHINode *createInductionResumeValue(
+ PHINode *OrigPhi, const InductionDescriptor &ID,
+ ArrayRef<BasicBlock *> BypassBlocks,
+ std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+
protected:
friend class LoopVectorizationPlanner;
@@ -552,7 +567,7 @@ protected:
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
- void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
+ void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
VPTransformState &State);
/// Create code for the loop exit value of the reduction.
@@ -611,7 +626,7 @@ protected:
/// Complete the loop skeleton by adding debug MDs, creating appropriate
/// conditional branches in the middle block, preparing the builder and
/// running the verifier. Return the preheader of the completed vector loop.
- BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
+ BasicBlock *completeLoopSkeleton();
/// Collect poison-generating recipes that may generate a poison value that is
/// used after vectorization, even when their operands are not poison. Those
@@ -643,9 +658,6 @@ protected:
/// Dominator Tree.
DominatorTree *DT;
- /// Alias Analysis.
- AAResults *AA;
-
/// Target Library Info.
const TargetLibraryInfo *TLI;
@@ -951,6 +963,27 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
return VF.isScalable() ? B.CreateVScale(EC) : EC;
}
+const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+ assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
+
+ ScalarEvolution &SE = *PSE.getSE();
+
+ // The exit count might have the type of i64 while the phi is i32. This can
+ // happen if we have an induction variable that is sign extended before the
+ // compare. The only way that we get a backedge taken count is that the
+ // induction variable was signed and as such will not overflow. In such a case
+ // truncation is legal.
+ if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
+ IdxTy->getPrimitiveSizeInBits())
+ BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
+ BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
+
+ // Get the total trip count from the count by adding 1.
+ return SE.getAddExpr(BackedgeTakenCount,
+ SE.getOne(BackedgeTakenCount->getType()));
+}
+
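The arithmetic hoisted into createTripCountSCEV is small; a self-contained sketch of the same computation on plain integers (the truncation is legal for the reason given in the comment above: a signed induction cannot have overflowed):

#include <cstdint>
#include <cstdio>

// Backedge-taken count -> trip count, mirroring createTripCountSCEV(): narrow
// (or widen) to the induction type, then add one.
static uint32_t tripCountFromBackedgeTaken(uint64_t BackedgeTakenCount) {
  uint32_t Narrowed = static_cast<uint32_t>(BackedgeTakenCount); // getTruncateOrNoop
  return Narrowed + 1;                                           // getAddExpr(BTC, 1)
}

int main() {
  std::printf("%u\n", tripCountFromBackedgeTaken(99)); // a 100-iteration loop
}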
static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
ElementCount VF) {
assert(FTy->isFloatingPointTy() && "Expected floating point type!");
@@ -1037,27 +1070,25 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
// Add new definitions to the worklist.
for (VPValue *operand : CurRec->operands())
- if (VPDef *OpDef = operand->getDef())
- Worklist.push_back(cast<VPRecipeBase>(OpDef));
+ if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
+ Worklist.push_back(OpDef);
}
});
// Traverse all the recipes in the VPlan and collect the poison-generating
// recipes in the backward slice starting at the address of a VPWidenRecipe or
// VPInterleaveRecipe.
- auto Iter = depth_first(
- VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
+ auto Iter = vp_depth_first_deep(State.Plan->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &Recipe : *VPBB) {
if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
Instruction &UnderlyingInstr = WidenRec->getIngredient();
- VPDef *AddrDef = WidenRec->getAddr()->getDef();
+ VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
if (AddrDef && WidenRec->isConsecutive() &&
Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
- collectPoisonGeneratingInstrsInBackwardSlice(
- cast<VPRecipeBase>(AddrDef));
+ collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
} else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
- VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
+ VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
if (AddrDef) {
// Check if any member of the interleave group needs predication.
const InterleaveGroup<Instruction> *InterGroup =
@@ -1072,8 +1103,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
}
if (NeedPredication)
- collectPoisonGeneratingInstrsInBackwardSlice(
- cast<VPRecipeBase>(AddrDef));
+ collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
}
}
}
@@ -1182,7 +1212,7 @@ public:
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
- unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
+ unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
@@ -1435,47 +1465,49 @@ public:
}));
}
- /// Returns true if \p I is an instruction that will be scalarized with
- /// predication when vectorizing \p I with vectorization factor \p VF. Such
- /// instructions include conditional stores and instructions that may divide
- /// by zero.
- bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
-
- // Returns true if \p I is an instruction that will be predicated either
- // through scalar predication or masked load/store or masked gather/scatter.
- // \p VF is the vectorization factor that will be used to vectorize \p I.
- // Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, ElementCount VF) {
- // When we know the load's address is loop invariant and the instruction
- // in the original scalar loop was unconditionally executed then we
- // don't need to mark it as a predicated instruction. Tail folding may
- // introduce additional predication, but we're guaranteed to always have
- // at least one active lane. We call Legal->blockNeedsPredication here
- // because it doesn't query tail-folding.
- if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
- !Legal->blockNeedsPredication(I->getParent()))
+ /// Given costs for both strategies, return true if the scalar predication
+ /// lowering should be used for div/rem. This incorporates an override
+ /// option so it is not simply a cost comparison.
+ bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
+ InstructionCost SafeDivisorCost) const {
+ switch (ForceSafeDivisor) {
+ case cl::BOU_UNSET:
+ return ScalarCost < SafeDivisorCost;
+ case cl::BOU_TRUE:
return false;
- if (!blockNeedsPredicationForAnyReason(I->getParent()))
- return false;
- // Loads and stores that need some form of masked operation are predicated
- // instructions.
- if (isa<LoadInst>(I) || isa<StoreInst>(I))
- return Legal->isMaskRequired(I);
- return isScalarWithPredication(I, VF);
+ case cl::BOU_FALSE:
+ return true;
+ };
+ llvm_unreachable("impossible case value");
}
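The override semantics of the new flag are easy to miss in the switch: leaving the option unset keeps the pure cost comparison, forcing it true always picks the safe-divisor lowering, and forcing it false always picks scalar predication. A self-contained illustration (the enum mirrors llvm::cl::boolOrDefault; the costs are arbitrary):

#include <cstdio>

enum BoolOrDefault { BOU_UNSET, BOU_TRUE, BOU_FALSE }; // mirrors llvm::cl::boolOrDefault

// Same decision as isDivRemScalarWithPredication(): true means "scalarize with
// predication", false means "use the safe-divisor idiom".
static bool preferScalarPredication(int ScalarCost, int SafeDivisorCost,
                                    BoolOrDefault ForceSafeDivisor) {
  switch (ForceSafeDivisor) {
  case BOU_UNSET:
    return ScalarCost < SafeDivisorCost; // no override: cheapest strategy wins
  case BOU_TRUE:
    return false;                        // forced safe-divisor
  case BOU_FALSE:
    return true;                         // forced scalarization
  }
  return false;
}

int main() {
  std::printf("%d %d %d\n",
              preferScalarPredication(4, 6, BOU_UNSET),  // 1: scalar is cheaper
              preferScalarPredication(4, 6, BOU_TRUE),   // 0
              preferScalarPredication(4, 6, BOU_FALSE)); // 1
}

On the command line the override is the -force-widen-divrem-via-safe-divisor option added above; setting it true forces the safe-divisor strategy and setting it false forces scalarization.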
+ /// Returns true if \p I is an instruction which requires predication and
+ /// for which our chosen predication strategy is scalarization (i.e. we
+ /// don't have an alternate strategy such as masking available).
+ /// \p VF is the vectorization factor that will be used to vectorize \p I.
+ bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
+
+ /// Returns true if \p I is an instruction that needs to be predicated
+ /// at runtime. The result is independent of the predication mechanism.
+ /// Superset of instructions that return true for isScalarWithPredication.
+ bool isPredicatedInst(Instruction *I) const;
+
+ /// Return the costs for our two available strategies for lowering a
+ /// div/rem operation which requires speculating at least one lane.
+ /// First result is for scalarization (will be invalid for scalable
+ /// vectors); second is for the safe-divisor strategy.
+ std::pair<InstructionCost, InstructionCost>
+ getDivRemSpeculationCost(Instruction *I,
+ ElementCount VF) const;
+
/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened.
- bool
- memoryInstructionCanBeWidened(Instruction *I,
- ElementCount VF = ElementCount::getFixed(1));
+ bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
- bool
- interleavedAccessCanBeWidened(Instruction *I,
- ElementCount VF = ElementCount::getFixed(1));
+ bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
/// Check if \p Instr belongs to any interleaved access group.
bool isAccessInterleaved(Instruction *Instr) {
@@ -1567,7 +1599,7 @@ public:
/// Convenience function that returns the value of vscale_range iff
/// vscale_range.min == vscale_range.max or otherwise returns the value
/// returned by the corresponding TLI method.
- Optional<unsigned> getVScaleForTuning() const;
+ std::optional<unsigned> getVScaleForTuning() const;
private:
unsigned NumPredStores = 0;
@@ -1623,7 +1655,7 @@ private:
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
- Optional<InstructionCost>
+ std::optional<InstructionCost>
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
TTI::TargetCostKind CostKind);
@@ -1651,8 +1683,8 @@ private:
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Instruction *I,
- ElementCount VF) const;
+ InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
+ TTI::TargetCostKind CostKind) const;
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
@@ -1719,8 +1751,9 @@ private:
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
- int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
- ElementCount VF);
+ InstructionCost computePredInstDiscount(Instruction *PredInst,
+ ScalarCostsTy &ScalarCosts,
+ ElementCount VF);
/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
@@ -1835,6 +1868,7 @@ public:
};
} // end namespace llvm
+namespace {
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
@@ -1914,7 +1948,7 @@ public:
if (DiffChecks) {
Value *RuntimeVF = nullptr;
MemRuntimeCheckCond = addDiffRuntimeChecks(
- MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
+ MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
[VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
if (!RuntimeVF)
RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
@@ -2099,6 +2133,7 @@ public:
return MemCheckBlock;
}
};
+} // namespace
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
@@ -2194,18 +2229,15 @@ struct LoopVectorize : public FunctionPass {
auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
-
- return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
- GetLAA, *ORE, PSI).MadeAnyChange;
+ return Impl
+ .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
+ .MadeAnyChange;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -2215,7 +2247,6 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LoopAccessLegacyAnalysis>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
@@ -2321,12 +2352,16 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
const InductionDescriptor &ID, VPValue *Def,
VPTransformState &State) {
IRBuilderBase &Builder = State.Builder;
- // We shouldn't have to build scalar steps if we aren't vectorizing.
- assert(State.VF.isVector() && "VF should be greater than one");
- // Get the value type and ensure it and the step have the same integer type.
+
+ // Ensure step has the same type as that of scalar IV.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- assert(ScalarIVTy == Step->getType() &&
- "Val and Step should have the same type");
+ if (ScalarIVTy != Step->getType()) {
+ // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
+ // avoid separate truncate here.
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
+ }
// We build scalar steps for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -2343,7 +2378,6 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
// Determine the number of scalars we need to generate for each unroll
// iteration.
bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
- unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
// Compute the scalar steps and save the results in State.
Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
ScalarIVTy->getScalarSizeInBits());
@@ -2357,7 +2391,17 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
}
- for (unsigned Part = 0; Part < State.UF; ++Part) {
+ unsigned StartPart = 0;
+ unsigned EndPart = State.UF;
+ unsigned StartLane = 0;
+ unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
+ if (State.Instance) {
+ StartPart = State.Instance->Part;
+ EndPart = StartPart + 1;
+ StartLane = State.Instance->Lane.getKnownLane();
+ EndLane = StartLane + 1;
+ }
+ for (unsigned Part = StartPart; Part < EndPart; ++Part) {
Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
if (!FirstLaneOnly && State.VF.isScalable()) {
@@ -2376,7 +2420,7 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
if (ScalarIVTy->isFloatingPointTy())
StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
Value *StartIdx = Builder.CreateBinOp(
AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
// The step returned by `createStepForVF` is a runtime-evaluated value
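The new StartPart/EndPart/StartLane/EndLane bounds let the same loop emit either every (part, lane) step or, when State.Instance is set, exactly one of them. A stand-alone sketch of that selection (emitScalarSteps and its printout are illustrative only):

#include <cstdio>
#include <optional>
#include <utility>

// When an Instance (Part, Lane) is provided, only that single scalar step is
// produced; otherwise all UF parts and all lanes are covered, as before.
static void emitScalarSteps(unsigned UF, unsigned Lanes,
                            std::optional<std::pair<unsigned, unsigned>> Instance) {
  unsigned StartPart = 0, EndPart = UF;
  unsigned StartLane = 0, EndLane = Lanes;
  if (Instance) {
    StartPart = Instance->first;
    EndPart = StartPart + 1;
    StartLane = Instance->second;
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part)
    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane)
      std::printf("scalar step for part %u, lane %u\n", Part, Lane);
}

int main() {
  emitScalarSteps(2, 4, std::make_pair(1u, 3u)); // prints only part 1, lane 3
}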
@@ -2415,8 +2459,14 @@ static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
Value *StartValue, Value *Step,
const InductionDescriptor &ID) {
- assert(Index->getType()->getScalarType() == Step->getType() &&
- "Index scalar type does not match StepValue type");
+ Type *StepTy = Step->getType();
+ Value *CastedIndex = StepTy->isIntegerTy()
+ ? B.CreateSExtOrTrunc(Index, StepTy)
+ : B.CreateCast(Instruction::SIToFP, Index, StepTy);
+ if (CastedIndex != Index) {
+ CastedIndex->setName(CastedIndex->getName() + ".cast");
+ Index = CastedIndex;
+ }
// Note: the IR at this point is broken. We cannot use SE to create any new
// SCEV and then expand it, hoping that SCEV's simplification will give us
@@ -2682,6 +2732,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
for (unsigned Part = 0; Part < UF; Part++) {
// Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs;
+ unsigned StoredIdx = 0;
for (unsigned i = 0; i < InterleaveFactor; i++) {
assert((Group->getMember(i) || MaskForGaps) &&
"Fail to get a member from an interleaved store group");
@@ -2694,7 +2745,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
continue;
}
- Value *StoredVec = State.get(StoredValues[i], Part);
+ Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
+ ++StoredIdx;
if (Group->isReverse())
StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
@@ -2738,7 +2790,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
bool IfPredicateInstr,
@@ -2772,11 +2824,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (auto &I : enumerate(RepRecipe->operands())) {
+ for (const auto &I : enumerate(RepRecipe->operands())) {
auto InputInstance = Instance;
VPValue *Operand = I.value();
- VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
- if (OperandR && OperandR->isUniform())
+ if (vputils::isUniformAfterVectorization(Operand))
InputInstance.Lane = VPLane::getFirstLane();
Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
}
@@ -2803,33 +2854,15 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
assert(InsertBlock);
IRBuilder<> Builder(InsertBlock->getTerminator());
// Find the loop boundaries.
- ScalarEvolution *SE = PSE.getSE();
- const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
- assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
- "Invalid loop count");
-
Type *IdxTy = Legal->getWidestInductionType();
assert(IdxTy && "No type for induction");
-
- // The exit count might have the type of i64 while the phi is i32. This can
- // happen if we have an induction variable that is sign extended before the
- // compare. The only way that we get a backedge taken count is that the
- // induction variable was signed and as such will not overflow. In such a case
- // truncation is legal.
- if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
- IdxTy->getPrimitiveSizeInBits())
- BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
- BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
-
- // Get the total trip count from the count by adding 1.
- const SCEV *ExitCount = SE->getAddExpr(
- BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+ const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, DL, "induction");
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
// Count holds the overall loop count (N).
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
@@ -3080,7 +3113,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
// 1) If we know that we must execute the scalar epilogue, emit an
// unconditional branch.
// 2) Otherwise, we must have a single unique exit block (due to how we
- // implement the multiple exit case). In this case, set up a conditonal
+ // implement the multiple exit case). In this case, set up a conditional
// branch from the middle block to the loop scalar preheader, and the
// exit block. completeLoopSkeleton will update the condition to use an
// iteration check, if required to decide whether to execute the remainder.
@@ -3101,88 +3134,87 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}
-void InnerLoopVectorizer::createInductionResumeValues(
+PHINode *InnerLoopVectorizer::createInductionResumeValue(
+ PHINode *OrigPhi, const InductionDescriptor &II,
+ ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
- assert(((AdditionalBypass.first && AdditionalBypass.second) ||
- (!AdditionalBypass.first && !AdditionalBypass.second)) &&
- "Inconsistent information about additional bypass.");
-
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
assert(VectorTripCount && "Expected valid arguments");
- // We are going to resume the execution of the scalar loop.
- // Go over all of the induction variables that we found and fix the
- // PHIs that are left in the scalar version of the loop.
- // The starting values of PHI nodes depend on the counter of the last
- // iteration in the vectorized loop.
- // If we come from a bypass edge then we need to start from the original
- // start value.
+
Instruction *OldInduction = Legal->getPrimaryInduction();
- for (auto &InductionEntry : Legal->getInductionVars()) {
- PHINode *OrigPhi = InductionEntry.first;
- InductionDescriptor II = InductionEntry.second;
+ Value *&EndValue = IVEndValues[OrigPhi];
+ Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
+ if (OrigPhi == OldInduction) {
+ // We know what the end value is.
+ EndValue = VectorTripCount;
+ } else {
+ IRBuilder<> B(LoopVectorPreHeader->getTerminator());
- Value *&EndValue = IVEndValues[OrigPhi];
- Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
- if (OrigPhi == OldInduction) {
- // We know what the end value is.
- EndValue = VectorTripCount;
- } else {
- IRBuilder<> B(LoopVectorPreHeader->getTerminator());
+ // Fast-math-flags propagate from the original induction instruction.
+ if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+ B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
- // Fast-math-flags propagate from the original induction instruction.
- if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
- B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
+ Value *Step =
+ CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
+ EndValue =
+ emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
+ EndValue->setName("ind.end");
- Type *StepType = II.getStep()->getType();
- Instruction::CastOps CastOp =
- CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
- Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
+ // Compute the end value for the additional bypass (if applicable).
+ if (AdditionalBypass.first) {
+ B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
Value *Step =
CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
- EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
- EndValue->setName("ind.end");
-
- // Compute the end value for the additional bypass (if applicable).
- if (AdditionalBypass.first) {
- B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
- CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
- StepType, true);
- Value *Step =
- CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
- VTC =
- B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
- EndValueFromAdditionalBypass =
- emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
- EndValueFromAdditionalBypass->setName("ind.end");
- }
+ EndValueFromAdditionalBypass = emitTransformedIndex(
+ B, AdditionalBypass.second, II.getStartValue(), Step, II);
+ EndValueFromAdditionalBypass->setName("ind.end");
}
+ }
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getTerminator());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+ LoopScalarPreHeader->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : LoopBypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
+ // Fix the scalar body counter (PHI node).
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ for (BasicBlock *BB : BypassBlocks)
+ BCResumeVal->addIncoming(II.getStartValue(), BB);
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
- EndValueFromAdditionalBypass);
+ if (AdditionalBypass.first)
+ BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
+ EndValueFromAdditionalBypass);
+ return BCResumeVal;
+}
+void InnerLoopVectorizer::createInductionResumeValues(
+ std::pair<BasicBlock *, Value *> AdditionalBypass) {
+ assert(((AdditionalBypass.first && AdditionalBypass.second) ||
+ (!AdditionalBypass.first && !AdditionalBypass.second)) &&
+ "Inconsistent information about additional bypass.");
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+ for (const auto &InductionEntry : Legal->getInductionVars()) {
+ PHINode *OrigPhi = InductionEntry.first;
+ const InductionDescriptor &II = InductionEntry.second;
+ PHINode *BCResumeVal = createInductionResumeValue(
+ OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}
}
-BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
+BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// The trip counts should be cached by now.
Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
@@ -3251,18 +3283,6 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
...
*/
- // Get the metadata of the original loop before it gets modified.
- MDNode *OrigLoopID = OrigLoop->getLoopID();
-
- // Workaround! Compute the trip count of the original loop and cache it
- // before we start modifying the CFG. This code has a systemic problem
- // wherein it tries to run analysis over partially constructed IR; this is
- // wrong, and not simply for SCEV. The trip count of the original loop
- // simply happens to be prone to hitting this in practice. In theory, we
- // can hit the same issue for any SCEV, or ValueTracking query done during
- // mutation. See PR49900.
- getOrCreateTripCount(OrigLoop->getLoopPreheader());
-
// Create an empty vector loop, and prepare basic blocks for the runtime
// checks.
createVectorLoopSkeleton("");
@@ -3286,7 +3306,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// Emit phis for the new starting index of the scalar loop.
createInductionResumeValues();
- return {completeLoopSkeleton(OrigLoopID), nullptr};
+ return {completeLoopSkeleton(), nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -3334,17 +3354,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
Value *CountMinusOne = B.CreateSub(
VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
- Value *CMO =
- !II.getStep()->getType()->isIntegerTy()
- ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
- II.getStep()->getType())
- : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
- CMO->setName("cast.cmo");
-
+ CountMinusOne->setName("cmo");
Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
VectorHeader->getTerminator());
Value *Escape =
- emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
+ emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
@@ -3429,8 +3443,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost ScalarCallCost =
- TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
+ TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
if (VF.isScalar())
return ScalarCallCost;
@@ -3441,7 +3456,8 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
+ InstructionCost ScalarizationCost =
+ getScalarizationOverhead(CI, VF, CostKind);
InstructionCost Cost =
ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
@@ -3457,7 +3473,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =
- TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
Cost = VectorCallCost;
@@ -3672,7 +3688,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// edge.
// Fix-up external users of the induction variables.
- for (auto &Entry : Legal->getInductionVars())
+ for (const auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
IVEndValues[Entry.first], LoopMiddleBlock,
@@ -3682,7 +3698,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
// in the exit block, so update the builder.
State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
- for (auto &KV : Plan.getLiveOuts())
+ for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);
for (Instruction *PI : PredicatedInstructions)
@@ -3722,11 +3738,11 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
fixReduction(ReductionPhi, State);
else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
- fixFirstOrderRecurrence(FOR, State);
+ fixFixedOrderRecurrence(FOR, State);
}
}
-void InnerLoopVectorizer::fixFirstOrderRecurrence(
+void InnerLoopVectorizer::fixFixedOrderRecurrence(
VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
@@ -4019,7 +4035,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// We know that the loop is in LCSSA form. We need to update the PHI nodes
// in the exit blocks. See comment on analogous loop in
- // fixFirstOrderRecurrence for a more complete explaination of the logic.
+ // fixFixedOrderRecurrence for a more complete explaination of the logic.
if (!Cost->requiresScalarEpilogue(VF))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
@@ -4146,8 +4162,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
VPTransformState &State) {
- auto Iter = depth_first(
- VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
+ auto Iter = vp_depth_first_deep(Plan.getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &P : VPBB->phis()) {
VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
@@ -4170,78 +4185,6 @@ bool InnerLoopVectorizer::useOrderedReductions(
return Cost->useOrderedReductions(RdxDesc);
}
-void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
- VPUser &ArgOperands,
- VPTransformState &State) {
- assert(!isa<DbgInfoIntrinsic>(CI) &&
- "DbgInfoIntrinsic should have been dropped during VPlan construction");
- State.setDebugLocFromInst(&CI);
-
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI.args())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
-
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
- InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
- InstructionCost IntrinsicCost =
- ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
- bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- assert((UseVectorIntrinsic || !NeedToScalarize) &&
- "Instruction should be scalarized elsewhere.");
- assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
- "Either the intrinsic cost or vector call cost must be valid");
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Type *, 2> TysForDecl = {CI.getType()};
- SmallVector<Value *, 4> Args;
- for (auto &I : enumerate(ArgOperands.operands())) {
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- Value *Arg;
- if (!UseVectorIntrinsic ||
- !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
- Arg = State.get(I.value(), Part);
- else
- Arg = State.get(I.value(), VPIteration(0, 0));
- if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
- TysForDecl.push_back(Arg->getType());
- Args.push_back(Arg);
- }
-
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- if (VF.isVector())
- TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
- Module *M = State.Builder.GetInsertBlock()->getModule();
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- assert(VectorF && "Can't retrieve vector intrinsic.");
- } else {
- // Use vector version of the function call.
- const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
-#ifndef NDEBUG
- assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
- "Can't create vector function.");
-#endif
- VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
- }
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI.getOperandBundlesAsDefs(OpBundles);
- CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
-
- if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(&CI);
-
- State.set(Def, V, Part);
- State.addMetadata(V, &CI);
- }
-}
-
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
@@ -4350,8 +4293,10 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// induction variable when the PHI user is scalarized.
auto ForcedScalar = ForcedScalars.find(VF);
if (ForcedScalar != ForcedScalars.end())
- for (auto *I : ForcedScalar->second)
+ for (auto *I : ForcedScalar->second) {
+ LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
Worklist.insert(I);
+ }
// Expand the worklist by looking through any bitcasts and getelementptr
// instructions we've already identified as scalar. This is similar to the
@@ -4376,7 +4321,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
- for (auto &Induction : Legal->getInductionVars()) {
+ for (const auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4429,15 +4374,16 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
bool LoopVectorizationCostModel::isScalarWithPredication(
Instruction *I, ElementCount VF) const {
- if (!blockNeedsPredicationForAnyReason(I->getParent()))
+ if (!isPredicatedInst(I))
return false;
+
+ // Do we have a non-scalar lowering for this predicated
+ // instruction? No - it is scalar with predication.
switch(I->getOpcode()) {
default:
- break;
+ return true;
case Instruction::Load:
case Instruction::Store: {
- if (!Legal->isMaskRequired(I))
- return false;
auto *Ptr = getLoadStorePointerOperand(I);
auto *Ty = getLoadStoreType(I);
Type *VTy = Ty;
@@ -4452,12 +4398,119 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
+ case Instruction::URem: {
+ // We have the option to use the safe-divisor idiom to avoid predication.
+ // The cost based decision here will always select safe-divisor for
+ // scalable vectors as scalarization isn't legal.
+ const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
+ return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
+ }
+ }
+}
+
+bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
+ if (!blockNeedsPredicationForAnyReason(I->getParent()))
+ return false;
+
+ // Can we prove this instruction is safe to unconditionally execute?
+ // If not, we must use some form of predication.
+ switch(I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ // When we know the load's address is loop invariant and the instruction
+ // in the original scalar loop was unconditionally executed then we
+ // don't need to mark it as a predicated instruction. Tail folding may
+ // introduce additional predication, but we're guaranteed to always have
+ // at least one active lane. We call Legal->blockNeedsPredication here
+ // because it doesn't query tail-folding. For stores, we need to prove
+ // both speculation safety (which follows from the same argument as loads),
+ // but also must prove the value being stored is correct. The easiest
+ // form of the later is to require that all values stored are the same.
+ if (Legal->isUniformMemOp(*I) &&
+ (isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
+ !Legal->blockNeedsPredication(I->getParent()))
+ return false;
+ return true;
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
case Instruction::URem:
// TODO: We can use the loop-preheader as context point here and get
// context sensitive reasoning
return !isSafeToSpeculativelyExecute(I);
}
- return false;
+}
+
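The load/store case above is the behavioural change worth noting: a uniform load can still skip predication, but a uniform store now only does so when the stored value is loop-invariant. A small self-contained model of that predicate (all parameters are illustrative stand-ins for the Legal/TheLoop queries):

#include <cstdio>

// Mirrors the memory-op case of isPredicatedInst(): masked ops are predicated
// unless the address is uniform, the block was unconditionally executed in the
// scalar loop, and (for stores) the stored value is loop-invariant.
static bool memOpNeedsPredication(bool MaskRequired, bool IsLoad,
                                  bool UniformAddress,
                                  bool StoredValueLoopInvariant,
                                  bool BlockNeedsPredication) {
  if (!MaskRequired)
    return false;
  if (UniformAddress && (IsLoad || StoredValueLoopInvariant) &&
      !BlockNeedsPredication)
    return false;
  return true;
}

int main() {
  // A uniform store of a loop-varying value keeps its predication.
  std::printf("%d\n", memOpNeedsPredication(true, /*IsLoad=*/false,
                                            /*UniformAddress=*/true,
                                            /*StoredValueLoopInvariant=*/false,
                                            /*BlockNeedsPredication=*/false)); // 1
}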
+std::pair<InstructionCost, InstructionCost>
+LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
+ ElementCount VF) const {
+ assert(I->getOpcode() == Instruction::UDiv ||
+ I->getOpcode() == Instruction::SDiv ||
+ I->getOpcode() == Instruction::SRem ||
+ I->getOpcode() == Instruction::URem);
+ assert(!isSafeToSpeculativelyExecute(I));
+
+ const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // Scalarization isn't legal for scalable vector types
+ InstructionCost ScalarizationCost = InstructionCost::getInvalid();
+ if (!VF.isScalable()) {
+ // Get the scalarization cost and scale this amount by the probability of
+ // executing the predicated block. If the instruction is not predicated,
+ // we fall through to the next case.
+ ScalarizationCost = 0;
+
+ // These instructions have a non-void type, so account for the phi nodes
+ // that we will create. This cost is likely to be zero. The phi node
+ // cost, if any, should be scaled by the block probability because it
+ // models a copy at the end of each predicated block.
+ ScalarizationCost += VF.getKnownMinValue() *
+ TTI.getCFInstrCost(Instruction::PHI, CostKind);
+
+ // The cost of the non-predicated instruction.
+ ScalarizationCost += VF.getKnownMinValue() *
+ TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+
+ // The cost of insertelement and extractelement instructions needed for
+ // scalarization.
+ ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
+
+ // Scale the cost by the probability of executing the predicated blocks.
+ // This assumes the predicated block for each vector lane is equally
+ // likely.
+ ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
+ }
+ InstructionCost SafeDivisorCost = 0;
+
+ auto *VecTy = ToVectorTy(I->getType(), VF);
+
+ // The cost of the select guard to ensure all lanes are well defined
+ // after we speculate above any internal control flow.
+ SafeDivisorCost += TTI.getCmpSelInstrCost(
+ Instruction::Select, VecTy,
+ ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+
+ // Certain instructions can be cheaper to vectorize if they have a constant
+ // second vector operand. One example of this are shifts on x86.
+ Value *Op2 = I->getOperand(1);
+ auto Op2Info = TTI.getOperandInfo(Op2);
+ if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ SafeDivisorCost += TTI.getArithmeticInstrCost(
+ I->getOpcode(), VecTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ Op2Info, Operands, I);
+ return {ScalarizationCost, SafeDivisorCost};
}
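To make the two returned costs concrete, here is a worked example with VF = 4 and made-up TTI numbers (every cost constant below is an assumption, not real target data); with these inputs the safe-divisor lowering wins:

#include <cstdio>

int main() {
  // Assumed inputs (illustrative only).
  const unsigned VF = 4;
  const unsigned PhiCost = 0, ScalarDivCost = 5, InsertExtractCost = 8;
  const unsigned RecipPredBlockProb = 2; // getReciprocalPredBlockProb()
  const unsigned SelectCost = 1, VectorDivCost = 10;

  // Scalarization: per-lane phi + div, plus insert/extract overhead, scaled by
  // the probability of executing the predicated block.
  unsigned ScalarizationCost =
      (VF * (PhiCost + ScalarDivCost) + InsertExtractCost) / RecipPredBlockProb; // 14

  // Safe divisor: one vector select guarding the divisor plus the vector div.
  unsigned SafeDivisorCost = SelectCost + VectorDivCost; // 11

  std::printf("scalarize=%u safe-divisor=%u -> %s\n", ScalarizationCost,
              SafeDivisorCost,
              ScalarizationCost < SafeDivisorCost ? "scalarize" : "safe divisor");
}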
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
@@ -4610,17 +4663,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
addToWorklistIfAllowed(Cmp);
+ // Return true if all lanes perform the same memory operation, and we can
+ // thus chose to execute only one.
+ auto isUniformMemOpUse = [&](Instruction *I) {
+ if (!Legal->isUniformMemOp(*I))
+ return false;
+ if (isa<LoadInst>(I))
+ // Loading the same address always produces the same result - at least
+ // assuming aliasing and ordering which have already been checked.
+ return true;
+ // Storing the same value on every iteration.
+ return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
+ };
+
auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
InstWidening WideningDecision = getWideningDecision(I, VF);
assert(WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment");
- // A uniform memory op is itself uniform. We exclude uniform stores
- // here as they demand the last lane, not the first one.
- if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
- assert(WideningDecision == CM_Scalarize);
+ if (isUniformMemOpUse(I))
return true;
- }
return (WideningDecision == CM_Widen ||
WideningDecision == CM_Widen_Reverse ||
@@ -4674,9 +4736,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (!Ptr)
continue;
- // A uniform memory op is itself uniform. We exclude uniform stores
- // here as they demand the last lane, not the first one.
- if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
+ if (isUniformMemOpUse(&I))
addToWorklistIfAllowed(&I);
if (isUniformDecision(&I, VF)) {
@@ -4707,14 +4767,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
while (idx != Worklist.size()) {
Instruction *I = Worklist[idx++];
- for (auto OV : I->operand_values()) {
+ for (auto *OV : I->operand_values()) {
// isOutOfScope operands cannot be uniform instructions.
if (isOutOfScope(OV))
continue;
// First order recurrence Phi's should typically be considered
// non-uniform.
auto *OP = dyn_cast<PHINode>(OV);
- if (OP && Legal->isFirstOrderRecurrence(OP))
+ if (OP && Legal->isFixedOrderRecurrence(OP))
continue;
// If all the users of the operand are uniform, then add the
// operand into the uniform worklist.
@@ -4733,7 +4793,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : Legal->getInductionVars()) {
+ for (const auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4846,12 +4906,12 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
// Limit MaxScalableVF by the maximum safe dependence distance.
- Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+ std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
MaxVScale =
TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
- MaxScalableVF = ElementCount::getScalable(
- MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
+ MaxScalableVF =
+ ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
if (!MaxScalableVF)
reportVectorizationInfo(
"Max legal vector width too small, scalable vectorization "
@@ -4991,7 +5051,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
case CM_ScalarEpilogueAllowed:
return computeFeasibleMaxVF(TC, UserVF, false);
case CM_ScalarEpilogueNotAllowedUsePredicate:
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case CM_ScalarEpilogueNotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -5113,7 +5173,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
ElementCount MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
- TypeSize WidestRegister = TTI.getRegisterBitWidth(
+ const TypeSize WidestRegister = TTI.getRegisterBitWidth(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector);
@@ -5127,7 +5187,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
auto MaxVectorElementCount = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
+ PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
ComputeScalableMaxVF);
MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
@@ -5140,9 +5200,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return ElementCount::getFixed(1);
}
- const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
- if (ConstTripCount &&
- ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
+ unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
+ if (MaxVectorElementCount.isScalable() &&
+ TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+ auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+ auto Min = Attr.getVScaleRangeMin();
+ WidestRegisterMinEC *= Min;
+ }
+ if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
(!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
// If loop trip count (TC) is known at compile time there is no point in
// choosing VF greater than TC (as done in the loop below). Select maximum
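A quick numeric sketch of the new check (the vscale_range value is an assumption): with MaxVectorElementCount = vscale x 4 and a vscale_range minimum of 2, WidestRegisterMinEC is 8, so a loop whose constant trip count is at most 8 takes the clamp-to-trip-count path above.

#include <cstdio>

int main() {
  // Assumed: MaxVectorElementCount = vscale x 4, vscale_range minimum = 2.
  const unsigned KnownMinEC = 4, VScaleRangeMin = 2;
  unsigned WidestRegisterMinEC = KnownMinEC * VScaleRangeMin; // 8
  const unsigned ConstTripCount = 8;
  const bool FoldTailByMasking = false;
  bool ClampToTripCount =
      ConstTripCount != 0 && ConstTripCount <= WidestRegisterMinEC &&
      (!FoldTailByMasking || (ConstTripCount & (ConstTripCount - 1)) == 0);
  std::printf("clamp VF to trip count: %s\n", ClampToTripCount ? "yes" : "no");
}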
@@ -5163,7 +5228,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidth(RegKind))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
+ PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
@@ -5208,7 +5273,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return MaxVF;
}
-Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
+std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
auto Min = Attr.getVScaleRangeMin();
@@ -5244,11 +5309,11 @@ bool LoopVectorizationCostModel::isMoreProfitable(
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
- if (Optional<unsigned> VScale = getVScaleForTuning()) {
+ if (std::optional<unsigned> VScale = getVScaleForTuning()) {
if (A.Width.isScalable())
- EstimatedWidthA *= VScale.value();
+ EstimatedWidthA *= *VScale;
if (B.Width.isScalable())
- EstimatedWidthB *= VScale.value();
+ EstimatedWidthB *= *VScale;
}
// Assume vscale may be larger than 1 (or the value being tuned for),
@@ -5294,7 +5359,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
#ifndef NDEBUG
unsigned AssumedMinimumVscale = 1;
- if (Optional<unsigned> VScale = getVScaleForTuning())
+ if (std::optional<unsigned> VScale = getVScaleForTuning())
AssumedMinimumVscale = *VScale;
unsigned Width =
Candidate.Width.isScalable()
@@ -5365,7 +5430,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
raw_string_ostream OS(OutString);
assert(!Subset.empty() && "Unexpected empty range");
OS << "Instruction with invalid costs prevented vectorization at VF=(";
- for (auto &Pair : Subset)
+ for (const auto &Pair : Subset)
OS << (Pair.second == Subset.front().second ? "" : ", ")
<< Pair.second;
OS << "):";
@@ -5403,12 +5468,12 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
if (any_of(L.getHeader()->phis(),
- [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
+ [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
return false;
// Phis with uses outside of the loop require special handling and are
// currently unsupported.
- for (auto &Entry : Legal->getInductionVars()) {
+ for (const auto &Entry : Legal->getInductionVars()) {
// Look for uses of the value of the induction at the last iteration.
Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
for (User *U : PostInc->users())
@@ -5420,14 +5485,6 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
return false;
}
- // Induction variables that are widened require special handling that is
- // currently not supported.
- if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
- return !(this->isScalarAfterVectorization(Entry.first, VF) ||
- this->isProfitableToScalarize(Entry.first, VF));
- }))
- return false;
-
// Epilogue vectorization code has not been audited to ensure it handles
// non-latch exits properly. It may be fine, but it needs to be audited and
// tested.
@@ -5443,6 +5500,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// as register pressure, code size increase and cost of extra branches into
// account. For now we apply a very crude heuristic and only consider loops
// with vectorization factors larger than a certain value.
+
+ // Allow the target to opt out entirely.
+ if (!TTI.preferEpilogueVectorization())
+ return false;
+
// We also consider epilogue vectorization unprofitable for targets that don't
// consider interleaving beneficial (e.g., MVE).
if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
@@ -5512,7 +5574,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
ElementCount EstimatedRuntimeVF = MainLoopVF;
if (MainLoopVF.isScalable()) {
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
- if (Optional<unsigned> VScale = getVScaleForTuning())
+ if (std::optional<unsigned> VScale = getVScaleForTuning())
EstimatedRuntimeVF *= *VScale;
}
@@ -5542,7 +5604,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
// Reset MaxWidth so that we can find the smallest type used by recurrences
// in the loop.
MaxWidth = -1U;
- for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+ for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
// When finding the min width used by the recurrence we need to account
// for casts on the input operands of the recurrence.
@@ -5554,9 +5616,9 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
} else {
for (Type *T : ElementTypesInLoop) {
MinWidth = std::min<unsigned>(
- MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
MaxWidth = std::max<unsigned>(
- MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
}
}
return {MinWidth, MaxWidth};
@@ -5605,8 +5667,9 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
}
}
-unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
- unsigned LoopCost) {
+unsigned
+LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
+ InstructionCost LoopCost) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
@@ -5642,9 +5705,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
- InstructionCost C = expectedCost(VF).first;
- assert(C.isValid() && "Expected to have chosen a VF with valid cost");
- LoopCost = *C.getValue();
+ LoopCost = expectedCost(VF).first;
+ assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
// Loop body is free and there is no need for interleaving.
if (LoopCost == 0)
@@ -5772,8 +5834,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.
- unsigned SmallIC =
- std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+ unsigned SmallIC = std::min(
+ IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
// Interleave until store/load ports (estimated by max interleave count) are
// saturated.
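As a rough, standalone illustration of the small-loop clamp above (assuming SmallLoopCost keeps its usual default of 20; this is not the full heuristic):

#include <algorithm>
#include <cstdio>

// Largest power of two <= X (X > 0), mirroring what PowerOf2Floor computes.
static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return P;
}

int main() {
  const unsigned SmallLoopCost = 20; // assumed default of -small-loop-cost
  const unsigned IC = 8;             // interleave count picked so far
  for (unsigned LoopCost : {1u, 3u, 7u, 20u}) {
    unsigned SmallIC = std::min(IC, powerOf2Floor(SmallLoopCost / LoopCost));
    std::printf("LoopCost=%2u -> SmallIC=%u\n", LoopCost, SmallIC);
  }
}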
@@ -5888,8 +5950,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
SmallPtrSet<Instruction *, 8> Ends;
- // Saves the list of values that are used in the loop but are
- // defined outside the loop, such as arguments and constants.
+ // Saves the list of values that are used in the loop but are defined outside
+ // the loop (not including non-instruction values such as arguments and
+ // constants).
SmallPtrSet<Value *, 8> LoopInvariants;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
@@ -5901,6 +5964,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
auto *Instr = dyn_cast<Instruction>(U);
// Ignore non-instruction values such as arguments, constants, etc.
+ // FIXME: Might need some motivation as to why these values are ignored. If,
+ // for example, an argument is used inside the loop, it will increase the
+ // register pressure (so shouldn't we add it to LoopInvariants?).
if (!Instr)
continue;
@@ -5956,44 +6022,44 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
- // Count the number of live intervals.
+ // Count the number of registers used, per register class, given all open
+ // intervals.
+ // Note that elements in this SmallMapVector will be default constructed
+ // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
+ // there is no previous entry for ClassID.
SmallMapVector<unsigned, unsigned, 4> RegUsage;
if (VFs[j].isScalar()) {
- for (auto Inst : OpenIntervals) {
- unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = 1;
- else
- RegUsage[ClassID] += 1;
+ for (auto *Inst : OpenIntervals) {
+ unsigned ClassID =
+ TTI.getRegisterClassForType(false, Inst->getType());
+ // FIXME: The target might use more than one register for the type
+ // even in the scalar case.
+ RegUsage[ClassID] += 1;
}
} else {
collectUniformsAndScalars(VFs[j]);
- for (auto Inst : OpenIntervals) {
+ for (auto *Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
if (VecValuesToIgnore.count(Inst))
continue;
if (isScalarAfterVectorization(Inst, VFs[j])) {
- unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = 1;
- else
- RegUsage[ClassID] += 1;
+ unsigned ClassID =
+ TTI.getRegisterClassForType(false, Inst->getType());
+ // FIXME: The target might use more than one register for the type
+ // even in the scalar case.
+ RegUsage[ClassID] += 1;
} else {
- unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
- else
- RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
+ unsigned ClassID =
+ TTI.getRegisterClassForType(true, Inst->getType());
+ RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
}
}
}
for (auto& pair : RegUsage) {
- if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
- MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
- else
- MaxUsages[j][pair.first] = pair.second;
+ auto &Entry = MaxUsages[j][pair.first];
+ Entry = std::max(Entry, pair.second);
}
}
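The default-construction behaviour the new comments rely on is ordinary map-like operator[] semantics; a small standalone example (using std::map as a stand-in for SmallMapVector) shows why no explicit find/insert dance is needed:

#include <cstdio>
#include <map>

int main() {
  std::map<unsigned, unsigned> RegUsage; // stand-in for SmallMapVector
  // operator[] value-initializes a missing entry to 0, so "+=" is safe even
  // for a ClassID that has never been seen before.
  RegUsage[0] += 1; // first use of class 0 -> becomes 1
  RegUsage[0] += 2; // accumulates -> 3
  RegUsage[3] += 4; // first use of class 3 -> becomes 4
  for (const auto &[ClassID, N] : RegUsage)
    std::printf("class %u -> %u registers\n", ClassID, N);
}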
@@ -6005,17 +6071,19 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
}
for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ // Note that elements in this SmallMapVector will be default constructed
+ // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
+ // there is no previous entry for ClassID.
SmallMapVector<unsigned, unsigned, 4> Invariant;
- for (auto Inst : LoopInvariants) {
+ for (auto *Inst : LoopInvariants) {
+ // FIXME: The target might use more than one register for the type
+ // even in the scalar case.
unsigned Usage =
VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
unsigned ClassID =
TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
- if (Invariant.find(ClassID) == Invariant.end())
- Invariant[ClassID] = Usage;
- else
- Invariant[ClassID] += Usage;
+ Invariant[ClassID] += Usage;
}
LLVM_DEBUG({
@@ -6054,7 +6122,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
+ assert((isPredicatedInst(I)) &&
"Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
@@ -6099,7 +6167,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
}
}
-int LoopVectorizationCostModel::computePredInstDiscount(
+InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
@@ -6173,13 +6241,14 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
- APInt::getAllOnes(VF.getFixedValue()), true, false);
+ APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
+ /*Extract*/ false, CostKind);
ScalarCost +=
- VF.getFixedValue() *
- TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+ VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
// Compute the scalarization overhead of needed extractelement
@@ -6195,7 +6264,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
else if (needsExtract(J, VF)) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(J->getType(), VF)),
- APInt::getAllOnes(VF.getFixedValue()), false, true);
+ APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
+ /*Extract*/ true, CostKind);
}
}
@@ -6208,7 +6278,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
ScalarCosts[I] = ScalarCost;
}
- return *Discount.getValue();
+ return Discount;
}
LoopVectorizationCostModel::VectorizationCostTy
@@ -6324,19 +6394,20 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
const Align Alignment = getLoadStoreAlignment(I);
- Cost += VF.getKnownMinValue() *
- TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
- AS, TTI::TCK_RecipThroughput);
+ Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
+ ValTy->getScalarType(),
+ Alignment, AS, CostKind);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF);
+ Cost += getScalarizationOverhead(I, VF, CostKind);
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
- if (isPredicatedInst(I, VF)) {
+ if (isPredicatedInst(I)) {
Cost /= getReciprocalPredBlockProb();
// Add the cost of an i1 extract and a branch
@@ -6344,8 +6415,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
Cost += TTI.getScalarizationOverhead(
Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
- /*Insert=*/false, /*Extract=*/true);
- Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+ /*Insert=*/false, /*Extract=*/true, CostKind);
+ Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
if (useEmulatedMaskMemRefHack(I, VF))
// Artificially setting to a high enough value to practically disable
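A back-of-the-envelope sketch of the scaling above, in plain C++. The block probability of 1/2 (getReciprocalPredBlockProb() == 2) and the per-lane costs are assumptions for illustration only:

#include <cstdio>

int main() {
  const unsigned VF = 4;
  const unsigned ScalarMemOpCost = 1;         // assumed per-lane load/store cost
  const unsigned ScalarizationOverhead = 4;   // assumed insert/extract cost
  const unsigned ReciprocalPredBlockProb = 2; // predicated block taken ~50%

  unsigned Cost = VF * ScalarMemOpCost + ScalarizationOverhead;
  // The predicated block is not executed for every lane: scale by probability.
  Cost /= ReciprocalPredBlockProb;
  // Plus an i1 extract and a branch per lane (each assumed to cost 1).
  Cost += VF * (1 + 1);
  std::printf("estimated scalarized predicated cost: %u\n", Cost);
}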
@@ -6370,17 +6441,19 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
"Stride should be 1 or -1 for consecutive memory access");
const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
- if (Legal->isMaskRequired(I))
+ if (Legal->isMaskRequired(I)) {
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
CostKind);
- else
+ } else {
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
- CostKind, I);
+ CostKind, OpInfo, I);
+ }
bool Reverse = ConsecutiveStride < 0;
if (Reverse)
- Cost +=
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+ std::nullopt, CostKind, 0);
return Cost;
}
@@ -6409,7 +6482,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
- VF.getKnownMinValue() - 1));
+ CostKind, VF.getKnownMinValue() - 1));
}
InstructionCost
@@ -6437,6 +6510,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
@@ -6456,25 +6530,26 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
(isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
- AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
+ AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
assert(!Legal->isMaskRequired(I) &&
"Reverse masked interleaved access not supported.");
- Cost +=
- Group->getNumMembers() *
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+ std::nullopt, CostKind, 0);
}
return Cost;
}
-Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
+std::optional<InstructionCost>
+LoopVectorizationCostModel::getReductionPatternCost(
Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
- return None;
+ return std::nullopt;
auto *VectorTy = cast<VectorType>(Ty);
// We are looking for a pattern of, and finding the minimal acceptable cost:
@@ -6492,20 +6567,19 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
Instruction *RetI = I;
if (match(RetI, m_ZExtOrSExt(m_Value()))) {
if (!RetI->hasOneUser())
- return None;
+ return std::nullopt;
RetI = RetI->user_back();
}
- if (match(RetI, m_Mul(m_Value(), m_Value())) &&
+
+ if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
RetI->user_back()->getOpcode() == Instruction::Add) {
- if (!RetI->hasOneUser())
- return None;
RetI = RetI->user_back();
}
// Test if the found instruction is a reduction, and if not return an invalid
// cost specifying the parent to use the original cost modelling.
if (!InLoopReductionImmediateChains.count(RetI))
- return None;
+ return std::nullopt;
// Find the reduction this chain is a part of and calculate the basic cost of
// the reduction on its own.
@@ -6541,7 +6615,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
Instruction *Op0, *Op1;
- if (RedOp &&
+ if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
match(RedOp,
m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
match(Op0, m_ZExtOrSExt(m_Value())) &&
@@ -6550,7 +6624,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
!TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
(Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
- // Matched reduce(ext(mul(ext(A), ext(B)))
+ // Matched reduce.add(ext(mul(ext(A), ext(B))))
// Note that the extend opcodes need to all match, or if A==B they will have
// been converted to zext(mul(sext(A), sext(A))) as it is known positive,
// which is equally fine.
@@ -6567,9 +6641,8 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
TTI::CastContextHint::None, CostKind, RedOp);
- InstructionCost RedCost = TTI.getExtendedAddReductionCost(
- /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
- CostKind);
+ InstructionCost RedCost = TTI.getMulAccReductionCost(
+ IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
if (RedCost.isValid() &&
RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
@@ -6579,16 +6652,16 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
// Matched reduce(ext(A))
bool IsUnsigned = isa<ZExtInst>(RedOp);
auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
- InstructionCost RedCost = TTI.getExtendedAddReductionCost(
- /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
- CostKind);
+ InstructionCost RedCost = TTI.getExtendedReductionCost(
+ RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+ RdxDesc.getFastMathFlags(), CostKind);
InstructionCost ExtCost =
TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
TTI::CastContextHint::None, CostKind, RedOp);
if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
return I == RetI ? RedCost : 0;
- } else if (RedOp &&
+ } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
if (match(Op0, m_ZExtOrSExt(m_Value())) &&
Op0->getOpcode() == Op1->getOpcode() &&
@@ -6601,7 +6674,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
: Op0Ty;
auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
- // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
+ // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
// different sizes. We take the largest type as the ext to reduce, and add
// the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
InstructionCost ExtCost0 = TTI.getCastInstrCost(
@@ -6613,9 +6686,8 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
- InstructionCost RedCost = TTI.getExtendedAddReductionCost(
- /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
- CostKind);
+ InstructionCost RedCost = TTI.getMulAccReductionCost(
+ IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
InstructionCost ExtraExtCost = 0;
if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
@@ -6629,20 +6701,19 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
(RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
return I == RetI ? RedCost : 0;
} else if (!match(I, m_ZExtOrSExt(m_Value()))) {
- // Matched reduce(mul())
+ // Matched reduce.add(mul())
InstructionCost MulCost =
TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
- InstructionCost RedCost = TTI.getExtendedAddReductionCost(
- /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
- CostKind);
+ InstructionCost RedCost = TTI.getMulAccReductionCost(
+ true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
if (RedCost.isValid() && RedCost < MulCost + BaseCost)
return I == RetI ? RedCost : 0;
}
}
- return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
+ return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
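The reduce.add(ext(mul(ext(A), ext(B)))) shape being costed here corresponds to an ordinary widening multiply-accumulate loop in source code. A minimal example of the kind of scalar loop that produces it (illustrative only):

#include <cstdint>
#include <cstdio>

// Widening dot product: i8 operands are extended, multiplied, and the i32
// products are accumulated. Vectorized, this is reduce.add(ext(mul(ext,ext))).
static int32_t dot(const int8_t *A, const int8_t *B, int N) {
  int32_t Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += static_cast<int32_t>(A[I]) * static_cast<int32_t>(B[I]);
  return Sum;
}

int main() {
  int8_t A[] = {1, 2, 3, 4};
  int8_t B[] = {5, 6, 7, 8};
  std::printf("%d\n", static_cast<int>(dot(A, B, 4))); // 5 + 12 + 21 + 32 = 70
}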
InstructionCost
@@ -6655,9 +6726,10 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
- TTI::TCK_RecipThroughput, I);
+ TTI::TCK_RecipThroughput, OpInfo, I);
}
return getWideningCost(I, VF);
}
@@ -6705,9 +6777,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return VectorizationCostTy(C, TypeNotScalarized);
}
-InstructionCost
-LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
- ElementCount VF) const {
+InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
+ Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
// There is no mechanism yet to create a scalable scalarization loop,
// so this is currently Invalid.
@@ -6722,8 +6793,9 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(
- cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
- false);
+ cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
+ /*Insert*/ true,
+ /*Extract*/ false, CostKind);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6743,7 +6815,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
for (auto *V : filterExtractingOperands(Ops, VF))
Tys.push_back(MaybeVectorizeType(V->getType(), VF));
return Cost + TTI.getOperandsScalarizationOverhead(
- filterExtractingOperands(Ops, VF), Tys);
+ filterExtractingOperands(Ops, VF), Tys, CostKind);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
@@ -6765,29 +6837,47 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
NumPredStores++;
if (Legal->isUniformMemOp(I)) {
- // Lowering story for uniform memory ops is currently a bit complicated.
- // Scalarization works for everything which isn't a store with scalable
- // VF. Fixed len VFs just scalarize and then DCE later; scalarization
- // knows how to handle uniform-per-part values (i.e. the first lane
- // in each unrolled VF) and can thus handle scalable loads too. For
- // scalable stores, we use a scatter if legal. If not, we have no way
- // to lower (currently) and thus have to abort vectorization.
- if (isa<StoreInst>(&I) && VF.isScalable()) {
- if (isLegalGatherOrScatter(&I, VF))
- setWideningDecision(&I, VF, CM_GatherScatter,
- getGatherScatterCost(&I, VF));
- else
- // Error case, abort vectorization
- setWideningDecision(&I, VF, CM_Scalarize,
- InstructionCost::getInvalid());
- continue;
- }
+ auto isLegalToScalarize = [&]() {
+ if (!VF.isScalable())
+ // Scalarization of fixed length vectors "just works".
+ return true;
+
+ // We have dedicated lowering for unpredicated uniform loads and
+ // stores. Note that even with tail folding we know that at least
+ // one lane is active (i.e. generalized predication is not possible
+ // here), and the logic below depends on this fact.
+ if (!foldTailByMasking())
+ return true;
+
+ // For scalable vectors, a uniform memop load is always
+ // uniform-by-parts and we know how to scalarize that.
+ if (isa<LoadInst>(I))
+ return true;
+
+ // A uniform store isn't necessarily uniform-by-parts
+ // and we can't assume scalarization.
+ auto &SI = cast<StoreInst>(I);
+ return TheLoop->isLoopInvariant(SI.getValueOperand());
+ };
+
+ const InstructionCost GatherScatterCost =
+ isLegalGatherOrScatter(&I, VF) ?
+ getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
+
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
- // TODO: Avoid replicating loads and stores instead of relying on
- // instcombine to remove them.
- setWideningDecision(&I, VF, CM_Scalarize,
- getUniformMemOpCost(&I, VF));
+ // FIXME: This cost is a significant under-estimate for tail folded
+ // memory ops.
+ const InstructionCost ScalarizationCost = isLegalToScalarize() ?
+ getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
+
+ // Choose the better solution for the current VF. Note that Invalid
+ // costs compare as maximally large. If both are invalid, we get an
+ // Invalid cost, which signals a failure and a vectorization abort.
+ if (GatherScatterCost < ScalarizationCost)
+ setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
+ else
+ setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
continue;
}
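The "Invalid compares as maximally large" behaviour relied on above can be pictured with a tiny stand-in for InstructionCost (this is not the LLVM class, just a sketch of the comparison semantics):

#include <cstdio>
#include <limits>

// An invalid cost compares greater than any valid cost, so picking the
// smaller side never selects an invalid option unless both are invalid.
struct Cost {
  unsigned Value;
  bool Valid;
  static Cost invalid() { return {std::numeric_limits<unsigned>::max(), false}; }
};
static bool operator<(const Cost &L, const Cost &R) { return L.Value < R.Value; }

int main() {
  Cost GatherScatter = Cost::invalid(); // e.g. target has no gather/scatter
  Cost Scalarize = {6, true};
  bool UseGatherScatter = GatherScatter < Scalarize;
  const Cost &Chosen = UseGatherScatter ? GatherScatter : Scalarize;
  if (!Chosen.Valid)
    std::printf("both options invalid -> abort vectorization\n");
  else
    std::printf("chose %s (cost %u)\n",
                UseGatherScatter ? "gather/scatter" : "scalarize", Chosen.Value);
}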
@@ -6982,7 +7072,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (
TTI.getScalarizationOverhead(
- Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
+ Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert*/ false, /*Extract*/ true, CostKind) +
(TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
// The back-edge branch will remain, as will all scalar branches.
@@ -6998,11 +7089,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
auto *Phi = cast<PHINode>(I);
// First-order recurrences are replaced by vector shuffles inside the loop.
- // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
- if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
- return TTI.getShuffleCost(
- TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
- None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
+ if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
+ SmallVector<int> Mask(VF.getKnownMinValue());
+ std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
+ return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
+ cast<VectorType>(VectorTy), Mask, CostKind,
+ VF.getKnownMinValue() - 1);
+ }
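For a concrete feel of the splice mask built above: with VF = 4 the std::iota call produces {3, 4, 5, 6}, i.e. the last lane of the previous vector followed by the first three lanes of the current one. A standalone sketch:

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const unsigned VF = 4; // known minimum number of lanes
  std::vector<int> Mask(VF);
  // Same construction as in the cost model: indices VF-1, VF, VF+1, ...
  std::iota(Mask.begin(), Mask.end(), static_cast<int>(VF) - 1);
  for (int Idx : Mask)
    std::printf("%d ", Idx); // prints: 3 4 5 6
  std::printf("\n");
}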
// Phi nodes in non-header blocks (not inductions, reductions, etc.) are
// converted into select instructions. We require N - 1 selects per phi
@@ -7020,34 +7113,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
- // If we have a predicated instruction, it may not be executed for each
- // vector lane. Get the scalarization cost and scale this amount by the
- // probability of executing the predicated block. If the instruction is not
- // predicated, we fall through to the next case.
- if (VF.isVector() && isScalarWithPredication(I, VF)) {
- InstructionCost Cost = 0;
-
- // These instructions have a non-void type, so account for the phi nodes
- // that we will create. This cost is likely to be zero. The phi node
- // cost, if any, should be scaled by the block probability because it
- // models a copy at the end of each predicated block.
- Cost += VF.getKnownMinValue() *
- TTI.getCFInstrCost(Instruction::PHI, CostKind);
-
- // The cost of the non-predicated instruction.
- Cost += VF.getKnownMinValue() *
- TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
-
- // The cost of insertelement and extractelement instructions needed for
- // scalarization.
- Cost += getScalarizationOverhead(I, VF);
-
- // Scale the cost by the probability of executing the predicated blocks.
- // This assumes the predicated block for each vector lane is equally
- // likely.
- return Cost / getReciprocalPredBlockProb();
+ if (VF.isVector() && isPredicatedInst(I)) {
+ const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
+ return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
+ ScalarCost : SafeDivisorCost;
}
- LLVM_FALLTHROUGH;
+ // We've proven all lanes safe to speculate; fall through.
+ [[fallthrough]];
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -7073,22 +7145,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this is shifts on x86.
Value *Op2 = I->getOperand(1);
- TargetTransformInfo::OperandValueProperties Op2VP;
- TargetTransformInfo::OperandValueKind Op2VK =
- TTI.getOperandInfo(Op2, Op2VP);
- if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
- Op2VK = TargetTransformInfo::OK_UniformValue;
+ auto Op2Info = TTI.getOperandInfo(Op2);
+ if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
return TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
- Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+ I->getOpcode(), VectorTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ Op2Info, Operands, I);
}
case Instruction::FNeg: {
return TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None, I->getOperand(0), I);
+ I->getOpcode(), VectorTy, CostKind,
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+ I->getOperand(0), I);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
@@ -7101,17 +7173,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
// select x, y, false --> x & y
// select x, true, y --> x | y
- TTI::OperandValueProperties Op1VP = TTI::OP_None;
- TTI::OperandValueProperties Op2VP = TTI::OP_None;
- TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
- TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
+ const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
+ const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
assert(Op0->getType()->getScalarSizeInBits() == 1 &&
Op1->getType()->getScalarSizeInBits() == 1);
SmallVector<const Value *, 2> Operands{Op0, Op1};
return TTI.getArithmeticInstrCost(
match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
- CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
+ CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
}
Type *CondTy = SI->getCondition()->getType();
@@ -7153,7 +7223,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::BitCast:
if (I->getType()->isPointerTy())
return 0;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -7262,7 +7332,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// the result would need to be a vector of pointers.
if (VF.isScalable())
return InstructionCost::getInvalid();
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
default:
// This opcode is unknown. Assume that it is the same as 'mul'.
return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
@@ -7276,7 +7346,6 @@ static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
@@ -7317,14 +7386,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore type-promoting instructions we identified during reduction
// detection.
- for (auto &Reduction : Legal->getReductionVars()) {
+ for (const auto &Reduction : Legal->getReductionVars()) {
const RecurrenceDescriptor &RedDes = Reduction.second;
const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
// detection.
- for (auto &Induction : Legal->getInductionVars()) {
+ for (const auto &Induction : Legal->getInductionVars()) {
const InductionDescriptor &IndDes = Induction.second;
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
@@ -7332,7 +7401,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
void LoopVectorizationCostModel::collectInLoopReductions() {
- for (auto &Reduction : Legal->getReductionVars()) {
+ for (const auto &Reduction : Legal->getReductionVars()) {
PHINode *Phi = Reduction.first;
const RecurrenceDescriptor &RdxDesc = Reduction.second;
@@ -7394,7 +7463,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
if (UserVF.isZero()) {
VF = ElementCount::getFixed(determineVPlanVF(
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
- .getFixedSize(),
+ .getFixedValue(),
CM));
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7425,12 +7494,12 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
return VectorizationFactor::Disabled();
}
-Optional<VectorizationFactor>
+std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
- return None;
+ return std::nullopt;
// Invalidate interleave groups if all blocks of the loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
@@ -7550,9 +7619,26 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
InnerLoopVectorizer &ILV,
DominatorTree *DT,
bool IsEpilogueVectorization) {
+ assert(BestVPlan.hasVF(BestVF) &&
+ "Trying to execute plan with unsupported VF");
+ assert(BestVPlan.hasUF(BestUF) &&
+ "Trying to execute plan with unsupported UF");
+
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
<< '\n');
+ // Workaround! Compute the trip count of the original loop and cache it
+ // before we start modifying the CFG. This code has a systemic problem
+ // wherein it tries to run analysis over partially constructed IR; this is
+ // wrong, and not simply for SCEV. The trip count of the original loop
+ // simply happens to be prone to hitting this in practice. In theory, we
+ // can hit the same issue for any SCEV, or ValueTracking query done during
+ // mutation. See PR49900.
+ ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
+
+ if (!IsEpilogueVectorization)
+ VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+
// Perform the actual loop transformation.
// 1. Set up the skeleton for vectorization, including vector pre-header and
@@ -7602,7 +7688,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// replace the vectorizer-specific hints below).
MDNode *OrigLoopID = OrigLoop->getLoopID();
- Optional<MDNode *> VectorizedLoopID =
+ std::optional<MDNode *> VectorizedLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});
@@ -7610,7 +7696,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
if (VectorizedLoopID)
- L->setLoopID(VectorizedLoopID.value());
+ L->setLoopID(*VectorizedLoopID);
else {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
@@ -7620,9 +7706,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
}
- // Disable runtime unrolling when vectorizing the epilogue loop.
- if (CanonicalIVStartValue)
- AddRuntimeUnrollDisableMetaData(L);
+ AddRuntimeUnrollDisableMetaData(L);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
@@ -7651,16 +7735,6 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
- MDNode *OrigLoopID = OrigLoop->getLoopID();
-
- // Workaround! Compute the trip count of the original loop and cache it
- // before we start modifying the CFG. This code has a systemic problem
- // wherein it tries to run analysis over partially constructed IR; this is
- // wrong, and not simply for SCEV. The trip count of the original loop
- // simply happens to be prone to hitting this in practice. In theory, we
- // can hit the same issue for any SCEV, or ValueTracking query done during
- // mutation. See PR49900.
- getOrCreateTripCount(OrigLoop->getLoopPreheader());
createVectorLoopSkeleton("");
// Generate the code to check the minimum iteration count of the vector
@@ -7691,11 +7765,11 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
// Skip induction resume value creation here because they will be created in
- // the second pass. If we created them here, they wouldn't be used anyway,
- // because the vplan in the second pass still contains the inductions from the
- // original loop.
+ // the second pass for the scalar loop. The induction resume values for the
+ // inductions in the epilogue loop are created before executing the plan for
+ // the epilogue loop.
- return {completeLoopSkeleton(OrigLoopID), nullptr};
+ return {completeLoopSkeleton(), nullptr};
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7779,7 +7853,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
- MDNode *OrigLoopID = OrigLoop->getLoopID();
createVectorLoopSkeleton("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
@@ -7825,31 +7898,40 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
DT->changeImmediateDominator(LoopExitBlock,
EPI.EpilogueIterationCountCheck);
- // Keep track of bypass blocks, as they feed start values to the induction
- // phis in the scalar loop preheader.
+ // Keep track of bypass blocks, as they feed start values to the induction and
+ // reduction phis in the scalar loop preheader.
if (EPI.SCEVSafetyCheck)
LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
if (EPI.MemSafetyCheck)
LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
- // The vec.epilog.iter.check block may contain Phi nodes from reductions which
- // merge control-flow from the latch block and the middle block. Update the
- // incoming values here and move the Phi into the preheader.
+ // The vec.epilog.iter.check block may contain Phi nodes from inductions or
+ // reductions which merge control-flow from the latch block and the middle
+ // block. Update the incoming values here and move the Phi into the preheader.
SmallVector<PHINode *, 4> PhisInBlock;
for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
PhisInBlock.push_back(&Phi);
for (PHINode *Phi : PhisInBlock) {
+ Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
Phi->replaceIncomingBlockWith(
VecEpilogueIterationCountCheck->getSinglePredecessor(),
VecEpilogueIterationCountCheck);
+
+ // If the phi doesn't have an incoming value from the
+ // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
+ // value and also those from other check blocks. This is needed for
+ // reduction phis only.
+ if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
+ return EPI.EpilogueIterationCountCheck == IncB;
+ }))
+ continue;
Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
if (EPI.SCEVSafetyCheck)
Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
if (EPI.MemSafetyCheck)
Phi->removeIncomingValue(EPI.MemSafetyCheck);
- Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
}
// Generate a resume induction for the vector epilogue and put it in the
@@ -7871,7 +7953,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
createInductionResumeValues({VecEpilogueIterationCountCheck,
EPI.VectorTripCount} /* AdditionalBypass */);
- return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
+ return {completeLoopSkeleton(), EPResumeVal};
}
BasicBlock *
@@ -8149,9 +8231,18 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
*PSE.getSE(), *OrigLoop, Range);
// Check if this is pointer induction. If so, build the recipe for it.
- if (auto *II = Legal->getPointerInductionDescriptor(Phi))
- return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
- *PSE.getSE());
+ if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
+ VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
+ *PSE.getSE());
+ assert(isa<SCEVConstant>(II->getStep()));
+ return new VPWidenPointerInductionRecipe(
+ Phi, Operands[0], Step, *II,
+ LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) {
+ return CM.isScalarAfterVectorization(Phi, VF);
+ },
+ Range));
+ }
return nullptr;
}
@@ -8188,12 +8279,8 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
VPlanPtr &Plan) {
// If all incoming values are equal, the incoming VPValue can be used directly
// instead of creating a new VPBlendRecipe.
- VPValue *FirstIncoming = Operands[0];
- if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
- return FirstIncoming == Inc;
- })) {
+ if (llvm::all_equal(Operands))
return Operands[0];
- }
unsigned NumIncoming = Phi->getNumIncomingValues();
// For in-loop reductions, we do not need to create an additional select.
@@ -8252,24 +8339,42 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- auto willWiden = [&](ElementCount VF) -> bool {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- // The following case may be scalarized depending on the VF.
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
- InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
- InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
- bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- return UseVectorIntrinsic || !NeedToScalarize;
- };
+ ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return nullptr;
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool ShouldUseVectorIntrinsic =
+ ID && LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) -> bool {
+ bool NeedToScalarize = false;
+ // Is it beneficial to perform intrinsic call compared to lib
+ // call?
+ InstructionCost CallCost =
+ CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost =
+ CM.getVectorIntrinsicCost(CI, VF);
+ return IntrinsicCost <= CallCost;
+ },
+ Range);
+ if (ShouldUseVectorIntrinsic)
+ return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
+
+ // Is it better to call a vectorized version of the function than to scalarize
+ // the call?
+ auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) -> bool {
+ // The following case may be scalarized depending on the VF.
+ // The flag shows whether we can use a regular Call for the vectorized
+ // version of the instruction.
+ bool NeedToScalarize = false;
+ CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ return !NeedToScalarize;
+ },
+ Range);
+ if (ShouldUseVectorCall)
+ return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
+ Intrinsic::not_intrinsic);
- ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
- return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
+ return nullptr;
}
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
@@ -8286,55 +8391,65 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
Range);
}
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
- ArrayRef<VPValue *> Operands) const {
- auto IsVectorizableOpcode = [](unsigned Opcode) {
- switch (Opcode) {
- case Instruction::Add:
- case Instruction::And:
- case Instruction::AShr:
- case Instruction::BitCast:
- case Instruction::FAdd:
- case Instruction::FCmp:
- case Instruction::FDiv:
- case Instruction::FMul:
- case Instruction::FNeg:
- case Instruction::FPExt:
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::FPTrunc:
- case Instruction::FRem:
- case Instruction::FSub:
- case Instruction::ICmp:
- case Instruction::IntToPtr:
- case Instruction::LShr:
- case Instruction::Mul:
- case Instruction::Or:
- case Instruction::PtrToInt:
- case Instruction::SDiv:
- case Instruction::Select:
- case Instruction::SExt:
- case Instruction::Shl:
- case Instruction::SIToFP:
- case Instruction::SRem:
- case Instruction::Sub:
- case Instruction::Trunc:
- case Instruction::UDiv:
- case Instruction::UIToFP:
- case Instruction::URem:
- case Instruction::Xor:
- case Instruction::ZExt:
- case Instruction::Freeze:
- return true;
+VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
+ ArrayRef<VPValue *> Operands,
+ VPBasicBlock *VPBB, VPlanPtr &Plan) {
+ switch (I->getOpcode()) {
+ default:
+ return nullptr;
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem: {
+ // If not provably safe, use a select to form a safe divisor before widening the
+ // div/rem operation itself. Otherwise fall through to general handling below.
+ if (CM.isPredicatedInst(I)) {
+ SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
+ VPValue *Mask = createBlockInMask(I->getParent(), Plan);
+ VPValue *One =
+ Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
+ auto *SafeRHS =
+ new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
+ I->getDebugLoc());
+ VPBB->appendRecipe(SafeRHS);
+ Ops[1] = SafeRHS;
+ return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
}
- return false;
+ LLVM_FALLTHROUGH;
+ }
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::AShr:
+ case Instruction::BitCast:
+ case Instruction::FAdd:
+ case Instruction::FCmp:
+ case Instruction::FDiv:
+ case Instruction::FMul:
+ case Instruction::FNeg:
+ case Instruction::FPExt:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::FPTrunc:
+ case Instruction::FRem:
+ case Instruction::FSub:
+ case Instruction::ICmp:
+ case Instruction::IntToPtr:
+ case Instruction::LShr:
+ case Instruction::Mul:
+ case Instruction::Or:
+ case Instruction::PtrToInt:
+ case Instruction::Select:
+ case Instruction::SExt:
+ case Instruction::Shl:
+ case Instruction::SIToFP:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::UIToFP:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::Freeze:
+ return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
};
-
- if (!IsVectorizableOpcode(I->getOpcode()))
- return nullptr;
-
- // Success: widen this instruction.
- return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}
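Conceptually, the safe-divisor path in tryToWiden above rewrites a lane-predicated division so that inactive lanes divide by 1 instead of a possibly-trapping divisor; the per-lane effect is sketched below in plain C++ (not the VPlan recipes themselves):

#include <cstdio>

// Per-lane view of the safe-divisor transform: where the mask is false the
// divisor is replaced by 1, so the whole vector division can be executed
// unconditionally without risking a divide-by-zero on inactive lanes.
int main() {
  const int VF = 4;
  int A[VF] = {10, 20, 30, 40};
  int B[VF] = {2, 0, 5, 0};
  bool Mask[VF] = {true, false, true, false}; // lanes where the div executes
  int Q[VF];
  for (int I = 0; I < VF; ++I) {
    int SafeB = Mask[I] ? B[I] : 1; // select(mask, b, 1)
    Q[I] = A[I] / SafeB;            // now safe for every lane
  }
  for (int I = 0; I < VF; ++I)
    std::printf("lane %d: %d\n", I, Mask[I] ? Q[I] : 0 /* result unused */);
}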
void VPRecipeBuilder::fixHeaderPhis() {
@@ -8354,9 +8469,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
- bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
- Range);
+ bool IsPredicated = CM.isPredicatedInst(I);
// Even if the instruction is not marked as uniform, there are certain
// intrinsic calls that can be effectively treated as such, so we check for
@@ -8396,11 +8509,12 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
// value. Avoid hoisting the insert-element which packs the scalar value into
// a vector value, as that happens iff all users use the vector value.
for (VPValue *Op : Recipe->operands()) {
- auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
+ auto *PredR =
+ dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
if (!PredR)
continue;
- auto *RepR =
- cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
+ auto *RepR = cast<VPReplicateRecipe>(
+ PredR->getOperand(0)->getDefiningRecipe());
assert(RepR->isPredicated() &&
"expected Replicate recipe to be predicated");
RepR->setAlsoPack(false);
@@ -8469,20 +8583,26 @@ VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
ArrayRef<VPValue *> Operands,
- VFRange &Range, VPlanPtr &Plan) {
+ VFRange &Range, VPBasicBlock *VPBB,
+ VPlanPtr &Plan) {
// First, check for specific widening recipes that deal with inductions, Phi
// nodes, calls and memory operations.
VPRecipeBase *Recipe;
if (auto Phi = dyn_cast<PHINode>(Instr)) {
if (Phi->getParent() != OrigLoop->getHeader())
return tryToBlend(Phi, Operands, Plan);
+
+ // Always record recipes for header phis. Later first-order recurrence phis
+ // can have earlier phis as incoming values.
+ recordRecipeOf(Phi);
+
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
return toVPRecipeResult(Recipe);
VPHeaderPHIRecipe *PhiRecipe = nullptr;
assert((Legal->isReductionVariable(Phi) ||
- Legal->isFirstOrderRecurrence(Phi)) &&
- "can only widen reductions and first-order recurrences here");
+ Legal->isFixedOrderRecurrence(Phi)) &&
+ "can only widen reductions and fixed-order recurrences here");
VPValue *StartV = Operands[0];
if (Legal->isReductionVariable(Phi)) {
const RecurrenceDescriptor &RdxDesc =
@@ -8493,13 +8613,21 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
CM.isInLoopReduction(Phi),
CM.useOrderedReductions(RdxDesc));
} else {
+ // TODO: Currently fixed-order recurrences are modeled as chains of
+ // first-order recurrences. If there are no users of the intermediate
+ // recurrences in the chain, the fixed order recurrence should be modeled
+ // directly, enabling more efficient codegen.
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
}
// Record the incoming value from the backedge, so we can add the incoming
// value from the backedge after all recipes have been created.
- recordRecipeOf(cast<Instruction>(
- Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
+ auto *Inc = cast<Instruction>(
+ Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+ auto RecipeIter = Ingredient2Recipe.find(Inc);
+ if (RecipeIter == Ingredient2Recipe.end())
+ recordRecipeOf(Inc);
+
PhisToFix.push_back(PhiRecipe);
return toVPRecipeResult(PhiRecipe);
}
@@ -8534,7 +8662,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
*SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
}
- return toVPRecipeResult(tryToWiden(Instr, Operands));
+ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8564,7 +8692,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
assert(
SinkTarget != FirstInst &&
"Must find a live instruction (at least the one feeding the "
- "first-order recurrence PHI) before reaching beginning of the block");
+ "fixed-order recurrence PHI) before reaching beginning of the block");
SinkTarget = SinkTarget->getPrevNode();
assert(SinkTarget != P.first &&
"sink source equals target, no sinking required");
@@ -8696,18 +8824,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// Mark instructions we'll need to sink later and their targets as
// ingredients whose recipe we'll need to record.
- for (auto &Entry : SinkAfter) {
+ for (const auto &Entry : SinkAfter) {
RecipeBuilder.recordRecipeOf(Entry.first);
RecipeBuilder.recordRecipeOf(Entry.second);
}
- for (auto &Reduction : CM.getInLoopReductionChains()) {
+ for (const auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurKind Kind =
Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
RecipeBuilder.recordRecipeOf(Phi);
- for (auto &R : ReductionOperations) {
+ for (const auto &R : ReductionOperations) {
RecipeBuilder.recordRecipeOf(R);
// For min/max reductions, where we have a pair of icmp/select, we also
// need to record the ICmp recipe, so it can be removed later.
@@ -8805,14 +8933,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
continue;
if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
- Instr, Operands, Range, Plan)) {
+ Instr, Operands, Range, VPBB, Plan)) {
// If Instr can be simplified to an existing VPValue, use it.
if (RecipeOrValue.is<VPValue *>()) {
auto *VPV = RecipeOrValue.get<VPValue *>();
Plan->addVPValue(Instr, VPV);
// If the re-used value is a recipe, register the recipe for the
// instruction, in case the recipe for Instr needs to be recorded.
- if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
+ if (VPRecipeBase *R = VPV->getDefiningRecipe())
RecipeBuilder.setRecipe(Instr, R);
continue;
}
@@ -8854,11 +8982,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
- HeaderVPBB->setName("vector.body");
-
- // Fold the last, empty block into its predecessor.
- VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
- assert(VPBB && "expected to fold last (empty) block");
// After here, VPBB should not be used.
VPBB = nullptr;
@@ -8888,7 +9011,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
return nullptr;
};
- for (auto &Entry : SinkAfter) {
+ for (const auto &Entry : SinkAfter) {
VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
@@ -8949,14 +9072,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
RecipeBuilder, Range.Start);
// Introduce a recipe to combine the incoming and previous values of a
- // first-order recurrence.
+ // fixed-order recurrence.
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
if (!RecurPhi)
continue;
- VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
+ VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
+ // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
+ // to terminate.
+ while (auto *PrevPhi =
+ dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
+ PrevRecipe = &PrevPhi->getBackedgeRecipe();
VPBasicBlock *InsertBlock = PrevRecipe->getParent();
auto *Region = GetReplicateRegion(PrevRecipe);
if (Region)
@@ -8983,7 +9111,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
- for (auto IG : InterleaveGroups) {
+ for (const auto *IG : InterleaveGroups) {
auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
@@ -9011,33 +9139,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- std::string PlanName;
- raw_string_ostream RSO(PlanName);
- ElementCount VF = Range.Start;
- Plan->addVF(VF);
- RSO << "Initial VPlan for VF={" << VF;
- for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
+ for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
+ VF *= 2)
Plan->addVF(VF);
- RSO << "," << VF;
- }
- RSO << "},UF>=1";
- RSO.flush();
- Plan->setName(PlanName);
+ Plan->setName("Initial VPlan");
// From this point onwards, VPlan-to-VPlan transformations may change the plan
// in ways that accessing values using original IR values is incorrect.
Plan->disableValue2VPValue();
VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
- VPlanTransforms::sinkScalarOperands(*Plan);
VPlanTransforms::removeDeadRecipes(*Plan);
- VPlanTransforms::mergeReplicateRegions(*Plan);
- VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
- // Fold Exit block into its predecessor if possible.
- // TODO: Fold block earlier once all VPlan transforms properly maintain a
- // VPBasicBlock as exit.
- VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
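+  // Apply the sinking and merging transforms repeatedly until a fixed point
+  // is reached, since each transform can enable the others.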
+ bool ShouldSimplify = true;
+ while (ShouldSimplify) {
+ ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
+ ShouldSimplify |=
+ VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
+ ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
+ }
+
+ VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
+ VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
@@ -9066,7 +9189,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
VPlanTransforms::VPInstructionsToVPRecipes(
OrigLoop, Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
- DeadInstructions, *PSE.getSE());
+ DeadInstructions, *PSE.getSE(), *TLI);
// Remove the existing terminator of the exiting block of the top-most region.
// A BranchOnCount will be added instead when adding the canonical IV recipes.
@@ -9087,7 +9210,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF) {
- for (auto &Reduction : CM.getInLoopReductionChains()) {
+ for (const auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars().find(Phi)->second;
@@ -9127,9 +9250,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
- auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
- ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
- : nullptr;
+ VPValue *CondOp = nullptr;
+ if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
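+        // Position the builder at the widened recipe so that any recipes
+        // created for the block-in mask are inserted before it.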
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(WidenRecipe->getParent(),
+ WidenRecipe->getIterator());
+ CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
+ }
if (IsFMulAdd) {
// If the instruction is a call to the llvm.fmuladd intrinsic then we
@@ -9179,7 +9306,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
VPValue *Red = PhiR->getBackedgeValue();
- assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
+ assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
}
@@ -9217,11 +9344,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPWidenCallRecipe::execute(VPTransformState &State) {
- State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
- *this, State);
-}
-
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
@@ -9353,8 +9475,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
- Value *Step = CreateStepValue(IndDesc.getStep(), SE,
- State.CFG.PrevBB->getTerminator());
+ Value *Step = State.get(getOperand(1), VPIteration(0, Part));
Value *SclrGep = emitTransformedIndex(
State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
SclrGep->setName("next.gep");
@@ -9378,12 +9499,9 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
// A pointer induction, performed by using a gep
- const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
- const SCEV *ScalarStep = IndDesc.getStep();
- SCEVExpander Exp(SE, DL, "induction");
- Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
+ Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
Value *NumUnrolledElems =
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
@@ -9411,6 +9529,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
StartOffset = State.Builder.CreateAdd(
StartOffset, State.Builder.CreateStepVector(VecPhiType));
+ assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
+ "scalar step must be the same across all parts");
Value *GEP = State.Builder.CreateGEP(
IndDesc.getElementType(), NewPointerPhi,
State.Builder.CreateMul(
@@ -9421,8 +9541,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
}
}
-void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
+void VPDerivedIVRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
@@ -9432,52 +9552,33 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
IndDesc.getInductionBinOp()->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
- auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
- auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
- if (!isCanonical() || CanonicalIV->getType() != Ty) {
- ScalarIV =
- Ty->isIntegerTy()
- ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
- : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
- ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
- getStartValue()->getLiveInIRValue(), Step,
- IndDesc);
- ScalarIV->setName("offset.idx");
- }
- if (TruncToTy) {
- assert(Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
- Step = State.Builder.CreateTrunc(Step, TruncToTy);
- }
- return ScalarIV;
- };
-
- Value *ScalarIV = CreateScalarIV(Step);
- if (State.VF.isVector()) {
- buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
- return;
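+  // Compute the derived IV by transforming the canonical IV using the start
+  // value and step of the original induction.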
+ Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
+ Value *DerivedIV =
+ emitTransformedIndex(State.Builder, CanonicalIV,
+ getStartValue()->getLiveInIRValue(), Step, IndDesc);
+ DerivedIV->setName("offset.idx");
+ if (ResultTy != DerivedIV->getType()) {
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
}
+ assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
- Value *EntryPart;
- if (Step->getType()->isFloatingPointTy()) {
- Value *StartIdx =
- getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
- // Floating-point operations inherit FMF via the builder's flags.
- Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
- EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
- ScalarIV, MulOp);
- } else {
- Value *StartIdx =
- getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
- EntryPart = State.Builder.CreateAdd(
- ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
- }
- State.set(this, EntryPart, Part);
- }
+ State.set(this, DerivedIV, VPIteration(0, 0));
+}
+
+void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
+ // Fast-math-flags propagate from the original induction instruction.
+ IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
+ if (IndDesc.getInductionBinOp() &&
+ isa<FPMathOperator>(IndDesc.getInductionBinOp()))
+ State.Builder.setFastMathFlags(
+ IndDesc.getInductionBinOp()->getFastMathFlags());
+
+ Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+
+ buildScalarSteps(BaseIV, Step, IndDesc, this, State);
}
void VPInterleaveRecipe::execute(VPTransformState &State) {
@@ -9536,9 +9637,10 @@ void VPReductionRecipe::execute(VPTransformState &State) {
}
void VPReplicateRecipe::execute(VPTransformState &State) {
+ Instruction *UI = getUnderlyingInstr();
if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+ State.ILV->scalarizeInstruction(UI, this, *State.Instance,
IsPredicated, State);
// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {
@@ -9546,7 +9648,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance->Lane.isFirstLane()) {
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
Value *Poison = PoisonValue::get(
- VectorType::get(getUnderlyingValue()->getType(), State.VF));
+ VectorType::get(UI->getType(), State.VF));
State.set(this, Poison, State.Instance->Part);
}
State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
@@ -9555,12 +9657,36 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
}
if (IsUniform) {
+ // If the recipe is uniform across all parts (instead of just per VF), only
+ // generate a single instance.
+ if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
+ all_of(operands(), [](VPValue *Op) {
+ return Op->isDefinedOutsideVectorRegions();
+ })) {
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
+ State);
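+      // Propagate the single generated value to the remaining parts so that
+      // users of this recipe find a value for every unrolled copy.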
+ if (user_begin() != user_end()) {
+ for (unsigned Part = 1; Part < State.UF; ++Part)
+ State.set(this, State.get(this, VPIteration(0, 0)),
+ VPIteration(Part, 0));
+ }
+ return;
+ }
+
// Uniform within VL means we need to generate lane 0 only for each
// unrolled copy.
for (unsigned Part = 0; Part < State.UF; ++Part)
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
- VPIteration(Part, 0), IsPredicated,
- State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
+ IsPredicated, State);
+ return;
+ }
+
+  // A store of a loop varying value to a loop invariant address only
+  // needs the last copy of the store.
+ if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
+ auto Lane = VPLane::getLastLaneForVF(State.VF);
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
+                                    IsPredicated, State);
return;
}
@@ -9569,9 +9695,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
const unsigned EndLane = State.VF.getKnownMinValue();
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
- VPIteration(Part, Lane), IsPredicated,
- State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
+ IsPredicated, State);
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
@@ -9709,7 +9834,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- LoopVectorizationLegality &LVL) {
+ LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
// (For PGSO, as shouldOptimizeForSize isn't currently accessible from
@@ -9744,7 +9869,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
};
// 4) if the TTI hook indicates this is profitable, request predication.
- if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
+ if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
return CM_ScalarEpilogueNotNeededUsePredicate;
return CM_ScalarEpilogueAllowed;
@@ -9770,15 +9895,14 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
return ScalarValue;
}
- auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
- bool IsUniform = RepR && RepR->isUniform();
+ bool IsUniform = vputils::isUniformAfterVectorization(Def);
unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, {Part, LastLane})) {
- // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
- assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
- isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
+    // At the moment, VPWidenIntOrFpInductionRecipes and
+    // VPScalarIVStepsRecipes can also be uniform.
+ assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
+ isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
@@ -9839,7 +9963,7 @@ static bool processLoopInVPlanNativePath(
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);
@@ -9927,7 +10051,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF,
- Optional<unsigned> VScale, Loop *L,
+ std::optional<unsigned> VScale, Loop *L,
ScalarEvolution &SE) {
InstructionCost CheckCost = Checks.getCost();
if (!CheckCost.isValid())
@@ -10075,7 +10199,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
- LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
+ LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
&Requirements, &Hints, DB, AC, BFI, PSI);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
@@ -10083,11 +10207,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Check the function attributes and profiles to find out if this function
- // should be optimized for size.
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
-
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// even evaluating whether vectorization is profitable. Since we cannot modify
@@ -10099,6 +10218,22 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(L->isInnermost() && "Inner loop expected.");
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved)
+ IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+
+ // Check the function attributes and profiles to find out if this function
+ // should be optimized for size.
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
+
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
auto ExpectedTC = getSmallBestKnownTC(*SE, L);
@@ -10109,15 +10244,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
- LLVM_DEBUG(dbgs() << "\n");
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
+ LLVM_DEBUG(dbgs() << "\n");
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ } else {
+ LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
+                             "small to be worth vectorizing.\n");
+ reportVectorizationFailure(
+            "The trip count is below the minimal threshold value.",
+ "loop trip count is too low, avoiding vectorization",
+ "LowTripCount", ORE, L);
+ Hints.emitRemarkWithHints();
+ return false;
+ }
}
}
- // Check the function attributes to see if implicit floats are allowed.
- // FIXME: This check doesn't seem possibly correct -- what if the loop is
- // an integer loop and the vector instructions selected are purely integer
- // vector instructions?
+ // Check the function attributes to see if implicit floats or vectors are
+ // allowed.
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
reportVectorizationFailure(
"Can't vectorize when the NoImplicitFloat attribute is used",
@@ -10162,18 +10306,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
- InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
-
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
- UseInterleaved = EnableInterleavedMemAccesses;
-
- // Analyze interleaved memory accesses.
- if (UseInterleaved) {
- IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
- }
-
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI);
@@ -10188,7 +10320,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned UserIC = Hints.getInterleave();
// Plan how to best vectorize, return the best VF and its cost.
- Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
+ std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
@@ -10198,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
- IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+ IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
unsigned SelectedIC = std::max(IC, UserIC);
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10360,16 +10492,39 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
- // Ensure that the start values for any VPReductionPHIRecipes are
- // updated before vectorising the epilogue loop.
+ // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
+ // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
+ // before vectorizing the epilogue loop.
for (VPRecipeBase &R : Header->phis()) {
+ if (isa<VPCanonicalIVPHIRecipe>(&R))
+ continue;
+
+ Value *ResumeV = nullptr;
+ // TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- if (auto *Resume = MainILV.getReductionResumeValue(
- ReductionPhi->getRecurrenceDescriptor())) {
- VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
- ReductionPhi->setOperand(0, StartVal);
+ ResumeV = MainILV.getReductionResumeValue(
+ ReductionPhi->getRecurrenceDescriptor());
+ } else {
+ // Create induction resume values for both widened pointer and
+ // integer/fp inductions and update the start value of the induction
+ // recipes to use the resume value.
+ PHINode *IndPhi = nullptr;
+ const InductionDescriptor *ID;
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
+ ID = &Ind->getInductionDescriptor();
+ } else {
+ auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
+ IndPhi = WidenInd->getPHINode();
+ ID = &WidenInd->getInductionDescriptor();
}
+
+ ResumeV = MainILV.createInductionResumeValue(
+ IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
}
+ assert(ResumeV && "Must have a resume value");
+ VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
+ cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
@@ -10407,11 +10562,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
checkMixedPrecision(L, ORE);
}
- Optional<MDNode *> RemainderLoopID =
+ std::optional<MDNode *> RemainderLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupEpilogue});
if (RemainderLoopID) {
- L->setLoopID(RemainderLoopID.value());
+ L->setLoopID(*RemainderLoopID);
} else {
if (DisableRuntimeUnroll)
AddRuntimeUnrollDisableMetaData(L);
@@ -10427,8 +10582,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizeResult LoopVectorizePass::runImpl(
Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
- DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
+ DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
LI = &LI_;
@@ -10436,9 +10590,8 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
DT = &DT_;
BFI = &BFI_;
TLI = TLI_;
- AA = &AA_;
AC = &AC_;
- GetLAA = &GetLAA_;
+ LAIs = &LAIs_;
DB = &DB_;
ORE = &ORE_;
PSI = PSI_;
@@ -10461,7 +10614,7 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
// legality and profitability checks. This means running the loop vectorizer
// will simplify all loops, regardless of whether anything end up being
// vectorized.
- for (auto &L : *LI)
+ for (const auto &L : *LI)
Changed |= CFGChanged |=
simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
@@ -10484,6 +10637,9 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
Changed |= formLCSSARecursively(*L, *DT, LI, SE);
Changed |= CFGChanged |= processLoop(L);
+
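+      // Cached LoopAccessInfo may be stale after the loop was transformed.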
+ if (Changed)
+ LAIs->clear();
}
// Process each loop nest in the function.
@@ -10502,23 +10658,16 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & {
- LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
- TLI, TTI, nullptr, nullptr, nullptr};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- };
+ LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
LoopVectorizeResult Result =
- runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
+ runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;