author    Dimitry Andric <dim@FreeBSD.org>    2023-07-26 19:03:47 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2023-07-26 19:04:23 +0000
commit    7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree      27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent    e3b557809604d036af6e00c60f012c2025b59a5e (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2346
1 file changed, 1181 insertions, 1165 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a28099d8ba7d..d7e40e8ef978 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -98,6 +98,7 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
@@ -120,8 +121,6 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
@@ -231,6 +230,25 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
"prefers tail-folding, don't attempt vectorization if "
"tail-folding fails.")));
+static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
+ "force-tail-folding-style", cl::desc("Force the tail folding style"),
+ cl::init(TailFoldingStyle::None),
+ cl::values(
+ clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
+ clEnumValN(
+ TailFoldingStyle::Data, "data",
+ "Create lane mask for data only, using active.lane.mask intrinsic"),
+ clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
+ "data-without-lane-mask",
+ "Create lane mask with compare/stepvector"),
+ clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
+ "Create lane mask using active.lane.mask intrinsic, and use "
+ "it for both data and control flow"),
+ clEnumValN(
+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
+ "data-and-control-without-rt-check",
+ "Similar to data-and-control, but remove the runtime check")));
+
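[Editor's note: the new option above can be driven from the opt tool once this change lands; the accepted values are exactly the clEnumValN names in the table. An illustrative invocation — the .ll file names are placeholders:

  opt -passes=loop-vectorize -force-tail-folding-style=data-and-control -S in.ll -o out.ll]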
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -338,10 +356,12 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
+namespace llvm {
cl::opt<bool> EnableVPlanNativePath(
- "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ "enable-vplan-native-path", cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
"support for outer loop vectorization."));
+}
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
@@ -419,9 +439,42 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
return std::nullopt;
}
+/// Return a vector containing interleaved elements from multiple
+/// smaller input vectors.
+static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
+ const Twine &Name) {
+ unsigned Factor = Vals.size();
+ assert(Factor > 1 && "Tried to interleave invalid number of vectors");
+
+ VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
+#ifndef NDEBUG
+ for (Value *Val : Vals)
+ assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
+#endif
+
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
+ // must use intrinsics to interleave.
+ if (VecTy->isScalableTy()) {
+ VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+ return Builder.CreateIntrinsic(
+ WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
+ /*FMFSource=*/nullptr, Name);
+ }
+
+ // Fixed length. Start by concatenating all vectors into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, Vals);
+
+ // Interleave the elements into the wide vector.
+ const unsigned NumElts = VecTy->getElementCount().getFixedValue();
+ return Builder.CreateShuffleVector(
+ WideVec, createInterleaveMask(NumElts, Factor), Name);
+}
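[Editor's note: to illustrate the two paths in the new helper, interleaving two <4 x i32> values concatenates and then shuffles with createInterleaveMask(4, 2), while two <vscale x 4 x i32> values go through the interleave2 intrinsic. Roughly the following IR — a sketch, value names invented:

  ; fixed-width path: one shuffle over the concatenated vector
  %il = shufflevector <8 x i32> %wide, <8 x i32> poison,
        <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>

  ; scalable path: only splat shuffles exist, so use the intrinsic
  %il = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(
            <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)]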
+
namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
+
+using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace
namespace llvm {
@@ -477,8 +530,10 @@ public:
/// loop and the start value for the canonical induction, if it is != 0. The
/// latter is the case when vectorizing the epilogue loop. In the case of
 /// epilogue vectorization, this function is overridden to handle the more
- /// complex control flow around the loops.
- virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
+ /// complex control flow around the loops. \p ExpandedSCEVs is used to
+ /// look up SCEV expansions for expressions needed during skeleton creation.
+ virtual std::pair<BasicBlock *, Value *>
+ createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
@@ -498,7 +553,7 @@ public:
/// Instr's operands.
void scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
- const VPIteration &Instance, bool IfPredicateInstr,
+ const VPIteration &Instance,
VPTransformState &State);
/// Construct the vector value of a scalarized value \p V one lane at a time.
@@ -513,7 +568,7 @@ public:
ArrayRef<VPValue *> VPDefs,
VPTransformState &State, VPValue *Addr,
ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask = nullptr);
+ VPValue *BlockInMask, bool NeedsMaskForGaps);
/// Fix the non-induction PHIs in \p Plan.
void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
@@ -522,28 +577,30 @@ public:
/// able to vectorize with strict in-order reductions for the given RdxDesc.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
- /// Create a broadcast instruction. This method generates a broadcast
- /// instruction (shuffle) for loop invariant values and for the induction
- /// value. If this is the induction variable then we extend it to N, N+1, ...
- /// this is needed because each iteration in the loop corresponds to a SIMD
- /// element.
- virtual Value *getBroadcastInstrs(Value *V);
-
// Returns the resume value (bc.merge.rdx) for a reduction as
// generated by fixReduction.
PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
/// Create a new phi node for the induction variable \p OrigPhi to resume
/// iteration count in the scalar epilogue, from where the vectorized loop
- /// left off. In cases where the loop skeleton is more complicated (eg.
- /// epilogue vectorization) and the resume values can come from an additional
- /// bypass block, the \p AdditionalBypass pair provides information about the
- /// bypass block and the end value on the edge from bypass to this loop.
+ /// left off. \p Step is the SCEV-expanded induction step to use. In cases
+ /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
+ /// and the resume values can come from an additional bypass block, the \p
+ /// AdditionalBypass pair provides information about the bypass block and the
+ /// end value on the edge from bypass to this loop.
PHINode *createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &ID,
+ PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+ /// Returns the original loop trip count.
+ Value *getTripCount() const { return TripCount; }
+
+ /// Used to set the trip count after ILV's construction and after the
+ /// preheader block has been executed. Note that this always holds the trip
+ /// count of the original loop for both main loop and epilogue vectorization.
+ void setTripCount(Value *TC) { TripCount = TC; }
+
protected:
friend class LoopVectorizationPlanner;
@@ -560,7 +617,7 @@ protected:
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *VectorTripCount, Value *EndValue,
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
- VPlan &Plan);
+ VPlan &Plan, VPTransformState &State);
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
@@ -573,10 +630,6 @@ protected:
/// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
- /// Clear NSW/NUW flags from reduction instructions if necessary.
- void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
- VPTransformState &State);
-
/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -585,9 +638,6 @@ protected:
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State);
- /// Returns (and creates if needed) the original loop trip count.
- Value *getOrCreateTripCount(BasicBlock *InsertBlock);
-
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
@@ -621,6 +671,7 @@ protected:
/// block, the \p AdditionalBypass pair provides information about the bypass
/// block and the end value on the edge from bypass to this loop.
void createInductionResumeValues(
+ const SCEV2ValueTy &ExpandedSCEVs,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
/// Complete the loop skeleton by adding debug MDs, creating appropriate
@@ -758,9 +809,6 @@ public:
ElementCount::getFixed(1),
ElementCount::getFixed(1), UnrollFactor, LVL, CM,
BFI, PSI, Check) {}
-
-private:
- Value *getBroadcastInstrs(Value *V) override;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -810,15 +858,16 @@ public:
// Override this function to handle the more complex control flow around the
// three loops.
- std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
- return createEpilogueVectorizedLoopSkeleton();
+ std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
+ const SCEV2ValueTy &ExpandedSCEVs) final {
+ return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
}
/// The interface for creating a vectorized skeleton using one of two
/// different strategies, each corresponding to one execution of the vplan
/// as described above.
virtual std::pair<BasicBlock *, Value *>
- createEpilogueVectorizedLoopSkeleton() = 0;
+ createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
/// Holds and updates state information required to vectorize the main loop
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -846,7 +895,8 @@ public:
EPI, LVL, CM, BFI, PSI, Check) {}
/// Implements the interface for creating a vectorized skeleton using the
 /// *main loop* strategy (i.e. the first pass of VPlan execution).
- std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
protected:
/// Emits an iteration count bypass check once for the main loop (when \p
@@ -876,7 +926,8 @@ public:
}
/// Implements the interface for creating a vectorized skeleton using the
 /// *epilogue loop* strategy (i.e. the second pass of VPlan execution).
- std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
protected:
/// Emits an iteration count bypass check after the main vector loop has
@@ -953,35 +1004,21 @@ namespace llvm {
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
- Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
- return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+ return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
- Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
- return VF.isScalable() ? B.CreateVScale(EC) : EC;
+ return B.CreateElementCount(Ty, VF);
}
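[Editor's note: both helpers now defer to IRBuilder::CreateElementCount, which folds to a plain constant for fixed VFs and to a multiple of llvm.vscale for scalable ones. For example, createStepForVF with Ty = i64, VF = <vscale x 4> and Step = 2 should emit roughly — a sketch:

  %vscale = call i64 @llvm.vscale.i64()
  %step   = mul i64 %vscale, 8        ; 4 (min lanes) x 2 (step)]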
-const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
+const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
+ Loop *OrigLoop) {
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
ScalarEvolution &SE = *PSE.getSE();
-
- // The exit count might have the type of i64 while the phi is i32. This can
- // happen if we have an induction variable that is sign extended before the
- // compare. The only way that we get a backedge taken count is that the
- // induction variable was signed and as such will not overflow. In such a case
- // truncation is legal.
- if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
- IdxTy->getPrimitiveSizeInBits())
- BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
- BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
-
- // Get the total trip count from the count by adding 1.
- return SE.getAddExpr(BackedgeTakenCount,
- SE.getOne(BackedgeTakenCount->getType()));
+ return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}
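[Editor's note: the hand-rolled truncate/extend-and-add-one sequence is replaced by ScalarEvolution::getTripCountFromExitCount, which performs the same derivation: for a backedge-taken count of (n - 1) in i32 and an i64 IdxTy, the returned SCEV is the equivalent of (zext i32 (n - 1) to i64) + 1, with the loop context letting SCEV reason about overflow.]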
static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
@@ -1062,11 +1099,17 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
continue;
// This recipe contributes to the address computation of a widen
- // load/store. Collect recipe if its underlying instruction has
- // poison-generating flags.
- Instruction *Instr = CurRec->getUnderlyingInstr();
- if (Instr && Instr->hasPoisonGeneratingFlags())
- State.MayGeneratePoisonRecipes.insert(CurRec);
+ // load/store. If the underlying instruction has poison-generating flags,
+ // drop them directly.
+ if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
+ RecWithFlags->dropPoisonGeneratingFlags();
+ } else {
+ Instruction *Instr = CurRec->getUnderlyingInstr();
+ (void)Instr;
+ assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
+ "found instruction with poison generating flags not covered by "
+ "VPRecipeWithIRFlags");
+ }
// Add new definitions to the worklist.
for (VPValue *operand : CurRec->operands())
@@ -1143,15 +1186,7 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedUsePredicate
};
-/// ElementCountComparator creates a total ordering for ElementCount
-/// for the purposes of using it in a set structure.
-struct ElementCountComparator {
- bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
- return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
- std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
- }
-};
-using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
+using InstructionVFPair = std::pair<Instruction *, ElementCount>;
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
@@ -1184,17 +1219,6 @@ public:
/// otherwise.
bool runtimeChecksRequired();
- /// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
- /// then this vectorization factor will be selected if vectorization is
- /// possible.
- VectorizationFactor
- selectVectorizationFactor(const ElementCountSet &CandidateVFs);
-
- VectorizationFactor
- selectEpilogueVectorizationFactor(const ElementCount MaxVF,
- const LoopVectorizationPlanner &LVP);
-
/// Setup cost-based decisions for user vectorization factor.
/// \return true if the UserVF is a feasible VF to be chosen.
bool selectUserVectorizationFactor(ElementCount UserVF) {
@@ -1278,11 +1302,17 @@ public:
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
- return Scalars->second.find(I) != Scalars->second.end();
+ return Scalars->second.contains(I);
}
/// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+ // Pseudo probe needs to be duplicated for each unrolled iteration and
+ // vector lane so that the profiled loop trip count can be accurately
+ // accumulated instead of being undercounted.
+ if (isa<PseudoProbeInst>(I))
+ return false;
+
if (VF.isScalar())
return true;
@@ -1316,7 +1346,7 @@ public:
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
- return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
+ return VF.isVector() && MinBWs.contains(I) &&
!isProfitableToScalarize(I, VF) &&
!isScalarAfterVectorization(I, VF);
}
@@ -1379,7 +1409,7 @@ public:
InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
assert(VF.isVector() && "Expected VF >=2");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
- assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
+ assert(WideningDecisions.contains(InstOnVF) &&
"The cost is not calculated");
return WideningDecisions[InstOnVF].second;
}
@@ -1419,7 +1449,7 @@ public:
/// that may be vectorized as interleave, gather-scatter or scalarized.
void collectUniformsAndScalars(ElementCount VF) {
// Do the analysis once.
- if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
+ if (VF.isScalar() || Uniforms.contains(VF))
return;
setCostBasedWideningDecision(VF);
collectLoopUniforms(VF);
@@ -1442,8 +1472,7 @@ public:
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
- bool isLegalGatherOrScatter(Value *V,
- ElementCount VF = ElementCount::getFixed(1)) {
+ bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
bool LI = isa<LoadInst>(V);
bool SI = isa<StoreInst>(V);
if (!LI && !SI)
@@ -1522,14 +1551,29 @@ public:
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
- bool requiresScalarEpilogue(ElementCount VF) const {
+ bool requiresScalarEpilogue(bool IsVectorizing) const {
if (!isScalarEpilogueAllowed())
return false;
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
return true;
- return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
+ return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
+ }
+
+ /// Returns true if we're required to use a scalar epilogue for at least
+ /// the final iteration of the original loop for all VFs in \p Range.
+ /// A scalar epilogue must either be required for all VFs in \p Range or for
+ /// none.
+ bool requiresScalarEpilogue(VFRange Range) const {
+ auto RequiresScalarEpilogue = [this](ElementCount VF) {
+ return requiresScalarEpilogue(VF.isVector());
+ };
+ bool IsRequired = all_of(Range, RequiresScalarEpilogue);
+ assert(
+ (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
+ "all VFs in range must agree on whether a scalar epilogue is required");
+ return IsRequired;
}
/// Returns true if a scalar epilogue is not allowed due to optsize or a
@@ -1538,14 +1582,21 @@ public:
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}
- /// Returns true if all loop blocks should be masked to fold tail loop.
- bool foldTailByMasking() const { return FoldTailByMasking; }
+ /// Returns the TailFoldingStyle that is best for the current loop.
+ TailFoldingStyle
+ getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
+ if (!CanFoldTailByMasking)
+ return TailFoldingStyle::None;
+
+ if (ForceTailFoldingStyle.getNumOccurrences())
+ return ForceTailFoldingStyle;
+
+ return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
+ }
- /// Returns true if were tail-folding and want to use the active lane mask
- /// for vector loop control flow.
- bool useActiveLaneMaskForControlFlow() const {
- return FoldTailByMasking &&
- TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
+ /// Returns true if all loop blocks should be masked to fold tail loop.
+ bool foldTailByMasking() const {
+ return getTailFoldingStyle() != TailFoldingStyle::None;
}
/// Returns true if the instructions in this block requires predication
@@ -1582,12 +1633,8 @@ public:
/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
- bool &NeedToScalarize) const;
-
- /// Returns true if the per-lane cost of VectorizationFactor A is lower than
- /// that of B.
- bool isMoreProfitable(const VectorizationFactor &A,
- const VectorizationFactor &B) const;
+ Function **Variant,
+ bool *NeedsMask = nullptr) const;
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
@@ -1596,10 +1643,29 @@ public:
Scalars.clear();
}
- /// Convenience function that returns the value of vscale_range iff
- /// vscale_range.min == vscale_range.max or otherwise returns the value
- /// returned by the corresponding TLI method.
- std::optional<unsigned> getVScaleForTuning() const;
+ /// The vectorization cost is a combination of the cost itself and a boolean
+ /// indicating whether any of the contributing operations will actually
+ /// operate on vector values after type legalization in the backend. If this
+ /// latter value is false, then all operations will be scalarized (i.e. no
+ /// vectorization has actually taken place).
+ using VectorizationCostTy = std::pair<InstructionCost, bool>;
+
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width. If \p Invalid is not nullptr, this function
+ /// will add a pair(Instruction*, ElementCount) to \p Invalid for
+ /// each instruction that has an Invalid cost for the given VF.
+ VectorizationCostTy
+ expectedCost(ElementCount VF,
+ SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
+
+ bool hasPredStores() const { return NumPredStores > 0; }
+
+ /// Returns true if epilogue vectorization is considered profitable, and
+ /// false otherwise.
+ /// \p VF is the vectorization factor chosen for the original loop.
+ bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
private:
unsigned NumPredStores = 0;
@@ -1626,24 +1692,6 @@ private:
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
- /// The vectorization cost is a combination of the cost itself and a boolean
- /// indicating whether any of the contributing operations will actually
- /// operate on vector values after type legalization in the backend. If this
- /// latter value is false, then all operations will be scalarized (i.e. no
- /// vectorization has actually taken place).
- using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
- /// Returns the expected execution cost. The unit of the cost does
- /// not matter because we use the 'cost' units to compare different
- /// vector widths. The cost that is returned is *not* normalized by
- /// the factor width. If \p Invalid is not nullptr, this function
- /// will add a pair(Instruction*, ElementCount) to \p Invalid for
- /// each instruction that has an Invalid cost for the given VF.
- using InstructionVFPair = std::pair<Instruction *, ElementCount>;
- VectorizationCostTy
- expectedCost(ElementCount VF,
- SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
-
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
@@ -1715,7 +1763,7 @@ private:
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
/// All blocks of loop are to be masked to fold tail of scalar iterations.
- bool FoldTailByMasking = false;
+ bool CanFoldTailByMasking = false;
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
@@ -1796,8 +1844,7 @@ private:
// the scalars are collected. That should be a safe assumption in most
// cases, because we check if the operands have vectorizable types
// beforehand in LoopVectorizationLegality.
- return Scalars.find(VF) == Scalars.end() ||
- !isScalarAfterVectorization(I, VF);
+ return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
};
/// Returns a range containing only operands needing to be extracted.
@@ -1807,16 +1854,6 @@ private:
Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}
- /// Determines if we have the infrastructure to vectorize loop \p L and its
- /// epilogue, assuming the main loop is vectorized by \p VF.
- bool isCandidateForEpilogueVectorization(const Loop &L,
- const ElementCount VF) const;
-
- /// Returns true if epilogue vectorization is considered profitable, and
- /// false otherwise.
- /// \p VF is the vectorization factor chosen for the original loop.
- bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
-
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -1862,9 +1899,6 @@ public:
/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;
-
- /// Profitable vector factors.
- SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm
@@ -2135,6 +2169,17 @@ public:
};
} // namespace
+static bool useActiveLaneMask(TailFoldingStyle Style) {
+ return Style == TailFoldingStyle::Data ||
+ Style == TailFoldingStyle::DataAndControlFlow ||
+ Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+}
+
+static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
+ return Style == TailFoldingStyle::DataAndControlFlow ||
+ Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+}
+
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -2202,97 +2247,11 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
collectSupportedLoops(*InnerL, LI, ORE, V);
}
-namespace {
-
-/// The LoopVectorize Pass.
-struct LoopVectorize : public FunctionPass {
- /// Pass identification, replacement for typeid
- static char ID;
-
- LoopVectorizePass Impl;
-
- explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
- bool VectorizeOnlyWhenForced = false)
- : FunctionPass(ID),
- Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
- initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
- return Impl
- .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
- .MadeAnyChange;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
-
- // We currently do not preserve loopinfo/dominator analyses with outer loop
- // vectorization. Until this is addressed, mark these analyses as preserved
- // only for non-VPlan-native path.
- // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
- if (!EnableVPlanNativePath) {
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
-Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // We need to place the broadcast of invariant variables outside the loop,
- // but only if it's proven safe to do so. Else, broadcast will be inside
- // vector loop body.
- Instruction *Instr = dyn_cast<Instruction>(V);
- bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
- (!Instr ||
- DT->dominates(Instr->getParent(), LoopVectorPreHeader));
- // Place the code for broadcasting invariant variables in the new preheader.
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (SafeToHoist)
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
-
- // Broadcast the scalar into all locations in the vector.
- Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
-
- return Shuf;
-}
-
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
@@ -2435,21 +2394,6 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step,
}
}
-// Generate code for the induction step. Note that induction steps are
-// required to be loop-invariant
-static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
- Instruction *InsertBefore,
- Loop *OrigLoop = nullptr) {
- const DataLayout &DL = SE.getDataLayout();
- assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
- "Induction step should be loop invariant");
- if (auto *E = dyn_cast<SCEVUnknown>(Step))
- return E->getValue();
-
- SCEVExpander Exp(SE, DL, "induction");
- return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
-}
-
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
@@ -2514,9 +2458,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
return CreateAdd(StartValue, Offset);
}
case InductionDescriptor::IK_PtrInduction: {
- assert(isa<Constant>(Step) &&
- "Expected constant step for pointer induction");
- return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
+ return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step));
}
case InductionDescriptor::IK_FpInduction: {
assert(!isa<VectorType>(Index->getType()) &&
@@ -2538,6 +2480,50 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
llvm_unreachable("invalid enum");
}
+std::optional<unsigned> getMaxVScale(const Function &F,
+ const TargetTransformInfo &TTI) {
+ if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
+ return MaxVScale;
+
+ if (F.hasFnAttribute(Attribute::VScaleRange))
+ return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
+
+ return std::nullopt;
+}
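[Editor's note: a concrete case for the attribute fallback — on a target whose TTI reports no maximum, a function declared as

  define void @f() vscale_range(1,16) { ... }

would make getMaxVScale return 16 via getVScaleRangeMax. Illustrative IR; the attribute may equally appear through an attribute group.]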
+
+/// For the given VF and UF and maximum trip count computed for the loop, return
+/// whether the induction variable might overflow in the vectorized loop. If not,
+/// then we know a runtime overflow check always evaluates to false and can be
+/// removed.
+static bool isIndvarOverflowCheckKnownFalse(
+ const LoopVectorizationCostModel *Cost,
+ ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
+ // Always be conservative if we don't know the exact unroll factor.
+ unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
+
+ Type *IdxTy = Cost->Legal->getWidestInductionType();
+ APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
+
+ // The runtime overflow check is known false iff the (max) trip-count
+ // is known and (max) trip-count + (VF * UF) does not overflow in the type of
+ // the vector loop induction variable.
+ if (unsigned TC =
+ Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
+ uint64_t MaxVF = VF.getKnownMinValue();
+ if (VF.isScalable()) {
+ std::optional<unsigned> MaxVScale =
+ getMaxVScale(*Cost->TheFunction, Cost->TTI);
+ if (!MaxVScale)
+ return false;
+ MaxVF *= *MaxVScale;
+ }
+
+ return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
+ }
+
+ return false;
+}
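[Editor's note: putting numbers to the final comparison — assume an i32 induction type (mask 2^32 - 1), fixed VF = 4 and UF = 2, so MaxVF * MaxUF = 8. With a known maximum trip count of 1000 the slack is (2^32 - 1) - 1000, far greater than 8, and the overflow check can be dropped; with a maximum trip count of 2^32 - 5 the slack is only 4, the unsigned comparison fails, and the runtime check must stay.]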
+
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
const VPIteration &Instance,
VPTransformState &State) {
@@ -2591,14 +2577,13 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
void InnerLoopVectorizer::vectorizeInterleaveGroup(
const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask) {
+ VPValue *BlockInMask, bool NeedsMaskForGaps) {
Instruction *Instr = Group->getInsertPos();
const DataLayout &DL = Instr->getModule()->getDataLayout();
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getLoadStoreType(Instr);
unsigned InterleaveFactor = Group->getFactor();
- assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
// Prepare for the new pointers.
@@ -2609,14 +2594,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
assert((!BlockInMask || !Group->isReverse()) &&
"Reversed masked interleave-group not supported.");
+ Value *Idx;
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
// pointer operand of the interleaved access is supposed to be uniform. For
// uniform instructions, we're only required to generate a value for the
// first vector lane in each unroll iteration.
- if (Group->isReverse())
- Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
+ if (Group->isReverse()) {
+ Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
+ Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+ Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
+ Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
+ Idx = Builder.CreateNeg(Idx);
+ } else
+ Idx = Builder.getInt32(-Index);
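[Editor's note: spelling out the new index arithmetic for a reversed group — with factor 2, member Index 0 and VF = <vscale x 4> running at vscale = 2 (RuntimeVF = 8), Idx becomes -(((8 - 1) * 2) + 0) = -14, so the GEP below steps the uniform base pointer back to the group containing the last vector lane. The previous code could only fold this offset into a compile-time constant for fixed VFs.]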
for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
@@ -2637,8 +2629,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
InBounds = gep->isInBounds();
- AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
- cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
+ AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
// Cast to the vector pointer type.
unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
@@ -2649,14 +2640,43 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
State.setDebugLocFromInst(Instr);
Value *PoisonVec = PoisonValue::get(VecTy);
- Value *MaskForGaps = nullptr;
- if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
- MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
- assert(MaskForGaps && "Mask for Gaps is required but it is null");
- }
+ auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
+ unsigned Part, Value *MaskForGaps) -> Value * {
+ if (VF.isScalable()) {
+ assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
+ assert(InterleaveFactor == 2 &&
+ "Unsupported deinterleave factor for scalable vectors");
+ auto *BlockInMaskPart = State.get(BlockInMask, Part);
+ SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
+ auto *MaskTy =
+ VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
+ return Builder.CreateIntrinsic(
+ MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
+ /*FMFSource=*/nullptr, "interleaved.mask");
+ }
+
+ if (!BlockInMask)
+ return MaskForGaps;
+
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ BlockInMaskPart,
+ createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
+ "interleaved.mask");
+ return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+ MaskForGaps)
+ : ShuffledMask;
+ };
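[Editor's note: to make the lambda concrete for the fixed-width path, with VF = 4 and an interleave factor of 2, createReplicatedMask(2, 4) repeats each block-mask bit once per group member — a sketch, value names invented:

  %interleaved.mask = shufflevector <4 x i1> %block.mask, <4 x i1> poison,
      <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>

The scalable branch builds the same shape by feeding two copies of the block mask to llvm.experimental.vector.interleave2 instead.]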
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
+ Value *MaskForGaps = nullptr;
+ if (NeedsMaskForGaps) {
+ MaskForGaps =
+ createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
+ assert(MaskForGaps && "Mask for Gaps is required but it is null");
+ }
+
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
@@ -2664,18 +2684,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
if (BlockInMask || MaskForGaps) {
assert(useMaskedInterleavedAccesses(*TTI) &&
"masked interleaved groups are not allowed.");
- Value *GroupMask = MaskForGaps;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart,
- createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- GroupMask = MaskForGaps
- ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
- MaskForGaps)
- : ShuffledMask;
- }
+ Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
NewLoad =
Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
GroupMask, PoisonVec, "wide.masked.vec");
@@ -2687,6 +2696,41 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
NewLoads.push_back(NewLoad);
}
+ if (VecTy->isScalableTy()) {
+ assert(InterleaveFactor == 2 &&
+ "Unsupported deinterleave factor for scalable vectors");
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ Value *DI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
+ /*FMFSource=*/nullptr, "strided.vec");
+ unsigned J = 0;
+ for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+
+ if (!Member)
+ continue;
+
+ Value *StridedVec = Builder.CreateExtractValue(DI, I);
+ // If this member has different type, cast the result type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+ StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
+ }
+
+ if (Group->isReverse())
+ StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
+
+ State.set(VPDefs[J], StridedVec, Part);
+ ++J;
+ }
+ }
+
+ return;
+ }
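[Editor's note: the deinterleave intrinsic used above returns both strided sequences at once as a two-field struct, which the loop then picks apart per member; approximately — a sketch, value names invented:

  %strided.vec = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
      @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide.vec)
  %even = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %strided.vec, 0
  %odd  = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %strided.vec, 1]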
+
// For each member in the group, shuffle out the appropriate data from the
// wide loads.
unsigned J = 0;
@@ -2724,7 +2768,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
auto *SubVT = VectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
- MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
+ Value *MaskForGaps =
+ createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
"masked interleaved groups are not allowed.");
assert((!MaskForGaps || !VF.isScalable()) &&
@@ -2759,27 +2804,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
StoredVecs.push_back(StoredVec);
}
- // Concatenate all vectors into a wide vector.
- Value *WideVec = concatenateVectors(Builder, StoredVecs);
-
- // Interleave the elements in the wide vector.
- Value *IVec = Builder.CreateShuffleVector(
- WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
- "interleaved.vec");
-
+ // Interleave all the smaller vectors into one wider vector.
+ Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
Instruction *NewStoreInstr;
if (BlockInMask || MaskForGaps) {
- Value *GroupMask = MaskForGaps;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart,
- createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
- ShuffledMask, MaskForGaps)
- : ShuffledMask;
- }
+ Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
Group->getAlign(), GroupMask);
} else
@@ -2793,7 +2822,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
- bool IfPredicateInstr,
VPTransformState &State) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
@@ -2810,14 +2838,7 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
- // If the scalarized instruction contributes to the address computation of a
- // widen masked load/store which was in a basic block that needed predication
- // and is not predicated after vectorization, we can't propagate
- // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
- // instruction could feed a poison value to the base address of the widen
- // load/store.
- if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
- Cloned->dropPoisonGeneratingFlags();
+ RepRecipe->setFlags(Cloned);
if (Instr->getDebugLoc())
State.setDebugLocFromInst(Instr);
@@ -2843,45 +2864,17 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
AC->registerAssumption(II);
// End if-block.
+ bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
if (IfPredicateInstr)
PredicatedInstructions.push_back(Cloned);
}
-Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
- if (TripCount)
- return TripCount;
-
- assert(InsertBlock);
- IRBuilder<> Builder(InsertBlock->getTerminator());
- // Find the loop boundaries.
- Type *IdxTy = Legal->getWidestInductionType();
- assert(IdxTy && "No type for induction");
- const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
-
- const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
-
- // Expand the trip count and place the new instructions in the preheader.
- // Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
-
- // Count holds the overall loop count (N).
- TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- InsertBlock->getTerminator());
-
- if (TripCount->getType()->isPointerTy())
- TripCount =
- CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
- InsertBlock->getTerminator());
-
- return TripCount;
-}
-
Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
if (VectorTripCount)
return VectorTripCount;
- Value *TC = getOrCreateTripCount(InsertBlock);
+ Value *TC = getTripCount();
IRBuilder<> Builder(InsertBlock->getTerminator());
Type *Ty = TC->getType();
@@ -2917,7 +2910,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
// the step does not evenly divide the trip count, no adjustment is necessary
// since there will already be scalar iterations. Note that the minimum
// iterations check ensures that N >= Step.
- if (Cost->requiresScalarEpilogue(VF)) {
+ if (Cost->requiresScalarEpilogue(VF.isVector())) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
@@ -2930,10 +2923,10 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
const DataLayout &DL) {
// Verify that V is a vector type with same number of elements as DstVTy.
- auto *DstFVTy = cast<FixedVectorType>(DstVTy);
- unsigned VF = DstFVTy->getNumElements();
- auto *SrcVecTy = cast<FixedVectorType>(V->getType());
- assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
+ auto *DstFVTy = cast<VectorType>(DstVTy);
+ auto VF = DstFVTy->getElementCount();
+ auto *SrcVecTy = cast<VectorType>(V->getType());
+ assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
Type *SrcElemTy = SrcVecTy->getElementType();
Type *DstElemTy = DstFVTy->getElementType();
assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
@@ -2953,13 +2946,13 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
"Only one type should be a floating point type");
Type *IntTy =
IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
- auto *VecIntTy = FixedVectorType::get(IntTy, VF);
+ auto *VecIntTy = VectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
- Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
+ Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -2970,8 +2963,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
- auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
// If tail is to be folded, vector loop takes care of all iterations.
Type *CountTy = Count->getType();
@@ -2989,10 +2982,13 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
};
- if (!Cost->foldTailByMasking())
+ TailFoldingStyle Style = Cost->getTailFoldingStyle();
+ if (Style == TailFoldingStyle::None)
CheckMinIters =
Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
- else if (VF.isScalable()) {
+ else if (VF.isScalable() &&
+ !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
+ Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
// vscale is not necessarily a power-of-2, which means we cannot guarantee
// an overflow to zero when updating induction variables and so an
// additional overflow check is required before entering the vector loop.
@@ -3017,7 +3013,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// Update dominator for Bypass & LoopExit (if needed).
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(VF))
+ if (!Cost->requiresScalarEpilogue(VF.isVector()))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -3044,7 +3040,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
- if (!Cost->requiresScalarEpilogue(VF))
+ if (!Cost->requiresScalarEpilogue(VF.isVector()))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -3097,7 +3093,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
- assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
+ assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
"multiple exit loop without required epilogue?");
LoopMiddleBlock =
@@ -3117,17 +3113,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
// branch from the middle block to the loop scalar preheader, and the
// exit block. completeLoopSkeleton will update the condition to use an
// iteration check, if required to decide whether to execute the remainder.
- BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
- BranchInst::Create(LoopScalarPreHeader) :
- BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
- Builder.getTrue());
+ BranchInst *BrInst =
+ Cost->requiresScalarEpilogue(VF.isVector())
+ ? BranchInst::Create(LoopScalarPreHeader)
+ : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
+ Builder.getTrue());
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
// Update dominator for loop exit. During skeleton creation, only the vector
// pre-header and the middle block are created. The vector loop is entirely
 // created during VPlan execution.
- if (!Cost->requiresScalarEpilogue(VF))
+ if (!Cost->requiresScalarEpilogue(VF.isVector()))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -3135,7 +3132,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
}
PHINode *InnerLoopVectorizer::createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &II,
+ PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
@@ -3154,8 +3151,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
- Value *Step =
- CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
EndValue =
emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
EndValue->setName("ind.end");
@@ -3163,8 +3158,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
// Compute the end value for the additional bypass (if applicable).
if (AdditionalBypass.first) {
B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
- Value *Step =
- CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
EndValueFromAdditionalBypass = emitTransformedIndex(
B, AdditionalBypass.second, II.getStartValue(), Step, II);
EndValueFromAdditionalBypass->setName("ind.end");
@@ -3193,7 +3186,22 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
return BCResumeVal;
}
+/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
+/// expansion results.
+static Value *getExpandedStep(const InductionDescriptor &ID,
+ const SCEV2ValueTy &ExpandedSCEVs) {
+ const SCEV *Step = ID.getStep();
+ if (auto *C = dyn_cast<SCEVConstant>(Step))
+ return C->getValue();
+ if (auto *U = dyn_cast<SCEVUnknown>(Step))
+ return U->getValue();
+ auto I = ExpandedSCEVs.find(Step);
+ assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
+ return I->second;
+}
+
void InnerLoopVectorizer::createInductionResumeValues(
+ const SCEV2ValueTy &ExpandedSCEVs,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
@@ -3209,14 +3217,15 @@ void InnerLoopVectorizer::createInductionResumeValues(
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
PHINode *BCResumeVal = createInductionResumeValue(
- OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
+ OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
+ AdditionalBypass);
OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}
}
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// The trip counts should be cached by now.
- Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
+ Value *Count = getTripCount();
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
@@ -3229,7 +3238,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// Thus if tail is to be folded, we know we don't need to run the
// remainder and we can use the previous value for the condition (true).
// 3) Otherwise, construct a runtime check.
- if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
+ if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
+ !Cost->foldTailByMasking()) {
Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
Count, VectorTripCount, "cmp.n",
LoopMiddleBlock->getTerminator());
@@ -3250,14 +3260,16 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
}
std::pair<BasicBlock *, Value *>
-InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+InnerLoopVectorizer::createVectorizedLoopSkeleton(
+ const SCEV2ValueTy &ExpandedSCEVs) {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- loop iteration number check.
- / |
+ [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
+ / | preheader are expanded here. Eventually all required SCEV
+ / | expansion should happen here.
/ v
| [ ] <-- vector loop bypass (may consist of multiple blocks).
| / |
@@ -3304,7 +3316,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
emitMemRuntimeChecks(LoopScalarPreHeader);
// Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues();
+ createInductionResumeValues(ExpandedSCEVs);
return {completeLoopSkeleton(), nullptr};
}
@@ -3317,7 +3329,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
const InductionDescriptor &II,
Value *VectorTripCount, Value *EndValue,
BasicBlock *MiddleBlock,
- BasicBlock *VectorHeader, VPlan &Plan) {
+ BasicBlock *VectorHeader, VPlan &Plan,
+ VPTransformState &State) {
// There are two kinds of external IV usages - those that use the value
// computed in the last iteration (the PHI) and those that use the penultimate
// value (the value that feeds into the phi from the loop latch).
@@ -3345,7 +3358,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
auto *UI = cast<Instruction>(U);
if (!OrigLoop->contains(UI)) {
assert(isa<PHINode>(UI) && "Expected LCSSA form");
-
IRBuilder<> B(MiddleBlock->getTerminator());
// Fast-math-flags propagate from the original induction instruction.
@@ -3355,8 +3367,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
Value *CountMinusOne = B.CreateSub(
VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
CountMinusOne->setName("cmo");
- Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
- VectorHeader->getTerminator());
+
+ VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
+ assert(StepVPV && "step must have been expanded during VPlan execution");
+ Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
+ : State.get(StepVPV, {0, 0});
Value *Escape =
emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
Escape->setName("ind.escape");
@@ -3430,12 +3445,12 @@ static void cse(BasicBlock *BB) {
}
}
-InstructionCost
-LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
- bool &NeedToScalarize) const {
+InstructionCost LoopVectorizationCostModel::getVectorCallCost(
+ CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const {
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
+ bool MaskRequired = Legal->isMaskRequired(CI);
for (auto &ArgOp : CI->args())
ScalarTys.push_back(ArgOp->getType());
@@ -3464,18 +3479,39 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
- NeedToScalarize = true;
- VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
+ InstructionCost MaskCost = 0;
+ VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
+ if (NeedsMask)
+ *NeedsMask = MaskRequired;
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ // If we want an unmasked vector function but can't find one matching the VF,
+ // maybe we can find a vector function that does use a mask and synthesize
+ // an all-true mask.
+ if (!VecFunc && !MaskRequired) {
+ Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
+ VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ // If we found one, add in the cost of creating a mask
+ if (VecFunc) {
+ if (NeedsMask)
+ *NeedsMask = true;
+ MaskCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(
+ IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
+ VF));
+ }
+ }
+ // We don't support masked function calls yet, but we can scalarize a
+ // masked call with branches (unless VF is scalable).
if (!TLI || CI->isNoBuiltin() || !VecFunc)
- return Cost;
+ return VF.isScalable() ? InstructionCost::getInvalid() : Cost;
// If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =
- TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
if (VectorCallCost < Cost) {
- NeedToScalarize = false;
+ *Variant = VecFunc;
Cost = VectorCallCost;
}
return Cost;
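The decision above, modeled with plain integers (a sketch, not the cost model's API): the vector variant wins only if its cost, plus the broadcast for any synthesized all-true mask, undercuts scalarization.

    #include <optional>

    struct CallCosts {
      unsigned Scalarized;         // VF scalar calls plus insert/extract
      std::optional<unsigned> Vec; // cost of the vector variant, if any
      unsigned MaskBroadcast;      // 0 unless an all-true mask was synthesized
    };

    unsigned pickCallCost(const CallCosts &C, bool &UseVariant) {
      UseVariant = false;
      if (!C.Vec)
        return C.Scalarized;
      unsigned VecTotal = *C.Vec + C.MaskBroadcast;
      if (VecTotal < C.Scalarized) {
        UseVariant = true;
        return VecTotal;
      }
      return C.Scalarized;
    }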
@@ -3675,14 +3711,25 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
+ // After vectorization, the exit blocks of the original loop will have
+ // additional predecessors. Invalidate SCEVs for the exit phis in case SE
+ // looked through single-entry phis.
+ SmallVector<BasicBlock *> ExitBlocks;
+ OrigLoop->getExitBlocks(ExitBlocks);
+ for (BasicBlock *Exit : ExitBlocks)
+ for (PHINode &PN : Exit->phis())
+ PSE.getSE()->forgetValue(&PN);
+
VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
- if (Cost->requiresScalarEpilogue(VF)) {
+ if (Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
- Plan.clearLiveOuts();
} else {
+ // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
+ // the cost model.
+
// If we inserted an edge from the middle block to the unique exit block,
// update uses outside the loop (phis) to account for the newly inserted
// edge.
@@ -3692,7 +3739,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
IVEndValues[Entry.first], LoopMiddleBlock,
- VectorLoop->getHeader(), Plan);
+ VectorLoop->getHeader(), Plan, State);
}
// Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
@@ -3799,31 +3846,53 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Value *Incoming = State.get(PreviousDef, UF - 1);
auto *ExtractForScalar = Incoming;
auto *IdxTy = Builder.getInt32Ty();
+ Value *RuntimeVF = nullptr;
if (VF.isVector()) {
auto *One = ConstantInt::get(IdxTy, 1);
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
+ RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
- ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
- "vector.recur.extract");
- }
- // Extract the second last element in the middle block if the
- // Phi is used outside the loop. We need to extract the phi itself
- // and not the last element (the phi update in the current iteration). This
- // will be the value when jumping to the exit block from the LoopMiddleBlock,
- // when the scalar loop is not run at all.
- Value *ExtractForPhiUsedOutsideLoop = nullptr;
- if (VF.isVector()) {
- auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
- auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
- ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
- Incoming, Idx, "vector.recur.extract.for.phi");
- } else if (UF > 1)
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
- // of `Incoming`. This is analogous to the vectorized case above: extracting
- // the second last element when VF > 1.
- ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
+ ExtractForScalar =
+ Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
+ }
+
+ auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
+ assert(PhiR->getNumUsers() == 1 &&
+ RecurSplice->getOpcode() ==
+ VPInstruction::FirstOrderRecurrenceSplice &&
+ "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
+ SmallVector<VPLiveOut *> LiveOuts;
+ for (VPUser *U : RecurSplice->users())
+ if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
+ LiveOuts.push_back(LiveOut);
+
+ if (!LiveOuts.empty()) {
+ // Extract the second last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value when jumping to the exit block from the
+ // LoopMiddleBlock, when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ if (VF.isVector()) {
+ auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ Incoming, Idx, "vector.recur.extract.for.phi");
+ } else {
+ assert(UF > 1 && "VF and UF cannot both be 1");
+ // When loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
+ // value of `Incoming`. This is analogous to the vectorized case above:
+ // extracting the second last element when VF > 1.
+ ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
+ }
+
+ for (VPLiveOut *LiveOut : LiveOuts) {
+ assert(!Cost->requiresScalarEpilogue(VF.isVector()));
+ PHINode *LCSSAPhi = LiveOut->getPhi();
+ LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+ State.Plan->removeLiveOut(LCSSAPhi);
+ }
+ }
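A hedged scalar model of the two extracts above, with a plain array standing in for the last unrolled vector part: the last lane seeds the scalar loop's recurrence phi, while the second-to-last lane is what LCSSA phis observe when the scalar loop is skipped entirely.

    #include <vector>

    struct RecurrenceExits {
      int ForScalarPhi;  // "vector.recur.extract": last lane
      int ForOutsideUse; // "vector.recur.extract.for.phi": second-to-last lane
    };

    // Assumes Incoming.size() == VF >= 2, mirroring the VF.isVector() branch.
    RecurrenceExits extractRecurrence(const std::vector<int> &Incoming) {
      auto VF = Incoming.size();
      return {Incoming[VF - 1], Incoming[VF - 2]};
    }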
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
@@ -3837,22 +3906,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
Phi->setName("scalar.recur");
-
- // Finally, fix users of the recurrence outside the loop. The users will need
- // either the last value of the scalar recurrence or the last value of the
- // vector recurrence we extracted in the middle block. Since the loop is in
- // LCSSA form, we just need to find all the phi nodes for the original scalar
- // recurrence in the exit block, and then add an edge for the middle block.
- // Note that LCSSA does not imply single entry when the original scalar loop
- // had multiple exiting edges (as we always run the last iteration in the
- // scalar epilogue); in that case, there is no edge from middle to exit and
- // and thus no phis which needed updated.
- if (!Cost->requiresScalarEpilogue(VF))
- for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
- LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
- State.Plan->removeLiveOut(&LCSSAPhi);
- }
}
void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
@@ -3872,9 +3925,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// This is the vector-clone of the value that leaves the loop.
Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
- // Wrap flags are in general invalid after vectorization, clear them.
- clearReductionWrapFlags(PhiR, State);
-
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
@@ -4036,7 +4086,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// We know that the loop is in LCSSA form. We need to update the PHI nodes
// in the exit blocks. See comment on analogous loop in
// fixFixedOrderRecurrence for a more complete explanation of the logic.
- if (!Cost->requiresScalarEpilogue(VF))
+ if (!Cost->requiresScalarEpilogue(VF.isVector()))
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
@@ -4054,38 +4104,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
-void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
- VPTransformState &State) {
- const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- RecurKind RK = RdxDesc.getRecurrenceKind();
- if (RK != RecurKind::Add && RK != RecurKind::Mul)
- return;
-
- SmallVector<VPValue *, 8> Worklist;
- SmallPtrSet<VPValue *, 8> Visited;
- Worklist.push_back(PhiR);
- Visited.insert(PhiR);
-
- while (!Worklist.empty()) {
- VPValue *Cur = Worklist.pop_back_val();
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = State.get(Cur, Part);
- if (!isa<OverflowingBinaryOperator>(V))
- break;
- cast<Instruction>(V)->dropPoisonGeneratingFlags();
- }
-
- for (VPUser *U : Cur->users()) {
- auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
- if (!UserRecipe)
- continue;
- for (VPValue *V : UserRecipe->definedValues())
- if (Visited.insert(V).second)
- Worklist.push_back(V);
- }
- }
-}
-
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
@@ -4125,10 +4143,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
// We can't sink an instruction if it is a phi node, is not in the loop,
- // or may have side effects.
+ // may have side effects or may read from memory.
+ // TODO: Could do more granular checking to allow sinking a load past
+ // non-store instructions.
if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
- I->mayHaveSideEffects())
- continue;
+ I->mayHaveSideEffects() || I->mayReadFromMemory())
+ continue;
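A small illustration of the hazard the conservative check above avoids (a sketch; sinking the load below the store would change the value it observes):

    static void use(int) {} // stand-in for an arbitrary user of the load

    void hazard(int *P, bool Pred) {
      int V = *P; // load that a finer-grained check might still sink
      *P = 42;    // intervening store to the same location
      if (Pred)
        use(V);   // sinking the load down here would observe 42 instead
    }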
// If the instruction is already in PredBB, check if we can sink its
// operands. In that case, VPlan's sinkScalarOperands() succeeded in
@@ -4189,7 +4208,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
- assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
+ assert(VF.isVector() && !Scalars.contains(VF) &&
"This function should not be visited twice for the same VF");
// This avoids any chances of creating a REPLICATE recipe during planning
@@ -4382,6 +4401,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
switch(I->getOpcode()) {
default:
return true;
+ case Instruction::Call:
+ return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
case Instruction::Load:
case Instruction::Store: {
auto *Ptr = getLoadStorePointerOperand(I);
@@ -4430,10 +4451,10 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
// both speculation safety (which follows from the same argument as loads),
// but also must prove the value being stored is correct. The easiest
// form of the later is to require that all values stored are the same.
- if (Legal->isUniformMemOp(*I) &&
- (isa<LoadInst>(I) ||
- (isa<StoreInst>(I) &&
- TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
+ if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
+ (isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
!Legal->blockNeedsPredication(I->getParent()))
return false;
return true;
@@ -4445,6 +4466,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
// TODO: We can use the loop-preheader as context point here and get
// context sensitive reasoning
return !isSafeToSpeculativelyExecute(I);
+ case Instruction::Call:
+ return Legal->isMaskRequired(I);
}
}
@@ -4502,7 +4525,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// second vector operand. One example of this are shifts on x86.
Value *Op2 = I->getOperand(1);
auto Op2Info = TTI.getOperandInfo(Op2);
- if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+ Legal->isInvariant(Op2))
Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
@@ -4614,7 +4638,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// already does this check. Collecting Uniforms for VF=1 does not make any
// sense.
- assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
+ assert(VF.isVector() && !Uniforms.contains(VF) &&
"This function should not be visited twice for the same VF");
// Visit the list of Uniforms. If we'll not find any uniform value, we'll
@@ -4663,10 +4687,18 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
addToWorklistIfAllowed(Cmp);
+ auto PrevVF = VF.divideCoefficientBy(2);
// Return true if all lanes perform the same memory operation, and we can
// thus choose to execute only one.
auto isUniformMemOpUse = [&](Instruction *I) {
- if (!Legal->isUniformMemOp(*I))
+ // If the value was already known to not be uniform for the previous
+ // (smaller VF), it cannot be uniform for the larger VF.
+ if (PrevVF.isVector()) {
+ auto Iter = Uniforms.find(PrevVF);
+ if (Iter != Uniforms.end() && !Iter->second.contains(I))
+ return false;
+ }
+ if (!Legal->isUniformMemOp(*I, VF))
return false;
if (isa<LoadInst>(I))
// Loading the same address always produces the same result - at least
@@ -4689,11 +4721,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
WideningDecision == CM_Interleave);
};
-
// Returns true if Ptr is the pointer operand of a memory access instruction
- // I, and I is known to not require scalarization.
+ // I, I is known to not require scalarization, and the pointer is not also
+ // stored.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
+ if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
+ return false;
+ return getLoadStorePointerOperand(I) == Ptr &&
+ (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
};
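An example of the "pointer is also stored" exclusion in the lambda above: here the pointer's lanes feed the stored value itself, not just the address, so every lane must be materialized and the use cannot be treated as address-only uniform.

    void storesOwnAddress(void **P) {
      *P = P; // P is both the address and operand 0 (the stored value)
    }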
// Holds a list of values which are known to have at least one uniform use.
@@ -4739,10 +4774,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (isUniformMemOpUse(&I))
addToWorklistIfAllowed(&I);
- if (isUniformDecision(&I, VF)) {
- assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
+ if (isVectorizedMemAccessUse(&I, Ptr))
HasUniformUse.insert(Ptr);
- }
}
// Add to the worklist any operands which have *only* uniform (e.g. lane 0
@@ -4906,12 +4939,11 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
// Limit MaxScalableVF by the maximum safe dependence distance.
- std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
- if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
- MaxVScale =
- TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
- MaxScalableVF =
- ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
+ if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
+ MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
+ else
+ MaxScalableVF = ElementCount::getScalable(0);
+
if (!MaxScalableVF)
reportVectorizationInfo(
"Max legal vector width too small, scalable vectorization "
@@ -4932,7 +4964,7 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
unsigned MaxSafeElements =
- PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+ llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
@@ -5105,16 +5137,26 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
+
// Avoid tail folding if the trip count is known to be a multiple of any VF
- // we chose.
- // FIXME: The condition below pessimises the case for fixed-width vectors,
- // when scalable VFs are also candidates for vectorization.
- if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
- ElementCount MaxFixedVF = MaxFactors.FixedVF;
- assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
+ // we choose.
+ std::optional<unsigned> MaxPowerOf2RuntimeVF =
+ MaxFactors.FixedVF.getFixedValue();
+ if (MaxFactors.ScalableVF) {
+ std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
+ if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
+ MaxPowerOf2RuntimeVF = std::max<unsigned>(
+ *MaxPowerOf2RuntimeVF,
+ *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
+ } else
+ MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
+ }
+
+ if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
+ assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
- unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
- : MaxFixedVF.getFixedValue();
+ unsigned MaxVFtimesIC =
+ UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
const SCEV *ExitCount = SE->getAddExpr(
@@ -5134,7 +5176,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
if (Legal->prepareToFoldTailByMasking()) {
- FoldTailByMasking = true;
+ CanFoldTailByMasking = true;
return MaxFactors;
}
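Constant-folded form of the SCEV reasoning above (a sketch; the in-tree check proves the divisibility symbolically via isKnownPredicate):

    #include <cstdint>

    // Tail folding is unnecessary when the trip count is provably a
    // multiple of the largest power-of-two runtime VF times the
    // user-requested interleave count.
    bool canSkipTailFolding(uint64_t TripCount, uint64_t MaxPowerOf2RuntimeVF,
                            uint64_t UserIC) {
      uint64_t MaxVFtimesIC =
          UserIC ? MaxPowerOf2RuntimeVF * UserIC : MaxPowerOf2RuntimeVF;
      return TripCount % MaxVFtimesIC == 0;
    }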
@@ -5187,7 +5229,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
auto MaxVectorElementCount = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
+ llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
ComputeScalableMaxVF);
MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
@@ -5207,6 +5249,13 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
auto Min = Attr.getVScaleRangeMin();
WidestRegisterMinEC *= Min;
}
+
+ // When a scalar epilogue is required, at least one iteration of the scalar
+ // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a
+ // max VF that results in a dead vector loop.
+ if (ConstTripCount > 0 && requiresScalarEpilogue(true))
+ ConstTripCount -= 1;
+
if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
(!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
// If loop trip count (TC) is known at compile time there is no point in
@@ -5214,7 +5263,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
// power of two which doesn't exceed TC.
// If MaxVectorElementCount is scalable, we only fall back on a fixed VF
// when the TC is less than or equal to the known number of lanes.
- auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
+ auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount);
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: "
<< ClampedConstTripCount << "\n");
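Worked example of the clamp, as a sketch with C++20's std::bit_floor standing in for llvm::bit_floor: a trip count of 17 with a required scalar epilogue leaves 16 vectorizable iterations, so the max VF is clamped to 16 rather than letting a wider VF produce a dead vector loop.

    #include <bit>
    #include <cstdint>

    uint64_t clampVFToTripCount(uint64_t ConstTripCount,
                                bool NeedsScalarEpilogue) {
      if (ConstTripCount > 0 && NeedsScalarEpilogue)
        ConstTripCount -= 1; // reserve one iteration for the scalar loop
      return std::bit_floor(ConstTripCount); // e.g. 17 -> 16 -> VF <= 16
    }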
@@ -5228,7 +5277,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
TTI.shouldMaximizeVectorBandwidth(RegKind))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
+ llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
@@ -5273,9 +5322,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return MaxVF;
}
-std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
- if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
- auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+/// Convenience function that returns the value of vscale_range if
+/// vscale_range.min == vscale_range.max, and otherwise returns the value
+/// returned by the corresponding TTI method.
+static std::optional<unsigned>
+getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
+ const Function *Fn = L->getHeader()->getParent();
+ if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
+ auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
auto Min = Attr.getVScaleRangeMin();
auto Max = Attr.getVScaleRangeMax();
if (Max && Min == Max)
@@ -5285,31 +5339,39 @@ std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
return TTI.getVScaleForTuning();
}
-bool LoopVectorizationCostModel::isMoreProfitable(
+bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;
- unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
-
- if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
- MaxTripCount) {
- // If we are folding the tail and the trip count is a known (possibly small)
- // constant, the trip count will be rounded up to an integer number of
- // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
- // which we compare directly. When not folding the tail, the total cost will
- // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
- // approximated with the per-lane cost below instead of using the tripcount
- // as here.
- auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
- auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
+ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
+
+ if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
+ // If the trip count is a known (possibly small) constant, the trip count
+ // will be rounded up to an integer number of iterations under
+ // FoldTailByMasking. The total cost in that case will be
+ // VecCost*ceil(TripCount/VF). When not folding the tail, the total
+ // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
+ // some extra overheads, but for the purpose of comparing the costs of
+ // different VFs we can use this to compare the total loop-body cost
+ // expected after vectorization.
+ auto GetCostForTC = [MaxTripCount, this](unsigned VF,
+ InstructionCost VectorCost,
+ InstructionCost ScalarCost) {
+ return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
+ : VectorCost * (MaxTripCount / VF) +
+ ScalarCost * (MaxTripCount % VF);
+ };
+ auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
+ auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
+
return RTCostA < RTCostB;
}
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
- if (std::optional<unsigned> VScale = getVScaleForTuning()) {
+ if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
if (A.Width.isScalable())
EstimatedWidthA *= *VScale;
if (B.Width.isScalable())
@@ -5328,9 +5390,74 @@ bool LoopVectorizationCostModel::isMoreProfitable(
return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
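The GetCostForTC comparison above, restated with plain integers (a sketch under the fixed-width, known-trip-count assumptions):

    #include <cstdint>

    uint64_t totalLoopCost(uint64_t TC, uint64_t VF, uint64_t VectorCost,
                           uint64_t ScalarCost, bool FoldTail) {
      if (FoldTail)
        return VectorCost * ((TC + VF - 1) / VF); // ceil(TC/VF) vector iters
      return VectorCost * (TC / VF) + ScalarCost * (TC % VF); // + remainder
    }
    // e.g. TC=10 without folding: VF=8 costs Vec8*1 + Scalar*2 while VF=4
    // costs Vec4*2 + Scalar*2; the comparison uses these totals rather than
    // the per-lane cost used for scalable or unknown trip counts.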
-VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
+static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
+ OptimizationRemarkEmitter *ORE,
+ Loop *TheLoop) {
+ if (InvalidCosts.empty())
+ return;
+
+ // Emit a report of VFs with invalid costs in the loop.
+
+ // Group the remarks per instruction, keeping the instruction order from
+ // InvalidCosts.
+ std::map<Instruction *, unsigned> Numbering;
+ unsigned I = 0;
+ for (auto &Pair : InvalidCosts)
+ if (!Numbering.count(Pair.first))
+ Numbering[Pair.first] = I++;
+
+ // Sort the list, first on instruction(number) then on VF.
+ sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
+ if (Numbering[A.first] != Numbering[B.first])
+ return Numbering[A.first] < Numbering[B.first];
+ ElementCountComparator ECC;
+ return ECC(A.second, B.second);
+ });
+
+ // For a list of ordered instruction-vf pairs:
+ // [(load, vf1), (load, vf2), (store, vf1)]
+ // Group the instructions together to emit separate remarks for:
+ // load (vf1, vf2)
+ // store (vf1)
+ auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
+ auto Subset = ArrayRef<InstructionVFPair>();
+ do {
+ if (Subset.empty())
+ Subset = Tail.take_front(1);
+
+ Instruction *I = Subset.front().first;
+
+ // If the next instruction is different, or if there are no other pairs,
+ // emit a remark for the collated subset. e.g.
+ // [(load, vf1), (load, vf2))]
+ // to emit:
+ // remark: invalid costs for 'load' at VF=(vf1, vf2)
+ if (Subset == Tail || Tail[Subset.size()].first != I) {
+ std::string OutString;
+ raw_string_ostream OS(OutString);
+ assert(!Subset.empty() && "Unexpected empty range");
+ OS << "Instruction with invalid costs prevented vectorization at VF=(";
+ for (const auto &Pair : Subset)
+ OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
+ OS << "):";
+ if (auto *CI = dyn_cast<CallInst>(I))
+ OS << " call to " << CI->getCalledFunction()->getName();
+ else
+ OS << " " << I->getOpcodeName();
+ OS.flush();
+ reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
+ Tail = Tail.drop_front(Subset.size());
+ Subset = {};
+ } else
+ // Grow the subset by one element
+ Subset = Tail.take_front(Subset.size() + 1);
+ } while (!Tail.empty());
+}
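A standalone demo of the grouping walk above, with strings standing in for instructions (a sketch, not the pass's data structures):

    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
      using Pair = std::pair<std::string, int>; // (instruction, VF) stand-in
      std::vector<Pair> Sorted = {{"load", 2}, {"load", 4}, {"store", 2}};
      for (size_t I = 0; I < Sorted.size();) {
        size_t J = I + 1; // grow the run while the instruction matches
        while (J < Sorted.size() && Sorted[J].first == Sorted[I].first)
          ++J;
        std::printf("invalid costs for '%s' at VF=(", Sorted[I].first.c_str());
        for (size_t K = I; K < J; ++K)
          std::printf("%s%d", K == I ? "" : ", ", Sorted[K].second);
        std::printf(")\n");
        I = J; // one remark per collated run
      }
    }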
+
+VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
const ElementCountSet &VFCandidates) {
- InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
+ InstructionCost ExpectedCost =
+ CM.expectedCost(ElementCount::getFixed(1)).first;
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
assert(VFCandidates.count(ElementCount::getFixed(1)) &&
@@ -5340,7 +5467,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
ExpectedCost);
VectorizationFactor ChosenFactor = ScalarCost;
- bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
+ bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && VFCandidates.size() > 1) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
@@ -5354,12 +5481,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
if (i.isScalar())
continue;
- VectorizationCostTy C = expectedCost(i, &InvalidCosts);
+ LoopVectorizationCostModel::VectorizationCostTy C =
+ CM.expectedCost(i, &InvalidCosts);
VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
#ifndef NDEBUG
unsigned AssumedMinimumVscale = 1;
- if (std::optional<unsigned> VScale = getVScaleForTuning())
+ if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
AssumedMinimumVscale = *VScale;
unsigned Width =
Candidate.Width.isScalable()
@@ -5388,70 +5516,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
ChosenFactor = Candidate;
}
- // Emit a report of VFs with invalid costs in the loop.
- if (!InvalidCosts.empty()) {
- // Group the remarks per instruction, keeping the instruction order from
- // InvalidCosts.
- std::map<Instruction *, unsigned> Numbering;
- unsigned I = 0;
- for (auto &Pair : InvalidCosts)
- if (!Numbering.count(Pair.first))
- Numbering[Pair.first] = I++;
-
- // Sort the list, first on instruction(number) then on VF.
- llvm::sort(InvalidCosts,
- [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
- if (Numbering[A.first] != Numbering[B.first])
- return Numbering[A.first] < Numbering[B.first];
- ElementCountComparator ECC;
- return ECC(A.second, B.second);
- });
-
- // For a list of ordered instruction-vf pairs:
- // [(load, vf1), (load, vf2), (store, vf1)]
- // Group the instructions together to emit separate remarks for:
- // load (vf1, vf2)
- // store (vf1)
- auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
- auto Subset = ArrayRef<InstructionVFPair>();
- do {
- if (Subset.empty())
- Subset = Tail.take_front(1);
-
- Instruction *I = Subset.front().first;
-
- // If the next instruction is different, or if there are no other pairs,
- // emit a remark for the collated subset. e.g.
- // [(load, vf1), (load, vf2))]
- // to emit:
- // remark: invalid costs for 'load' at VF=(vf, vf2)
- if (Subset == Tail || Tail[Subset.size()].first != I) {
- std::string OutString;
- raw_string_ostream OS(OutString);
- assert(!Subset.empty() && "Unexpected empty range");
- OS << "Instruction with invalid costs prevented vectorization at VF=(";
- for (const auto &Pair : Subset)
- OS << (Pair.second == Subset.front().second ? "" : ", ")
- << Pair.second;
- OS << "):";
- if (auto *CI = dyn_cast<CallInst>(I))
- OS << " call to " << CI->getCalledFunction()->getName();
- else
- OS << " " << I->getOpcodeName();
- OS.flush();
- reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
- Tail = Tail.drop_front(Subset.size());
- Subset = {};
- } else
- // Grow the subset by one element
- Subset = Tail.take_front(Subset.size() + 1);
- } while (!Tail.empty());
- }
+ emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
- if (!EnableCondStoresVectorization && NumPredStores) {
- reportVectorizationFailure("There are conditional stores.",
+ if (!EnableCondStoresVectorization && CM.hasPredStores()) {
+ reportVectorizationFailure(
+ "There are conditional stores.",
"store that is conditionally executed prevents vectorization",
- "ConditionalStore", ORE, TheLoop);
+ "ConditionalStore", ORE, OrigLoop);
ChosenFactor = ScalarCost;
}
@@ -5463,11 +5534,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
return ChosenFactor;
}
-bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
- const Loop &L, ElementCount VF) const {
+bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
+ ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
- if (any_of(L.getHeader()->phis(),
+ if (any_of(OrigLoop->getHeader()->phis(),
[&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
return false;
@@ -5475,20 +5546,21 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
// currently unsupported.
for (const auto &Entry : Legal->getInductionVars()) {
// Look for uses of the value of the induction at the last iteration.
- Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
+ Value *PostInc =
+ Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
for (User *U : PostInc->users())
- if (!L.contains(cast<Instruction>(U)))
+ if (!OrigLoop->contains(cast<Instruction>(U)))
return false;
// Look for uses of penultimate value of the induction.
for (User *U : Entry.first->users())
- if (!L.contains(cast<Instruction>(U)))
+ if (!OrigLoop->contains(cast<Instruction>(U)))
return false;
}
// Epilogue vectorization code has not been audited to ensure it handles
// non-latch exits properly. It may be fine, but it needs to be audited and
// tested.
- if (L.getExitingBlock() != L.getLoopLatch())
+ if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
return false;
return true;
@@ -5507,62 +5579,59 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
// We also consider epilogue vectorization unprofitable for targets that don't
// consider interleaving beneficial (eg. MVE).
- if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
+ if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;
- // FIXME: We should consider changing the threshold for scalable
- // vectors to take VScaleForTuning into account.
- if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
+
+ unsigned Multiplier = 1;
+ if (VF.isScalable())
+ Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
+ if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
return true;
return false;
}
-VectorizationFactor
-LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
- const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
+VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
+ const ElementCount MainLoopVF, unsigned IC) {
VectorizationFactor Result = VectorizationFactor::Disabled();
if (!EnableEpilogueVectorization) {
- LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
return Result;
}
- if (!isScalarEpilogueAllowed()) {
- LLVM_DEBUG(
- dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
- "allowed.\n";);
+ if (!CM.isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
+ "epilogue is allowed.\n");
return Result;
}
// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
- if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
- LLVM_DEBUG(
- dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
- "not a supported candidate.\n";);
+ if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
+ LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
+ "is not a supported candidate.\n");
return Result;
}
if (EpilogueVectorizationForceVF > 1) {
- LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
- if (LVP.hasPlanWithVF(ForcedEC))
+ if (hasPlanWithVF(ForcedEC))
return {ForcedEC, 0, 0};
else {
- LLVM_DEBUG(
- dbgs()
- << "LEV: Epilogue vectorization forced factor is not viable.\n";);
+ LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
+ "viable.\n");
return Result;
}
}
- if (TheLoop->getHeader()->getParent()->hasOptSize() ||
- TheLoop->getHeader()->getParent()->hasMinSize()) {
+ if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
+ OrigLoop->getHeader()->getParent()->hasMinSize()) {
LLVM_DEBUG(
- dbgs()
- << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
+ dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
return Result;
}
- if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
+ if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
"this loop\n");
return Result;
@@ -5574,21 +5643,48 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
ElementCount EstimatedRuntimeVF = MainLoopVF;
if (MainLoopVF.isScalable()) {
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
- if (std::optional<unsigned> VScale = getVScaleForTuning())
+ if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
EstimatedRuntimeVF *= *VScale;
}
- for (auto &NextVF : ProfitableVFs)
- if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
- ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
- ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
- (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
- LVP.hasPlanWithVF(NextVF.Width))
+ ScalarEvolution &SE = *PSE.getSE();
+ Type *TCType = Legal->getWidestInductionType();
+ const SCEV *RemainingIterations = nullptr;
+ for (auto &NextVF : ProfitableVFs) {
+ // Skip candidate VFs without a corresponding VPlan.
+ if (!hasPlanWithVF(NextVF.Width))
+ continue;
+
+ // Skip candidate VFs with widths >= the estimated runtime VF (scalable
+ // vectors) or the VF of the main loop (fixed vectors).
+ if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
+ ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
+ ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
+ continue;
+
+ // If NextVF is greater than the number of remaining iterations, the
+ // epilogue loop would be dead. Skip such factors.
+ if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
+ // TODO: extend to support scalable VFs.
+ if (!RemainingIterations) {
+ const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
+ RemainingIterations = SE.getURemExpr(
+ TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
+ }
+ if (SE.isKnownPredicate(
+ CmpInst::ICMP_UGT,
+ SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+ RemainingIterations))
+ continue;
+ }
+
+ if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
Result = NextVF;
+ }
if (Result != VectorizationFactor::Disabled())
LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
- << Result.Width << "\n";);
+ << Result.Width << "\n");
return Result;
}
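Constant-folded version of the dead-epilogue filter above (the in-tree check uses SCEV's urem expression and isKnownPredicate; this sketch assumes everything is a compile-time constant):

    #include <cstdint>

    bool epilogueVFIsDead(uint64_t TC, uint64_t MainVF, uint64_t IC,
                          uint64_t EpilogueVF) {
      uint64_t Remaining = TC % (MainVF * IC); // "RemainingIterations"
      // If the candidate epilogue VF exceeds what is left after the main
      // vector loop, the epilogue body would never run; skip the factor.
      return EpilogueVF > Remaining;
    }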
@@ -5688,7 +5784,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
return 1;
// We used the distance for the interleave count.
- if (Legal->getMaxSafeDepDistBytes() != -1U)
+ if (!Legal->isSafeForAnyVectorWidth())
return 1;
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
@@ -5750,20 +5846,19 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
- unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+ unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
+ MaxLocalUsers);
// Don't count the induction variable as interleaved.
if (EnableIndVarRegisterHeur) {
- TmpIC =
- PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
- std::max(1U, (MaxLocalUsers - 1)));
+ TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+ std::max(1U, (MaxLocalUsers - 1)));
}
IC = std::min(IC, TmpIC);
}
// Clamp the interleave ranges to reasonable counts.
- unsigned MaxInterleaveCount =
- TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
+ unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
// Check if the user has overridden the max.
if (VF.isScalar()) {
@@ -5834,8 +5929,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and interleave until the cost of the
// loop overhead is about 5% of the cost of the loop.
- unsigned SmallIC = std::min(
- IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
+ unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
+ SmallLoopCost / *LoopCost.getValue()));
// Interleave until store/load ports (estimated by max interleave count) are
// saturated.
@@ -5953,7 +6048,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
// Saves the list of values that are used in the loop but are defined outside
// the loop (not including non-instruction values such as arguments and
// constants).
- SmallPtrSet<Value *, 8> LoopInvariants;
+ SmallSetVector<Instruction *, 8> LoopInvariants;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -6079,11 +6174,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
for (auto *Inst : LoopInvariants) {
// FIXME: The target might use more than one register for the type
// even in the scalar case.
- unsigned Usage =
- VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+ bool IsScalar = all_of(Inst->users(), [&](User *U) {
+ auto *I = cast<Instruction>(U);
+ return TheLoop != LI->getLoopFor(I->getParent()) ||
+ isScalarAfterVectorization(I, VFs[i]);
+ });
+
+ ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
unsigned ClassID =
- TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
- Invariant[ClassID] += Usage;
+ TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
+ Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
}
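A sketch of the register-class refinement above: a loop invariant is charged a vector register only if at least one of its in-loop users remains vector after vectorization; otherwise a single scalar register suffices.

    #include <vector>

    struct UserInfo { bool InLoop; bool ScalarAfterVectorization; };

    bool invariantNeedsVectorReg(const std::vector<UserInfo> &Users) {
      for (const UserInfo &U : Users)
        if (U.InLoop && !U.ScalarAfterVectorization)
          return true; // one vector user -> count a vector register
      return false;    // all users scalar or outside the loop -> scalar reg
    }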
LLVM_DEBUG({
@@ -6134,8 +6234,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
// expected cost for interleaving.
- if (VF.isScalar() || VF.isZero() ||
- InstsToScalarize.find(VF) != InstsToScalarize.end())
+ if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
return;
// Initialize a mapping for VF in InstsToScalalarize. If we find that it's
@@ -6224,7 +6323,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
Instruction *I = Worklist.pop_back_val();
// If we've already analyzed the instruction, there's nothing to do.
- if (ScalarCosts.find(I) != ScalarCosts.end())
+ if (ScalarCosts.contains(I))
continue;
// Compute the cost of the vector instruction. Note that this cost already
@@ -6362,11 +6461,6 @@ static const SCEV *getAddressAccessSCEV(
return PSE.getSCEV(Ptr);
}
-static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
- return Legal->hasStride(I->getOperand(0)) ||
- Legal->hasStride(I->getOperand(1));
-}
-
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
ElementCount VF) {
@@ -6460,7 +6554,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
- assert(Legal->isUniformMemOp(*I));
+ assert(Legal->isUniformMemOp(*I, VF));
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
@@ -6475,7 +6569,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
}
StoreInst *SI = cast<StoreInst>(I);
- bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
+ bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
CostKind) +
@@ -6502,11 +6596,6 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) {
- // TODO: Once we have support for interleaving with scalable vectors
- // we can calculate the cost properly here.
- if (VF.isScalable())
- return InstructionCost::getInvalid();
-
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
@@ -6836,7 +6925,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
NumPredStores++;
- if (Legal->isUniformMemOp(I)) {
+ if (Legal->isUniformMemOp(I, VF)) {
auto isLegalToScalarize = [&]() {
if (!VF.isScalable())
// Scalarization of fixed length vectors "just works".
@@ -7134,8 +7223,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- // Since we will replace the stride by 1 the multiplication should go away.
- if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
+ // If we're speculating on the stride being 1, the multiplication may
+ // fold away. We can generalize this for all operations using the notion
+ // of neutral elements. (TODO)
+ if (I->getOpcode() == Instruction::Mul &&
+ (PSE.getSCEV(I->getOperand(0))->isOne() ||
+ PSE.getSCEV(I->getOperand(1))->isOne()))
return 0;
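Illustration of the speculation above (a sketch): under the versioned loop's Stride == 1 guard, PSE rewrites the stride's SCEV to the constant 1, so the multiply below folds away and is costed at zero.

    long indexWithSpeculatedStride(long i, long Stride) {
      // When the runtime guard (Stride == 1) holds, this is just `i`.
      return i * Stride;
    }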
// Detect reduction patterns
@@ -7146,7 +7239,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// second vector operand. One example of this are shifts on x86.
Value *Op2 = I->getOperand(1);
auto Op2Info = TTI.getOperandInfo(Op2);
- if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+ Legal->isInvariant(Op2))
Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
SmallVector<const Value *, 4> Operands(I->operand_values());
@@ -7304,7 +7398,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
VectorTy =
largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
} else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
- SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+ // Leave SrcVecTy unchanged - we only shrink the destination element
+ // type.
VectorTy =
smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
}
@@ -7316,9 +7411,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
return *RedCost;
- bool NeedToScalarize;
+ Function *Variant;
CallInst *CI = cast<CallInst>(I);
- InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant);
if (getVectorIntrinsicIDForCall(CI, TLI)) {
InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
return std::min(CallCost, IntrinsicCost);
@@ -7339,37 +7434,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
} // end of switch.
}
-char LoopVectorize::ID = 0;
-
-static const char lv_name[] = "Loop Vectorization";
-
-INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
-
-namespace llvm {
-
-Pass *createLoopVectorizePass() { return new LoopVectorize(); }
-
-Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
- bool VectorizeOnlyWhenForced) {
- return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
-}
-
-} // end namespace llvm
-
void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
@@ -7462,7 +7526,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
// reasonable one.
if (UserVF.isZero()) {
VF = ElementCount::getFixed(determineVPlanVF(
- TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
.getFixedValue(),
CM));
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7497,13 +7561,16 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
+ CM.collectValuesToIgnore();
+ CM.collectElementTypesForWidening();
+
FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return std::nullopt;
// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
- !useMaskedInterleavedAccesses(*TTI)) {
+ !useMaskedInterleavedAccesses(TTI)) {
LLVM_DEBUG(
dbgs()
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
@@ -7527,6 +7594,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);
+ if (!hasPlanWithVF(UserVF)) {
+ LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
+ << ".\n");
+ return std::nullopt;
+ }
+
LLVM_DEBUG(printPlans(dbgs()));
return {{UserVF, 0, 0}};
} else
@@ -7562,8 +7635,13 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
+ VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
  assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be non-zero.");
+ if (!hasPlanWithVF(VF.Width)) {
+ LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
+ << ".\n");
+ return std::nullopt;
+ }
return VF;
}
@@ -7614,43 +7692,51 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
-void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
- VPlan &BestVPlan,
- InnerLoopVectorizer &ILV,
- DominatorTree *DT,
- bool IsEpilogueVectorization) {
+SCEV2ValueTy LoopVectorizationPlanner::executePlan(
+ ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
+ InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
+ DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
assert(BestVPlan.hasVF(BestVF) &&
"Trying to execute plan with unsupported VF");
assert(BestVPlan.hasUF(BestUF) &&
"Trying to execute plan with unsupported UF");
+ assert(
+ (IsEpilogueVectorization || !ExpandedSCEVs) &&
+ "expanded SCEVs to reuse can only be used during epilogue vectorization");
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                    << ", UF=" << BestUF << '\n');
- // Workaround! Compute the trip count of the original loop and cache it
- // before we start modifying the CFG. This code has a systemic problem
- // wherein it tries to run analysis over partially constructed IR; this is
- // wrong, and not simply for SCEV. The trip count of the original loop
- // simply happens to be prone to hitting this in practice. In theory, we
- // can hit the same issue for any SCEV, or ValueTracking query done during
- // mutation. See PR49900.
- ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
-
if (!IsEpilogueVectorization)
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
// Perform the actual loop transformation.
+ VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
+
+ // 0. Generate SCEV-dependent code into the preheader, including TripCount,
+ // before making any changes to the CFG.
+ if (!BestVPlan.getPreheader()->empty()) {
+ State.CFG.PrevBB = OrigLoop->getLoopPreheader();
+ State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
+ BestVPlan.getPreheader()->execute(&State);
+ }
+ if (!ILV.getTripCount())
+ ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
+ else
+ assert(IsEpilogueVectorization && "should only re-use the existing trip "
+ "count during epilogue vectorization");
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
Value *CanonicalIVStartValue;
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
- ILV.createVectorizedLoopSkeleton();
+ ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
+ : State.ExpandedSCEVs);
// Only use noalias metadata when using memory checks guaranteeing no overlap
// across all iterations.
const LoopAccessInfo *LAI = ILV.Legal->getLAI();
+ std::unique_ptr<LoopVersioning> LVer = nullptr;
if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
!LAI->getRuntimePointerChecking()->getDiffChecks()) {
@@ -7658,9 +7744,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// still use it to add the noalias metadata.
// TODO: Find a better way to re-use LoopVersioning functionality to add
// metadata.
- State.LVer = std::make_unique<LoopVersioning>(
+ LVer = std::make_unique<LoopVersioning>(
*LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
PSE.getSE());
+ State.LVer = &*LVer;
State.LVer->prepareNoAliasMetadata();
}
@@ -7677,10 +7764,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
- ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State,
- IsEpilogueVectorization);
+ BestVPlan.prepareToExecute(
+ ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr),
+ CanonicalIVStartValue, State, IsEpilogueVectorization);
BestVPlan.execute(&State);
@@ -7706,13 +7792,18 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
}
- AddRuntimeUnrollDisableMetaData(L);
+ TargetTransformInfo::UnrollingPreferences UP;
+ TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
+ if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
+ AddRuntimeUnrollDisableMetaData(L);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop(State, BestVPlan);
ILV.printDebugTracesAtEnd();
+
+ return State.ExpandedSCEVs;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -7725,8 +7816,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
}
#endif
-Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-
//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//
@@ -7734,7 +7823,8 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
-EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
+EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
+ const SCEV2ValueTy &ExpandedSCEVs) {
createVectorLoopSkeleton("");
// Generate the code to check the minimum iteration count of the vector
@@ -7795,7 +7885,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
assert(Bypass && "Expected valid bypass basic block.");
ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
- Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
+ Value *Count = getTripCount();
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -7803,8 +7893,10 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
// Generate code to check if the loop's trip count is less than VF * UF of the
// main vector loop.
- auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
- ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
+ : VF.isVector())
+ ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
@@ -7824,7 +7916,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
// Update dominator for Bypass & LoopExit.
DT->changeImmediateDominator(Bypass, TCCheckBlock);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
+ if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
// For loops with multiple exits, there's no edge from the middle block
// to exit blocks (as the epilogue must run) and thus no need to update
// the immediate dominator of the exit blocks.
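
[Annotation] The predicate choice in this hunk (ULE when a scalar epilogue is required, ULT otherwise) can be read as the following scalar model; a sketch under that reading, not the generated IR (bypassVectorLoop is a hypothetical name):

#include <cstdint>

// Should the vector loop be bypassed for this trip count? When a scalar
// epilogue must run, at least one iteration has to remain after the
// vector loop, so TC == VF * UF must also bypass it (hence ULE vs. ULT).
bool bypassVectorLoop(uint64_t TC, uint64_t VF, uint64_t UF,
                      bool RequiresScalarEpilogue) {
  uint64_t Step = VF * UF;
  return RequiresScalarEpilogue ? TC <= Step : TC < Step;
}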
@@ -7852,7 +7944,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
std::pair<BasicBlock *, Value *>
-EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
+EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
+ const SCEV2ValueTy &ExpandedSCEVs) {
createVectorLoopSkeleton("vec.epilog.");
// Now, compare the remaining count and if there aren't enough iterations to
@@ -7891,7 +7984,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
DT->changeImmediateDominator(LoopScalarPreHeader,
EPI.EpilogueIterationCountCheck);
- if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
+ if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
// If there is an epilogue which must run, there's no edge from the
// middle block to exit blocks and thus no need to update the immediate
// dominator of the exit blocks.
@@ -7950,7 +8043,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
// check, then the resume value for the induction variable comes from
// the trip count of the main vector loop, hence passing the AdditionalBypass
// argument.
- createInductionResumeValues({VecEpilogueIterationCountCheck,
+ createInductionResumeValues(ExpandedSCEVs,
+ {VecEpilogueIterationCountCheck,
EPI.VectorTripCount} /* AdditionalBypass */);
return {completeLoopSkeleton(), EPResumeVal};
@@ -7972,8 +8066,9 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
// Generate code to check if the loop's trip count is less than VF * UF of the
// vector epilogue loop.
- auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
- ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
+ ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
Value *CheckMinIters =
Builder.CreateICmp(P, Count,
@@ -8008,8 +8103,7 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
assert(!Range.isEmpty() && "Trying to test an empty VF range.");
bool PredicateAtRangeStart = Predicate(Range.Start);
- for (ElementCount TmpVF = Range.Start * 2;
- ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
+ for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
if (Predicate(TmpVF) != PredicateAtRangeStart) {
Range.End = TmpVF;
break;
@@ -8025,16 +8119,16 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange(
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
ElementCount MaxVF) {
- auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
- for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
- VFRange SubRange = {VF, MaxVFPlusOne};
+ auto MaxVFTimes2 = MaxVF * 2;
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
+ VFRange SubRange = {VF, MaxVFTimes2};
VPlans.push_back(buildVPlan(SubRange));
VF = SubRange.End;
}
}
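
[Annotation] Both loops above now walk the VF range by doubling. A compact scalar sketch of what getDecisionAndClampRange does, with plain integers standing in for ElementCount (function and parameter names are illustrative):

#include <cstdint>
#include <functional>

// Clamp [Start, End) at the first power-of-two VF whose answer differs
// from the answer at Start; the decision for the clamped range is the
// answer at Start.
bool clampRangeSketch(uint64_t Start, uint64_t &End,
                      const std::function<bool(uint64_t)> &Predicate) {
  bool AtStart = Predicate(Start);
  for (uint64_t VF = Start * 2; VF < End; VF *= 2)
    if (Predicate(VF) != AtStart) {
      End = VF; // a later plan covers [VF, old End) separately
      break;
    }
  return AtStart;
}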
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
- VPlanPtr &Plan) {
+ VPlan &Plan) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
@@ -8058,7 +8152,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (OrigLoop->isLoopExiting(Src))
return EdgeMaskCache[Edge] = SrcMask;
- VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
+ VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
assert(EdgeMask && "No Edge Mask found for condition");
if (BI->getSuccessor(0) != Dst)
@@ -8069,7 +8163,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
// 'select i1 SrcMask, i1 EdgeMask, i1 false'.
// The select version does not introduce new UB if SrcMask is false and
// EdgeMask is poison. Using 'and' here introduces undefined behavior.
- VPValue *False = Plan->getOrAddVPValue(
+ VPValue *False = Plan.getVPValueOrAddLiveIn(
ConstantInt::getFalse(BI->getCondition()->getType()));
EdgeMask =
Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
@@ -8078,7 +8172,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
return EdgeMaskCache[Edge] = EdgeMask;
}
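
[Annotation] The select-vs-and rationale in this function can be modeled with a toy three-valued type, where std::nullopt plays the role of poison (combineMasks is illustrative, not LLVM's semantics machinery):

#include <optional>

// 'select SrcMask, EdgeMask, false' never observes EdgeMask when SrcMask
// is false, so the result stays a well-defined 'false'; a plain 'and'
// would read the poison operand unconditionally.
std::optional<bool> combineMasks(std::optional<bool> SrcMask,
                                 std::optional<bool> EdgeMask) {
  if (SrcMask && !*SrcMask)
    return false; // defined result without reading EdgeMask
  if (!SrcMask || !EdgeMask)
    return std::nullopt; // poison propagates only when observed
  return *SrcMask && *EdgeMask;
}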
-VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
+VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
// Look for cached value.
@@ -8098,29 +8192,28 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// If we're using the active lane mask for control flow, then we get the
// mask from the active lane mask PHI that is cached in the VPlan.
- PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
- if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
- return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
+ TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
+ if (useActiveLaneMaskForControlFlow(TFStyle))
+ return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi();
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
// constructing the desired canonical IV in the header block as its first
// non-phi instructions.
- VPBasicBlock *HeaderVPBB =
- Plan->getVectorLoopRegion()->getEntryBasicBlock();
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- if (EmitGetActiveLaneMask != PredicationStyle::None) {
- VPValue *TC = Plan->getOrCreateTripCount();
+ if (useActiveLaneMask(TFStyle)) {
+ VPValue *TC = Plan.getTripCount();
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
nullptr, "active.lane.mask");
} else {
- VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
}
return BlockMaskCache[BB] = BlockMask;
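
[Annotation] Why the fallback path forms IV <= BTC rather than IV < TC, as the comment above explains: a lane-by-lane scalar sketch (headerBlockMask is a hypothetical helper):

#include <cstdint>

// Lane L of the header mask for a vector iteration starting at IV.
// Comparing against the backedge-taken count stays correct even when
// BTC == UINT64_MAX, where the trip count BTC + 1 would wrap to 0 and
// IV < TC would wrongly be all-false.
void headerBlockMask(uint64_t IV, uint64_t BTC, unsigned VF, bool *Mask) {
  for (unsigned L = 0; L < VF; ++L)
    Mask[L] = (IV + L) <= BTC;
}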
@@ -8168,7 +8261,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
- Mask = createBlockInMask(I->getParent(), Plan);
+ Mask = createBlockInMask(I->getParent(), *Plan);
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -8189,22 +8282,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe.
-static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
- PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
- const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
- VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
- // Returns true if an instruction \p I should be scalarized instead of
- // vectorized for the chosen vectorization factor.
- auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
- return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF);
- };
-
- bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) {
- return ShouldScalarizeInstruction(PhiOrTrunc, VF);
- },
- Range);
+static VPWidenIntOrFpInductionRecipe *
+createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
+ VPValue *Start, const InductionDescriptor &IndDesc,
+ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
+ VFRange &Range) {
assert(IndDesc.getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
@@ -8213,12 +8295,10 @@ static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
- !NeedsScalarIVOnly);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
}
assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
- !NeedsScalarIVOnly);
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
}
VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
@@ -8227,14 +8307,13 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
- return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
+ return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
*PSE.getSE(), *OrigLoop, Range);
// Check if this is pointer induction. If so, build the recipe for it.
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
*PSE.getSE());
- assert(isa<SCEVConstant>(II->getStep()));
return new VPWidenPointerInductionRecipe(
Phi, Operands[0], Step, *II,
LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8267,9 +8346,9 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
auto *Phi = cast<PHINode>(I->getOperand(0));
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
- VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
- *PSE.getSE(), *OrigLoop, Range);
+ VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
+ return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
+ *OrigLoop, Range);
}
return nullptr;
}
@@ -8309,7 +8388,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
for (unsigned In = 0; In < NumIncoming; In++) {
VPValue *EdgeMask =
- createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
+ createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
assert((EdgeMask || NumIncoming == 1) &&
"Multiple predecessors with one having a full mask");
OperandsWithMask.push_back(Operands[In]);
@@ -8321,8 +8400,8 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
- VFRange &Range) const {
-
+ VFRange &Range,
+ VPlanPtr &Plan) {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
@@ -8339,17 +8418,17 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
+ SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
ID && LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) -> bool {
- bool NeedToScalarize = false;
+ Function *Variant;
// Is it beneficial to perform intrinsic call compared to lib
// call?
InstructionCost CallCost =
- CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ CM.getVectorCallCost(CI, VF, &Variant);
InstructionCost IntrinsicCost =
CM.getVectorIntrinsicCost(CI, VF);
return IntrinsicCost <= CallCost;
@@ -8358,6 +8437,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
if (ShouldUseVectorIntrinsic)
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
+ Function *Variant = nullptr;
+ ElementCount VariantVF;
+ bool NeedsMask = false;
// Is it better to call a vectorized version of the function than to
// scalarize the call?
auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8365,14 +8447,57 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// The following case may be scalarized depending on the VF.
// The flag shows whether we can use a usual Call for vectorized
// version of the instruction.
- bool NeedToScalarize = false;
- CM.getVectorCallCost(CI, VF, NeedToScalarize);
- return !NeedToScalarize;
+
+ // If we've found a variant at a previous VF, then stop looking. A
+ // vectorized variant of a function expects input in a certain shape
+ // -- basically the number of input registers, the number of lanes
+ // per register, and whether there's a mask required.
+ // We store a pointer to the variant in the VPWidenCallRecipe, so
+ // once we have an appropriate variant it's only valid for that VF.
+ // This will force a different vplan to be generated for each VF that
+ // finds a valid variant.
+ if (Variant)
+ return false;
+ CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
+ // If we found a valid vector variant at this VF, then store the VF
+ // in case we need to generate a mask.
+ if (Variant)
+ VariantVF = VF;
+ return Variant != nullptr;
},
Range);
- if (ShouldUseVectorCall)
+ if (ShouldUseVectorCall) {
+ if (NeedsMask) {
+ // We have 2 cases that would require a mask:
+ // 1) The block needs to be predicated, either due to a conditional
+ // in the scalar loop or use of an active lane mask with
+ // tail-folding, and we use the appropriate mask for the block.
+ // 2) No mask is required for the block, but the only available
+ // vector variant at this VF requires a mask, so we synthesize an
+ // all-true mask.
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(CI))
+ Mask = createBlockInMask(CI->getParent(), *Plan);
+ else
+ Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
+ IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
+
+ VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true);
+ unsigned MaskPos = 0;
+
+ for (const VFInfo &Info : VFDatabase::getMappings(*CI))
+ if (Info.Shape == Shape) {
+ assert(Info.isMasked() && "Vector function info shape mismatch");
+ MaskPos = Info.getParamIndexForOptionalMask().value();
+ break;
+ }
+
+ Ops.insert(Ops.begin() + MaskPos, Mask);
+ }
+
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
- Intrinsic::not_intrinsic);
+ Intrinsic::not_intrinsic, Variant);
+ }
return nullptr;
}
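
[Annotation] The two masking cases spelled out in the comment above reduce to a small decision; a toy model (maskForWidenedCall and its parameters are illustrative):

#include <cstddef>
#include <vector>

// Case 1: the call site is predicated, so reuse the block mask.
// Case 2: only a masked variant exists at this VF, so synthesize an
// all-true mask.
std::vector<bool> maskForWidenedCall(bool CallNeedsPredication,
                                     const std::vector<bool> &BlockMask,
                                     std::size_t VF) {
  if (CallNeedsPredication)
    return BlockMask;
  return std::vector<bool>(VF, true);
}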
@@ -8405,9 +8530,9 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
- VPValue *Mask = createBlockInMask(I->getParent(), Plan);
- VPValue *One =
- Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
+ VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
+ VPValue *One = Plan->getVPValueOrAddLiveIn(
+ ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS =
new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
I->getDebugLoc());
@@ -8415,38 +8540,26 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
}
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
}
case Instruction::Add:
case Instruction::And:
case Instruction::AShr:
- case Instruction::BitCast:
case Instruction::FAdd:
case Instruction::FCmp:
case Instruction::FDiv:
case Instruction::FMul:
case Instruction::FNeg:
- case Instruction::FPExt:
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::FPTrunc:
case Instruction::FRem:
case Instruction::FSub:
case Instruction::ICmp:
- case Instruction::IntToPtr:
case Instruction::LShr:
case Instruction::Mul:
case Instruction::Or:
- case Instruction::PtrToInt:
case Instruction::Select:
- case Instruction::SExt:
case Instruction::Shl:
- case Instruction::SIToFP:
case Instruction::Sub:
- case Instruction::Trunc:
- case Instruction::UIToFP:
case Instruction::Xor:
- case Instruction::ZExt:
case Instruction::Freeze:
return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
};
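
[Annotation] The predicated div/rem handling at the top of this hunk builds select(Mask, Ops[1], 1) as a safe divisor. A per-lane scalar model of why that makes the unpredicated divide safe (predicatedSDivLane is a hypothetical name):

#include <cstdint>

// Masked-off lanes divide by 1 instead of a possibly-zero divisor, so
// the division cannot trap; those lanes' results are discarded by the
// mask anyway.
int64_t predicatedSDivLane(bool LaneActive, int64_t LHS, int64_t RHS) {
  int64_t SafeRHS = LaneActive ? RHS : 1; // select(mask, RHS, 1)
  return LHS / SafeRHS;
}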
@@ -8462,9 +8575,9 @@ void VPRecipeBuilder::fixHeaderPhis() {
}
}
-VPBasicBlock *VPRecipeBuilder::handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- VPlanPtr &Plan) {
+VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
+ VFRange &Range,
+ VPlan &Plan) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
@@ -8501,83 +8614,22 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
break;
}
}
-
- auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
- IsUniform, IsPredicated);
-
- // Find if I uses a predicated instruction. If so, it will use its scalar
- // value. Avoid hoisting the insert-element which packs the scalar value into
- // a vector value, as that happens iff all users use the vector value.
- for (VPValue *Op : Recipe->operands()) {
- auto *PredR =
- dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
- if (!PredR)
- continue;
- auto *RepR = cast<VPReplicateRecipe>(
- PredR->getOperand(0)->getDefiningRecipe());
- assert(RepR->isPredicated() &&
- "expected Replicate recipe to be predicated");
- RepR->setAlsoPack(false);
- }
-
- // Finalize the recipe for Instr, first if it is not predicated.
+ VPValue *BlockInMask = nullptr;
if (!IsPredicated) {
+ // Finalize the recipe for Instr, first if it is not predicated.
LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
- setRecipe(I, Recipe);
- Plan->addVPValue(I, Recipe);
- VPBB->appendRecipe(Recipe);
- return VPBB;
- }
- LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
-
- VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
- assert(SingleSucc && "VPBB must have a single successor when handling "
- "predicated replication.");
- VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
- // Record predicated instructions for above packing optimizations.
- VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
- VPBlockUtils::insertBlockAfter(Region, VPBB);
- auto *RegSucc = new VPBasicBlock();
- VPBlockUtils::insertBlockAfter(RegSucc, Region);
- VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
- return RegSucc;
-}
-
-VPRegionBlock *
-VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
- VPlanPtr &Plan) {
- Instruction *Instr = PredRecipe->getUnderlyingInstr();
- // Instructions marked for predication are replicated and placed under an
- // if-then construct to prevent side-effects.
- // Generate recipes to compute the block mask for this region.
- VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
-
- // Build the triangular if-then region.
- std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
- assert(Instr->getParent() && "Predicated instruction not in any basic block");
- auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
- auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
- auto *PHIRecipe = Instr->getType()->isVoidTy()
- ? nullptr
- : new VPPredInstPHIRecipe(PredRecipe);
- if (PHIRecipe) {
- setRecipe(Instr, PHIRecipe);
- Plan->addVPValue(Instr, PHIRecipe);
} else {
- setRecipe(Instr, PredRecipe);
- Plan->addVPValue(Instr, PredRecipe);
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ // Instructions marked for predication are replicated and a mask operand is
+ // added initially. Masked replicate recipes will later be placed under an
+ // if-then construct to prevent side-effects. Generate recipes to compute
+ // the block mask for this region.
+ BlockInMask = createBlockInMask(I->getParent(), Plan);
}
- auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
- auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
-
- // Note: first set Entry as region entry and then connect successors starting
- // from it in order, to propagate the "parent" of each VPBasicBlock.
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
- VPBlockUtils::connectBlocks(Pred, Exiting);
-
- return Region;
+ auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
+ IsUniform, BlockInMask);
+ return toVPRecipeResult(Recipe);
}
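
[Annotation] The pred.<opcode>.if / pred.<opcode>.continue structure that the removed createReplicateRegion built by hand (and that masked VPReplicateRecipes now receive in a later transform) behaves per lane like this sketch (toy types, hypothetical names):

#include <cstddef>
#include <vector>

// Each scalarized lane runs only when its mask bit is set (the .if
// block); the .continue block's phi merges the value back, and lanes
// that were masked off keep their initial (poison-like) contents.
std::vector<int> replicatePredicated(const std::vector<bool> &Mask,
                                     const std::vector<int> &In,
                                     int (*ScalarOp)(int)) {
  std::vector<int> Out(In.size(), 0); // stand-in for poison
  for (std::size_t L = 0; L < In.size(); ++L)
    if (Mask[L])
      Out[L] = ScalarOp(In[L]);
  return Out;
}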
VPRecipeOrVPValueTy
@@ -8643,7 +8695,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return nullptr;
if (auto *CI = dyn_cast<CallInst>(Instr))
- return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
+ return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
@@ -8653,13 +8705,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
return toVPRecipeResult(new VPWidenGEPRecipe(
- GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
+ GEP, make_range(Operands.begin(), Operands.end())));
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
- bool InvariantCond =
- PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
return toVPRecipeResult(new VPWidenSelectRecipe(
- *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
+ *SI, make_range(Operands.begin(), Operands.end())));
+ }
+
+ if (auto *CI = dyn_cast<CastInst>(Instr)) {
+ return toVPRecipeResult(
+ new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
}
return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
@@ -8677,34 +8732,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
auto &ConditionalAssumes = Legal->getConditionalAssumes();
DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
- MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- // Dead instructions do not need sinking. Remove them from SinkAfter.
- for (Instruction *I : DeadInstructions)
- SinkAfter.erase(I);
-
- // Cannot sink instructions after dead instructions (there won't be any
- // recipes for them). Instead, find the first non-dead previous instruction.
- for (auto &P : Legal->getSinkAfter()) {
- Instruction *SinkTarget = P.second;
- Instruction *FirstInst = &*SinkTarget->getParent()->begin();
- (void)FirstInst;
- while (DeadInstructions.contains(SinkTarget)) {
- assert(
- SinkTarget != FirstInst &&
- "Must find a live instruction (at least the one feeding the "
- "fixed-order recurrence PHI) before reaching beginning of the block");
- SinkTarget = SinkTarget->getPrevNode();
- assert(SinkTarget != P.first &&
- "sink source equals target, no sinking required");
- }
- P.second = SinkTarget;
- }
-
- auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
- for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
- VFRange SubRange = {VF, MaxVFPlusOne};
- VPlans.push_back(
- buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
+ auto MaxVFTimes2 = MaxVF * 2;
+ for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
+ VFRange SubRange = {VF, MaxVFTimes2};
+ if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions))
+ VPlans.push_back(std::move(*Plan));
VF = SubRange.End;
}
}
@@ -8712,10 +8744,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
// Add the necessary canonical IV and branch recipes required to control the
// loop.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
- bool HasNUW,
- bool UseLaneMaskForLoopControlFlow) {
+ TailFoldingStyle Style) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
- auto *StartV = Plan.getOrAddVPValue(StartIdx);
+ auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
// Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
@@ -8725,6 +8756,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
// Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
// IV by VF * UF.
+ bool HasNUW = Style == TailFoldingStyle::None;
auto *CanonicalIVIncrement =
new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
: VPInstruction::CanonicalIVIncrement,
@@ -8732,11 +8764,10 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- EB->appendRecipe(CanonicalIVIncrement);
-
- if (UseLaneMaskForLoopControlFlow) {
+ if (useActiveLaneMaskForControlFlow(Style)) {
// Create the active lane mask instruction in the vplan preheader.
- VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
+ VPBasicBlock *VecPreheader =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
// We can't use StartV directly in the ActiveLaneMask VPInstruction, since
// we have to take unrolling into account. Each part needs to start at
@@ -8745,14 +8776,34 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
: VPInstruction::CanonicalIVIncrementForPart,
{StartV}, DL, "index.part.next");
- Preheader->appendRecipe(CanonicalIVIncrementParts);
+ VecPreheader->appendRecipe(CanonicalIVIncrementParts);
// Create the ActiveLaneMask instruction using the correct start values.
- VPValue *TC = Plan.getOrCreateTripCount();
+ VPValue *TC = Plan.getTripCount();
+
+ VPValue *TripCount, *IncrementValue;
+ if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
+ // When avoiding a runtime check, the active.lane.mask inside the loop
+ // uses a modified trip count and the induction variable increment is
+ // done after the active.lane.mask intrinsic is called.
+ auto *TCMinusVF =
+ new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL);
+ VecPreheader->appendRecipe(TCMinusVF);
+ IncrementValue = CanonicalIVPHI;
+ TripCount = TCMinusVF;
+ } else {
+ // When the loop is guarded by a runtime overflow check for the loop
+ // induction variable increment by VF, we can increment the value before
+ // the get.active.lane.mask intrinsic and use the unmodified trip count.
+ EB->appendRecipe(CanonicalIVIncrement);
+ IncrementValue = CanonicalIVIncrement;
+ TripCount = TC;
+ }
+
auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
{CanonicalIVIncrementParts, TC}, DL,
"active.lane.mask.entry");
- Preheader->appendRecipe(EntryALM);
+ VecPreheader->appendRecipe(EntryALM);
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
@@ -8763,15 +8814,21 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
CanonicalIVIncrementParts =
new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
: VPInstruction::CanonicalIVIncrementForPart,
- {CanonicalIVIncrement}, DL);
+ {IncrementValue}, DL);
EB->appendRecipe(CanonicalIVIncrementParts);
auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
- {CanonicalIVIncrementParts, TC}, DL,
+ {CanonicalIVIncrementParts, TripCount}, DL,
"active.lane.mask.next");
EB->appendRecipe(ALM);
LaneMaskPhi->addOperand(ALM);
+ if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
+ // Do the increment of the canonical IV after the active.lane.mask, because
+ // that value is still based on %CanonicalIVPHI.
+ EB->appendRecipe(CanonicalIVIncrement);
+ }
+
// We have to invert the mask here because a true condition means jumping
// to the exit block.
auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
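
[Annotation] A loose lane-0 sketch of the two schemes above, assuming active.lane.mask(Base, N) means Base + lane < N (nextMaskLane0 and the saturating subtract are illustrative; the exact semantics of CalculateTripCountMinusVF are not shown in this hunk):

#include <cstdint>

// Lane-0 'stay in the loop' condition for both tail-folding styles.
bool nextMaskLane0(uint64_t IV, uint64_t TC, uint64_t VF,
                   bool WithoutRuntimeCheck) {
  if (WithoutRuntimeCheck) {
    // Compare the pre-increment IV against TC - VF: same condition as
    // IV + VF < TC, but IV + VF (which could overflow) is never formed.
    uint64_t TCMinusVF = TC >= VF ? TC - VF : 0; // modeled as saturating
    return IV < TCMinusVF;
  }
  return IV + VF < TC; // relies on the runtime check that IV + VF fits
}

This is why the IV increment is sequenced after the active.lane.mask in the no-runtime-check style: the mask is computed from the old IV value.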
@@ -8781,6 +8838,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
EB->appendRecipe(BranchBack);
} else {
+ EB->appendRecipe(CanonicalIVIncrement);
+
// Add the BranchOnCount VPInstruction to the latch.
VPInstruction *BranchBack = new VPInstruction(
VPInstruction::BranchOnCount,
@@ -8804,14 +8863,13 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
for (PHINode &ExitPhi : ExitBB->phis()) {
Value *IncomingValue =
ExitPhi.getIncomingValueForBlock(ExitingBB);
- VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
+ VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
Plan.addLiveOut(&ExitPhi, V);
}
}
-VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
- const MapVector<Instruction *, Instruction *> &SinkAfter) {
+std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8822,12 +8880,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// process after constructing the initial VPlan.
// ---------------------------------------------------------------------------
- // Mark instructions we'll need to sink later and their targets as
- // ingredients whose recipe we'll need to record.
- for (const auto &Entry : SinkAfter) {
- RecipeBuilder.recordRecipeOf(Entry.first);
- RecipeBuilder.recordRecipeOf(Entry.second);
- }
for (const auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurKind Kind =
@@ -8852,9 +8904,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// single VPInterleaveRecipe.
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
auto applyIG = [IG, this](ElementCount VF) -> bool {
- return (VF.isVector() && // Query is illegal for VF == 1
- CM.getWideningDecision(IG->getInsertPos(), VF) ==
- LoopVectorizationCostModel::CM_Interleave);
+ bool Result = (VF.isVector() && // Query is illegal for VF == 1
+ CM.getWideningDecision(IG->getInsertPos(), VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ // For scalable vectors, the only interleave factor currently supported
+ // is 2 since we require the (de)interleave2 intrinsics instead of
+ // shufflevectors.
+ assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+ "Unsupported interleave factor for scalable vectors");
+ return Result;
};
if (!getDecisionAndClampRange(applyIG, Range))
continue;
@@ -8869,26 +8927,34 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------
- // Create initial VPlan skeleton, starting with a block for the pre-header,
- // followed by a region for the vector loop, followed by the middle block. The
- // skeleton vector loop region contains a header and latch block.
- VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
- auto Plan = std::make_unique<VPlan>(Preheader);
-
+ // Create initial VPlan skeleton, having a basic block for the pre-header
+ // which contains SCEV expansions that need to happen before the CFG is
+ // modified; a basic block for the vector pre-header, followed by a region for
+ // the vector loop, followed by the middle basic block. The skeleton vector
+ // loop region contains a header and latch basic blocks.
+ VPlanPtr Plan = VPlan::createInitialVPlan(
+ createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
+ *PSE.getSE());
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
- VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
+ VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry());
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
+ // Don't use getDecisionAndClampRange here: we don't know the UF, so it is
+ // better to be conservative here rather than to split this up into
+ // different VPlans.
+ bool IVUpdateMayOverflow = false;
+ for (ElementCount VF : Range)
+ IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
+
Instruction *DLInst =
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
DLInst ? DLInst->getDebugLoc() : DebugLoc(),
- !CM.foldTailByMasking(),
- CM.useActiveLaneMaskForControlFlow());
+ CM.getTailFoldingStyle(IVUpdateMayOverflow));
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
@@ -8896,18 +8962,16 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
DFS.perform(LI);
VPBasicBlock *VPBB = HeaderVPBB;
- SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
- unsigned VPBBsForBB = 0;
if (VPBB != HeaderVPBB)
VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
- for (Instruction &I : BB->instructionsWithoutDebug()) {
+ for (Instruction &I : BB->instructionsWithoutDebug(false)) {
Instruction *Instr = &I;
// First filter out irrelevant instructions, to ensure no recipes are
@@ -8918,7 +8982,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
- Operands.push_back(Plan->getOrAddVPValue(
+ Operands.push_back(Plan->getVPValueOrAddLiveIn(
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
} else {
auto OpRange = Plan->mapToVPValues(Instr->operands());
@@ -8932,50 +8996,36 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
continue;
- if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
- Instr, Operands, Range, VPBB, Plan)) {
- // If Instr can be simplified to an existing VPValue, use it.
- if (RecipeOrValue.is<VPValue *>()) {
- auto *VPV = RecipeOrValue.get<VPValue *>();
- Plan->addVPValue(Instr, VPV);
- // If the re-used value is a recipe, register the recipe for the
- // instruction, in case the recipe for Instr needs to be recorded.
- if (VPRecipeBase *R = VPV->getDefiningRecipe())
- RecipeBuilder.setRecipe(Instr, R);
- continue;
- }
- // Otherwise, add the new recipe.
- VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
- for (auto *Def : Recipe->definedValues()) {
- auto *UV = Def->getUnderlyingValue();
- Plan->addVPValue(UV, Def);
- }
-
- if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
- HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
- // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
- // of the header block. That can happen for truncates of induction
- // variables. Those recipes are moved to the phi section of the header
- // block after applying SinkAfter, which relies on the original
- // position of the trunc.
- assert(isa<TruncInst>(Instr));
- InductionsToMove.push_back(
- cast<VPWidenIntOrFpInductionRecipe>(Recipe));
- }
- RecipeBuilder.setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
+ auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
+ Instr, Operands, Range, VPBB, Plan);
+ if (!RecipeOrValue)
+ RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
+ // If Instr can be simplified to an existing VPValue, use it.
+ if (isa<VPValue *>(RecipeOrValue)) {
+ auto *VPV = cast<VPValue *>(RecipeOrValue);
+ Plan->addVPValue(Instr, VPV);
+ // If the re-used value is a recipe, register the recipe for the
+ // instruction, in case the recipe for Instr needs to be recorded.
+ if (VPRecipeBase *R = VPV->getDefiningRecipe())
+ RecipeBuilder.setRecipe(Instr, R);
continue;
}
-
- // Otherwise, if all widening options failed, Instruction is to be
- // replicated. This may create a successor for VPBB.
- VPBasicBlock *NextVPBB =
- RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
- if (NextVPBB != VPBB) {
- VPBB = NextVPBB;
- VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
- : "");
+ // Otherwise, add the new recipe.
+ VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
+ for (auto *Def : Recipe->definedValues()) {
+ auto *UV = Def->getUnderlyingValue();
+ Plan->addVPValue(UV, Def);
}
+
+ RecipeBuilder.setRecipe(Instr, Recipe);
+ if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
+ HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
+ // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the
+ // phi section of HeaderVPBB.
+ assert(isa<TruncInst>(Instr));
+ Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+ } else
+ VPBB->appendRecipe(Recipe);
}
VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
@@ -8985,7 +9035,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// After here, VPBB should not be used.
VPBB = nullptr;
- addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
+ if (CM.requiresScalarEpilogue(Range)) {
+ // No edge from the middle block to the unique exit block has been inserted,
+ // and there is nothing to fix from the vector loop; the exit phis should
+ // only have incoming values from the scalar loop.
+ } else
+ addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -8998,116 +9053,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// bring the VPlan to its final state.
// ---------------------------------------------------------------------------
- // Apply Sink-After legal constraints.
- auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
- auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
- if (Region && Region->isReplicator()) {
- assert(Region->getNumSuccessors() == 1 &&
- Region->getNumPredecessors() == 1 && "Expected SESE region!");
- assert(R->getParent()->size() == 1 &&
- "A recipe in an original replicator region must be the only "
- "recipe in its block");
- return Region;
- }
- return nullptr;
- };
- for (const auto &Entry : SinkAfter) {
- VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
- VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
-
- auto *TargetRegion = GetReplicateRegion(Target);
- auto *SinkRegion = GetReplicateRegion(Sink);
- if (!SinkRegion) {
- // If the sink source is not a replicate region, sink the recipe directly.
- if (TargetRegion) {
- // The target is in a replication region, make sure to move Sink to
- // the block after it, not into the replication region itself.
- VPBasicBlock *NextBlock =
- cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
- Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
- } else
- Sink->moveAfter(Target);
- continue;
- }
-
- // The sink source is in a replicate region. Unhook the region from the CFG.
- auto *SinkPred = SinkRegion->getSinglePredecessor();
- auto *SinkSucc = SinkRegion->getSingleSuccessor();
- VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
- VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
- VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
-
- if (TargetRegion) {
- // The target recipe is also in a replicate region, move the sink region
- // after the target region.
- auto *TargetSucc = TargetRegion->getSingleSuccessor();
- VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
- VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
- VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
- } else {
- // The sink source is in a replicate region, we need to move the whole
- // replicate region, which should only contain a single recipe in the
- // main block.
- auto *SplitBlock =
- Target->getParent()->splitAt(std::next(Target->getIterator()));
-
- auto *SplitPred = SplitBlock->getSinglePredecessor();
-
- VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
- VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
- VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
- }
- }
-
- VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
- VPlanTransforms::removeRedundantInductionCasts(*Plan);
-
- // Now that sink-after is done, move induction recipes for optimized truncates
- // to the phi section of the header block.
- for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
- Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
-
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
RecipeBuilder, Range.Start);
- // Introduce a recipe to combine the incoming and previous values of a
- // fixed-order recurrence.
- for (VPRecipeBase &R :
- Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
- if (!RecurPhi)
- continue;
-
- VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
- // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
- // to terminate.
- while (auto *PrevPhi =
- dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
- PrevRecipe = &PrevPhi->getBackedgeRecipe();
- VPBasicBlock *InsertBlock = PrevRecipe->getParent();
- auto *Region = GetReplicateRegion(PrevRecipe);
- if (Region)
- InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
- if (!InsertBlock) {
- InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
- VPBlockUtils::insertBlockAfter(InsertBlock, Region);
- }
- if (Region || PrevRecipe->isPhi())
- Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
- else
- Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
-
- auto *RecurSplice = cast<VPInstruction>(
- Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
- {RecurPhi, RecurPhi->getBackedgeValue()}));
-
- RecurPhi->replaceAllUsesWith(RecurSplice);
- // Set the first operand of RecurSplice to RecurPhi again, after replacing
- // all users.
- RecurSplice->setOperand(0, RecurPhi);
- }
-
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
@@ -9122,48 +9071,66 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
StoredValues.push_back(StoreR->getStoredValue());
}
+ bool NeedsMaskForGaps =
+ IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
- Recipe->getMask());
+ Recipe->getMask(), NeedsMaskForGaps);
VPIG->insertBefore(Recipe);
unsigned J = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *Member = IG->getMember(i)) {
+ VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
if (!Member->getType()->isVoidTy()) {
- VPValue *OriginalV = Plan->getVPValue(Member);
- Plan->removeVPValueFor(Member);
- Plan->addVPValue(Member, VPIG->getVPValue(J));
+ VPValue *OriginalV = MemberR->getVPSingleValue();
OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
J++;
}
- RecipeBuilder.getRecipe(Member)->eraseFromParent();
+ MemberR->eraseFromParent();
}
}
- for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
- VF *= 2)
+ for (ElementCount VF : Range)
Plan->addVF(VF);
Plan->setName("Initial VPlan");
+ // Replace VPValues for known constant strides guaranteed by predicate scalar
+ // evolution.
+ for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
+ auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
+ auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
+ // Only handle constant strides for now.
+ if (!ScevStride)
+ continue;
+ Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
+
+ auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
+ // The versioned value may not be used in the loop directly, so just add a
+ // new live-in in those cases.
+ Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
+ }
+
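[Annotation] A toy model of the stride-versioning replacement above, assuming nothing about the SCEV classes: once the runtime predicate guarantees a symbolic stride equals a constant, uses of the runtime stride value fold to that constant (resolveStride and its map are illustrative):

#include <string>
#include <unordered_map>

// Known[name] holds strides the versioning predicate proved constant.
long resolveStride(const std::unordered_map<std::string, long> &Known,
                   const std::string &Name, long RuntimeValue) {
  auto It = Known.find(Name);
  return It != Known.end() ? It->second : RuntimeValue;
}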
// From this point onwards, VPlan-to-VPlan transformations may change the plan
// in ways that accessing values using original IR values is incorrect.
Plan->disableValue2VPValue();
+ // Sink users of fixed-order recurrences past the recipe defining the
+ // previous value and introduce FirstOrderRecurrenceSplice VPInstructions.
+ if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
+ return std::nullopt;
+
+ VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
+ VPlanTransforms::removeRedundantInductionCasts(*Plan);
+
VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
VPlanTransforms::removeDeadRecipes(*Plan);
- bool ShouldSimplify = true;
- while (ShouldSimplify) {
- ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
- ShouldSimplify |=
- VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
- ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
- }
+ VPlanTransforms::createAndOptimizeReplicateRegions(*Plan);
VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
- return Plan;
+ return std::make_optional(std::move(Plan));
}
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
@@ -9175,21 +9142,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
// Create new empty VPlan
- auto Plan = std::make_unique<VPlan>();
+ auto Plan = VPlan::createInitialVPlan(
+ createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
+ *PSE.getSE());
// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
HCFGBuilder.buildHierarchicalCFG();
- for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
- VF *= 2)
+ for (ElementCount VF : Range)
Plan->addVF(VF);
- SmallPtrSet<Instruction *, 1> DeadInstructions;
VPlanTransforms::VPInstructionsToVPRecipes(
- OrigLoop, Plan,
+ Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
- DeadInstructions, *PSE.getSE(), *TLI);
+ *PSE.getSE(), *TLI);
// Remove the existing terminator of the exiting block of the top-most region.
// A BranchOnCount will be added instead when adding the canonical IV recipes.
@@ -9198,7 +9165,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
Term->eraseFromParent();
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
- true, CM.useActiveLaneMaskForControlFlow());
+ CM.getTailFoldingStyle());
return Plan;
}
@@ -9255,7 +9222,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(WidenRecipe->getParent(),
WidenRecipe->getIterator());
- CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
+ CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan);
}
if (IsFMulAdd) {
@@ -9270,7 +9237,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VecOp = FMulRecipe;
}
VPReductionRecipe *RedRecipe =
- new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
+ new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI);
WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
Plan->removeVPValueFor(R);
Plan->addVPValue(R, RedRecipe);
@@ -9304,13 +9271,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (!PhiR || PhiR->isInLoop())
continue;
VPValue *Cond =
- RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+ RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
VPValue *Red = PhiR->getBackedgeValue();
assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
}
}
+
+ VPlanTransforms::clearReductionWrapFlags(*Plan);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -9475,7 +9444,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
- Value *Step = State.get(getOperand(1), VPIteration(0, Part));
+ Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
Value *SclrGep = emitTransformedIndex(
State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
SclrGep->setName("next.gep");
@@ -9485,8 +9454,6 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
return;
}
- assert(isa<SCEVConstant>(IndDesc.getStep()) &&
- "Induction step not a SCEV constant!");
Type *PhiType = IndDesc.getStep()->getType();
// Build a pointer phi
@@ -9506,7 +9473,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
Value *NumUnrolledElems =
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
Value *InductionGEP = GetElementPtrInst::Create(
- IndDesc.getElementType(), NewPointerPhi,
+ State.Builder.getInt8Ty(), NewPointerPhi,
State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
InductionLoc);
// Add induction update using an incorrect block temporarily. The phi node
@@ -9529,10 +9496,10 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
StartOffset = State.Builder.CreateAdd(
StartOffset, State.Builder.CreateStepVector(VecPhiType));
- assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
+ assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
"scalar step must be the same across all parts");
Value *GEP = State.Builder.CreateGEP(
- IndDesc.getElementType(), NewPointerPhi,
+ State.Builder.getInt8Ty(), NewPointerPhi,
State.Builder.CreateMul(
StartOffset,
State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
@@ -9584,7 +9551,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
- getStoredValues(), getMask());
+ getStoredValues(), getMask(),
+ NeedsMaskForGaps);
}
void VPReductionRecipe::execute(VPTransformState &State) {
@@ -9640,10 +9608,9 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
Instruction *UI = getUnderlyingInstr();
if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- State.ILV->scalarizeInstruction(UI, this, *State.Instance,
- IsPredicated, State);
+ State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
// Insert scalar instance packing it into a vector.
- if (AlsoPack && State.VF.isVector()) {
+ if (State.VF.isVector() && shouldPack()) {
// If we're constructing lane 0, initialize to start from poison.
if (State.Instance->Lane.isFirstLane()) {
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
@@ -9663,8 +9630,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
all_of(operands(), [](VPValue *Op) {
return Op->isDefinedOutsideVectorRegions();
})) {
- State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
- State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
if (user_begin() != user_end()) {
for (unsigned Part = 1; Part < State.UF; ++Part)
State.set(this, State.get(this, VPIteration(0, 0)),
@@ -9676,16 +9642,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
// Uniform within VL means we need to generate lane 0 only for each
// unrolled copy.
for (unsigned Part = 0; Part < State.UF; ++Part)
- State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
- IsPredicated, State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
return;
}
- // A store of a loop varying value to a loop invariant address only
- // needs only the last copy of the store.
- if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
+ // A store of a loop varying value to a uniform address only needs the last
+ // copy of the store.
+ if (isa<StoreInst>(UI) &&
+ vputils::isUniformAfterVectorization(getOperand(1))) {
auto Lane = VPLane::getLastLaneForVF(State.VF);
- State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated,
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
State);
return;
}
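
[Annotation] The uniform-address case above emits only the final copy of the store; as a scalar model (storeToUniformAddress is a hypothetical helper):

#include <cstdint>

// Every unrolled copy of a store to a uniform (single-location) address
// writes the same slot, so writing just the logically last lane of the
// last part preserves the final memory state.
void storeToUniformAddress(int64_t *Addr, const int64_t *LaneValues,
                           unsigned TotalLanes) {
  *Addr = LaneValues[TotalLanes - 1];
}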
@@ -9695,8 +9661,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
const unsigned EndLane = State.VF.getKnownMinValue();
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
- IsPredicated, State);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
@@ -9714,7 +9679,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
- bool CreateGatherScatter = !Consecutive;
+ bool CreateGatherScatter = !isConsecutive();
auto &Builder = State.Builder;
InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
@@ -9725,36 +9690,39 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
// Calculate the pointer for the specific unroll-part.
- GetElementPtrInst *PartPtr = nullptr;
-
+ Value *PartPtr = nullptr;
+
+ // Use i32 for the gep index type when the value is constant,
+ // or query DataLayout for a more suitable index type otherwise.
+ const DataLayout &DL =
+ Builder.GetInsertBlock()->getModule()->getDataLayout();
+ Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
+ ? DL.getIndexType(ScalarDataTy->getPointerTo())
+ : Builder.getInt32Ty();
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
InBounds = gep->isInBounds();
- if (Reverse) {
+ if (isReverse()) {
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
// RunTimeVF = VScale * VF.getKnownMinValue()
// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
- Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
+ Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
// NumElt = -Part * RunTimeVF
- Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+ Value *NumElt =
+ Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
// LastLane = 1 - RunTimeVF
- Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
+ Value *LastLane =
+ Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
+ PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
PartPtr =
- cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
- PartPtr->setIsInBounds(InBounds);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
- PartPtr->setIsInBounds(InBounds);
+ Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] =
Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
} else {
- Value *Increment =
- createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
- PartPtr = cast<GetElementPtrInst>(
- Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
- PartPtr->setIsInBounds(InBounds);
+ Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
+ PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
}
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
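
The reverse-access offset arithmetic above can be checked in isolation. A standalone sketch, assuming a fixed-width VF of 4 so that RunTimeVF == 4: part p starts at -p * VF + (1 - VF) elements from the base pointer, which yields consecutive but reversed coverage:

  #include <cstdio>

  int main() {
    const long VF = 4; // RunTimeVF when VScale == 1
    for (long Part = 0; Part < 2; ++Part) {
      long NumElt = -Part * VF; // NumElt = -Part * RunTimeVF
      long LastLane = 1 - VF;   // LastLane = 1 - RunTimeVF
      long Start = NumElt + LastLane;
      // Prints: Part 0: [-3, 0] and Part 1: [-7, -4].
      std::printf("Part %ld: [%ld, %ld]\n", Part, Start, Start + VF - 1);
    }
  }
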
@@ -9774,7 +9742,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
MaskPart);
} else {
- if (Reverse) {
+ if (isReverse()) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
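
The CreateVectorReverse call above exists because the wide store itself always writes ascending addresses; for a reverse consecutive access the lane order must therefore be flipped first. A small self-contained model of that requirement (illustrative, not VPlan code):

  #include <algorithm>
  #include <array>
  #include <cassert>

  int main() {
    // The scalar loop stores A[I] = I for I = 7 down to 4; the wide store
    // covers A[4..7] in ascending order, so the lanes are reversed first.
    std::array<int, 4> Lanes{7, 6, 5, 4};     // per-lane values, loop order
    std::reverse(Lanes.begin(), Lanes.end()); // models CreateVectorReverse
    std::array<int, 8> A{};
    for (int L = 0; L < 4; ++L)               // models one wide store
      A[4 + L] = Lanes[L];
    assert(A[4] == 4 && A[7] == 7);
  }
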
@@ -9833,7 +9801,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
static ScalarEpilogueLowering getScalarEpilogueLowering(
Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
@@ -9869,7 +9836,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
};
// 4) if the TTI hook indicates this is profitable, request predication.
- if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
+ TailFoldingInfo TFI(TLI, &LVL, IAI);
+ if (TTI->preferPredicateOverEpilogue(&TFI))
return CM_ScalarEpilogueNotNeededUsePredicate;
return CM_ScalarEpilogueAllowed;
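
The new hook takes one parameter object where the old one took a list of loose analysis pointers; the constructor call above shows the three inputs the bundle carries. A self-contained sketch of the pattern, using stand-in types rather than the real TargetLibraryInfo, LoopVectorizationLegality, and InterleavedAccessInfo:

  struct TLIStub {};
  struct LVLStub {};
  struct IAIStub {};

  struct TailFoldingInfo {
    TLIStub *TLI;
    LVLStub *LVL;
    IAIStub *IAI;
    TailFoldingInfo(TLIStub *TLI, LVLStub *LVL, IAIStub *IAI)
        : TLI(TLI), LVL(LVL), IAI(IAI) {}
  };

  // Targets override a single hook taking the bundle, so adding another
  // input later changes the struct, not every override's signature.
  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
    return TFI && TFI->LVL != nullptr; // placeholder profitability logic
  }
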
@@ -9880,9 +9848,29 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
if (hasVectorValue(Def, Part))
return Data.PerPartOutput[Def][Part];
+ auto GetBroadcastInstrs = [this, Def](Value *V) {
+ bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
+ if (VF.isScalar())
+ return V;
+ // Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (SafeToHoist) {
+ BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
+ Plan->getVectorLoopRegion()->getSinglePredecessor())];
+ if (LoopVectorPreHeader)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ }
+
+ // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ return Shuf;
+ };
+
if (!hasScalarValue(Def, {Part, 0})) {
Value *IRV = Def->getLiveInIRValue();
- Value *B = ILV->getBroadcastInstrs(IRV);
+ Value *B = GetBroadcastInstrs(IRV);
set(Def, B, Part);
return B;
}
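
GetBroadcastInstrs splats a live-in scalar once, and hoists that splat into the vector preheader when the value is defined outside the vector regions. A conceptual scalar model of why hoisting is safe and profitable there (illustrative only):

  #include <array>

  static std::array<int, 4> Splat(int V) { return {V, V, V, V}; }

  void addInvariant(int *Out, const int *A, int S, int N) {
    // S is loop-invariant, so broadcast it once in the "preheader" instead
    // of re-splatting it inside every vector iteration.
    const std::array<int, 4> SV = Splat(S);
    for (int I = 0; I + 4 <= N; I += 4)
      for (int L = 0; L < 4; ++L) // stands in for one wide <4 x i32> add
        Out[I + L] = A[I + L] + SV[L];
  }
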
@@ -9900,9 +9888,11 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, {Part, LastLane})) {
- // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
+ // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
+ // VPExpandSCEVRecipes can also be uniform.
assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
- isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
+ isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
+ isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
@@ -9927,7 +9917,7 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
// State, we will only generate the insertelements once.
Value *VectorValue = nullptr;
if (IsUniform) {
- VectorValue = ILV->getBroadcastInstrs(ScalarValue);
+ VectorValue = GetBroadcastInstrs(ScalarValue);
set(Def, VectorValue, Part);
} else {
// Initialize packing with insertelements to start from undef.
@@ -9962,15 +9952,15 @@ static bool processLoopInVPlanNativePath(
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
+ ScalarEpilogueLowering SEL =
+ getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
@@ -10231,8 +10221,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
+ ScalarEpilogueLowering SEL =
+ getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
@@ -10309,11 +10299,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI);
- CM.collectValuesToIgnore();
- CM.collectElementTypesForWidening();
-
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
+ LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
+ ORE);
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
@@ -10342,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
- !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+ !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
*PSE.getSE())) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
@@ -10464,7 +10452,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Consider vectorizing the epilogue too if it's profitable.
VectorizationFactor EpilogueVF =
- CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+ LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
if (EpilogueVF.Width.isVector()) {
// The first pass vectorizes the main loop and creates a scalar epilogue
@@ -10475,8 +10463,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EPI, &LVL, &CM, BFI, PSI, Checks);
VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
- LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
- DT, true);
+ auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
+ BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
@@ -10492,6 +10480,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
+ // Re-use the trip count and steps expanded for the main loop, as
+ // skeleton creation needs them as values that dominate both the scalar
+ // and vector epilogue loops.
+ // TODO: This is a workaround needed for epilogue vectorization and it
+ // should be removed once induction resume value creation is done
+ // directly in VPlan.
+ EpilogILV.setTripCount(MainILV.getTripCount());
+ for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
+ auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
+ auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
+ ExpandedSCEVs.find(ExpandR->getSCEV())->second);
+ ExpandR->replaceAllUsesWith(ExpandedVal);
+ ExpandR->eraseFromParent();
+ }
+
// Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
// VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
// before vectorizing the epilogue loop.
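
The loop above rewires each VPExpandSCEVRecipe in the epilogue plan to the value already expanded for the main loop, keyed by the underlying SCEV, so each expression is materialized once in a block dominating both loops. A minimal model of that cache-and-reuse step, with strings standing in for SCEV expressions:

  #include <cassert>
  #include <map>
  #include <string>

  using Expr = std::string; // stands in for const SCEV *
  using Val = int;          // stands in for llvm::Value *

  static Val expandOrReuse(const Expr &E, std::map<Expr, Val> &Expanded) {
    if (auto It = Expanded.find(E); It != Expanded.end())
      return It->second;                // reuse the dominating expansion
    Val V = static_cast<Val>(E.size()); // pretend code generation
    Expanded.emplace(E, V);
    return V;
  }

  int main() {
    std::map<Expr, Val> ExpandedSCEVs; // filled while emitting the main loop
    Val Main = expandOrReuse("trip-count", ExpandedSCEVs);
    Val Epi = expandOrReuse("trip-count", ExpandedSCEVs); // no re-expansion
    assert(Main == Epi);
  }
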
@@ -10520,15 +10523,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
ResumeV = MainILV.createInductionResumeValue(
- IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
+ IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
+ {EPI.MainLoopIterationCountCheck});
}
assert(ResumeV && "Must have a resume value");
- VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
+ VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
- DT, true);
+ DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
if (!MainILV.areSafetyChecksAdded())
@@ -10581,14 +10585,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizeResult LoopVectorizePass::runImpl(
Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
- DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+ DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
LI = &LI_;
TTI = &TTI_;
DT = &DT_;
- BFI = &BFI_;
+ BFI = BFI_;
TLI = TLI_;
AC = &AC_;
LAIs = &LAIs_;
@@ -10604,7 +10608,7 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
// vector registers, loop vectorization may still enable scalar
// interleaving.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
- TTI->getMaxInterleaveFactor(1) < 2)
+ TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
return LoopVectorizeResult(false, false);
bool Changed = false, CFGChanged = false;
@@ -10656,7 +10660,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
@@ -10666,12 +10669,20 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ BlockFrequencyInfo *BFI = nullptr;
+ if (PSI && PSI->hasProfileSummary())
+ BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
LoopVectorizeResult Result =
runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;
+ if (isAssignmentTrackingEnabled(*F.getParent())) {
+ for (auto &BB : F)
+ RemoveRedundantDbgInstrs(&BB);
+ }
+
// We currently do not preserve loopinfo/dominator analyses with outer loop
// vectorization. Until this is addressed, mark these analyses as preserved
// only for non-VPlan-native path.
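
Earlier in this hunk, BFI changes from an unconditionally computed analysis result to a pointer populated only when a profile summary exists, which is the only case where this pass consults it. A small standalone model of that on-demand computation:

  #include <cstdio>
  #include <optional>

  struct BFIStub { int Freq = 1; };

  static BFIStub computeBFI() {  // models BlockFrequencyAnalysis
    std::puts("computing BFI");  // the expensive step being avoided
    return {};
  }

  static void runPass(bool HasProfileSummary) {
    std::optional<BFIStub> Storage;
    BFIStub *BFI = nullptr;      // runImpl now accepts a null BFI
    if (HasProfileSummary)
      BFI = &Storage.emplace(computeBFI());
    std::printf("BFI %s\n", BFI ? "available" : "skipped");
  }

  int main() {
    runPass(false); // no profile data: the analysis never runs
    runPass(true);  // profile data present: BFI computed on demand
  }
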
@@ -10679,6 +10690,11 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
if (!EnableVPlanNativePath) {
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+
+#ifdef EXPENSIVE_CHECKS
+ SE.verify();
+#endif
}
if (Result.MadeCFGChange) {
@@ -10699,8 +10715,8 @@ void LoopVectorizePass::printPipeline(
static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
OS, MapClassName2PassName);
- OS << "<";
+ OS << '<';
OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
- OS << ">";
+ OS << '>';
}