Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  2062
1 file changed, 873 insertions, 1189 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b603bbe55dc9..f82e161fb846 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -27,7 +27,7 @@
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
-// docs/Proposal/VectorizationPlan.rst and
+// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
@@ -57,6 +57,7 @@
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
+#include "VPlanAnalysis.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
@@ -111,10 +112,12 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
@@ -390,6 +393,21 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
cl::desc(
"Override cost based safe divisor widening for div/rem instructions"));
+static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
+ "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
+ cl::Hidden,
+ cl::desc("Try wider VFs if they enable the use of vector variants"));
+
+// Likelihood of bypassing the vectorized loop because assumptions about SCEV
+// variables not overflowing do not hold. See `emitSCEVChecks`.
+static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
+// Likelihood of bypassing the vectorized loop because pointers overlap. See
+// `emitMemRuntimeChecks`.
+static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
+// Likelihood of bypassing the vectorized loop because there are zero trips left
+// after prolog. See `emitIterationCountCheck`.
+static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
+
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
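The new vectorizer-maximize-bandwidth-for-vector-calls flag (used further down when picking the maximum VF) and the three bypass-weight tables above are the user-visible knobs added by this hunk. The {1, 127} weights follow LLVM's usual branch_weights convention: the bypass edge, which is the first successor of the created branches, is assumed to be taken roughly once per 128 executions. A minimal sketch of how such weights can be attached to a conditional branch, assuming the MDBuilder header included above; the patch itself routes this through a setBranchWeights utility, and the helper name below is illustrative only:

    #include <cstdint>
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"

    // Illustrative helper, not part of the patch: attach {TakenWeight,
    // NotTakenWeight} profile metadata to a two-successor branch.
    static void attachBypassWeights(llvm::BranchInst &BI,
                                    llvm::ArrayRef<uint32_t> Weights) {
      llvm::MDBuilder MDB(BI.getContext());
      BI.setMetadata(llvm::LLVMContext::MD_prof,
                     MDB.createBranchWeights(Weights[0], Weights[1]));
    }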
@@ -408,13 +426,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
-/// A helper function that returns an integer or floating-point constant with
-/// value C.
-static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
- return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
- : ConstantFP::get(Ty, C);
-}
-
/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
@@ -556,10 +567,6 @@ public:
const VPIteration &Instance,
VPTransformState &State);
- /// Construct the vector value of a scalarized value \p V one lane at a time.
- void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
- VPTransformState &State);
-
/// Try to vectorize interleaved access group \p Group with the base address
/// given in \p Addr, optionally masking the vector operations if \p
/// BlockInMask is non-null. Use \p State to translate given VPValues to IR
@@ -634,10 +641,6 @@ protected:
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
- /// Shrinks vector element sizes to the smallest bitwidth they can be legally
- /// represented as.
- void truncateToMinimalBitwidths(VPTransformState &State);
-
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
@@ -943,21 +946,21 @@ protected:
/// Look for a meaningful debug location on the instruction or its
/// operands.
-static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
- return I;
+ return DebugLoc();
DebugLoc Empty;
if (I->getDebugLoc() != Empty)
- return I;
+ return I->getDebugLoc();
for (Use &Op : I->operands()) {
if (Instruction *OpInst = dyn_cast<Instruction>(Op))
if (OpInst->getDebugLoc() != Empty)
- return OpInst;
+ return OpInst->getDebugLoc();
}
- return I;
+ return I->getDebugLoc();
}
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
@@ -1021,14 +1024,6 @@ const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}
-static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
- ElementCount VF) {
- assert(FTy->isFloatingPointTy() && "Expected floating point type!");
- Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
- Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
- return B.CreateUIToFP(RuntimeVF, FTy);
-}
-
void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop,
@@ -1050,6 +1045,23 @@ void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
<< Msg);
}
+/// Report successful vectorization of the loop. In case an outer loop is
+/// vectorized, prepend "outer" to the vectorization remark.
+static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+ VectorizationFactor VF, unsigned IC) {
+ LLVM_DEBUG(debugVectorizationMessage(
+ "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
+ nullptr));
+ StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "vectorized " << LoopType << "loop (vectorization width: "
+ << ore::NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
+ });
+}
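As a concrete example of the remark emitted above: vectorizing an innermost loop with VF.Width = 4 and IC = 2 produces "vectorized loop (vectorization width: 4, interleaved count: 2)", while a VPlan-native outer loop gets "vectorized outer loop (...)"; such remarks are typically surfaced with -Rpass=loop-vectorize.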
+
} // end namespace llvm
#ifndef NDEBUG
@@ -1104,7 +1116,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
RecWithFlags->dropPoisonGeneratingFlags();
} else {
- Instruction *Instr = CurRec->getUnderlyingInstr();
+ Instruction *Instr = dyn_cast_or_null<Instruction>(
+ CurRec->getVPSingleValue()->getUnderlyingValue());
(void)Instr;
assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
"found instruction with poison generating flags not covered by "
@@ -1247,6 +1260,13 @@ public:
/// avoid redundant calculations.
void setCostBasedWideningDecision(ElementCount VF);
+ /// A call may be vectorized in different ways depending on whether we have
+ /// vectorized variants available and whether the target supports masking.
+ /// This function analyzes all calls in the function at the supplied VF,
+ /// makes a decision based on the costs of available options, and stores that
+ /// decision in a map for use in planning and plan execution.
+ void setVectorizedCallDecision(ElementCount VF);
+
/// A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -1270,7 +1290,7 @@ public:
void collectElementTypesForWidening();
/// Split reductions into those that happen in the loop, and those that happen
- /// outside. In loop reductions are collected into InLoopReductionChains.
+ /// outside. In loop reductions are collected into InLoopReductions.
void collectInLoopReductions();
/// Returns true if we should use strict in-order reductions for the given
@@ -1358,7 +1378,9 @@ public:
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
- CM_Scalarize
+ CM_Scalarize,
+ CM_VectorCall,
+ CM_IntrinsicCall
};
/// Save vectorization decision \p W and \p Cost taken by the cost model for
@@ -1414,6 +1436,29 @@ public:
return WideningDecisions[InstOnVF].second;
}
+ struct CallWideningDecision {
+ InstWidening Kind;
+ Function *Variant;
+ Intrinsic::ID IID;
+ std::optional<unsigned> MaskPos;
+ InstructionCost Cost;
+ };
+
+ void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
+ Function *Variant, Intrinsic::ID IID,
+ std::optional<unsigned> MaskPos,
+ InstructionCost Cost) {
+ assert(!VF.isScalar() && "Expected vector VF");
+ CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
+ MaskPos, Cost};
+ }
+
+ CallWideningDecision getCallWideningDecision(CallInst *CI,
+ ElementCount VF) const {
+ assert(!VF.isScalar() && "Expected vector VF");
+ return CallWideningDecisions.at(std::make_pair(CI, VF));
+ }
+
/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
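A sketch of how a consumer might act on the CallWideningDecision bookkeeping added above when building recipes; CM, CI and VF stand for the cost model, the call and the chosen VF and are assumed to be in scope (illustrative only, not the patch's recipe-building code):

    using CostModel = LoopVectorizationCostModel;
    CostModel::CallWideningDecision Decision =
        CM.getCallWideningDecision(CI, VF);
    switch (Decision.Kind) {
    case CostModel::CM_VectorCall:
      // Call the vector variant Decision.Variant; if Decision.MaskPos is set,
      // a mask operand must be passed at that argument position.
      break;
    case CostModel::CM_IntrinsicCall:
      // Emit the vector intrinsic identified by Decision.IID instead.
      break;
    default:
      // CM_Scalarize: replicate the scalar call once per lane.
      break;
    }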
@@ -1447,11 +1492,15 @@ public:
/// Collect Uniform and Scalar values for the given \p VF.
/// The sets depend on CM decision for Load/Store instructions
/// that may be vectorized as interleave, gather-scatter or scalarized.
+ /// Also make a decision on what to do about call instructions in the loop
+ /// at that VF -- scalarize, call a known vector routine, or call a
+ /// vector intrinsic.
void collectUniformsAndScalars(ElementCount VF) {
// Do the analysis once.
if (VF.isScalar() || Uniforms.contains(VF))
return;
setCostBasedWideningDecision(VF);
+ setVectorizedCallDecision(VF);
collectLoopUniforms(VF);
collectLoopScalars(VF);
}
@@ -1606,20 +1655,9 @@ public:
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
- /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
- /// nodes to the chain of instructions representing the reductions. Uses a
- /// MapVector to ensure deterministic iteration order.
- using ReductionChainMap =
- SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
-
- /// Return the chain of instructions representing an inloop reduction.
- const ReductionChainMap &getInLoopReductionChains() const {
- return InLoopReductionChains;
- }
-
/// Returns true if the Phi is part of an inloop reduction.
bool isInLoopReduction(PHINode *Phi) const {
- return InLoopReductionChains.count(Phi);
+ return InLoopReductions.contains(Phi);
}
/// Estimate cost of an intrinsic call instruction CI if it were vectorized
@@ -1629,16 +1667,13 @@ public:
/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
- /// if it's needed. The flag NeedToScalarize shows if the call needs to be
- /// scalarized -
- /// i.e. either vector version isn't available, or is too expensive.
- InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
- Function **Variant,
- bool *NeedsMask = nullptr) const;
+ /// if it's needed.
+ InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
WideningDecisions.clear();
+ CallWideningDecisions.clear();
Uniforms.clear();
Scalars.clear();
}
@@ -1675,14 +1710,14 @@ private:
/// elements is a power-of-2 larger than zero. If scalable vectorization is
/// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0).
- FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
+ FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
ElementCount UserVF,
bool FoldTailByMasking);
/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
/// This is a helper function of computeFeasibleMaxVF.
- ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
+ ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
unsigned SmallestType,
unsigned WidestType,
ElementCount MaxSafeVF,
@@ -1705,7 +1740,7 @@ private:
/// part of that pattern.
std::optional<InstructionCost>
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind) const;
/// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
@@ -1783,15 +1818,12 @@ private:
/// scalarized.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
- /// PHINodes of the reductions that should be expanded in-loop along with
- /// their associated chains of reduction operations, in program order from top
- /// (PHI) to bottom
- ReductionChainMap InLoopReductionChains;
+ /// PHINodes of the reductions that should be expanded in-loop.
+ SmallPtrSet<PHINode *, 4> InLoopReductions;
/// A Map of inloop reduction operations and their immediate chain operand.
/// FIXME: This can be removed once reductions can be costed correctly in
- /// vplan. This was added to allow quick lookup to the inloop operations,
- /// without having to loop through InLoopReductionChains.
+ /// VPlan. This was added to allow quick lookup of the inloop operations.
DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
/// Returns the expected difference in cost from scalarizing the expression
@@ -1830,6 +1862,11 @@ private:
DecisionList WideningDecisions;
+ using CallDecisionList =
+ DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
+
+ CallDecisionList CallWideningDecisions;
+
/// Returns true if \p V is expected to be vectorized and it needs to be
/// extracted.
bool needsExtract(Value *V, ElementCount VF) const {
@@ -1933,12 +1970,14 @@ class GeneratedRTChecks {
SCEVExpander MemCheckExp;
bool CostTooHigh = false;
+ const bool AddBranchWeights;
public:
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
- TargetTransformInfo *TTI, const DataLayout &DL)
+ TargetTransformInfo *TTI, const DataLayout &DL,
+ bool AddBranchWeights)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
- MemCheckExp(SE, DL, "scev.check") {}
+ MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
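Whether synthetic weights are attached at all is fixed once per loop via the new AddBranchWeights constructor argument. One plausible call site (not shown in this hunk) keys the flag off the presence of profile metadata on the original loop latch, mirroring the hasBranchWeightMD() check used for the minimum-iteration test later in this patch; the surrounding names (PSE, DT, LI, TTI, F, OrigLoop) are assumed context:

    // Hypothetical caller: only add weights when the original loop already
    // carries branch_weights profile metadata.
    bool AddBranchWeights =
        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                             F->getParent()->getDataLayout(), AddBranchWeights);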
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1990,9 +2029,9 @@ public:
},
IC);
} else {
- MemRuntimeCheckCond =
- addRuntimeChecks(MemCheckBlock->getTerminator(), L,
- RtPtrChecking.getChecks(), MemCheckExp);
+ MemRuntimeCheckCond = addRuntimeChecks(
+ MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
+ MemCheckExp, VectorizerParams::HoistRuntimeChecks);
}
assert(MemRuntimeCheckCond &&
"no RT checks generated although RtPtrChecking "
@@ -2131,8 +2170,10 @@ public:
DT->addNewBlock(SCEVCheckBlock, Pred);
DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
- ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
+ BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
+ if (AddBranchWeights)
+ setBranchWeights(BI, SCEVCheckBypassWeights);
+ ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
return SCEVCheckBlock;
}
@@ -2156,9 +2197,12 @@ public:
if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
PL->addBasicBlockToLoop(MemCheckBlock, *LI);
- ReplaceInstWithInst(
- MemCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
+ if (AddBranchWeights) {
+ setBranchWeights(BI, MemCheckBypassWeights);
+ }
+ ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
MemCheckBlock->getTerminator()->setDebugLoc(
Pred->getTerminator()->getDebugLoc());
@@ -2252,157 +2296,17 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
-/// This function adds
-/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
-/// to each vector element of Val. The sequence starts at StartIndex.
-/// \p Opcode is relevant for FP induction variable.
-static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
- Instruction::BinaryOps BinOp, ElementCount VF,
- IRBuilderBase &Builder) {
- assert(VF.isVector() && "only vector VFs are supported");
-
- // Create and check the types.
- auto *ValVTy = cast<VectorType>(Val->getType());
- ElementCount VLen = ValVTy->getElementCount();
-
- Type *STy = Val->getType()->getScalarType();
- assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
- "Induction Step must be an integer or FP");
- assert(Step->getType() == STy && "Step has wrong type");
-
- SmallVector<Constant *, 8> Indices;
-
- // Create a vector of consecutive numbers from zero to VF.
- VectorType *InitVecValVTy = ValVTy;
- if (STy->isFloatingPointTy()) {
- Type *InitVecValSTy =
- IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
- InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
- }
- Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
-
- // Splat the StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
-
- if (STy->isIntegerTy()) {
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
- Step = Builder.CreateVectorSplat(VLen, Step);
- assert(Step->getType() == Val->getType() && "Invalid step vec");
- // FIXME: The newly created binary instructions should contain nsw/nuw
- // flags, which can be found from the original scalar operations.
- Step = Builder.CreateMul(InitVec, Step);
- return Builder.CreateAdd(Val, Step, "induction");
- }
-
- // Floating point induction.
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
- "Binary Opcode should be specified for FP induction");
- InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
- InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
-
- Step = Builder.CreateVectorSplat(VLen, Step);
- Value *MulOp = Builder.CreateFMul(InitVec, Step);
- return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
-}
-
-/// Compute scalar induction steps. \p ScalarIV is the scalar induction
-/// variable on which to base the steps, \p Step is the size of the step.
-static void buildScalarSteps(Value *ScalarIV, Value *Step,
- const InductionDescriptor &ID, VPValue *Def,
- VPTransformState &State) {
- IRBuilderBase &Builder = State.Builder;
-
- // Ensure step has the same type as that of scalar IV.
- Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- if (ScalarIVTy != Step->getType()) {
- // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
- // avoid separate truncate here.
- assert(Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
- }
-
- // We build scalar steps for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (ScalarIVTy->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = ID.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Determine the number of scalars we need to generate for each unroll
- // iteration.
- bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
- // Compute the scalar steps and save the results in State.
- Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
- ScalarIVTy->getScalarSizeInBits());
- Type *VecIVTy = nullptr;
- Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
- if (!FirstLaneOnly && State.VF.isScalable()) {
- VecIVTy = VectorType::get(ScalarIVTy, State.VF);
- UnitStepVec =
- Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
- SplatStep = Builder.CreateVectorSplat(State.VF, Step);
- SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
- }
-
- unsigned StartPart = 0;
- unsigned EndPart = State.UF;
- unsigned StartLane = 0;
- unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
- if (State.Instance) {
- StartPart = State.Instance->Part;
- EndPart = StartPart + 1;
- StartLane = State.Instance->Lane.getKnownLane();
- EndLane = StartLane + 1;
- }
- for (unsigned Part = StartPart; Part < EndPart; ++Part) {
- Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
-
- if (!FirstLaneOnly && State.VF.isScalable()) {
- auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
- auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
- if (ScalarIVTy->isFloatingPointTy())
- InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
- auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
- auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
- State.set(Def, Add, Part);
- // It's useful to record the lane values too for the known minimum number
- // of elements so we do those below. This improves the code quality when
- // trying to extract the first element, for example.
- }
-
- if (ScalarIVTy->isFloatingPointTy())
- StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
-
- for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
- Value *StartIdx = Builder.CreateBinOp(
- AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
- // The step returned by `createStepForVF` is a runtime-evaluated value
- // when VF is scalable. Otherwise, it should be folded into a Constant.
- assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
- "Expected StartIdx to be folded to a constant when VF is not "
- "scalable");
- auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
- auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
- State.set(Def, Add, VPIteration(Part, Lane));
- }
- }
-}
-
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
-static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
- Value *StartValue, Value *Step,
- const InductionDescriptor &ID) {
+static Value *
+emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
+ Value *Step,
+ InductionDescriptor::InductionKind InductionKind,
+ const BinaryOperator *InductionBinOp) {
Type *StepTy = Step->getType();
Value *CastedIndex = StepTy->isIntegerTy()
? B.CreateSExtOrTrunc(Index, StepTy)
@@ -2446,7 +2350,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
return B.CreateMul(X, Y);
};
- switch (ID.getKind()) {
+ switch (InductionKind) {
case InductionDescriptor::IK_IntInduction: {
assert(!isa<VectorType>(Index->getType()) &&
"Vector indices not supported for integer inductions yet");
@@ -2464,7 +2368,6 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
assert(!isa<VectorType>(Index->getType()) &&
"Vector indices not supported for FP inductions yet");
assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
- auto InductionBinOp = ID.getInductionBinOp();
assert(InductionBinOp &&
(InductionBinOp->getOpcode() == Instruction::FAdd ||
InductionBinOp->getOpcode() == Instruction::FSub) &&
@@ -2524,17 +2427,6 @@ static bool isIndvarOverflowCheckKnownFalse(
return false;
}
-void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
- const VPIteration &Instance,
- VPTransformState &State) {
- Value *ScalarInst = State.get(Def, Instance);
- Value *VectorValue = State.get(Def, Instance.Part);
- VectorValue = Builder.CreateInsertElement(
- VectorValue, ScalarInst,
- Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
- State.set(Def, VectorValue, Instance.Part);
-}
-
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
@@ -2612,7 +2504,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
for (unsigned Part = 0; Part < UF; Part++) {
Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
- State.setDebugLocFromInst(AddrPart);
+ if (auto *I = dyn_cast<Instruction>(AddrPart))
+ State.setDebugLocFrom(I->getDebugLoc());
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2630,14 +2523,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
InBounds = gep->isInBounds();
AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
-
- // Cast to the vector pointer type.
- unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
- Type *PtrTy = VecTy->getPointerTo(AddressSpace);
- AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
+ AddrParts.push_back(AddrPart);
}
- State.setDebugLocFromInst(Instr);
+ State.setDebugLocFrom(Instr->getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
@@ -2835,13 +2724,20 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
+ if (!IsVoidRetTy) {
Cloned->setName(Instr->getName() + ".cloned");
+#if !defined(NDEBUG)
+ // Verify that VPlan type inference results agree with the type of the
+ // generated values.
+ assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
+ "inferred type and type from generated instructions do not match");
+#endif
+ }
RepRecipe->setFlags(Cloned);
- if (Instr->getDebugLoc())
- State.setDebugLocFromInst(Instr);
+ if (auto DL = Instr->getDebugLoc())
+ State.setDebugLocFrom(DL);
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
@@ -3019,9 +2915,11 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// dominator of the exit blocks.
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
- ReplaceInstWithInst(
- TCCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+ if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+ setBranchWeights(BI, MinItersBypassWeights);
+ ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);
}
@@ -3151,15 +3049,17 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
- EndValue =
- emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
+ EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
+ Step, II.getKind(), II.getInductionBinOp());
EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
if (AdditionalBypass.first) {
- B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
- EndValueFromAdditionalBypass = emitTransformedIndex(
- B, AdditionalBypass.second, II.getStartValue(), Step, II);
+ B.SetInsertPoint(AdditionalBypass.first,
+ AdditionalBypass.first->getFirstInsertionPt());
+ EndValueFromAdditionalBypass =
+ emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
+ Step, II.getKind(), II.getInductionBinOp());
EndValueFromAdditionalBypass->setName("ind.end");
}
}
@@ -3240,16 +3140,25 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
// 3) Otherwise, construct a runtime check.
if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
!Cost->foldTailByMasking()) {
- Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
- Count, VectorTripCount, "cmp.n",
- LoopMiddleBlock->getTerminator());
-
// Here we use the same DebugLoc as the scalar loop latch terminator instead
// of the corresponding compare because they may have ended up with
// different line numbers and we want to avoid awkward line stepping while
// debugging. Eg. if the compare has got a line number inside the loop.
- CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
- cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
+ // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
+ // operands. Perform simplification directly on VPlan once the branch is
+ // modeled there.
+ IRBuilder<> B(LoopMiddleBlock->getTerminator());
+ B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
+ Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
+ BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
+ BI.setCondition(CmpN);
+ if (hasBranchWeightMD(*ScalarLatchTerm)) {
+ // Assume that `Count % VectorTripCount` is equally distributed.
+ unsigned TripCount = UF * VF.getKnownMinValue();
+ assert(TripCount > 0 && "trip count should not be zero");
+ const uint32_t Weights[] = {1, TripCount - 1};
+ setBranchWeights(BI, Weights);
+ }
}
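To make the new weights concrete: TripCount here is the number of lanes consumed per vector iteration (UF * VF), so under the stated assumption that Count % VectorTripCount is uniformly distributed, the "no scalar remainder" outcome of cmp.n is expected about once in TripCount executions. For VF = 4 and UF = 2 that yields TripCount = 8 and weights {1, 7}.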
#ifdef EXPENSIVE_CHECKS
@@ -3373,7 +3282,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
: State.get(StepVPV, {0, 0});
Value *Escape =
- emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
+ emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
+ II.getKind(), II.getInductionBinOp());
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
}
@@ -3445,76 +3355,33 @@ static void cse(BasicBlock *BB) {
}
}
-InstructionCost LoopVectorizationCostModel::getVectorCallCost(
- CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const {
- Function *F = CI->getCalledFunction();
- Type *ScalarRetTy = CI->getType();
- SmallVector<Type *, 4> Tys, ScalarTys;
- bool MaskRequired = Legal->isMaskRequired(CI);
- for (auto &ArgOp : CI->args())
- ScalarTys.push_back(ArgOp->getType());
+InstructionCost
+LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+ ElementCount VF) const {
+ // We only need to calculate a cost if the VF is scalar; for actual vectors
+ // we should already have a pre-calculated cost at each VF.
+ if (!VF.isScalar())
+ return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
- // Estimate cost of scalarized vector call. The source operands are assumed
- // to be vectors, so we need to extract individual elements from there,
- // execute VF scalar calls, and then gather the result into the vector return
- // value.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost ScalarCallCost =
- TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
- if (VF.isScalar())
- return ScalarCallCost;
-
- // Compute corresponding vector type for return value and arguments.
- Type *RetTy = ToVectorTy(ScalarRetTy, VF);
- for (Type *ScalarTy : ScalarTys)
- Tys.push_back(ToVectorTy(ScalarTy, VF));
-
- // Compute costs of unpacking argument values for the scalar calls and
- // packing the return values to a vector.
- InstructionCost ScalarizationCost =
- getScalarizationOverhead(CI, VF, CostKind);
+ Type *RetTy = CI->getType();
+ if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+ return *RedCost;
- InstructionCost Cost =
- ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
- // If we can't emit a vector call for this function, then the currently found
- // cost is the cost we need to return.
- InstructionCost MaskCost = 0;
- VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
- if (NeedsMask)
- *NeedsMask = MaskRequired;
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- // If we want an unmasked vector function but can't find one matching the VF,
- // maybe we can find vector function that does use a mask and synthesize
- // an all-true mask.
- if (!VecFunc && !MaskRequired) {
- Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
- VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- // If we found one, add in the cost of creating a mask
- if (VecFunc) {
- if (NeedsMask)
- *NeedsMask = true;
- MaskCost = TTI.getShuffleCost(
- TargetTransformInfo::SK_Broadcast,
- VectorType::get(
- IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
- VF));
- }
- }
+ SmallVector<Type *, 4> Tys;
+ for (auto &ArgOp : CI->args())
+ Tys.push_back(ArgOp->getType());
- // We don't support masked function calls yet, but we can scalarize a
- // masked call with branches (unless VF is scalable).
- if (!TLI || CI->isNoBuiltin() || !VecFunc)
- return VF.isScalable() ? InstructionCost::getInvalid() : Cost;
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
- // If the corresponding vector cost is cheaper, return its cost.
- InstructionCost VectorCallCost =
- TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
- if (VectorCallCost < Cost) {
- *Variant = VecFunc;
- Cost = VectorCallCost;
+ // If this is an intrinsic we may have a lower cost for it.
+ if (getVectorIntrinsicIDForCall(CI, TLI)) {
+ InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+ return std::min(ScalarCallCost, IntrinsicCost);
}
- return Cost;
+ return ScalarCallCost;
}
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
@@ -3558,146 +3425,8 @@ static Type *largestIntegerVectorType(Type *T1, Type *T2) {
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
-void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
- // For every instruction `I` in MinBWs, truncate the operands, create a
- // truncated version of `I` and reextend its result. InstCombine runs
- // later and will remove any ext/trunc pairs.
- SmallPtrSet<Value *, 4> Erased;
- for (const auto &KV : Cost->getMinimalBitwidths()) {
- // If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from State indicates that it
- // wasn't vectorized.
- // FIXME: Should not rely on getVPValue at this point.
- VPValue *Def = State.Plan->getVPValue(KV.first, true);
- if (!State.hasAnyVectorValue(Def))
- continue;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = State.get(Def, Part);
- if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
- continue;
- Type *OriginalTy = I->getType();
- Type *ScalarTruncatedTy =
- IntegerType::get(OriginalTy->getContext(), KV.second);
- auto *TruncatedTy = VectorType::get(
- ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
- if (TruncatedTy == OriginalTy)
- continue;
-
- IRBuilder<> B(cast<Instruction>(I));
- auto ShrinkOperand = [&](Value *V) -> Value * {
- if (auto *ZI = dyn_cast<ZExtInst>(V))
- if (ZI->getSrcTy() == TruncatedTy)
- return ZI->getOperand(0);
- return B.CreateZExtOrTrunc(V, TruncatedTy);
- };
-
- // The actual instruction modification depends on the instruction type,
- // unfortunately.
- Value *NewI = nullptr;
- if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
- ShrinkOperand(BO->getOperand(1)));
-
- // Any wrapping introduced by shrinking this operation shouldn't be
- // considered undefined behavior. So, we can't unconditionally copy
- // arithmetic wrapping flags to NewI.
- cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
- } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
- NewI =
- B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
- ShrinkOperand(CI->getOperand(1)));
- } else if (auto *SI = dyn_cast<SelectInst>(I)) {
- NewI = B.CreateSelect(SI->getCondition(),
- ShrinkOperand(SI->getTrueValue()),
- ShrinkOperand(SI->getFalseValue()));
- } else if (auto *CI = dyn_cast<CastInst>(I)) {
- switch (CI->getOpcode()) {
- default:
- llvm_unreachable("Unhandled cast!");
- case Instruction::Trunc:
- NewI = ShrinkOperand(CI->getOperand(0));
- break;
- case Instruction::SExt:
- NewI = B.CreateSExtOrTrunc(
- CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy, TruncatedTy));
- break;
- case Instruction::ZExt:
- NewI = B.CreateZExtOrTrunc(
- CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy, TruncatedTy));
- break;
- }
- } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
- auto Elements0 =
- cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
- auto *O0 = B.CreateZExtOrTrunc(
- SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
- auto Elements1 =
- cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
- auto *O1 = B.CreateZExtOrTrunc(
- SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
-
- NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
- } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
- // Don't do anything with the operands, just extend the result.
- continue;
- } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- auto Elements =
- cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
- auto *O0 = B.CreateZExtOrTrunc(
- IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
- auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
- NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
- } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- auto Elements =
- cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
- auto *O0 = B.CreateZExtOrTrunc(
- EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
- NewI = B.CreateExtractElement(O0, EE->getOperand(2));
- } else {
- // If we don't know what to do, be conservative and don't do anything.
- continue;
- }
-
- // Lastly, extend the result.
- NewI->takeName(cast<Instruction>(I));
- Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
- I->replaceAllUsesWith(Res);
- cast<Instruction>(I)->eraseFromParent();
- Erased.insert(I);
- State.reset(Def, Res, Part);
- }
- }
-
- // We'll have created a bunch of ZExts that are now parentless. Clean up.
- for (const auto &KV : Cost->getMinimalBitwidths()) {
- // If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from State indicates that it
- // wasn't vectorized.
- // FIXME: Should not rely on getVPValue at this point.
- VPValue *Def = State.Plan->getVPValue(KV.first, true);
- if (!State.hasAnyVectorValue(Def))
- continue;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = State.get(Def, Part);
- ZExtInst *Inst = dyn_cast<ZExtInst>(I);
- if (Inst && Inst->use_empty()) {
- Value *NewI = Inst->getOperand(0);
- Inst->eraseFromParent();
- State.reset(Def, NewI, Part);
- }
- }
- }
-}
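For context on what the deleted helper did (its job is presumably handled during VPlan transformation in this version rather than patched in after codegen): each over-wide vector operation listed in MinBWs was rewritten as truncate-operands / narrow-op / extend-result, relying on InstCombine to fold away redundant ext/trunc pairs. A minimal IRBuilder sketch of that pattern, with B, Ctx, A0 and A1 assumed to be an in-scope builder, context and the original <4 x i32> operands (illustrative only):

    // Shrink a 32-bit vector add down to 8 bits and widen the result back.
    Type *NarrowTy =
        VectorType::get(Type::getInt8Ty(Ctx), ElementCount::getFixed(4));
    Value *N0 = B.CreateTrunc(A0, NarrowTy);
    Value *N1 = B.CreateTrunc(A1, NarrowTy);
    // Wrapping flags are intentionally not copied; shrinking may wrap.
    Value *NarrowAdd = B.CreateAdd(N0, N1);
    Value *Widened = B.CreateZExt(NarrowAdd, A0->getType());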
-
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPlan &Plan) {
- // Insert truncates and extends for any truncated instructions as hints to
- // InstCombine.
- if (VF.isVector())
- truncateToMinimalBitwidths(State);
-
// Fix widened non-induction PHIs by setting up the PHI operands.
if (EnableVPlanNativePath)
fixNonInductionPHIs(Plan, State);
@@ -3710,6 +3439,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
+ PSE.getSE()->forgetBlockAndLoopDispositions();
// After vectorization, the exit blocks of the original loop will have
// additional predecessors. Invalidate SCEVs for the exit phis in case SE
@@ -3718,7 +3448,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
OrigLoop->getExitBlocks(ExitBlocks);
for (BasicBlock *Exit : ExitBlocks)
for (PHINode &PN : Exit->phis())
- PSE.getSE()->forgetValue(&PN);
+ PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
@@ -3744,7 +3474,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
// in the exit block, so update the builder.
- State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
+ State.Builder.SetInsertPoint(State.CFG.ExitBB,
+ State.CFG.ExitBB->getFirstNonPHIIt());
for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);
@@ -3782,40 +3513,10 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
VPBasicBlock *Header =
State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
- // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
- // sank outside of the loop would keep the same order as they had in the
- // original loop.
- SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
for (VPRecipeBase &R : Header->phis()) {
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
- ReductionPHIList.emplace_back(ReductionPhi);
+ fixReduction(ReductionPhi, State);
}
- stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1,
- const VPReductionPHIRecipe *R2) {
- auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
- auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
-
- // If neither of the recipes has an intermediate store, keep the order the
- // same.
- if (!IS1 && !IS2)
- return false;
-
- // If only one of the recipes has an intermediate store, then move it
- // towards the beginning of the list.
- if (IS1 && !IS2)
- return true;
-
- if (!IS1 && IS2)
- return false;
-
- // If both recipes have an intermediate store, then the recipe with the
- // later store should be processed earlier. So it should go to the beginning
- // of the list.
- return DT->dominates(IS2, IS1);
- });
-
- for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList)
- fixReduction(ReductionPhi, State);
for (VPRecipeBase &R : Header->phis()) {
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
@@ -3929,7 +3630,7 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
}
// Fix the initial value of the original recurrence in the scalar loop.
- Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
@@ -3953,90 +3654,56 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
RecurKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- State.setDebugLocFromInst(ReductionStartValue);
+ if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
+ State.setDebugLocFrom(I->getDebugLoc());
VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
- // This is the vector-clone of the value that leaves the loop.
- Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
// instructions.
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ Builder.SetInsertPoint(LoopMiddleBlock,
+ LoopMiddleBlock->getFirstInsertionPt());
- State.setDebugLocFromInst(LoopExitInst);
+ State.setDebugLocFrom(LoopExitInst->getDebugLoc());
Type *PhiTy = OrigPhi->getType();
-
- VPBasicBlock *LatchVPBB =
- PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
- BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former. For an inloop reduction the reduction will already
// be predicated, and does not need to be handled here.
if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
- SelectInst *Sel = nullptr;
- for (User *U : VecLoopExitInst->users()) {
- if (isa<SelectInst>(U)) {
- assert(!Sel && "Reduction exit feeding two selects");
- Sel = cast<SelectInst>(U);
- } else
- assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
- }
- assert(Sel && "Reduction exit feeds no select");
- State.reset(LoopExitInstDef, Sel, Part);
-
- if (isa<FPMathOperator>(Sel))
- Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
-
- // If the target can create a predicated operator for the reduction at no
- // extra cost in the loop (for example a predicated vadd), it can be
- // cheaper for the select to remain in the loop than be sunk out of it,
- // and so use the select value for the phi instead of the old
- // LoopExitValue.
- if (PreferPredicatedReductionSelect ||
- TTI->preferPredicatedReductionSelect(
- RdxDesc.getOpcode(), PhiTy,
- TargetTransformInfo::ReductionFlags())) {
- auto *VecRdxPhi =
- cast<PHINode>(State.get(PhiR, Part));
- VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
+ VPValue *Def = nullptr;
+ for (VPUser *U : LoopExitInstDef->users()) {
+ auto *S = dyn_cast<VPInstruction>(U);
+ if (S && S->getOpcode() == Instruction::Select) {
+ Def = S;
+ break;
}
}
+ if (Def)
+ LoopExitInstDef = Def;
}
+ VectorParts RdxParts(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ RdxParts[Part] = State.get(LoopExitInstDef, Part);
+
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
- assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
+ Builder.SetInsertPoint(LoopMiddleBlock,
+ LoopMiddleBlock->getFirstInsertionPt());
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
- VectorParts RdxParts(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = State.get(LoopExitInstDef, Part);
- Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
- : Builder.CreateZExt(Trunc, VecTy);
- for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
- if (U != Trunc) {
- U->replaceUsesOfWith(RdxParts[Part], Extnd);
- RdxParts[Part] = Extnd;
- }
- }
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
for (unsigned Part = 0; Part < UF; ++Part) {
RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- State.reset(LoopExitInstDef, RdxParts[Part], Part);
}
}
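Worked example of the narrowing above: if MinBWs determined that an i32 reduction only needs 8 bits, each unrolled part is truncated to a <VF x i8> vector here, the parts are combined and reduced in i8, and the scalar result is sign- or zero-extended back to i32 further below (see the PhiTy != RdxDesc.getRecurrenceType() extension after the target reduction).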
// Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
+ Value *ReducedPartRdx = RdxParts[0];
unsigned Op = RecurrenceDescriptor::getOpcode(RK);
// The middle block terminator has already been assigned a DebugLoc here (the
@@ -4046,21 +3713,21 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// conditional branch, and (c) other passes may add new predecessors which
// terminate on this line. This is the easiest way to ensure we don't
// accidentally cause an extra step back into the loop while debugging.
- State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
+ State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
if (PhiR->isOrdered())
- ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
+ ReducedPartRdx = RdxParts[UF - 1];
else {
// Floating-point operations should have some FMF to enable the reduction.
IRBuilderBase::FastMathFlagGuard FMFG(Builder);
Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
for (unsigned Part = 1; Part < UF; ++Part) {
- Value *RdxPart = State.get(LoopExitInstDef, Part);
- if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ Value *RdxPart = RdxParts[Part];
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
ReducedPartRdx = Builder.CreateBinOp(
(Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
- } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
- ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
- ReducedPartRdx, RdxPart);
+ else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
+ ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+ ReducedPartRdx, RdxPart);
else
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
}
@@ -4070,7 +3737,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// target reduction in the loop using a Reduction recipe.
if (VF.isVector() && !PhiR->isInLoop()) {
ReducedPartRdx =
- createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
+ createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (PhiTy != RdxDesc.getRecurrenceType())
@@ -4107,7 +3774,8 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
// inside the loop, create the final store here.
if (StoreInst *SI = RdxDesc.IntermediateStore) {
StoreInst *NewSI =
- Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
+ Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
+ SI->getAlign());
propagateMetadata(NewSI, SI);
// If the reduction value is used in other places,
@@ -4436,7 +4104,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
default:
return true;
case Instruction::Call:
- return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+ if (VF.isScalar())
+ return true;
+ return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
+ .Kind == CM_Scalarize;
case Instruction::Load:
case Instruction::Store: {
auto *Ptr = getLoadStorePointerOperand(I);
@@ -4988,7 +4659,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
}
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
- unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
+ unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5076,12 +4747,12 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
FixedScalableVFPair Result(ElementCount::getFixed(1),
ElementCount::getScalable(0));
if (auto MaxVF =
- getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
+ getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
MaxSafeFixedVF, FoldTailByMasking))
Result.FixedVF = MaxVF;
if (auto MaxVF =
- getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
+ getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
MaxSafeScalableVF, FoldTailByMasking))
if (MaxVF.isScalable()) {
Result.ScalableVF = MaxVF;
@@ -5105,6 +4776,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
if (TC == 1) {
reportVectorizationFailure("Single iteration (non) loop",
@@ -5115,7 +4787,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return computeFeasibleMaxVF(TC, UserVF, false);
+ return computeFeasibleMaxVF(MaxTC, UserVF, false);
case CM_ScalarEpilogueNotAllowedUsePredicate:
[[fallthrough]];
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -5153,7 +4825,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return computeFeasibleMaxVF(TC, UserVF, false);
+ return computeFeasibleMaxVF(MaxTC, UserVF, false);
}
return FixedScalableVFPair::getNone();
}
@@ -5170,7 +4842,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
+ FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we choose.
@@ -5246,7 +4918,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
- unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
+ unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
ElementCount MaxSafeVF, bool FoldTailByMasking) {
bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
const TypeSize WidestRegister = TTI.getRegisterBitWidth(
@@ -5285,31 +4957,35 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
}
// When a scalar epilogue is required, at least one iteration of the scalar
- // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a
+ // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
// max VF that results in a dead vector loop.
- if (ConstTripCount > 0 && requiresScalarEpilogue(true))
- ConstTripCount -= 1;
-
- if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
- (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
- // If loop trip count (TC) is known at compile time there is no point in
- // choosing VF greater than TC (as done in the loop below). Select maximum
- // power of two which doesn't exceed TC.
- // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
- // when the TC is less than or equal to the known number of lanes.
- auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount);
+ if (MaxTripCount > 0 && requiresScalarEpilogue(true))
+ MaxTripCount -= 1;
+
+ if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
+ (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
+ // If upper bound loop trip count (TC) is known at compile time there is no
+ // point in choosing VF greater than TC (as done in the loop below). Select
+ // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
+ // scalable, we only fall back on a fixed VF when the TC is less than or
+ // equal to the known number of lanes.
+ auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
"exceeding the constant trip count: "
- << ClampedConstTripCount << "\n");
- return ElementCount::getFixed(ClampedConstTripCount);
+ << ClampedUpperTripCount << "\n");
+ return ElementCount::get(
+ ClampedUpperTripCount,
+ FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
}
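A hypothetical illustration of why the parameter changed from the exact constant trip count to the maximum trip count: consider a loop like the following (n, a and b are arbitrary)

    // Trip count is not a compile-time constant, but ScalarEvolution can
    // bound it: n % 8 is always < 8, so at most 7 iterations run.
    for (unsigned i = 0; i < n % 8; ++i)
      a[i] += b[i];

Assuming SCEV reports a maximum trip count of 7 and the target's widest register holds at least 8 lanes of the element type, the clamp above now fires and limits the fixed VF to bit_floor(7) = 4, whereas previously only loops with an exactly known constant trip count were clamped.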
TargetTransformInfo::RegisterKind RegKind =
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;
- if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
- TTI.shouldMaximizeVectorBandwidth(RegKind))) {
+ if (MaximizeBandwidth ||
+ (MaximizeBandwidth.getNumOccurrences() == 0 &&
+ (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+ (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
ComputeScalableMaxVF);
@@ -5981,7 +5657,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
HasReductions &&
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
const RecurrenceDescriptor &RdxDesc = Reduction.second;
- return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
+ return RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind());
});
if (HasSelectCmpReductions) {
@@ -6149,6 +5825,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
if (ValuesToIgnore.count(I))
continue;
+ collectInLoopReductions();
+
// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
// Count the number of registers used, per register class, given all open
@@ -6668,10 +6346,11 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
- Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+ Instruction *I, ElementCount VF, Type *Ty,
+ TTI::TargetCostKind CostKind) const {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
- if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
+ if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
return std::nullopt;
auto *VectorTy = cast<VectorType>(Ty);
@@ -6706,10 +6385,10 @@ LoopVectorizationCostModel::getReductionPatternCost(
// Find the reduction this chain is a part of and calculate the basic cost of
// the reduction on its own.
- Instruction *LastChain = InLoopReductionImmediateChains[RetI];
+ Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
Instruction *ReductionPhi = LastChain;
while (!isa<PHINode>(ReductionPhi))
- ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
+ ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
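
The loop above simply follows the immediate-chain map backwards until it reaches the reduction phi. A rough sketch with plain containers, assuming a std::map stands in for InLoopReductionImmediateChains and string keys stand in for instructions (all names hypothetical):

#include <map>
#include <string>

// Maps each in-loop reduction operation to the previous link in its chain;
// the chain terminates at the reduction phi.
using ChainMap = std::map<std::string, std::string>;

std::string findReductionPhi(const ChainMap &Chain, std::string Op) {
  // Walk backwards until the phi is reached (here: a name starting with
  // "phi.", purely for illustration). .at() mirrors the checked lookup above.
  while (Op.rfind("phi.", 0) != 0)
    Op = Chain.at(Op);
  return Op;
}

int main() {
  ChainMap Chain = {{"fadd.2", "fadd.1"}, {"fadd.1", "phi.rdx"}};
  return findReductionPhi(Chain, "fadd.2") == "phi.rdx" ? 0 : 1;
}
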
@@ -7127,6 +6806,168 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}
}
+void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
+ assert(!VF.isScalar() &&
+ "Trying to set a vectorization decision for a scalar VF");
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+
+ if (!CI)
+ continue;
+
+ InstructionCost ScalarCost = InstructionCost::getInvalid();
+ InstructionCost VectorCost = InstructionCost::getInvalid();
+ InstructionCost IntrinsicCost = InstructionCost::getInvalid();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ Function *ScalarFunc = CI->getCalledFunction();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ bool MaskRequired = Legal->isMaskRequired(CI);
+ for (auto &ArgOp : CI->args())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (Type *ScalarTy : ScalarTys)
+ Tys.push_back(ToVectorTy(ScalarTy, VF));
+
+ // An in-loop reduction using an fmuladd intrinsic is a special case;
+ // we don't want the normal cost for that intrinsic.
+ if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+ setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
+ getVectorIntrinsicIDForCall(CI, TLI),
+ std::nullopt, *RedCost);
+ continue;
+ }
+
+ // Estimate cost of scalarized vector call. The source operands are
+ // assumed to be vectors, so we need to extract individual elements from
+ // there, execute VF scalar calls, and then gather the result into the
+ // vector return value.
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ InstructionCost ScalarizationCost =
+ getScalarizationOverhead(CI, VF, CostKind);
+
+ ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
+
+ // Find the cost of vectorizing the call, if we can find a suitable
+ // vector variant of the function.
+ bool UsesMask = false;
+ VFInfo FuncInfo;
+ Function *VecFunc = nullptr;
+ // Search through any available variants for one we can use at this VF.
+ for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
+ // Must match requested VF.
+ if (Info.Shape.VF != VF)
+ continue;
+
+ // Must take a mask argument if one is required
+ if (MaskRequired && !Info.isMasked())
+ continue;
+
+ // Check that all parameter kinds are supported
+ bool ParamsOk = true;
+ for (VFParameter Param : Info.Shape.Parameters) {
+ switch (Param.ParamKind) {
+ case VFParamKind::Vector:
+ break;
+ case VFParamKind::OMP_Uniform: {
+ Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
+ // Make sure the scalar parameter in the loop is invariant.
+ if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
+ TheLoop))
+ ParamsOk = false;
+ break;
+ }
+ case VFParamKind::OMP_Linear: {
+ Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
+ // Find the stride for the scalar parameter in this loop and see if
+ // it matches the stride for the variant.
+ // TODO: do we need to figure out the cost of an extract to get the
+ // first lane? Or do we hope that it will be folded away?
+ ScalarEvolution *SE = PSE.getSE();
+ const auto *SAR =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
+
+ if (!SAR || SAR->getLoop() != TheLoop) {
+ ParamsOk = false;
+ break;
+ }
+
+ const SCEVConstant *Step =
+ dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
+
+ if (!Step ||
+ Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
+ ParamsOk = false;
+
+ break;
+ }
+ case VFParamKind::GlobalPredicate:
+ UsesMask = true;
+ break;
+ default:
+ ParamsOk = false;
+ break;
+ }
+ }
+
+ if (!ParamsOk)
+ continue;
+
+ // Found a suitable candidate, stop here.
+ VecFunc = CI->getModule()->getFunction(Info.VectorName);
+ FuncInfo = Info;
+ break;
+ }
+
+ // Add in the cost of synthesizing a mask if one wasn't required.
+ InstructionCost MaskCost = 0;
+ if (VecFunc && UsesMask && !MaskRequired)
+ MaskCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(IntegerType::getInt1Ty(
+ VecFunc->getFunctionType()->getContext()),
+ VF));
+
+ if (TLI && VecFunc && !CI->isNoBuiltin())
+ VectorCost =
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
+
+ // Find the cost of an intrinsic; some targets may have instructions that
+ // perform the operation without needing an actual call.
+ Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (IID != Intrinsic::not_intrinsic)
+ IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+
+ InstructionCost Cost = ScalarCost;
+ InstWidening Decision = CM_Scalarize;
+
+ if (VectorCost <= Cost) {
+ Cost = VectorCost;
+ Decision = CM_VectorCall;
+ }
+
+ if (IntrinsicCost <= Cost) {
+ Cost = IntrinsicCost;
+ Decision = CM_IntrinsicCall;
+ }
+
+ setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
+ FuncInfo.getParamIndexForOptionalMask(), Cost);
+ }
+ }
+}
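
The final decision above is a three-way minimum with scalarization as the default; an invalid cost can never win because it compares greater than any valid cost. A minimal sketch of that selection logic in plain C++ (std::optional plays the role of InstructionCost::getInvalid(); the enum and function names are illustrative):

#include <optional>

enum class CallWidening { Scalarize, VectorCall, IntrinsicCall };

CallWidening pickCallWidening(unsigned ScalarCost,
                              std::optional<unsigned> VectorCost,
                              std::optional<unsigned> IntrinsicCost) {
  unsigned Best = ScalarCost;
  CallWidening Decision = CallWidening::Scalarize;
  if (VectorCost && *VectorCost <= Best) {
    Best = *VectorCost;
    Decision = CallWidening::VectorCall;
  }
  // Checked last, so on a tie the intrinsic is preferred over the vector
  // call, matching the order of the comparisons above.
  if (IntrinsicCost && *IntrinsicCost <= Best)
    Decision = CallWidening::IntrinsicCall;
  return Decision;
}
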
+
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy) {
@@ -7156,7 +6997,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
// because the VF is either 1, or any instructions that need scalarizing
- // have already been dealt with by the the time we get here. As a result,
+ // have already been dealt with by the time we get here. As a result,
// it means we don't have to multiply the instruction cost by VF.
assert(I->getOpcode() == Instruction::GetElementPtr ||
I->getOpcode() == Instruction::PHI ||
@@ -7384,6 +7225,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI::CastContextHint::Reversed;
case LoopVectorizationCostModel::CM_Unknown:
llvm_unreachable("Instr did not go through cost modelling?");
+ case LoopVectorizationCostModel::CM_VectorCall:
+ case LoopVectorizationCostModel::CM_IntrinsicCall:
+ llvm_unreachable_internal("Instr has invalid widening decision");
}
llvm_unreachable("Unhandled case!");
@@ -7441,19 +7285,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
- case Instruction::Call: {
- if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
- return *RedCost;
- Function *Variant;
- CallInst *CI = cast<CallInst>(I);
- InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant);
- if (getVectorIntrinsicIDForCall(CI, TLI)) {
- InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
- return std::min(CallCost, IntrinsicCost);
- }
- return CallCost;
- }
+ case Instruction::Call:
+ return getVectorCallCost(cast<CallInst>(I), VF);
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
case Instruction::Alloca:
@@ -7521,8 +7354,9 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
SmallVector<Instruction *, 4> ReductionOperations =
RdxDesc.getReductionOpChain(Phi, TheLoop);
bool InLoop = !ReductionOperations.empty();
+
if (InLoop) {
- InLoopReductionChains[Phi] = ReductionOperations;
+ InLoopReductions.insert(Phi);
// Add the elements to InLoopReductionImmediateChains for cost modelling.
Instruction *LastChain = Phi;
for (auto *I : ReductionOperations) {
@@ -7535,21 +7369,38 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
}
}
+VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
+ DebugLoc DL, const Twine &Name) {
+ assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
+ Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
+ return tryInsertInstruction(
+ new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
+}
+
+// This function will select a scalable VF if the target supports scalable
+// vectors and a fixed one otherwise.
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
-static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
- LoopVectorizationCostModel &CM) {
+static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
+ LoopVectorizationCostModel &CM) {
unsigned WidestType;
std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
- return WidestVectorRegBits / WidestType;
+
+ TargetTransformInfo::RegisterKind RegKind =
+ TTI.enableScalableVectorization()
+ ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector;
+
+ TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
+ unsigned N = RegSize.getKnownMinValue() / WidestType;
+ return ElementCount::get(N, RegSize.isScalable());
}
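
The computation above amounts to dividing the smallest guaranteed register width by the widest element type and tagging the result as scalable when the scalable register kind was chosen. A standalone sketch, with hypothetical names:

#include <cassert>
#include <utility>

// Returns {min number of lanes, is scalable}. E.g. a scalable 128-bit
// register with a widest element of 32 bits gives {4, true}, i.e. vscale x 4.
std::pair<unsigned, bool> sketchDetermineVPlanVF(unsigned RegBitWidthKnownMin,
                                                 bool RegIsScalable,
                                                 unsigned WidestTypeBits) {
  assert(WidestTypeBits > 0 && "widest type must be known");
  unsigned Lanes = RegBitWidthKnownMin / WidestTypeBits;
  return {Lanes, RegIsScalable};
}
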
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
- assert(!UserVF.isScalable() && "scalable vectors not yet supported");
ElementCount VF = UserVF;
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
@@ -7559,10 +7410,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
// If the user doesn't provide a vectorization factor, determine a
// reasonable one.
if (UserVF.isZero()) {
- VF = ElementCount::getFixed(determineVPlanVF(
- TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
- .getFixedValue(),
- CM));
+ VF = determineVPlanVF(TTI, CM);
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
// Make sure we have a VF > 1 for stress testing.
@@ -7571,6 +7419,17 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
<< "overriding computed VF.\n");
VF = ElementCount::getFixed(4);
}
+ } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
+ !ForceTargetSupportsScalableVectors) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
+ << "not supported by the target.\n");
+ reportVectorizationFailure(
+ "Scalable vectorization requested but not supported by the target",
+ "the scalable user-specified vectorization width for outer-loop "
+ "vectorization cannot be used because the target does not support "
+ "scalable vectors.",
+ "ScalableVFUnfeasible", ORE, OrigLoop);
+ return VectorizationFactor::Disabled();
}
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
assert(isPowerOf2_32(VF.getKnownMinValue()) &&
@@ -7624,9 +7483,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
+ CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);
if (!hasPlanWithVF(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
@@ -7650,6 +7509,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
VFCandidates.insert(VF);
+ CM.collectInLoopReductions();
for (const auto &VF : VFCandidates) {
// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);
@@ -7660,7 +7520,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectInstsToScalarize(VF);
}
- CM.collectInLoopReductions();
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
@@ -7705,7 +7564,7 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
if (MD) {
const auto *S = dyn_cast<MDString>(MD->getOperand(0));
IsUnrollMetadata =
- S && S->getString().startswith("llvm.loop.unroll.disable");
+ S && S->getString().starts_with("llvm.loop.unroll.disable");
}
MDs.push_back(LoopID->getOperand(i));
}
@@ -7729,7 +7588,7 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
SCEV2ValueTy LoopVectorizationPlanner::executePlan(
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
- DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
+ const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
assert(BestVPlan.hasVF(BestVF) &&
"Trying to execute plan with unsupported VF");
assert(BestVPlan.hasUF(BestUF) &&
@@ -7745,7 +7604,8 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
// Perform the actual loop transformation.
- VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
+ VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
+ OrigLoop->getHeader()->getContext());
// 0. Generate SCEV-dependent code into the preheader, including TripCount,
// before making any changes to the CFG.
@@ -7798,9 +7658,9 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
- BestVPlan.prepareToExecute(
- ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State, IsEpilogueVectorization);
+ BestVPlan.prepareToExecute(ILV.getTripCount(),
+ ILV.getOrCreateVectorTripCount(nullptr),
+ CanonicalIVStartValue, State);
BestVPlan.execute(&State);
@@ -7964,9 +7824,11 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
EPI.TripCount = Count;
}
- ReplaceInstWithInst(
- TCCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+ if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+ setBranchWeights(BI, MinItersBypassWeights);
+ ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
return TCCheckBlock;
}
@@ -8064,8 +7926,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// Generate a resume induction for the vector epilogue and put it in the
// vector epilogue preheader
Type *IdxTy = Legal->getWidestInductionType();
- PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
- LoopVectorPreHeader->getFirstNonPHI());
+ PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
+ EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
EPI.MainLoopIterationCountCheck);
@@ -8110,9 +7972,22 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
EPI.EpilogueVF, EPI.EpilogueUF),
"min.epilog.iters.check");
- ReplaceInstWithInst(
- Insert->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ BranchInst &BI =
+ *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+ if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ unsigned MainLoopStep = UF * VF.getKnownMinValue();
+ unsigned EpilogueLoopStep =
+ EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
+ // We assume the remaining `Count` is equally distributed in
+ // [0, MainLoopStep), so the probability for `Count < EpilogueLoopStep`
+ // should be min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
+ unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
+ const uint32_t Weights[] = {EstimatedSkipCount,
+ MainLoopStep - EstimatedSkipCount};
+ setBranchWeights(BI, Weights);
+ }
+ ReplaceInstWithInst(Insert->getTerminator(), &BI);
LoopBypassBlocks.push_back(Insert);
return Insert;
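
The weights above follow from assuming the remaining count is uniform over [0, MainLoopStep), so the chance of skipping the epilogue vector loop is min(MainLoopStep, EpilogueLoopStep) / MainLoopStep. A worked standalone sketch (plain C++, illustrative function name):

#include <algorithm>
#include <array>
#include <cstdio>

// Returns {skip-weight, enter-weight} for the min.epilog.iters.check branch.
std::array<unsigned, 2> epilogueBypassWeights(unsigned MainVF, unsigned MainUF,
                                              unsigned EpiVF, unsigned EpiUF) {
  unsigned MainLoopStep = MainUF * MainVF;
  unsigned EpilogueLoopStep = EpiUF * EpiVF;
  unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
  return {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount};
}

int main() {
  // Main loop: VF=4, UF=2 (step 8); epilogue: VF=2, UF=1 (step 2).
  // Remaining count in [0,8) is below 2 with probability 2/8, so weights {2,6}.
  auto W = epilogueBypassWeights(4, 2, 2, 1);
  std::printf("%u:%u\n", W[0], W[1]);
}
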
@@ -8206,6 +8081,33 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
return EdgeMaskCache[Edge] = EdgeMask;
}
+void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
+ BasicBlock *Header = OrigLoop->getHeader();
+
+ // When not folding the tail, use nullptr to model all-true mask.
+ if (!CM.foldTailByMasking()) {
+ BlockMaskCache[Header] = nullptr;
+ return;
+ }
+
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+ // constructing the desired canonical IV in the header block as its first
+ // non-phi instructions.
+
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+ HeaderVPBB->insert(IV, NewInsertionPoint);
+
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+ VPValue *BlockMask = nullptr;
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+ BlockMaskCache[Header] = BlockMask;
+}
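
The comparison against the backedge-taken count (BTC) is used because TC = BTC + 1 can wrap to 0 when the loop runs for the entire range of the IV type, while BTC itself cannot wrap. A minimal sketch with an 8-bit counter (values and types purely illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  // A loop that executes 256 times with an 8-bit induction variable:
  // BTC = 255 fits in i8, but TC = BTC + 1 wraps around to 0.
  uint8_t BTC = 255;
  uint8_t TC = static_cast<uint8_t>(BTC + 1); // wraps to 0

  uint8_t IV = 0;
  // "IV <= BTC" holds for every valid iteration...
  std::printf("iv <= btc: %d\n", IV <= BTC); // 1
  // ...whereas "IV < TC" is never true once TC has wrapped to 0.
  std::printf("iv <  tc: %d\n", IV < TC);    // 0
}
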
+
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
@@ -8214,45 +8116,12 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
if (BCEntryIt != BlockMaskCache.end())
return BCEntryIt->second;
+ assert(OrigLoop->getHeader() != BB &&
+ "Loop header must have cached block mask");
+
// All-one mask is modelled as no-mask following the convention for masked
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;
-
- if (OrigLoop->getHeader() == BB) {
- if (!CM.blockNeedsPredicationForAnyReason(BB))
- return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
-
- assert(CM.foldTailByMasking() && "must fold the tail");
-
- // If we're using the active lane mask for control flow, then we get the
- // mask from the active lane mask PHI that is cached in the VPlan.
- TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
- if (useActiveLaneMaskForControlFlow(TFStyle))
- return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi();
-
- // Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
- // constructing the desired canonical IV in the header block as its first
- // non-phi instructions.
-
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
- HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
-
- VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- if (useActiveLaneMask(TFStyle)) {
- VPValue *TC = Plan.getTripCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
- nullptr, "active.lane.mask");
- } else {
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
- }
- return BlockMaskCache[BB] = BlockMask;
- }
-
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
@@ -8458,22 +8327,15 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
bool ShouldUseVectorIntrinsic =
ID && LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) -> bool {
- Function *Variant;
- // Is it beneficial to perform intrinsic call compared to lib
- // call?
- InstructionCost CallCost =
- CM.getVectorCallCost(CI, VF, &Variant);
- InstructionCost IntrinsicCost =
- CM.getVectorIntrinsicCost(CI, VF);
- return IntrinsicCost <= CallCost;
+ return CM.getCallWideningDecision(CI, VF).Kind ==
+ LoopVectorizationCostModel::CM_IntrinsicCall;
},
Range);
if (ShouldUseVectorIntrinsic)
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
Function *Variant = nullptr;
- ElementCount VariantVF;
- bool NeedsMask = false;
+ std::optional<unsigned> MaskPos;
// Is it better to call a vectorized version of the function than to
// scalarize the call?
auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8492,16 +8354,19 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// finds a valid variant.
if (Variant)
return false;
- CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
- // If we found a valid vector variant at this VF, then store the VF
- // in case we need to generate a mask.
- if (Variant)
- VariantVF = VF;
- return Variant != nullptr;
+ LoopVectorizationCostModel::CallWideningDecision Decision =
+ CM.getCallWideningDecision(CI, VF);
+ if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
+ Variant = Decision.Variant;
+ MaskPos = Decision.MaskPos;
+ return true;
+ }
+
+ return false;
},
Range);
if (ShouldUseVectorCall) {
- if (NeedsMask) {
+ if (MaskPos.has_value()) {
// We have 2 cases that would require a mask:
// 1) The block needs to be predicated, either due to a conditional
// in the scalar loop or use of an active lane mask with
@@ -8516,17 +8381,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
- VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true);
- unsigned MaskPos = 0;
-
- for (const VFInfo &Info : VFDatabase::getMappings(*CI))
- if (Info.Shape == Shape) {
- assert(Info.isMasked() && "Vector function info shape mismatch");
- MaskPos = Info.getParamIndexForOptionalMask().value();
- break;
- }
-
- Ops.insert(Ops.begin() + MaskPos, Mask);
+ Ops.insert(Ops.begin() + *MaskPos, Mask);
}
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
@@ -8747,8 +8602,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
}
if (auto *CI = dyn_cast<CastInst>(Instr)) {
- return toVPRecipeResult(
- new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
+ return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
+ CI->getType(), *CI));
}
return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
@@ -8758,27 +8613,26 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
- // Add assume instructions we need to drop to DeadInstructions, to prevent
- // them from being added to the VPlan.
- // TODO: We only need to drop assumes in blocks that get flattend. If the
- // control flow is preserved, we should keep them.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- auto &ConditionalAssumes = Legal->getConditionalAssumes();
- DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
-
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
VFRange SubRange = {VF, MaxVFTimes2};
- if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions))
- VPlans.push_back(std::move(*Plan));
+ if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
+ // Now optimize the initial VPlan.
+ if (!Plan->hasVF(ElementCount::getFixed(1)))
+ VPlanTransforms::truncateToMinimalBitwidths(
+ *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
+ VPlanTransforms::optimize(*Plan, *PSE.getSE());
+ assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
+ VPlans.push_back(std::move(Plan));
+ }
VF = SubRange.End;
}
}
// Add the necessary canonical IV and branch recipes required to control the
// loop.
-static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
- TailFoldingStyle Style) {
+static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
+ DebugLoc DL) {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
@@ -8790,102 +8644,24 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
// Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
// IV by VF * UF.
- bool HasNUW = Style == TailFoldingStyle::None;
auto *CanonicalIVIncrement =
- new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
- : VPInstruction::CanonicalIVIncrement,
- {CanonicalIVPHI}, DL, "index.next");
+ new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
+ {HasNUW, false}, DL, "index.next");
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
- if (useActiveLaneMaskForControlFlow(Style)) {
- // Create the active lane mask instruction in the vplan preheader.
- VPBasicBlock *VecPreheader =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
-
- // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
- // we have to take unrolling into account. Each part needs to start at
- // Part * VF
- auto *CanonicalIVIncrementParts =
- new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
- : VPInstruction::CanonicalIVIncrementForPart,
- {StartV}, DL, "index.part.next");
- VecPreheader->appendRecipe(CanonicalIVIncrementParts);
-
- // Create the ActiveLaneMask instruction using the correct start values.
- VPValue *TC = Plan.getTripCount();
-
- VPValue *TripCount, *IncrementValue;
- if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
- // When avoiding a runtime check, the active.lane.mask inside the loop
- // uses a modified trip count and the induction variable increment is
- // done after the active.lane.mask intrinsic is called.
- auto *TCMinusVF =
- new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL);
- VecPreheader->appendRecipe(TCMinusVF);
- IncrementValue = CanonicalIVPHI;
- TripCount = TCMinusVF;
- } else {
- // When the loop is guarded by a runtime overflow check for the loop
- // induction variable increment by VF, we can increment the value before
- // the get.active.lane mask and use the unmodified tripcount.
- EB->appendRecipe(CanonicalIVIncrement);
- IncrementValue = CanonicalIVIncrement;
- TripCount = TC;
- }
-
- auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
- {CanonicalIVIncrementParts, TC}, DL,
- "active.lane.mask.entry");
- VecPreheader->appendRecipe(EntryALM);
-
- // Now create the ActiveLaneMaskPhi recipe in the main loop using the
- // preheader ActiveLaneMask instruction.
- auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
- Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
-
- // Create the active lane mask for the next iteration of the loop.
- CanonicalIVIncrementParts =
- new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
- : VPInstruction::CanonicalIVIncrementForPart,
- {IncrementValue}, DL);
- EB->appendRecipe(CanonicalIVIncrementParts);
-
- auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
- {CanonicalIVIncrementParts, TripCount}, DL,
- "active.lane.mask.next");
- EB->appendRecipe(ALM);
- LaneMaskPhi->addOperand(ALM);
-
- if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
- // Do the increment of the canonical IV after the active.lane.mask, because
- // that value is still based off %CanonicalIVPHI
- EB->appendRecipe(CanonicalIVIncrement);
- }
-
- // We have to invert the mask here because a true condition means jumping
- // to the exit block.
- auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
- EB->appendRecipe(NotMask);
-
- VPInstruction *BranchBack =
- new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
- EB->appendRecipe(BranchBack);
- } else {
- EB->appendRecipe(CanonicalIVIncrement);
+ EB->appendRecipe(CanonicalIVIncrement);
- // Add the BranchOnCount VPInstruction to the latch.
- VPInstruction *BranchBack = new VPInstruction(
- VPInstruction::BranchOnCount,
- {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
- EB->appendRecipe(BranchBack);
- }
+ // Add the BranchOnCount VPInstruction to the latch.
+ VPInstruction *BranchBack =
+ new VPInstruction(VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ EB->appendRecipe(BranchBack);
}
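
Conceptually, the two recipes appended above describe nothing more than a counting loop: a phi starting at 0, an increment by VF * UF, and an exit branch once the vector trip count is reached. A rough scalar analogue, for intuition only (the real recipes operate on VPlan values, and the names below are made up):

// Sketch of the control flow the canonical IV recipes describe.
void canonicalIVSkeleton(unsigned VectorTripCount, unsigned VF, unsigned UF) {
  unsigned Index = 0;                 // canonical IV phi, starts at 0
  do {
    // ... widened loop body for lanes [Index, Index + VF * UF) ...
    Index += VF * UF;                 // "index.next" (nuw when not folding the tail)
  } while (Index != VectorTripCount); // BranchOnCount: exit once the count is reached
}
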
// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
// original exit block.
-static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
- VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
+static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
VPlan &Plan) {
BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
@@ -8902,8 +8678,8 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
}
}
-std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
- VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+VPlanPtr
+LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8914,24 +8690,6 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// process after constructing the initial VPlan.
// ---------------------------------------------------------------------------
- for (const auto &Reduction : CM.getInLoopReductionChains()) {
- PHINode *Phi = Reduction.first;
- RecurKind Kind =
- Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
- const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
-
- RecipeBuilder.recordRecipeOf(Phi);
- for (const auto &R : ReductionOperations) {
- RecipeBuilder.recordRecipeOf(R);
- // For min/max reductions, where we have a pair of icmp/select, we also
- // need to record the ICmp recipe, so it can be removed later.
- assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
- "Only min/max recurrences allowed for inloop reductions");
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
- RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
- }
- }
-
// For each interleave group which is relevant for this (possibly trimmed)
// Range, add it to the set of groups to be later applied to the VPlan and add
// placeholders for its members' Recipes which we'll be replacing with a
@@ -8972,23 +8730,27 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
- auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
- VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry());
- VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
- VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
+ Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
+ Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
// Don't use getDecisionAndClampRange here, because we don't know the UF,
// so it is better for this function to be conservative rather than to
// split it up into different VPlans.
+ // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
bool IVUpdateMayOverflow = false;
for (ElementCount VF : Range)
IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
- Instruction *DLInst =
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
- DLInst ? DLInst->getDebugLoc() : DebugLoc(),
- CM.getTailFoldingStyle(IVUpdateMayOverflow));
+ DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+ TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+ // When not folding the tail, we know that the induction increment will not
+ // overflow.
+ bool HasNUW = Style == TailFoldingStyle::None;
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+
+ // Proactively create header mask. Masks for other blocks are created on
+ // demand.
+ RecipeBuilder.createHeaderMask(*Plan);
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
@@ -9005,14 +8767,8 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
- for (Instruction &I : BB->instructionsWithoutDebug(false)) {
+ for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
Instruction *Instr = &I;
-
- // First filter out irrelevant instructions, to ensure no recipes are
- // built for them.
- if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
- continue;
-
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
@@ -9052,11 +8808,18 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
}
RecipeBuilder.setRecipe(Instr, Recipe);
- if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
- HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
- // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the
- // phi section of HeaderVPBB.
- assert(isa<TruncInst>(Instr));
+ if (isa<VPHeaderPHIRecipe>(Recipe)) {
+ // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
+ // the following cases, VPHeaderPHIRecipes may be created after non-phi
+ // recipes and need to be moved to the phi section of HeaderVPBB:
+ // * tail-folding (non-phi recipes computing the header mask are
+ // introduced earlier than regular header phi recipes, and should appear
+ // after them)
+ // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
+
+ assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
+ CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
+ "unexpected recipe needs moving");
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
} else
VPBB->appendRecipe(Recipe);
@@ -9074,7 +8837,7 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
} else
- addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
+ addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -9088,8 +8851,7 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// ---------------------------------------------------------------------------
// Adjust the recipes for any inloop reductions.
- adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
- RecipeBuilder, Range.Start);
+ adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
@@ -9150,21 +8912,18 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Sink users of fixed-order recurrence past the recipe defining the previous
// value and introduce FirstOrderRecurrenceSplice VPInstructions.
if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
- return std::nullopt;
-
- VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
- VPlanTransforms::removeRedundantInductionCasts(*Plan);
-
- VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
- VPlanTransforms::removeDeadRecipes(*Plan);
-
- VPlanTransforms::createAndOptimizeReplicateRegions(*Plan);
-
- VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
- VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
+ return nullptr;
- assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
- return std::make_optional(std::move(Plan));
+ if (useActiveLaneMask(Style)) {
+ // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
+ // TailFoldingStyle is visible there.
+ bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
+ bool WithoutRuntimeCheck =
+ Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+ VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
+ WithoutRuntimeCheck);
+ }
+ return Plan;
}
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
@@ -9198,8 +8957,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
Term->eraseFromParent();
- addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
- CM.getTailFoldingStyle());
+ // Tail folding is not supported for outer loops, so the induction increment
+ // is guaranteed to not wrap.
+ bool HasNUW = true;
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
+ DebugLoc());
return Plan;
}
@@ -9211,105 +8973,211 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF) {
- for (const auto &Reduction : CM.getInLoopReductionChains()) {
- PHINode *Phi = Reduction.first;
- const RecurrenceDescriptor &RdxDesc =
- Legal->getReductionVars().find(Phi)->second;
- const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
-
- if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
+ VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+ // Gather all VPReductionPHIRecipes and sort them so that intermediate
+ // stores sunk outside of the loop keep the same order as they had in the
+ // original loop.
+ SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
+ for (VPRecipeBase &R : Header->phis()) {
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
+ ReductionPHIList.emplace_back(ReductionPhi);
+ }
+ bool HasIntermediateStore = false;
+ stable_sort(ReductionPHIList,
+ [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
+ const VPReductionPHIRecipe *R2) {
+ auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
+ auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
+ HasIntermediateStore |= IS1 || IS2;
+
+ // If neither of the recipes has an intermediate store, keep the
+ // order the same.
+ if (!IS1 && !IS2)
+ return false;
+
+ // If only one of the recipes has an intermediate store, then
+ // move it towards the beginning of the list.
+ if (IS1 && !IS2)
+ return true;
+
+ if (!IS1 && IS2)
+ return false;
+
+ // If both recipes have an intermediate store, then the recipe
+ // with the later store should be processed earlier. So it
+ // should go to the beginning of the list.
+ return DT->dominates(IS2, IS1);
+ });
+
+ if (HasIntermediateStore && ReductionPHIList.size() > 1)
+ for (VPRecipeBase *R : ReductionPHIList)
+ R->moveBefore(*Header, Header->getFirstNonPhi());
+
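
The ordering enforced above puts phis with intermediate stores at the front and, among those, processes the phi whose store appears later in the original loop first; phis without stores keep their relative order. A standalone sketch of that comparator using positions instead of dominance queries (types and names hypothetical):

#include <algorithm>
#include <optional>
#include <vector>

// Illustrative stand-in for a reduction phi: StoreOrder mimics the position
// of the intermediate store in the original loop, nullopt = no store.
struct RedPhi { std::optional<int> StoreOrder; };

void sortForIntermediateStores(std::vector<RedPhi> &Phis) {
  std::stable_sort(Phis.begin(), Phis.end(),
                   [](const RedPhi &A, const RedPhi &B) {
                     if (!A.StoreOrder && !B.StoreOrder)
                       return false; // neither has a store: keep the order
                     if (A.StoreOrder && !B.StoreOrder)
                       return true;  // stores sort towards the beginning
                     if (!A.StoreOrder && B.StoreOrder)
                       return false;
                     // Both have stores: the later store is processed earlier.
                     return *A.StoreOrder > *B.StoreOrder;
                   });
}
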
+ SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
+ for (VPRecipeBase &R : Header->phis()) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
continue;
+ InLoopReductionPhis.push_back(PhiR);
+ }
+
+ for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) {
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+ assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
+ "AnyOf reductions are not allowed for in-loop reductions");
+
+ // Collect the chain of "link" recipes for the reduction starting at PhiR.
+ SetVector<VPRecipeBase *> Worklist;
+ Worklist.insert(PhiR);
+ for (unsigned I = 0; I != Worklist.size(); ++I) {
+ VPRecipeBase *Cur = Worklist[I];
+ for (VPUser *U : Cur->getVPSingleValue()->users()) {
+ auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
+ if (!UserRecipe)
+ continue;
+ assert(UserRecipe->getNumDefinedValues() == 1 &&
+ "recipes must define exactly one result value");
+ Worklist.insert(UserRecipe);
+ }
+ }
+
+ // Visit operation "Links" along the reduction chain top-down starting from
+ // the phi until LoopExitValue. We keep track of the previous item
+ // (PreviousLink) to tell which of the two operands of a Link will remain
+ // scalar and which will be reduced. For minmax by select(cmp), Link will be
+ // the select instructions.
+ VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0].
+ for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) {
+ VPValue *PreviousLinkV = PreviousLink->getVPSingleValue();
+
+ Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
- // ReductionOperations are orders top-down from the phi's use to the
- // LoopExitValue. We keep a track of the previous item (the Chain) to tell
- // which of the two operands will remain scalar and which will be reduced.
- // For minmax the chain will be the select instructions.
- Instruction *Chain = Phi;
- for (Instruction *R : ReductionOperations) {
- VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
- RecurKind Kind = RdxDesc.getRecurrenceKind();
-
- VPValue *ChainOp = Plan->getVPValue(Chain);
- unsigned FirstOpId;
- assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
- "Only min/max recurrences allowed for inloop reductions");
+ // Index of the first operand which holds a non-mask vector operand.
+ unsigned IndexOfFirstOperand;
// Recognize a call to the llvm.fmuladd intrinsic.
bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
- assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
- "Expected instruction to be a call to the llvm.fmuladd intrinsic");
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
- assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
- "Expected to replace a VPWidenSelectSC");
- FirstOpId = 1;
+ VPValue *VecOp;
+ VPBasicBlock *LinkVPBB = CurrentLink->getParent();
+ if (IsFMulAdd) {
+ assert(
+ RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
+ "Expected instruction to be a call to the llvm.fmuladd intrinsic");
+ assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
+ isa<VPWidenCallRecipe>(CurrentLink)) &&
+ CurrentLink->getOperand(2) == PreviousLinkV &&
+ "expected a call where the previous link is the added operand");
+
+ // If the instruction is a call to the llvm.fmuladd intrinsic then we
+ // need to create an fmul recipe (multiplying the first two operands of
+ // the fmuladd together) to use as the vector operand for the fadd
+ // reduction.
+ VPInstruction *FMulRecipe = new VPInstruction(
+ Instruction::FMul,
+ {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
+ CurrentLinkI->getFastMathFlags());
+ LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
+ VecOp = FMulRecipe;
} else {
- assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
- (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
- "Expected to replace a VPWidenSC");
- FirstOpId = 0;
+ if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
+ if (isa<VPWidenRecipe>(CurrentLink)) {
+ assert(isa<CmpInst>(CurrentLinkI) &&
+ "need to have the compare of the select");
+ continue;
+ }
+ assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
+ "must be a select recipe");
+ IndexOfFirstOperand = 1;
+ } else {
+ assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
+ "Expected to replace a VPWidenSC");
+ IndexOfFirstOperand = 0;
+ }
+ // Note that for non-commutable operands (cmp-selects), the semantics of
+ // the cmp-select are captured in the recurrence kind.
+ unsigned VecOpId =
+ CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV
+ ? IndexOfFirstOperand + 1
+ : IndexOfFirstOperand;
+ VecOp = CurrentLink->getOperand(VecOpId);
+ assert(VecOp != PreviousLinkV &&
+ CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
+ (VecOpId - IndexOfFirstOperand)) ==
+ PreviousLinkV &&
+ "PreviousLinkV must be the operand other than VecOp");
}
- unsigned VecOpId =
- R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
- VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
+ BasicBlock *BB = CurrentLinkI->getParent();
VPValue *CondOp = nullptr;
- if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
+ if (CM.blockNeedsPredicationForAnyReason(BB)) {
VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(WidenRecipe->getParent(),
- WidenRecipe->getIterator());
- CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan);
+ Builder.setInsertPoint(CurrentLink);
+ CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
}
- if (IsFMulAdd) {
- // If the instruction is a call to the llvm.fmuladd intrinsic then we
- // need to create an fmul recipe to use as the vector operand for the
- // fadd reduction.
- VPInstruction *FMulRecipe = new VPInstruction(
- Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
- FMulRecipe->setFastMathFlags(R->getFastMathFlags());
- WidenRecipe->getParent()->insert(FMulRecipe,
- WidenRecipe->getIterator());
- VecOp = FMulRecipe;
- }
- VPReductionRecipe *RedRecipe =
- new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI);
- WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
- Plan->removeVPValueFor(R);
- Plan->addVPValue(R, RedRecipe);
+ VPReductionRecipe *RedRecipe = new VPReductionRecipe(
+ RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp);
// Append the recipe to the end of the VPBasicBlock because we need to
// ensure that it comes after all of its inputs, including CondOp.
- WidenRecipe->getParent()->appendRecipe(RedRecipe);
- WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
- WidenRecipe->eraseFromParent();
-
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
- VPRecipeBase *CompareRecipe =
- RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
- assert(isa<VPWidenRecipe>(CompareRecipe) &&
- "Expected to replace a VPWidenSC");
- assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
- "Expected no remaining users");
- CompareRecipe->eraseFromParent();
- }
- Chain = R;
+ // Note that this transformation may leave over dead recipes (including
+ // CurrentLink), which will be cleaned by a later VPlan transform.
+ LinkVPBB->appendRecipe(RedRecipe);
+ CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
+ PreviousLink = RedRecipe;
}
}
-
- // If tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the beginning of the
- // dedicated latch block.
- if (CM.foldTailByMasking()) {
- Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
+ Builder.setInsertPoint(&*LatchVPBB->begin());
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
- if (!PhiR || PhiR->isInLoop())
- continue;
+ VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+ if (!PhiR || PhiR->isInLoop())
+ continue;
+
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+ auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
+ // If tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the beginning of the
+ // dedicated latch block.
+ if (CM.foldTailByMasking()) {
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
VPValue *Red = PhiR->getBackedgeValue();
assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
- Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
+ FastMathFlags FMFs = RdxDesc.getFastMathFlags();
+ Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
+ Result =
+ PhiTy->isFloatingPointTy()
+ ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
+ : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
+ Result->insertBefore(&*Builder.getInsertPoint());
+ Red->replaceUsesWithIf(
+ Result->getVPSingleValue(),
+ [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
+ if (PreferPredicatedReductionSelect ||
+ TTI.preferPredicatedReductionSelect(
+ PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
+ TargetTransformInfo::ReductionFlags()))
+ PhiR->setOperand(1, Result->getVPSingleValue());
+ }
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
+ if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+ assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
+ Type *RdxTy = RdxDesc.getRecurrenceType();
+ auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
+ Result->getVPSingleValue(), RdxTy);
+ auto *Extnd =
+ RdxDesc.isSigned()
+ ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
+ : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
+
+ Trunc->insertAfter(Result);
+ Extnd->insertAfter(Trunc);
+ Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
+ Trunc->setOperand(0, Result->getVPSingleValue());
}
}
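
The truncate-then-extend above is the usual narrow-reduction trick: when the recurrence type is narrower than the phi type, the loop-exit value is truncated to the recurrence type and immediately sign- or zero-extended back, so later passes can evaluate the whole expression in the narrow type. A scalar analogue, assuming an i32 phi with an i8 recurrence (illustrative only):

#include <cstdint>

// E.g. an i32 reduction phi whose recurrence type was proven to be i8:
// truncate the exit value to i8, then extend back to i32 (sext if the
// descriptor says the reduction is signed, zext otherwise).
int32_t narrowReductionExit(int32_t LoopExitValue, bool IsSigned) {
  if (IsSigned)
    return static_cast<int32_t>(static_cast<int8_t>(LoopExitValue));  // trunc + sext
  return static_cast<int32_t>(static_cast<uint8_t>(LoopExitValue));   // trunc + zext
}
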
@@ -9347,107 +9215,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Int or FP induction being replicated.");
-
- Value *Start = getStartValue()->getLiveInIRValue();
- const InductionDescriptor &ID = getInductionDescriptor();
- TruncInst *Trunc = getTruncInst();
- IRBuilderBase &Builder = State.Builder;
- assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
- assert(State.VF.isVector() && "must have vector VF");
-
- // The value from the original loop to which we are mapping the new induction
- // variable.
- Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
-
- // Fast-math-flags propagate from the original induction instruction.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
- Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
-
- // Now do the actual transformations, and start with fetching the step value.
- Value *Step = State.get(getStepValue(), VPIteration(0, 0));
-
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // Construct the initial value of the vector IV in the vector loop preheader
- auto CurrIP = Builder.saveIP();
- BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
- Builder.SetInsertPoint(VectorPH->getTerminator());
- if (isa<TruncInst>(EntryVal)) {
- assert(Start->getType()->isIntegerTy() &&
- "Truncation requires an integer type");
- auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = Builder.CreateTrunc(Step, TruncType);
- Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
- }
-
- Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
- Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart = getStepVector(
- SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
-
- // We create vector phi nodes for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (Step->getType()->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = ID.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Multiply the vectorization factor by the step using integer or
- // floating-point arithmetic as appropriate.
- Type *StepType = Step->getType();
- Value *RuntimeVF;
- if (Step->getType()->isFloatingPointTy())
- RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
- else
- RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
- Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
-
- // Create a vector splat to use in the induction update.
- //
- // FIXME: If the step is non-constant, we create the vector splat with
- // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
- // handle a constant vector splat.
- Value *SplatVF = isa<Constant>(Mul)
- ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(State.VF, Mul);
- Builder.restoreIP(CurrIP);
-
- // We may need to add the step a number of times, depending on the unroll
- // factor. The last of those goes into the PHI.
- PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
- &*State.CFG.PrevBB->getFirstInsertionPt());
- VecInd->setDebugLoc(EntryVal->getDebugLoc());
- Instruction *LastInduction = VecInd;
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- State.set(this, LastInduction, Part);
-
- if (isa<TruncInst>(EntryVal))
- State.addMetadata(LastInduction, EntryVal);
-
- LastInduction = cast<Instruction>(
- Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
- LastInduction->setDebugLoc(EntryVal->getDebugLoc());
- }
-
- LastInduction->setName("vec.ind.next");
- VecInd->addIncoming(SteppedStart, VectorPH);
- // Add induction update using an incorrect block temporarily. The phi node
- // will be fixed after VPlan execution. Note that at this point the latch
- // block cannot be used, as it does not exist yet.
- // TODO: Model increment value in VPlan, by turning the recipe into a
- // multi-def and a subclass of VPHeaderPHIRecipe.
- VecInd->addIncoming(LastInduction, VectorPH);
-}
-
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
@@ -9480,7 +9247,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
Value *SclrGep = emitTransformedIndex(
- State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
+ State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
+ IndDesc.getKind(), IndDesc.getInductionBinOp());
SclrGep->setName("next.gep");
State.set(this, SclrGep, VPIteration(Part, Lane));
}
@@ -9547,41 +9315,26 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
// Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
- if (IndDesc.getInductionBinOp() &&
- isa<FPMathOperator>(IndDesc.getInductionBinOp()))
- State.Builder.setFastMathFlags(
- IndDesc.getInductionBinOp()->getFastMathFlags());
+ if (FPBinOp)
+ State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
Value *Step = State.get(getStepValue(), VPIteration(0, 0));
Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
- Value *DerivedIV =
- emitTransformedIndex(State.Builder, CanonicalIV,
- getStartValue()->getLiveInIRValue(), Step, IndDesc);
+ Value *DerivedIV = emitTransformedIndex(
+ State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
+ Kind, cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName("offset.idx");
- if (ResultTy != DerivedIV->getType()) {
- assert(Step->getType()->isIntegerTy() &&
+ if (TruncResultTy) {
+ assert(TruncResultTy != DerivedIV->getType() &&
+ Step->getType()->isIntegerTy() &&
"Truncation requires an integer step");
- DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
+ DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
}
assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
State.set(this, DerivedIV, VPIteration(0, 0));
}
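
For context, a minimal sketch of what the emitTransformedIndex call above boils down to for a plain integer induction (DerivedIV = Start + CanonicalIV * Step), ignoring the overflow-flag and FP/pointer-induction handling of the real helper; the function name is made up for illustration:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Hypothetical helper: derive the transformed index for an integer
  // induction from the canonical IV, its start value and its step.
  static Value *derivedIntIV(IRBuilderBase &B, Value *CanonicalIV,
                             Value *Start, Value *Step) {
    Value *Offset = B.CreateMul(CanonicalIV, Step);
    return B.CreateAdd(Start, Offset, "offset.idx");
  }
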
-void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
- // Fast-math-flags propagate from the original induction instruction.
- IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
- if (IndDesc.getInductionBinOp() &&
- isa<FPMathOperator>(IndDesc.getInductionBinOp()))
- State.Builder.setFastMathFlags(
- IndDesc.getInductionBinOp()->getFastMathFlags());
-
- Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
- Value *Step = State.get(getStepValue(), VPIteration(0, 0));
-
- buildScalarSteps(BaseIV, Step, IndDesc, this, State);
-}
-
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
@@ -9592,48 +9345,51 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Reduction being replicated.");
Value *PrevInChain = State.get(getChainOp(), 0);
- RecurKind Kind = RdxDesc->getRecurrenceKind();
- bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
+ RecurKind Kind = RdxDesc.getRecurrenceKind();
+ bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
// Propagate the fast-math flags carried by the underlying instruction.
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
- State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
+ State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewVecOp = State.get(getVecOp(), Part);
if (VPValue *Cond = getCondOp()) {
- Value *NewCond = State.get(Cond, Part);
- VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
- Value *Iden = RdxDesc->getRecurrenceIdentity(
- Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
- Value *IdenVec =
- State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
- Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
+ Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
+ : State.get(Cond, {Part, 0});
+ VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
+ Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
+ Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
+ RdxDesc.getFastMathFlags());
+ if (State.VF.isVector()) {
+ Iden =
+ State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
+ }
+
+ Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
NewVecOp = Select;
}
Value *NewRed;
Value *NextInChain;
if (IsOrdered) {
if (State.VF.isVector())
- NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
+ NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
PrevInChain);
else
NewRed = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
+ (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
NewVecOp);
PrevInChain = NewRed;
} else {
PrevInChain = State.get(getChainOp(), Part);
- NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
+ NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
}
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
- NextInChain =
- createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
- NewRed, PrevInChain);
+ NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
+ NewRed, PrevInChain);
} else if (IsOrdered)
NextInChain = NewRed;
else
NextInChain = State.Builder.CreateBinOp(
- (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
- PrevInChain);
+ (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
State.set(this, NextInChain, Part);
}
}
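
The select-with-identity trick used for the conditional case above can be shown in isolation. A sketch for an integer add reduction, with made-up names and plain IRBuilder calls (the recipe itself goes through the RecurrenceDescriptor and createTargetReduction instead):

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Masked-off lanes are replaced by the reduction identity (0 for add), so
  // they do not change the reduced value; the result is then folded into the
  // running chain value.
  static Value *conditionalAddReduction(IRBuilderBase &B, Value *Cond,
                                        Value *VecOp, Value *Chain) {
    Value *Iden = Constant::getNullValue(VecOp->getType());
    Value *Selected = B.CreateSelect(Cond, VecOp, Iden);
    Value *Reduced = B.CreateAddReduce(Selected);
    return B.CreateAdd(Chain, Reduced);
  }
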
@@ -9652,7 +9408,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
VectorType::get(UI->getType(), State.VF));
State.set(this, Poison, State.Instance->Part);
}
- State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
+ State.packScalarIntoVectorValue(this, *State.Instance);
}
return;
}
@@ -9718,9 +9474,16 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
bool isMaskRequired = getMask();
- if (isMaskRequired)
- for (unsigned Part = 0; Part < State.UF; ++Part)
- BlockInMaskParts[Part] = State.get(getMask(), Part);
+ if (isMaskRequired) {
+ // Mask reversal is only needed for non-all-one (null) masks, as the
+ // reverse of an all-one (null) mask is again an all-one (null) mask.
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *Mask = State.get(getMask(), Part);
+ if (isReverse())
+ Mask = Builder.CreateVectorReverse(Mask, "reverse");
+ BlockInMaskParts[Part] = Mask;
+ }
+ }
const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
// Calculate the pointer for the specific unroll-part.
@@ -9731,7 +9494,8 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
const DataLayout &DL =
Builder.GetInsertBlock()->getModule()->getDataLayout();
Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
- ? DL.getIndexType(ScalarDataTy->getPointerTo())
+ ? DL.getIndexType(PointerType::getUnqual(
+ ScalarDataTy->getContext()))
: Builder.getInt32Ty();
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
@@ -9751,21 +9515,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
PartPtr =
Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- BlockInMaskParts[Part] =
- Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
} else {
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
}
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ return PartPtr;
};
// Handle Stores:
if (SI) {
- State.setDebugLocFromInst(SI);
+ State.setDebugLocFrom(SI->getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Instruction *NewSI = nullptr;
@@ -9798,7 +9558,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// Handle loads.
assert(LI && "Must have a load instruction");
- State.setDebugLocFromInst(LI);
+ State.setDebugLocFrom(LI->getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
if (CreateGatherScatter) {
@@ -9877,95 +9637,6 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
return CM_ScalarEpilogueAllowed;
}
-Value *VPTransformState::get(VPValue *Def, unsigned Part) {
- // If Values have been set for this Def return the one relevant for \p Part.
- if (hasVectorValue(Def, Part))
- return Data.PerPartOutput[Def][Part];
-
- auto GetBroadcastInstrs = [this, Def](Value *V) {
- bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
- if (VF.isScalar())
- return V;
- // Place the code for broadcasting invariant variables in the new preheader.
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (SafeToHoist) {
- BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
- Plan->getVectorLoopRegion()->getSinglePredecessor())];
- if (LoopVectorPreHeader)
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- }
-
- // Place the code for broadcasting invariant variables in the new preheader.
- // Broadcast the scalar into all locations in the vector.
- Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
-
- return Shuf;
- };
-
- if (!hasScalarValue(Def, {Part, 0})) {
- Value *IRV = Def->getLiveInIRValue();
- Value *B = GetBroadcastInstrs(IRV);
- set(Def, B, Part);
- return B;
- }
-
- Value *ScalarValue = get(Def, {Part, 0});
- // If we aren't vectorizing, we can just copy the scalar map values over
- // to the vector map.
- if (VF.isScalar()) {
- set(Def, ScalarValue, Part);
- return ScalarValue;
- }
-
- bool IsUniform = vputils::isUniformAfterVectorization(Def);
-
- unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
- // Check if there is a scalar value for the selected lane.
- if (!hasScalarValue(Def, {Part, LastLane})) {
- // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
- // VPExpandSCEVRecipes can also be uniform.
- assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
- isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
- isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
- "unexpected recipe found to be invariant");
- IsUniform = true;
- LastLane = 0;
- }
-
- auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
- // Set the insert point after the last scalarized instruction or after the
- // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
- // will directly follow the scalar definitions.
- auto OldIP = Builder.saveIP();
- auto NewIP =
- isa<PHINode>(LastInst)
- ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
- : std::next(BasicBlock::iterator(LastInst));
- Builder.SetInsertPoint(&*NewIP);
-
- // However, if we are vectorizing, we need to construct the vector values.
- // If the value is known to be uniform after vectorization, we can just
- // broadcast the scalar value corresponding to lane zero for each unroll
- // iteration. Otherwise, we construct the vector values using
- // insertelement instructions. Since the resulting vectors are stored in
- // State, we will only generate the insertelements once.
- Value *VectorValue = nullptr;
- if (IsUniform) {
- VectorValue = GetBroadcastInstrs(ScalarValue);
- set(Def, VectorValue, Part);
- } else {
- // Initialize packing with insertelements to start from undef.
- assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
- set(Def, Undef, Part);
- for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
- ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
- VectorValue = get(Def, Part);
- }
- Builder.restoreIP(OldIP);
- return VectorValue;
-}
-
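
The removed getter above materializes a vector value from scalars in one of two ways: broadcast lane 0 when the value is uniform after vectorization, otherwise pack the per-lane scalars with insertelement starting from poison. A reduced sketch of those two paths against plain IRBuilder, with hypothetical function names:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Uniform-after-vectorization: splat the lane-0 scalar into every lane.
  static Value *broadcastUniform(IRBuilderBase &B, ElementCount VF,
                                 Value *Scalar) {
    return B.CreateVectorSplat(VF, Scalar, "broadcast");
  }

  // Non-uniform (fixed VF only): start from poison and insert each lane.
  static Value *packScalars(IRBuilderBase &B, unsigned FixedVF,
                            ArrayRef<Value *> Scalars) {
    Value *Vec = PoisonValue::get(VectorType::get(
        Scalars[0]->getType(), ElementCount::getFixed(FixedVF)));
    for (unsigned Lane = 0; Lane < FixedVF; ++Lane)
      Vec = B.CreateInsertElement(Vec, Scalars[Lane], B.getInt64(Lane));
    return Vec;
  }
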
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -9994,7 +9665,8 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE);
+ LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
+ ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
@@ -10013,8 +9685,10 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
{
+ bool AddBranchWeights =
+ hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getParent()->getDataLayout());
+ F->getParent()->getDataLayout(), AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10022,6 +9696,8 @@ static bool processLoopInVPlanNativePath(
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}
+ reportVectorization(ORE, L, VF, 1);
+
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
@@ -10076,7 +9752,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF,
std::optional<unsigned> VScale, Loop *L,
- ScalarEvolution &SE) {
+ ScalarEvolution &SE,
+ ScalarEpilogueLowering SEL) {
InstructionCost CheckCost = Checks.getCost();
if (!CheckCost.isValid())
return false;
@@ -10146,11 +9823,13 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
double MinTC2 = RtC * 10 / ScalarC;
- // Now pick the larger minimum. If it is not a multiple of VF, choose the
- // next closest multiple of VF. This should partly compensate for ignoring
- // the epilogue cost.
+ // Now pick the larger minimum. If it is not a multiple of VF and a scalar
+ // epilogue is allowed, choose the next closest multiple of VF. This should
+ // partly compensate for ignoring the epilogue cost.
uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
- VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
+ if (SEL == CM_ScalarEpilogueAllowed)
+ MinTC = alignTo(MinTC, IntVF);
+ VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
LLVM_DEBUG(
dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
@@ -10270,7 +9949,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
else {
if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
LLVM_DEBUG(dbgs() << "\n");
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ // Predicated tail-folded loops are efficient even when the loop
+ // iteration count is low. However, setting the epilogue policy to
+ // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
+ // with runtime checks. It's more effective to let
+ // `areRuntimeChecksProfitable` determine if vectorization is beneficial
+ // for the loop.
+ if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
} else {
LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
"small to consider vectorizing.\n");
@@ -10334,7 +10020,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI);
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
+ LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
// Get user vectorization factor and interleave count.
@@ -10347,8 +10033,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
+ bool AddBranchWeights =
+ hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
- F->getParent()->getDataLayout());
+ F->getParent()->getDataLayout(), AddBranchWeights);
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
@@ -10365,7 +10053,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
!areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
- *PSE.getSE())) {
+ *PSE.getSE(), SEL)) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
@@ -10587,13 +10275,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
DisableRuntimeUnroll = true;
}
// Report the vectorization decision.
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
- L->getHeader())
- << "vectorized loop (vectorization width: "
- << NV("VectorizationFactor", VF.Width)
- << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
- });
+ reportVectorization(ORE, L, VF, IC);
}
if (ORE->allowExtraAnalysis(LV_NAME))
@@ -10676,8 +10358,14 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
Changed |= CFGChanged |= processLoop(L);
- if (Changed)
+ if (Changed) {
LAIs->clear();
+
+#ifndef NDEBUG
+ if (VerifySCEV)
+ SE->verify();
+#endif
+ }
}
// Process each loop nest in the function.
@@ -10725,10 +10413,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<ScalarEvolutionAnalysis>();
-
-#ifdef EXPENSIVE_CHECKS
- SE.verify();
-#endif
}
if (Result.MadeCFGChange) {