Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  831
1 file changed, 451 insertions(+), 380 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4747f34fcc62..d11f4146b590 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -470,10 +470,11 @@ public:
/// on, while the old loop will be used as the scalar remainder. Control flow
/// is generated around the vectorized (and scalar epilogue) loops consisting
/// of various checks and bypasses. Return the pre-header block of the new
- /// loop.
- /// In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
- virtual BasicBlock *createVectorizedLoopSkeleton();
+ /// loop and the start value for the canonical induction, if it is != 0. The
+ /// latter is the case when vectorizing the epilogue loop. In the case of
+ /// epilogue vectorization, this function is overridden to handle the more
+ /// complex control flow around the loops.
+ virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
/// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
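A minimal caller-side sketch of the new return type (mirroring how executePlan consumes it later in this patch; variable names are illustrative):

    BasicBlock *VectorPH;
    Value *CanonicalIVStart;
    std::tie(VectorPH, CanonicalIVStart) = ILV.createVectorizedLoopSkeleton();
    if (CanonicalIVStart) {
      // Non-null only when vectorizing the epilogue loop: the canonical IV
      // resumes from the main loop's vector trip count rather than from 0.
    }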
@@ -507,10 +508,10 @@ public:
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
- Value *Start, TruncInst *Trunc, VPValue *Def,
- VPTransformState &State);
+ /// the corresponding type. \p CanonicalIV is the scalar value generated for
+ /// the canonical induction variable.
+ void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
+ VPTransformState &State, Value *CanonicalIV);
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
@@ -556,6 +557,10 @@ public:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ // Returns the resume value (bc.merge.rdx) for a reduction as
+ // generated by fixReduction.
+ PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
+
protected:
friend class LoopVectorizationPlanner;
@@ -573,16 +578,18 @@ protected:
Value *CountRoundDown, Value *EndValue,
BasicBlock *MiddleBlock);
- /// Create a new induction variable inside L.
- PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
- Value *Step, Instruction *DL);
+ /// Introduce a conditional branch (on true, condition to be set later) at the
+ /// end of the header=latch connecting it to itself (across the backedge) and
+ /// to the exit block of \p L.
+ void createHeaderBranch(Loop *L);
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
- void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
+ void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
+ VPTransformState &State);
/// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
@@ -606,14 +613,6 @@ protected:
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State);
- /// This function adds
- /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
- /// to each vector element of Val. The sequence starts at StartIndex.
- /// \p Opcode is relevant for FP induction variable.
- virtual Value *
- getStepVector(Value *Val, Value *StartIdx, Value *Step,
- Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
-
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
@@ -640,9 +639,6 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// Generate a shuffle sequence that will reverse the vector Vec.
- virtual Value *reverseVector(Value *Vec);
-
/// Returns (and creates if needed) the original loop trip count.
Value *getOrCreateTripCount(Loop *NewLoop);
@@ -685,14 +681,13 @@ protected:
Loop *createVectorLoopSkeleton(StringRef Prefix);
/// Create new phi nodes for the induction variables to resume iteration count
- /// in the scalar epilogue, from where the vectorized loop left off (given by
- /// \p VectorTripCount).
+ /// in the scalar epilogue, from where the vectorized loop left off.
/// In cases where the loop skeleton is more complicated (eg. epilogue
/// vectorization) and the resume values can come from an additional bypass
/// block, the \p AdditionalBypass pair provides information about the bypass
/// block and the end value on the edge from bypass to this loop.
void createInductionResumeValues(
- Loop *L, Value *VectorTripCount,
+ Loop *L,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
/// Complete the loop skeleton by adding debug MDs, creating appropriate
@@ -795,12 +790,6 @@ protected:
/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
- /// The new Induction variable which was added to the new block.
- PHINode *Induction = nullptr;
-
- /// The induction variable of the old basic block.
- PHINode *OldInduction = nullptr;
-
/// Store instructions that were predicated.
SmallVector<Instruction *, 4> PredicatedInstructions;
@@ -838,6 +827,11 @@ protected:
/// Structure to hold information about generated runtime checks, responsible
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+
+ // Holds the resume values for reductions in the loops, used to set the
+ // correct start value of reduction PHIs when vectorizing the epilogue.
+ SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
+ ReductionResumeValues;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -856,10 +850,6 @@ public:
private:
Value *getBroadcastInstrs(Value *V) override;
- Value *getStepVector(
- Value *Val, Value *StartIdx, Value *Step,
- Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
- Value *reverseVector(Value *Vec) override;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -909,14 +899,16 @@ public:
// Override this function to handle the more complex control flow around the
// three loops.
- BasicBlock *createVectorizedLoopSkeleton() final override {
+ std::pair<BasicBlock *, Value *>
+ createVectorizedLoopSkeleton() final override {
return createEpilogueVectorizedLoopSkeleton();
}
/// The interface for creating a vectorized skeleton using one of two
/// different strategies, each corresponding to one execution of the vplan
/// as described above.
- virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+ virtual std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() = 0;
/// Holds and updates state information required to vectorize the main loop
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -944,7 +936,8 @@ public:
EPI, LVL, CM, BFI, PSI, Check) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() final override;
protected:
/// Emits an iteration count bypass check once for the main loop (when \p
@@ -973,7 +966,8 @@ public:
EPI, LVL, CM, BFI, PSI, Checks) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() final override;
protected:
/// Emits an iteration count bypass check after the main vector loop has
@@ -1069,16 +1063,16 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
+namespace llvm {
+
/// Return a value for Step multiplied by VF.
-static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
- int64_t Step) {
+Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
+ int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
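A usage sketch of the now-exported helper (InsertPt and the surrounding builder are assumed to exist at the call site):

    IRBuilder<> B(InsertPt);
    Type *I64 = B.getInt64Ty();
    // Fixed VF: folds to a constant step.
    Value *StepFixed = createStepForVF(B, I64, ElementCount::getFixed(4), 2);    // i64 8
    // Scalable VF: emits 8 * vscale via llvm.vscale.
    Value *StepScalable = createStepForVF(B, I64, ElementCount::getScalable(4), 2);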
-namespace llvm {
-
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
@@ -1163,7 +1157,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
// will lead to gather/scatter instructions, which don't need to be
// handled.
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
- isa<VPInterleaveRecipe>(CurRec))
+ isa<VPInterleaveRecipe>(CurRec) ||
+ isa<VPCanonicalIVPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
@@ -1232,6 +1227,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
}
}
+PHINode *InnerLoopVectorizer::getReductionResumeValue(
+ const RecurrenceDescriptor &RdxDesc) {
+ auto It = ReductionResumeValues.find(&RdxDesc);
+ assert(It != ReductionResumeValues.end() &&
+ "Expected to find a resume value for the reduction.");
+ return It->second;
+}
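A hypothetical caller sketch of the new accessor (MainILV, RdxDesc, EpilogueRedPhi and VecEpiloguePH are assumed names): during epilogue vectorization the second pass can pick up the bc.merge.rdx phi created while fixing the main loop's reductions and wire it in as the epilogue reduction's start value.

    PHINode *Resume = MainILV.getReductionResumeValue(RdxDesc);
    EpilogueRedPhi->setIncomingValueForBlock(VecEpiloguePH, Resume);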
+
namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -1556,13 +1559,16 @@ public:
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
+ bool isLegalGatherOrScatter(Value *V,
+ ElementCount VF = ElementCount::getFixed(1)) {
bool LI = isa<LoadInst>(V);
bool SI = isa<StoreInst>(V);
if (!LI && !SI)
return false;
auto *Ty = getLoadStoreType(V);
Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
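A usage sketch (CM and the memory instruction are assumed names): legality is now queried with the type actually used at the candidate VF rather than the scalar element type.

    ElementCount VF = ElementCount::getScalable(4);
    bool LegalAtVF = CM.isLegalGatherOrScatter(&MemInst, VF);
    // For an i32 access this asks TTI about <vscale x 4 x i32>; calls that
    // omit the VF argument keep the old scalar-type behaviour.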
@@ -1577,16 +1583,17 @@ public:
}
/// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- /// If a non-zero VF has been calculated, we check if I will be scalarized
- /// predication for that VF.
- bool isScalarWithPredication(Instruction *I) const;
+ /// predication when vectorizing \p I with vectorization factor \p VF. Such
+ /// instructions include conditional stores and instructions that may divide
+ /// by zero.
+ bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
+ // \p VF is the vectorization factor that will be used to vectorize \p I.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+ bool isPredicatedInst(Instruction *I, ElementCount VF,
+ bool IsKnownUniform = false) {
// When we know the load is uniform and the original scalar loop was not
// predicated we don't need to mark it as a predicated instruction. Any
// vectorised blocks created when tail-folding are something artificial we
@@ -1602,7 +1609,7 @@ public:
// instructions.
if (isa<LoadInst>(I) || isa<StoreInst>(I))
return Legal->isMaskRequired(I);
- return isScalarWithPredication(I);
+ return isScalarWithPredication(I, VF);
}
/// Returns true if \p I is a memory instruction with consecutive memory
@@ -1794,7 +1801,7 @@ private:
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
- bool useEmulatedMaskMemRefHack(Instruction *I);
+ bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
@@ -2078,8 +2085,8 @@ public:
/// Remove the created SCEV & memory runtime check blocks & instructions, if
/// unused.
~GeneratedRTChecks() {
- SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
- SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
+ SCEVExpanderCleaner SCEVCleaner(SCEVExp);
+ SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
if (!SCEVCheckCond)
SCEVCleaner.markResultUsed();
@@ -2335,6 +2342,60 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
+/// This function adds
+/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
+/// to each vector element of Val. The sequence starts at StartIndex.
+/// \p Opcode is relevant for FP induction variable.
+static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp, ElementCount VF,
+ IRBuilder<> &Builder) {
+ assert(VF.isVector() && "only vector VFs are supported");
+
+ // Create and check the types.
+ auto *ValVTy = cast<VectorType>(Val->getType());
+ ElementCount VLen = ValVTy->getElementCount();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ // Create a vector of consecutive numbers from zero to VF.
+ VectorType *InitVecValVTy = ValVTy;
+ Type *InitVecValSTy = STy;
+ if (STy->isFloatingPointTy()) {
+ InitVecValSTy =
+ IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
+ InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
+ }
+ Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
+
+ // Splat the StartIdx
+ Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
+
+ if (STy->isIntegerTy()) {
+ InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw
+ // flags, which can be found from the original scalar operations.
+ Step = Builder.CreateMul(InitVec, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+ InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
+ InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ Value *MulOp = Builder.CreateFMul(InitVec, Step);
+ return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+}
+
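A worked example of what the relocated helper produces for an integer induction (values illustrative):

    // Val      = <%start, %start, %start, %start>   ; splat of the start value
    // StartIdx = 0, Step = 2, VF = 4
    // step     = <0, 1, 2, 3>                       ; llvm.stepvector
    // offset   = <0, 2, 4, 6>                       ; step * splat(Step)
    // result   = <%start, %start+2, %start+4, %start+6>   ; named "induction"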
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
@@ -2355,8 +2416,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart =
- getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
+ Value *SteppedStart = getStepVector(
+ SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -2411,8 +2472,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// placement of all induction updates.
auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
- auto *ICmp = cast<Instruction>(Br->getCondition());
- LastInduction->moveBefore(ICmp);
+ LastInduction->moveBefore(Br);
LastInduction->setName("vec.ind.next");
VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
@@ -2434,15 +2494,15 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return llvm::any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
- const InductionDescriptor &ID,
- Value *Start, TruncInst *Trunc,
- VPValue *Def,
- VPTransformState &State) {
+void InnerLoopVectorizer::widenIntOrFpInduction(
+ PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
+ Value *CanonicalIV) {
+ Value *Start = Def->getStartValue()->getLiveInIRValue();
+ const InductionDescriptor &ID = Def->getInductionDescriptor();
+ TruncInst *Trunc = Def->getTruncInst();
IRBuilder<> &Builder = State.Builder;
- assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
- "Primary induction variable must have an integer type");
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(!State.VF.isZero() && "VF must be non-zero");
// The value from the original loop to which we are mapping the new induction
// variable.
@@ -2468,12 +2528,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// induction variable and step. Otherwise, derive these values from the
// induction descriptor.
auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = Induction;
- if (IV != OldInduction) {
- ScalarIV = IV->getType()->isIntegerTy()
- ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
- : Builder.CreateCast(Instruction::SIToFP, Induction,
- IV->getType());
+ Value *ScalarIV = CanonicalIV;
+ Type *NeededType = IV->getType();
+ if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
+ ScalarIV =
+ NeededType->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
+ : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
State.CFG.PrevBB);
ScalarIV->setName("offset.idx");
@@ -2493,7 +2554,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
- assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
Value *StartIdx;
if (Step->getType()->isFloatingPointTy())
StartIdx =
@@ -2502,7 +2562,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
Value *EntryPart =
- getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
+ getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
+ State.VF, State.Builder);
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
@@ -2516,9 +2577,31 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// Now do the actual transformations, and start with creating the step value.
Value *Step = CreateStepValue(ID.getStep());
- if (State.VF.isZero() || State.VF.isScalar()) {
+ if (State.VF.isScalar()) {
Value *ScalarIV = CreateScalarIV(Step);
- CreateSplatIV(ScalarIV, Step);
+ Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
+ Step->getType()->getScalarSizeInBits());
+
+ Instruction::BinaryOps IncOp = ID.getInductionOpcode();
+ if (IncOp == Instruction::BinaryOpsEnd)
+ IncOp = Instruction::Add;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *StartIdx = ConstantInt::get(ScalarTy, Part);
+ Instruction::BinaryOps MulOp = Instruction::Mul;
+ if (Step->getType()->isFloatingPointTy()) {
+ StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
+ MulOp = Instruction::FMul;
+ }
+
+ Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
+ Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction");
+ State.set(Def, EntryPart, Part);
+ if (Trunc) {
+ assert(!Step->getType()->isFloatingPointTy() &&
+ "fp inductions shouldn't be truncated");
+ addMetadata(EntryPart, Trunc);
+ }
+ }
return;
}
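A worked example of the new scalar-VF path (values illustrative): with VF = 1, UF = 4 and an integer induction with step %s, each unrolled part gets its own scalar value instead of going through the splat/step-vector path.

    // part 0: %iv + 0 * %s   ; folds to %iv
    // part 1: %iv + 1 * %s
    // part 2: %iv + 2 * %s
    // part 3: %iv + 3 * %s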
@@ -2554,54 +2637,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}
-Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
- Value *Step,
- Instruction::BinaryOps BinOp) {
- // Create and check the types.
- auto *ValVTy = cast<VectorType>(Val->getType());
- ElementCount VLen = ValVTy->getElementCount();
-
- Type *STy = Val->getType()->getScalarType();
- assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
- "Induction Step must be an integer or FP");
- assert(Step->getType() == STy && "Step has wrong type");
-
- SmallVector<Constant *, 8> Indices;
-
- // Create a vector of consecutive numbers from zero to VF.
- VectorType *InitVecValVTy = ValVTy;
- Type *InitVecValSTy = STy;
- if (STy->isFloatingPointTy()) {
- InitVecValSTy =
- IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
- InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
- }
- Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
-
- // Splat the StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
-
- if (STy->isIntegerTy()) {
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
- Step = Builder.CreateVectorSplat(VLen, Step);
- assert(Step->getType() == Val->getType() && "Invalid step vec");
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- Step = Builder.CreateMul(InitVec, Step);
- return Builder.CreateAdd(Val, Step, "induction");
- }
-
- // Floating point induction.
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
- "Binary Opcode should be specified for FP induction");
- InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
- InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
-
- Step = Builder.CreateVectorSplat(VLen, Step);
- Value *MulOp = Builder.CreateFMul(InitVec, Step);
- return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
-}
-
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID,
@@ -2691,11 +2726,6 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
State.set(Def, VectorValue, Instance.Part);
}
-Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
- assert(Vec->getType()->isVectorTy() && "Invalid type");
- return Builder.CreateVectorReverse(Vec, "reverse");
-}
-
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
@@ -2858,7 +2888,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
if (Group->isReverse())
- StridedVec = reverseVector(StridedVec);
+ StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
State.set(VPDefs[J], StridedVec, Part);
}
@@ -2894,7 +2924,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *StoredVec = State.get(StoredValues[i], Part);
if (Group->isReverse())
- StoredVec = reverseVector(StoredVec);
+ StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
// If this member has different type, cast it to a unified type.
@@ -2993,43 +3023,21 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
PredicatedInstructions.push_back(Cloned);
}
-PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
- Value *End, Value *Step,
- Instruction *DL) {
+void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- // As we're just creating this loop, it's possible no latch exists
- // yet. If so, use the header as this will be a single block loop.
- if (!Latch)
- Latch = Header;
-
- IRBuilder<> B(&*Header->getFirstInsertionPt());
- Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
- setDebugLocFromInst(OldInst, &B);
- auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
+ assert(!L->getLoopLatch() && "loop should not have a latch at this point");
- B.SetInsertPoint(Latch->getTerminator());
+ IRBuilder<> B(Header->getTerminator());
+ Instruction *OldInst =
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
setDebugLocFromInst(OldInst, &B);
- // Create i+1 and fill the PHINode.
- //
- // If the tail is not folded, we know that End - Start >= Step (either
- // statically or through the minimum iteration checks). We also know that both
- // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
- // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
- // overflows and we can mark the induction increment as NUW.
- Value *Next = B.CreateAdd(Induction, Step, "index.next",
- /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
- Induction->addIncoming(Start, L->getLoopPreheader());
- Induction->addIncoming(Next, Latch);
- // Create the compare.
- Value *ICmp = B.CreateICmpEQ(Next, End);
- B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
+ // Connect the header to the exit and header blocks and replace the old
+ // terminator.
+ B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
// Now we have two terminators. Remove the old one from the block.
- Latch->getTerminator()->eraseFromParent();
-
- return Induction;
+ Header->getTerminator()->eraseFromParent();
}
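A sketch of the resulting skeleton (block names illustrative): the header now ends in a placeholder branch on the constant true; the real exit condition is materialized later when the VPlan's canonical-IV and BranchOnCount recipes execute, so the skeleton no longer builds an induction PHI or compare itself.

    // vector.body:                      ; header == latch at this point
    //   ...
    //   br i1 true, label %middle.block, label %vector.body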
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
@@ -3099,10 +3107,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
- assert(!VF.isScalable() &&
- "Tail folding not yet supported for scalable vectors");
+ Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
TC = Builder.CreateAdd(
- TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
+ TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
}
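A standalone sketch of the rounding now done for scalable VFs (plain C++, values illustrative; the follow-on urem/sub that produces n.vec is unchanged and only summarized here):

    #include <cstdint>

    uint64_t roundUpTripCount(uint64_t TC, uint64_t VScale, uint64_t VFMin, uint64_t UF) {
      uint64_t NumLanes = VScale * VFMin * UF;   // runtime value of VF * UF
      uint64_t RndUp = TC + (NumLanes - 1);      // n.rnd.up
      return RndUp - (RndUp % NumLanes);         // n.vec, a multiple of NumLanes >= TC
    }
    // e.g. TC = 100, vscale = 2, VF = <vscale x 4>, UF = 2:
    //   NumLanes = 16, n.rnd.up = 115, n.vec = 112 = 7 * 16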
// Now we need to generate the expression for the part of the loop that the
@@ -3436,12 +3443,13 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
}
void InnerLoopVectorizer::createInductionResumeValues(
- Loop *L, Value *VectorTripCount,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
- assert(VectorTripCount && L && "Expected valid arguments");
+ Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
"Inconsistent information about additional bypass.");
+
+ Value *VectorTripCount = getOrCreateVectorTripCount(L);
+ assert(VectorTripCount && L && "Expected valid arguments");
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -3449,6 +3457,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.
+ Instruction *OldInduction = Legal->getPrimaryInduction();
for (auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
@@ -3546,25 +3555,6 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
"Inconsistent vector loop preheader");
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
- Optional<MDNode *> VectorizedLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupVectorized});
- if (VectorizedLoopID.hasValue()) {
- L->setLoopID(VectorizedLoopID.getValue());
-
- // Do not setAlreadyVectorized if loop attributes have been defined
- // explicitly.
- return LoopVectorPreHeader;
- }
-
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
-
- LoopVectorizeHints Hints(L, true, *ORE, TTI);
- Hints.setAlreadyVectorized();
-
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
LI->verify(*DT);
@@ -3573,7 +3563,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
return LoopVectorPreHeader;
}
-BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+std::pair<BasicBlock *, Value *>
+InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3638,33 +3629,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// faster.
emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
- // Some loops have a single integer induction variable, while other loops
- // don't. One example is c++ iterators that often have multiple pointer
- // induction variables. In the code below we also support a case where we
- // don't have a single induction variable.
- //
- // We try to obtain an induction variable from the original loop as hard
- // as possible. However if we don't find one that:
- // - is an integer
- // - counts from zero, stepping by one
- // - is the size of the widest induction variable type
- // then we create a new one.
- OldInduction = Legal->getPrimaryInduction();
- Type *IdxTy = Legal->getWidestInductionType();
- Value *StartIdx = ConstantInt::get(IdxTy, 0);
- // The loop step is equal to the vectorization factor (num of SIMD elements)
- // times the unroll factor (num of SIMD instructions).
- Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
- Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues(Lp, CountRoundDown);
+ createInductionResumeValues(Lp);
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -4088,8 +4058,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
}
}
-void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
- VPTransformState &State) {
+void InnerLoopVectorizer::fixFirstOrderRecurrence(
+ VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
@@ -4334,13 +4304,29 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
}
+ PHINode *ResumePhi =
+ dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
- BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // If we are fixing reductions in the epilogue loop then we should already
+ // have created a bc.merge.rdx Phi after the main vector body. Ensure that
+ // we carry over the incoming values correctly.
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+ if (Incoming == LoopMiddleBlock)
+ BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
+ else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
+ BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
+ Incoming);
+ else
+ BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+ }
+
+ // Set the resume value for this reduction
+ ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
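An illustrative shape of the merge phi after this change (block and value names assumed): in the main-loop pass every bypass edge still carries the original start value, while in the epilogue pass the incoming value from the main loop's iteration-count check is taken from the resume phi created during the first pass.

    // %bc.merge.rdx = phi [ %rdx.start, %iter.check ],
    //                     [ %rdx.start, %vector.memcheck ],
    //                     [ %final.rdx, %middle.block ]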
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
@@ -4557,6 +4543,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
InductionDescriptor II = Legal->getInductionVars().lookup(P);
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
+ PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
+
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
switch (II.getKind()) {
@@ -4572,7 +4561,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
if (Cost->isScalarAfterVectorization(P, State.VF)) {
// This is the normalized GEP that starts counting at zero.
Value *PtrInd =
- Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
+ Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
@@ -4602,10 +4591,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Type *PhiType = II.getStep()->getType();
// Build a pointer phi
- Value *ScalarStartValue = II.getStartValue();
+ Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
Type *ScStValueType = ScalarStartValue->getType();
PHINode *NewPointerPhi =
- PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
+ PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
// A pointer induction, performed by using a gep
@@ -4916,7 +4905,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
+bool LoopVectorizationCostModel::isScalarWithPredication(
+ Instruction *I, ElementCount VF) const {
if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
switch(I->getOpcode()) {
@@ -4928,11 +4918,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
return false;
auto *Ptr = getLoadStorePointerOperand(I);
auto *Ty = getLoadStoreType(I);
+ Type *VTy = Ty;
+ if (VF.isVector())
+ VTy = VectorType::get(Ty, VF);
const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
- TTI.isLegalMaskedGather(Ty, Alignment))
+ TTI.isLegalMaskedGather(VTy, Alignment))
: !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
- TTI.isLegalMaskedScatter(Ty, Alignment));
+ TTI.isLegalMaskedScatter(VTy, Alignment));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -5005,7 +4998,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// If the instruction is a store located in a predicated block, it will be
// scalarized.
- if (isScalarWithPredication(I))
+ if (isScalarWithPredication(I, VF))
return false;
// If the instruction's allocated size doesn't equal it's type size, it
@@ -5056,7 +5049,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I)) {
+ if (isScalarWithPredication(I, VF)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n");
return;
@@ -5531,10 +5524,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
}
- // For scalable vectors, don't use tail folding as this is currently not yet
- // supported. The code is likely to have ended up here if the tripcount is
- // low, in which case it makes sense not to use scalable vectors.
- if (MaxFactors.ScalableVF.isVector())
+ // For scalable vectors don't use tail folding for low trip counts or
+ // optimizing for code size. We only permit this if the user has explicitly
+ // requested it.
+ if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
+ ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
+ MaxFactors.ScalableVF.isVector())
MaxFactors.ScalableVF = ElementCount::getScalable(0);
// If we don't know the precise trip count, or if the trip count that we
@@ -5849,10 +5844,8 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
const Loop &L, ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
- if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
- return Legal->isFirstOrderRecurrence(&Phi) ||
- Legal->isReductionVariable(&Phi);
- }))
+ if (any_of(L.getHeader()->phis(),
+ [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
return false;
// Phis with uses outside of the loop require special handling and are
@@ -5978,11 +5971,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
- for (Type *T : ElementTypesInLoop) {
- MinWidth = std::min<unsigned>(
- MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
- MaxWidth = std::max<unsigned>(
- MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ // For in-loop reductions, no element types are added to ElementTypesInLoop
+ // if there are no loads/stores in the loop. In this case, check through the
+ // reduction variables to determine the maximum width.
+ if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
+ // Reset MaxWidth so that we can find the smallest type used by recurrences
+ // in the loop.
+ MaxWidth = -1U;
+ for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+ const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
+ // When finding the min width used by the recurrence we need to account
+ // for casts on the input operands of the recurrence.
+ MaxWidth = std::min<unsigned>(
+ MaxWidth, std::min<unsigned>(
+ RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
+ RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
+ }
+ } else {
+ for (Type *T : ElementTypesInLoop) {
+ MinWidth = std::min<unsigned>(
+ MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ MaxWidth = std::max<unsigned>(
+ MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ }
}
return {MinWidth, MaxWidth};
}
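A standalone sketch of the reduction-only fallback (plain C++, field names illustrative): when the loop has no loads or stores, the widest-type query takes, per reduction, the smaller of the recurrence type width and the narrowest cast feeding it.

    #include <algorithm>
    #include <vector>

    struct RdxInfo { unsigned CastWidthBits; unsigned RecurrenceWidthBits; };

    unsigned maxWidthFromReductions(const std::vector<RdxInfo> &Rdxs) {
      unsigned MaxWidth = -1U;
      for (const RdxInfo &R : Rdxs)
        MaxWidth = std::min(MaxWidth,
                            std::min(R.CastWidthBits, R.RecurrenceWidthBits));
      return MaxWidth;
    }
    // e.g. an i32 add reduction whose inputs are extended from i8 gives
    // min(8, 32) = 8, permitting a wider VF than the i32 width alone would.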
@@ -6022,18 +6033,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
if (auto *ST = dyn_cast<StoreInst>(&I))
T = ST->getValueOperand()->getType();
- // Ignore loaded pointer types and stored pointer types that are not
- // vectorizable.
- //
- // FIXME: The check here attempts to predict whether a load or store will
- // be vectorized. We only know this for certain after a VF has
- // been selected. Here, we assume that if an access can be
- // vectorized, it will be. We should also look at extending this
- // optimization to non-pointer types.
- //
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
- continue;
+ assert(T->isSized() &&
+ "Expected the load/store/recurrence type to be sized");
ElementTypesInLoop.insert(T);
}
@@ -6475,7 +6476,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
return RUs;
}
-bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
+ ElementCount VF) {
// TODO: Cost model for emulated masked load/store is completely
// broken. This hack guides the cost model to use an artificially
// high enough value to practically disable vectorization with such
@@ -6484,8 +6486,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I) &&
- "Expecting a scalar emulated instruction");
+ assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
@@ -6512,13 +6513,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
if (!blockNeedsPredicationForAnyReason(BB))
continue;
for (Instruction &I : *BB)
- if (isScalarWithPredication(&I)) {
+ if (isScalarWithPredication(&I, VF)) {
ScalarCostsTy ScalarCosts;
// Do not apply discount if scalable, because that would lead to
// invalid scalarization costs.
// Do not apply discount logic if hacked cost is needed
// for emulated masked memrefs.
- if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
+ if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
@@ -6554,7 +6555,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
- if (isScalarWithPredication(I))
+ if (isScalarWithPredication(I, VF))
return false;
// If any of the instruction's operands are uniform after vectorization,
@@ -6601,7 +6602,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), true, false);
@@ -6764,7 +6765,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
- if (isPredicatedInst(I)) {
+ if (isPredicatedInst(I, VF)) {
Cost /= getReciprocalPredBlockProb();
// Add the cost of an i1 extract and a branch
@@ -6775,7 +6776,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
/*Insert=*/false, /*Extract=*/true);
Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
- if (useEmulatedMaskMemRefHack(I))
+ if (useEmulatedMaskMemRefHack(I, VF))
// Artificially setting to a high enough value to practically disable
// vectorization with such operations.
Cost = 3000000;
@@ -7182,7 +7183,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// predicated uniform stores. Today they are treated as any other
// predicated store (see added test cases in
// invariant-store-vectorization.ll).
- if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
NumPredStores++;
if (Legal->isUniformMemOp(I)) {
@@ -7192,7 +7193,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
InstructionCost Cost;
if (isa<StoreInst>(&I) && VF.isScalable() &&
- isLegalGatherOrScatter(&I)) {
+ isLegalGatherOrScatter(&I, VF)) {
Cost = getGatherScatterCost(&I, VF);
setWideningDecision(&I, VF, CM_GatherScatter, Cost);
} else {
@@ -7234,7 +7235,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}
InstructionCost GatherScatterCost =
- isLegalGatherOrScatter(&I)
+ isLegalGatherOrScatter(&I, VF)
? getGatherScatterCost(&I, VF) * NumAccesses
: InstructionCost::getInvalid();
@@ -7437,7 +7438,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF.isVector() && isScalarWithPredication(I)) {
+ if (VF.isVector() && isScalarWithPredication(I, VF)) {
InstructionCost Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
@@ -7941,6 +7942,40 @@ VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
llvm_unreachable("No plan found!");
}
+static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
+
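An illustrative result of the (unchanged, relocated) helper on a loop with no prior unroll metadata (metadata numbering assumed):

    // !llvm.loop !0
    // !0 = distinct !{!0, !1}
    // !1 = !{!"llvm.loop.unroll.runtime.disable"}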
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
VPlan &BestVPlan,
InnerLoopVectorizer &ILV,
@@ -7952,9 +7987,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
- State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
- State.TripCount = ILV.getOrCreateTripCount(nullptr);
- State.CanonicalIV = ILV.Induction;
+ Value *CanonicalIVStartValue;
+ std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
+ ILV.createVectorizedLoopSkeleton();
ILV.collectPoisonGeneratingRecipes(State);
ILV.printDebugTracesAtStart();
@@ -7968,8 +8003,35 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
+ BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
+ ILV.getOrCreateVectorTripCount(nullptr),
+ CanonicalIVStartValue, State);
BestVPlan.execute(&State);
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+
+ Optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized});
+
+ Loop *L = LI->getLoopFor(State.CFG.PrevBB);
+ if (VectorizedLoopID.hasValue())
+ L->setLoopID(VectorizedLoopID.getValue());
+ else {
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
+ L->setLoopID(LID);
+
+ LoopVectorizeHints Hints(L, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+ // Disable runtime unrolling when vectorizing the epilogue loop.
+ if (CanonicalIVStartValue)
+ AddRuntimeUnrollDisableMetaData(L);
+
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop(State);
@@ -8032,66 +8094,16 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
}
}
-Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
-
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
- Value *Step,
- Instruction::BinaryOps BinOp) {
- // When unrolling and the VF is 1, we only need to add a simple scalar.
- Type *Ty = Val->getType();
- assert(!Ty->isVectorTy() && "Val must be a scalar");
-
- if (Ty->isFloatingPointTy()) {
- // Floating-point operations inherit FMF via the builder's flags.
- Value *MulOp = Builder.CreateFMul(StartIdx, Step);
- return Builder.CreateBinOp(BinOp, Val, MulOp);
- }
- return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
-}
-
-static void AddRuntimeUnrollDisableMetaData(Loop *L) {
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
- bool IsUnrollMetadata = false;
- MDNode *LoopID = L->getLoopID();
- if (LoopID) {
- // First find existing loop unrolling disable metadata.
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (MD) {
- const auto *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata =
- S && S->getString().startswith("llvm.loop.unroll.disable");
- }
- MDs.push_back(LoopID->getOperand(i));
- }
- }
-
- if (!IsUnrollMetadata) {
- // Add runtime unroll disable metadata.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(
- MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
- }
-}
-
//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
+std::pair<BasicBlock *, Value *>
+EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
Loop *Lp = createVectorLoopSkeleton("");
@@ -8120,24 +8132,16 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
// Generate the induction variable.
- OldInduction = Legal->getPrimaryInduction();
- Type *IdxTy = Legal->getWidestInductionType();
- Value *StartIdx = ConstantInt::get(IdxTy, 0);
-
- IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
- Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
EPI.VectorTripCount = CountRoundDown;
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Skip induction resume value creation here because they will be created in
// the second pass. If we created them here, they wouldn't be used anyway,
// because the vplan in the second pass still contains the inductions from the
// original loop.
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -8219,7 +8223,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *
+std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
@@ -8275,6 +8279,25 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
+ // The vec.epilog.iter.check block may contain Phi nodes from reductions which
+ // merge control-flow from the latch block and the middle block. Update the
+ // incoming values here and move the Phi into the preheader.
+ SmallVector<PHINode *, 4> PhisInBlock;
+ for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
+ PhisInBlock.push_back(&Phi);
+
+ for (PHINode *Phi : PhisInBlock) {
+ Phi->replaceIncomingBlockWith(
+ VecEpilogueIterationCountCheck->getSinglePredecessor(),
+ VecEpilogueIterationCountCheck);
+ Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
+ if (EPI.SCEVSafetyCheck)
+ Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
+ if (EPI.MemSafetyCheck)
+ Phi->removeIncomingValue(EPI.MemSafetyCheck);
+ Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
+ }
+
// Generate a resume induction for the vector epilogue and put it in the
// vector epilogue preheader
Type *IdxTy = Legal->getWidestInductionType();
@@ -8285,13 +8308,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.MainLoopIterationCountCheck);
// Generate the induction variable.
- OldInduction = Legal->getPrimaryInduction();
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
- Value *StartIdx = EPResumeVal;
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Generate induction resume values. These variables save the new starting
// indexes for the scalar loop. They are used to test if there are any tail
@@ -8300,12 +8317,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
// check, then the resume value for the induction variable comes from
// the trip count of the main vector loop, hence passing the AdditionalBypass
// argument.
- createInductionResumeValues(Lp, CountRoundDown,
- {VecEpilogueIterationCountCheck,
- EPI.VectorTripCount} /* AdditionalBypass */);
+ createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
+ EPI.VectorTripCount} /* AdditionalBypass */);
- AddRuntimeUnrollDisableMetaData(Lp);
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
}
BasicBlock *
@@ -8447,33 +8462,22 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
// Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC.
- // Start by constructing the desired canonical IV in the header block.
- VPValue *IV = nullptr;
- if (Legal->getPrimaryInduction())
- IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
- else {
- VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
- auto *IVRecipe = new VPWidenCanonicalIVRecipe();
- HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi());
- IV = IVRecipe;
- }
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+ // constructing the desired canonical IV in the header block as its first
+ // non-phi instructions.
+ assert(CM.foldTailByMasking() && "must fold the tail");
+ VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
+ HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
- // Create the block in mask as the first non-phi instruction in the block.
VPBuilder::InsertPointGuard Guard(Builder);
- auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
- Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
-
- VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
- bool TailFolded = !CM.isScalarEpilogueAllowed();
-
- if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
- // While ActiveLaneMask is a binary op that consumes the loop tripcount
- // as a second argument, we only pass the IV here and extract the
- // tripcount from the transform state where codegen of the VP instructions
- // happen.
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+ Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+ if (CM.TTI.emitGetActiveLaneMask()) {
+ VPValue *TC = Plan->getOrCreateTripCount();
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
} else {
+ VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
}
return BlockMaskCache[BB] = BlockMask;
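For reference, both mask forms agree per lane as long as the trip count does not wrap: lane L of the part starting at IV is active iff IV + L <= BTC, equivalently IV + L < TC. A minimal scalar sketch, with IV, TC and BTC as plain integers standing in for the widened VPValues (illustrative only, not the recipes' actual codegen):

    #include <cstdint>

    // ICmpULE form: compare against the backedge-taken count, which cannot
    // wrap the way TC = BTC + 1 can when BTC is the maximum value.
    static bool laneActiveULE(uint64_t IV, uint64_t Lane, uint64_t BTC) {
      return IV + Lane <= BTC;
    }

    // ActiveLaneMask form: consumes the trip count directly; equivalent to
    // the ULE form whenever TC == BTC + 1 without overflow.
    static bool laneActiveALM(uint64_t IV, uint64_t Lane, uint64_t TC) {
      return IV + Lane < TC;
    }
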
@@ -8621,7 +8625,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
VFRange &Range) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
+ [this, CI](ElementCount VF) {
+ return CM.isScalarWithPredication(CI, VF);
+ },
Range);
if (IsPredicated)
@@ -8661,7 +8667,8 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
// scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
+ CM.isProfitableToScalarize(I, VF) ||
+ CM.isScalarWithPredication(I, VF);
};
return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
Range);
@@ -8719,7 +8726,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
void VPRecipeBuilder::fixHeaderPhis() {
BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
- for (VPWidenPHIRecipe *R : PhisToFix) {
+ for (VPHeaderPHIRecipe *R : PhisToFix) {
auto *PN = cast<PHINode>(R->getUnderlyingValue());
VPRecipeBase *IncR =
getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
@@ -8735,7 +8742,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
Range);
// Even if the instruction is not marked as uniform, there are certain
@@ -8861,7 +8868,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
return toVPRecipeResult(Recipe);
- VPWidenPHIRecipe *PhiRecipe = nullptr;
+ VPHeaderPHIRecipe *PhiRecipe = nullptr;
if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
VPValue *StartV = Operands[0];
if (Legal->isReductionVariable(Phi)) {
@@ -8882,11 +8889,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
PhisToFix.push_back(PhiRecipe);
} else {
- // TODO: record start and backedge value for remaining pointer induction
- // phis.
+ // TODO: record backedge value for remaining pointer induction phis.
assert(Phi->getType()->isPointerTy() &&
"only pointer phis should be handled here");
- PhiRecipe = new VPWidenPHIRecipe(Phi);
+ assert(Legal->getInductionVars().count(Phi) &&
+ "Not an induction variable");
+ InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
+ VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
+ PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
}
return toVPRecipeResult(PhiRecipe);
@@ -8966,6 +8976,40 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
}
}
+// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
+// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
+// BranchOnCount VPInstruction to the latch.
+static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
+ bool HasNUW, bool IsVPlanNative) {
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+ auto *StartV = Plan.getOrAddVPValue(StartIdx);
+
+ auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
+ VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
+ if (IsVPlanNative)
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ Header->insert(CanonicalIVPHI, Header->begin());
+
+ auto *CanonicalIVIncrement =
+ new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
+ : VPInstruction::CanonicalIVIncrement,
+ {CanonicalIVPHI}, DL);
+ CanonicalIVPHI->addOperand(CanonicalIVIncrement);
+
+ VPBasicBlock *EB = TopRegion->getExitBasicBlock();
+ if (IsVPlanNative) {
+ EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
+ EB->setCondBit(nullptr);
+ }
+ EB->appendRecipe(CanonicalIVIncrement);
+
+ auto *BranchOnCount =
+ new VPInstruction(VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ EB->appendRecipe(BranchOnCount);
+}
+
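A rough scalar analogue of the loop control these three recipes model, assuming constant VF and UF and a vector trip count that is a non-zero multiple of VF * UF (a sketch of the shape only, not the recipes' actual codegen):

    #include <cstdint>

    static void vectorLoopShape(uint64_t VF, uint64_t UF, uint64_t VectorTripCount) {
      for (uint64_t CanonicalIV = 0;;) {      // VPCanonicalIVPHIRecipe, starts at 0
        // ... vector body covering iterations [CanonicalIV, CanonicalIV + VF * UF) ...
        CanonicalIV += VF * UF;               // CanonicalIVIncrement{NUW}
        if (CanonicalIV == VectorTripCount)   // BranchOnCount vs. the vector trip count
          break;
      }
    }
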
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const MapVector<Instruction *, Instruction *> &SinkAfter) {
@@ -9033,6 +9077,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
auto Plan = std::make_unique<VPlan>(TopRegion);
+ Instruction *DLInst =
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
+ DLInst ? DLInst->getDebugLoc() : DebugLoc(),
+ !CM.foldTailByMasking(), false);
+
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
@@ -9194,6 +9244,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
VPlanTransforms::removeRedundantInductionCasts(*Plan);
// Now that sink-after is done, move induction recipes for optimized truncates
@@ -9325,6 +9376,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
OrigLoop, Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
DeadInstructions, *PSE.getSE());
+
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
+ true, true);
return Plan;
}
@@ -9414,16 +9468,19 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
// If tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the end of the latch.
+ // and the live-out instruction of each reduction, at the beginning of the
+ // dedicated latch block.
if (CM.foldTailByMasking()) {
+ Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || PhiR->isInLoop())
continue;
- Builder.setInsertPoint(LatchVPBB);
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
VPValue *Red = PhiR->getBackedgeValue();
+ assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
+ "reduction recipe must be defined before latch");
Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
}
}
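The inserted select has simple per-lane semantics: lanes masked off by the header block mask keep the incoming phi value, active lanes take the newly computed reduction value. A scalar per-lane sketch with illustrative names:

    // Per lane: select Cond, Red, Phi. Inactive tail lanes must not perturb
    // the reduction, so they carry the phi's value through unchanged.
    static int reductionLane(bool HeaderMaskLane, int RedLane, int PhiLane) {
      return HeaderMaskLane ? RedLane : PhiLane;
    }
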
@@ -9682,9 +9739,8 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
- getStartValue()->getLiveInIRValue(),
- getTruncInst(), getVPValue(0), State);
+ auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
+ State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
}
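Conceptually, once the scalar canonical IV value for the current vector iteration is available, a non-canonical integer induction with start value Start and a constant step can be derived from it. A sketch for the integer, non-truncated case (the recipe's real codegen also handles FP inductions, truncation and non-constant steps):

    #include <cstdint>

    // Lane L of unroll part P: the iteration number is CanonicalIV + P * VF + L,
    // and the induction value for that iteration is Start + iteration * Step.
    static int64_t widenedIVLane(int64_t Start, int64_t Step, uint64_t CanonicalIV,
                                 uint64_t VF, unsigned Part, unsigned Lane) {
      return Start + (int64_t)(CanonicalIV + Part * VF + Lane) * Step;
    }
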
void VPWidenPHIRecipe::execute(VPTransformState &State) {
@@ -10013,7 +10069,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
}
- State.set(getVPSingleValue(), NewLI, Part);
+ State.set(this, NewLI, Part);
}
}
@@ -10561,6 +10617,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Checks);
VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+
+ // Ensure that the start values for any VPReductionPHIRecipes are
+ // updated before vectorising the epilogue loop.
+ VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
+ for (VPRecipeBase &R : Header->phis()) {
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ if (auto *Resume = MainILV.getReductionResumeValue(
+ ReductionPhi->getRecurrenceDescriptor())) {
+ VPValue *StartVal = new VPValue(Resume);
+ BestEpiPlan.addExternalDef(StartVal);
+ ReductionPhi->setOperand(0, StartVal);
+ }
+ }
+ }
+
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT);
++LoopsEpilogueVectorized;