Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp        2
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h    27
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp            481
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp             24
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h              7
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp                      1
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h                       34
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp              88
-rw-r--r--   contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp           23
9 files changed, 375 insertions, 312 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index fa2459d1ca02..1f11d4894f77 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1193,7 +1193,7 @@ std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
OpA->getType() != OpB->getType())
return std::nullopt;
- uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+ uint64_t Stride = GTIA.getSequentialElementStride(DL);
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 577ce8000de2..cff72ae263d8 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -167,9 +167,14 @@ public:
}
VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
- DebugLoc DL, const Twine &Name = "") {
- return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL,
- Name);
+ DebugLoc DL, const Twine &Name = "",
+ std::optional<FastMathFlags> FMFs = std::nullopt) {
+ auto *Select =
+ FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
+ *FMFs, DL, Name)
+ : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
+ DL, Name);
+ return tryInsertInstruction(Select);
}
/// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
@@ -341,16 +346,20 @@ public:
/// Return the best VPlan for \p VF.
VPlan &getBestPlanFor(ElementCount VF) const;
- /// Generate the IR code for the body of the vectorized loop according to the
- /// best selected \p VF, \p UF and VPlan \p BestPlan.
+ /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
+ /// according to the best selected \p VF and \p UF.
+ ///
/// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
/// vectorization re-using plans for both the main and epilogue vector loops.
/// It should be removed once the re-use issue has been fixed.
/// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
- /// to re-use expansion results generated during main plan execution. Returns
- /// a mapping of SCEVs to their expanded IR values. Note that this is a
- /// temporary workaround needed due to the current epilogue handling.
- DenseMap<const SCEV *, Value *>
+ /// to re-use expansion results generated during main plan execution.
+ ///
+ /// Returns a mapping of SCEVs to their expanded IR values and a mapping for
+ /// the reduction resume values. Note that this is a temporary workaround
+ /// needed due to the current epilogue handling.
+ std::pair<DenseMap<const SCEV *, Value *>,
+ DenseMap<const RecurrenceDescriptor *, Value *>>
executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
InnerLoopVectorizer &LB, DominatorTree *DT,
bool IsEpilogueVectorization,
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8e135d80f4f2..51ce88480c08 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -584,10 +584,6 @@ public:
/// able to vectorize with strict in-order reductions for the given RdxDesc.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
- // Returns the resume value (bc.merge.rdx) for a reduction as
- // generated by fixReduction.
- PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
-
/// Create a new phi node for the induction variable \p OrigPhi to resume
/// iteration count in the scalar epilogue, from where the vectorized loop
/// left off. \p Step is the SCEV-expanded induction step to use. In cases
@@ -626,9 +622,6 @@ protected:
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
VPlan &Plan, VPTransformState &State);
- /// Handle all cross-iteration phis in the header.
- void fixCrossIterationPHIs(VPTransformState &State);
-
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
@@ -1166,14 +1159,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
}
}
-PHINode *InnerLoopVectorizer::getReductionResumeValue(
- const RecurrenceDescriptor &RdxDesc) {
- auto It = ReductionResumeValues.find(&RdxDesc);
- assert(It != ReductionResumeValues.end() &&
- "Expected to find a resume value for the reduction.");
- return It->second;
-}
-
namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -3434,8 +3419,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
- // This is the second stage of vectorizing recurrences.
- fixCrossIterationPHIs(State);
+ // This is the second stage of vectorizing recurrences. Note that fixing
+ // reduction phis are already modeled in VPlan.
+ // TODO: Also model fixing fixed-order recurrence phis in VPlan.
+ VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
+ for (VPRecipeBase &R : HeaderVPBB->phis()) {
+ if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
+ fixFixedOrderRecurrence(FOR, State);
+ }
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
@@ -3450,7 +3442,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
- VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
+ VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
if (Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
@@ -3503,27 +3495,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VF.getKnownMinValue() * UF);
}
-void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #2: We now need to fix the recurrences by adding incoming edges to
- // the currently empty PHI nodes. At this point every instruction in the
- // original loop is widened to a vector form so we can use them to construct
- // the incoming edges.
- VPBasicBlock *Header =
- State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
-
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
- fixReduction(ReductionPhi, State);
- }
-
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
- fixFixedOrderRecurrence(FOR, State);
- }
-}
-
void InnerLoopVectorizer::fixFixedOrderRecurrence(
VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
@@ -3643,169 +3614,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Phi->setName("scalar.recur");
}
-void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
- VPTransformState &State) {
- PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
- // Get it's reduction variable descriptor.
- assert(Legal->isReductionVariable(OrigPhi) &&
- "Unable to find the reduction variable");
- const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-
- RecurKind RK = RdxDesc.getRecurrenceKind();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
- Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
- State.setDebugLocFrom(I->getDebugLoc());
-
- VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
-
- // Before each round, move the insertion point right between
- // the PHIs and the values we are going to write.
- // This allows us to write both PHINodes and the extractelement
- // instructions.
- Builder.SetInsertPoint(LoopMiddleBlock,
- LoopMiddleBlock->getFirstInsertionPt());
-
- State.setDebugLocFrom(LoopExitInst->getDebugLoc());
-
- Type *PhiTy = OrigPhi->getType();
- // If tail is folded by masking, the vector value to leave the loop should be
- // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
- // instead of the former. For an inloop reduction the reduction will already
- // be predicated, and does not need to be handled here.
- if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
- VPValue *Def = nullptr;
- for (VPUser *U : LoopExitInstDef->users()) {
- auto *S = dyn_cast<VPInstruction>(U);
- if (S && S->getOpcode() == Instruction::Select) {
- Def = S;
- break;
- }
- }
- if (Def)
- LoopExitInstDef = Def;
- }
-
- VectorParts RdxParts(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- RdxParts[Part] = State.get(LoopExitInstDef, Part);
-
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
- if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
- Builder.SetInsertPoint(LoopMiddleBlock,
- LoopMiddleBlock->getFirstInsertionPt());
- Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- }
- }
-
- // Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = RdxParts[0];
- unsigned Op = RecurrenceDescriptor::getOpcode(RK);
-
- // The middle block terminator has already been assigned a DebugLoc here (the
- // OrigLoop's single latch terminator). We want the whole middle block to
- // appear to execute on this line because: (a) it is all compiler generated,
- // (b) these instructions are always executed after evaluating the latch
- // conditional branch, and (c) other passes may add new predecessors which
- // terminate on this line. This is the easiest way to ensure we don't
- // accidentally cause an extra step back into the loop while debugging.
- State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
- if (PhiR->isOrdered())
- ReducedPartRdx = RdxParts[UF - 1];
- else {
- // Floating-point operations should have some FMF to enable the reduction.
- IRBuilderBase::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
- for (unsigned Part = 1; Part < UF; ++Part) {
- Value *RdxPart = RdxParts[Part];
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- ReducedPartRdx = Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
- else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
- ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
- ReducedPartRdx, RdxPart);
- else
- ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
- }
- }
-
- // Create the reduction after the loop. Note that inloop reductions create the
- // target reduction in the loop using a Reduction recipe.
- if (VF.isVector() && !PhiR->isInLoop()) {
- ReducedPartRdx =
- createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
- // If the reduction can be performed in a smaller type, we need to extend
- // the reduction to the wider type before we branch to the original loop.
- if (PhiTy != RdxDesc.getRecurrenceType())
- ReducedPartRdx = RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
- : Builder.CreateZExt(ReducedPartRdx, PhiTy);
- }
-
- PHINode *ResumePhi =
- dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
-
- // Create a phi node that merges control-flow from the backedge-taken check
- // block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
-
- // If we are fixing reductions in the epilogue loop then we should already
- // have created a bc.merge.rdx Phi after the main vector body. Ensure that
- // we carry over the incoming values correctly.
- for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
- if (Incoming == LoopMiddleBlock)
- BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
- else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
- BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
- Incoming);
- else
- BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
- }
-
- // Set the resume value for this reduction
- ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
-
- // If there were stores of the reduction value to a uniform memory address
- // inside the loop, create the final store here.
- if (StoreInst *SI = RdxDesc.IntermediateStore) {
- StoreInst *NewSI =
- Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
- SI->getAlign());
- propagateMetadata(NewSI, SI);
-
- // If the reduction value is used in other places,
- // then let the code below create PHI's for that.
- }
-
- // Now, we need to fix the users of the reduction variable
- // inside and outside of the scalar remainder loop.
-
- // We know that the loop is in LCSSA form. We need to update the PHI nodes
- // in the exit blocks. See comment on analogous loop in
- // fixFixedOrderRecurrence for a more complete explaination of the logic.
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
- for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
- LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
- State.Plan->removeLiveOut(&LCSSAPhi);
- }
-
- // Fix the scalar loop reduction variable with the incoming reduction sum
- // from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx =
- OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
- assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
- // Pick the other block.
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-}
-
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
@@ -5579,21 +5387,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- // If trip count is known or estimated compile time constant, limit the
- // interleave count to be less than the trip count divided by VF, provided it
- // is at least 1.
- //
- // For scalable vectors we can't know if interleaving is beneficial. It may
- // not be beneficial for small loops if none of the lanes in the second vector
- // iterations is enabled. However, for larger loops, there is likely to be a
- // similar benefit as for fixed-width vectors. For now, we choose to leave
- // the InterleaveCount as if vscale is '1', although if some information about
- // the vector is known (e.g. min vector size), we can make a better decision.
- if (BestKnownTC) {
- MaxInterleaveCount =
- std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
- // Make sure MaxInterleaveCount is greater than 0.
- MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
+ unsigned EstimatedVF = VF.getKnownMinValue();
+ if (VF.isScalable()) {
+ if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
+ EstimatedVF *= *VScale;
+ }
+ assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
+
+ unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ if (KnownTC) {
+ // If trip count is known we select between two prospective ICs, where
+ // 1) the aggressive IC is capped by the trip count divided by VF
+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
+ // The final IC is selected in a way that the epilogue loop trip count is
+ // minimized while maximizing the IC itself, so that we either run the
+ // vector loop at least once if it generates a small epilogue loop, or else
+ // we run the vector loop at least twice.
+
+ unsigned InterleaveCountUB = bit_floor(
+ std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+ unsigned InterleaveCountLB = bit_floor(std::max(
+ 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ MaxInterleaveCount = InterleaveCountLB;
+
+ if (InterleaveCountUB != InterleaveCountLB) {
+ unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
+ unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+ // If both produce same scalar tail, maximize the IC to do the same work
+ // in fewer vector loop iterations
+ if (TailTripCountUB == TailTripCountLB)
+ MaxInterleaveCount = InterleaveCountUB;
+ }
+ } else if (BestKnownTC) {
+ // If trip count is an estimated compile time constant, limit the
+ // IC to be capped by the trip count divided by VF * 2, such that the vector
+ // loop runs at least twice to make interleaving seem profitable when there
+ // is an epilogue loop present. Since exact Trip count is not known we
+ // choose to be conservative in our IC estimate.
+ MaxInterleaveCount = bit_floor(std::max(
+ 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
}
assert(MaxInterleaveCount > 0 &&
@@ -7585,7 +7417,65 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
-SCEV2ValueTy LoopVectorizationPlanner::executePlan(
+// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
+// create a merge phi node for it and add it to \p ReductionResumeValues.
+static void createAndCollectMergePhiForReduction(
+ VPInstruction *RedResult,
+ DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
+ VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
+ if (!RedResult ||
+ RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
+ return;
+
+ auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Value *FinalValue =
+ State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
+ auto *ResumePhi =
+ dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
+ // TODO: bc.merge.rdx should not be created here, instead it should be
+ // modeled in VPlan.
+ BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+
+ // If we are fixing reductions in the epilogue loop then we should already
+ // have created a bc.merge.rdx Phi after the main vector body. Ensure that
+ // we carry over the incoming values correctly.
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+ if (Incoming == LoopMiddleBlock)
+ BCBlockPhi->addIncoming(FinalValue, Incoming);
+ else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
+ BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
+ Incoming);
+ else
+ BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+ }
+
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ // TODO: This fixup should instead be modeled in VPlan.
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+
+ ReductionResumeValues[&RdxDesc] = BCBlockPhi;
+}
+
+std::pair<DenseMap<const SCEV *, Value *>,
+ DenseMap<const RecurrenceDescriptor *, Value *>>
+LoopVectorizationPlanner::executePlan(
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
@@ -7664,6 +7554,17 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);
+ // 2.5 Collect reduction resume values.
+ DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
+ auto *ExitVPBB =
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ for (VPRecipeBase &R : *ExitVPBB) {
+ createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
+ ReductionResumeValues, State, OrigLoop,
+ State.CFG.VPBB2IRBB[ExitVPBB]);
+ }
+
+ // 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
MDNode *OrigLoopID = OrigLoop->getLoopID();
@@ -7697,7 +7598,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
- return State.ExpandedSCEVs;
+ return {State.ExpandedSCEVs, ReductionResumeValues};
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -8046,7 +7947,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (ECEntryIt != EdgeMaskCache.end())
return ECEntryIt->second;
- VPValue *SrcMask = createBlockInMask(Src, Plan);
+ VPValue *SrcMask = getBlockInMask(Src);
// The terminator has to be a branch inst!
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
@@ -8108,14 +8009,17 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
BlockMaskCache[Header] = BlockMask;
}
-VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
- // Look for cached value.
- BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
- if (BCEntryIt != BlockMaskCache.end())
- return BCEntryIt->second;
+VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
+ // Return the cached value.
+ BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
+ assert(BCEntryIt != BlockMaskCache.end() &&
+ "Trying to access mask for block without one.");
+ return BCEntryIt->second;
+}
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+ assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
assert(OrigLoop->getHeader() != BB &&
"Loop header must have cached block mask");
@@ -8125,8 +8029,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
- if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
- return BlockMaskCache[BB] = EdgeMask;
+    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
+      BlockMaskCache[BB] = EdgeMask;
+      return;
+    }
if (!BlockMask) { // BlockMask has its initialized nullptr value.
BlockMask = EdgeMask;
@@ -8136,7 +8041,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
}
- return BlockMaskCache[BB] = BlockMask;
+ BlockMaskCache[BB] = BlockMask;
}
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
@@ -8164,7 +8069,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
- Mask = createBlockInMask(I->getParent(), *Plan);
+ Mask = getBlockInMask(I->getParent());
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -8176,8 +8081,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
- auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
- Reverse, I->getDebugLoc());
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ Ptr->getUnderlyingValue()->stripPointerCasts());
+ auto *VectorPtr = new VPVectorPointerRecipe(
+ Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
+ I->getDebugLoc());
Builder.getInsertBlock()->appendRecipe(VectorPtr);
Ptr = VectorPtr;
}
@@ -8383,7 +8291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// all-true mask.
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(CI))
- Mask = createBlockInMask(CI->getParent(), *Plan);
+ Mask = getBlockInMask(CI->getParent());
else
Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
@@ -8426,7 +8334,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
- VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
+ VPValue *Mask = getBlockInMask(I->getParent());
VPValue *One = Plan->getVPValueOrAddLiveIn(
ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS =
@@ -8520,7 +8428,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
// added initially. Masked replicate recipes will later be placed under an
// if-then construct to prevent side-effects. Generate recipes to compute
// the block mask for this region.
- BlockInMask = createBlockInMask(I->getParent(), Plan);
+ BlockInMask = getBlockInMask(I->getParent());
}
auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
@@ -8755,16 +8663,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool HasNUW = Style == TailFoldingStyle::None;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
- // Proactively create header mask. Masks for other blocks are created on
- // demand.
- RecipeBuilder.createHeaderMask(*Plan);
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
VPBasicBlock *VPBB = HeaderVPBB;
+ bool NeedsMasks = CM.foldTailByMasking() ||
+ any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
+ return Legal->blockNeedsPredication(BB);
+ });
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
@@ -8772,6 +8680,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
+ if (VPBB == HeaderVPBB)
+ RecipeBuilder.createHeaderMask(*Plan);
+ else if (NeedsMasks)
+ RecipeBuilder.createBlockInMask(BB, *Plan);
+
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
@@ -8977,10 +8890,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// to reductions, with one operand being vector and the other being the scalar
// reduction chain. For other reductions, a select is introduced between the phi
// and live-out recipes when folding the tail.
+//
+// A ComputeReductionResult recipe is added to the middle block, also for
+// in-loop reductions which compute their result in-loop, because generating
+// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF) {
- VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+ VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
// sank outside of the loop would keep the same order as they had in the
// original loop.
@@ -9020,15 +8938,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPRecipeBase *R : ReductionPHIList)
R->moveBefore(*Header, Header->getFirstNonPhi());
- SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
continue;
- InLoopReductionPhis.push_back(PhiR);
- }
- for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) {
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind Kind = RdxDesc.getRecurrenceKind();
assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
@@ -9119,7 +9033,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (CM.blockNeedsPredicationForAnyReason(BB)) {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(CurrentLink);
- CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
+ CondOp = RecipeBuilder.getBlockInMask(BB);
}
VPReductionRecipe *RedRecipe = new VPReductionRecipe(
@@ -9137,36 +9051,38 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
- if (!PhiR || PhiR->isInLoop())
+ if (!PhiR)
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
// If tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the beginning of the
// dedicated latch block.
- if (CM.foldTailByMasking()) {
- VPValue *Cond =
- RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
- VPValue *Red = PhiR->getBackedgeValue();
- assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
+ auto *OrigExitingVPV = PhiR->getBackedgeValue();
+ auto *NewExitingVPV = PhiR->getBackedgeValue();
+ if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
+ VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
+ assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
- FastMathFlags FMFs = RdxDesc.getFastMathFlags();
Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
- Result =
+ std::optional<FastMathFlags> FMFs =
PhiTy->isFloatingPointTy()
- ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
- : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
- Result->insertBefore(&*Builder.getInsertPoint());
- Red->replaceUsesWithIf(
- Result->getVPSingleValue(),
- [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
+ ? std::make_optional(RdxDesc.getFastMathFlags())
+ : std::nullopt;
+ NewExitingVPV =
+ Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
+ OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
+ return isa<VPInstruction>(&U) &&
+ cast<VPInstruction>(&U)->getOpcode() ==
+ VPInstruction::ComputeReductionResult;
+ });
if (PreferPredicatedReductionSelect ||
TTI.preferPredicatedReductionSelect(
PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
TargetTransformInfo::ReductionFlags()))
- PhiR->setOperand(1, Result->getVPSingleValue());
+ PhiR->setOperand(1, NewExitingVPV);
}
+
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
@@ -9174,18 +9090,40 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxTy = RdxDesc.getRecurrenceType();
- auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
- Result->getVPSingleValue(), RdxTy);
+ auto *Trunc =
+ new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
auto *Extnd =
RdxDesc.isSigned()
? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
: new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
- Trunc->insertAfter(Result);
+ Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
Extnd->insertAfter(Trunc);
- Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
- Trunc->setOperand(0, Result->getVPSingleValue());
+ if (PhiR->getOperand(1) == NewExitingVPV)
+ PhiR->setOperand(1, Extnd->getVPSingleValue());
+ NewExitingVPV = Extnd;
}
+
+ // We want code in the middle block to appear to execute on the location of
+ // the scalar loop's latch terminator because: (a) it is all compiler
+ // generated, (b) these instructions are always executed after evaluating
+ // the latch conditional branch, and (c) other passes may add new
+ // predecessors which terminate on this line. This is the easiest way to
+ // ensure we don't accidentally cause an extra step back into the loop while
+ // debugging.
+ DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
+
+ // TODO: At the moment ComputeReductionResult also drives creation of the
+ // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
+ // even for in-loop reductions, until the reduction resume value handling is
+ // also modeled in VPlan.
+ auto *FinalReductionResult = new VPInstruction(
+ VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
+ ->appendRecipe(FinalReductionResult);
+ OrigExitingVPV->replaceUsesWithIf(
+ FinalReductionResult,
+ [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
}
VPlanTransforms::clearReductionWrapFlags(*Plan);
@@ -10152,8 +10090,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EPI, &LVL, &CM, BFI, PSI, Checks);
VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
- auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
- BestMainPlan, MainILV, DT, true);
+ const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
+ EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
@@ -10194,8 +10132,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Value *ResumeV = nullptr;
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- ResumeV = MainILV.getReductionResumeValue(
- ReductionPhi->getRecurrenceDescriptor());
+ ResumeV = ReductionResumeValues
+ .find(&ReductionPhi->getRecurrenceDescriptor())
+ ->second;
} else {
// Create induction resume values for both widened pointer and
// integer/fp inductions and update the start value of the induction
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 304991526064..8e22b54f002d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10596,7 +10596,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
inversePermutation(E->ReorderIndices, ReorderMask);
if (!ReorderMask.empty())
reorderScalars(GatheredScalars, ReorderMask);
- auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF) {
+ auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
+ unsigned I, unsigned SliceSize) {
if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
return isa<UndefValue>(V) && !isa<PoisonValue>(V);
}))
@@ -10619,11 +10620,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
Idx == 0) ||
(Mask.size() == InputVF &&
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
- std::iota(Mask.begin(), Mask.end(), 0);
+ std::iota(std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(), (I + 1) * SliceSize), 0);
} else {
- unsigned I =
+ unsigned IVal =
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
- std::fill(Mask.begin(), Mask.end(), I);
+ std::fill(std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
}
return true;
};
@@ -10872,7 +10875,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
} else if (Vec1) {
IsUsedInExpr &= FindReusedSplat(
ExtractMask,
- cast<FixedVectorType>(Vec1->getType())->getNumElements());
+ cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
+ ExtractMask.size());
ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
} else {
@@ -10898,7 +10902,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
if (TEs.size() == 1) {
IsUsedInExpr &=
- FindReusedSplat(VecMask, TEs.front()->getVectorFactor());
+ FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
ShuffleBuilder.add(*TEs.front(), VecMask);
if (TEs.front()->VectorizedValue)
IsNonPoisoned &=
@@ -11139,6 +11143,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
case Instruction::ExtractElement: {
Value *V = E->getSingleOperand(0);
+ if (const TreeEntry *TE = getTreeEntry(V))
+ V = TE->VectorizedValue;
setInsertPointAfterBundle(E);
V = FinalShuffle(V, E, VecTy, IsSigned);
E->VectorizedValue = V;
@@ -11903,8 +11909,10 @@ Value *BoUpSLP::vectorizeTree(
if (!Ex) {
// "Reuse" the existing extract to improve final codegen.
if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
- Ex = Builder.CreateExtractElement(ES->getOperand(0),
- ES->getOperand(1));
+ Value *V = ES->getVectorOperand();
+ if (const TreeEntry *ETE = getTreeEntry(V))
+ V = ETE->VectorizedValue;
+ Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
} else {
Ex = Builder.CreateExtractElement(Vec, Lane);
}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a0908..4b3143aead46 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -138,8 +138,11 @@ public:
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True or the loop mask when
- /// tail folding. It returns the *entry* mask for the block BB.
- VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
+ /// tail folding.
+ void createBlockInMask(BasicBlock *BB, VPlan &Plan);
+
+ /// Returns the *entry* mask for the block \p BB.
+ VPValue *getBlockInMask(BasicBlock *BB) const;
/// A helper function that computes the predicate of the edge between SRC
/// and DST.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1d7df9c9575a..b6e56c47c227 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -446,6 +446,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
// ExitBB can be re-used for the exit block of the Plan.
NewBB = State->CFG.ExitBB;
State->CFG.PrevBB = NewBB;
+ State->Builder.SetInsertPoint(NewBB->getFirstNonPHI());
// Update the branch instruction in the predecessor to branch to ExitBB.
VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
index 7d33baac52c9..4b4f4911eb64 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -842,6 +842,12 @@ public:
WrapFlagsTy(bool HasNUW, bool HasNSW) : HasNUW(HasNUW), HasNSW(HasNSW) {}
};
+protected:
+ struct GEPFlagsTy {
+ char IsInBounds : 1;
+ GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
+ };
+
private:
struct DisjointFlagsTy {
char IsDisjoint : 1;
@@ -849,9 +855,6 @@ private:
struct ExactFlagsTy {
char IsExact : 1;
};
- struct GEPFlagsTy {
- char IsInBounds : 1;
- };
struct NonNegFlagsTy {
char NonNeg : 1;
};
@@ -933,12 +936,21 @@ public:
: VPRecipeBase(SC, Operands, DL), OpType(OperationType::FPMathOp),
FMFs(FMFs) {}
+protected:
+ template <typename IterT>
+ VPRecipeWithIRFlags(const unsigned char SC, IterT Operands,
+ GEPFlagsTy GEPFlags, DebugLoc DL = {})
+ : VPRecipeBase(SC, Operands, DL), OpType(OperationType::GEPOp),
+ GEPFlags(GEPFlags) {}
+
+public:
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
R->getVPDefID() == VPRecipeBase::VPWidenSC ||
R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
- R->getVPDefID() == VPRecipeBase::VPReplicateSC;
+ R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+ R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
}
/// Drop all poison-generating flags.
@@ -1061,7 +1073,8 @@ public:
// Increment the canonical IV separately for each unrolled part.
CanonicalIVIncrementForPart,
BranchOnCount,
- BranchOnCond
+ BranchOnCond,
+ ComputeReductionResult,
};
private:
@@ -1360,15 +1373,16 @@ public:
/// A recipe to compute the pointers for widened memory accesses of IndexTy for
/// all parts. If IsReverse is true, compute pointers for accessing the input in
/// reverse order per part.
-class VPVectorPointerRecipe : public VPRecipeBase, public VPValue {
+class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPValue {
Type *IndexedTy;
bool IsReverse;
public:
VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse,
- DebugLoc DL)
- : VPRecipeBase(VPDef::VPVectorPointerSC, {Ptr}, DL), VPValue(this),
- IndexedTy(IndexedTy), IsReverse(IsReverse) {}
+ bool IsInBounds, DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
+ GEPFlagsTy(IsInBounds), DL),
+ VPValue(this), IndexedTy(IndexedTy), IsReverse(IsReverse) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
@@ -3132,6 +3146,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
return Rep->isUniform();
if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
return all_of(GEP->operands(), isUniformAfterVectorization);
+ if (auto *VPI = dyn_cast<VPInstruction>(Def))
+ return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
return false;
}
} // end namespace vputils
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 76961629aece..1f844bce2310 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -28,6 +28,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
@@ -119,7 +120,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return false;
case VPInstructionSC:
switch (cast<VPInstruction>(this)->getOpcode()) {
+ case Instruction::Or:
case Instruction::ICmp:
+ case Instruction::Select:
case VPInstruction::Not:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
@@ -401,6 +404,84 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
return CondBr;
}
+ case VPInstruction::ComputeReductionResult: {
+ if (Part != 0)
+ return State.get(this, 0);
+
+ // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
+ // and will be removed by breaking up the recipe further.
+ auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ // Get its reduction variable descriptor.
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+
+ State.setDebugLocFrom(getDebugLoc());
+
+ VPValue *LoopExitingDef = getOperand(1);
+ Type *PhiTy = OrigPhi->getType();
+ VectorParts RdxParts(State.UF);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ RdxParts[Part] = State.get(LoopExitingDef, Part);
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ // TODO: Handle this in truncateToMinBW.
+ if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ }
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = RdxParts[0];
+ unsigned Op = RecurrenceDescriptor::getOpcode(RK);
+
+ if (PhiR->isOrdered()) {
+ ReducedPartRdx = RdxParts[State.UF - 1];
+ } else {
+ // Floating-point operations should have some FMF to enable the reduction.
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+ for (unsigned Part = 1; Part < State.UF; ++Part) {
+ Value *RdxPart = RdxParts[Part];
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ ReducedPartRdx = Builder.CreateBinOp(
+ (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+ else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+ TrackingVH<Value> ReductionStartValue =
+ RdxDesc.getRecurrenceStartValue();
+ ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+ ReducedPartRdx, RdxPart);
+ } else
+ ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+ }
+ }
+
+ // Create the reduction after the loop. Note that inloop reductions create
+ // the target reduction in the loop using a Reduction recipe.
+ if (State.VF.isVector() && !PhiR->isInLoop()) {
+ ReducedPartRdx =
+ createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (PhiTy != RdxDesc.getRecurrenceType())
+ ReducedPartRdx = RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
+ : Builder.CreateZExt(ReducedPartRdx, PhiTy);
+ }
+
+ // If there were stores of the reduction value to a uniform memory address
+ // inside the loop, create the final store here.
+ if (StoreInst *SI = RdxDesc.IntermediateStore) {
+ auto *NewSI = Builder.CreateAlignedStore(
+ ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
+ propagateMetadata(NewSI, SI);
+ }
+
+ return ReducedPartRdx;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -477,6 +558,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCount:
O << "branch-on-count";
break;
+ case VPInstruction::ComputeReductionResult:
+ O << "compute-reduction-result";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1225,9 +1309,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
? DL.getIndexType(IndexedTy->getPointerTo())
: Builder.getInt32Ty();
Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
- bool InBounds = false;
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = GEP->isInBounds();
+ bool InBounds = isInBounds();
if (IsReverse) {
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 33132880d5a4..5c430620a2dc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -829,15 +829,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
Type *ATy = TypeInfo.inferScalarType(A);
if (TruncTy == ATy) {
Trunc->replaceAllUsesWith(A);
- } else if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
- auto *VPC =
- new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
- VPC->insertBefore(&R);
- Trunc->replaceAllUsesWith(VPC);
- } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
- auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
- VPC->insertBefore(&R);
- Trunc->replaceAllUsesWith(VPC);
+ } else {
+ // Don't replace a scalarizing recipe with a widened cast.
+ if (isa<VPReplicateRecipe>(&R))
+ break;
+ if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+ auto *VPC =
+ new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
+ auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
+ VPC->insertBefore(&R);
+ Trunc->replaceAllUsesWith(VPC);
+ }
}
#ifndef NDEBUG
// Verify that the cached type info is for both A and its users is still