Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  481
1 file changed, 210 insertions(+), 271 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8e135d80f4f2..51ce88480c08 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -584,10 +584,6 @@ public:
/// able to vectorize with strict in-order reductions for the given RdxDesc.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
- // Returns the resume value (bc.merge.rdx) for a reduction as
- // generated by fixReduction.
- PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
-
/// Create a new phi node for the induction variable \p OrigPhi to resume
/// iteration count in the scalar epilogue, from where the vectorized loop
/// left off. \p Step is the SCEV-expanded induction step to use. In cases
@@ -626,9 +622,6 @@ protected:
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
VPlan &Plan, VPTransformState &State);
- /// Handle all cross-iteration phis in the header.
- void fixCrossIterationPHIs(VPTransformState &State);
-
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
@@ -1166,14 +1159,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
}
}
-PHINode *InnerLoopVectorizer::getReductionResumeValue(
- const RecurrenceDescriptor &RdxDesc) {
- auto It = ReductionResumeValues.find(&RdxDesc);
- assert(It != ReductionResumeValues.end() &&
- "Expected to find a resume value for the reduction.");
- return It->second;
-}
-
namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -3434,8 +3419,15 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
- // This is the second stage of vectorizing recurrences.
- fixCrossIterationPHIs(State);
+ // This is the second stage of vectorizing recurrences. Note that fixing
+ // reduction phis is already modeled in VPlan.
+ // TODO: Also model fixing fixed-order recurrence phis in VPlan.
+ VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
+ for (VPRecipeBase &R : HeaderVPBB->phis()) {
+ if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
+ fixFixedOrderRecurrence(FOR, State);
+ }
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
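
The loop added above replaces the old fixCrossIterationPHIs dispatcher: reduction phis are now fixed entirely inside VPlan, so only fixed-order recurrence recipes are visited here. A minimal standalone C++ sketch of that filter-by-recipe-kind pattern (dynamic_cast standing in for llvm::dyn_cast; all class names are toy stand-ins, not LLVM types):

    #include <iostream>
    #include <memory>
    #include <vector>

    struct Recipe { virtual ~Recipe() = default; };
    struct ReductionPhi : Recipe {};          // already handled by VPlan itself
    struct FixedOrderRecurrencePhi : Recipe { // still fixed up after execution
      void fix() const { std::cout << "fixing fixed-order recurrence phi\n"; }
    };

    int main() {
      std::vector<std::unique_ptr<Recipe>> headerPhis;
      headerPhis.push_back(std::make_unique<ReductionPhi>());
      headerPhis.push_back(std::make_unique<FixedOrderRecurrencePhi>());
      for (const auto &R : headerPhis)
        if (auto *FOR = dynamic_cast<FixedOrderRecurrencePhi *>(R.get()))
          FOR->fix(); // reduction phis fall through: nothing left to do here
    }
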
@@ -3450,7 +3442,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
- VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
+ VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
if (Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
@@ -3503,27 +3495,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VF.getKnownMinValue() * UF);
}
-void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #2: We now need to fix the recurrences by adding incoming edges to
- // the currently empty PHI nodes. At this point every instruction in the
- // original loop is widened to a vector form so we can use them to construct
- // the incoming edges.
- VPBasicBlock *Header =
- State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
-
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
- fixReduction(ReductionPhi, State);
- }
-
- for (VPRecipeBase &R : Header->phis()) {
- if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
- fixFixedOrderRecurrence(FOR, State);
- }
-}
-
void InnerLoopVectorizer::fixFixedOrderRecurrence(
VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
@@ -3643,169 +3614,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Phi->setName("scalar.recur");
}
-void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
- VPTransformState &State) {
- PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
- // Get its reduction variable descriptor.
- assert(Legal->isReductionVariable(OrigPhi) &&
- "Unable to find the reduction variable");
- const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-
- RecurKind RK = RdxDesc.getRecurrenceKind();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
- Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
- State.setDebugLocFrom(I->getDebugLoc());
-
- VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
-
- // Before each round, move the insertion point right between
- // the PHIs and the values we are going to write.
- // This allows us to write both PHINodes and the extractelement
- // instructions.
- Builder.SetInsertPoint(LoopMiddleBlock,
- LoopMiddleBlock->getFirstInsertionPt());
-
- State.setDebugLocFrom(LoopExitInst->getDebugLoc());
-
- Type *PhiTy = OrigPhi->getType();
- // If tail is folded by masking, the vector value to leave the loop should be
- // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
- // instead of the former. For an inloop reduction the reduction will already
- // be predicated, and does not need to be handled here.
- if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
- VPValue *Def = nullptr;
- for (VPUser *U : LoopExitInstDef->users()) {
- auto *S = dyn_cast<VPInstruction>(U);
- if (S && S->getOpcode() == Instruction::Select) {
- Def = S;
- break;
- }
- }
- if (Def)
- LoopExitInstDef = Def;
- }
-
- VectorParts RdxParts(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- RdxParts[Part] = State.get(LoopExitInstDef, Part);
-
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
- if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
- Builder.SetInsertPoint(LoopMiddleBlock,
- LoopMiddleBlock->getFirstInsertionPt());
- Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- }
- }
-
- // Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = RdxParts[0];
- unsigned Op = RecurrenceDescriptor::getOpcode(RK);
-
- // The middle block terminator has already been assigned a DebugLoc here (the
- // OrigLoop's single latch terminator). We want the whole middle block to
- // appear to execute on this line because: (a) it is all compiler generated,
- // (b) these instructions are always executed after evaluating the latch
- // conditional branch, and (c) other passes may add new predecessors which
- // terminate on this line. This is the easiest way to ensure we don't
- // accidentally cause an extra step back into the loop while debugging.
- State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
- if (PhiR->isOrdered())
- ReducedPartRdx = RdxParts[UF - 1];
- else {
- // Floating-point operations should have some FMF to enable the reduction.
- IRBuilderBase::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
- for (unsigned Part = 1; Part < UF; ++Part) {
- Value *RdxPart = RdxParts[Part];
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- ReducedPartRdx = Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
- else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
- ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
- ReducedPartRdx, RdxPart);
- else
- ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
- }
- }
-
- // Create the reduction after the loop. Note that inloop reductions create the
- // target reduction in the loop using a Reduction recipe.
- if (VF.isVector() && !PhiR->isInLoop()) {
- ReducedPartRdx =
- createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
- // If the reduction can be performed in a smaller type, we need to extend
- // the reduction to the wider type before we branch to the original loop.
- if (PhiTy != RdxDesc.getRecurrenceType())
- ReducedPartRdx = RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
- : Builder.CreateZExt(ReducedPartRdx, PhiTy);
- }
-
- PHINode *ResumePhi =
- dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
-
- // Create a phi node that merges control-flow from the backedge-taken check
- // block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
-
- // If we are fixing reductions in the epilogue loop then we should already
- // have created a bc.merge.rdx Phi after the main vector body. Ensure that
- // we carry over the incoming values correctly.
- for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
- if (Incoming == LoopMiddleBlock)
- BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
- else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
- BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
- Incoming);
- else
- BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
- }
-
- // Set the resume value for this reduction
- ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
-
- // If there were stores of the reduction value to a uniform memory address
- // inside the loop, create the final store here.
- if (StoreInst *SI = RdxDesc.IntermediateStore) {
- StoreInst *NewSI =
- Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
- SI->getAlign());
- propagateMetadata(NewSI, SI);
-
- // If the reduction value is used in other places,
- // then let the code below create PHI's for that.
- }
-
- // Now, we need to fix the users of the reduction variable
- // inside and outside of the scalar remainder loop.
-
- // We know that the loop is in LCSSA form. We need to update the PHI nodes
- // in the exit blocks. See comment on analogous loop in
- // fixFixedOrderRecurrence for a more complete explanation of the logic.
- if (!Cost->requiresScalarEpilogue(VF.isVector()))
- for (PHINode &LCSSAPhi : LoopExitBlock->phis())
- if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
- LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
- State.Plan->removeLiveOut(&LCSSAPhi);
- }
-
- // Fix the scalar loop reduction variable with the incoming reduction sum
- // from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx =
- OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
- assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
- // Pick the other block.
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-}
-
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
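
For reference, the part-combining step the deleted fixReduction performed is easy to model outside LLVM. A standalone sketch (assuming an integer add reduction with UF partial sums already computed; this mirrors the "bin.rdx" chain, it is not the LLVM code itself):

    #include <cstdio>
    #include <vector>

    int main() {
      const unsigned UF = 4;                        // unroll factor
      std::vector<int> RdxParts = {10, 20, 30, 40}; // one partial sum per part
      int ReducedPartRdx = RdxParts[0];
      for (unsigned Part = 1; Part < UF; ++Part)
        ReducedPartRdx += RdxParts[Part];           // the "bin.rdx" chain for adds
      std::printf("combined parts: %d\n", ReducedPartRdx); // prints 100
    }

This reduce-across-parts step, together with the final horizontal reduction, is what now lives behind the ComputeReductionResult VPlan instruction introduced later in this patch.
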
@@ -5579,21 +5387,45 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- // If trip count is known or estimated compile time constant, limit the
- // interleave count to be less than the trip count divided by VF, provided it
- // is at least 1.
- //
- // For scalable vectors we can't know if interleaving is beneficial. It may
- // not be beneficial for small loops if none of the lanes in the second vector
- // iterations is enabled. However, for larger loops, there is likely to be a
- // similar benefit as for fixed-width vectors. For now, we choose to leave
- // the InterleaveCount as if vscale is '1', although if some information about
- // the vector is known (e.g. min vector size), we can make a better decision.
- if (BestKnownTC) {
- MaxInterleaveCount =
- std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
- // Make sure MaxInterleaveCount is greater than 0.
- MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
+ unsigned EstimatedVF = VF.getKnownMinValue();
+ if (VF.isScalable()) {
+ if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
+ EstimatedVF *= *VScale;
+ }
+ assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
+
+ unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ if (KnownTC) {
+ // If trip count is known we select between two prospective ICs, where
+ // 1) the aggressive IC is capped by the trip count divided by VF
+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
+ // The final IC is selected in a way that the epilogue loop trip count is
+ // minimized while maximizing the IC itself, so that we either run the
+ // vector loop at least once if it generates a small epilogue loop, or else
+ // we run the vector loop at least twice.
+
+ unsigned InterleaveCountUB = bit_floor(
+ std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+ unsigned InterleaveCountLB = bit_floor(std::max(
+ 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ MaxInterleaveCount = InterleaveCountLB;
+
+ if (InterleaveCountUB != InterleaveCountLB) {
+ unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
+ unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+ // If both produce the same scalar tail, maximize the IC to do the same
+ // work in fewer vector loop iterations.
+ if (TailTripCountUB == TailTripCountLB)
+ MaxInterleaveCount = InterleaveCountUB;
+ }
+ } else if (BestKnownTC) {
+ // If the trip count is an estimated compile-time constant, cap the IC at
+ // the trip count divided by (VF * 2), so that the vector loop runs at
+ // least twice; this makes interleaving appear profitable when there is an
+ // epilogue loop present. Since the exact trip count is not known, we
+ // choose to be conservative in our IC estimate.
+ MaxInterleaveCount = bit_floor(std::max(
+ 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
}
assert(MaxInterleaveCount > 0 &&
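
To see the new clamping concretely, here is a worked example as a standalone C++20 program (compile with -std=c++20; the numbers are made up, and std::bit_floor plays the role of LLVM's bit_floor):

    #include <algorithm>
    #include <bit>
    #include <cstdio>

    int main() {
      unsigned KnownTC = 32, EstimatedVF = 4, MaxIC = 8;
      // Aggressive candidate: trip count / VF, rounded down to a power of two.
      unsigned UB = std::bit_floor(std::max(1u, std::min(KnownTC / EstimatedVF, MaxIC)));
      // Conservative candidate: trip count / (VF * 2).
      unsigned LB = std::bit_floor(std::max(1u, std::min(KnownTC / (EstimatedVF * 2), MaxIC)));
      unsigned IC = LB; // default: run the vector loop at least twice
      if (UB != LB) {
        unsigned TailUB = KnownTC % (EstimatedVF * UB);
        unsigned TailLB = KnownTC % (EstimatedVF * LB);
        if (TailUB == TailLB) // identical epilogue either way: take the bigger IC
          IC = UB;
      }
      std::printf("UB=%u LB=%u -> IC=%u\n", UB, LB, IC); // UB=8 LB=4 -> IC=8
    }

With TC=32 and an estimated VF of 4, both candidates leave no scalar tail, so the larger count wins and the vector loop does the same work in fewer iterations.
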
@@ -7585,7 +7417,65 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
}
}
-SCEV2ValueTy LoopVectorizationPlanner::executePlan(
+// Check if \p RedResult is a ComputeReductionResult instruction, and if it
+// is, create a merge phi node for it and add it to \p ReductionResumeValues.
+static void createAndCollectMergePhiForReduction(
+ VPInstruction *RedResult,
+ DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
+ VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
+ if (!RedResult ||
+ RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
+ return;
+
+ auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
+ const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Value *FinalValue =
+ State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
+ auto *ResumePhi =
+ dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
+ // TODO: bc.merge.rdx should not be created here, instead it should be
+ // modeled in VPlan.
+ BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+
+ // If we are fixing reductions in the epilogue loop then we should already
+ // have created a bc.merge.rdx Phi after the main vector body. Ensure that
+ // we carry over the incoming values correctly.
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+ if (Incoming == LoopMiddleBlock)
+ BCBlockPhi->addIncoming(FinalValue, Incoming);
+ else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
+ BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
+ Incoming);
+ else
+ BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+ }
+
+ auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+ // TODO: This fixup should instead be modeled in VPlan.
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+
+ ReductionResumeValues[&RdxDesc] = BCBlockPhi;
+}
+
+std::pair<DenseMap<const SCEV *, Value *>,
+ DenseMap<const RecurrenceDescriptor *, Value *>>
+LoopVectorizationPlanner::executePlan(
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
@@ -7664,6 +7554,17 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);
+ // 2.5. Collect reduction resume values.
+ DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
+ auto *ExitVPBB =
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ for (VPRecipeBase &R : *ExitVPBB) {
+ createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
+ ReductionResumeValues, State, OrigLoop,
+ State.CFG.VPBB2IRBB[ExitVPBB]);
+ }
+
+ // 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
MDNode *OrigLoopID = OrigLoop->getLoopID();
@@ -7697,7 +7598,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
- return State.ExpandedSCEVs;
+ return {State.ExpandedSCEVs, ReductionResumeValues};
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
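
The call-site change this enables is shown near the end of the patch: executePlan's result is now unpacked with structured bindings. A hedged sketch of the plumbing with toy types (std::map standing in for DenseMap, strings standing in for SCEV and RecurrenceDescriptor keys):

    #include <map>
    #include <string>
    #include <utility>

    using SCEVMap = std::map<std::string, int>;
    using RdxMap = std::map<std::string, int>;

    // Returns both maps at once, as the new executePlan signature does.
    std::pair<SCEVMap, RdxMap> executePlan() {
      return {{{"trip.count", 64}}, {{"sum.rdx", 42}}};
    }

    int main() {
      const auto &[ExpandedSCEVs, ReductionResumeValues] = executePlan();
      // The epilogue-vectorization path looks resume values up by descriptor.
      return ReductionResumeValues.find("sum.rdx")->second == 42 ? 0 : 1;
    }
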
@@ -8046,7 +7947,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
if (ECEntryIt != EdgeMaskCache.end())
return ECEntryIt->second;
- VPValue *SrcMask = createBlockInMask(Src, Plan);
+ VPValue *SrcMask = getBlockInMask(Src);
// The terminator has to be a branch inst!
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
@@ -8108,14 +8009,17 @@ void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
BlockMaskCache[Header] = BlockMask;
}
-VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
- // Look for cached value.
- BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
- if (BCEntryIt != BlockMaskCache.end())
- return BCEntryIt->second;
+VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
+ // Return the cached value.
+ BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
+ assert(BCEntryIt != BlockMaskCache.end() &&
+ "Trying to access mask for block without one.");
+ return BCEntryIt->second;
+}
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+ assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
assert(OrigLoop->getHeader() != BB &&
"Loop header must have cached block mask");
@@ -8125,8 +8029,9 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
- if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
- return BlockMaskCache[BB] = EdgeMask;
+ if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
+ BlockMaskCache[BB] = EdgeMask;
+ return;
+ }
if (!BlockMask) { // BlockMask has its initialized nullptr value.
BlockMask = EdgeMask;
@@ -8136,7 +8041,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
}
- return BlockMaskCache[BB] = BlockMask;
+ BlockMaskCache[BB] = BlockMask;
}
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
@@ -8164,7 +8069,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
- Mask = createBlockInMask(I->getParent(), *Plan);
+ Mask = getBlockInMask(I->getParent());
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -8176,8 +8081,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
- auto *VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
- Reverse, I->getDebugLoc());
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ Ptr->getUnderlyingValue()->stripPointerCasts());
+ auto *VectorPtr = new VPVectorPointerRecipe(
+ Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
+ I->getDebugLoc());
Builder.getInsertBlock()->appendRecipe(VectorPtr);
Ptr = VectorPtr;
}
@@ -8383,7 +8291,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// all-true mask.
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(CI))
- Mask = createBlockInMask(CI->getParent(), *Plan);
+ Mask = getBlockInMask(CI->getParent());
else
Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
@@ -8426,7 +8334,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
- VPValue *Mask = createBlockInMask(I->getParent(), *Plan);
+ VPValue *Mask = getBlockInMask(I->getParent());
VPValue *One = Plan->getVPValueOrAddLiveIn(
ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS =
@@ -8520,7 +8428,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
// added initially. Masked replicate recipes will later be placed under an
// if-then construct to prevent side-effects. Generate recipes to compute
// the block mask for this region.
- BlockInMask = createBlockInMask(I->getParent(), Plan);
+ BlockInMask = getBlockInMask(I->getParent());
}
auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
@@ -8755,16 +8663,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool HasNUW = Style == TailFoldingStyle::None;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
- // Proactively create header mask. Masks for other blocks are created on
- // demand.
- RecipeBuilder.createHeaderMask(*Plan);
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
VPBasicBlock *VPBB = HeaderVPBB;
+ bool NeedsMasks = CM.foldTailByMasking() ||
+ any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
+ return Legal->blockNeedsPredication(BB);
+ });
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
// Relevant instructions from basic block BB will be grouped into VPRecipe
// ingredients and fill a new VPBasicBlock.
@@ -8772,6 +8680,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->setName(BB->getName());
Builder.setInsertPoint(VPBB);
+ if (VPBB == HeaderVPBB)
+ RecipeBuilder.createHeaderMask(*Plan);
+ else if (NeedsMasks)
+ RecipeBuilder.createBlockInMask(BB, *Plan);
+
// Introduce each ingredient into VPlan.
// TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
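
Instead of building only the header mask up front and the rest lazily, the builder now decides once whether any mask will be needed and, if so, materializes each block's mask at the point the RPO walk reaches that block. A small sketch of that control flow with toy stand-ins (the Block struct and block names are invented for illustration):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      struct Block { std::string Name; bool NeedsPredication; };
      // Blocks in reverse post-order, as produced by LoopBlocksDFS.
      std::vector<Block> RPO = {{"header", false}, {"if.then", true}, {"latch", false}};
      bool FoldTailByMasking = false;
      bool NeedsMasks = FoldTailByMasking ||
                        std::any_of(RPO.begin(), RPO.end(), [](const Block &B) {
                          return B.NeedsPredication;
                        });
      for (const Block &B : RPO) {
        if (B.Name == "header")
          std::cout << "createHeaderMask()\n"; // header mask is always created
        else if (NeedsMasks)
          std::cout << "createBlockInMask(" << B.Name << ")\n";
      }
    }
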
@@ -8977,10 +8890,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// to reductions, with one operand being vector and the other being the scalar
// reduction chain. For other reductions, a select is introduced between the phi
// and live-out recipes when folding the tail.
+//
+// A ComputeReductionResult recipe is added to the middle block, also for
+// in-loop reductions which compute their result in-loop, because generating
+// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
void LoopVectorizationPlanner::adjustRecipesForReductions(
VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF) {
- VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+ VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
// Gather all VPReductionPHIRecipes and sort them so that intermediate stores
// sunk outside of the loop keep the same order as they had in the original
// loop.
@@ -9020,15 +8938,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPRecipeBase *R : ReductionPHIList)
R->moveBefore(*Header, Header->getFirstNonPhi());
- SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
continue;
- InLoopReductionPhis.push_back(PhiR);
- }
- for (VPReductionPHIRecipe *PhiR : InLoopReductionPhis) {
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
RecurKind Kind = RdxDesc.getRecurrenceKind();
assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
@@ -9119,7 +9033,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (CM.blockNeedsPredicationForAnyReason(BB)) {
VPBuilder::InsertPointGuard Guard(Builder);
Builder.setInsertPoint(CurrentLink);
- CondOp = RecipeBuilder.createBlockInMask(BB, *Plan);
+ CondOp = RecipeBuilder.getBlockInMask(BB);
}
VPReductionRecipe *RedRecipe = new VPReductionRecipe(
@@ -9137,36 +9051,38 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
- if (!PhiR || PhiR->isInLoop())
+ if (!PhiR)
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- auto *Result = PhiR->getBackedgeValue()->getDefiningRecipe();
// If tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the beginning of the
// dedicated latch block.
- if (CM.foldTailByMasking()) {
- VPValue *Cond =
- RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan);
- VPValue *Red = PhiR->getBackedgeValue();
- assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
+ auto *OrigExitingVPV = PhiR->getBackedgeValue();
+ auto *NewExitingVPV = PhiR->getBackedgeValue();
+ if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
+ VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
+ assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
"reduction recipe must be defined before latch");
- FastMathFlags FMFs = RdxDesc.getFastMathFlags();
Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
- Result =
+ std::optional<FastMathFlags> FMFs =
PhiTy->isFloatingPointTy()
- ? new VPInstruction(Instruction::Select, {Cond, Red, PhiR}, FMFs)
- : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
- Result->insertBefore(&*Builder.getInsertPoint());
- Red->replaceUsesWithIf(
- Result->getVPSingleValue(),
- [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
+ ? std::make_optional(RdxDesc.getFastMathFlags())
+ : std::nullopt;
+ NewExitingVPV =
+ Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
+ OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
+ return isa<VPInstruction>(&U) &&
+ cast<VPInstruction>(&U)->getOpcode() ==
+ VPInstruction::ComputeReductionResult;
+ });
if (PreferPredicatedReductionSelect ||
TTI.preferPredicatedReductionSelect(
PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
TargetTransformInfo::ReductionFlags()))
- PhiR->setOperand(1, Result->getVPSingleValue());
+ PhiR->setOperand(1, NewExitingVPV);
}
+
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
@@ -9174,18 +9090,40 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
Type *RdxTy = RdxDesc.getRecurrenceType();
- auto *Trunc = new VPWidenCastRecipe(Instruction::Trunc,
- Result->getVPSingleValue(), RdxTy);
+ auto *Trunc =
+ new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
auto *Extnd =
RdxDesc.isSigned()
? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
: new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
- Trunc->insertAfter(Result);
+ Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
Extnd->insertAfter(Trunc);
- Result->getVPSingleValue()->replaceAllUsesWith(Extnd);
- Trunc->setOperand(0, Result->getVPSingleValue());
+ if (PhiR->getOperand(1) == NewExitingVPV)
+ PhiR->setOperand(1, Extnd->getVPSingleValue());
+ NewExitingVPV = Extnd;
}
+
+ // We want code in the middle block to appear to execute on the location of
+ // the scalar loop's latch terminator because: (a) it is all compiler
+ // generated, (b) these instructions are always executed after evaluating
+ // the latch conditional branch, and (c) other passes may add new
+ // predecessors which terminate on this line. This is the easiest way to
+ // ensure we don't accidentally cause an extra step back into the loop while
+ // debugging.
+ DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
+
+ // TODO: At the moment ComputeReductionResult also drives creation of the
+ // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
+ // even for in-loop reductions, until the reduction resume value handling is
+ // also modeled in VPlan.
+ auto *FinalReductionResult = new VPInstruction(
+ VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
+ ->appendRecipe(FinalReductionResult);
+ OrigExitingVPV->replaceUsesWithIf(
+ FinalReductionResult,
+ [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
}
VPlanTransforms::clearReductionWrapFlags(*Plan);
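
One piece worth illustrating is the truncate-then-extend trick the recipe chain models for narrowable reductions: the arithmetic is done in the smaller recurrence type, and only the final value is sign- or zero-extended back to the phi's type. A minimal standalone illustration (toy values; the transform is only legal when the recurrence provably fits the narrow type):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int32_t> Src = {50, 27, 3, -2};  // values known to fit in i8
      int8_t Narrow = 0;                           // accumulate in the narrow RdxTy
      for (int32_t V : Src)
        Narrow += static_cast<int8_t>(V);          // trunc each input, add in i8
      int32_t Wide = static_cast<int32_t>(Narrow); // SExt back to the phi type
      std::printf("%d\n", Wide);                   // prints 78
    }
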
@@ -10152,8 +10090,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EPI, &LVL, &CM, BFI, PSI, Checks);
VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
- auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
- BestMainPlan, MainILV, DT, true);
+ const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
+ EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
@@ -10194,8 +10132,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Value *ResumeV = nullptr;
// TODO: Move setting of resume values to prepareToExecute.
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
- ResumeV = MainILV.getReductionResumeValue(
- ReductionPhi->getRecurrenceDescriptor());
+ ResumeV = ReductionResumeValues
+ .find(&ReductionPhi->getRecurrenceDescriptor())
+ ->second;
} else {
// Create induction resume values for both widened pointer and
// integer/fp inductions and update the start value of the induction