author     Dimitry Andric <dim@FreeBSD.org>  2024-07-27 23:34:35 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2024-10-23 18:26:01 +0000
commit     0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583 (patch)
tree       6cf5ab1f05330c6773b1f3f64799d56a9c7a1faa /contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
parent     6b9f7133aba44189d9625c352bc2c2a59baf18ef (diff)
parent     ac9a064cb179f3425b310fa2847f8764ac970a4d (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp  | 664
1 file changed, 507 insertions, 157 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 3eeb1a6948f2..58de6256900f 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -17,13 +17,16 @@
//===----------------------------------------------------------------------===//
#include "VPlan.h"
+#include "LoopVectorizationPlanner.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
+#include "VPlanPatternMatch.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -46,6 +49,7 @@
#include <vector>
using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
@@ -212,6 +216,14 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
return It;
}
+VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
+ DominatorTree *DT, IRBuilderBase &Builder,
+ InnerLoopVectorizer *ILV, VPlan *Plan,
+ LLVMContext &Ctx)
+ : VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
+ LVer(nullptr),
+ TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}
+
Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
if (Def->isLiveIn())
return Def->getLiveInIRValue();
@@ -220,6 +232,11 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
return Data
.PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
}
+ if (!Instance.Lane.isFirstLane() &&
+ vputils::isUniformAfterVectorization(Def) &&
+ hasScalarValue(Def, {Instance.Part, VPLane::getFirstLane()})) {
+ return Data.PerPartScalars[Def][Instance.Part][0];
+ }
assert(hasVectorValue(Def, Instance.Part));
auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
@@ -234,7 +251,17 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
return Extract;
}
-Value *VPTransformState::get(VPValue *Def, unsigned Part) {
+Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
+ if (NeedsScalar) {
+ assert((VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def, Part) ||
+ !vputils::onlyFirstLaneUsed(Def) ||
+ (hasScalarValue(Def, VPIteration(Part, 0)) &&
+ Data.PerPartScalars[Def][Part].size() == 1)) &&
+ "Trying to access a single scalar per part but has multiple scalars "
+ "per part.");
+ return get(Def, VPIteration(Part, 0));
+ }
+
// If Values have been set for this Def return the one relevant for \p Part.
if (hasVectorValue(Def, Part))
return Data.PerPartOutput[Def][Part];
@@ -339,23 +366,14 @@ void VPTransformState::addNewMetadata(Instruction *To,
LVer->annotateInstWithNoAlias(To, Orig);
}
-void VPTransformState::addMetadata(Instruction *To, Instruction *From) {
- // No source instruction to transfer metadata from?
- if (!From)
- return;
-
- propagateMetadata(To, From);
- addNewMetadata(To, From);
-}
-
-void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) {
+void VPTransformState::addMetadata(Value *To, Instruction *From) {
// No source instruction to transfer metadata from?
if (!From)
return;
- for (Value *V : To) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- addMetadata(I, From);
+ if (Instruction *ToI = dyn_cast<Instruction>(To)) {
+ propagateMetadata(ToI, From);
+ addNewMetadata(ToI, From);
}
}
@@ -426,10 +444,42 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, NewBB);
}
+ CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}});
}
return NewBB;
}
+void VPIRBasicBlock::execute(VPTransformState *State) {
+ assert(getHierarchicalSuccessors().size() <= 2 &&
+ "VPIRBasicBlock can have at most two successors at the moment!");
+ State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
+ executeRecipes(State, getIRBasicBlock());
+ if (getSingleSuccessor()) {
+ assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
+ auto *Br = State->Builder.CreateBr(getIRBasicBlock());
+ Br->setOperand(0, nullptr);
+ getIRBasicBlock()->getTerminator()->eraseFromParent();
+ }
+
+ for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
+ VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
+ BasicBlock *PredBB = State->CFG.VPBB2IRBB[PredVPBB];
+ assert(PredBB && "Predecessor basic-block not found building successor.");
+ LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+
+ auto *PredBBTerminator = PredBB->getTerminator();
+ auto *TermBr = cast<BranchInst>(PredBBTerminator);
+ // Set each forward successor here when it is created, excluding
+ // backedges. A backward successor is set when the branch is created.
+ const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
+ unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ assert(!TermBr->getSuccessor(idx) &&
+ "Trying to reset an existing successor block.");
+ TermBr->setSuccessor(idx, IRBB);
+ State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}});
+ }
+}
+
void VPBasicBlock::execute(VPTransformState *State) {
bool Replica = State->Instance && !State->Instance->isFirstIteration();
VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
@@ -441,29 +491,14 @@ void VPBasicBlock::execute(VPTransformState *State) {
return R && !R->isReplicator();
};
- // 1. Create an IR basic block, or reuse the last one or ExitBB if possible.
- if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) {
- // ExitBB can be re-used for the exit block of the Plan.
- NewBB = State->CFG.ExitBB;
- State->CFG.PrevBB = NewBB;
- State->Builder.SetInsertPoint(NewBB->getFirstNonPHI());
-
- // Update the branch instruction in the predecessor to branch to ExitBB.
- VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
- VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock();
- assert(PredVPB->getSingleSuccessor() == this &&
- "predecessor must have the current block as only successor");
- BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB];
- // The Exit block of a loop is always set to be successor 0 of the Exiting
- // block.
- cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
- } else if (PrevVPBB && /* A */
- !((SingleHPred = getSingleHierarchicalPredecessor()) &&
- SingleHPred->getExitingBasicBlock() == PrevVPBB &&
- PrevVPBB->getSingleHierarchicalSuccessor() &&
- (SingleHPred->getParent() == getEnclosingLoopRegion() &&
- !IsLoopRegion(SingleHPred))) && /* B */
- !(Replica && getPredecessors().empty())) { /* C */
+ // 1. Create an IR basic block.
+ if (PrevVPBB && /* A */
+ !((SingleHPred = getSingleHierarchicalPredecessor()) &&
+ SingleHPred->getExitingBasicBlock() == PrevVPBB &&
+ PrevVPBB->getSingleHierarchicalSuccessor() &&
+ (SingleHPred->getParent() == getEnclosingLoopRegion() &&
+ !IsLoopRegion(SingleHPred))) && /* B */
+ !(Replica && getPredecessors().empty())) { /* C */
// The last IR basic block is reused, as an optimization, in three cases:
// A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null;
// B. when the current VPBB has a single (hierarchical) predecessor which
@@ -486,16 +521,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
}
// 2. Fill the IR basic block with IR instructions.
- LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
- << " in BB:" << NewBB->getName() << '\n');
-
- State->CFG.VPBB2IRBB[this] = NewBB;
- State->CFG.PrevVPBB = this;
-
- for (VPRecipeBase &Recipe : Recipes)
- Recipe.execute(*State);
-
- LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
+ executeRecipes(State, NewBB);
}
void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
@@ -508,6 +534,19 @@ void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
}
}
+void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) {
+ LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
+ << " in BB:" << BB->getName() << '\n');
+
+ State->CFG.VPBB2IRBB[this] = BB;
+ State->CFG.PrevVPBB = this;
+
+ for (VPRecipeBase &Recipe : Recipes)
+ Recipe.execute(*State);
+
+ LLVM_DEBUG(dbgs() << "LV: filled BB:" << *BB);
+}
+
VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
assert((SplitAt == end() || SplitAt->getParent() == this) &&
"can only split at a position in the same block");
@@ -552,14 +591,13 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
}
const VPRecipeBase *R = &VPBB->back();
- auto *VPI = dyn_cast<VPInstruction>(R);
- bool IsCondBranch =
- isa<VPBranchOnMaskRecipe>(R) ||
- (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond ||
- VPI->getOpcode() == VPInstruction::BranchOnCount));
+ bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
+ match(R, m_BranchOnCond(m_VPValue())) ||
+ match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
(void)IsCondBranch;
- if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) {
+ if (VPBB->getNumSuccessors() >= 2 ||
+ (VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
assert(IsCondBranch && "block with multiple successors not terminated by "
"conditional branch recipe");
@@ -585,7 +623,7 @@ const VPRecipeBase *VPBasicBlock::getTerminator() const {
}
bool VPBasicBlock::isExiting() const {
- return getParent()->getExitingBasicBlock() == this;
+ return getParent() && getParent()->getExitingBasicBlock() == this;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -615,6 +653,72 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
}
#endif
+static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry);
+
+// Clone the CFG for all nodes reachable from \p Entry; this includes cloning
+// the blocks and their recipes. Operands of cloned recipes will NOT be updated.
+// Remapping of operands must be done separately. Returns a pair with the new
+// entry and exiting blocks of the cloned region. If \p Entry isn't part of a
+// region, return nullptr for the exiting block.
+static std::pair<VPBlockBase *, VPBlockBase *> cloneFrom(VPBlockBase *Entry) {
+ DenseMap<VPBlockBase *, VPBlockBase *> Old2NewVPBlocks;
+ VPBlockBase *Exiting = nullptr;
+ bool InRegion = Entry->getParent();
+ // First, clone blocks reachable from Entry.
+ for (VPBlockBase *BB : vp_depth_first_shallow(Entry)) {
+ VPBlockBase *NewBB = BB->clone();
+ Old2NewVPBlocks[BB] = NewBB;
+ if (InRegion && BB->getNumSuccessors() == 0) {
+ assert(!Exiting && "Multiple exiting blocks?");
+ Exiting = BB;
+ }
+ }
+ assert((!InRegion || Exiting) && "regions must have a single exiting block");
+
+ // Second, update the predecessors & successors of the cloned blocks.
+ for (VPBlockBase *BB : vp_depth_first_shallow(Entry)) {
+ VPBlockBase *NewBB = Old2NewVPBlocks[BB];
+ SmallVector<VPBlockBase *> NewPreds;
+ for (VPBlockBase *Pred : BB->getPredecessors()) {
+ NewPreds.push_back(Old2NewVPBlocks[Pred]);
+ }
+ NewBB->setPredecessors(NewPreds);
+ SmallVector<VPBlockBase *> NewSuccs;
+ for (VPBlockBase *Succ : BB->successors()) {
+ NewSuccs.push_back(Old2NewVPBlocks[Succ]);
+ }
+ NewBB->setSuccessors(NewSuccs);
+ }
+
+#if !defined(NDEBUG)
+ // Verify that the order of predecessors and successors matches in the cloned
+ // version.
+ for (const auto &[OldBB, NewBB] :
+ zip(vp_depth_first_shallow(Entry),
+ vp_depth_first_shallow(Old2NewVPBlocks[Entry]))) {
+ for (const auto &[OldPred, NewPred] :
+ zip(OldBB->getPredecessors(), NewBB->getPredecessors()))
+ assert(NewPred == Old2NewVPBlocks[OldPred] && "Different predecessors");
+
+ for (const auto &[OldSucc, NewSucc] :
+ zip(OldBB->successors(), NewBB->successors()))
+ assert(NewSucc == Old2NewVPBlocks[OldSucc] && "Different successors");
+ }
+#endif
+
+ return std::make_pair(Old2NewVPBlocks[Entry],
+ Exiting ? Old2NewVPBlocks[Exiting] : nullptr);
+}
+
+VPRegionBlock *VPRegionBlock::clone() {
+ const auto &[NewEntry, NewExiting] = cloneFrom(getEntry());
+ auto *NewRegion =
+ new VPRegionBlock(NewEntry, NewExiting, getName(), isReplicator());
+ for (VPBlockBase *Block : vp_depth_first_shallow(NewEntry))
+ Block->setParent(NewRegion);
+ return NewRegion;
+}
+
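// Illustrative sketch (not part of this commit): the two-phase pattern used by
// cloneFrom/clone above -- first clone every node into an old->new map, then
// rewire edges through the map -- shown on a deliberately simplified graph
// type so it stays self-contained and compilable. "MiniBlock" is a made-up
// stand-in for VPBlockBase, not an LLVM type.
#include <map>
#include <vector>
struct MiniBlock {
  std::vector<MiniBlock *> Succs;
};
static std::map<MiniBlock *, MiniBlock *>
cloneGraph(const std::vector<MiniBlock *> &Blocks) {
  // Assumes Blocks contains every reachable block, as cloneFrom does for the
  // blocks reachable from Entry.
  std::map<MiniBlock *, MiniBlock *> Old2New;
  // Phase 1: clone the nodes; the copied edges still point at the old blocks.
  for (MiniBlock *B : Blocks)
    Old2New[B] = new MiniBlock(*B);
  // Phase 2: remap every edge through the map; doing this in a second pass is
  // what makes cycles (e.g. loop backedges) work.
  for (MiniBlock *B : Blocks)
    for (MiniBlock *&S : Old2New[B]->Succs)
      S = Old2New[S];
  return Old2New;
}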
void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
// Drop all references in VPBasicBlocks and replace all uses with
@@ -673,6 +777,48 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Instance.reset();
}
+InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
+ InstructionCost Cost = 0;
+ for (VPRecipeBase &R : Recipes)
+ Cost += R.cost(VF, Ctx);
+ return Cost;
+}
+
+InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
+ if (!isReplicator()) {
+ InstructionCost Cost = 0;
+ for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
+ Cost += Block->cost(VF, Ctx);
+ InstructionCost BackedgeCost =
+ Ctx.TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+ LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
+ << ": vector loop backedge\n");
+ Cost += BackedgeCost;
+ return Cost;
+ }
+
+ // Compute the cost of a replicate region. Replicating isn't supported for
+ // scalable vectors, return an invalid cost for them.
+ // TODO: Discard scalable VPlans with replicate recipes earlier after
+ // construction.
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+
+  // First compute the cost of the conditionally executed recipes, then
+ // account for the branching cost, except if the mask is a header mask or
+ // uniform condition.
+ using namespace llvm::VPlanPatternMatch;
+ VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
+ InstructionCost ThenCost = Then->cost(VF, Ctx);
+
+ // For the scalar case, we may not always execute the original predicated
+  // block. Thus, scale the block's cost by the probability of executing it.
+ if (VF.isScalar())
+ return ThenCost / getReciprocalPredBlockProb();
+
+ return ThenCost;
+}
+
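// Illustrative arithmetic (not part of this commit): how the VF=1 replicate
// region cost above is scaled. getReciprocalPredBlockProb() in
// LoopVectorize.cpp returns 2, i.e. a predicated block is assumed to execute
// on roughly half of the iterations; the constant below mirrors that
// assumption instead of calling the real helper.
static unsigned scaledPredicatedBlockCost(unsigned ThenCost) {
  const unsigned ReciprocalPredBlockProb = 2; // assumed 50% execution probability
  return ThenCost / ReciprocalPredBlockProb;  // e.g. a "then" cost of 8 becomes 4
}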
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -709,17 +855,61 @@ VPlan::~VPlan() {
delete BackedgeTakenCount;
}
-VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
- VPBasicBlock *Preheader = new VPBasicBlock("ph");
+VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE,
+ bool RequiresScalarEpilogueCheck,
+ bool TailFolded, Loop *TheLoop) {
+ VPIRBasicBlock *Entry = new VPIRBasicBlock(TheLoop->getLoopPreheader());
VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
- auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader);
+ auto Plan = std::make_unique<VPlan>(Entry, VecPreheader);
Plan->TripCount =
vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE);
- // Create empty VPRegionBlock, to be filled during processing later.
- auto *TopRegion = new VPRegionBlock("vector loop", false /*isReplicator*/);
+ // Create VPRegionBlock, with empty header and latch blocks, to be filled
+ // during processing later.
+ VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
+ VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
+ VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
+ auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop",
+ false /*isReplicator*/);
+
VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
+
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ if (!RequiresScalarEpilogueCheck) {
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ return Plan;
+ }
+
+ // If needed, add a check in the middle block to see if we have completed
+ // all of the iterations in the first vector loop. Three cases:
+ // 1) If (N - N%VF) == N, then we *don't* need to run the remainder.
+ // Thus if tail is to be folded, we know we don't need to run the
+ // remainder and we can set the condition to true.
+ // 2) If we require a scalar epilogue, there is no conditional branch as
+ // we unconditionally branch to the scalar preheader. Do nothing.
+ // 3) Otherwise, construct a runtime check.
+ BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
+ auto *VPExitBlock = new VPIRBasicBlock(IRExitBlock);
+ // The connection order corresponds to the operands of the conditional branch.
+ VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+
+ auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
+ // Here we use the same DebugLoc as the scalar loop latch terminator instead
+ // of the corresponding compare because they may have ended up with
+ // different line numbers and we want to avoid awkward line stepping while
+  // debugging. E.g., if the compare has a line number inside the loop.
+ VPBuilder Builder(MiddleVPBB);
+ VPValue *Cmp =
+ TailFolded
+ ? Plan->getOrAddLiveIn(ConstantInt::getTrue(
+ IntegerType::getInt1Ty(TripCount->getType()->getContext())))
+ : Builder.createICmp(CmpInst::ICMP_EQ, Plan->getTripCount(),
+ &Plan->getVectorTripCount(),
+ ScalarLatchTerm->getDebugLoc(), "cmp.n");
+ Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp},
+ ScalarLatchTerm->getDebugLoc());
return Plan;
}
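// Resulting skeleton, sketched from the code above (shape only, for the case
// where a scalar epilogue check is required):
//
//   entry (VPIRBasicBlock wrapping the IR loop preheader)
//     -> vector.ph
//       -> [ vector loop region: vector.body -> vector.latch ]
//         -> middle.block --(BranchOnCond cmp.n)--> exit block (VPIRBasicBlock)
//                           \-----------------------> scalar.ph
//
// Without the check, middle.block is connected to scalar.ph only.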
@@ -732,31 +922,26 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
auto *TCMO = Builder.CreateSub(TripCountV,
ConstantInt::get(TripCountV->getType(), 1),
"trip.count.minus.1");
- auto VF = State.VF;
- Value *VTCMO =
- VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
- State.set(BackedgeTakenCount, VTCMO, Part);
+ BackedgeTakenCount->setUnderlyingValue(TCMO);
}
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
- State.set(&VectorTripCount, VectorTripCountV, Part);
+ VectorTripCount.setUnderlyingValue(VectorTripCountV);
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
- State.set(&VFxUF,
- createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF),
- 0);
+ VFxUF.setUnderlyingValue(
+ createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
// When vectorizing the epilogue loop, the canonical induction start value
// needs to be changed from zero to the value after the main vector loop.
// FIXME: Improve modeling for canonical IV start values in the epilogue loop.
if (CanonicalIVStartValue) {
- VPValue *VPV = getVPValueOrAddLiveIn(CanonicalIVStartValue);
+ VPValue *VPV = getOrAddLiveIn(CanonicalIVStartValue);
auto *IV = getCanonicalIV();
assert(all_of(IV->users(),
[](const VPUser *U) {
return isa<VPScalarIVStepsRecipe>(U) ||
+ isa<VPScalarCastRecipe>(U) ||
isa<VPDerivedIVRecipe>(U) ||
cast<VPInstruction>(U)->getOpcode() ==
Instruction::Add;
@@ -767,20 +952,69 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
}
}
+/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
+/// VPBB are moved to the newly created VPIRBasicBlock. VPBB must have a single
+/// predecessor, which is rewired to the new VPIRBasicBlock. All successors of
+/// VPBB, if any, are rewired to the new VPIRBasicBlock.
+static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
+ VPIRBasicBlock *IRMiddleVPBB = new VPIRBasicBlock(IRBB);
+ for (auto &R : make_early_inc_range(*VPBB))
+ R.moveBefore(*IRMiddleVPBB, IRMiddleVPBB->end());
+ VPBlockBase *PredVPBB = VPBB->getSinglePredecessor();
+ VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
+ VPBlockUtils::connectBlocks(PredVPBB, IRMiddleVPBB);
+ for (auto *Succ : to_vector(VPBB->getSuccessors())) {
+ VPBlockUtils::connectBlocks(IRMiddleVPBB, Succ);
+ VPBlockUtils::disconnectBlocks(VPBB, Succ);
+ }
+ delete VPBB;
+}
+
/// Generate the code inside the preheader and body of the vectorized loop.
/// Assumes a single pre-header basic-block was created for this. Introduce
/// additional basic-blocks as needed, and fill them all.
void VPlan::execute(VPTransformState *State) {
- // Set the reverse mapping from VPValues to Values for code generation.
- for (auto &Entry : Value2VPValue)
- State->VPValue2Value[Entry.second] = Entry.first;
-
// Initialize CFG state.
State->CFG.PrevVPBB = nullptr;
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
BasicBlock *VectorPreHeader = State->CFG.PrevBB;
State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
+ // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
+ cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
+
+ // Replace regular VPBB's for the middle and scalar preheader blocks with
+ // VPIRBasicBlocks wrapping their IR blocks. The IR blocks are created during
+ // skeleton creation, so we can only create the VPIRBasicBlocks now during
+ // VPlan execution rather than earlier during VPlan construction.
+ BasicBlock *MiddleBB = State->CFG.ExitBB;
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ // Find the VPBB for the scalar preheader, relying on the current structure
+  // when creating the middle block and its successors: if there's a single
+  // successor, it must be the scalar preheader. Otherwise, the second
+ // successor is the scalar preheader.
+ BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor();
+ auto &MiddleSuccs = MiddleVPBB->getSuccessors();
+ assert((MiddleSuccs.size() == 1 || MiddleSuccs.size() == 2) &&
+ "middle block has unexpected successors");
+ VPBasicBlock *ScalarPhVPBB = cast<VPBasicBlock>(
+ MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]);
+ assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
+ "scalar preheader cannot be wrapped already");
+ replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
+ replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
+
+ // Disconnect the middle block from its single successor (the scalar loop
+ // header) in both the CFG and DT. The branch will be recreated during VPlan
+ // execution.
+ auto *BrInst = new UnreachableInst(MiddleBB->getContext());
+ BrInst->insertBefore(MiddleBB->getTerminator());
+ MiddleBB->getTerminator()->eraseFromParent();
+ State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
+
// Generate code in the loop pre-header and body.
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
Block->execute(State);
@@ -803,11 +1037,8 @@ void VPlan::execute(VPTransformState *State) {
Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
} else {
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
- // TODO: Split off the case that all users of a pointer phi are scalar
- // from the VPWidenPointerInductionRecipe.
- if (WidenPhi->onlyScalarsGenerated(State->VF))
- continue;
-
+ assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+ "recipe generating only scalars should have been replaced");
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
Phi = cast<PHINode>(GEP->getPointerOperand());
}
@@ -826,27 +1057,36 @@ void VPlan::execute(VPTransformState *State) {
// only a single part is generated, which provides the last part from the
// previous iteration. For non-ordered reductions all UF parts are
// generated.
- bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
- isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
+ bool SinglePartNeeded =
+ isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
+ bool NeedsScalar =
+ isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *Phi = State->get(PhiR, Part);
- Value *Val = State->get(PhiR->getBackedgeValue(),
- SinglePartNeeded ? State->UF - 1 : Part);
+ Value *Phi = State->get(PhiR, Part, NeedsScalar);
+ Value *Val =
+ State->get(PhiR->getBackedgeValue(),
+ SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
}
- // We do not attempt to preserve DT for outer loop vectorization currently.
- if (!EnableVPlanNativePath) {
- BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
- State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
- updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
- State->CFG.ExitBB);
- }
+ State->CFG.DTU.flush();
+ assert(State->CFG.DTU.getDomTree().verify(
+ DominatorTree::VerificationLevel::Fast) &&
+ "DT not preserved correctly");
+}
+
+InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
+ // For now only return the cost of the vector loop region, ignoring any other
+ // blocks, like the preheader or middle blocks.
+ return getVectorLoopRegion()->cost(VF, Ctx);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -944,42 +1184,85 @@ void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
LiveOuts.insert({PN, new VPLiveOut(PN, V)});
}
-void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
- BasicBlock *LoopLatchBB,
- BasicBlock *LoopExitBB) {
- // The vector body may be more than a single basic-block by this point.
- // Update the dominator tree information inside the vector body by propagating
- // it from header to latch, expecting only triangular control-flow, if any.
- BasicBlock *PostDomSucc = nullptr;
- for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
- // Get the list of successors of this block.
- std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
- assert(Succs.size() <= 2 &&
- "Basic block in vector loop has more than 2 successors.");
- PostDomSucc = Succs[0];
- if (Succs.size() == 1) {
- assert(PostDomSucc->getSinglePredecessor() &&
- "PostDom successor has more than one predecessor.");
- DT->addNewBlock(PostDomSucc, BB);
- continue;
- }
- BasicBlock *InterimSucc = Succs[1];
- if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
- PostDomSucc = Succs[1];
- InterimSucc = Succs[0];
+static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
+ DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
+ // Update the operands of all cloned recipes starting at NewEntry. This
+ // traverses all reachable blocks. This is done in two steps, to handle cycles
+ // in PHI recipes.
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
+ OldDeepRPOT(Entry);
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>>
+ NewDeepRPOT(NewEntry);
+ // First, collect all mappings from old to new VPValues defined by cloned
+ // recipes.
+ for (const auto &[OldBB, NewBB] :
+ zip(VPBlockUtils::blocksOnly<VPBasicBlock>(OldDeepRPOT),
+ VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT))) {
+ assert(OldBB->getRecipeList().size() == NewBB->getRecipeList().size() &&
+ "blocks must have the same number of recipes");
+ for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
+ assert(OldR.getNumOperands() == NewR.getNumOperands() &&
+ "recipes must have the same number of operands");
+ assert(OldR.getNumDefinedValues() == NewR.getNumDefinedValues() &&
+ "recipes must define the same number of operands");
+ for (const auto &[OldV, NewV] :
+ zip(OldR.definedValues(), NewR.definedValues()))
+ Old2NewVPValues[OldV] = NewV;
}
- assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
- "One successor of a basic block does not lead to the other.");
- assert(InterimSucc->getSinglePredecessor() &&
- "Interim successor has more than one predecessor.");
- assert(PostDomSucc->hasNPredecessors(2) &&
- "PostDom successor has more than two predecessors.");
- DT->addNewBlock(InterimSucc, BB);
- DT->addNewBlock(PostDomSucc, BB);
}
- // Latch block is a new dominator for the loop exit.
- DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+
+ // Update all operands to use cloned VPValues.
+ for (VPBasicBlock *NewBB :
+ VPBlockUtils::blocksOnly<VPBasicBlock>(NewDeepRPOT)) {
+ for (VPRecipeBase &NewR : *NewBB)
+ for (unsigned I = 0, E = NewR.getNumOperands(); I != E; ++I) {
+ VPValue *NewOp = Old2NewVPValues.lookup(NewR.getOperand(I));
+ NewR.setOperand(I, NewOp);
+ }
+ }
+}
+
+VPlan *VPlan::duplicate() {
+ // Clone blocks.
+ VPBasicBlock *NewPreheader = Preheader->clone();
+ const auto &[NewEntry, __] = cloneFrom(Entry);
+
+ // Create VPlan, clone live-ins and remap operands in the cloned blocks.
+ auto *NewPlan = new VPlan(NewPreheader, cast<VPBasicBlock>(NewEntry));
+ DenseMap<VPValue *, VPValue *> Old2NewVPValues;
+ for (VPValue *OldLiveIn : VPLiveInsToFree) {
+ Old2NewVPValues[OldLiveIn] =
+ NewPlan->getOrAddLiveIn(OldLiveIn->getLiveInIRValue());
+ }
+ Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
+ Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
+ if (BackedgeTakenCount) {
+ NewPlan->BackedgeTakenCount = new VPValue();
+ Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
+ }
+ assert(TripCount && "trip count must be set");
+ if (TripCount->isLiveIn())
+ Old2NewVPValues[TripCount] =
+ NewPlan->getOrAddLiveIn(TripCount->getLiveInIRValue());
+ // else NewTripCount will be created and inserted into Old2NewVPValues when
+ // TripCount is cloned. In any case NewPlan->TripCount is updated below.
+
+ remapOperands(Preheader, NewPreheader, Old2NewVPValues);
+ remapOperands(Entry, NewEntry, Old2NewVPValues);
+
+ // Clone live-outs.
+ for (const auto &[_, LO] : LiveOuts)
+ NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
+
+ // Initialize remaining fields of cloned VPlan.
+ NewPlan->VFs = VFs;
+ NewPlan->UFs = UFs;
+ // TODO: Adjust names.
+ NewPlan->Name = Name;
+ assert(Old2NewVPValues.contains(TripCount) &&
+ "TripCount must have been added to Old2NewVPValues");
+ NewPlan->TripCount = Old2NewVPValues[TripCount];
+ return NewPlan;
}
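// Usage sketch (hypothetical, not from this commit): duplicate() yields a deep
// copy with its own blocks, recipes and live-ins, so a caller can transform or
// cost one copy per candidate without touching the original plan, e.g.
//   VPlan *Candidate = Plan.duplicate();
//   ... mutate and evaluate Candidate ...
//   delete Candidate; // the original Plan is unaffected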
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1168,18 +1451,7 @@ void VPValue::replaceUsesWithIf(
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
- if (const Value *UV = getUnderlyingValue()) {
- OS << "ir<";
- UV->printAsOperand(OS, false);
- OS << ">";
- return;
- }
-
- unsigned Slot = Tracker.getSlot(this);
- if (Slot == unsigned(-1))
- OS << "<badref>";
- else
- OS << "vp<%" << Tracker.getSlot(this) << ">";
+ OS << Tracker.getOrCreateName(this);
}
void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
@@ -1203,7 +1475,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
InterleavedAccessInfo &IAI) {
if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
for (VPRecipeBase &VPI : *VPBB) {
- if (isa<VPHeaderPHIRecipe>(&VPI))
+ if (isa<VPWidenPHIRecipe>(&VPI))
continue;
assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
auto *VPInst = cast<VPInstruction>(&VPI);
@@ -1241,40 +1513,98 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
}
-void VPSlotTracker::assignSlot(const VPValue *V) {
- assert(!Slots.contains(V) && "VPValue already has a slot!");
- Slots[V] = NextSlot++;
+void VPSlotTracker::assignName(const VPValue *V) {
+ assert(!VPValue2Name.contains(V) && "VPValue already has a name!");
+ auto *UV = V->getUnderlyingValue();
+ if (!UV) {
+ VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str();
+ NextSlot++;
+ return;
+ }
+
+ // Use the name of the underlying Value, wrapped in "ir<>", and versioned by
+ // appending ".Number" to the name if there are multiple uses.
+ std::string Name;
+ raw_string_ostream S(Name);
+ UV->printAsOperand(S, false);
+ assert(!Name.empty() && "Name cannot be empty.");
+ std::string BaseName = (Twine("ir<") + Name + Twine(">")).str();
+
+ // First assign the base name for V.
+ const auto &[A, _] = VPValue2Name.insert({V, BaseName});
+  // Integer or FP constants with different types will result in the same string
+ // due to stripping types.
+ if (V->isLiveIn() && isa<ConstantInt, ConstantFP>(UV))
+ return;
+
+ // If it is already used by C > 0 other VPValues, increase the version counter
+ // C and use it for V.
+ const auto &[C, UseInserted] = BaseName2Version.insert({BaseName, 0});
+ if (!UseInserted) {
+ C->second++;
+ A->second = (BaseName + Twine(".") + Twine(C->second)).str();
+ }
}
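// Standalone sketch (not part of this commit) of the versioning scheme above:
// the first VPValue printing as "%x" keeps the plain "ir<%x>" name, and later
// distinct VPValues with the same base string get "ir<%x>.1", "ir<%x>.2", and
// so on. Plain std::string/std::map stand in for the LLVM types so the snippet
// compiles on its own.
#include <map>
#include <string>
static std::string versionedName(const std::string &BaseName,
                                 std::map<std::string, unsigned> &BaseName2Version) {
  auto [It, Inserted] = BaseName2Version.insert({BaseName, 0});
  if (Inserted)
    return BaseName;              // first use keeps the base name
  ++It->second;                   // later uses get an increasing ".N" suffix
  return BaseName + "." + std::to_string(It->second);
}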
-void VPSlotTracker::assignSlots(const VPlan &Plan) {
+void VPSlotTracker::assignNames(const VPlan &Plan) {
if (Plan.VFxUF.getNumUsers() > 0)
- assignSlot(&Plan.VFxUF);
- assignSlot(&Plan.VectorTripCount);
+ assignName(&Plan.VFxUF);
+ assignName(&Plan.VectorTripCount);
if (Plan.BackedgeTakenCount)
- assignSlot(Plan.BackedgeTakenCount);
- assignSlots(Plan.getPreheader());
+ assignName(Plan.BackedgeTakenCount);
+ for (VPValue *LI : Plan.VPLiveInsToFree)
+ assignName(LI);
+ assignNames(Plan.getPreheader());
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
for (const VPBasicBlock *VPBB :
VPBlockUtils::blocksOnly<const VPBasicBlock>(RPOT))
- assignSlots(VPBB);
+ assignNames(VPBB);
}
-void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
+void VPSlotTracker::assignNames(const VPBasicBlock *VPBB) {
for (const VPRecipeBase &Recipe : *VPBB)
for (VPValue *Def : Recipe.definedValues())
- assignSlot(Def);
+ assignName(Def);
}
-bool vputils::onlyFirstLaneUsed(VPValue *Def) {
+std::string VPSlotTracker::getOrCreateName(const VPValue *V) const {
+ std::string Name = VPValue2Name.lookup(V);
+ if (!Name.empty())
+ return Name;
+
+ // If no name was assigned, no VPlan was provided when creating the slot
+ // tracker or it is not reachable from the provided VPlan. This can happen,
+ // e.g. when trying to print a recipe that has not been inserted into a VPlan
+ // in a debugger.
+ // TODO: Update VPSlotTracker constructor to assign names to recipes &
+ // VPValues not associated with a VPlan, instead of constructing names ad-hoc
+ // here.
+ const VPRecipeBase *DefR = V->getDefiningRecipe();
+ (void)DefR;
+ assert((!DefR || !DefR->getParent() || !DefR->getParent()->getPlan()) &&
+ "VPValue defined by a recipe in a VPlan?");
+
+ // Use the underlying value's name, if there is one.
+ if (auto *UV = V->getUnderlyingValue()) {
+ std::string Name;
+ raw_string_ostream S(Name);
+ UV->printAsOperand(S, false);
+ return (Twine("ir<") + Name + ">").str();
+ }
+
+ return "<badref>";
+}
+
+bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
return all_of(Def->users(),
- [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); });
+ [Def](const VPUser *U) { return U->onlyFirstLaneUsed(Def); });
}
-bool vputils::onlyFirstPartUsed(VPValue *Def) {
+bool vputils::onlyFirstPartUsed(const VPValue *Def) {
return all_of(Def->users(),
- [Def](VPUser *U) { return U->onlyFirstPartUsed(Def); });
+ [Def](const VPUser *U) { return U->onlyFirstPartUsed(Def); });
}
VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
@@ -1283,9 +1613,9 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
return Expanded;
VPValue *Expanded = nullptr;
if (auto *E = dyn_cast<SCEVConstant>(Expr))
- Expanded = Plan.getVPValueOrAddLiveIn(E->getValue());
+ Expanded = Plan.getOrAddLiveIn(E->getValue());
else if (auto *E = dyn_cast<SCEVUnknown>(Expr))
- Expanded = Plan.getVPValueOrAddLiveIn(E->getValue());
+ Expanded = Plan.getOrAddLiveIn(E->getValue());
else {
Expanded = new VPExpandSCEVRecipe(Expr, SE);
Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe());
@@ -1293,3 +1623,23 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
Plan.addSCEVExpansion(Expr, Expanded);
return Expanded;
}
+
+bool vputils::isHeaderMask(VPValue *V, VPlan &Plan) {
+ if (isa<VPActiveLaneMaskPHIRecipe>(V))
+ return true;
+
+ auto IsWideCanonicalIV = [](VPValue *A) {
+ return isa<VPWidenCanonicalIVRecipe>(A) ||
+ (isa<VPWidenIntOrFpInductionRecipe>(A) &&
+ cast<VPWidenIntOrFpInductionRecipe>(A)->isCanonical());
+ };
+
+ VPValue *A, *B;
+ if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
+ return B == Plan.getTripCount() &&
+ (match(A, m_ScalarIVSteps(m_CanonicalIV(), m_SpecificInt(1))) ||
+ IsWideCanonicalIV(A));
+
+ return match(V, m_Binary<Instruction::ICmp>(m_VPValue(A), m_VPValue(B))) &&
+ IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount();
+}
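// Patterns accepted above, sketched in VPlan-dump notation (illustrative only):
//   %mask = active lane mask (%wide-canonical-iv, %trip-count)
//   %mask = active lane mask (scalar-steps(%canonical-iv, 1), %trip-count)
//   %mask = icmp of %wide-canonical-iv against %backedge-taken-count
//           (any ICmp is matched; header masks are normally emitted as ule)
// plus a VPActiveLaneMaskPHIRecipe itself; anything else is not treated as a
// header mask.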