path: root/llvm/lib/Transforms/Vectorize/VPlan.cpp
author     Dimitry Andric <dim@FreeBSD.org>  2022-01-27 22:06:42 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2022-01-27 22:06:42 +0000
commit     6f8fc217eaa12bf657be1c6468ed9938d10168b3 (patch)
tree       a1fd89b864d9b93e2ad68fe1dcf7afee2e3c8d76 /llvm/lib/Transforms/Vectorize/VPlan.cpp
parent     77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlan.cpp')
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 244
 1 file changed, 187 insertions(+), 57 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1d9e71663cd2..a96c122db2a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -677,10 +677,10 @@ void VPInstruction::generateInstruction(VPTransformState &State,
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
// Get the original loop tripcount.
- Value *ScalarTC = State.TripCount;
+ Value *ScalarTC = State.get(getOperand(1), Part);
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue());
+ auto *PredTy = VectorType::get(Int1Ty, State.VF);
Instruction *Call = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
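
Two changes in this hunk: the scalar trip count is now consumed as a regular VPlan operand (operand 1) instead of being read out of VPTransformState, and the predicate type is built with the ElementCount-aware VectorType::get, so the same code serves fixed and scalable vectorization factors. A minimal standalone sketch of the type difference (a sketch, not part of the patch):

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    LLVMContext Ctx;
    auto *Int1Ty = Type::getInt1Ty(Ctx);
    // A fixed VF of 4 yields <4 x i1>.
    auto *FixedPredTy = VectorType::get(Int1Ty, ElementCount::getFixed(4));
    // A scalable VF of 4 yields <vscale x 4 x i1>, which the old
    // FixedVectorType::get call could not express.
    auto *ScalablePredTy = VectorType::get(Int1Ty, ElementCount::getScalable(4));
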
@@ -711,6 +711,51 @@ void VPInstruction::generateInstruction(VPTransformState &State,
}
break;
}
+
+ case VPInstruction::CanonicalIVIncrement:
+ case VPInstruction::CanonicalIVIncrementNUW: {
+ Value *Next = nullptr;
+ if (Part == 0) {
+ bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
+ auto *Phi = State.get(getOperand(0), 0);
+ // The loop step is equal to the vectorization factor (num of SIMD
+ // elements) times the unroll factor (num of SIMD instructions).
+ Value *Step =
+ createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
+ Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false);
+ } else {
+ Next = State.get(this, 0);
+ }
+
+ State.set(this, Next, Part);
+ break;
+ }
+ case VPInstruction::BranchOnCount: {
+ if (Part != 0)
+ break;
+ // First create the compare.
+ Value *IV = State.get(getOperand(0), Part);
+ Value *TC = State.get(getOperand(1), Part);
+ Value *Cond = Builder.CreateICmpEQ(IV, TC);
+
+ // Now create the branch.
+ auto *Plan = getParent()->getPlan();
+ VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
+ if (Header->empty()) {
+ assert(EnableVPlanNativePath &&
+ "empty entry block only expected in VPlanNativePath");
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ }
+ // TODO: Once the exit block is modeled in VPlan, use it instead of going
+ // through State.CFG.LastBB.
+ BasicBlock *Exit =
+ cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0);
+
+ Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]);
+ Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
+ break;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
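
The two increment opcodes advance the scalar canonical IV by VF * UF once per vector-loop iteration; the add is emitted only for part 0, and the remaining parts reuse that value because the canonical IV is uniform across unrolled parts. createStepForVF folds the product to a plain constant for fixed VFs and scales it by the runtime vscale for scalable VFs; roughly (a sketch, not necessarily the exact LLVM implementation):

    // Step = UF * VF; for a scalable VF the known minimum element count is
    // multiplied by the runtime vscale.
    Value *createStepForVF(IRBuilder<> &Builder, Type *Ty, ElementCount VF,
                           int64_t UF) {
      Constant *Step = ConstantInt::get(Ty, UF * VF.getKnownMinValue());
      return VF.isScalable() ? Builder.CreateVScale(Step) : Step;
    }

BranchOnCount then compares the incremented IV against the vector trip count and branches either to the exit or back to the loop header, erasing the placeholder terminator it finds at the insertion point.
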
@@ -758,6 +803,15 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
+ case VPInstruction::CanonicalIVIncrement:
+ O << "VF * UF + ";
+ break;
+ case VPInstruction::CanonicalIVIncrementNUW:
+ O << "VF * UF +(nuw) ";
+ break;
+ case VPInstruction::BranchOnCount:
+ O << "branch-on-count ";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -786,23 +840,55 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
FMF = FMFNew;
}
-/// Generate the code inside the body of the vectorized loop. Assumes a single
-/// LoopVectorBody basic-block was created for this. Introduce additional
-/// basic-blocks as needed, and fill them all.
-void VPlan::execute(VPTransformState *State) {
- // -1. Check if the backedge taken count is needed, and if so build it.
+void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
+ Value *CanonicalIVStartValue,
+ VPTransformState &State) {
+ // Check if the trip count is needed, and if so build it.
+ if (TripCount && TripCount->getNumUsers()) {
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(TripCount, TripCountV, Part);
+ }
+
+ // Check if the backedge taken count is needed, and if so build it.
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
- Value *TC = State->TripCount;
- IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
- auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+ auto *TCMO = Builder.CreateSub(TripCountV,
+ ConstantInt::get(TripCountV->getType(), 1),
"trip.count.minus.1");
- auto VF = State->VF;
+ auto VF = State.VF;
Value *VTCMO =
VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
- for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
- State->set(BackedgeTakenCount, VTCMO, Part);
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(BackedgeTakenCount, VTCMO, Part);
}
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(&VectorTripCount, VectorTripCountV, Part);
+
+ // When vectorizing the epilogue loop, the canonical induction start value
+ // needs to be changed from zero to the value after the main vector loop.
+ if (CanonicalIVStartValue) {
+ VPValue *VPV = new VPValue(CanonicalIVStartValue);
+ addExternalDef(VPV);
+ auto *IV = getCanonicalIV();
+ assert(all_of(IV->users(),
+ [](const VPUser *U) {
+ auto *VPI = cast<VPInstruction>(U);
+ return VPI->getOpcode() ==
+ VPInstruction::CanonicalIVIncrement ||
+ VPI->getOpcode() ==
+ VPInstruction::CanonicalIVIncrementNUW;
+ }) &&
+ "the canonical IV should only be used by its increments when "
+ "resetting the start value");
+ IV->setOperand(0, VPV);
+ }
+}
+
+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
// 0. Set the reverse mapping from VPValues to Values for code generation.
for (auto &Entry : Value2VPValue)
State->VPValue2Value[Entry.second] = Entry.first;
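
prepareToExecute materializes the plan's live-in values (the original trip count, the backedge-taken count if used, and the vector trip count) for all unroll parts before any recipe executes. The CanonicalIVStartValue parameter serves epilogue vectorization: the epilogue loop's canonical IV must resume from the iteration count already covered by the main vector loop rather than from zero. A hedged sketch of such a call (EpiloguePlan, EpilogueVectorTripCountV and MainVectorTripCount are assumed names):

    // Rewire the epilogue plan's canonical IV to start where the main
    // vector loop stopped.
    EpiloguePlan.prepareToExecute(TripCountV, EpilogueVectorTripCountV,
                                  /*CanonicalIVStartValue=*/MainVectorTripCount,
                                  State);
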
@@ -834,28 +920,6 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : depth_first(Entry))
Block->execute(State);
- // Fix the latch value of reduction and first-order recurrences phis in the
- // vector loop.
- VPBasicBlock *Header = Entry->getEntryBasicBlock();
- for (VPRecipeBase &R : Header->phis()) {
- auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
- if (!PhiR || !(isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
- isa<VPReductionPHIRecipe>(&R)))
- continue;
- // For first-order recurrences and in-order reduction phis, only a single
- // part is generated, which provides the last part from the previous
- // iteration. Otherwise all UF parts are generated.
- bool SinglePartNeeded = isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
- cast<VPReductionPHIRecipe>(&R)->isOrdered();
- unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
- for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *VecPhi = State->get(PhiR, Part);
- Value *Val = State->get(PhiR->getBackedgeValue(),
- SinglePartNeeded ? State->UF - 1 : Part);
- cast<PHINode>(VecPhi)->addIncoming(Val, VectorLatchBB);
- }
- }
-
// Setup branch terminator successors for VPBBs in VPBBsToFix based on
// VPBB's successors.
for (auto VPBB : State->CFG.VPBBsToFix) {
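
Note that the phi-fixup loop removed here is not dropped: it reappears further down, after the latch merge, generalized to also patch the backedge value of the new canonical IV phi.
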
@@ -876,13 +940,19 @@ void VPlan::execute(VPTransformState *State) {
// 3. Merge the temporary latch created with the last basic-block filled.
BasicBlock *LastBB = State->CFG.PrevBB;
+ assert(isa<BranchInst>(LastBB->getTerminator()) &&
+ "Expected VPlan CFG to terminate with branch");
+
+ // Move both the branch and check from LastBB to VectorLatchBB.
+ auto *LastBranch = cast<BranchInst>(LastBB->getTerminator());
+ LastBranch->moveBefore(VectorLatchBB->getTerminator());
+ VectorLatchBB->getTerminator()->eraseFromParent();
+ // Move condition so it is guaranteed to be next to branch. This is only done
+ // to avoid excessive test updates.
+ // TODO: Remove special handling once the increments for all inductions are
+ // modeled explicitly in VPlan.
+ cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch);
// Connect LastBB to VectorLatchBB to facilitate their merge.
- assert((EnableVPlanNativePath ||
- isa<UnreachableInst>(LastBB->getTerminator())) &&
- "Expected InnerLoop VPlan CFG to terminate with unreachable");
- assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
- "Expected VPlan CFG to terminate with branch in NativePath");
- LastBB->getTerminator()->eraseFromParent();
BranchInst::Create(VectorLatchBB, LastBB);
// Merge LastBB with Latch.
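
Previously the last VPlan-generated block ended in a placeholder terminator (an unreachable in the inner-loop path) and the real latch branch was synthesized afterwards. With BranchOnCount the conditional branch already exists when VPlan execution finishes, so it is moved, together with its compare, into the merged latch block instead of being recreated.
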
@@ -891,6 +961,37 @@ void VPlan::execute(VPTransformState *State) {
assert(Merged && "Could not merge last basic block with latch.");
VectorLatchBB = LastBB;
+ // Fix the latch value of canonical, reduction, and first-order recurrence
+ // phis in the vector loop.
+ VPBasicBlock *Header = Entry->getEntryBasicBlock();
+ if (Header->empty()) {
+ assert(EnableVPlanNativePath);
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ }
+ for (VPRecipeBase &R : Header->phis()) {
+ // Skip phi-like recipes that generate their backedge values themselves.
+ // TODO: Model their backedge values explicitly.
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R))
+ continue;
+
+ auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
+ // For canonical IV, first-order recurrences and in-order reduction phis,
+ // only a single part is generated, which provides the last part from the
+ // previous iteration. For non-ordered reductions all UF parts are
+ // generated.
+ bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered();
+ unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
+
+ for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
+ Value *Phi = State->get(PhiR, Part);
+ Value *Val = State->get(PhiR->getBackedgeValue(),
+ SinglePartNeeded ? State->UF - 1 : Part);
+ cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
+ }
+ }
+
// We do not attempt to preserve DT for outer loop vectorization currently.
if (!EnableVPlanNativePath)
updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB,
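
For single-part phis the incoming backedge value must come from the last unrolled part, which holds the value produced by the final vector iteration. A worked example assuming UF = 2 (pseudo-calls, names assumed):

    // Ordered reduction, first-order recurrence, or canonical IV
    // (SinglePartNeeded): one phi, fed by the backedge value of
    // part UF - 1 = 1.
    Phi0->addIncoming(State->get(BackedgeVPValue, 1), VectorLatchBB);
    // Unordered reduction (all parts needed): one phi per part, each fed
    // by the matching part of the backedge value.
    Phi0->addIncoming(State->get(BackedgeVPValue, 0), VectorLatchBB);
    Phi1->addIncoming(State->get(BackedgeVPValue, 1), VectorLatchBB);
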
@@ -904,6 +1005,12 @@ void VPlan::print(raw_ostream &O) const {
O << "VPlan '" << Name << "' {";
+ if (VectorTripCount.getNumUsers() > 0) {
+ O << "\nLive-in ";
+ VectorTripCount.printAsOperand(O, SlotTracker);
+ O << " = vector-trip-count\n";
+ }
+
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
O << "\nLive-in ";
BackedgeTakenCount->printAsOperand(O, SlotTracker);
@@ -1155,7 +1262,15 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
} else
O << " " << VPlanIngredient(IV);
}
+#endif
+bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
+ auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
+ auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
+ return StartC && StartC->isZero() && StepC && StepC->isOne();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-GEP ";
@@ -1255,7 +1370,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "WIDEN ";
if (!isStore()) {
- getVPSingleValue()->printAsOperand(O, SlotTracker);
+ printAsOperand(O, SlotTracker);
O << " = ";
}
O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
@@ -1264,26 +1379,39 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
+ Value *Start = getStartValue()->getLiveInIRValue();
+ PHINode *EntryPart = PHINode::Create(
+ Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
+ EntryPart->addIncoming(Start, State.CFG.VectorPreHeader);
+ EntryPart->setDebugLoc(DL);
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(this, EntryPart, Part);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = CANONICAL-INDUCTION";
+}
+#endif
+
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
- Value *CanonicalIV = State.CanonicalIV;
+ Value *CanonicalIV = State.get(getOperand(0), 0);
Type *STy = CanonicalIV->getType();
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
ElementCount VF = State.VF;
- assert(!VF.isScalable() && "the code following assumes non scalables ECs");
Value *VStart = VF.isScalar()
? CanonicalIV
- : Builder.CreateVectorSplat(VF.getKnownMinValue(),
- CanonicalIV, "broadcast");
+ : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
- SmallVector<Constant *, 8> Indices;
- for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
- Indices.push_back(
- ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane));
- // If VF == 1, there is only one iteration in the loop above, thus the
- // element pushed back into Indices is ConstantInt::get(STy, Part)
- Constant *VStep =
- VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
- // Add the consecutive indices to the vector value.
+ Value *VStep = createStepForVF(Builder, STy, VF, Part);
+ if (VF.isVector()) {
+ VStep = Builder.CreateVectorSplat(VF, VStep);
+ VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
+ }
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
State.set(this, CanonicalVectorIV, Part);
}
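
The rewrite replaces the hand-built vector of constant indices with createStepForVF plus CreateStepVector, which also covers scalable VFs. A worked example of the values produced, assuming a fixed VF of 4 and UF of 2:

    // VStart = splat(iv)                      -> <iv, iv, iv, iv>
    // Part 0: VStep = splat(0) + <0,1,2,3>    -> <0, 1, 2, 3>
    //         vec.iv = VStart + VStep         -> <iv, iv+1, iv+2, iv+3>
    // Part 1: VStep = splat(4) + <0,1,2,3>    -> <4, 5, 6, 7>
    //         vec.iv = VStart + VStep         -> <iv+4, iv+5, iv+6, iv+7>
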
@@ -1294,7 +1422,8 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
- O << " = WIDEN-CANONICAL-INDUCTION";
+ O << " = WIDEN-CANONICAL-INDUCTION ";
+ printOperands(O, SlotTracker);
}
#endif
@@ -1461,7 +1590,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
InterleavedAccessInfo &IAI) {
if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
for (VPRecipeBase &VPI : *VPBB) {
- if (isa<VPWidenPHIRecipe>(&VPI))
+ if (isa<VPHeaderPHIRecipe>(&VPI))
continue;
assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
auto *VPInst = cast<VPInstruction>(&VPI);
@@ -1506,6 +1635,7 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
for (const VPValue *V : Plan.VPExternalDefs)
assignSlot(V);
+ assignSlot(&Plan.VectorTripCount);
if (Plan.BackedgeTakenCount)
assignSlot(Plan.BackedgeTakenCount);