diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2022-01-27 22:06:42 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2022-01-27 22:06:42 +0000 |
| commit | 6f8fc217eaa12bf657be1c6468ed9938d10168b3 (patch) | |
| tree | a1fd89b864d9b93e2ad68fe1dcf7afee2e3c8d76 /llvm/lib/Transforms/Vectorize/VPlan.cpp | |
| parent | 77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (diff) | |
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlan.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 244 |
1 files changed, 187 insertions, 57 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1d9e71663cd2..a96c122db2a9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -677,10 +677,10 @@ void VPInstruction::generateInstruction(VPTransformState &State, // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); // Get the original loop tripcount. - Value *ScalarTC = State.TripCount; + Value *ScalarTC = State.get(getOperand(1), Part); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); + auto *PredTy = VectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); @@ -711,6 +711,51 @@ void VPInstruction::generateInstruction(VPTransformState &State, } break; } + + case VPInstruction::CanonicalIVIncrement: + case VPInstruction::CanonicalIVIncrementNUW: { + Value *Next = nullptr; + if (Part == 0) { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; + auto *Phi = State.get(getOperand(0), 0); + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). + Value *Step = + createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + } else { + Next = State.get(this, 0); + } + + State.set(this, Next, Part); + break; + } + case VPInstruction::BranchOnCount: { + if (Part != 0) + break; + // First create the compare. + Value *IV = State.get(getOperand(0), Part); + Value *TC = State.get(getOperand(1), Part); + Value *Cond = Builder.CreateICmpEQ(IV, TC); + + // Now create the branch. + auto *Plan = getParent()->getPlan(); + VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); + if (Header->empty()) { + assert(EnableVPlanNativePath && + "empty entry block only expected in VPlanNativePath"); + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + } + // TODO: Once the exit block is modeled in VPlan, use it instead of going + // through State.CFG.LastBB. + BasicBlock *Exit = + cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0); + + Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]); + Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -758,6 +803,15 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; + case VPInstruction::CanonicalIVIncrement: + O << "VF * UF + "; + break; + case VPInstruction::CanonicalIVIncrementNUW: + O << "VF * UF +(nuw) "; + break; + case VPInstruction::BranchOnCount: + O << "branch-on-count "; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -786,23 +840,55 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { FMF = FMFNew; } -/// Generate the code inside the body of the vectorized loop. Assumes a single -/// LoopVectorBody basic-block was created for this. Introduce additional -/// basic-blocks as needed, and fill them all. -void VPlan::execute(VPTransformState *State) { - // -1. Check if the backedge taken count is needed, and if so build it. +void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, + Value *CanonicalIVStartValue, + VPTransformState &State) { + // Check if the trip count is needed, and if so build it. + if (TripCount && TripCount->getNumUsers()) { + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(TripCount, TripCountV, Part); + } + + // Check if the backedge taken count is needed, and if so build it. if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { - Value *TC = State->TripCount; - IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); - auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + auto *TCMO = Builder.CreateSub(TripCountV, + ConstantInt::get(TripCountV->getType(), 1), "trip.count.minus.1"); - auto VF = State->VF; + auto VF = State.VF; Value *VTCMO = VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); - for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) - State->set(BackedgeTakenCount, VTCMO, Part); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(BackedgeTakenCount, VTCMO, Part); } + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(&VectorTripCount, VectorTripCountV, Part); + + // When vectorizing the epilogue loop, the canonical induction start value + // needs to be changed from zero to the value after the main vector loop. + if (CanonicalIVStartValue) { + VPValue *VPV = new VPValue(CanonicalIVStartValue); + addExternalDef(VPV); + auto *IV = getCanonicalIV(); + assert(all_of(IV->users(), + [](const VPUser *U) { + auto *VPI = cast<VPInstruction>(U); + return VPI->getOpcode() == + VPInstruction::CanonicalIVIncrement || + VPI->getOpcode() == + VPInstruction::CanonicalIVIncrementNUW; + }) && + "the canonical IV should only be used by its increments when " + "resetting the start value"); + IV->setOperand(0, VPV); + } +} + +/// Generate the code inside the body of the vectorized loop. Assumes a single +/// LoopVectorBody basic-block was created for this. Introduce additional +/// basic-blocks as needed, and fill them all. +void VPlan::execute(VPTransformState *State) { // 0. Set the reverse mapping from VPValues to Values for code generation. for (auto &Entry : Value2VPValue) State->VPValue2Value[Entry.second] = Entry.first; @@ -834,28 +920,6 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State); - // Fix the latch value of reduction and first-order recurrences phis in the - // vector loop. - VPBasicBlock *Header = Entry->getEntryBasicBlock(); - for (VPRecipeBase &R : Header->phis()) { - auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R); - if (!PhiR || !(isa<VPFirstOrderRecurrencePHIRecipe>(&R) || - isa<VPReductionPHIRecipe>(&R))) - continue; - // For first-order recurrences and in-order reduction phis, only a single - // part is generated, which provides the last part from the previous - // iteration. Otherwise all UF parts are generated. - bool SinglePartNeeded = isa<VPFirstOrderRecurrencePHIRecipe>(&R) || - cast<VPReductionPHIRecipe>(&R)->isOrdered(); - unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *VecPhi = State->get(PhiR, Part); - Value *Val = State->get(PhiR->getBackedgeValue(), - SinglePartNeeded ? State->UF - 1 : Part); - cast<PHINode>(VecPhi)->addIncoming(Val, VectorLatchBB); - } - } - // Setup branch terminator successors for VPBBs in VPBBsToFix based on // VPBB's successors. for (auto VPBB : State->CFG.VPBBsToFix) { @@ -876,13 +940,19 @@ void VPlan::execute(VPTransformState *State) { // 3. Merge the temporary latch created with the last basic-block filled. BasicBlock *LastBB = State->CFG.PrevBB; + assert(isa<BranchInst>(LastBB->getTerminator()) && + "Expected VPlan CFG to terminate with branch"); + + // Move both the branch and check from LastBB to VectorLatchBB. + auto *LastBranch = cast<BranchInst>(LastBB->getTerminator()); + LastBranch->moveBefore(VectorLatchBB->getTerminator()); + VectorLatchBB->getTerminator()->eraseFromParent(); + // Move condition so it is guaranteed to be next to branch. This is only done + // to avoid excessive test updates. + // TODO: Remove special handling once the increments for all inductions are + // modeled explicitly in VPlan. + cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch); // Connect LastBB to VectorLatchBB to facilitate their merge. - assert((EnableVPlanNativePath || - isa<UnreachableInst>(LastBB->getTerminator())) && - "Expected InnerLoop VPlan CFG to terminate with unreachable"); - assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) && - "Expected VPlan CFG to terminate with branch in NativePath"); - LastBB->getTerminator()->eraseFromParent(); BranchInst::Create(VectorLatchBB, LastBB); // Merge LastBB with Latch. @@ -891,6 +961,37 @@ void VPlan::execute(VPTransformState *State) { assert(Merged && "Could not merge last basic block with latch."); VectorLatchBB = LastBB; + // Fix the latch value of canonical, reduction and first-order recurrences + // phis in the vector loop. + VPBasicBlock *Header = Entry->getEntryBasicBlock(); + if (Header->empty()) { + assert(EnableVPlanNativePath); + Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); + } + for (VPRecipeBase &R : Header->phis()) { + // Skip phi-like recipes that generate their backedege values themselves. + // TODO: Model their backedge values explicitly. + if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R)) + continue; + + auto *PhiR = cast<VPHeaderPHIRecipe>(&R); + // For canonical IV, first-order recurrences and in-order reduction phis, + // only a single part is generated, which provides the last part from the + // previous iteration. For non-ordered reductions all UF parts are + // generated. + bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) || + isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) || + cast<VPReductionPHIRecipe>(PhiR)->isOrdered(); + unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; + + for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { + Value *Phi = State->get(PhiR, Part); + Value *Val = State->get(PhiR->getBackedgeValue(), + SinglePartNeeded ? State->UF - 1 : Part); + cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB); + } + } + // We do not attempt to preserve DT for outer loop vectorization currently. if (!EnableVPlanNativePath) updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, @@ -904,6 +1005,12 @@ void VPlan::print(raw_ostream &O) const { O << "VPlan '" << Name << "' {"; + if (VectorTripCount.getNumUsers() > 0) { + O << "\nLive-in "; + VectorTripCount.printAsOperand(O, SlotTracker); + O << " = vector-trip-count\n"; + } + if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { O << "\nLive-in "; BackedgeTakenCount->printAsOperand(O, SlotTracker); @@ -1155,7 +1262,15 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, } else O << " " << VPlanIngredient(IV); } +#endif +bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); + auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep()); + return StartC && StartC->isZero() && StepC && StepC->isOne(); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; @@ -1255,7 +1370,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN "; if (!isStore()) { - getVPSingleValue()->printAsOperand(O, SlotTracker); + printAsOperand(O, SlotTracker); O << " = "; } O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; @@ -1264,26 +1379,39 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { + Value *Start = getStartValue()->getLiveInIRValue(); + PHINode *EntryPart = PHINode::Create( + Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); + EntryPart->addIncoming(Start, State.CFG.VectorPreHeader); + EntryPart->setDebugLoc(DL); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) + State.set(this, EntryPart, Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + printAsOperand(O, SlotTracker); + O << " = CANONICAL-INDUCTION"; +} +#endif + void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.CanonicalIV; + Value *CanonicalIV = State.get(getOperand(0), 0); Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; - assert(!VF.isScalable() && "the code following assumes non scalables ECs"); Value *VStart = VF.isScalar() ? CanonicalIV - : Builder.CreateVectorSplat(VF.getKnownMinValue(), - CanonicalIV, "broadcast"); + : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - SmallVector<Constant *, 8> Indices; - for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - Indices.push_back( - ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); - // If VF == 1, there is only one iteration in the loop above, thus the - // element pushed back into Indices is ConstantInt::get(STy, Part) - Constant *VStep = - VF.isScalar() ? Indices.back() : ConstantVector::get(Indices); - // Add the consecutive indices to the vector value. + Value *VStep = createStepForVF(Builder, STy, VF, Part); + if (VF.isVector()) { + VStep = Builder.CreateVectorSplat(VF, VStep); + VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); + } Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); State.set(this, CanonicalVectorIV, Part); } @@ -1294,7 +1422,8 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; printAsOperand(O, SlotTracker); - O << " = WIDEN-CANONICAL-INDUCTION"; + O << " = WIDEN-CANONICAL-INDUCTION "; + printOperands(O, SlotTracker); } #endif @@ -1461,7 +1590,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, InterleavedAccessInfo &IAI) { if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) { for (VPRecipeBase &VPI : *VPBB) { - if (isa<VPWidenPHIRecipe>(&VPI)) + if (isa<VPHeaderPHIRecipe>(&VPI)) continue; assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions"); auto *VPInst = cast<VPInstruction>(&VPI); @@ -1506,6 +1635,7 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) { for (const VPValue *V : Plan.VPExternalDefs) assignSlot(V); + assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); |
