diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2022-07-03 14:10:23 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2022-07-03 14:10:23 +0000 |
commit | 145449b1e420787bb99721a429341fa6be3adfb6 (patch) | |
tree | 1d56ae694a6de602e348dd80165cf881a36600ed /llvm/lib/Transforms/Vectorize/VPlan.cpp | |
parent | ecbca9f5fb7d7613d2b94982c4825eb0d33d6842 (diff) |
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlan.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 1161 |
1 files changed, 294 insertions, 867 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 342d4a074e10..4d709097c306 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -23,11 +23,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" -#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" @@ -35,13 +34,13 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTreeConstruction.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <cassert> -#include <iterator> #include <string> #include <vector> @@ -60,7 +59,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { } #endif -Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder, +Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const { switch (LaneKind) { case VPLane::Kind::ScalableLast: @@ -158,25 +157,25 @@ void VPBlockBase::setPlan(VPlan *ParentPlan) { } /// \return the VPBasicBlock that is the exit of Block, possibly indirectly. -const VPBasicBlock *VPBlockBase::getExitBasicBlock() const { +const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const { const VPBlockBase *Block = this; while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) - Block = Region->getExit(); + Block = Region->getExiting(); return cast<VPBasicBlock>(Block); } -VPBasicBlock *VPBlockBase::getExitBasicBlock() { +VPBasicBlock *VPBlockBase::getExitingBasicBlock() { VPBlockBase *Block = this; while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) - Block = Region->getExit(); + Block = Region->getExiting(); return cast<VPBasicBlock>(Block); } VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() { if (!Successors.empty() || !Parent) return this; - assert(Parent->getExit() == this && - "Block w/o successors not the exit of its parent."); + assert(Parent->getExiting() == this && + "Block w/o successors not the exiting block of its parent."); return Parent->getEnclosingBlockWithSuccessors(); } @@ -188,28 +187,6 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() { return Parent->getEnclosingBlockWithPredecessors(); } -VPValue *VPBlockBase::getCondBit() { - return CondBitUser.getSingleOperandOrNull(); -} - -const VPValue *VPBlockBase::getCondBit() const { - return CondBitUser.getSingleOperandOrNull(); -} - -void VPBlockBase::setCondBit(VPValue *CV) { CondBitUser.resetSingleOpUser(CV); } - -VPValue *VPBlockBase::getPredicate() { - return PredicateUser.getSingleOperandOrNull(); -} - -const VPValue *VPBlockBase::getPredicate() const { - return PredicateUser.getSingleOperandOrNull(); -} - -void VPBlockBase::setPredicate(VPValue *CV) { - PredicateUser.resetSingleOpUser(CV); -} - void VPBlockBase::deleteCFG(VPBlockBase *Entry) { SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry)); @@ -245,6 +222,52 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { // set(Def, Extract, Instance); return Extract; } +BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) { + VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion(); + return VPBB2IRBB[LoopRegion->getPreheaderVPBB()]; +} + +void VPTransformState::addNewMetadata(Instruction *To, + const Instruction *Orig) { + // If the loop was versioned with memchecks, add the corresponding no-alias + // metadata. + if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) + LVer->annotateInstWithNoAlias(To, Orig); +} + +void VPTransformState::addMetadata(Instruction *To, Instruction *From) { + propagateMetadata(To, From); + addNewMetadata(To, From); +} + +void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) { + for (Value *V : To) { + if (Instruction *I = dyn_cast<Instruction>(V)) + addMetadata(I, From); + } +} + +void VPTransformState::setDebugLocFromInst(const Value *V) { + if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) { + const DILocation *DIL = Inst->getDebugLoc(); + + // When a FSDiscriminator is enabled, we don't need to add the multiply + // factors to the discriminators. + if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && + !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { + // FIXME: For scalable vectors, assume vscale=1. + auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); + if (NewDIL) + Builder.SetCurrentDebugLocation(*NewDIL); + else + LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " + << DIL->getFilename() << " Line: " << DIL->getLine()); + } else + Builder.SetCurrentDebugLocation(DIL); + } else + Builder.SetCurrentDebugLocation(DebugLoc()); +} BasicBlock * VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { @@ -252,43 +275,36 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { // Pred stands for Predessor. Prev stands for Previous - last visited/created. BasicBlock *PrevBB = CFG.PrevBB; BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(), - PrevBB->getParent(), CFG.LastBB); + PrevBB->getParent(), CFG.ExitBB); LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n'); // Hook up the new basic block to its predecessors. for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { - VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock(); - auto &PredVPSuccessors = PredVPBB->getSuccessors(); + VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); + auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; - // In outer loop vectorization scenario, the predecessor BBlock may not yet - // be visited(backedge). Mark the VPBasicBlock for fixup at the end of - // vectorization. We do not encounter this case in inner loop vectorization - // as we start out by building a loop skeleton with the vector loop header - // and latch blocks. As a result, we never enter this function for the - // header block in the non VPlan-native path. - if (!PredBB) { - assert(EnableVPlanNativePath && - "Unexpected null predecessor in non VPlan-native path"); - CFG.VPBBsToFix.push_back(PredVPBB); - continue; - } - assert(PredBB && "Predecessor basic-block not found building successor."); auto *PredBBTerminator = PredBB->getTerminator(); LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); + + auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator); if (isa<UnreachableInst>(PredBBTerminator)) { assert(PredVPSuccessors.size() == 1 && "Predecessor ending w/o branch must have single successor."); + DebugLoc DL = PredBBTerminator->getDebugLoc(); PredBBTerminator->eraseFromParent(); - BranchInst::Create(NewBB, PredBB); + auto *Br = BranchInst::Create(NewBB, PredBB); + Br->setDebugLoc(DL); + } else if (TermBr && !TermBr->isConditional()) { + TermBr->setSuccessor(0, NewBB); } else { - assert(PredVPSuccessors.size() == 2 && - "Predecessor ending with branch must have two successors."); + // Set each forward successor here when it is created, excluding + // backedges. A backward successor is set when the branch is created. unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; - assert(!PredBBTerminator->getSuccessor(idx) && + assert(!TermBr->getSuccessor(idx) && "Trying to reset an existing successor block."); - PredBBTerminator->setSuccessor(idx, NewBB); + TermBr->setSuccessor(idx, NewBB); } } return NewBB; @@ -300,27 +316,51 @@ void VPBasicBlock::execute(VPTransformState *State) { VPBlockBase *SingleHPred = nullptr; BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible. - // 1. Create an IR basic block, or reuse the last one if possible. - // The last IR basic block is reused, as an optimization, in three cases: - // A. the first VPBB reuses the loop header BB - when PrevVPBB is null; - // B. when the current VPBB has a single (hierarchical) predecessor which - // is PrevVPBB and the latter has a single (hierarchical) successor; and - // C. when the current VPBB is an entry of a region replica - where PrevVPBB - // is the exit of this region from a previous instance, or the predecessor - // of this region. - if (PrevVPBB && /* A */ - !((SingleHPred = getSingleHierarchicalPredecessor()) && - SingleHPred->getExitBasicBlock() == PrevVPBB && - PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */ - !(Replica && getPredecessors().empty())) { /* C */ + auto IsLoopRegion = [](VPBlockBase *BB) { + auto *R = dyn_cast<VPRegionBlock>(BB); + return R && !R->isReplicator(); + }; + + // 1. Create an IR basic block, or reuse the last one or ExitBB if possible. + if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) { + // ExitBB can be re-used for the exit block of the Plan. + NewBB = State->CFG.ExitBB; + State->CFG.PrevBB = NewBB; + + // Update the branch instruction in the predecessor to branch to ExitBB. + VPBlockBase *PredVPB = getSingleHierarchicalPredecessor(); + VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock(); + assert(PredVPB->getSingleSuccessor() == this && + "predecessor must have the current block as only successor"); + BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB]; + // The Exit block of a loop is always set to be successor 0 of the Exiting + // block. + cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB); + } else if (PrevVPBB && /* A */ + !((SingleHPred = getSingleHierarchicalPredecessor()) && + SingleHPred->getExitingBasicBlock() == PrevVPBB && + PrevVPBB->getSingleHierarchicalSuccessor() && + (SingleHPred->getParent() == getEnclosingLoopRegion() && + !IsLoopRegion(SingleHPred))) && /* B */ + !(Replica && getPredecessors().empty())) { /* C */ + // The last IR basic block is reused, as an optimization, in three cases: + // A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null; + // B. when the current VPBB has a single (hierarchical) predecessor which + // is PrevVPBB and the latter has a single (hierarchical) successor which + // both are in the same non-replicator region; and + // C. when the current VPBB is an entry of a region replica - where PrevVPBB + // is the exiting VPBB of this region from a previous instance, or the + // predecessor of this region. + NewBB = createEmptyBasicBlock(State->CFG); State->Builder.SetInsertPoint(NewBB); // Temporarily terminate with unreachable until CFG is rewired. UnreachableInst *Terminator = State->Builder.CreateUnreachable(); + // Register NewBB in its loop. In innermost loops its the same for all + // BB's. + if (State->CurrentVectorLoop) + State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI); State->Builder.SetInsertPoint(Terminator); - // Register NewBB in its loop. In innermost loops its the same for all BB's. - Loop *L = State->LI->getLoopFor(State->CFG.LastBB); - L->addBasicBlockToLoop(NewBB, *State->LI); State->CFG.PrevBB = NewBB; } @@ -334,29 +374,6 @@ void VPBasicBlock::execute(VPTransformState *State) { for (VPRecipeBase &Recipe : Recipes) Recipe.execute(*State); - VPValue *CBV; - if (EnableVPlanNativePath && (CBV = getCondBit())) { - assert(CBV->getUnderlyingValue() && - "Unexpected null underlying value for condition bit"); - - // Condition bit value in a VPBasicBlock is used as the branch selector. In - // the VPlan-native path case, since all branches are uniform we generate a - // branch instruction using the condition value from vector lane 0 and dummy - // successors. The successors are fixed later when the successor blocks are - // visited. - Value *NewCond = State->get(CBV, {0, 0}); - - // Replace the temporary unreachable terminator with the new conditional - // branch. - auto *CurrentTerminator = NewBB->getTerminator(); - assert(isa<UnreachableInst>(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional " - "branch."); - auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond); - CondBr->setSuccessor(0, nullptr); - ReplaceInstWithInst(CurrentTerminator, CondBr); - } - LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); } @@ -395,6 +412,61 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { return SplitBlock; } +VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() { + VPRegionBlock *P = getParent(); + if (P && P->isReplicator()) { + P = P->getParent(); + assert(!cast<VPRegionBlock>(P)->isReplicator() && + "unexpected nested replicate regions"); + } + return P; +} + +static bool hasConditionalTerminator(const VPBasicBlock *VPBB) { + if (VPBB->empty()) { + assert( + VPBB->getNumSuccessors() < 2 && + "block with multiple successors doesn't have a recipe as terminator"); + return false; + } + + const VPRecipeBase *R = &VPBB->back(); + auto *VPI = dyn_cast<VPInstruction>(R); + bool IsCondBranch = + isa<VPBranchOnMaskRecipe>(R) || + (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond || + VPI->getOpcode() == VPInstruction::BranchOnCount)); + (void)IsCondBranch; + + if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) { + assert(IsCondBranch && "block with multiple successors not terminated by " + "conditional branch recipe"); + + return true; + } + + assert( + !IsCondBranch && + "block with 0 or 1 successors terminated by conditional branch recipe"); + return false; +} + +VPRecipeBase *VPBasicBlock::getTerminator() { + if (hasConditionalTerminator(this)) + return &back(); + return nullptr; +} + +const VPRecipeBase *VPBasicBlock::getTerminator() const { + if (hasConditionalTerminator(this)) + return &back(); + return nullptr; +} + +bool VPBasicBlock::isExiting() const { + return getParent()->getExitingBasicBlock() == this; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const { if (getSuccessors().empty()) { @@ -411,13 +483,6 @@ void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const { void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << getName() << ":\n"; - if (const VPValue *Pred = getPredicate()) { - O << Indent << "BlockPredicate:"; - Pred->printAsOperand(O, SlotTracker); - if (const auto *PredInst = dyn_cast<VPInstruction>(Pred)) - O << " (" << PredInst->getParent()->getName() << ")"; - O << '\n'; - } auto RecipeIndent = Indent + " "; for (const VPRecipeBase &Recipe : *this) { @@ -426,14 +491,6 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, } printSuccessors(O, Indent); - - if (const VPValue *CBV = getCondBit()) { - O << Indent << "CondBit: "; - CBV->printAsOperand(O, SlotTracker); - if (const auto *CBI = dyn_cast<VPInstruction>(CBV)) - O << " (" << CBI->getParent()->getName() << ")"; - O << '\n'; - } } #endif @@ -448,25 +505,26 @@ void VPRegionBlock::execute(VPTransformState *State) { ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry); if (!isReplicator()) { + // Create and register the new vector loop. + Loop *PrevLoop = State->CurrentVectorLoop; + State->CurrentVectorLoop = State->LI->AllocateLoop(); + BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()]; + Loop *ParentLoop = State->LI->getLoopFor(VectorPH); + + // Insert the new loop into the loop nest and register the new basic blocks + // before calling any utilities such as SCEV that require valid LoopInfo. + if (ParentLoop) + ParentLoop->addChildLoop(State->CurrentVectorLoop); + else + State->LI->addTopLevelLoop(State->CurrentVectorLoop); + // Visit the VPBlocks connected to "this", starting from it. for (VPBlockBase *Block : RPOT) { - if (EnableVPlanNativePath) { - // The inner loop vectorization path does not represent loop preheader - // and exit blocks as part of the VPlan. In the VPlan-native path, skip - // vectorizing loop preheader block. In future, we may replace this - // check with the check for loop preheader. - if (Block->getNumPredecessors() == 0) - continue; - - // Skip vectorizing loop exit block. In future, we may replace this - // check with the check for loop exit. - if (Block->getNumSuccessors() == 0) - continue; - } - LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); Block->execute(State); } + + State->CurrentVectorLoop = PrevLoop; return; } @@ -508,341 +566,32 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif -bool VPRecipeBase::mayWriteToMemory() const { - switch (getVPDefID()) { - case VPWidenMemoryInstructionSC: { - return cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); - } - case VPReplicateSC: - case VPWidenCallSC: - return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) - ->mayWriteToMemory(); - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayWriteToMemory()) && - "underlying instruction may write to memory"); - return false; - } - default: - return true; - } -} - -bool VPRecipeBase::mayReadFromMemory() const { - switch (getVPDefID()) { - case VPWidenMemoryInstructionSC: { - return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore(); - } - case VPReplicateSC: - case VPWidenCallSC: - return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()) - ->mayReadFromMemory(); - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayReadFromMemory()) && - "underlying instruction may read from memory"); - return false; - } - default: - return true; - } -} - -bool VPRecipeBase::mayHaveSideEffects() const { - switch (getVPDefID()) { - case VPBranchOnMaskSC: - return false; - case VPWidenIntOrFpInductionSC: - case VPWidenCanonicalIVSC: - case VPWidenPHISC: - case VPBlendSC: - case VPWidenSC: - case VPWidenGEPSC: - case VPReductionSC: - case VPWidenSelectSC: { - const Instruction *I = - dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue()); - (void)I; - assert((!I || !I->mayHaveSideEffects()) && - "underlying instruction has side-effects"); - return false; - } - case VPReplicateSC: { - auto *R = cast<VPReplicateRecipe>(this); - return R->getUnderlyingInstr()->mayHaveSideEffects(); - } - default: - return true; - } -} - -void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { - assert(!Parent && "Recipe already in some VPBasicBlock"); - assert(InsertPos->getParent() && - "Insertion position not in any VPBasicBlock"); - Parent = InsertPos->getParent(); - Parent->getRecipeList().insert(InsertPos->getIterator(), this); -} - -void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { - assert(!Parent && "Recipe already in some VPBasicBlock"); - assert(InsertPos->getParent() && - "Insertion position not in any VPBasicBlock"); - Parent = InsertPos->getParent(); - Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); -} - -void VPRecipeBase::removeFromParent() { - assert(getParent() && "Recipe not in any VPBasicBlock"); - getParent()->getRecipeList().remove(getIterator()); - Parent = nullptr; -} - -iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { - assert(getParent() && "Recipe not in any VPBasicBlock"); - return getParent()->getRecipeList().erase(getIterator()); -} - -void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { - removeFromParent(); - insertAfter(InsertPos); -} - -void VPRecipeBase::moveBefore(VPBasicBlock &BB, - iplist<VPRecipeBase>::iterator I) { - assert(I == BB.end() || I->getParent() == &BB); - removeFromParent(); - Parent = &BB; - BB.getRecipeList().insert(I, this); -} - -void VPInstruction::generateInstruction(VPTransformState &State, - unsigned Part) { - IRBuilder<> &Builder = State.Builder; - Builder.SetCurrentDebugLocation(DL); - - if (Instruction::isBinaryOp(getOpcode())) { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); - Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); - State.set(this, V, Part); - return; - } - - switch (getOpcode()) { - case VPInstruction::Not: { - Value *A = State.get(getOperand(0), Part); - Value *V = Builder.CreateNot(A); - State.set(this, V, Part); - break; - } - case VPInstruction::ICmpULE: { - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - Value *V = Builder.CreateICmpULE(IV, TC); - State.set(this, V, Part); - break; - } - case Instruction::Select: { - Value *Cond = State.get(getOperand(0), Part); - Value *Op1 = State.get(getOperand(1), Part); - Value *Op2 = State.get(getOperand(2), Part); - Value *V = Builder.CreateSelect(Cond, Op1, Op2); - State.set(this, V, Part); - break; - } - case VPInstruction::ActiveLaneMask: { - // Get first lane of vector induction variable. - Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); - // Get the original loop tripcount. - Value *ScalarTC = State.get(getOperand(1), Part); - - auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = VectorType::get(Int1Ty, State.VF); - Instruction *Call = Builder.CreateIntrinsic( - Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); - State.set(this, Call, Part); - break; - } - case VPInstruction::FirstOrderRecurrenceSplice: { - // Generate code to combine the previous and current values in vector v3. - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - - // For the first part, use the recurrence phi (v1), otherwise v2. - auto *V1 = State.get(getOperand(0), 0); - Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1); - if (!PartMinus1->getType()->isVectorTy()) { - State.set(this, PartMinus1, Part); - } else { - Value *V2 = State.get(getOperand(1), Part); - State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1), Part); - } - break; - } - - case VPInstruction::CanonicalIVIncrement: - case VPInstruction::CanonicalIVIncrementNUW: { - Value *Next = nullptr; - if (Part == 0) { - bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; - auto *Phi = State.get(getOperand(0), 0); - // The loop step is equal to the vectorization factor (num of SIMD - // elements) times the unroll factor (num of SIMD instructions). - Value *Step = - createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); - } else { - Next = State.get(this, 0); - } - - State.set(this, Next, Part); - break; - } - case VPInstruction::BranchOnCount: { - if (Part != 0) - break; - // First create the compare. - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); - Value *Cond = Builder.CreateICmpEQ(IV, TC); - - // Now create the branch. - auto *Plan = getParent()->getPlan(); - VPRegionBlock *TopRegion = Plan->getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock(); - if (Header->empty()) { - assert(EnableVPlanNativePath && - "empty entry block only expected in VPlanNativePath"); - Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); +void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, + Value *CanonicalIVStartValue, + VPTransformState &State, + bool IsEpilogueVectorization) { + + VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock(); + auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back()); + // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when + // preparing to execute the plan for the main vector loop. + if (!IsEpilogueVectorization && Term && + Term->getOpcode() == VPInstruction::BranchOnCount && + isa<ConstantInt>(TripCountV)) { + ConstantInt *C = cast<ConstantInt>(TripCountV); + uint64_t TCVal = C->getZExtValue(); + if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) { + auto *BOC = + new VPInstruction(VPInstruction::BranchOnCond, + {getOrAddExternalDef(State.Builder.getTrue())}); + Term->eraseFromParent(); + ExitingVPBB->appendRecipe(BOC); + // TODO: Further simplifications are possible + // 1. Replace inductions with constants. + // 2. Replace vector loop region with VPBasicBlock. } - // TODO: Once the exit block is modeled in VPlan, use it instead of going - // through State.CFG.LastBB. - BasicBlock *Exit = - cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0); - - Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]); - Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); - break; - } - default: - llvm_unreachable("Unsupported opcode for instruction"); - } -} - -void VPInstruction::execute(VPTransformState &State) { - assert(!State.Instance && "VPInstruction executing an Instance"); - IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(FMF); - for (unsigned Part = 0; Part < State.UF; ++Part) - generateInstruction(State, Part); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInstruction::dump() const { - VPSlotTracker SlotTracker(getParent()->getPlan()); - print(dbgs(), "", SlotTracker); -} - -void VPInstruction::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - - if (hasResult()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - - switch (getOpcode()) { - case VPInstruction::Not: - O << "not"; - break; - case VPInstruction::ICmpULE: - O << "icmp ule"; - break; - case VPInstruction::SLPLoad: - O << "combined load"; - break; - case VPInstruction::SLPStore: - O << "combined store"; - break; - case VPInstruction::ActiveLaneMask: - O << "active lane mask"; - break; - case VPInstruction::FirstOrderRecurrenceSplice: - O << "first-order splice"; - break; - case VPInstruction::CanonicalIVIncrement: - O << "VF * UF + "; - break; - case VPInstruction::CanonicalIVIncrementNUW: - O << "VF * UF +(nuw) "; - break; - case VPInstruction::BranchOnCount: - O << "branch-on-count "; - break; - default: - O << Instruction::getOpcodeName(getOpcode()); - } - - O << FMF; - - for (const VPValue *Operand : operands()) { - O << " "; - Operand->printAsOperand(O, SlotTracker); } - if (DL) { - O << ", !dbg "; - DL.print(O); - } -} -#endif - -void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) { - // Make sure the VPInstruction is a floating-point operation. - assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul || - Opcode == Instruction::FNeg || Opcode == Instruction::FSub || - Opcode == Instruction::FDiv || Opcode == Instruction::FRem || - Opcode == Instruction::FCmp) && - "this op can't take fast-math flags"); - FMF = FMFNew; -} - -void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, - Value *CanonicalIVStartValue, - VPTransformState &State) { // Check if the trip count is needed, and if so build it. if (TripCount && TripCount->getNumUsers()) { for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) @@ -868,111 +617,78 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, // When vectorizing the epilogue loop, the canonical induction start value // needs to be changed from zero to the value after the main vector loop. if (CanonicalIVStartValue) { - VPValue *VPV = new VPValue(CanonicalIVStartValue); - addExternalDef(VPV); + VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue); auto *IV = getCanonicalIV(); assert(all_of(IV->users(), [](const VPUser *U) { + if (isa<VPScalarIVStepsRecipe>(U)) + return true; auto *VPI = cast<VPInstruction>(U); return VPI->getOpcode() == VPInstruction::CanonicalIVIncrement || VPI->getOpcode() == VPInstruction::CanonicalIVIncrementNUW; }) && - "the canonical IV should only be used by its increments when " + "the canonical IV should only be used by its increments or " + "ScalarIVSteps when " "resetting the start value"); IV->setOperand(0, VPV); } } -/// Generate the code inside the body of the vectorized loop. Assumes a single -/// LoopVectorBody basic-block was created for this. Introduce additional -/// basic-blocks as needed, and fill them all. +/// Generate the code inside the preheader and body of the vectorized loop. +/// Assumes a single pre-header basic-block was created for this. Introduce +/// additional basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { - // 0. Set the reverse mapping from VPValues to Values for code generation. + // Set the reverse mapping from VPValues to Values for code generation. for (auto &Entry : Value2VPValue) State->VPValue2Value[Entry.second] = Entry.first; - BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; - State->CFG.VectorPreHeader = VectorPreHeaderBB; - BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); - assert(VectorHeaderBB && "Loop preheader does not have a single successor."); - - // 1. Make room to generate basic-blocks inside loop body if needed. - BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock( - VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); - Loop *L = State->LI->getLoopFor(VectorHeaderBB); - L->addBasicBlockToLoop(VectorLatchBB, *State->LI); - // Remove the edge between Header and Latch to allow other connections. - // Temporarily terminate with unreachable until CFG is rewired. - // Note: this asserts the generated code's assumption that - // getFirstInsertionPt() can be dereferenced into an Instruction. - VectorHeaderBB->getTerminator()->eraseFromParent(); - State->Builder.SetInsertPoint(VectorHeaderBB); - UnreachableInst *Terminator = State->Builder.CreateUnreachable(); - State->Builder.SetInsertPoint(Terminator); - - // 2. Generate code in loop body. + // Initialize CFG state. State->CFG.PrevVPBB = nullptr; - State->CFG.PrevBB = VectorHeaderBB; - State->CFG.LastBB = VectorLatchBB; + State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); + BasicBlock *VectorPreHeader = State->CFG.PrevBB; + State->Builder.SetInsertPoint(VectorPreHeader->getTerminator()); + // Generate code in the loop pre-header and body. for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State); - // Setup branch terminator successors for VPBBs in VPBBsToFix based on - // VPBB's successors. - for (auto VPBB : State->CFG.VPBBsToFix) { - assert(EnableVPlanNativePath && - "Unexpected VPBBsToFix in non VPlan-native path"); - BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB]; - assert(BB && "Unexpected null basic block for VPBB"); - - unsigned Idx = 0; - auto *BBTerminator = BB->getTerminator(); - - for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) { - VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock(); - BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]); - ++Idx; - } - } - - // 3. Merge the temporary latch created with the last basic-block filled. - BasicBlock *LastBB = State->CFG.PrevBB; - assert(isa<BranchInst>(LastBB->getTerminator()) && - "Expected VPlan CFG to terminate with branch"); - - // Move both the branch and check from LastBB to VectorLatchBB. - auto *LastBranch = cast<BranchInst>(LastBB->getTerminator()); - LastBranch->moveBefore(VectorLatchBB->getTerminator()); - VectorLatchBB->getTerminator()->eraseFromParent(); - // Move condition so it is guaranteed to be next to branch. This is only done - // to avoid excessive test updates. - // TODO: Remove special handling once the increments for all inductions are - // modeled explicitly in VPlan. - cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch); - // Connect LastBB to VectorLatchBB to facilitate their merge. - BranchInst::Create(VectorLatchBB, LastBB); - - // Merge LastBB with Latch. - bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI); - (void)Merged; - assert(Merged && "Could not merge last basic block with latch."); - VectorLatchBB = LastBB; + VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); + BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. - VPBasicBlock *Header = Entry->getEntryBasicBlock(); - if (Header->empty()) { - assert(EnableVPlanNativePath); - Header = cast<VPBasicBlock>(Header->getSingleSuccessor()); - } + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { // Skip phi-like recipes that generate their backedege values themselves. - // TODO: Model their backedge values explicitly. - if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R)) + if (isa<VPWidenPHIRecipe>(&R)) + continue; + + if (isa<VPWidenPointerInductionRecipe>(&R) || + isa<VPWidenIntOrFpInductionRecipe>(&R)) { + PHINode *Phi = nullptr; + if (isa<VPWidenIntOrFpInductionRecipe>(&R)) { + Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0)); + } else { + auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R); + // TODO: Split off the case that all users of a pointer phi are scalar + // from the VPWidenPointerInductionRecipe. + if (WidenPhi->onlyScalarsGenerated(State->VF)) + continue; + + auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0)); + Phi = cast<PHINode>(GEP->getPointerOperand()); + } + + Phi->setIncomingBlock(1, VectorLatchBB); + + // Move the last step to the end of the latch block. This ensures + // consistent placement of all induction updates. + Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1)); + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); continue; + } auto *PhiR = cast<VPHeaderPHIRecipe>(&R); // For canonical IV, first-order recurrences and in-order reduction phis, @@ -993,9 +709,12 @@ void VPlan::execute(VPTransformState *State) { } // We do not attempt to preserve DT for outer loop vectorization currently. - if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, - L->getExitBlock()); + if (!EnableVPlanNativePath) { + BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header]; + State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader); + updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB, + State->CFG.ExitBB); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1021,6 +740,17 @@ void VPlan::print(raw_ostream &O) const { O << '\n'; Block->print(O, "", SlotTracker); } + + if (!LiveOuts.empty()) + O << "\n"; + for (auto &KV : LiveOuts) { + O << "Live-out "; + KV.second->getPhi()->printAsOperand(O); + O << " = "; + KV.second->getOperand(0)->printAsOperand(O, SlotTracker); + O << "\n"; + } + O << "}\n"; } @@ -1034,11 +764,14 @@ LLVM_DUMP_METHOD void VPlan::dump() const { print(dbgs()); } #endif -void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, +void VPlan::addLiveOut(PHINode *PN, VPValue *V) { + assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists"); + LiveOuts.insert({PN, new VPLiveOut(PN, V)}); +} + +void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB, BasicBlock *LoopLatchBB, BasicBlock *LoopExitBB) { - BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); - assert(LoopHeaderBB && "Loop preheader does not have a single successor."); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. @@ -1075,6 +808,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + Twine VPlanPrinter::getUID(const VPBlockBase *Block) { return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") + Twine(getOrCreateBID(Block)); @@ -1122,8 +856,8 @@ void VPlanPrinter::dumpBlock(const VPBlockBase *Block) { void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, const Twine &Label) { // Due to "dot" we print an edge between two regions as an edge between the - // exit basic block and the entry basic of the respective regions. - const VPBlockBase *Tail = From->getExitBasicBlock(); + // exiting basic block and the entry basic of the respective regions. + const VPBlockBase *Tail = From->getExitingBasicBlock(); const VPBlockBase *Head = To->getEntryBasicBlock(); OS << Indent << getUID(Tail) << " -> " << getUID(Head); OS << " [ label=\"" << Label << '\"'; @@ -1213,328 +947,6 @@ void VPlanIngredient::print(raw_ostream &O) const { V->printAsOperand(O, false); } -void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-CALL "; - - auto *CI = cast<CallInst>(getUnderlyingInstr()); - if (CI->getType()->isVoidTy()) - O << "void "; - else { - printAsOperand(O, SlotTracker); - O << " = "; - } - - O << "call @" << CI->getCalledFunction()->getName() << "("; - printOperands(O, SlotTracker); - O << ")"; -} - -void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-SELECT "; - printAsOperand(O, SlotTracker); - O << " = select "; - getOperand(0)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(1)->printAsOperand(O, SlotTracker); - O << ", "; - getOperand(2)->printAsOperand(O, SlotTracker); - O << (InvariantCond ? " (condition is loop invariant)" : ""); -} - -void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - printAsOperand(O, SlotTracker); - O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; - printOperands(O, SlotTracker); -} - -void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-INDUCTION"; - if (getTruncInst()) { - O << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; - O << " +\n" << Indent << "\" "; - getVPValue(0)->printAsOperand(O, SlotTracker); - } else - O << " " << VPlanIngredient(IV); -} -#endif - -bool VPWidenIntOrFpInductionRecipe::isCanonical() const { - auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue()); - auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep()); - return StartC && StartC->isZero() && StepC && StepC->isOne(); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-GEP "; - O << (IsPtrLoopInvariant ? "Inv" : "Var"); - size_t IndicesNumber = IsIndexLoopInvariant.size(); - for (size_t I = 0; I < IndicesNumber; ++I) - O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; - - O << " "; - printAsOperand(O, SlotTracker); - O << " = getelementptr "; - printOperands(O, SlotTracker); -} - -void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-PHI "; - - auto *OriginalPhi = cast<PHINode>(getUnderlyingValue()); - // Unless all incoming values are modeled in VPlan print the original PHI - // directly. - // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming - // values as VPValues. - if (getNumOperands() != OriginalPhi->getNumOperands()) { - O << VPlanIngredient(OriginalPhi); - return; - } - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} - -void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "BLEND "; - Phi->printAsOperand(O, false); - O << " ="; - if (getNumIncomingValues() == 1) { - // Not a User of any mask: not really blending, this is a - // single-predecessor phi. - O << " "; - getIncomingValue(0)->printAsOperand(O, SlotTracker); - } else { - for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { - O << " "; - getIncomingValue(I)->printAsOperand(O, SlotTracker); - O << "/"; - getMask(I)->printAsOperand(O, SlotTracker); - } - } -} - -void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "REDUCE "; - printAsOperand(O, SlotTracker); - O << " = "; - getChainOp()->printAsOperand(O, SlotTracker); - O << " +"; - if (isa<FPMathOperator>(getUnderlyingInstr())) - O << getUnderlyingInstr()->getFastMathFlags(); - O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " ("; - getVecOp()->printAsOperand(O, SlotTracker); - if (getCondOp()) { - O << ", "; - getCondOp()->printAsOperand(O, SlotTracker); - } - O << ")"; -} - -void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << (IsUniform ? "CLONE " : "REPLICATE "); - - if (!getUnderlyingInstr()->getType()->isVoidTy()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " "; - printOperands(O, SlotTracker); - - if (AlsoPack) - O << " (S->V)"; -} - -void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "PHI-PREDICATED-INSTRUCTION "; - printAsOperand(O, SlotTracker); - O << " = "; - printOperands(O, SlotTracker); -} - -void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - - if (!isStore()) { - printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; - - printOperands(O, SlotTracker); -} -#endif - -void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { - Value *Start = getStartValue()->getLiveInIRValue(); - PHINode *EntryPart = PHINode::Create( - Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt()); - EntryPart->addIncoming(Start, State.CFG.VectorPreHeader); - EntryPart->setDebugLoc(DL); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, EntryPart, Part); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - printAsOperand(O, SlotTracker); - O << " = CANONICAL-INDUCTION"; -} -#endif - -void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.get(getOperand(0), 0); - Type *STy = CanonicalIV->getType(); - IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); - ElementCount VF = State.VF; - Value *VStart = VF.isScalar() - ? CanonicalIV - : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { - Value *VStep = createStepForVF(Builder, STy, VF, Part); - if (VF.isVector()) { - VStep = Builder.CreateVectorSplat(VF, VStep); - VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); - } - Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); - State.set(this, CanonicalVectorIV, Part); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "EMIT "; - printAsOperand(O, SlotTracker); - O << " = WIDEN-CANONICAL-INDUCTION "; - printOperands(O, SlotTracker); -} -#endif - -void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { - auto &Builder = State.Builder; - // Create a vector from the initial value. - auto *VectorInit = getStartValue()->getLiveInIRValue(); - - Type *VecTy = State.VF.isScalar() - ? VectorInit->getType() - : VectorType::get(VectorInit->getType(), State.VF); - - if (State.VF.isVector()) { - auto *IdxTy = Builder.getInt32Ty(); - auto *One = ConstantInt::get(IdxTy, 1); - IRBuilder<>::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF); - auto *LastIdx = Builder.CreateSub(RuntimeVF, One); - VectorInit = Builder.CreateInsertElement( - PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init"); - } - - // Create a phi node for the new recurrence. - PHINode *EntryPart = PHINode::Create( - VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt()); - EntryPart->addIncoming(VectorInit, State.CFG.VectorPreHeader); - State.set(this, EntryPart, 0); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "FIRST-ORDER-RECURRENCE-PHI "; - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} -#endif - -void VPReductionPHIRecipe::execute(VPTransformState &State) { - PHINode *PN = cast<PHINode>(getUnderlyingValue()); - auto &Builder = State.Builder; - - // In order to support recurrences we need to be able to vectorize Phi nodes. - // Phi nodes have cycles, so we need to vectorize them in two stages. This is - // stage #1: We create a new vector PHI node with no incoming edges. We'll use - // this value when we vectorize all of the instructions that use the PHI. - bool ScalarPHI = State.VF.isScalar() || IsInLoop; - Type *VecTy = - ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); - - BasicBlock *HeaderBB = State.CFG.PrevBB; - assert(State.LI->getLoopFor(HeaderBB)->getHeader() == HeaderBB && - "recipe must be in the vector loop header"); - unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = - PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt()); - State.set(this, EntryPart, Part); - } - - // Reductions do not have to start at zero. They can start with - // any loop invariant values. - VPValue *StartVPV = getStartValue(); - Value *StartV = StartVPV->getLiveInIRValue(); - - Value *Iden = nullptr; - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || - RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) { - // MinMax reduction have the start value as their identify. - if (ScalarPHI) { - Iden = StartV; - } else { - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - StartV = Iden = - Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); - } - } else { - Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(), - RdxDesc.getFastMathFlags()); - - if (!ScalarPHI) { - Iden = Builder.CreateVectorSplat(State.VF, Iden); - IRBuilderBase::InsertPointGuard IPBuilder(Builder); - Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator()); - Constant *Zero = Builder.getInt32(0); - StartV = Builder.CreateInsertElement(Iden, StartV, Zero); - } - } - - for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = State.get(this, Part); - // Make sure to add the reduction start value only to the - // first unroll part. - Value *StartVal = (Part == 0) ? StartV : Iden; - cast<PHINode>(EntryPart)->addIncoming(StartVal, State.CFG.VectorPreHeader); - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN-REDUCTION-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} #endif template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT); @@ -1594,7 +1006,10 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, continue; assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions"); auto *VPInst = cast<VPInstruction>(&VPI); - auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue()); + + auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue()); + if (!Inst) + continue; auto *IG = IAI.getInterleaveGroup(Inst); if (!IG) continue; @@ -1622,7 +1037,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI) { Old2NewTy Old2New; - visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI); + visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI); } void VPSlotTracker::assignSlot(const VPValue *V) { @@ -1632,8 +1047,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) { void VPSlotTracker::assignSlots(const VPlan &Plan) { - for (const VPValue *V : Plan.VPExternalDefs) - assignSlot(V); + for (const auto &P : Plan.VPExternalDefs) + assignSlot(P.second); assignSlot(&Plan.VectorTripCount); if (Plan.BackedgeTakenCount) @@ -1651,7 +1066,19 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) { } bool vputils::onlyFirstLaneUsed(VPValue *Def) { - return all_of(Def->users(), [Def](VPUser *U) { - return cast<VPRecipeBase>(U)->onlyFirstLaneUsed(Def); - }); + return all_of(Def->users(), + [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); }); +} + +VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, + ScalarEvolution &SE) { + if (auto *E = dyn_cast<SCEVConstant>(Expr)) + return Plan.getOrAddExternalDef(E->getValue()); + if (auto *E = dyn_cast<SCEVUnknown>(Expr)) + return Plan.getOrAddExternalDef(E->getValue()); + + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + VPValue *Step = new VPExpandSCEVRecipe(Expr, SE); + Preheader->appendRecipe(cast<VPRecipeBase>(Step->getDef())); + return Step; } |