author:    Dimitry Andric <dim@FreeBSD.org>  2022-07-14 18:58:48 +0000
committer: Dimitry Andric <dim@FreeBSD.org>  2023-02-08 19:03:59 +0000
commit:    753f127f3ace09432b2baeffd71a308760641a62
tree:      97694ab339c0ca6145ebb429c7505019565b9a60 /contrib/llvm-project/llvm/lib/Transforms/Vectorize
parent:    81ad626541db97eb356e2c1d4a20eb2a26a766ab
parent:    1f917f69ff07f09b6dbb670971f57f8efe718b84
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize')
11 files changed, 1006 insertions, 574 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 6242d9a93fc1..183ba86abcb4 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -386,20 +386,6 @@ static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
   return true;
 }
 
-/// Check whether it is safe to if-convert this phi node.
-///
-/// Phi nodes with constant expressions that can trap are not safe to if
-/// convert.
-static bool canIfConvertPHINodes(BasicBlock *BB) {
-  for (PHINode &Phi : BB->phis()) {
-    for (Value *V : Phi.incoming_values())
-      if (auto *C = dyn_cast<Constant>(V))
-        if (C->canTrap())
-          return false;
-  }
-  return true;
-}
-
 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
   if (Ty->isPointerTy())
     return DL.getIntPtrType(Ty);
@@ -993,7 +979,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     }
   }
 
-  Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
   PSE.addPredicate(LAI->getPSE().getPredicate());
   return true;
 }
@@ -1098,13 +1083,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
     SmallPtrSetImpl<const Instruction *> &MaskedOp,
     SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
   for (Instruction &I : *BB) {
-    // Check that we don't have a constant expression that can trap as operand.
-    for (Value *Operand : I.operands()) {
-      if (auto *C = dyn_cast<Constant>(Operand))
-        if (C->canTrap())
-          return false;
-    }
-
     // We can predicate blocks with calls to assume, as long as we drop them in
     // case we flatten the CFG via predication.
     if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
@@ -1190,7 +1168,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   }
 
   // Collect the blocks that need predication.
-  BasicBlock *Header = TheLoop->getHeader();
   for (BasicBlock *BB : TheLoop->blocks()) {
     // We don't support switch statements inside loops.
     if (!isa<BranchInst>(BB->getTerminator())) {
@@ -1212,13 +1189,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
             BB->getTerminator());
         return false;
       }
-    } else if (BB != Header && !canIfConvertPHINodes(BB)) {
-      reportVectorizationFailure(
-          "Control flow cannot be substituted for a select",
-          "control flow cannot be substituted for a select",
-          "NoCFGForSelect", ORE, TheLoop,
-          BB->getTerminator());
-      return false;
     }
   }
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 0cb2032fa45a..2e9a9fe0640e 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -33,7 +33,6 @@ class LoopInfo;
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 class PredicatedScalarEvolution;
-class LoopVectorizationRequirements;
 class LoopVectorizeHints;
 class OptimizationRemarkEmitter;
 class TargetTransformInfo;
@@ -46,8 +45,9 @@ class VPBuilder {
   VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
 
   VPInstruction *createInstruction(unsigned Opcode,
-                                   ArrayRef<VPValue *> Operands, DebugLoc DL) {
-    VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL);
+                                   ArrayRef<VPValue *> Operands, DebugLoc DL,
+                                   const Twine &Name = "") {
+    VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL, Name);
     if (BB)
       BB->insert(Instr, InsertPt);
     return Instr;
@@ -55,8 +55,8 @@ class VPBuilder {
 
   VPInstruction *createInstruction(unsigned Opcode,
                                    std::initializer_list<VPValue *> Operands,
-                                   DebugLoc DL) {
-    return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL);
+                                   DebugLoc DL, const Twine &Name = "") {
+    return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name);
   }
 
 public:
@@ -124,34 +124,37 @@ public:
   /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
   /// its underlying Instruction.
   VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                        Instruction *Inst = nullptr) {
+                        Instruction *Inst = nullptr, const Twine &Name = "") {
     DebugLoc DL;
     if (Inst)
       DL = Inst->getDebugLoc();
-    VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL);
+    VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name);
     NewVPInst->setUnderlyingValue(Inst);
     return NewVPInst;
   }
   VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                        DebugLoc DL) {
-    return createInstruction(Opcode, Operands, DL);
+                        DebugLoc DL, const Twine &Name = "") {
+    return createInstruction(Opcode, Operands, DL, Name);
   }
 
-  VPValue *createNot(VPValue *Operand, DebugLoc DL) {
-    return createInstruction(VPInstruction::Not, {Operand}, DL);
+  VPValue *createNot(VPValue *Operand, DebugLoc DL, const Twine &Name = "") {
+    return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
   }
 
-  VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
-    return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL);
+  VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL,
+                     const Twine &Name = "") {
+    return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name);
   }
 
-  VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) {
-    return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL);
+  VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL,
+                    const Twine &Name = "") {
+    return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL, Name);
   }
 
   VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
-                        DebugLoc DL) {
-    return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL);
+                        DebugLoc DL, const Twine &Name = "") {
+    return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL,
+                        Name);
   }
 
   //===--------------------------------------------------------------------===//
@@ -191,6 +194,10 @@ struct VectorizationFactor {
   /// Cost of the scalar loop.
   InstructionCost ScalarCost;
 
+  /// The minimum trip count required to make vectorization profitable, e.g. due
+  /// to runtime checks.
+  ElementCount MinProfitableTripCount;
+
   VectorizationFactor(ElementCount Width, InstructionCost Cost,
                       InstructionCost ScalarCost)
       : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
@@ -268,8 +275,6 @@ class LoopVectorizationPlanner {
 
   const LoopVectorizeHints &Hints;
 
-  LoopVectorizationRequirements &Requirements;
-
   OptimizationRemarkEmitter *ORE;
 
   SmallVector<VPlanPtr, 4> VPlans;
@@ -285,10 +290,9 @@ public:
                            InterleavedAccessInfo &IAI,
                            PredicatedScalarEvolution &PSE,
                            const LoopVectorizeHints &Hints,
-                           LoopVectorizationRequirements &Requirements,
                            OptimizationRemarkEmitter *ORE)
       : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
-        PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {}
+        PSE(PSE), Hints(Hints), ORE(ORE) {}
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
@@ -332,11 +336,6 @@ public:
   bool requiresTooManyRuntimeChecks() const;
 
 protected:
-  /// Collect the instructions from the original loop that would be trivially
-  /// dead in the vectorized loop if generated.
-  void collectTriviallyDeadInstructions(
-      SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop.
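Note on the Twine parameter threaded through VPBuilder above: the name is stored on the VPInstruction and handed to IRBuilder when the recipe executes, so vectorizer-generated IR carries readable value names. A minimal sketch of a call site (the builder state and the EdgeMask operand are hypothetical, not part of this commit):

    // Hypothetical: name the inverted edge mask for a predicated block.
    VPValue *Mask = Builder.createNot(EdgeMask, DL, "not.edge.mask");
    // On execution, IRBuilder emits e.g. %not.edge.mask = xor <N x i1> ...

The default Name of "" preserves the old behavior, so existing callers need no changes.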
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b637b2d5ddae..0777a1385916 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -196,10 +196,9 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
                  "value are vectorized only if no scalar iteration overheads "
                  "are incurred."));
 
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
-    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
-    cl::desc("The maximum allowed number of runtime memory checks with a "
-             "vectorize(enable) pragma."));
+static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
+    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+    cl::desc("The maximum allowed number of runtime memory checks"));
 
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
@@ -442,6 +441,7 @@ public:
                       const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
                       ElementCount VecWidth,
+                      ElementCount MinProfitableTripCount,
                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
@@ -453,6 +453,11 @@ public:
     // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+
+    if (MinProfitableTripCount.isZero())
+      this->MinProfitableTripCount = VecWidth;
+    else
+      this->MinProfitableTripCount = MinProfitableTripCount;
   }
 
   virtual ~InnerLoopVectorizer() = default;
@@ -656,6 +661,8 @@ protected:
   /// vector elements.
   ElementCount VF;
 
+  ElementCount MinProfitableTripCount;
+
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
   unsigned UF;
@@ -735,6 +742,7 @@ public:
                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                            ElementCount::getFixed(1),
                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                             BFI, PSI, Check) {}
@@ -783,8 +791,8 @@ public:
                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
                     GeneratedRTChecks &Checks)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
-                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
-                            Checks),
+                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
+                            CM, BFI, PSI, Checks),
         EPI(EPI) {}
 
   // Override this function to handle the more complex control flow around the
@@ -1018,7 +1026,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
           isa<VPInterleaveRecipe>(CurRec) ||
           isa<VPScalarIVStepsRecipe>(CurRec) ||
-          isa<VPCanonicalIVPHIRecipe>(CurRec))
+          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
+          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
         continue;
 
       // This recipe contributes to the address computation of a widen
@@ -1503,6 +1512,13 @@ public:
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const { return FoldTailByMasking; }
 
+  /// Returns true if were tail-folding and want to use the active lane mask
+  /// for vector loop control flow.
+  bool useActiveLaneMaskForControlFlow() const {
+    return FoldTailByMasking &&
+           TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
+  }
+
   /// Returns true if the instructions in this block requires predication
   /// for any reason, e.g. because tail folding now requires a predicate
   /// or because the block in the original loop was predicated.
@@ -1551,14 +1567,14 @@ public:
     Scalars.clear();
   }
 
-private:
-  unsigned NumPredStores = 0;
-
   /// Convenience function that returns the value of vscale_range iff
   /// vscale_range.min == vscale_range.max or otherwise returns the value
   /// returned by the corresponding TLI method.
   Optional<unsigned> getVScaleForTuning() const;
 
+private:
+  unsigned NumPredStores = 0;
+
   /// \return An upper bound for the vectorization factors for both
   /// fixed and scalable vectorization, where the minimum-known number of
   /// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -1661,7 +1677,8 @@ private:
 
   /// A set containing all BasicBlocks that are known to present after
   /// vectorization as a predicated block.
-  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
+      PredicatedBBsAfterVectorization;
 
   /// Records whether it is allowed to have the original scalar loop execute at
   /// least once. This may be needed as a fallback loop in case runtime
@@ -1849,14 +1866,17 @@ class GeneratedRTChecks {
   DominatorTree *DT;
   LoopInfo *LI;
+  TargetTransformInfo *TTI;
 
   SCEVExpander SCEVExp;
   SCEVExpander MemCheckExp;
 
+  bool CostTooHigh = false;
+
 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
-                    const DataLayout &DL)
-      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
+                    TargetTransformInfo *TTI, const DataLayout &DL)
+      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
         MemCheckExp(SE, DL, "scev.check") {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
@@ -1867,6 +1887,15 @@ public:
   void Create(Loop *L, const LoopAccessInfo &LAI,
               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
+    // Hard cutoff to limit compile-time increase in case a very large number of
+    // runtime checks needs to be generated.
+    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
+    // profile info.
+    CostTooHigh =
+        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
+    if (CostTooHigh)
+      return;
+
     BasicBlock *LoopHeader = L->getHeader();
     BasicBlock *Preheader = L->getLoopPreheader();
@@ -1938,6 +1967,44 @@ public:
     }
   }
 
+  InstructionCost getCost() {
+    if (SCEVCheckBlock || MemCheckBlock)
+      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
+
+    if (CostTooHigh) {
+      InstructionCost Cost;
+      Cost.setInvalid();
+      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
+      return Cost;
+    }
+
+    InstructionCost RTCheckCost = 0;
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        InstructionCost C =
+            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+        LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
+        RTCheckCost += C;
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        InstructionCost C =
+            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+        LLVM_DEBUG(dbgs() << "  " << C << " for " << I << "\n");
+        RTCheckCost += C;
+      }
+
+    if (SCEVCheckBlock || MemCheckBlock)
+      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
+                        << "\n");
+
+    return RTCheckCost;
+  }
+
   /// Remove the created SCEV & memory runtime check blocks & instructions, if
   /// unused.
   ~GeneratedRTChecks() {
@@ -2880,9 +2947,16 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   // If tail is to be folded, vector loop takes care of all iterations.
   Type *CountTy = Count->getType();
   Value *CheckMinIters = Builder.getFalse();
-  Value *Step = createStepForVF(Builder, CountTy, VF, UF);
+  auto CreateStep = [&]() {
+    // Create step with max(MinProTripCount, UF * VF).
+    if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
+      return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
+    return createStepForVF(Builder, CountTy, VF, UF);
+  };
+
   if (!Cost->foldTailByMasking())
-    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
+    CheckMinIters =
+        Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
   else if (VF.isScalable()) {
     // vscale is not necessarily a power-of-2, which means we cannot guarantee
     // an overflow to zero when updating induction variables and so an
@@ -2894,8 +2968,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
 
     // Don't execute the vector loop if (UMax - n) < (VF * UF).
-    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
+    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
   }
+
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
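With the CreateStep() lambda above, the minimum-iterations guard compares the trip count against max(VF * UF, MinProfitableTripCount) rather than VF * UF alone. A worked example with made-up numbers: for VF = 4, UF = 2, and a MinProfitableTripCount of 16 derived from the runtime-check cost, the step is max(8, 16) = 16, so min.iters.check sends any loop with fewer than 16 iterations to the scalar loop, skipping both the vector body and its runtime checks.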
@@ -2920,7 +2995,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
-
   BasicBlock *const SCEVCheckBlock =
       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
   if (!SCEVCheckBlock)
@@ -4792,7 +4866,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
     MaxVScale =
         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
   MaxScalableVF = ElementCount::getScalable(
-      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+      MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
   if (!MaxScalableVF)
     reportVectorizationInfo(
         "Max legal vector width too small, scalable vectorization "
@@ -5187,9 +5261,9 @@ bool LoopVectorizationCostModel::isMoreProfitable(
   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
   if (Optional<unsigned> VScale = getVScaleForTuning()) {
     if (A.Width.isScalable())
-      EstimatedWidthA *= VScale.getValue();
+      EstimatedWidthA *= VScale.value();
     if (B.Width.isScalable())
-      EstimatedWidthB *= VScale.getValue();
+      EstimatedWidthB *= VScale.value();
   }
 
   // Assume vscale may be larger than 1 (or the value being tuned for),
@@ -5872,10 +5946,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
 
   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
 
-  auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
+  const auto &TTICapture = TTI;
+  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
       return 0;
-    return TTI.getRegUsageForType(VectorType::get(Ty, VF));
+    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
   };
 
   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
@@ -6014,6 +6089,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
   // map will indicate that we've analyzed it already.
   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
 
+  PredicatedBBsAfterVectorization[VF].clear();
+
   // Find all the instructions that are scalar with predication in the loop and
   // determine if it would be better to not if-convert the blocks they are in.
   // If so, we also record the instructions to scalarize.
@@ -6031,7 +6108,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
           computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
         ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
       // Remember that BB will remain after vectorization.
-      PredicatedBBsAfterVectorization.insert(BB);
+      PredicatedBBsAfterVectorization[VF].insert(BB);
     }
   }
 }
@@ -6896,8 +6973,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     bool ScalarPredicatedBB = false;
     BranchInst *BI = cast<BranchInst>(I);
     if (VF.isVector() && BI->isConditional() &&
-        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
-         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+        (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
+         PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
       ScalarPredicatedBB = true;
 
     if (ScalarPredicatedBB) {
@@ -7363,14 +7440,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
-  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
-  return (NumRuntimePointerChecks >
-              VectorizerParams::RuntimeMemoryCheckThreshold &&
-          !Hints.allowReordering()) ||
-         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-}
-
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7439,7 +7508,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     return VectorizationFactor::Disabled();
 
   // Select the optimal vectorization factor.
-  return CM.selectVectorizationFactor(VFCandidates);
+  VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
+  assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
+         "when vectorizing, the scalar cost must be non-zero.");
+  return VF;
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -7554,7 +7625,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
   if (VectorizedLoopID)
-    L->setLoopID(VectorizedLoopID.getValue());
+    L->setLoopID(VectorizedLoopID.value());
   else {
     // Keep all loop hints from the original loop on the vector loop (we'll
     // replace the vectorizer-specific hints below).
@@ -7585,51 +7656,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
 }
 #endif
 
-void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
-    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-
-  // We create new control-flow for the vectorized loop, so the original exit
-  // conditions will be dead after vectorization if it's only used by the
-  // terminator
-  SmallVector<BasicBlock*> ExitingBlocks;
-  OrigLoop->getExitingBlocks(ExitingBlocks);
-  for (auto *BB : ExitingBlocks) {
-    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
-    if (!Cmp || !Cmp->hasOneUse())
-      continue;
-
-    // TODO: we should introduce a getUniqueExitingBlocks on Loop
-    if (!DeadInstructions.insert(Cmp).second)
-      continue;
-
-    // The operands of the icmp is often a dead trunc, used by IndUpdate.
-    // TODO: can recurse through operands in general
-    for (Value *Op : Cmp->operands()) {
-      if (isa<TruncInst>(Op) && Op->hasOneUse())
-        DeadInstructions.insert(cast<Instruction>(Op));
-    }
-  }
-
-  // We create new "steps" for induction variable updates to which the original
-  // induction variables map. An original update instruction will be dead if
-  // all its users except the induction variable are dead.
-  auto *Latch = OrigLoop->getLoopLatch();
-  for (auto &Induction : Legal->getInductionVars()) {
-    PHINode *Ind = Induction.first;
-    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-
-    // If the tail is to be folded by masking, the primary induction variable,
-    // if exists, isn't dead: it will be used for masking. Don't kill it.
-    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
-      continue;
-
-    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
-          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
-        }))
-      DeadInstructions.insert(IndUpdate);
-  }
-}
-
 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
 
 //===--------------------------------------------------------------------===//
@@ -8001,11 +8027,19 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
   if (!CM.blockNeedsPredicationForAnyReason(BB))
     return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
 
+  assert(CM.foldTailByMasking() && "must fold the tail");
+
+  // If we're using the active lane mask for control flow, then we get the
+  // mask from the active lane mask PHI that is cached in the VPlan.
+  PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
+  if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
+    return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
+
   // Introduce the early-exit compare IV <= BTC to form header block mask.
   // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
   // constructing the desired canonical IV in the header block as its first
   // non-phi instructions.
-  assert(CM.foldTailByMasking() && "must fold the tail");
+
   VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
   auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
@@ -8014,9 +8048,10 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
     VPBuilder::InsertPointGuard Guard(Builder);
     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
-    if (CM.TTI.emitGetActiveLaneMask()) {
+    if (EmitGetActiveLaneMask != PredicationStyle::None) {
       VPValue *TC = Plan->getOrCreateTripCount();
-      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
+      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+                                       nullptr, "active.lane.mask");
     } else {
       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
@@ -8409,9 +8444,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
   return RegSucc;
 }
 
-VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
-                                                      VPRecipeBase *PredRecipe,
-                                                      VPlanPtr &Plan) {
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
+    Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) {
   // Instructions marked for predication are replicated and placed under an
   // if-then construct to prevent side-effects.
@@ -8425,7 +8459,7 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
   auto *PHIRecipe = Instr->getType()->isVoidTy()
                         ? nullptr
-                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
+                        : new VPPredInstPHIRecipe(PredRecipe);
   if (PHIRecipe) {
     Plan->removeVPValueFor(Instr);
     Plan->addVPValue(Instr, PHIRecipe);
@@ -8517,19 +8551,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
-  // Collect instructions from the original loop that will become trivially dead
-  // in the vectorized loop. We don't need to vectorize these instructions. For
-  // example, original induction update instructions can become dead because we
-  // separately emit induction "steps" when generating code for the new loop.
-  // Similarly, we create a new latch condition when setting up the structure
-  // of the new loop, so the old one can become dead.
-  SmallPtrSet<Instruction *, 4> DeadInstructions;
-  collectTriviallyDeadInstructions(DeadInstructions);
-
   // Add assume instructions we need to drop to DeadInstructions, to prevent
   // them from being added to the VPlan.
   // TODO: We only need to drop assumes in blocks that get flattend. If the
   // control flow is preserved, we should keep them.
+  SmallPtrSet<Instruction *, 4> DeadInstructions;
   auto &ConditionalAssumes = Legal->getConditionalAssumes();
   DeadInstructions.insert(ConditionalAssumes.begin(),
                           ConditionalAssumes.end());
@@ -8565,32 +8591,84 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   }
 }
 
-// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
-// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
-// BranchOnCount VPInstruction to the latch.
+// Add the necessary canonical IV and branch recipes required to control the
+// loop.
 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
-                                  bool HasNUW) {
+                                  bool HasNUW,
+                                  bool UseLaneMaskForLoopControlFlow) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getOrAddVPValue(StartIdx);
 
+  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
   Header->insert(CanonicalIVPHI, Header->begin());
 
+  // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
+  // IV by VF * UF.
   auto *CanonicalIVIncrement =
       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
                                : VPInstruction::CanonicalIVIncrement,
-                        {CanonicalIVPHI}, DL);
+                        {CanonicalIVPHI}, DL, "index.next");
   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
 
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
   EB->appendRecipe(CanonicalIVIncrement);
 
-  auto *BranchOnCount =
-      new VPInstruction(VPInstruction::BranchOnCount,
-                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
-  EB->appendRecipe(BranchOnCount);
+  if (UseLaneMaskForLoopControlFlow) {
+    // Create the active lane mask instruction in the vplan preheader.
+    VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
+
+    // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
+    // we have to take unrolling into account. Each part needs to start at
+    //   Part * VF
+    auto *CanonicalIVIncrementParts =
+        new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
+                                 : VPInstruction::CanonicalIVIncrementForPart,
+                          {StartV}, DL, "index.part.next");
+    Preheader->appendRecipe(CanonicalIVIncrementParts);
+
+    // Create the ActiveLaneMask instruction using the correct start values.
+    VPValue *TC = Plan.getOrCreateTripCount();
+    auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
+                                       {CanonicalIVIncrementParts, TC}, DL,
+                                       "active.lane.mask.entry");
+    Preheader->appendRecipe(EntryALM);
+
+    // Now create the ActiveLaneMaskPhi recipe in the main loop using the
+    // preheader ActiveLaneMask instruction.
+    auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
+    Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
+
+    // Create the active lane mask for the next iteration of the loop.
+    CanonicalIVIncrementParts =
+        new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
+                                 : VPInstruction::CanonicalIVIncrementForPart,
+                          {CanonicalIVIncrement}, DL);
+    EB->appendRecipe(CanonicalIVIncrementParts);
+
+    auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
+                                  {CanonicalIVIncrementParts, TC}, DL,
+                                  "active.lane.mask.next");
+    EB->appendRecipe(ALM);
+    LaneMaskPhi->addOperand(ALM);
+
+    // We have to invert the mask here because a true condition means jumping
+    // to the exit block.
+    auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
+    EB->appendRecipe(NotMask);
+
+    VPInstruction *BranchBack =
+        new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
+    EB->appendRecipe(BranchBack);
+  } else {
+    // Add the BranchOnCount VPInstruction to the latch.
+    VPInstruction *BranchBack = new VPInstruction(
+        VPInstruction::BranchOnCount,
+        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+    EB->appendRecipe(BranchBack);
+  }
 }
 
 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
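For orientation, when UseLaneMaskForLoopControlFlow is set, the recipes created above give the vector loop roughly this shape (a sketch assembled from the recipe and value names in this hunk, not actual vectorizer output):

    vector.ph:
      index.part.next        = CanonicalIVIncrementForPart 0
      active.lane.mask.entry = ActiveLaneMask index.part.next, trip.count
    vector.body:
      active.lane.mask = PHI active.lane.mask.entry, active.lane.mask.next
      ...
      index.next            = CanonicalIVIncrement index
      active.lane.mask.next = ActiveLaneMask (CanonicalIVIncrementForPart index.next), trip.count
      BranchOnCond (Not active.lane.mask.next)

The mask is negated because BranchOnCond exits on a true condition, while the lane mask stays true for as long as any lane remains active.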
@@ -8691,7 +8769,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
-                        !CM.foldTailByMasking());
+                        !CM.foldTailByMasking(),
+                        CM.useActiveLaneMaskForControlFlow());
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
@@ -8961,8 +9040,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
   VPlanTransforms::sinkScalarOperands(*Plan);
-  VPlanTransforms::mergeReplicateRegions(*Plan);
   VPlanTransforms::removeDeadRecipes(*Plan);
+  VPlanTransforms::mergeReplicateRegions(*Plan);
   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
 
   // Fold Exit block into its predecessor if possible.
@@ -9006,7 +9085,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
     Term->eraseFromParent();
 
     addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
-                          true);
+                          true, CM.useActiveLaneMaskForControlFlow());
     return Plan;
   }
 
@@ -9078,7 +9157,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
       Plan->removeVPValueFor(R);
       Plan->addVPValue(R, RedRecipe);
-      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
+      // Append the recipe to the end of the VPBasicBlock because we need to
+      // ensure that it comes after all of it's inputs, including CondOp.
+      WidenRecipe->getParent()->appendRecipe(RedRecipe);
       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
       WidenRecipe->eraseFromParent();
 
@@ -9151,229 +9232,6 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
                                   *this, State);
 }
 
-void VPWidenSelectRecipe::execute(VPTransformState &State) {
-  auto &I = *cast<SelectInst>(getUnderlyingInstr());
-  State.setDebugLocFromInst(&I);
-
-  // The condition can be loop invariant but still defined inside the
-  // loop. This means that we can't just use the original 'cond' value.
-  // We have to take the 'vectorized' value and pick the first lane.
-  // Instcombine will make this a no-op.
-  auto *InvarCond =
-      InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
-
-  for (unsigned Part = 0; Part < State.UF; ++Part) {
-    Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
-    Value *Op0 = State.get(getOperand(1), Part);
-    Value *Op1 = State.get(getOperand(2), Part);
-    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
-    State.set(this, Sel, Part);
-    State.addMetadata(Sel, &I);
-  }
-}
-
-void VPWidenRecipe::execute(VPTransformState &State) {
-  auto &I = *cast<Instruction>(getUnderlyingValue());
-  auto &Builder = State.Builder;
-  switch (I.getOpcode()) {
-  case Instruction::Call:
-  case Instruction::Br:
-  case Instruction::PHI:
-  case Instruction::GetElementPtr:
-  case Instruction::Select:
-    llvm_unreachable("This instruction is handled by a different recipe.");
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::SRem:
-  case Instruction::URem:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::FNeg:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::FDiv:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    // Just widen unops and binops.
-    State.setDebugLocFromInst(&I);
-
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      SmallVector<Value *, 2> Ops;
-      for (VPValue *VPOp : operands())
-        Ops.push_back(State.get(VPOp, Part));
-
-      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
-
-      if (auto *VecOp = dyn_cast<Instruction>(V)) {
-        VecOp->copyIRFlags(&I);
-
-        // If the instruction is vectorized and was in a basic block that needed
-        // predication, we can't propagate poison-generating flags (nuw/nsw,
-        // exact, etc.). The control flow has been linearized and the
-        // instruction is no longer guarded by the predicate, which could make
-        // the flag properties to no longer hold.
-        if (State.MayGeneratePoisonRecipes.contains(this))
-          VecOp->dropPoisonGeneratingFlags();
-      }
-
-      // Use this vector value for all users of the original instruction.
-      State.set(this, V, Part);
-      State.addMetadata(V, &I);
-    }
-
-    break;
-  }
-  case Instruction::Freeze: {
-    State.setDebugLocFromInst(&I);
-
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *Op = State.get(getOperand(0), Part);
-
-      Value *Freeze = Builder.CreateFreeze(Op);
-      State.set(this, Freeze, Part);
-    }
-    break;
-  }
-  case Instruction::ICmp:
-  case Instruction::FCmp: {
-    // Widen compares. Generate vector compares.
-    bool FCmp = (I.getOpcode() == Instruction::FCmp);
-    auto *Cmp = cast<CmpInst>(&I);
-    State.setDebugLocFromInst(Cmp);
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *A = State.get(getOperand(0), Part);
-      Value *B = State.get(getOperand(1), Part);
-      Value *C = nullptr;
-      if (FCmp) {
-        // Propagate fast math flags.
-        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-        Builder.setFastMathFlags(Cmp->getFastMathFlags());
-        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
-      } else {
-        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
-      }
-      State.set(this, C, Part);
-      State.addMetadata(C, &I);
-    }
-
-    break;
-  }
-
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    auto *CI = cast<CastInst>(&I);
-    State.setDebugLocFromInst(CI);
-
-    /// Vectorize casts.
-    Type *DestTy = (State.VF.isScalar())
-                       ? CI->getType()
-                       : VectorType::get(CI->getType(), State.VF);
-
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *A = State.get(getOperand(0), Part);
-      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
-      State.set(this, Cast, Part);
-      State.addMetadata(Cast, &I);
-    }
-    break;
-  }
-  default:
-    // This instruction is not vectorized by simple widening.
-    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
-    llvm_unreachable("Unhandled instruction!");
-  } // end of switch.
-}
-
-void VPWidenGEPRecipe::execute(VPTransformState &State) {
-  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
-  // Construct a vector GEP by widening the operands of the scalar GEP as
-  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-  // results in a vector of pointers when at least one operand of the GEP
-  // is vector-typed. Thus, to keep the representation compact, we only use
-  // vector-typed operands for loop-varying values.
-
-  if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
-    // If we are vectorizing, but the GEP has only loop-invariant operands,
-    // the GEP we build (by only using vector-typed operands for
-    // loop-varying values) would be a scalar pointer. Thus, to ensure we
-    // produce a vector of pointers, we need to either arbitrarily pick an
-    // operand to broadcast, or broadcast a clone of the original GEP.
-    // Here, we broadcast a clone of the original.
-    //
-    // TODO: If at some point we decide to scalarize instructions having
-    //       loop-invariant operands, this special case will no longer be
-    //       required. We would add the scalarization decision to
-    //       collectLoopScalars() and teach getVectorValue() to broadcast
-    //       the lane-zero scalar value.
-    auto *Clone = State.Builder.Insert(GEP->clone());
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
-      State.set(this, EntryPart, Part);
-      State.addMetadata(EntryPart, GEP);
-    }
-  } else {
-    // If the GEP has at least one loop-varying operand, we are sure to
-    // produce a vector of pointers. But if we are only unrolling, we want
-    // to produce a scalar GEP for each unroll part. Thus, the GEP we
-    // produce with the code below will be scalar (if VF == 1) or vector
-    // (otherwise). Note that for the unroll-only case, we still maintain
-    // values in the vector mapping with initVector, as we do for other
-    // instructions.
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      // The pointer operand of the new GEP. If it's loop-invariant, we
-      // won't broadcast it.
-      auto *Ptr = IsPtrLoopInvariant
-                      ? State.get(getOperand(0), VPIteration(0, 0))
-                      : State.get(getOperand(0), Part);
-
-      // Collect all the indices for the new GEP. If any index is
-      // loop-invariant, we won't broadcast it.
-      SmallVector<Value *, 4> Indices;
-      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
-        VPValue *Operand = getOperand(I);
-        if (IsIndexLoopInvariant[I - 1])
-          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
-        else
-          Indices.push_back(State.get(Operand, Part));
-      }
-
-      // If the GEP instruction is vectorized and was in a basic block that
-      // needed predication, we can't propagate the poison-generating 'inbounds'
-      // flag. The control flow has been linearized and the GEP is no longer
-      // guarded by the predicate, which could make the 'inbounds' properties to
-      // no longer hold.
-      bool IsInBounds =
-          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
-
-      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-      // but it should be a vector, otherwise.
-      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
-                                             Indices, "", IsInBounds);
-      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
-             "NewGEP is not a pointer vector");
-      State.set(this, NewGEP, Part);
-      State.addMetadata(NewGEP, GEP);
-    }
-  }
-}
-
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Int or FP induction being replicated.");
@@ -9632,45 +9490,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   }
 }
 
-void VPBlendRecipe::execute(VPTransformState &State) {
-  State.setDebugLocFromInst(Phi);
-  // We know that all PHIs in non-header blocks are converted into
-  // selects, so we don't have to worry about the insertion order and we
-  // can just use the builder.
-  // At this point we generate the predication tree. There may be
-  // duplications since this is a simple recursive scan, but future
-  // optimizations will clean it up.
-
-  unsigned NumIncoming = getNumIncomingValues();
-
-  // Generate a sequence of selects of the form:
-  // SELECT(Mask3, In3,
-  //        SELECT(Mask2, In2,
-  //               SELECT(Mask1, In1,
-  //                      In0)))
-  // Note that Mask0 is never used: lanes for which no path reaches this phi and
-  // are essentially undef are taken from In0.
-  InnerLoopVectorizer::VectorParts Entry(State.UF);
-  for (unsigned In = 0; In < NumIncoming; ++In) {
-    for (unsigned Part = 0; Part < State.UF; ++Part) {
-      // We might have single edge PHIs (blocks) - use an identity
-      // 'select' for the first PHI operand.
-      Value *In0 = State.get(getIncomingValue(In), Part);
-      if (In == 0)
-        Entry[Part] = In0; // Initialize with the first incoming value.
-      else {
-        // Select between the current value and the previous incoming edge
-        // based on the incoming mask.
-        Value *Cond = State.get(getMask(In), Part);
-        Entry[Part] =
-            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
-      }
-    }
-  }
-  for (unsigned Part = 0; Part < State.UF; ++Part)
-    State.set(this, Entry[Part], Part);
-}
-
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
@@ -9758,32 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
                                     State);
 }
 
-void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
-  assert(State.Instance && "Branch on Mask works only on single instance.");
-
-  unsigned Part = State.Instance->Part;
-  unsigned Lane = State.Instance->Lane.getKnownLane();
-
-  Value *ConditionBit = nullptr;
-  VPValue *BlockInMask = getMask();
-  if (BlockInMask) {
-    ConditionBit = State.get(BlockInMask, Part);
-    if (ConditionBit->getType()->isVectorTy())
-      ConditionBit = State.Builder.CreateExtractElement(
-          ConditionBit, State.Builder.getInt32(Lane));
-  } else // Block in mask is all-one.
-    ConditionBit = State.Builder.getTrue();
-
-  // Replace the temporary unreachable terminator with a new conditional branch,
-  // whose two destinations will be set later when they are created.
-  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
-  assert(isa<UnreachableInst>(CurrentTerminator) &&
-         "Expected to replace unreachable terminator with conditional branch.");
-  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
-  CondBr->setSuccessor(0, nullptr);
-  ReplaceInstWithInst(CurrentTerminator, CondBr);
-}
-
 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Predicated instruction PHI works per instance.");
   Instruction *ScalarPredInst =
@@ -10103,8 +9896,7 @@ static bool processLoopInVPlanNativePath(
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
-                               Requirements, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
 
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
@@ -10123,10 +9915,10 @@ static bool processLoopInVPlanNativePath(
     VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
 
     {
-      GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+      GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                                F->getParent()->getDataLayout());
-      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
-                             &CM, BFI, PSI, Checks);
+      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+                             VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
       LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                         << L->getHeader()->getParent()->getName() << "\"\n");
       LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10183,6 +9975,105 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
   }
 }
 
+static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
+                                       VectorizationFactor &VF,
+                                       Optional<unsigned> VScale, Loop *L,
+                                       ScalarEvolution &SE) {
+  InstructionCost CheckCost = Checks.getCost();
+  if (!CheckCost.isValid())
+    return false;
+
+  // When interleaving only scalar and vector cost will be equal, which in turn
+  // would lead to a divide by 0. Fall back to hard threshold.
+  if (VF.Width.isScalar()) {
+    if (CheckCost > VectorizeMemoryCheckThreshold) {
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Interleaving only is not profitable due to runtime checks\n");
+      return false;
+    }
+    return true;
+  }
+
+  // The scalar cost should only be 0 when vectorizing with a user specified
+  // VF/IC. In those cases, runtime checks should always be generated.
+  double ScalarC = *VF.ScalarCost.getValue();
+  if (ScalarC == 0)
+    return true;
+
+  // First, compute the minimum iteration count required so that the vector
+  // loop outperforms the scalar loop.
+  //  The total cost of the scalar loop is
+  //   ScalarC * TC
+  //  where
+  //  * TC is the actual trip count of the loop.
+  //  * ScalarC is the cost of a single scalar iteration.
+  //
+  //  The total cost of the vector loop is
+  //    RtC + VecC * (TC / VF) + EpiC
+  //  where
+  //  * RtC is the cost of the generated runtime checks
+  //  * VecC is the cost of a single vector iteration.
+  //  * TC is the actual trip count of the loop
+  //  * VF is the vectorization factor
+  //  * EpiCost is the cost of the generated epilogue, including the cost
+  //    of the remaining scalar operations.
+  //
+  // Vectorization is profitable once the total vector cost is less than the
+  // total scalar cost:
+  //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
+  //
+  // Now we can compute the minimum required trip count TC as
+  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+  //
+  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
+  // the computations are performed on doubles, not integers and the result
+  // is rounded up, hence we get an upper estimate of the TC.
+  unsigned IntVF = VF.Width.getKnownMinValue();
+  if (VF.Width.isScalable()) {
+    unsigned AssumedMinimumVscale = 1;
+    if (VScale)
+      AssumedMinimumVscale = *VScale;
+    IntVF *= AssumedMinimumVscale;
+  }
+  double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
+  double RtC = *CheckCost.getValue();
+  double MinTC1 = RtC / (ScalarC - VecCOverVF);
+
+  // Second, compute a minimum iteration count so that the cost of the
+  // runtime checks is only a fraction of the total scalar loop cost. This
+  // adds a loop-dependent bound on the overhead incurred if the runtime
+  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
+  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
+  // cost, compute
+  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
+  double MinTC2 = RtC * 10 / ScalarC;
+
+  // Now pick the larger minimum. If it is not a multiple of VF, choose the
+  // next closest multiple of VF. This should partly compensate for ignoring
+  // the epilogue cost.
+  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
+  VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
+
+  LLVM_DEBUG(
+      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");
+
+  // Skip vectorization if the expected trip count is less than the minimum
+  // required trip count.
+  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
+    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
+                                VF.MinProfitableTripCount)) {
+      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                            "trip count < minimum profitable VF ("
                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
                         << ")\n");
+
+      return false;
+    }
+  }
+  return true;
+}
+
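A worked example for the two bounds, with made-up costs: take ScalarC = 4, VecC = 10, VF = 4 (so VecC / VF = 2.5) and RtC = 30. Then MinTC1 = 30 / (4 - 2.5) = 20, and with the fraction X = 10 used above, MinTC2 = 30 * 10 / 4 = 75. The larger bound wins and is rounded up to the next multiple of VF, giving MinProfitableTripCount = 76; a loop whose expected trip count is below that is left scalar rather than paying for the checks.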
 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                                !EnableLoopInterleaving),
@@ -10340,8 +10231,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   CM.collectElementTypesForWidening();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
-                               Requirements, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
 
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
@@ -10353,10 +10243,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
 
-  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                            F->getParent()->getDataLayout());
   if (MaybeVF) {
-    if (LVP.requiresTooManyRuntimeChecks()) {
+    VF = *MaybeVF;
+    // Select the interleave count.
+    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+
+    unsigned SelectedIC = std::max(IC, UserIC);
+    //  Optimistically generate runtime checks if they are needed. Drop them if
+    //  they turn out to not be profitable.
+    if (VF.Width.isVector() || SelectedIC > 1)
+      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+
+    // Check if it is profitable to vectorize with runtime checks.
+    bool ForceVectorization =
+        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+    if (!ForceVectorization &&
+        !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+                                    *PSE.getSE())) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
             DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
@@ -10368,15 +10273,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       Hints.emitRemarkWithHints();
       return false;
     }
-    VF = *MaybeVF;
-    // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
-
-    unsigned SelectedIC = std::max(IC, UserIC);
-    // Optimistically generate runtime checks if they are needed. Drop them if
-    // they turn out to not be profitable.
-    if (VF.Width.isVector() || SelectedIC > 1)
-      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
   }
 
   // Identify the diagnostic messages that should be produced.
@@ -10533,8 +10429,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       if (!MainILV.areSafetyChecksAdded())
         DisableRuntimeUnroll = true;
     } else {
-      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                             &LVL, &CM, BFI, PSI, Checks);
+      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+                             VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
+                             PSI, Checks);
       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
       LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
@@ -10564,7 +10461,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                         LLVMLoopVectorizeFollowupEpilogue});
     if (RemainderLoopID) {
-      L->setLoopID(RemainderLoopID.getValue());
+      L->setLoopID(RemainderLoopID.value());
     } else {
       if (DisableRuntimeUnroll)
         AddRuntimeUnrollDisableMetaData(L);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 019a09665a67..e136cd9aedac 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2637,7 +2637,7 @@ private:
     AliasCacheKey key = std::make_pair(Inst1, Inst2);
     Optional<bool> &result = AliasCache[key];
     if (result) {
-      return result.getValue();
+      return result.value();
     }
     bool aliased = true;
     if (Loc1.Ptr && isSimple(Inst1))
@@ -4592,7 +4592,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   };
 
   InstructionsState S = getSameOpcode(VL);
-  if (Depth == RecursionMaxDepth) {
+
+  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
+  // a load), in which case peek through to include it in the tree, without
+  // ballooning over-budget.
+  if (Depth >= RecursionMaxDepth &&
+      !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
+        VL.size() >= 4 &&
+        (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
+           return match(I,
+                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
+                  cast<Instruction>(I)->getOpcode() ==
+                      cast<Instruction>(S.MainOp)->getOpcode();
+         })))) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
     if (TryToFindDuplicates(S))
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
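The new condition peeks through exactly one extra level at the depth limit: either the bundle itself consists of loads, or every element is a single-use zext/sext of a single-use load and all elements share one opcode. For example, a hypothetical bundle VL = { zext(load p[0]), zext(load p[1]), zext(load p[2]), zext(load p[3]) } with all values single-use is still turned into a tree entry instead of being gathered, because the loads underneath terminate the recursion immediately and cannot grow the tree past its budget.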
+ auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); + if (NewDIL) + Builder.SetCurrentDebugLocation(*NewDIL); + else + LLVM_DEBUG(dbgs() << "Failed to create new discriminator: " + << DIL->getFilename() << " Line: " << DIL->getLine()); + } else + Builder.SetCurrentDebugLocation(DIL); } BasicBlock * @@ -566,6 +568,24 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, } #endif +VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa<VPActiveLaneMaskPHIRecipe>(&R)) + return cast<VPActiveLaneMaskPHIRecipe>(&R); + } + return nullptr; +} + +static bool canSimplifyBranchOnCond(VPInstruction *Term) { + VPInstruction *Not = dyn_cast<VPInstruction>(Term->getOperand(0)); + if (!Not || Not->getOpcode() != VPInstruction::Not) + return false; + + VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0)); + return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask; +} + void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, Value *CanonicalIVStartValue, VPTransformState &State, @@ -573,11 +593,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock(); auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back()); - // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when - // preparing to execute the plan for the main vector loop. - if (!IsEpilogueVectorization && Term && - Term->getOpcode() == VPInstruction::BranchOnCount && - isa<ConstantInt>(TripCountV)) { + // Try to simplify the branch condition if TC <= VF * UF when preparing to + // execute the plan for the main vector loop. We only do this if the + // terminator is: + // 1. BranchOnCount, or + // 2. BranchOnCond where the input is Not(ActiveLaneMask). + if (!IsEpilogueVectorization && Term && isa<ConstantInt>(TripCountV) && + (Term->getOpcode() == VPInstruction::BranchOnCount || + (Term->getOpcode() == VPInstruction::BranchOnCond && + canSimplifyBranchOnCond(Term)))) { ConstantInt *C = cast<ConstantInt>(TripCountV); uint64_t TCVal = C->getZExtValue(); if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) { @@ -697,7 +721,8 @@ void VPlan::execute(VPTransformState *State) { // generated. bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) || isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) || - cast<VPReductionPHIRecipe>(PhiR)->isOrdered(); + (isa<VPReductionPHIRecipe>(PhiR) && + cast<VPReductionPHIRecipe>(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 09da4a545d0d..f009a7ee6b4b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -784,6 +784,10 @@ public: ActiveLaneMask, CanonicalIVIncrement, CanonicalIVIncrementNUW, + // The next two are similar to the above, but instead increment the + // canonical IV separately for each unrolled part. + CanonicalIVIncrementForPart, + CanonicalIVIncrementForPartNUW, BranchOnCount, BranchOnCond }; @@ -794,6 +798,9 @@ private: FastMathFlags FMF; DebugLoc DL; + /// An optional name that can be used for the generated IR instruction. 
+ const std::string Name; + /// Utility method serving execute(): generates a single instance of the /// modeled instruction. void generateInstruction(VPTransformState &State, unsigned Part); @@ -802,14 +809,15 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL, + const Twine &Name = "") : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), - DL(DL) {} + DL(DL), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, - DebugLoc DL = {}) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} + DebugLoc DL = {}, const Twine &Name = "") + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +826,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands, DL); + return new VPInstruction(Opcode, Operands, DL, Name); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -897,6 +905,8 @@ public: case VPInstruction::ActiveLaneMask: case VPInstruction::CanonicalIVIncrement: case VPInstruction::CanonicalIVIncrementNUW: + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: case VPInstruction::BranchOnCount: return true; }; @@ -1125,6 +1135,7 @@ public: /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *B) { return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC || + B->getVPDefID() == VPRecipeBase::VPActiveLaneMaskPHISC || B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || B->getVPDefID() == VPRecipeBase::VPReductionPHISC || B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC || @@ -1132,6 +1143,7 @@ public: } static inline bool classof(const VPValue *V) { return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC || + V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC || V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || V->getVPValueID() == VPValue::VPVReductionPHISC || V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC || @@ -1861,6 +1873,42 @@ public: } }; +/// A recipe for generating the active lane mask for the vector loop that is +/// used to predicate the vector operations. +/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +/// remove VPActiveLaneMaskPHIRecipe. +class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { + DebugLoc DL; + +public: + VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) + : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC, + VPActiveLaneMaskPHISC, nullptr, StartMask), + DL(DL) {} + + ~VPActiveLaneMaskPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPActiveLaneMaskPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVActiveLaneMaskPHISC; + } + + /// Generate the active lane mask phi of the vector loop. 
+ void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2656,6 +2704,10 @@ public: return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } + /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there + should be at most one. If there isn't one, then return nullptr. + VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); + void addLiveOut(PHINode *PN, VPValue *V); void clearLiveOuts() { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 92422b17457c..fdd901a4a70d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -26,13 +26,19 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include <cassert> using namespace llvm; +using VectorParts = SmallVector<Value *, 2>; + extern cl::opt<bool> EnableVPlanNativePath; +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPWidenMemoryInstructionSC: { @@ -186,7 +192,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); Value *B = State.get(getOperand(1), Part); - Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B); + Value *V = + Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); State.set(this, V, Part); return; } @@ -194,14 +201,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, switch (getOpcode()) { case VPInstruction::Not: { Value *A = State.get(getOperand(0), Part); - Value *V = Builder.CreateNot(A); + Value *V = Builder.CreateNot(A, Name); State.set(this, V, Part); break; } case VPInstruction::ICmpULE: { Value *IV = State.get(getOperand(0), Part); Value *TC = State.get(getOperand(1), Part); - Value *V = Builder.CreateICmpULE(IV, TC); + Value *V = Builder.CreateICmpULE(IV, TC, Name); State.set(this, V, Part); break; } @@ -209,7 +216,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, Value *Cond = State.get(getOperand(0), Part); Value *Op1 = State.get(getOperand(1), Part); Value *Op2 = State.get(getOperand(2), Part); - Value *V = Builder.CreateSelect(Cond, Op1, Op2); + Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name); State.set(this, V, Part); break; } @@ -223,7 +230,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, auto *PredTy = VectorType::get(Int1Ty, State.VF); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, - {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); + {VIVElem0, ScalarTC}, nullptr, Name); State.set(this, Call, Part); break; } @@ -247,7 +254,8 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, PartMinus1, Part); } else { Value *V2 = State.get(getOperand(1), Part); - State.set(this, Builder.CreateVectorSplice(PartMinus1,
V2, -1), Part); + State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name), + Part); } break; } @@ -261,7 +269,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, // elements) times the unroll factor (num of SIMD instructions). Value *Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); - Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false); + Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); } else { Next = State.get(this, 0); } @@ -269,6 +277,23 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, Next, Part); break; } + + case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::CanonicalIVIncrementForPartNUW: { + bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW; + auto *IV = State.get(getOperand(0), VPIteration(0, 0)); + if (Part == 0) { + State.set(this, IV, Part); + break; + } + + // The canonical IV is incremented by the vectorization factor (num of SIMD + // elements) times the unroll part. + Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part); + Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false); + State.set(this, Next, Part); + break; + } case VPInstruction::BranchOnCond: { if (Part != 0) break; @@ -370,6 +395,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; + case VPInstruction::CanonicalIVIncrementForPart: + O << "VF * Part + "; + break; + case VPInstruction::CanonicalIVIncrementForPartNUW: + O << "VF * Part +(nuw) "; + break; case VPInstruction::BranchOnCount: O << "branch-on-count "; break; @@ -431,7 +462,158 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(2)->printAsOperand(O, SlotTracker); O << (InvariantCond ? " (condition is loop invariant)" : ""); } +#endif + +void VPWidenSelectRecipe::execute(VPTransformState &State) { + auto &I = *cast<SelectInst>(getUnderlyingInstr()); + State.setDebugLocFromInst(&I); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + auto *InvarCond = + InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); + Value *Op0 = State.get(getOperand(1), Part); + Value *Op1 = State.get(getOperand(2), Part); + Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); + State.set(this, Sel, Part); + State.addMetadata(Sel, &I); + } +} + +void VPWidenRecipe::execute(VPTransformState &State) { + auto &I = *cast<Instruction>(getUnderlyingValue()); + auto &Builder = State.Builder; + switch (I.getOpcode()) { + case Instruction::Call: + case Instruction::Br: + case Instruction::PHI: + case Instruction::GetElementPtr: + case Instruction::Select: + llvm_unreachable("This instruction is handled by a different recipe."); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FNeg: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen unops and binops. + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + SmallVector<Value *, 2> Ops; + for (VPValue *VPOp : operands()) + Ops.push_back(State.get(VPOp, Part)); + + Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); + + if (auto *VecOp = dyn_cast<Instruction>(V)) { + VecOp->copyIRFlags(&I); + + // If the instruction is vectorized and was in a basic block that needed + // predication, we can't propagate poison-generating flags (nuw/nsw, + // exact, etc.). The control flow has been linearized and the + // instruction is no longer guarded by the predicate, which could make + // the flag properties to no longer hold. + if (State.MayGeneratePoisonRecipes.contains(this)) + VecOp->dropPoisonGeneratingFlags(); + } + + // Use this vector value for all users of the original instruction. + State.set(this, V, Part); + State.addMetadata(V, &I); + } + + break; + } + case Instruction::Freeze: { + State.setDebugLocFromInst(&I); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *Op = State.get(getOperand(0), Part); + + Value *Freeze = Builder.CreateFreeze(Op); + State.set(this, Freeze, Part); + } + break; + } + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (I.getOpcode() == Instruction::FCmp); + auto *Cmp = cast<CmpInst>(&I); + State.setDebugLocFromInst(Cmp); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *B = State.get(getOperand(1), Part); + Value *C = nullptr; + if (FCmp) { + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(Cmp->getFastMathFlags()); + C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + } else { + C = Builder.CreateICmp(Cmp->getPredicate(), A, B); + } + State.set(this, C, Part); + State.addMetadata(C, &I); + } + break; + } + + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + auto *CI = cast<CastInst>(&I); + State.setDebugLocFromInst(CI); + + /// Vectorize casts. + Type *DestTy = (State.VF.isScalar()) + ? 
CI->getType() + : VectorType::get(CI->getType(), State.VF); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *A = State.get(getOperand(0), Part); + Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); + State.set(this, Cast, Part); + State.addMetadata(Cast, &I); + } + break; + } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; @@ -487,7 +669,82 @@ void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "= SCALAR-STEPS "; printOperands(O, SlotTracker); } +#endif + +void VPWidenGEPRecipe::execute(VPTransformState &State) { + auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = State.Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); + State.set(this, EntryPart, Part); + State.addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < State.UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? State.get(getOperand(0), VPIteration(0, 0)) + : State.get(getOperand(0), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. 
+ SmallVector<Value *, 4> Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + if (IsIndexLoopInvariant[I - 1]) + Indices.push_back(State.get(Operand, VPIteration(0, 0))); + else + Indices.push_back(State.get(Operand, Part)); + } + + // If the GEP instruction is vectorized and was in a basic block that + // needed predication, we can't propagate the poison-generating 'inbounds' + // flag. The control flow has been linearized and the GEP is no longer + // guarded by the predicate, which could make the 'inbounds' properties to + // no longer hold. + bool IsInBounds = + GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, + Indices, "", IsInBounds); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP, Part); + State.addMetadata(NewGEP, GEP); + } + } +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-GEP "; @@ -501,7 +758,48 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, O << " = getelementptr "; printOperands(O, SlotTracker); } +#endif +void VPBlendRecipe::execute(VPTransformState &State) { + State.setDebugLocFromInst(Phi); + // We know that all PHIs in non-header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + + unsigned NumIncoming = getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // SELECT(Mask1, In1, + // In0))) + // Note that Mask0 is never used: lanes for which no path reaches this phi and + // are essentially undef are taken from In0. + VectorParts Entry(State.UF); + for (unsigned In = 0; In < NumIncoming; ++In) { + for (unsigned Part = 0; Part < State.UF; ++Part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + Value *In0 = State.get(getIncomingValue(In), Part); + if (In == 0) + Entry[Part] = In0; // Initialize with the first incoming value. + else { + // Select between the current value and the previous incoming edge + // based on the incoming mask. 
+ Value *Cond = State.get(getMask(In), Part); + Entry[Part] = + State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); + } + } + } + for (unsigned Part = 0; Part < State.UF; ++Part) + State.set(this, Entry[Part], Part); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "BLEND "; @@ -566,7 +864,35 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, if (AlsoPack) O << " (S->V)"; } +#endif +void VPBranchOnMaskRecipe::execute(VPTransformState &State) { + assert(State.Instance && "Branch on Mask works only on single instance."); + + unsigned Part = State.Instance->Part; + unsigned Lane = State.Instance->Lane.getKnownLane(); + + Value *ConditionBit = nullptr; + VPValue *BlockInMask = getMask(); + if (BlockInMask) { + ConditionBit = State.get(BlockInMask, Part); + if (ConditionBit->getType()->isVectorTy()) + ConditionBit = State.Builder.CreateExtractElement( + ConditionBit, State.Builder.getInt32(Lane)); + } else // Block in mask is all-one. + ConditionBit = State.Builder.getTrue(); + + // Replace the temporary unreachable terminator with a new conditional branch, + // whose two destinations will be set later when they are created. + auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); + assert(isa<UnreachableInst>(CurrentTerminator) && + "Expected to replace unreachable terminator with conditional branch."); + auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); + CondBr->setSuccessor(0, nullptr); + ReplaceInstWithInst(CurrentTerminator, CondBr); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "PHI-PREDICATED-INSTRUCTION "; @@ -838,3 +1164,28 @@ void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +// TODO: It would be good to use the existing VPWidenPHIRecipe instead and +// remove VPActiveLaneMaskPHIRecipe. +void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *StartMask = State.get(getOperand(0), Part); + PHINode *EntryPart = + State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); + EntryPart->addIncoming(StartMask, VectorPH); + EntryPart->setDebugLoc(DL); + State.set(this, EntryPart, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "ACTIVE-LANE-MASK-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5fc676834331..c99fae1b2ab4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -103,6 +103,7 @@ public: // Phi-like VPValues. Need to be kept together. VPVBlendSC, VPVCanonicalIVPHISC, + VPVActiveLaneMaskPHISC, VPVFirstOrderRecurrencePHISC, VPVWidenPHISC, VPVWidenIntOrFpInductionSC, @@ -358,6 +359,7 @@ public: // Phi-like recipes. Need to be kept together. 
VPBlendSC, VPCanonicalIVPHISC, + VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, VPWidenPHISC, VPWidenIntOrFpInductionSC, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index f917883145c0..3501de6ab38e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -133,32 +133,48 @@ void VPlanVerifier::verifyHierarchicalCFG( verifyRegionRec(TopRegion); } -bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { - auto Iter = depth_first( - VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); - for (const VPBasicBlock *VPBB : - VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { - // Verify that phi-like recipes are at the beginning of the block, with no - // other recipes in between. - auto RecipeI = VPBB->begin(); - auto End = VPBB->end(); - while (RecipeI != End && RecipeI->isPhi()) - RecipeI++; +static bool verifyVPBasicBlock(const VPBasicBlock *VPBB) { + // Verify that phi-like recipes are at the beginning of the block, with no + // other recipes in between. + auto RecipeI = VPBB->begin(); + auto End = VPBB->end(); + unsigned NumActiveLaneMaskPhiRecipes = 0; + while (RecipeI != End && RecipeI->isPhi()) { + if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI)) + NumActiveLaneMaskPhiRecipes++; + RecipeI++; + } - while (RecipeI != End) { - if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) { - errs() << "Found phi-like recipe after non-phi recipe"; + if (NumActiveLaneMaskPhiRecipes > 1) { + errs() << "There should be no more than one VPActiveLaneMaskPHIRecipe"; + return false; + } + + while (RecipeI != End) { + if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) { + errs() << "Found phi-like recipe after non-phi recipe"; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - errs() << ": "; - RecipeI->dump(); - errs() << "after\n"; - std::prev(RecipeI)->dump(); + errs() << ": "; + RecipeI->dump(); + errs() << "after\n"; + std::prev(RecipeI)->dump(); #endif - return false; - } - RecipeI++; + return false; } + RecipeI++; + } + + return true; +} + +bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry())); + for (const VPBasicBlock *VPBB : + VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) { + if (!verifyVPBasicBlock(VPBB)) + return false; } const VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); @@ -181,15 +197,16 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { } if (Exiting->empty()) { - errs() << "VPlan vector loop exiting block must end with BranchOnCount " - "VPInstruction but is empty\n"; + errs() << "VPlan vector loop exiting block must end with BranchOnCount or " + "BranchOnCond VPInstruction but is empty\n"; return false; } auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end())); - if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) { - errs() << "VPlan vector loop exit must end with BranchOnCount " - "VPInstruction\n"; + if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && + LastInst->getOpcode() != VPInstruction::BranchOnCond)) { + errs() << "VPlan vector loop exit must end with BranchOnCount or " + "BranchOnCond VPInstruction\n"; return false; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp 
b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 90598937affc..d12624ffb824 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -414,6 +414,10 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilder<> &Builder) { + // Shufflevectors can only be created for fixed-width vectors. + if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType())) + return nullptr; + // If the extract can be constant-folded, this code is unsimplified. Defer // to other passes to handle that. Value *X = ExtElt->getVectorOperand(); @@ -1249,14 +1253,20 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || VT != Op0->getType()) return false; - auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0)); - auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1)); - auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0)); - auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1)); + auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); + auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); + auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); + auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); + SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); auto checkSVNonOpUses = [&](Instruction *I) { if (!I || I->getOperand(0)->getType() != VT) return true; - return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; }); + return any_of(I->users(), [&](User *U) { + return U != Op0 && U != Op1 && + !(isa<ShuffleVectorInst>(U) && + (InputShuffles.contains(cast<Instruction>(U)) || + isInstructionTriviallyDead(cast<Instruction>(U)))); + }); }; if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) @@ -1271,6 +1281,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { auto *SV = dyn_cast<ShuffleVectorInst>(U); if (!SV || SV->getType() != VT) return false; + if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || + (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) + return false; if (!llvm::is_contained(Shuffles, SV)) Shuffles.push_back(SV); } @@ -1283,13 +1296,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { if (FromReduction && Shuffles.size() > 1) return false; + // Add any shuffle uses for the shuffles we have found, to include them in our + // cost calculations. + if (!FromReduction) { + for (ShuffleVectorInst *SV : Shuffles) { + for (auto U : SV->users()) { + ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); + if (SSV && isa<UndefValue>(SSV->getOperand(1))) + Shuffles.push_back(SSV); + } + } + } + // For each of the output shuffles, we try to sort all the first vector // elements to the beginning, followed by the second array elements at the // end. If the binops are legalized to smaller vectors, this may reduce total // number of binops. We compute the ReconstructMask mask needed to convert // back to the original lane order. 
- SmallVector<int> V1, V2; - SmallVector<SmallVector<int>> ReconstructMasks; + SmallVector<std::pair<int, int>> V1, V2; + SmallVector<SmallVector<int>> OrigReconstructMasks; int MaxV1Elt = 0, MaxV2Elt = 0; unsigned NumElts = VT->getNumElements(); for (ShuffleVectorInst *SVN : Shuffles) { @@ -1300,6 +1325,16 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // case we need to commute the mask). Value *SVOp0 = SVN->getOperand(0); Value *SVOp1 = SVN->getOperand(1); + if (isa<UndefValue>(SVOp1)) { + auto *SSV = cast<ShuffleVectorInst>(SVOp0); + SVOp0 = SSV->getOperand(0); + SVOp1 = SSV->getOperand(1); + for (unsigned I = 0, E = Mask.size(); I != E; I++) { + if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size())) + return false; + Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]); + } + } if (SVOp0 == Op1 && SVOp1 == Op0) { std::swap(SVOp0, SVOp1); ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); @@ -1316,21 +1351,25 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { ReconstructMask.push_back(-1); } else if (Mask[I] < static_cast<int>(NumElts)) { MaxV1Elt = std::max(MaxV1Elt, Mask[I]); - auto It = find(V1, Mask[I]); + auto It = find_if(V1, [&](const std::pair<int, int> &A) { + return Mask[I] == A.first; + }); if (It != V1.end()) ReconstructMask.push_back(It - V1.begin()); else { ReconstructMask.push_back(V1.size()); - V1.push_back(Mask[I]); + V1.emplace_back(Mask[I], V1.size()); } } else { MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts); - auto It = find(V2, Mask[I] - NumElts); + auto It = find_if(V2, [&](const std::pair<int, int> &A) { + return Mask[I] - static_cast<int>(NumElts) == A.first; + }); if (It != V2.end()) ReconstructMask.push_back(NumElts + It - V2.begin()); else { ReconstructMask.push_back(NumElts + V2.size()); - V2.push_back(Mask[I] - NumElts); + V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size()); } } } @@ -1339,7 +1378,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { // result. In-order can help simplify the shuffle away. if (FromReduction) sort(ReconstructMask); - ReconstructMasks.push_back(ReconstructMask); + OrigReconstructMasks.push_back(std::move(ReconstructMask)); } // If the maximum elements used from V1 and V2 are not larger than the new @@ -1351,16 +1390,68 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { MaxV2Elt == static_cast<int>(V2.size()) - 1)) return false; + // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a + // shuffle of another shuffle, or not a shuffle (that is treated like an + // identity shuffle). + auto GetBaseMaskValue = [&](Instruction *I, int M) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return M; + if (isa<UndefValue>(SV->getOperand(1))) + if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) + if (InputShuffles.contains(SSV)) + return SSV->getMaskValue(SV->getMaskValue(M)); + return SV->getMaskValue(M); + }; + + // Attempt to sort the inputs by ascending mask values to make simpler input + // shuffles and push complex shuffles down to the uses. We sort on the first + // of the two input shuffle orders, to try and get at least one input into a + // nice order.
+ auto SortBase = [&](Instruction *A, std::pair<int, int> X, + std::pair<int, int> Y) { + int MXA = GetBaseMaskValue(A, X.first); + int MYA = GetBaseMaskValue(A, Y.first); + return MXA < MYA; + }; + stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) { + return SortBase(SVI0A, A, B); + }); + stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) { + return SortBase(SVI1A, A, B); + }); + // Calculate our ReconstructMasks from the OrigReconstructMasks and the + // modified order of the input shuffles. + SmallVector<SmallVector<int>> ReconstructMasks; + for (auto Mask : OrigReconstructMasks) { + SmallVector<int> ReconstructMask; + for (int M : Mask) { + auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { + auto It = find_if(V, [M](auto A) { return A.second == M; }); + assert(It != V.end() && "Expected all entries in Mask"); + return std::distance(V.begin(), It); + }; + if (M < 0) + ReconstructMask.push_back(-1); + else if (M < static_cast<int>(NumElts)) { + ReconstructMask.push_back(FindIndex(V1, M)); + } else { + ReconstructMask.push_back(NumElts + FindIndex(V2, M)); + } + } + ReconstructMasks.push_back(std::move(ReconstructMask)); + } + // Calculate the masks needed for the new input shuffles, which get padded // with undef SmallVector<int> V1A, V1B, V2A, V2B; for (unsigned I = 0; I < V1.size(); I++) { - V1A.push_back(SVI0A->getMaskValue(V1[I])); - V1B.push_back(SVI0B->getMaskValue(V1[I])); + V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); + V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); } for (unsigned I = 0; I < V2.size(); I++) { - V2A.push_back(SVI1A->getMaskValue(V2[I])); - V2B.push_back(SVI1B->getMaskValue(V2[I])); + V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); + V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); } while (V1A.size() < NumElts) { V1A.push_back(UndefMaskElem); @@ -1371,9 +1462,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { V2B.push_back(UndefMaskElem); } - auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) { - return C + - TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask()); + auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return C; + return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) + ? TTI::SK_PermuteSingleSrc + : TTI::SK_PermuteTwoSrc, + VT, SV->getShuffleMask()); }; auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); @@ -1386,9 +1482,6 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), InstructionCost(0), AddShuffleCost); - // This set helps us only cost each unique shuffle once. 
- SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles( - {SVI0A, SVI0B, SVI1A, SVI1B}); CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), InstructionCost(0), AddShuffleCost); @@ -1408,22 +1501,35 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), InstructionCost(0), AddShuffleMaskCost); + LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); + LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore + << " vs CostAfter: " << CostAfter << "\n"); if (CostBefore <= CostAfter) return false; // The cost model has passed, create the new instructions. - Builder.SetInsertPoint(SVI0A); - Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0), - SVI0A->getOperand(1), V1A); - Builder.SetInsertPoint(SVI0B); - Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0), - SVI0B->getOperand(1), V1B); - Builder.SetInsertPoint(SVI1A); - Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0), - SVI1A->getOperand(1), V2A); - Builder.SetInsertPoint(SVI1B); - Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0), - SVI1B->getOperand(1), V2B); + auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { + auto *SV = dyn_cast<ShuffleVectorInst>(I); + if (!SV) + return I; + if (isa<UndefValue>(SV->getOperand(1))) + if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) + if (InputShuffles.contains(SSV)) + return SSV->getOperand(Op); + return SV->getOperand(Op); + }; + Builder.SetInsertPoint(SVI0A->getNextNode()); + Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), + GetShuffleOperand(SVI0A, 1), V1A); + Builder.SetInsertPoint(SVI0B->getNextNode()); + Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), + GetShuffleOperand(SVI0B, 1), V1B); + Builder.SetInsertPoint(SVI1A->getNextNode()); + Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), + GetShuffleOperand(SVI1A, 1), V2A); + Builder.SetInsertPoint(SVI1B->getNextNode()); + Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), + GetShuffleOperand(SVI1B, 1), V2B); Builder.SetInsertPoint(Op0); Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), NSV0A, NSV0B);
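For readers following the active-lane-mask changes above (the ActiveLaneMask VPInstruction and the new VPActiveLaneMaskPHIRecipe): lane I of llvm.get.active.lane.mask(Base, TC) is set while Base + I is below the trip count, which is what lets the final vector iteration run predicated instead of requiring a scalar epilogue. A minimal scalar sketch of that semantics, assuming a fixed VF; activeLaneMask is an illustrative helper, not an LLVM API.

#include <cstdio>
#include <vector>

// Lane I of get.active.lane.mask(Base, TC) is true while Base + I < TC.
std::vector<bool> activeLaneMask(unsigned Base, unsigned TC, unsigned VF) {
  std::vector<bool> Mask(VF);
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = Base + I < TC;
  return Mask;
}

int main() {
  // TC=6, VF=4: the second vector iteration runs with mask <1,1,0,0>, so the
  // two tail elements are masked off rather than handled by a scalar loop.
  for (bool B : activeLaneMask(/*Base=*/4, /*TC=*/6, /*VF=*/4))
    std::printf("%d", B); // prints 1100
  std::printf("\n");
}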
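The prepareToExecute hunk above generalizes an existing fold: with a constant trip count no larger than VF * UF, both BranchOnCount and a BranchOnCond fed by Not(ActiveLaneMask) always exit after the first vector iteration, so the terminator can become 'BranchOnCond true'. A sketch of just that guard; canFoldLatchBranch is hypothetical, and a scalable VF would use its known minimum value here.

#include <cstdint>
#include <cstdio>

// With a compile-time trip count TC, the latch branch always exits after the
// first vector iteration whenever 0 < TC <= VF * UF.
bool canFoldLatchBranch(uint64_t TC, unsigned VF, unsigned UF) {
  return TC != 0 && TC <= static_cast<uint64_t>(VF) * UF;
}

int main() {
  std::printf("%d\n", canFoldLatchBranch(7, 4, 2)); // 1: 7 <= 4*2, one iteration
  std::printf("%d\n", canFoldLatchBranch(9, 4, 2)); // 0: a second iteration is needed
}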
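CanonicalIVIncrementForPart, added above, materializes a separate IV value per unrolled part rather than one shared increment: part 0 sees the unincremented IV and part K sees it offset by K vector widths (the patch uses createStepForVF, which also handles scalable factors; the sketch below assumes a fixed VF, and ivPerPart is illustrative only).

#include <cstdio>
#include <vector>

// The IV value each unrolled part sees: part K is offset by K vector widths.
std::vector<unsigned> ivPerPart(unsigned IV, unsigned VF, unsigned UF) {
  std::vector<unsigned> Parts(UF);
  for (unsigned Part = 0; Part < UF; ++Part)
    Parts[Part] = IV + VF * Part; // Part 0 is the IV itself.
  return Parts;
}

int main() {
  // VF=4, UF=2: part 0 starts at the IV, part 1 one vector width later.
  for (unsigned V : ivPerPart(/*IV=*/8, /*VF=*/4, /*UF=*/2))
    std::printf("%u ", V); // prints: 8 12
  std::printf("\n");
}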
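VPBlendRecipe::execute, moved into VPlanRecipes.cpp above, emits exactly the nested select chain its comment describes. A single-lane scalar model of that chain, with blendLane as a hypothetical stand-in; note that Mask0 is never consulted, matching the recipe.

#include <cassert>
#include <cstdio>
#include <vector>

// Folds the incoming values of one blended phi for a single lane:
//   SELECT(Mask3, In3, SELECT(Mask2, In2, SELECT(Mask1, In1, In0)))
// A lane reached by no masked edge keeps In0.
int blendLane(const std::vector<int> &Incoming, const std::vector<bool> &Masks) {
  assert(Incoming.size() == Masks.size() && !Incoming.empty());
  int Entry = Incoming[0]; // Initialize with the first incoming value.
  for (size_t In = 1; In < Incoming.size(); ++In)
    Entry = Masks[In] ? Incoming[In] : Entry; // select(Mask[In], In[In], Entry)
  return Entry;
}

int main() {
  // Three incoming edges; only the mask of edge 2 is set, so its value wins.
  std::printf("%d\n", blendLane({10, 20, 30}, {false, false, true})); // 30
}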
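Finally, the foldSelectShuffle rewrite above turns V1/V2 into (mask element, assigned position) pairs so the inputs can be stable-sorted by their base mask values while the reconstruction masks are recomputed by looking positions up through the pairs' second members. A reduced, self-contained model of that reindexing; the values are made up.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // V1 pairs up (source mask element, position handed out when first seen).
  std::vector<std::pair<int, int>> V1 = {{5, 0}, {1, 1}, {3, 2}};

  // Sort the input by ascending mask value, giving a simpler input shuffle...
  std::stable_sort(V1.begin(), V1.end(),
                   [](const auto &A, const auto &B) { return A.first < B.first; });

  // ...then rebuild the reconstruction mask by finding where each originally
  // handed-out position moved to, as FindIndex does in the patch.
  auto FindIndex = [&](int M) {
    auto It = std::find_if(V1.begin(), V1.end(),
                           [M](const auto &A) { return A.second == M; });
    assert(It != V1.end() && "Expected all entries in Mask");
    return static_cast<int>(std::distance(V1.begin(), It));
  };
  for (int M : {0, 1, 2})
    std::printf("%d ", FindIndex(M)); // prints: 2 0 1
  std::printf("\n");
}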