| author | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:03:47 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:04:23 +0000 |
| commit | 7fa27ce4a07f19b07799a767fc29416f3b625afb (patch) | |
| tree | 27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | |
| parent | e3b557809604d036af6e00c60f012c2025b59a5e (diff) | |
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2346 |
1 file changed, 1181 insertions, 1165 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a28099d8ba7d..d7e40e8ef978 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -98,6 +98,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" @@ -120,8 +121,6 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -231,6 +230,25 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( "prefers tail-folding, don't attempt vectorization if " "tail-folding fails."))); +static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( + "force-tail-folding-style", cl::desc("Force the tail folding style"), + cl::init(TailFoldingStyle::None), + cl::values( + clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), + clEnumValN( + TailFoldingStyle::Data, "data", + "Create lane mask for data only, using active.lane.mask intrinsic"), + clEnumValN(TailFoldingStyle::DataWithoutLaneMask, + "data-without-lane-mask", + "Create lane mask with compare/stepvector"), + clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", + "Create lane mask using active.lane.mask intrinsic, and use " + "it for both data and control flow"), + clEnumValN( + TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, + "data-and-control-without-rt-check", + "Similar to data-and-control, but remove the runtime check"))); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -338,10 +356,12 @@ static cl::opt<bool> PreferPredicatedReductionSelect( cl::desc( "Prefer predicating a reduction operation over an after loop select.")); +namespace llvm { cl::opt<bool> EnableVPlanNativePath( - "enable-vplan-native-path", cl::init(false), cl::Hidden, + "enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); +} // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with @@ -419,9 +439,42 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, return std::nullopt; } +/// Return a vector containing interleaved elements from multiple +/// smaller input vectors. +static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, + const Twine &Name) { + unsigned Factor = Vals.size(); + assert(Factor > 1 && "Tried to interleave invalid number of vectors"); + + VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); +#ifndef NDEBUG + for (Value *Val : Vals) + assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); +#endif + + // Scalable vectors cannot use arbitrary shufflevectors (only splats), so + // must use intrinsics to interleave. + if (VecTy->isScalableTy()) { + VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); + return Builder.CreateIntrinsic( + WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, + /*FMFSource=*/nullptr, Name); + } + + // Fixed length. 
Start by concatenating all vectors into a wide vector. + Value *WideVec = concatenateVectors(Builder, Vals); + + // Interleave the elements into the wide vector. + const unsigned NumElts = VecTy->getElementCount().getFixedValue(); + return Builder.CreateShuffleVector( + WideVec, createInterleaveMask(NumElts, Factor), Name); +} + namespace { // Forward declare GeneratedRTChecks. class GeneratedRTChecks; + +using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; } // namespace namespace llvm { @@ -477,8 +530,10 @@ public: /// loop and the start value for the canonical induction, if it is != 0. The /// latter is the case when vectorizing the epilogue loop. In the case of /// epilogue vectorization, this function is overriden to handle the more - /// complex control flow around the loops. - virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(); + /// complex control flow around the loops. \p ExpandedSCEVs is used to + /// look up SCEV expansions for expressions needed during skeleton creation. + virtual std::pair<BasicBlock *, Value *> + createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -498,7 +553,7 @@ public: /// Instr's operands. void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, - const VPIteration &Instance, bool IfPredicateInstr, + const VPIteration &Instance, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -513,7 +568,7 @@ public: ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask = nullptr); + VPValue *BlockInMask, bool NeedsMaskForGaps); /// Fix the non-induction PHIs in \p Plan. void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); @@ -522,28 +577,30 @@ public: /// able to vectorize with strict in-order reductions for the given RdxDesc. bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); - /// Create a broadcast instruction. This method generates a broadcast - /// instruction (shuffle) for loop invariant values and for the induction - /// value. If this is the induction variable then we extend it to N, N+1, ... - /// this is needed because each iteration in the loop corresponds to a SIMD - /// element. - virtual Value *getBroadcastInstrs(Value *V); - // Returns the resume value (bc.merge.rdx) for a reduction as // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop - /// left off. In cases where the loop skeleton is more complicated (eg. - /// epilogue vectorization) and the resume values can come from an additional - /// bypass block, the \p AdditionalBypass pair provides information about the - /// bypass block and the end value on the edge from bypass to this loop. + /// left off. \p Step is the SCEV-expanded induction step to use. In cases + /// where the loop skeleton is more complicated (i.e., epilogue vectorization) + /// and the resume values can come from an additional bypass block, the \p + /// AdditionalBypass pair provides information about the bypass block and the + /// end value on the edge from bypass to this loop. 
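For the fixed-length path of the new interleaveVectors helper above, the interleaving is a single shuffle over the concatenated inputs. A standalone sketch of that mask pattern (makeInterleaveMask is a hypothetical stand-in approximating createInterleaveMask, not LLVM API):

```c++
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the interleave mask: result element i selects lane
// (i % Factor) * VF + (i / Factor) of the concatenated wide vector, i.e.
// <0, VF, 1, VF+1, 2, VF+2, ...> for Factor == 2.
static std::vector<int> makeInterleaveMask(unsigned VF, unsigned Factor) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(J * VF + I);
  return Mask;
}

int main() {
  // Two 4-element "vectors" A = {0..3}, B = {100..103}; Wide is A ++ B.
  std::vector<int> Wide = {0, 1, 2, 3, 100, 101, 102, 103};
  for (int Idx : makeInterleaveMask(/*VF=*/4, /*Factor=*/2))
    std::printf("%d ", Wide[Idx]); // 0 100 1 101 2 102 3 103
  std::printf("\n");
}
```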
PHINode *createInductionResumeValue( - PHINode *OrigPhi, const InductionDescriptor &ID, + PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, ArrayRef<BasicBlock *> BypassBlocks, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); + /// Returns the original loop trip count. + Value *getTripCount() const { return TripCount; } + + /// Used to set the trip count after ILV's construction and after the + /// preheader block has been executed. Note that this always holds the trip + /// count of the original loop for both main loop and epilogue vectorization. + void setTripCount(Value *TC) { TripCount = TC; } + protected: friend class LoopVectorizationPlanner; @@ -560,7 +617,7 @@ protected: void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, BasicBlock *VectorHeader, - VPlan &Plan); + VPlan &Plan, VPTransformState &State); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(VPTransformState &State); @@ -573,10 +630,6 @@ protected: /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); - /// Clear NSW/NUW flags from reduction instructions if necessary. - void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, - VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -585,9 +638,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(VPTransformState &State); - /// Returns (and creates if needed) the original loop trip count. - Value *getOrCreateTripCount(BasicBlock *InsertBlock); - /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); @@ -621,6 +671,7 @@ protected: /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( + const SCEV2ValueTy &ExpandedSCEVs, std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate @@ -758,9 +809,6 @@ public: ElementCount::getFixed(1), ElementCount::getFixed(1), UnrollFactor, LVL, CM, BFI, PSI, Check) {} - -private: - Value *getBroadcastInstrs(Value *V) override; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. @@ -810,15 +858,16 @@ public: // Override this function to handle the more complex control flow around the // three loops. - std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); + std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) final { + return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. virtual std::pair<BasicBlock *, Value *> - createEpilogueVectorizedLoopSkeleton() = 0; + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. 
This setup helps us avoid @@ -846,7 +895,8 @@ public: EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -876,7 +926,8 @@ public: } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -953,35 +1004,21 @@ namespace llvm { Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step) { assert(Ty->isIntegerTy() && "Expected an integer step"); - Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); - return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; + return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); } /// Return the runtime value for VF. Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { - Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); - return VF.isScalable() ? B.CreateVScale(EC) : EC; + return B.CreateElementCount(Ty, VF); } -const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) { +const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, + Loop *OrigLoop) { const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); ScalarEvolution &SE = *PSE.getSE(); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) > - IdxTy->getPrimitiveSizeInBits()) - BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy); - BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); - - // Get the total trip count from the count by adding 1. - return SE.getAddExpr(BackedgeTakenCount, - SE.getOne(BackedgeTakenCount->getType())); + return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); } static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, @@ -1062,11 +1099,17 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( continue; // This recipe contributes to the address computation of a widen - // load/store. Collect recipe if its underlying instruction has - // poison-generating flags. - Instruction *Instr = CurRec->getUnderlyingInstr(); - if (Instr && Instr->hasPoisonGeneratingFlags()) - State.MayGeneratePoisonRecipes.insert(CurRec); + // load/store. If the underlying instruction has poison-generating flags, + // drop them directly. 
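The replaced block above computed the trip count by hand from the backedge-taken count before delegating to SE.getTripCountFromExitCount. A minimal scalar sketch of that arithmetic, not taken from the patch, assuming a 32-bit induction type:

```c++
#include <cstdint>
#include <cstdio>

// The trip count is the backedge-taken count plus one, evaluated in the
// (possibly narrower) induction type. Truncating to 32 bits is only legal when
// the analysis has already proven the count fits, as the deleted comment above
// explains for sign-extended induction variables.
static uint32_t tripCountFromBackedgeTakenCount(uint64_t BackedgeTakenCount) {
  return static_cast<uint32_t>(BackedgeTakenCount) + 1u;
}

int main() {
  // A backedge taken 99 times means the loop body runs 100 times.
  std::printf("%u\n", (unsigned)tripCountFromBackedgeTakenCount(99)); // 100
}
```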
+ if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { + RecWithFlags->dropPoisonGeneratingFlags(); + } else { + Instruction *Instr = CurRec->getUnderlyingInstr(); + (void)Instr; + assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && + "found instruction with poison generating flags not covered by " + "VPRecipeWithIRFlags"); + } // Add new definitions to the worklist. for (VPValue *operand : CurRec->operands()) @@ -1143,15 +1186,7 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueNotAllowedUsePredicate }; -/// ElementCountComparator creates a total ordering for ElementCount -/// for the purposes of using it in a set structure. -struct ElementCountComparator { - bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { - return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < - std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); - } -}; -using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; +using InstructionVFPair = std::pair<Instruction *, ElementCount>; /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. @@ -1184,17 +1219,6 @@ public: /// otherwise. bool runtimeChecksRequired(); - /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO - /// then this vectorization factor will be selected if vectorization is - /// possible. - VectorizationFactor - selectVectorizationFactor(const ElementCountSet &CandidateVFs); - - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); - /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. bool selectUserVectorizationFactor(ElementCount UserVF) { @@ -1278,11 +1302,17 @@ public: auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); - return Scalars->second.find(I) != Scalars->second.end(); + return Scalars->second.contains(I); } /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + // Pseudo probe needs to be duplicated for each unrolled iteration and + // vector lane so that profiled loop trip count can be accurately + // accumulated instead of being under counted. + if (isa<PseudoProbeInst>(I)) + return false; + if (VF.isScalar()) return true; @@ -1316,7 +1346,7 @@ public: /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { - return VF.isVector() && MinBWs.find(I) != MinBWs.end() && + return VF.isVector() && MinBWs.contains(I) && !isProfitableToScalarize(I, VF) && !isScalarAfterVectorization(I, VF); } @@ -1379,7 +1409,7 @@ public: InstructionCost getWideningCost(Instruction *I, ElementCount VF) { assert(VF.isVector() && "Expected VF >=2"); std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); - assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && + assert(WideningDecisions.contains(InstOnVF) && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; } @@ -1419,7 +1449,7 @@ public: /// that may be vectorized as interleave, gather-scatter or scalarized. void collectUniformsAndScalars(ElementCount VF) { // Do the analysis once. 
- if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || Uniforms.contains(VF)) return; setCostBasedWideningDecision(VF); collectLoopUniforms(VF); @@ -1442,8 +1472,7 @@ public: /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. - bool isLegalGatherOrScatter(Value *V, - ElementCount VF = ElementCount::getFixed(1)) { + bool isLegalGatherOrScatter(Value *V, ElementCount VF) { bool LI = isa<LoadInst>(V); bool SI = isa<StoreInst>(V); if (!LI && !SI) @@ -1522,14 +1551,29 @@ public: /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. - bool requiresScalarEpilogue(ElementCount VF) const { + bool requiresScalarEpilogue(bool IsVectorizing) const { if (!isScalarEpilogueAllowed()) return false; // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return true; - return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); + return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); + } + + /// Returns true if we're required to use a scalar epilogue for at least + /// the final iteration of the original loop for all VFs in \p Range. + /// A scalar epilogue must either be required for all VFs in \p Range or for + /// none. + bool requiresScalarEpilogue(VFRange Range) const { + auto RequiresScalarEpilogue = [this](ElementCount VF) { + return requiresScalarEpilogue(VF.isVector()); + }; + bool IsRequired = all_of(Range, RequiresScalarEpilogue); + assert( + (IsRequired || none_of(Range, RequiresScalarEpilogue)) && + "all VFs in range must agree on whether a scalar epilogue is required"); + return IsRequired; } /// Returns true if a scalar epilogue is not allowed due to optsize or a @@ -1538,14 +1582,21 @@ public: return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } - /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { return FoldTailByMasking; } + /// Returns the TailFoldingStyle that is best for the current loop. + TailFoldingStyle + getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { + if (!CanFoldTailByMasking) + return TailFoldingStyle::None; + + if (ForceTailFoldingStyle.getNumOccurrences()) + return ForceTailFoldingStyle; + + return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); + } - /// Returns true if were tail-folding and want to use the active lane mask - /// for vector loop control flow. - bool useActiveLaneMaskForControlFlow() const { - return FoldTailByMasking && - TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + /// Returns true if all loop blocks should be masked to fold tail loop. + bool foldTailByMasking() const { + return getTailFoldingStyle() != TailFoldingStyle::None; } /// Returns true if the instructions in this block requires predication @@ -1582,12 +1633,8 @@ public: /// scalarized - /// i.e. either vector version isn't available, or is too expensive. InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) const; - - /// Returns true if the per-lane cost of VectorizationFactor A is lower than - /// that of B. - bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B) const; + Function **Variant, + bool *NeedsMask = nullptr) const; /// Invalidates decisions already taken by the cost model. 
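As a rough illustration of what the tail-folding styles introduced above buy: with tail folding the vector loop itself covers the final partial group of iterations, using a per-lane predicate instead of a scalar remainder loop. A scalar model of the "data"-style lane mask (all names hypothetical, not from the patch):

```c++
#include <cstdio>

// Scalar model of data-style tail folding: every "vector" iteration advances a
// full VF-wide step, and a lane mask (base + lane < trip count) disables the
// lanes past the end, playing the role of the active.lane.mask intrinsic.
int main() {
  const int TC = 10; // trip count of the original loop
  const int VF = 4;  // vectorization factor
  int Sum = 0;
  for (int Base = 0; Base < TC; Base += VF) {   // vector loop, no scalar tail
    for (int Lane = 0; Lane < VF; ++Lane) {
      bool Active = Base + Lane < TC;           // per-lane predicate
      if (Active)
        Sum += Base + Lane;                     // masked "vector" operation
    }
  }
  std::printf("%d\n", Sum); // 45, same result as the scalar loop over 0..9
}
```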
void invalidateCostModelingDecisions() { @@ -1596,10 +1643,29 @@ public: Scalars.clear(); } - /// Convenience function that returns the value of vscale_range iff - /// vscale_range.min == vscale_range.max or otherwise returns the value - /// returned by the corresponding TLI method. - std::optional<unsigned> getVScaleForTuning() const; + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair<InstructionCost, bool>; + + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); + + bool hasPredStores() const { return NumPredStores > 0; } + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; private: unsigned NumPredStores = 0; @@ -1626,24 +1692,6 @@ private: /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair<InstructionCost, bool>; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - using InstructionVFPair = std::pair<Instruction *, ElementCount>; - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); @@ -1715,7 +1763,7 @@ private: ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool FoldTailByMasking = false; + bool CanFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -1796,8 +1844,7 @@ private: // the scalars are collected. That should be a safe assumption in most // cases, because we check if the operands have vectorizable types // beforehand in LoopVectorizationLegality. 
- return Scalars.find(VF) == Scalars.end() || - !isScalarAfterVectorization(I, VF); + return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); }; /// Returns a range containing only operands needing to be extracted. @@ -1807,16 +1854,6 @@ private: Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -1862,9 +1899,6 @@ public: /// All element types found in the loop. SmallPtrSet<Type *, 16> ElementTypesInLoop; - - /// Profitable vector factors. - SmallVector<VectorizationFactor, 8> ProfitableVFs; }; } // end namespace llvm @@ -2135,6 +2169,17 @@ public: }; } // namespace +static bool useActiveLaneMask(TailFoldingStyle Style) { + return Style == TailFoldingStyle::Data || + Style == TailFoldingStyle::DataAndControlFlow || + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; +} + +static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { + return Style == TailFoldingStyle::DataAndControlFlow || + Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; +} + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2202,97 +2247,11 @@ static void collectSupportedLoops(Loop &L, LoopInfo *LI, collectSupportedLoops(*InnerL, LI, ORE, V); } -namespace { - -/// The LoopVectorize Pass. -struct LoopVectorize : public FunctionPass { - /// Pass identification, replacement for typeid - static char ID; - - LoopVectorizePass Impl; - - explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, - bool VectorizeOnlyWhenForced = false) - : FunctionPass(ID), - Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { - initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; - - auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; - auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs(); - auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); - auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - - return Impl - .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI) - .MadeAnyChange; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<BlockFrequencyInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<LoopAccessLegacyAnalysis>(); - AU.addRequired<DemandedBitsWrapperPass>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.addRequired<InjectTLIMappingsLegacy>(); - - // We currently do not preserve loopinfo/dominator analyses with outer loop - // vectorization. Until this is addressed, mark these analyses as preserved - // only for non-VPlan-native path. - // TODO: Preserve Loop and Dominator analyses for VPlan-native path. - if (!EnableVPlanNativePath) { - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - } - - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addRequired<ProfileSummaryInfoWrapperPass>(); - } -}; - -} // end anonymous namespace - //===----------------------------------------------------------------------===// // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and // LoopVectorizationCostModel and LoopVectorizationPlanner. //===----------------------------------------------------------------------===// -Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // We need to place the broadcast of invariant variables outside the loop, - // but only if it's proven safe to do so. Else, broadcast will be inside - // vector loop body. - Instruction *Instr = dyn_cast<Instruction>(V); - bool SafeToHoist = OrigLoop->isLoopInvariant(V) && - (!Instr || - DT->dominates(Instr->getParent(), LoopVectorPreHeader)); - // Place the code for broadcasting invariant variables in the new preheader. - IRBuilder<>::InsertPointGuard Guard(Builder); - if (SafeToHoist) - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - - // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - - return Shuf; -} - /// This function adds /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) /// to each vector element of Val. The sequence starts at StartIndex. @@ -2435,21 +2394,6 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step, } } -// Generate code for the induction step. 
Note that induction steps are -// required to be loop-invariant -static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, - Instruction *InsertBefore, - Loop *OrigLoop = nullptr) { - const DataLayout &DL = SE.getDataLayout(); - assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && - "Induction step should be loop invariant"); - if (auto *E = dyn_cast<SCEVUnknown>(Step)) - return E->getValue(); - - SCEVExpander Exp(SE, DL, "induction"); - return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); -} - /// Compute the transformed value of Index at offset StartValue using step /// StepValue. /// For integer induction, returns StartValue + Index * StepValue. @@ -2514,9 +2458,7 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { - assert(isa<Constant>(Step) && - "Expected constant step for pointer induction"); - return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); + return B.CreateGEP(B.getInt8Ty(), StartValue, CreateMul(Index, Step)); } case InductionDescriptor::IK_FpInduction: { assert(!isa<VectorType>(Index->getType()) && @@ -2538,6 +2480,50 @@ static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, llvm_unreachable("invalid enum"); } +std::optional<unsigned> getMaxVScale(const Function &F, + const TargetTransformInfo &TTI) { + if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) + return MaxVScale; + + if (F.hasFnAttribute(Attribute::VScaleRange)) + return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); + + return std::nullopt; +} + +/// For the given VF and UF and maximum trip count computed for the loop, return +/// whether the induction variable might overflow in the vectorized loop. If not, +/// then we know a runtime overflow check always evaluates to false and can be +/// removed. +static bool isIndvarOverflowCheckKnownFalse( + const LoopVectorizationCostModel *Cost, + ElementCount VF, std::optional<unsigned> UF = std::nullopt) { + // Always be conservative if we don't know the exact unroll factor. + unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); + + Type *IdxTy = Cost->Legal->getWidestInductionType(); + APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); + + // We know the runtime overflow check is known false iff the (max) trip-count + // is known and (max) trip-count + (VF * UF) does not overflow in the type of + // the vector loop induction variable. + if (unsigned TC = + Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { + uint64_t MaxVF = VF.getKnownMinValue(); + if (VF.isScalable()) { + std::optional<unsigned> MaxVScale = + getMaxVScale(*Cost->TheFunction, Cost->TTI); + if (!MaxVScale) + return false; + MaxVF *= *MaxVScale; + } + + return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); + } + + return false; +} + void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, VPTransformState &State) { @@ -2591,14 +2577,13 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { void InnerLoopVectorizer::vectorizeInterleaveGroup( const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask) { + VPValue *BlockInMask, bool NeedsMaskForGaps) { Instruction *Instr = Group->getInsertPos(); const DataLayout &DL = Instr->getModule()->getDataLayout(); // Prepare for the vector type of the interleaved load/store. 
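The new isIndvarOverflowCheckKnownFalse above reasons that the wide induction variable cannot wrap when the (maximum) trip count plus one full VF*UF step still fits in the index type. A standalone sketch of that check (hypothetical helper, not the LLVM code):

```c++
#include <cstdint>
#include <cstdio>

// The runtime overflow check is provably false when
//   (maximum value of the index type) - TripCount > VF * UF,
// i.e. bumping the counter one full step past the trip count cannot wrap.
static bool overflowCheckKnownFalse(uint64_t TypeMax, uint64_t TripCount,
                                    uint64_t VF, uint64_t UF) {
  return TypeMax - TripCount > VF * UF;
}

int main() {
  // i8 induction variable, trip count 200, VF=16, UF=2: 200 + 32 fits -> safe.
  std::printf("%d\n", overflowCheckKnownFalse(255, 200, 16, 2)); // 1
  // Trip count 250: 250 + 32 would wrap an i8 counter -> keep the check.
  std::printf("%d\n", overflowCheckKnownFalse(255, 250, 16, 2)); // 0
}
```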
Type *ScalarTy = getLoadStoreType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. @@ -2609,14 +2594,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( assert((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."); + Value *Idx; // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. - if (Group->isReverse()) - Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); + if (Group->isReverse()) { + Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); + Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); + Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); + Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); + Idx = Builder.CreateNeg(Idx); + } else + Idx = Builder.getInt32(-Index); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); @@ -2637,8 +2629,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) InBounds = gep->isInBounds(); - AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); - cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); // Cast to the vector pointer type. unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); @@ -2649,14 +2640,43 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( State.setDebugLocFromInst(Instr); Value *PoisonVec = PoisonValue::get(VecTy); - Value *MaskForGaps = nullptr; - if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); - assert(MaskForGaps && "Mask for Gaps is required but it is null"); - } + auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( + unsigned Part, Value *MaskForGaps) -> Value * { + if (VF.isScalable()) { + assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); + assert(InterleaveFactor == 2 && + "Unsupported deinterleave factor for scalable vectors"); + auto *BlockInMaskPart = State.get(BlockInMask, Part); + SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; + auto *MaskTy = + VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); + return Builder.CreateIntrinsic( + MaskTy, Intrinsic::experimental_vector_interleave2, Ops, + /*FMFSource=*/nullptr, "interleaved.mask"); + } + + if (!BlockInMask) + return MaskForGaps; + + Value *BlockInMaskPart = State.get(BlockInMask, Part); + Value *ShuffledMask = Builder.CreateShuffleVector( + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); + return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, + MaskForGaps) + : ShuffledMask; + }; // Vectorize the interleaved load group. 
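The CreateGroupMask lambda above widens the per-iteration block mask so that one predicate bit guards every member of an interleave group. A small sketch of the replication it performs in the fixed-length case (replicateMask is a hypothetical stand-in for the replicated shuffle mask):

```c++
#include <cstdio>
#include <vector>

// Each lane of the block mask is repeated once per interleave-group member, so
// a single i1 covers all Factor elements that belong to the same original
// iteration of the interleaved access.
static std::vector<bool> replicateMask(const std::vector<bool> &BlockMask,
                                       unsigned Factor) {
  std::vector<bool> Out;
  for (bool Lane : BlockMask)
    for (unsigned I = 0; I < Factor; ++I)
      Out.push_back(Lane);
  return Out;
}

int main() {
  std::vector<bool> Wide = replicateMask({true, true, false, true}, 2);
  for (bool B : Wide)
    std::printf("%d", (int)B); // 11110011
  std::printf("\n");
}
```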
if (isa<LoadInst>(Instr)) { + Value *MaskForGaps = nullptr; + if (NeedsMaskForGaps) { + MaskForGaps = + createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + assert(MaskForGaps && "Mask for Gaps is required but it is null"); + } + // For each unroll part, create a wide load for the group. SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { @@ -2664,18 +2684,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); - Value *GroupMask = MaskForGaps; - if (BlockInMask) { - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - GroupMask = MaskForGaps - ? Builder.CreateBinOp(Instruction::And, ShuffledMask, - MaskForGaps) - : ShuffledMask; - } + Value *GroupMask = CreateGroupMask(Part, MaskForGaps); NewLoad = Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), GroupMask, PoisonVec, "wide.masked.vec"); @@ -2687,6 +2696,41 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( NewLoads.push_back(NewLoad); } + if (VecTy->isScalableTy()) { + assert(InterleaveFactor == 2 && + "Unsupported deinterleave factor for scalable vectors"); + + for (unsigned Part = 0; Part < UF; ++Part) { + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + Value *DI = Builder.CreateIntrinsic( + Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], + /*FMFSource=*/nullptr, "strided.vec"); + unsigned J = 0; + for (unsigned I = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + + if (!Member) + continue; + + Value *StridedVec = Builder.CreateExtractValue(DI, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); + } + + if (Group->isReverse()) + StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); + + State.set(VPDefs[J], StridedVec, Part); + ++J; + } + } + + return; + } + // For each member in the group, shuffle out the appropriate data from the // wide loads. unsigned J = 0; @@ -2724,7 +2768,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. - MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + Value *MaskForGaps = + createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && "masked interleaved groups are not allowed."); assert((!MaskForGaps || !VF.isScalable()) && @@ -2759,27 +2804,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( StoredVecs.push_back(StoredVec); } - // Concatenate all vectors into a wide vector. - Value *WideVec = concatenateVectors(Builder, StoredVecs); - - // Interleave the elements in the wide vector. - Value *IVec = Builder.CreateShuffleVector( - WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), - "interleaved.vec"); - + // Interleave all the smaller vectors into one wider vector. 
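The scalable load path above uses the deinterleave2 intrinsic to split one wide interleaved load back into its strided members. The inverse of the earlier interleave sketch, again as plain standalone C++ rather than the patch's IR-building code:

```c++
#include <cstdio>
#include <vector>

// Split a wide interleaved vector into Factor strided members: member K
// receives elements K, K + Factor, K + 2*Factor, ... of the wide vector.
// The scalable-vector path in the patch handles Factor == 2 only.
static std::vector<std::vector<int>> deinterleave(const std::vector<int> &Wide,
                                                  unsigned Factor) {
  std::vector<std::vector<int>> Members(Factor);
  for (unsigned I = 0; I < Wide.size(); ++I)
    Members[I % Factor].push_back(Wide[I]);
  return Members;
}

int main() {
  auto Members = deinterleave({0, 100, 1, 101, 2, 102, 3, 103}, 2);
  for (const auto &Member : Members) {
    for (int V : Member)
      std::printf("%d ", V);
    std::printf("\n"); // "0 1 2 3" then "100 101 102 103"
  }
}
```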
+ Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); Instruction *NewStoreInstr; if (BlockInMask || MaskForGaps) { - Value *GroupMask = MaskForGaps; - if (BlockInMask) { - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, - ShuffledMask, MaskForGaps) - : ShuffledMask; - } + Value *GroupMask = CreateGroupMask(Part, MaskForGaps); NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], Group->getAlign(), GroupMask); } else @@ -2793,7 +2822,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, - bool IfPredicateInstr, VPTransformState &State) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); @@ -2810,14 +2838,7 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); - // If the scalarized instruction contributes to the address computation of a - // widen masked load/store which was in a basic block that needed predication - // and is not predicated after vectorization, we can't propagate - // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized - // instruction could feed a poison value to the base address of the widen - // load/store. - if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) - Cloned->dropPoisonGeneratingFlags(); + RepRecipe->setFlags(Cloned); if (Instr->getDebugLoc()) State.setDebugLocFromInst(Instr); @@ -2843,45 +2864,17 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, AC->registerAssumption(II); // End if-block. + bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); if (IfPredicateInstr) PredicatedInstructions.push_back(Cloned); } -Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { - if (TripCount) - return TripCount; - - assert(InsertBlock); - IRBuilder<> Builder(InsertBlock->getTerminator()); - // Find the loop boundaries. - Type *IdxTy = Legal->getWidestInductionType(); - assert(IdxTy && "No type for induction"); - const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); - - const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - - // Count holds the overall loop count (N). - TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - InsertBlock->getTerminator()); - - if (TripCount->getType()->isPointerTy()) - TripCount = - CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", - InsertBlock->getTerminator()); - - return TripCount; -} - Value * InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; - Value *TC = getOrCreateTripCount(InsertBlock); + Value *TC = getTripCount(); IRBuilder<> Builder(InsertBlock->getTerminator()); Type *Ty = TC->getType(); @@ -2917,7 +2910,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. 
Note that the minimum // iterations check ensures that N >= Step. - if (Cost->requiresScalarEpilogue(VF)) { + if (Cost->requiresScalarEpilogue(VF.isVector())) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -2930,10 +2923,10 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - auto *DstFVTy = cast<FixedVectorType>(DstVTy); - unsigned VF = DstFVTy->getNumElements(); - auto *SrcVecTy = cast<FixedVectorType>(V->getType()); - assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); + auto *DstFVTy = cast<VectorType>(DstVTy); + auto VF = DstFVTy->getElementCount(); + auto *SrcVecTy = cast<VectorType>(V->getType()); + assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); Type *DstElemTy = DstFVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && @@ -2953,13 +2946,13 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, "Only one type should be a floating point type"); Type *IntTy = IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); - auto *VecIntTy = FixedVectorType::get(IntTy, VF); + auto *VecIntTy = VectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -2970,8 +2963,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. Type *CountTy = Count->getType(); @@ -2989,10 +2982,13 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); }; - if (!Cost->foldTailByMasking()) + TailFoldingStyle Style = Cost->getTailFoldingStyle(); + if (Style == TailFoldingStyle::None) CheckMinIters = Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); - else if (VF.isScalable()) { + else if (VF.isScalable() && + !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && + Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an // additional overflow check is required before entering the vector loop. @@ -3017,7 +3013,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // Update dominator for Bypass & LoopExit (if needed). 
DT->changeImmediateDominator(Bypass, TCCheckBlock); - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3044,7 +3040,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, SCEVCheckBlock); - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3097,7 +3093,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr - assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && + assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && "multiple exit loop without required epilogue?"); LoopMiddleBlock = @@ -3117,17 +3113,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { // branch from the middle block to the loop scalar preheader, and the // exit block. completeLoopSkeleton will update the condition to use an // iteration check, if required to decide whether to execute the remainder. - BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ? - BranchInst::Create(LoopScalarPreHeader) : - BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, - Builder.getTrue()); + BranchInst *BrInst = + Cost->requiresScalarEpilogue(VF.isVector()) + ? BranchInst::Create(LoopScalarPreHeader) + : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, + Builder.getTrue()); BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Update dominator for loop exit. During skeleton creation, only the vector // pre-header and the middle block are created. The vector loop is entirely // created during VPlan exection. - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -3135,7 +3132,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { } PHINode *InnerLoopVectorizer::createInductionResumeValue( - PHINode *OrigPhi, const InductionDescriptor &II, + PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, ArrayRef<BasicBlock *> BypassBlocks, std::pair<BasicBlock *, Value *> AdditionalBypass) { Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); @@ -3154,8 +3151,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - Value *Step = - CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); EndValue->setName("ind.end"); @@ -3163,8 +3158,6 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( // Compute the end value for the additional bypass (if applicable). 
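The "ind.end" value computed above via emitTransformedIndex is where the scalar remainder loop resumes each induction after the vector loop finishes. For the integer-induction case that transform is simple arithmetic; a sketch with hypothetical names, not from the patch:

```c++
#include <cstdio>

// Integer induction: after VectorTripCount iterations of the vector loop, an
// induction that starts at Start and advances by Step per iteration has
// reached Start + VectorTripCount * Step, which becomes the resume value for
// the scalar epilogue.
static long long resumeValue(long long Start, long long Step,
                             long long VectorTripCount) {
  return Start + VectorTripCount * Step;
}

int main() {
  // Original trip count 10, VF*UF = 8 -> vector trip count 8; an induction
  // "i = 5; i += 3" resumes at 5 + 8*3 = 29 in the scalar loop.
  std::printf("%lld\n", resumeValue(5, 3, 8)); // 29
}
```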
if (AdditionalBypass.first) { B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); - Value *Step = - CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); EndValueFromAdditionalBypass = emitTransformedIndex( B, AdditionalBypass.second, II.getStartValue(), Step, II); EndValueFromAdditionalBypass->setName("ind.end"); @@ -3193,7 +3186,22 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( return BCResumeVal; } +/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV +/// expansion results. +static Value *getExpandedStep(const InductionDescriptor &ID, + const SCEV2ValueTy &ExpandedSCEVs) { + const SCEV *Step = ID.getStep(); + if (auto *C = dyn_cast<SCEVConstant>(Step)) + return C->getValue(); + if (auto *U = dyn_cast<SCEVUnknown>(Step)) + return U->getValue(); + auto I = ExpandedSCEVs.find(Step); + assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); + return I->second; +} + void InnerLoopVectorizer::createInductionResumeValues( + const SCEV2ValueTy &ExpandedSCEVs, std::pair<BasicBlock *, Value *> AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && @@ -3209,14 +3217,15 @@ void InnerLoopVectorizer::createInductionResumeValues( PHINode *OrigPhi = InductionEntry.first; const InductionDescriptor &II = InductionEntry.second; PHINode *BCResumeVal = createInductionResumeValue( - OrigPhi, II, LoopBypassBlocks, AdditionalBypass); + OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, + AdditionalBypass); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } } BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // The trip counts should be cached by now. - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3229,7 +3238,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // Thus if tail is to be folded, we know we don't need to run the // remainder and we can use the previous value for the condition (true). // 3) Otherwise, construct a runtime check. - if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { + if (!Cost->requiresScalarEpilogue(VF.isVector()) && + !Cost->foldTailByMasking()) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); @@ -3250,14 +3260,16 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { } std::pair<BasicBlock *, Value *> -InnerLoopVectorizer::createVectorizedLoopSkeleton() { +InnerLoopVectorizer::createVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- loop iteration number check. - / | + [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's + / | preheader are expanded here. Eventually all required SCEV + / | expansion should happen here. / v | [ ] <-- vector loop bypass (may consist of multiple blocks). | / | @@ -3304,7 +3316,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. 
- createInductionResumeValues(); + createInductionResumeValues(ExpandedSCEVs); return {completeLoopSkeleton(), nullptr}; } @@ -3317,7 +3329,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock, - BasicBlock *VectorHeader, VPlan &Plan) { + BasicBlock *VectorHeader, VPlan &Plan, + VPTransformState &State) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate // value (the value that feeds into the phi from the loop latch). @@ -3345,7 +3358,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, auto *UI = cast<Instruction>(U); if (!OrigLoop->contains(UI)) { assert(isa<PHINode>(UI) && "Expected LCSSA form"); - IRBuilder<> B(MiddleBlock->getTerminator()); // Fast-math-flags propagate from the original induction instruction. @@ -3355,8 +3367,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, Value *CountMinusOne = B.CreateSub( VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); CountMinusOne->setName("cmo"); - Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), - VectorHeader->getTerminator()); + + VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); + assert(StepVPV && "step must have been expanded during VPlan execution"); + Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() + : State.get(StepVPV, {0, 0}); Value *Escape = emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II); Escape->setName("ind.escape"); @@ -3430,12 +3445,12 @@ static void cse(BasicBlock *BB) { } } -InstructionCost -LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) const { +InstructionCost LoopVectorizationCostModel::getVectorCallCost( + CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector<Type *, 4> Tys, ScalarTys; + bool MaskRequired = Legal->isMaskRequired(CI); for (auto &ArgOp : CI->args()) ScalarTys.push_back(ArgOp->getType()); @@ -3464,18 +3479,39 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. - NeedToScalarize = true; - VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); + InstructionCost MaskCost = 0; + VFShape Shape = VFShape::get(*CI, VF, MaskRequired); + if (NeedsMask) + *NeedsMask = MaskRequired; Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + // If we want an unmasked vector function but can't find one matching the VF, + // maybe we can find vector function that does use a mask and synthesize + // an all-true mask. + if (!VecFunc && !MaskRequired) { + Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true); + VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); + // If we found one, add in the cost of creating a mask + if (VecFunc) { + if (NeedsMask) + *NeedsMask = true; + MaskCost = TTI.getShuffleCost( + TargetTransformInfo::SK_Broadcast, + VectorType::get( + IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()), + VF)); + } + } + // We don't support masked function calls yet, but we can scalarize a + // masked call with branches (unless VF is scalable). if (!TLI || CI->isNoBuiltin() || !VecFunc) - return Cost; + return VF.isScalable() ? 
InstructionCost::getInvalid() : Cost; // If the corresponding vector cost is cheaper, return its cost. InstructionCost VectorCallCost = - TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind); + TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; if (VectorCallCost < Cost) { - NeedToScalarize = false; + *Variant = VecFunc; Cost = VectorCallCost; } return Cost; @@ -3675,14 +3711,25 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); + // After vectorization, the exit blocks of the original loop will have + // additional predecessors. Invalidate SCEVs for the exit phis in case SE + // looked through single-entry phis. + SmallVector<BasicBlock *> ExitBlocks; + OrigLoop->getExitBlocks(ExitBlocks); + for (BasicBlock *Exit : ExitBlocks) + for (PHINode &PN : Exit->phis()) + PSE.getSE()->forgetValue(&PN); + VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); - if (Cost->requiresScalarEpilogue(VF)) { + if (Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. - Plan.clearLiveOuts(); } else { + // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking + // the cost model. + // If we inserted an edge from the middle block to the unique exit block, // update uses outside the loop (phis) to account for the newly inserted // edge. @@ -3692,7 +3739,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), IVEndValues[Entry.first], LoopMiddleBlock, - VectorLoop->getHeader(), Plan); + VectorLoop->getHeader(), Plan, State); } // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated @@ -3799,31 +3846,53 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Value *Incoming = State.get(PreviousDef, UF - 1); auto *ExtractForScalar = Incoming; auto *IdxTy = Builder.getInt32Ty(); + Value *RuntimeVF = nullptr; if (VF.isVector()) { auto *One = ConstantInt::get(IdxTy, 1); Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); + RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); auto *LastIdx = Builder.CreateSub(RuntimeVF, One); - ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, - "vector.recur.extract"); - } - // Extract the second last element in the middle block if the - // Phi is used outside the loop. We need to extract the phi itself - // and not the last element (the phi update in the current iteration). This - // will be the value when jumping to the exit block from the LoopMiddleBlock, - // when the scalar loop is not run at all. - Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF.isVector()) { - auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); - auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); - ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Idx, "vector.recur.extract.for.phi"); - } else if (UF > 1) - // When loop is unrolled without vectorizing, initialize - // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value - // of `Incoming`. This is analogous to the vectorized case above: extracting - // the second last element when VF > 1. 
- ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); + ExtractForScalar = + Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); + } + + auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); + assert(PhiR->getNumUsers() == 1 && + RecurSplice->getOpcode() == + VPInstruction::FirstOrderRecurrenceSplice && + "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); + SmallVector<VPLiveOut *> LiveOuts; + for (VPUser *U : RecurSplice->users()) + if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) + LiveOuts.push_back(LiveOut); + + if (!LiveOuts.empty()) { + // Extract the second last element in the middle block if the + // Phi is used outside the loop. We need to extract the phi itself + // and not the last element (the phi update in the current iteration). This + // will be the value when jumping to the exit block from the + // LoopMiddleBlock, when the scalar loop is not run at all. + Value *ExtractForPhiUsedOutsideLoop = nullptr; + if (VF.isVector()) { + auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); + ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( + Incoming, Idx, "vector.recur.extract.for.phi"); + } else { + assert(UF > 1 && "VF and UF cannot both be 1"); + // When loop is unrolled without vectorizing, initialize + // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled + // value of `Incoming`. This is analogous to the vectorized case above: + // extracting the second last element when VF > 1. + ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); + } + + for (VPLiveOut *LiveOut : LiveOuts) { + assert(!Cost->requiresScalarEpilogue(VF.isVector())); + PHINode *LCSSAPhi = LiveOut->getPhi(); + LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); + State.Plan->removeLiveOut(LCSSAPhi); + } + } // Fix the initial value of the original recurrence in the scalar loop. Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); @@ -3837,22 +3906,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence( Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); Phi->setName("scalar.recur"); - - // Finally, fix users of the recurrence outside the loop. The users will need - // either the last value of the scalar recurrence or the last value of the - // vector recurrence we extracted in the middle block. Since the loop is in - // LCSSA form, we just need to find all the phi nodes for the original scalar - // recurrence in the exit block, and then add an edge for the middle block. - // Note that LCSSA does not imply single entry when the original scalar loop - // had multiple exiting edges (as we always run the last iteration in the - // scalar epilogue); in that case, there is no edge from middle to exit and - // and thus no phis which needed updated. - if (!Cost->requiresScalarEpilogue(VF)) - for (PHINode &LCSSAPhi : LoopExitBlock->phis()) - if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) { - LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - State.Plan->removeLiveOut(&LCSSAPhi); - } } void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, @@ -3872,9 +3925,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // This is the vector-clone of the value that leaves the loop. Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); - // Wrap flags are in general invalid after vectorization, clear them. 
- clearReductionWrapFlags(PhiR, State); - // Before each round, move the insertion point right between // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement @@ -4036,7 +4086,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // We know that the loop is in LCSSA form. We need to update the PHI nodes // in the exit blocks. See comment on analogous loop in // fixFixedOrderRecurrence for a more complete explaination of the logic. - if (!Cost->requiresScalarEpilogue(VF)) + if (!Cost->requiresScalarEpilogue(VF.isVector())) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); @@ -4054,38 +4104,6 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } -void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, - VPTransformState &State) { - const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - RecurKind RK = RdxDesc.getRecurrenceKind(); - if (RK != RecurKind::Add && RK != RecurKind::Mul) - return; - - SmallVector<VPValue *, 8> Worklist; - SmallPtrSet<VPValue *, 8> Visited; - Worklist.push_back(PhiR); - Visited.insert(PhiR); - - while (!Worklist.empty()) { - VPValue *Cur = Worklist.pop_back_val(); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *V = State.get(Cur, Part); - if (!isa<OverflowingBinaryOperator>(V)) - break; - cast<Instruction>(V)->dropPoisonGeneratingFlags(); - } - - for (VPUser *U : Cur->users()) { - auto *UserRecipe = dyn_cast<VPRecipeBase>(U); - if (!UserRecipe) - continue; - for (VPValue *V : UserRecipe->definedValues()) - if (Visited.insert(V).second) - Worklist.push_back(V); - } - } -} - void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); @@ -4125,10 +4143,11 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); // We can't sink an instruction if it is a phi node, is not in the loop, - // or may have side effects. + // may have side effects or may read from memory. + // TODO Could dor more granular checking to allow sinking a load past non-store instructions. if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || - I->mayHaveSideEffects()) - continue; + I->mayHaveSideEffects() || I->mayReadFromMemory()) + continue; // If the instruction is already in PredBB, check if we can sink its // operands. In that case, VPlan's sinkScalarOperands() succeeded in @@ -4189,7 +4208,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. 
- assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && !Scalars.contains(VF) && "This function should not be visited twice for the same VF"); // This avoids any chances of creating a REPLICATE recipe during planning @@ -4382,6 +4401,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication( switch(I->getOpcode()) { default: return true; + case Instruction::Call: + return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF); case Instruction::Load: case Instruction::Store: { auto *Ptr = getLoadStorePointerOperand(I); @@ -4430,10 +4451,10 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // both speculation safety (which follows from the same argument as loads), // but also must prove the value being stored is correct. The easiest // form of the later is to require that all values stored are the same. - if (Legal->isUniformMemOp(*I) && - (isa<LoadInst>(I) || - (isa<StoreInst>(I) && - TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && + if (Legal->isInvariant(getLoadStorePointerOperand(I)) && + (isa<LoadInst>(I) || + (isa<StoreInst>(I) && + TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && !Legal->blockNeedsPredication(I->getParent())) return false; return true; @@ -4445,6 +4466,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { // TODO: We can use the loop-preheader as context point here and get // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); + case Instruction::Call: + return Legal->isMaskRequired(I); } } @@ -4502,7 +4525,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && + Legal->isInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); @@ -4614,7 +4638,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && !Uniforms.contains(VF) && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll @@ -4663,10 +4687,18 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); + auto PrevVF = VF.divideCoefficientBy(2); // Return true if all lanes perform the same memory operation, and we can // thus chose to execute only one. auto isUniformMemOpUse = [&](Instruction *I) { - if (!Legal->isUniformMemOp(*I)) + // If the value was already known to not be uniform for the previous + // (smaller VF), it cannot be uniform for the larger VF. 
+ if (PrevVF.isVector()) { + auto Iter = Uniforms.find(PrevVF); + if (Iter != Uniforms.end() && !Iter->second.contains(I)) + return false; + } + if (!Legal->isUniformMemOp(*I, VF)) return false; if (isa<LoadInst>(I)) // Loading the same address always produces the same result - at least @@ -4689,11 +4721,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { WideningDecision == CM_Interleave); }; - // Returns true if Ptr is the pointer operand of a memory access instruction - // I, and I is known to not require scalarization. + // I, I is known to not require scalarization, and the pointer is not also + // stored. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); + if (isa<StoreInst>(I) && I->getOperand(0) == Ptr) + return false; + return getLoadStorePointerOperand(I) == Ptr && + (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); }; // Holds a list of values which are known to have at least one uniform use. @@ -4739,10 +4774,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (isUniformMemOpUse(&I)) addToWorklistIfAllowed(&I); - if (isUniformDecision(&I, VF)) { - assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); + if (isVectorizedMemAccessUse(&I, Ptr)) HasUniformUse.insert(Ptr); - } } // Add to the worklist any operands which have *only* uniform (e.g. lane 0 @@ -4906,12 +4939,11 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; // Limit MaxScalableVF by the maximum safe dependence distance. - std::optional<unsigned> MaxVScale = TTI.getMaxVScale(); - if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) - MaxVScale = - TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); - MaxScalableVF = - ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0); + if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) + MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); + else + MaxScalableVF = ElementCount::getScalable(0); + if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " @@ -4932,7 +4964,7 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( // the memory accesses that is most restrictive (involved in the smallest // dependence distance). unsigned MaxSafeElements = - PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); @@ -5105,16 +5137,26 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); + // Avoid tail folding if the trip count is known to be a multiple of any VF - // we chose. - // FIXME: The condition below pessimises the case for fixed-width vectors, - // when scalable VFs are also candidates for vectorization. - if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { - ElementCount MaxFixedVF = MaxFactors.FixedVF; - assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && + // we choose. 
+ std::optional<unsigned> MaxPowerOf2RuntimeVF = + MaxFactors.FixedVF.getFixedValue(); + if (MaxFactors.ScalableVF) { + std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); + if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { + MaxPowerOf2RuntimeVF = std::max<unsigned>( + *MaxPowerOf2RuntimeVF, + *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); + } else + MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. + } + + if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { + assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && "MaxFixedVF must be a power of 2"); - unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC - : MaxFixedVF.getFixedValue(); + unsigned MaxVFtimesIC = + UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); const SCEV *ExitCount = SE->getAddExpr( @@ -5134,7 +5176,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - FoldTailByMasking = true; + CanFoldTailByMasking = true; return MaxFactors; } @@ -5187,7 +5229,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. auto MaxVectorElementCount = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType), + llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), ComputeScalableMaxVF); MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " @@ -5207,6 +5249,13 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( auto Min = Attr.getVScaleRangeMin(); WidestRegisterMinEC *= Min; } + + // When a scalar epilogue is required, at least one iteration of the scalar + // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a + // max VF that results in a dead vector loop. + if (ConstTripCount > 0 && requiresScalarEpilogue(true)) + ConstTripCount -= 1; + if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { // If loop trip count (TC) is known at compile time there is no point in @@ -5214,7 +5263,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( // power of two which doesn't exceed TC. // If MaxVectorElementCount is scalable, we only fall back on a fixed VF // when the TC is less than or equal to the known number of lanes. 
- auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); + auto ClampedConstTripCount = llvm::bit_floor(ConstTripCount); LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " "exceeding the constant trip count: " << ClampedConstTripCount << "\n"); @@ -5228,7 +5277,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && TTI.shouldMaximizeVectorBandwidth(RegKind))) { auto MaxVectorElementCountMaxBW = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType), + llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), ComputeScalableMaxVF); MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); @@ -5273,9 +5322,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return MaxVF; } -std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { - if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); +/// Convenience function that returns the value of vscale_range iff +/// vscale_range.min == vscale_range.max or otherwise returns the value +/// returned by the corresponding TTI method. +static std::optional<unsigned> +getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { + const Function *Fn = L->getHeader()->getParent(); + if (Fn->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); auto Max = Attr.getVScaleRangeMax(); if (Max && Min == Max) @@ -5285,31 +5339,39 @@ std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { return TTI.getVScaleForTuning(); } -bool LoopVectorizationCostModel::isMoreProfitable( +bool LoopVectorizationPlanner::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - - if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && - MaxTripCount) { - // If we are folding the tail and the trip count is a known (possibly small) - // constant, the trip count will be rounded up to an integer number of - // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), - // which we compare directly. When not folding the tail, the total cost will - // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is - // approximated with the per-lane cost below instead of using the tripcount - // as here. - auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); - auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); + + if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { + // If the trip count is a known (possibly small) constant, the trip count + // will be rounded up to an integer number of iterations under + // FoldTailByMasking. The total cost in that case will be + // VecCost*ceil(TripCount/VF). When not folding the tail, the total + // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be + // some extra overheads, but for the purpose of comparing the costs of + // different VFs we can use this to compare the total loop-body cost + // expected after vectorization. 
+ auto GetCostForTC = [MaxTripCount, this](unsigned VF, + InstructionCost VectorCost, + InstructionCost ScalarCost) { + return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) + : VectorCost * (MaxTripCount / VF) + + ScalarCost * (MaxTripCount % VF); + }; + auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); + auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); + return RTCostA < RTCostB; } // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (std::optional<unsigned> VScale = getVScaleForTuning()) { + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { if (A.Width.isScalable()) EstimatedWidthA *= *VScale; if (B.Width.isScalable()) @@ -5328,9 +5390,74 @@ bool LoopVectorizationCostModel::isMoreProfitable( return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); } -VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( +static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, + OptimizationRemarkEmitter *ORE, + Loop *TheLoop) { + if (InvalidCosts.empty()) + return; + + // Emit a report of VFs with invalid costs in the loop. + + // Group the remarks per instruction, keeping the instruction order from + // InvalidCosts. + std::map<Instruction *, unsigned> Numbering; + unsigned I = 0; + for (auto &Pair : InvalidCosts) + if (!Numbering.count(Pair.first)) + Numbering[Pair.first] = I++; + + // Sort the list, first on instruction(number) then on VF. + sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { + if (Numbering[A.first] != Numbering[B.first]) + return Numbering[A.first] < Numbering[B.first]; + ElementCountComparator ECC; + return ECC(A.second, B.second); + }); + + // For a list of ordered instruction-vf pairs: + // [(load, vf1), (load, vf2), (store, vf1)] + // Group the instructions together to emit separate remarks for: + // load (vf1, vf2) + // store (vf1) + auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); + auto Subset = ArrayRef<InstructionVFPair>(); + do { + if (Subset.empty()) + Subset = Tail.take_front(1); + + Instruction *I = Subset.front().first; + + // If the next instruction is different, or if there are no other pairs, + // emit a remark for the collated subset. e.g. + // [(load, vf1), (load, vf2))] + // to emit: + // remark: invalid costs for 'load' at VF=(vf, vf2) + if (Subset == Tail || Tail[Subset.size()].first != I) { + std::string OutString; + raw_string_ostream OS(OutString); + assert(!Subset.empty() && "Unexpected empty range"); + OS << "Instruction with invalid costs prevented vectorization at VF=("; + for (const auto &Pair : Subset) + OS << (Pair.second == Subset.front().second ? 
"" : ", ") << Pair.second; + OS << "):"; + if (auto *CI = dyn_cast<CallInst>(I)) + OS << " call to " << CI->getCalledFunction()->getName(); + else + OS << " " << I->getOpcodeName(); + OS.flush(); + reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); + Tail = Tail.drop_front(Subset.size()); + Subset = {}; + } else + // Grow the subset by one element + Subset = Tail.take_front(Subset.size() + 1); + } while (!Tail.empty()); +} + +VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + InstructionCost ExpectedCost = + CM.expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); assert(VFCandidates.count(ElementCount::getFixed(1)) && @@ -5340,7 +5467,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; - bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (ForceVectorization && VFCandidates.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost @@ -5354,12 +5481,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( if (i.isScalar()) continue; - VectorizationCostTy C = expectedCost(i, &InvalidCosts); + LoopVectorizationCostModel::VectorizationCostTy C = + CM.expectedCost(i, &InvalidCosts); VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; - if (std::optional<unsigned> VScale = getVScaleForTuning()) + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() @@ -5388,70 +5516,13 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( ChosenFactor = Candidate; } - // Emit a report of VFs with invalid costs in the loop. - if (!InvalidCosts.empty()) { - // Group the remarks per instruction, keeping the instruction order from - // InvalidCosts. - std::map<Instruction *, unsigned> Numbering; - unsigned I = 0; - for (auto &Pair : InvalidCosts) - if (!Numbering.count(Pair.first)) - Numbering[Pair.first] = I++; - - // Sort the list, first on instruction(number) then on VF. - llvm::sort(InvalidCosts, - [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { - if (Numbering[A.first] != Numbering[B.first]) - return Numbering[A.first] < Numbering[B.first]; - ElementCountComparator ECC; - return ECC(A.second, B.second); - }); - - // For a list of ordered instruction-vf pairs: - // [(load, vf1), (load, vf2), (store, vf1)] - // Group the instructions together to emit separate remarks for: - // load (vf1, vf2) - // store (vf1) - auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); - auto Subset = ArrayRef<InstructionVFPair>(); - do { - if (Subset.empty()) - Subset = Tail.take_front(1); - - Instruction *I = Subset.front().first; - - // If the next instruction is different, or if there are no other pairs, - // emit a remark for the collated subset. e.g. 
- // [(load, vf1), (load, vf2))] - // to emit: - // remark: invalid costs for 'load' at VF=(vf, vf2) - if (Subset == Tail || Tail[Subset.size()].first != I) { - std::string OutString; - raw_string_ostream OS(OutString); - assert(!Subset.empty() && "Unexpected empty range"); - OS << "Instruction with invalid costs prevented vectorization at VF=("; - for (const auto &Pair : Subset) - OS << (Pair.second == Subset.front().second ? "" : ", ") - << Pair.second; - OS << "):"; - if (auto *CI = dyn_cast<CallInst>(I)) - OS << " call to " << CI->getCalledFunction()->getName(); - else - OS << " " << I->getOpcodeName(); - OS.flush(); - reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); - Tail = Tail.drop_front(Subset.size()); - Subset = {}; - } else - // Grow the subset by one element - Subset = Tail.take_front(Subset.size() + 1); - } while (!Tail.empty()); - } + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure("There are conditional stores.", + if (!EnableCondStoresVectorization && CM.hasPredStores()) { + reportVectorizationFailure( + "There are conditional stores.", "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); + "ConditionalStore", ORE, OrigLoop); ChosenFactor = ScalarCost; } @@ -5463,11 +5534,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( - const Loop &L, ElementCount VF) const { +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( + ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. - if (any_of(L.getHeader()->phis(), + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) return false; @@ -5475,20 +5546,21 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( // currently unsupported. for (const auto &Entry : Legal->getInductionVars()) { // Look for uses of the value of the induction at the last iteration. - Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); + Value *PostInc = + Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); for (User *U : PostInc->users()) - if (!L.contains(cast<Instruction>(U))) + if (!OrigLoop->contains(cast<Instruction>(U))) return false; // Look for uses of penultimate value of the induction. for (User *U : Entry.first->users()) - if (!L.contains(cast<Instruction>(U))) + if (!OrigLoop->contains(cast<Instruction>(U))) return false; } // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. - if (L.getExitingBlock() != L.getLoopLatch()) + if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; return true; @@ -5507,62 +5579,59 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) + if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - // FIXME: We should consider changing the threshold for scalable - // vectors to take VScaleForTuning into account. 
- if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) + + unsigned Multiplier = 1; + if (VF.isScalable()) + Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); + if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF, unsigned IC) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); return Result; } - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); + if (!CM.isScalarEpilogueAllowed()) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " + "epilogue is allowed.\n"); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because the loop is " - "not a supported candidate.\n";); + if (!isCandidateForEpilogueVectorization(MainLoopVF)) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " + "is not a supported candidate.\n"); return Result; } if (EpilogueVectorizationForceVF > 1) { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) + if (hasPlanWithVF(ForcedEC)) return {ForcedEC, 0, 0}; else { - LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization forced factor is not viable.\n";); + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " + "viable.\n"); return Result; } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + if (OrigLoop->getHeader()->getParent()->hasOptSize() || + OrigLoop->getHeader()->getParent()->hasMinSize()) { LLVM_DEBUG( - dbgs() - << "LEV: Epilogue vectorization skipped due to opt for size.\n";); + dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5574,21 +5643,48 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( ElementCount EstimatedRuntimeVF = MainLoopVF; if (MainLoopVF.isScalable()) { EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (std::optional<unsigned> VScale = getVScaleForTuning()) + if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) EstimatedRuntimeVF *= *VScale; } - for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && - (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + ScalarEvolution &SE = *PSE.getSE(); + 
Type *TCType = Legal->getWidestInductionType(); + const SCEV *RemainingIterations = nullptr; + for (auto &NextVF : ProfitableVFs) { + // Skip candidate VFs without a corresponding VPlan. + if (!hasPlanWithVF(NextVF.Width)) + continue; + + // Skip candidate VFs with widths >= the estimate runtime VF (scalable + // vectors) or the VF of the main loop (fixed vectors). + if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) + continue; + + // If NextVF is greater than the number of remaining iterations, the + // epilogue loop would be dead. Skip such factors. + if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { + // TODO: extend to support scalable VFs. + if (!RemainingIterations) { + const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); + RemainingIterations = SE.getURemExpr( + TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); + } + if (SE.isKnownPredicate( + CmpInst::ICMP_UGT, + SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), + RemainingIterations)) + continue; + } + + if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) Result = NextVF; + } if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " - << Result.Width << "\n";); + << Result.Width << "\n"); return Result; } @@ -5688,7 +5784,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return 1; // We used the distance for the interleave count. - if (Legal->getMaxSafeDepDistBytes() != -1U) + if (!Legal->isSafeForAnyVectorWidth()) return 1; auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); @@ -5750,20 +5846,19 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; - unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / + MaxLocalUsers); // Don't count the induction variable as interleaved. if (EnableIndVarRegisterHeur) { - TmpIC = - PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / - std::max(1U, (MaxLocalUsers - 1))); + TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); } IC = std::min(IC, TmpIC); } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = - TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); // Check if the user has overridden the max. if (VF.isScalar()) { @@ -5834,8 +5929,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. - unsigned SmallIC = std::min( - IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue())); + unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( + SmallLoopCost / *LoopCost.getValue())); // Interleave until store/load ports (estimated by max interleave count) are // saturated. 
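As a rough standalone model of the remaining-iteration check introduced in selectEpilogueVectorizationFactor above — not part of the patch itself, with a hypothetical helper name and plain integers standing in for the SCEV expressions — the idea is that after the main vector loop runs with VF lanes and interleave count IC, only TripCount % (VF * IC) iterations are left, so any epilogue VF wider than that remainder would leave the epilogue vector loop dead:

#include <cstdint>

// Sketch only: epilogue VF is worth considering iff it does not exceed the
// iterations left over after the main vector loop.
static bool epilogueVFIsViable(uint64_t TripCount, unsigned MainLoopVF,
                               unsigned IC, unsigned EpilogueVF) {
  uint64_t RemainingIterations = TripCount % (uint64_t(MainLoopVF) * IC);
  return EpilogueVF <= RemainingIterations;
}

For example, TripCount = 1000 with MainLoopVF = 8 and IC = 2 leaves 1000 % 16 = 8 remaining iterations, so an epilogue VF of 8 is still viable while 16 is not.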
@@ -5953,7 +6048,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { // Saves the list of values that are used in the loop but are defined outside // the loop (not including non-instruction values such as arguments and // constants). - SmallPtrSet<Value *, 8> LoopInvariants; + SmallSetVector<Instruction *, 8> LoopInvariants; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -6079,11 +6174,16 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { for (auto *Inst : LoopInvariants) { // FIXME: The target might use more than one register for the type // even in the scalar case. - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + bool IsScalar = all_of(Inst->users(), [&](User *U) { + auto *I = cast<Instruction>(U); + return TheLoop != LI->getLoopFor(I->getParent()) || + isScalarAfterVectorization(I, VFs[i]); + }); + + ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); - Invariant[ClassID] += Usage; + TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); + Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); } LLVM_DEBUG({ @@ -6134,8 +6234,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF.isScalar() || VF.isZero() || - InstsToScalarize.find(VF) != InstsToScalarize.end()) + if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's @@ -6224,7 +6323,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( Instruction *I = Worklist.pop_back_val(); // If we've already analyzed the instruction, there's nothing to do. - if (ScalarCosts.find(I) != ScalarCosts.end()) + if (ScalarCosts.contains(I)) continue; // Compute the cost of the vector instruction. 
Note that this cost already @@ -6362,11 +6461,6 @@ static const SCEV *getAddressAccessSCEV( return PSE.getSCEV(Ptr); } -static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { - return Legal->hasStride(I->getOperand(0)) || - Legal->hasStride(I->getOperand(1)); -} - InstructionCost LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, ElementCount VF) { @@ -6460,7 +6554,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, ElementCount VF) { - assert(Legal->isUniformMemOp(*I)); + assert(Legal->isUniformMemOp(*I, VF)); Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); @@ -6475,7 +6569,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, } StoreInst *SI = cast<StoreInst>(I); - bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); + bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + @@ -6502,11 +6596,6 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, ElementCount VF) { - // TODO: Once we have support for interleaving with scalable vectors - // we can calculate the cost properly here. - if (VF.isScalable()) - return InstructionCost::getInvalid(); - Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); @@ -6836,7 +6925,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) NumPredStores++; - if (Legal->isUniformMemOp(I)) { + if (Legal->isUniformMemOp(I, VF)) { auto isLegalToScalarize = [&]() { if (!VF.isScalable()) // Scalarization of fixed length vectors "just works". @@ -7134,8 +7223,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - // Since we will replace the stride by 1 the multiplication should go away. - if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) + // If we're speculating on the stride being 1, the multiplication may + // fold away. We can generalize this for all operations using the notion + // of neutral elements. (TODO) + if (I->getOpcode() == Instruction::Mul && + (PSE.getSCEV(I->getOperand(0))->isOne() || + PSE.getSCEV(I->getOperand(1))->isOne())) return 0; // Detect reduction patterns @@ -7146,7 +7239,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // second vector operand. One example of this are shifts on x86. 
Value *Op2 = I->getOperand(1); auto Op2Info = TTI.getOperandInfo(Op2); - if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && + Legal->isInvariant(Op2)) Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); @@ -7304,7 +7398,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { - SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); + // Leave SrcVecTy unchanged - we only shrink the destination element + // type. VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); } @@ -7316,9 +7411,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) return *RedCost; - bool NeedToScalarize; + Function *Variant; CallInst *CI = cast<CallInst>(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); if (getVectorIntrinsicIDForCall(CI, TLI)) { InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); return std::min(CallCost, IntrinsicCost); @@ -7339,37 +7434,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, } // end of switch. } -char LoopVectorize::ID = 0; - -static const char lv_name[] = "Loop Vectorization"; - -INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) -INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) -INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) - -namespace llvm { - -Pass *createLoopVectorizePass() { return new LoopVectorize(); } - -Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, - bool VectorizeOnlyWhenForced) { - return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); -} - -} // end namespace llvm - void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); @@ -7462,7 +7526,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { // reasonable one. 
if (UserVF.isZero()) { VF = ElementCount::getFixed(determineVPlanVF( - TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) + TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) .getFixedValue(), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); @@ -7497,13 +7561,16 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { std::optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + CM.collectValuesToIgnore(); + CM.collectElementTypesForWidening(); + FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. return std::nullopt; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && - !useMaskedInterleavedAccesses(*TTI)) { + !useMaskedInterleavedAccesses(TTI)) { LLVM_DEBUG( dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking " @@ -7527,6 +7594,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); buildVPlansWithVPRecipes(UserVF, UserVF); + if (!hasPlanWithVF(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF + << ".\n"); + return std::nullopt; + } + LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0, 0}}; } else @@ -7562,8 +7635,13 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = selectVectorizationFactor(VFCandidates); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + if (!hasPlanWithVF(VF.Width)) { + LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width + << ".\n"); + return std::nullopt; + } return VF; } @@ -7614,43 +7692,51 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, - VPlan &BestVPlan, - InnerLoopVectorizer &ILV, - DominatorTree *DT, - bool IsEpilogueVectorization) { +SCEV2ValueTy LoopVectorizationPlanner::executePlan( + ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, + InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, + DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { assert(BestVPlan.hasVF(BestVF) && "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); + assert( + (IsEpilogueVectorization || !ExpandedSCEVs) && + "expanded SCEVs to reuse can only be used during epilogue vectorization"); LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); - // Workaround! Compute the trip count of the original loop and cache it - // before we start modifying the CFG. This code has a systemic problem - // wherein it tries to run analysis over partially constructed IR; this is - // wrong, and not simply for SCEV. The trip count of the original loop - // simply happens to be prone to hitting this in practice. In theory, we - // can hit the same issue for any SCEV, or ValueTracking query done during - // mutation. See PR49900. 
- ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader()); - if (!IsEpilogueVectorization) VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); // Perform the actual loop transformation. + VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; + + // 0. Generate SCEV-dependent code into the preheader, including TripCount, + // before making any changes to the CFG. + if (!BestVPlan.getPreheader()->empty()) { + State.CFG.PrevBB = OrigLoop->getLoopPreheader(); + State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); + BestVPlan.getPreheader()->execute(&State); + } + if (!ILV.getTripCount()) + ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); + else + assert(IsEpilogueVectorization && "should only re-use the existing trip " + "count during epilogue vectorization"); // 1. Set up the skeleton for vectorization, including vector pre-header and // middle block. The vector loop is created during VPlan execution. - VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs + : State.ExpandedSCEVs); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. const LoopAccessInfo *LAI = ILV.Legal->getLAI(); + std::unique_ptr<LoopVersioning> LVer = nullptr; if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && !LAI->getRuntimePointerChecking()->getDiffChecks()) { @@ -7658,9 +7744,10 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // still use it to add the noalias metadata. // TODO: Find a better way to re-use LoopVersioning functionality to add // metadata. - State.LVer = std::make_unique<LoopVersioning>( + LVer = std::make_unique<LoopVersioning>( *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, PSE.getSE()); + State.LVer = &*LVer; State.LVer->prepareNoAliasMetadata(); } @@ -7677,10 +7764,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), - ILV.getOrCreateVectorTripCount(nullptr), - CanonicalIVStartValue, State, - IsEpilogueVectorization); + BestVPlan.prepareToExecute( + ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr), + CanonicalIVStartValue, State, IsEpilogueVectorization); BestVPlan.execute(&State); @@ -7706,13 +7792,18 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); } - AddRuntimeUnrollDisableMetaData(L); + TargetTransformInfo::UnrollingPreferences UP; + TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); + if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) + AddRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. 
ILV.fixVectorizedLoop(State, BestVPlan); ILV.printDebugTracesAtEnd(); + + return State.ExpandedSCEVs; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -7725,8 +7816,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif -Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } - //===--------------------------------------------------------------------===// // EpilogueVectorizerMainLoop //===--------------------------------------------------------------------===// @@ -7734,7 +7823,8 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair<BasicBlock *, Value *> -EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector @@ -7795,7 +7885,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, assert(Bypass && "Expected valid bypass basic block."); ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; - Value *Count = getOrCreateTripCount(LoopVectorPreHeader); + Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; @@ -7803,8 +7893,10 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, // Generate code to check if the loop's trip count is less than VF * UF of the // main vector loop. - auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? - ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() + : VF.isVector()) + ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), @@ -7824,7 +7916,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, // Update dominator for Bypass & LoopExit. DT->changeImmediateDominator(Bypass, TCCheckBlock); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) + if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) // For loops with multiple exits, there's no edge from the middle block // to exit blocks (as the epilogue must run) and thus no need to update // the immediate dominator of the exit blocks. @@ -7852,7 +7944,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
std::pair<BasicBlock *, Value *> -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( + const SCEV2ValueTy &ExpandedSCEVs) { createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to @@ -7891,7 +7984,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { DT->changeImmediateDominator(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) + if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) // If there is an epilogue which must run, there's no edge from the // middle block to exit blocks and thus no need to update the immediate // dominator of the exit blocks. @@ -7950,7 +8043,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues({VecEpilogueIterationCountCheck, + createInductionResumeValues(ExpandedSCEVs, + {VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); return {completeLoopSkeleton(), EPResumeVal}; @@ -7972,8 +8066,9 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( // Generate code to check if the loop's trip count is less than VF * UF of the // vector epilogue loop. - auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? - ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) + ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp(P, Count, @@ -8008,8 +8103,7 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( assert(!Range.isEmpty() && "Trying to test an empty VF range."); bool PredicateAtRangeStart = Predicate(Range.Start); - for (ElementCount TmpVF = Range.Start * 2; - ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) + for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) if (Predicate(TmpVF) != PredicateAtRangeStart) { Range.End = TmpVF; break; @@ -8025,16 +8119,16 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// buildVPlan(). void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, ElementCount MaxVF) { - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; + auto MaxVFTimes2 = MaxVF * 2; + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { + VFRange SubRange = {VF, MaxVFTimes2}; VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } } VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, - VPlanPtr &Plan) { + VPlan &Plan) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); // Look for cached value. @@ -8058,7 +8152,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; - VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); + VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) @@ -8069,7 +8163,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 
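getDecisionAndClampRange above evaluates a per-VF predicate across a range of power-of-two VFs and clamps the range at the first VF where the answer changes, so that one VPlan only covers VFs that agree on the decision. A small illustrative model, with a made-up profitability predicate:

#include <functional>
#include <iostream>

// Illustrative: clamp a [start, end) range of power-of-two VFs so the
// predicate has the same answer across the whole clamped range.
bool clampRange(unsigned start, unsigned &end,
                const std::function<bool(unsigned)> &pred) {
  bool atStart = pred(start);
  for (unsigned vf = start * 2; vf < end; vf *= 2)
    if (pred(vf) != atStart) {
      end = vf;   // everything from vf upwards is left for the next plan
      break;
    }
  return atStart;
}

int main() {
  auto profitable = [](unsigned vf) { return vf <= 8; }; // hypothetical query
  unsigned end = 32;
  bool decision = clampRange(4, end, profitable);
  // Prints decision=1, clamped end=16: one plan covers VF 4 and 8.
  std::cout << "decision=" << decision << " clamped end=" << end << "\n";
}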
// The select version does not introduce new UB if SrcMask is false and // EdgeMask is poison. Using 'and' here introduces undefined behavior. - VPValue *False = Plan->getOrAddVPValue( + VPValue *False = Plan.getVPValueOrAddLiveIn( ConstantInt::getFalse(BI->getCondition()->getType())); EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); @@ -8078,7 +8172,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, return EdgeMaskCache[Edge] = EdgeMask; } -VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Look for cached value. @@ -8098,29 +8192,28 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. - PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); - if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) - return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); + if (useActiveLaneMaskForControlFlow(TFStyle)) + return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. - VPBasicBlock *HeaderVPBB = - Plan->getVectorLoopRegion()->getEntryBasicBlock(); + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (EmitGetActiveLaneMask != PredicationStyle::None) { - VPValue *TC = Plan->getOrCreateTripCount(); + if (useActiveLaneMask(TFStyle)) { + VPValue *TC = Plan.getTripCount(); BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, nullptr, "active.lane.mask"); } else { - VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); } return BlockMaskCache[BB] = BlockMask; @@ -8168,7 +8261,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VPValue *Mask = nullptr; if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); + Mask = createBlockInMask(I->getParent(), *Plan); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -8189,22 +8282,11 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also /// insert a recipe to expand the step for the induction recipe. 
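createBlockInMask above forms the header mask either as IV <= backedge-taken-count or via the active.lane.mask intrinsic against the trip count. A scalar model of what each lane of those masks evaluates to (illustrative only; the overflow and saturation subtleties of the real intrinsic are ignored):

#include <cstdio>
#include <vector>

// Lane i of the mask guards scalar iteration iv + i.
std::vector<bool> maskFromBTC(unsigned long iv, unsigned long btc, unsigned vf) {
  std::vector<bool> m(vf);
  for (unsigned i = 0; i < vf; ++i)
    m[i] = (iv + i) <= btc;              // IV <= backedge-taken count
  return m;
}

std::vector<bool> activeLaneMask(unsigned long iv, unsigned long tc, unsigned vf) {
  std::vector<bool> m(vf);
  for (unsigned i = 0; i < vf; ++i)
    m[i] = (iv + i) < tc;                // lane semantics of the intrinsic
  return m;
}

int main() {
  // Trip count 10, VF 4: the third vector iteration covers iterations 8..11.
  auto a = maskFromBTC(8, /*btc=*/9, 4);
  auto b = activeLaneMask(8, /*tc=*/10, 4);
  for (unsigned i = 0; i < 4; ++i)
    std::printf("lane %u: %d %d\n", i, (int)a[i], (int)b[i]); // 1 1, 1 1, 0 0, 0 0
}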
-static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( - PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, - const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, - VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { - // Returns true if an instruction \p I should be scalarized instead of - // vectorized for the chosen vectorization factor. - auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF); - }; - - bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { - return ShouldScalarizeInstruction(PhiOrTrunc, VF); - }, - Range); +static VPWidenIntOrFpInductionRecipe * +createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, + VPValue *Start, const InductionDescriptor &IndDesc, + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, + VFRange &Range) { assert(IndDesc.getStartValue() == Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && @@ -8213,12 +8295,10 @@ static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, - !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI); } assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, - !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8227,14 +8307,13 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, + return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, *PSE.getSE(), *OrigLoop, Range); // Check if this is pointer induction. If so, build the recipe for it. 
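createWidenInductionRecipes above builds a widened induction whose vector value holds VF consecutive values of the scalar induction and is advanced by VF * Step every vector iteration. A scalar C++ model of that shape (values chosen arbitrarily):

#include <cstdio>
#include <vector>

// Illustrative model of a widened integer induction: lane i holds
// start + (iv + i) * step, and the whole vector advances by VF * step.
int main() {
  const int start = 5, step = 3, vf = 4, tripCount = 8;
  std::vector<int> widenedIV(vf);
  for (int i = 0; i < vf; ++i)
    widenedIV[i] = start + i * step;           // initial splat + step vector

  for (int iv = 0; iv < tripCount; iv += vf) {
    for (int i = 0; i < vf; ++i)
      std::printf("%d ", widenedIV[i]);        // induction values for this step
    std::printf("\n");
    for (int i = 0; i < vf; ++i)
      widenedIV[i] += vf * step;               // vector increment
  }
}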
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), *PSE.getSE()); - assert(isa<SCEVConstant>(II->getStep())); return new VPWidenPointerInductionRecipe( Phi, Operands[0], Step, *II, LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8267,9 +8346,9 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( auto *Phi = cast<PHINode>(I->getOperand(0)); const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); - VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, - *PSE.getSE(), *OrigLoop, Range); + VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); + return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), + *OrigLoop, Range); } return nullptr; } @@ -8309,7 +8388,7 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = - createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); + createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); assert((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"); OperandsWithMask.push_back(Operands[In]); @@ -8321,8 +8400,8 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands, - VFRange &Range) const { - + VFRange &Range, + VPlanPtr &Plan) { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI, VF); @@ -8339,17 +8418,17 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); + SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); // Is it beneficial to perform intrinsic call compared to lib call? bool ShouldUseVectorIntrinsic = ID && LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) -> bool { - bool NeedToScalarize = false; + Function *Variant; // Is it beneficial to perform intrinsic call compared to lib // call? InstructionCost CallCost = - CM.getVectorCallCost(CI, VF, NeedToScalarize); + CM.getVectorCallCost(CI, VF, &Variant); InstructionCost IntrinsicCost = CM.getVectorIntrinsicCost(CI, VF); return IntrinsicCost <= CallCost; @@ -8358,6 +8437,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, if (ShouldUseVectorIntrinsic) return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); + Function *Variant = nullptr; + ElementCount VariantVF; + bool NeedsMask = false; // Is better to call a vectorized version of the function than to to scalarize // the call? auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8365,14 +8447,57 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, // The following case may be scalarized depending on the VF. // The flag shows whether we can use a usual Call for vectorized // version of the instruction. - bool NeedToScalarize = false; - CM.getVectorCallCost(CI, VF, NeedToScalarize); - return !NeedToScalarize; + + // If we've found a variant at a previous VF, then stop looking. 
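tryToBlend above lowers a header-block phi into a blend keyed by edge masks. Conceptually, per lane, that is a chain of selects that folds in each further incoming value under its mask; the sketch below is illustrative only and does not mirror the exact VPBlendRecipe operand layout:

#include <cstdio>
#include <vector>

// Start from the first incoming value and select each later incoming value
// where its edge mask is true.
int blendPhi(const std::vector<int> &incoming, const std::vector<bool> &edgeMask) {
  int result = incoming[0];
  for (size_t in = 1; in < incoming.size(); ++in)
    result = edgeMask[in] ? incoming[in] : result;   // select(mask, new, old)
  return result;
}

int main() {
  // Two incoming values; this lane reached the phi via the second predecessor.
  std::printf("%d\n", blendPhi({10, 20}, {false, true}));  // 20
}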
A + // vectorized variant of a function expects input in a certain shape + // -- basically the number of input registers, the number of lanes + // per register, and whether there's a mask required. + // We store a pointer to the variant in the VPWidenCallRecipe, so + // once we have an appropriate variant it's only valid for that VF. + // This will force a different vplan to be generated for each VF that + // finds a valid variant. + if (Variant) + return false; + CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask); + // If we found a valid vector variant at this VF, then store the VF + // in case we need to generate a mask. + if (Variant) + VariantVF = VF; + return Variant != nullptr; }, Range); - if (ShouldUseVectorCall) + if (ShouldUseVectorCall) { + if (NeedsMask) { + // We have 2 cases that would require a mask: + // 1) The block needs to be predicated, either due to a conditional + // in the scalar loop or use of an active lane mask with + // tail-folding, and we use the appropriate mask for the block. + // 2) No mask is required for the block, but the only available + // vector variant at this VF requires a mask, so we synthesize an + // all-true mask. + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(CI)) + Mask = createBlockInMask(CI->getParent(), *Plan); + else + Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( + IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); + + VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true); + unsigned MaskPos = 0; + + for (const VFInfo &Info : VFDatabase::getMappings(*CI)) + if (Info.Shape == Shape) { + assert(Info.isMasked() && "Vector function info shape mismatch"); + MaskPos = Info.getParamIndexForOptionalMask().value(); + break; + } + + Ops.insert(Ops.begin() + MaskPos, Mask); + } + return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), - Intrinsic::not_intrinsic); + Intrinsic::not_intrinsic, Variant); + } return nullptr; } @@ -8405,9 +8530,9 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, // div/rem operation itself. Otherwise fall through to general handling below. 
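The hunk above teaches tryToWidenCall to pick a vector variant that takes a mask, inserting the block mask at the variant's mask-parameter position, or synthesizing an all-true mask when the block itself needs no predication. A per-lane model of why the mask matters (illustrative; scalarFn is a hypothetical callee):

#include <cstdio>
#include <vector>

// Inactive lanes must not run the scalar body (it may trap or have side
// effects), so the masked variant skips them.
float scalarFn(float x) { return 1.0f / x; }     // hypothetical callee

std::vector<float> maskedVectorCall(const std::vector<float> &x,
                                    const std::vector<bool> &mask) {
  std::vector<float> r(x.size(), 0.0f);          // inactive lanes: don't care
  for (size_t i = 0; i < x.size(); ++i)
    if (mask[i])
      r[i] = scalarFn(x[i]);
  return r;
}

int main() {
  std::vector<float> x = {1, 2, 0, 4};
  std::vector<bool> needed = {true, true, false, true};   // block mask
  std::vector<bool> allTrue(x.size(), true);              // synthesized mask
  auto a = maskedVectorCall(x, needed);
  auto b = maskedVectorCall(x, allTrue);
  // Lane 2: masked call leaves it untouched, unmasked call divides by zero.
  std::printf("%f %f\n", a[2], b[2]);
}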
if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); - VPValue *Mask = createBlockInMask(I->getParent(), Plan); - VPValue *One = - Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false)); + VPValue *Mask = createBlockInMask(I->getParent(), *Plan); + VPValue *One = Plan->getVPValueOrAddLiveIn( + ConstantInt::get(I->getType(), 1u, false)); auto *SafeRHS = new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, I->getDebugLoc()); @@ -8415,38 +8540,26 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, Ops[1] = SafeRHS; return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); } - LLVM_FALLTHROUGH; + [[fallthrough]]; } case Instruction::Add: case Instruction::And: case Instruction::AShr: - case Instruction::BitCast: case Instruction::FAdd: case Instruction::FCmp: case Instruction::FDiv: case Instruction::FMul: case Instruction::FNeg: - case Instruction::FPExt: - case Instruction::FPToSI: - case Instruction::FPToUI: - case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: case Instruction::ICmp: - case Instruction::IntToPtr: case Instruction::LShr: case Instruction::Mul: case Instruction::Or: - case Instruction::PtrToInt: case Instruction::Select: - case Instruction::SExt: case Instruction::Shl: - case Instruction::SIToFP: case Instruction::Sub: - case Instruction::Trunc: - case Instruction::UIToFP: case Instruction::Xor: - case Instruction::ZExt: case Instruction::Freeze: return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); }; @@ -8462,9 +8575,9 @@ void VPRecipeBuilder::fixHeaderPhis() { } } -VPBasicBlock *VPRecipeBuilder::handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - VPlanPtr &Plan) { +VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, + VFRange &Range, + VPlan &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); @@ -8501,83 +8614,22 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( break; } } - - auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), - IsUniform, IsPredicated); - - // Find if I uses a predicated instruction. If so, it will use its scalar - // value. Avoid hoisting the insert-element which packs the scalar value into - // a vector value, as that happens iff all users use the vector value. - for (VPValue *Op : Recipe->operands()) { - auto *PredR = - dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe()); - if (!PredR) - continue; - auto *RepR = cast<VPReplicateRecipe>( - PredR->getOperand(0)->getDefiningRecipe()); - assert(RepR->isPredicated() && - "expected Replicate recipe to be predicated"); - RepR->setAlsoPack(false); - } - - // Finalize the recipe for Instr, first if it is not predicated. + VPValue *BlockInMask = nullptr; if (!IsPredicated) { + // Finalize the recipe for Instr, first if it is not predicated. LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); - setRecipe(I, Recipe); - Plan->addVPValue(I, Recipe); - VPBB->appendRecipe(Recipe); - return VPBB; - } - LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - - VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); - assert(SingleSucc && "VPBB must have a single successor when handling " - "predicated replication."); - VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); - // Record predicated instructions for above packing optimizations. 
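For predicated div/rem, the code above replaces the divisor with select(mask, rhs, 1) so the unconditional vector divide cannot trap on masked-off lanes. A per-lane sketch of that transformation:

#include <cstdio>
#include <vector>

// Lanes whose mask is false get a dummy divisor of 1; their results are
// never used by the rest of the vectorized code.
std::vector<unsigned> predicatedUDiv(const std::vector<unsigned> &a,
                                     const std::vector<unsigned> &b,
                                     const std::vector<bool> &mask) {
  std::vector<unsigned> r(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    unsigned safeRHS = mask[i] ? b[i] : 1u;   // select(mask, rhs, 1)
    r[i] = a[i] / safeRHS;                    // unconditional vector divide
  }
  return r;
}

int main() {
  auto r = predicatedUDiv({8, 9, 10}, {2, 0, 5}, {true, false, true});
  std::printf("%u %u %u\n", r[0], r[1], r[2]);  // 4, (ignored) 9, 2
}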
- VPBlockBase *Region = createReplicateRegion(Recipe, Plan); - VPBlockUtils::insertBlockAfter(Region, VPBB); - auto *RegSucc = new VPBasicBlock(); - VPBlockUtils::insertBlockAfter(RegSucc, Region); - VPBlockUtils::connectBlocks(RegSucc, SingleSucc); - return RegSucc; -} - -VPRegionBlock * -VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe, - VPlanPtr &Plan) { - Instruction *Instr = PredRecipe->getUnderlyingInstr(); - // Instructions marked for predication are replicated and placed under an - // if-then construct to prevent side-effects. - // Generate recipes to compute the block mask for this region. - VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); - - // Build the triangular if-then region. - std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); - assert(Instr->getParent() && "Predicated instruction not in any basic block"); - auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = Instr->getType()->isVoidTy() - ? nullptr - : new VPPredInstPHIRecipe(PredRecipe); - if (PHIRecipe) { - setRecipe(Instr, PHIRecipe); - Plan->addVPValue(Instr, PHIRecipe); } else { - setRecipe(Instr, PredRecipe); - Plan->addVPValue(Instr, PredRecipe); + LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); + // Instructions marked for predication are replicated and a mask operand is + // added initially. Masked replicate recipes will later be placed under an + // if-then construct to prevent side-effects. Generate recipes to compute + // the block mask for this region. + BlockInMask = createBlockInMask(I->getParent(), Plan); } - auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); - - // Note: first set Entry as region entry and then connect successors starting - // from it in order, to propagate the "parent" of each VPBasicBlock. 
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); - VPBlockUtils::connectBlocks(Pred, Exiting); - - return Region; + auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), + IsUniform, BlockInMask); + return toVPRecipeResult(Recipe); } VPRecipeOrVPValueTy @@ -8643,7 +8695,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return nullptr; if (auto *CI = dyn_cast<CallInst>(Instr)) - return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); + return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); @@ -8653,13 +8705,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) return toVPRecipeResult(new VPWidenGEPRecipe( - GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); + GEP, make_range(Operands.begin(), Operands.end()))); if (auto *SI = dyn_cast<SelectInst>(Instr)) { - bool InvariantCond = - PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); return toVPRecipeResult(new VPWidenSelectRecipe( - *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); + *SI, make_range(Operands.begin(), Operands.end()))); + } + + if (auto *CI = dyn_cast<CastInst>(Instr)) { + return toVPRecipeResult( + new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI)); } return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); @@ -8677,34 +8732,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - // Dead instructions do not need sinking. Remove them from SinkAfter. - for (Instruction *I : DeadInstructions) - SinkAfter.erase(I); - - // Cannot sink instructions after dead instructions (there won't be any - // recipes for them). Instead, find the first non-dead previous instruction. - for (auto &P : Legal->getSinkAfter()) { - Instruction *SinkTarget = P.second; - Instruction *FirstInst = &*SinkTarget->getParent()->begin(); - (void)FirstInst; - while (DeadInstructions.contains(SinkTarget)) { - assert( - SinkTarget != FirstInst && - "Must find a live instruction (at least the one feeding the " - "fixed-order recurrence PHI) before reaching beginning of the block"); - SinkTarget = SinkTarget->getPrevNode(); - assert(SinkTarget != P.first && - "sink source equals target, no sinking required"); - } - P.second = SinkTarget; - } - - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + auto MaxVFTimes2 = MaxVF * 2; + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { + VFRange SubRange = {VF, MaxVFTimes2}; + if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) + VPlans.push_back(std::move(*Plan)); VF = SubRange.End; } } @@ -8712,10 +8744,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. 
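The addCanonicalIVRecipes function that follows wires up the canonical induction variable and the branch recipe that controls the vector loop. In the simplest configuration sketched here (no tail folding), the generated control reduces to the scalar model below; vectorTripCount stands for the trip count rounded down to a multiple of VF * UF:

#include <cstdio>

// Illustrative scalar model of canonical IV plus BranchOnCount control: the
// IV starts at 0, is bumped by VF * UF per vector iteration, and the loop
// exits once it reaches the vector trip count. The do/while reflects that the
// vector loop only runs after the minimum-iteration-count guard passed.
int main() {
  const unsigned long tripCount = 37;
  const unsigned vf = 4, uf = 2;
  const unsigned long step = (unsigned long)vf * uf;
  const unsigned long vectorTripCount = (tripCount / step) * step; // 32

  unsigned long iv = 0;                 // canonical IV phi
  do {
    // ... vector body for scalar iterations [iv, iv + step) ...
    iv += step;                         // canonical IV increment
  } while (iv != vectorTripCount);      // branch-on-count
  std::printf("scalar remainder starts at %lu\n", iv);   // 32
}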
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW, - bool UseLaneMaskForLoopControlFlow) { + TailFoldingStyle Style) { Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getOrAddVPValue(StartIdx); + auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); @@ -8725,6 +8756,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. + bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, @@ -8732,11 +8764,10 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - EB->appendRecipe(CanonicalIVIncrement); - - if (UseLaneMaskForLoopControlFlow) { + if (useActiveLaneMaskForControlFlow(Style)) { // Create the active lane mask instruction in the vplan preheader. - VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + VPBasicBlock *VecPreheader = + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()); // We can't use StartV directly in the ActiveLaneMask VPInstruction, since // we have to take unrolling into account. Each part needs to start at @@ -8745,14 +8776,34 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW : VPInstruction::CanonicalIVIncrementForPart, {StartV}, DL, "index.part.next"); - Preheader->appendRecipe(CanonicalIVIncrementParts); + VecPreheader->appendRecipe(CanonicalIVIncrementParts); // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getOrCreateTripCount(); + VPValue *TC = Plan.getTripCount(); + + VPValue *TripCount, *IncrementValue; + if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + // When avoiding a runtime check, the active.lane.mask inside the loop + // uses a modified trip count and the induction variable increment is + // done after the active.lane.mask intrinsic is called. + auto *TCMinusVF = + new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); + VecPreheader->appendRecipe(TCMinusVF); + IncrementValue = CanonicalIVPHI; + TripCount = TCMinusVF; + } else { + // When the loop is guarded by a runtime overflow check for the loop + // induction variable increment by VF, we can increment the value before + // the get.active.lane mask and use the unmodified tripcount. + EB->appendRecipe(CanonicalIVIncrement); + IncrementValue = CanonicalIVIncrement; + TripCount = TC; + } + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, {CanonicalIVIncrementParts, TC}, DL, "active.lane.mask.entry"); - Preheader->appendRecipe(EntryALM); + VecPreheader->appendRecipe(EntryALM); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. @@ -8763,15 +8814,21 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, CanonicalIVIncrementParts = new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementForPartNUW : VPInstruction::CanonicalIVIncrementForPart, - {CanonicalIVIncrement}, DL); + {IncrementValue}, DL); EB->appendRecipe(CanonicalIVIncrementParts); auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TC}, DL, + {CanonicalIVIncrementParts, TripCount}, DL, "active.lane.mask.next"); EB->appendRecipe(ALM); LaneMaskPhi->addOperand(ALM); + if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + // Do the increment of the canonical IV after the active.lane.mask, because + // that value is still based off %CanonicalIVPHI + EB->appendRecipe(CanonicalIVIncrement); + } + // We have to invert the mask here because a true condition means jumping // to the exit block. auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); @@ -8781,6 +8838,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); EB->appendRecipe(BranchBack); } else { + EB->appendRecipe(CanonicalIVIncrement); + // Add the BranchOnCount VPInstruction to the latch. VPInstruction *BranchBack = new VPInstruction( VPInstruction::BranchOnCount, @@ -8804,14 +8863,13 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, for (PHINode &ExitPhi : ExitBB->phis()) { Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB); - VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); + VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); Plan.addLiveOut(&ExitPhi, V); } } -VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, - const MapVector<Instruction *, Instruction *> &SinkAfter) { +std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions) { SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; @@ -8822,12 +8880,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // process after constructing the initial VPlan. // --------------------------------------------------------------------------- - // Mark instructions we'll need to sink later and their targets as - // ingredients whose recipe we'll need to record. - for (const auto &Entry : SinkAfter) { - RecipeBuilder.recordRecipeOf(Entry.first); - RecipeBuilder.recordRecipeOf(Entry.second); - } for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurKind Kind = @@ -8852,9 +8904,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // single VPInterleaveRecipe. for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 - CM.getWideningDecision(IG->getInsertPos(), VF) == - LoopVectorizationCostModel::CM_Interleave); + bool Result = (VF.isVector() && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + // For scalable vectors, the only interleave factor currently supported + // is 2 since we require the (de)interleave2 intrinsics instead of + // shufflevectors. 
+ assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + "Unsupported interleave factor for scalable vectors"); + return Result; }; if (!getDecisionAndClampRange(applyIG, Range)) continue; @@ -8869,26 +8927,34 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, starting with a block for the pre-header, - // followed by a region for the vector loop, followed by the middle block. The - // skeleton vector loop region contains a header and latch block. - VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique<VPlan>(Preheader); - + // Create initial VPlan skeleton, having a basic block for the pre-header + // which contains SCEV expansions that need to happen before the CFG is + // modified; a basic block for the vector pre-header, followed by a region for + // the vector loop, followed by the middle basic block. The skeleton vector + // loop region contains a header and latch basic blocks. + VPlanPtr Plan = VPlan::createInitialVPlan( + createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), + *PSE.getSE()); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); - VPBlockUtils::insertBlockAfter(TopRegion, Preheader); + VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry()); VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); + // Don't use getDecisionAndClampRange here, because we don't know the UF + // so this function is better to be conservative, rather than to split + // it up into different VPlans. + bool IVUpdateMayOverflow = false; + for (ElementCount VF : Range) + IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); + Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), - CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle(IVUpdateMayOverflow)); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -8896,18 +8962,16 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; - SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. - unsigned VPBBsForBB = 0; if (VPBB != HeaderVPBB) VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. 
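The tail-folding handling above distinguishes two orderings when the active lane mask drives control flow: with a runtime overflow check the IV is incremented first and compared against the unmodified trip count, while the without-runtime-check style compares the un-incremented IV against a trip count reduced by VF and only then increments, so the addition can never wrap. A lane-0 scalar model of the two orderings (illustrative; the trip-count-minus-VF step is modeled as a saturating subtract):

#include <cstdio>

bool nextMaskWithRuntimeCheck(unsigned long &iv, unsigned long tc, unsigned vf) {
  iv += vf;                         // canonical IV increment first
  return iv < tc;                   // active lane mask against full trip count
}

bool nextMaskWithoutRuntimeCheck(unsigned long &iv, unsigned long tc, unsigned vf) {
  unsigned long tcMinusVF = tc >= vf ? tc - vf : 0;
  bool active = iv < tcMinusVF;     // mask from the un-incremented IV
  iv += vf;                         // increment afterwards
  return active;
}

int main() {
  unsigned long ivA = 0, ivB = 0, tc = 10;
  unsigned vf = 4;
  // Both schemes execute the same number of vector iterations.
  while (nextMaskWithRuntimeCheck(ivA, tc, vf)) {}
  while (nextMaskWithoutRuntimeCheck(ivB, tc, vf)) {}
  std::printf("final IVs: %lu %lu\n", ivA, ivB);   // 12 12
}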
- for (Instruction &I : BB->instructionsWithoutDebug()) { + for (Instruction &I : BB->instructionsWithoutDebug(false)) { Instruction *Instr = &I; // First filter out irrelevant instructions, to ensure no recipes are @@ -8918,7 +8982,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( SmallVector<VPValue *, 4> Operands; auto *Phi = dyn_cast<PHINode>(Instr); if (Phi && Phi->getParent() == OrigLoop->getHeader()) { - Operands.push_back(Plan->getOrAddVPValue( + Operands.push_back(Plan->getVPValueOrAddLiveIn( Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); } else { auto OpRange = Plan->mapToVPValues(Instr->operands()); @@ -8932,50 +8996,36 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) continue; - if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( - Instr, Operands, Range, VPBB, Plan)) { - // If Instr can be simplified to an existing VPValue, use it. - if (RecipeOrValue.is<VPValue *>()) { - auto *VPV = RecipeOrValue.get<VPValue *>(); - Plan->addVPValue(Instr, VPV); - // If the re-used value is a recipe, register the recipe for the - // instruction, in case the recipe for Instr needs to be recorded. - if (VPRecipeBase *R = VPV->getDefiningRecipe()) - RecipeBuilder.setRecipe(Instr, R); - continue; - } - // Otherwise, add the new recipe. - VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); - for (auto *Def : Recipe->definedValues()) { - auto *UV = Def->getUnderlyingValue(); - Plan->addVPValue(UV, Def); - } - - if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && - HeaderVPBB->getFirstNonPhi() != VPBB->end()) { - // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section - // of the header block. That can happen for truncates of induction - // variables. Those recipes are moved to the phi section of the header - // block after applying SinkAfter, which relies on the original - // position of the trunc. - assert(isa<TruncInst>(Instr)); - InductionsToMove.push_back( - cast<VPWidenIntOrFpInductionRecipe>(Recipe)); - } - RecipeBuilder.setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); + auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( + Instr, Operands, Range, VPBB, Plan); + if (!RecipeOrValue) + RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); + // If Instr can be simplified to an existing VPValue, use it. + if (isa<VPValue *>(RecipeOrValue)) { + auto *VPV = cast<VPValue *>(RecipeOrValue); + Plan->addVPValue(Instr, VPV); + // If the re-used value is a recipe, register the recipe for the + // instruction, in case the recipe for Instr needs to be recorded. + if (VPRecipeBase *R = VPV->getDefiningRecipe()) + RecipeBuilder.setRecipe(Instr, R); continue; } - - // Otherwise, if all widening options failed, Instruction is to be - // replicated. This may create a successor for VPBB. - VPBasicBlock *NextVPBB = - RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); - if (NextVPBB != VPBB) { - VPBB = NextVPBB; - VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) - : ""); + // Otherwise, add the new recipe. 
+ VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); + for (auto *Def : Recipe->definedValues()) { + auto *UV = Def->getUnderlyingValue(); + Plan->addVPValue(UV, Def); } + + RecipeBuilder.setRecipe(Instr, Recipe); + if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && + HeaderVPBB->getFirstNonPhi() != VPBB->end()) { + // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the + // phi section of HeaderVPBB. + assert(isa<TruncInst>(Instr)); + Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + } else + VPBB->appendRecipe(Recipe); } VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); @@ -8985,7 +9035,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // After here, VPBB should not be used. VPBB = nullptr; - addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); + if (CM.requiresScalarEpilogue(Range)) { + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + } else + addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && @@ -8998,116 +9053,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // bring the VPlan to its final state. // --------------------------------------------------------------------------- - // Apply Sink-After legal constraints. - auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); - if (Region && Region->isReplicator()) { - assert(Region->getNumSuccessors() == 1 && - Region->getNumPredecessors() == 1 && "Expected SESE region!"); - assert(R->getParent()->size() == 1 && - "A recipe in an original replicator region must be the only " - "recipe in its block"); - return Region; - } - return nullptr; - }; - for (const auto &Entry : SinkAfter) { - VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); - VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - - auto *TargetRegion = GetReplicateRegion(Target); - auto *SinkRegion = GetReplicateRegion(Sink); - if (!SinkRegion) { - // If the sink source is not a replicate region, sink the recipe directly. - if (TargetRegion) { - // The target is in a replication region, make sure to move Sink to - // the block after it, not into the replication region itself. - VPBasicBlock *NextBlock = - cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); - Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - } else - Sink->moveAfter(Target); - continue; - } - - // The sink source is in a replicate region. Unhook the region from the CFG. - auto *SinkPred = SinkRegion->getSinglePredecessor(); - auto *SinkSucc = SinkRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); - VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); - VPBlockUtils::connectBlocks(SinkPred, SinkSucc); - - if (TargetRegion) { - // The target recipe is also in a replicate region, move the sink region - // after the target region. 
- auto *TargetSucc = TargetRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); - VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); - } else { - // The sink source is in a replicate region, we need to move the whole - // replicate region, which should only contain a single recipe in the - // main block. - auto *SplitBlock = - Target->getParent()->splitAt(std::next(Target->getIterator())); - - auto *SplitPred = SplitBlock->getSinglePredecessor(); - - VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); - VPBlockUtils::connectBlocks(SplitPred, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - } - } - - VPlanTransforms::removeRedundantCanonicalIVs(*Plan); - VPlanTransforms::removeRedundantInductionCasts(*Plan); - - // Now that sink-after is done, move induction recipes for optimized truncates - // to the phi section of the header block. - for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) - Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); - // Introduce a recipe to combine the incoming and previous values of a - // fixed-order recurrence. - for (VPRecipeBase &R : - Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); - if (!RecurPhi) - continue; - - VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe(); - // Fixed-order recurrences do not contain cycles, so this loop is guaranteed - // to terminate. - while (auto *PrevPhi = - dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe)) - PrevRecipe = &PrevPhi->getBackedgeRecipe(); - VPBasicBlock *InsertBlock = PrevRecipe->getParent(); - auto *Region = GetReplicateRegion(PrevRecipe); - if (Region) - InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); - if (!InsertBlock) { - InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); - VPBlockUtils::insertBlockAfter(InsertBlock, Region); - } - if (Region || PrevRecipe->isPhi()) - Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); - else - Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); - - auto *RecurSplice = cast<VPInstruction>( - Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, - {RecurPhi, RecurPhi->getBackedgeValue()})); - - RecurPhi->replaceAllUsesWith(RecurSplice); - // Set the first operand of RecurSplice to RecurPhi again, after replacing - // all users. - RecurSplice->setOperand(0, RecurPhi); - } - // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. 
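As the interleave-group handling notes, scalable vectors are limited to an interleave factor of 2, because lowering uses interleave/deinterleave of two registers rather than arbitrary shuffles. Element-wise, interleaving two vectors simply alternates their lanes; a scalar model for illustration:

#include <cstdio>
#include <vector>

// Interleaving A and B produces A0 B0 A1 B1 ...
std::vector<int> interleave2(const std::vector<int> &a, const std::vector<int> &b) {
  std::vector<int> out;
  out.reserve(a.size() * 2);
  for (size_t i = 0; i < a.size(); ++i) {
    out.push_back(a[i]);
    out.push_back(b[i]);
  }
  return out;
}

int main() {
  for (int v : interleave2({0, 2, 4}, {1, 3, 5}))
    std::printf("%d ", v);    // 0 1 2 3 4 5
  std::printf("\n");
}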
@@ -9122,48 +9071,66 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( StoredValues.push_back(StoreR->getStoredValue()); } + bool NeedsMaskForGaps = + IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, - Recipe->getMask()); + Recipe->getMask(), NeedsMaskForGaps); VPIG->insertBefore(Recipe); unsigned J = 0; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *Member = IG->getMember(i)) { + VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); if (!Member->getType()->isVoidTy()) { - VPValue *OriginalV = Plan->getVPValue(Member); - Plan->removeVPValueFor(Member); - Plan->addVPValue(Member, VPIG->getVPValue(J)); + VPValue *OriginalV = MemberR->getVPSingleValue(); OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); J++; } - RecipeBuilder.getRecipe(Member)->eraseFromParent(); + MemberR->eraseFromParent(); } } - for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); - VF *= 2) + for (ElementCount VF : Range) Plan->addVF(VF); Plan->setName("Initial VPlan"); + // Replace VPValues for known constant strides guaranteed by predicate scalar + // evolution. + for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { + auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); + auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); + // Only handle constant strides for now. + if (!ScevStride) + continue; + Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); + + auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); + // The versioned value may not be used in the loop directly, so just add a + // new live-in in those cases. + Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); + } + // From this point onwards, VPlan-to-VPlan transformations may change the plan // in ways that accessing values using original IR values is incorrect. Plan->disableValue2VPValue(); + // Sink users of fixed-order recurrence past the recipe defining the previous + // value and introduce FirstOrderRecurrenceSplice VPInstructions. 
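The fixed-order-recurrence adjustment referenced just below introduces a splice so each vector iteration can see the value produced one scalar iteration earlier: the last lane carried over from the previous vector iteration followed by lanes 0..VF-2 of the current one. An illustrative scalar model for a loop computing out[i] = a[i-1] + a[i]:

#include <cstdio>
#include <vector>

// Build the "previous iteration" vector from the carried last lane and the
// current vector's lower lanes.
std::vector<int> splice(int carriedLastLane, const std::vector<int> &current) {
  std::vector<int> s(current.size());
  s[0] = carriedLastLane;
  for (size_t i = 1; i < current.size(); ++i)
    s[i] = current[i - 1];
  return s;
}

int main() {
  std::vector<int> a = {10, 11, 12, 13, 14, 15, 16, 17};
  int carried = -1;                    // value of a[-1] entering the loop
  for (size_t base = 0; base < a.size(); base += 4) {
    std::vector<int> cur(a.begin() + base, a.begin() + base + 4);
    std::vector<int> prev = splice(carried, cur);
    for (size_t i = 0; i < 4; ++i)
      std::printf("%d ", prev[i] + cur[i]);   // a[i-1] + a[i]
    carried = cur.back();                     // carry the last lane forward
  }
  std::printf("\n");                          // 9 21 23 25 27 29 31 33
}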
+ if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) + return std::nullopt; + + VPlanTransforms::removeRedundantCanonicalIVs(*Plan); + VPlanTransforms::removeRedundantInductionCasts(*Plan); + VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); VPlanTransforms::removeDeadRecipes(*Plan); - bool ShouldSimplify = true; - while (ShouldSimplify) { - ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan); - ShouldSimplify |= - VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan); - ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); - } + VPlanTransforms::createAndOptimizeReplicateRegions(*Plan); VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); - return Plan; + return std::make_optional(std::move(Plan)); } VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { @@ -9175,21 +9142,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = std::make_unique<VPlan>(); + auto Plan = VPlan::createInitialVPlan( + createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), + *PSE.getSE()); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); - for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); - VF *= 2) + for (ElementCount VF : Range) Plan->addVF(VF); - SmallPtrSet<Instruction *, 1> DeadInstructions; VPlanTransforms::VPInstructionsToVPRecipes( - OrigLoop, Plan, + Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, - DeadInstructions, *PSE.getSE(), *TLI); + *PSE.getSE(), *TLI); // Remove the existing terminator of the exiting block of the top-most region. // A BranchOnCount will be added instead when adding the canonical IV recipes. 
@@ -9198,7 +9165,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, CM.useActiveLaneMaskForControlFlow()); + CM.getTailFoldingStyle()); return Plan; } @@ -9255,7 +9222,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(WidenRecipe->getParent(), WidenRecipe->getIterator()); - CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan); + CondOp = RecipeBuilder.createBlockInMask(R->getParent(), *Plan); } if (IsFMulAdd) { @@ -9270,7 +9237,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VecOp = FMulRecipe; } VPReductionRecipe *RedRecipe = - new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); + new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, &TTI); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); @@ -9304,13 +9271,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (!PhiR || PhiR->isInLoop()) continue; VPValue *Cond = - RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), *Plan); VPValue *Red = PhiR->getBackedgeValue(); assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); } } + + VPlanTransforms::clearReductionWrapFlags(*Plan); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -9475,7 +9444,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); - Value *Step = State.get(getOperand(1), VPIteration(0, Part)); + Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); Value *SclrGep = emitTransformedIndex( State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); SclrGep->setName("next.gep"); @@ -9485,8 +9454,6 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { return; } - assert(isa<SCEVConstant>(IndDesc.getStep()) && - "Induction step not a SCEV constant!"); Type *PhiType = IndDesc.getStep()->getType(); // Build a pointer phi @@ -9506,7 +9473,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { Value *NumUnrolledElems = State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); Value *InductionGEP = GetElementPtrInst::Create( - IndDesc.getElementType(), NewPointerPhi, + State.Builder.getInt8Ty(), NewPointerPhi, State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", InductionLoc); // Add induction update using an incorrect block temporarily. 
The phi node @@ -9529,10 +9496,10 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { StartOffset = State.Builder.CreateAdd( StartOffset, State.Builder.CreateStepVector(VecPhiType)); - assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) && + assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && "scalar step must be the same across all parts"); Value *GEP = State.Builder.CreateGEP( - IndDesc.getElementType(), NewPointerPhi, + State.Builder.getInt8Ty(), NewPointerPhi, State.Builder.CreateMul( StartOffset, State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), @@ -9584,7 +9551,8 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), - getStoredValues(), getMask()); + getStoredValues(), getMask(), + NeedsMaskForGaps); } void VPReductionRecipe::execute(VPTransformState &State) { @@ -9640,10 +9608,9 @@ void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); if (State.Instance) { // Generate a single instance. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); - State.ILV->scalarizeInstruction(UI, this, *State.Instance, - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); // Insert scalar instance packing it into a vector. - if (AlsoPack && State.VF.isVector()) { + if (State.VF.isVector() && shouldPack()) { // If we're constructing lane 0, initialize to start from poison. if (State.Instance->Lane.isFirstLane()) { assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); @@ -9663,8 +9630,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { all_of(operands(), [](VPValue *Op) { return Op->isDefinedOutsideVectorRegions(); })) { - State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated, - State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); if (user_begin() != user_end()) { for (unsigned Part = 1; Part < State.UF; ++Part) State.set(this, State.get(this, VPIteration(0, 0)), @@ -9676,16 +9642,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Uniform within VL means we need to generate lane 0 only for each // unrolled copy. for (unsigned Part = 0; Part < State.UF; ++Part) - State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); return; } - // A store of a loop varying value to a loop invariant address only - // needs only the last copy of the store. - if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) { + // A store of a loop varying value to a uniform address only needs the last + // copy of the store. 
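As the comment above says, a store of a loop-varying value to an address that is uniform after vectorization only needs the copy from the last unrolled part and last lane: every lane writes the same location, so only the final write is observable. A tiny illustrative model:

#include <cstdio>

// Scalar loop: for (i = 0..7) *p = a[i];  with p loop-invariant.
// Vectorized equivalent (VF = 4, UF = 2): a single store of a[7].
int main() {
  int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int uniformLocation = 0;            // the single address *p
  uniformLocation = a[8 - 1];         // last-part, last-lane copy only
  std::printf("%d\n", uniformLocation);   // 8, same as the scalar loop
}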
+ if (isa<StoreInst>(UI) && + vputils::isUniformAfterVectorization(getOperand(1))) { auto Lane = VPLane::getLastLaneForVF(State.VF); - State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated, + State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), State); return; } @@ -9695,8 +9661,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { const unsigned EndLane = State.VF.getKnownMinValue(); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), - IsPredicated, State); + State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { @@ -9714,7 +9679,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool CreateGatherScatter = !Consecutive; + bool CreateGatherScatter = !isConsecutive(); auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); @@ -9725,36 +9690,39 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. - GetElementPtrInst *PartPtr = nullptr; - + Value *PartPtr = nullptr; + + // Use i32 for the gep index type when the value is constant, + // or query DataLayout for a more suitable index type otherwise. + const DataLayout &DL = + Builder.GetInsertBlock()->getModule()->getDataLayout(); + Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0) + ? DL.getIndexType(ScalarDataTy->getPointerTo()) + : Builder.getInt32Ty(); bool InBounds = false; if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) InBounds = gep->isInBounds(); - if (Reverse) { + if (isReverse()) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. // RunTimeVF = VScale * VF.getKnownMinValue() // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() - Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); + Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); // NumElt = -Part * RunTimeVF - Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); + Value *NumElt = + Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); // LastLane = 1 - RunTimeVF - Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); + Value *LastLane = + Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); + PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); PartPtr = - cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); - PartPtr->setIsInBounds(InBounds); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); - PartPtr->setIsInBounds(InBounds); + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
BlockInMaskParts[Part] = Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); } else { - Value *Increment = - createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); - PartPtr = cast<GetElementPtrInst>( - Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); - PartPtr->setIsInBounds(InBounds); + Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); } unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); @@ -9774,7 +9742,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, MaskPart); } else { - if (Reverse) { + if (isReverse()) { // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); @@ -9833,7 +9801,6 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { static ScalarEpilogueLowering getScalarEpilogueLowering( Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. @@ -9869,7 +9836,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering( }; // 4) if the TTI hook indicates this is profitable, request predication. - if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI)) + TailFoldingInfo TFI(TLI, &LVL, IAI); + if (TTI->preferPredicateOverEpilogue(&TFI)) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; @@ -9880,9 +9848,29 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { if (hasVectorValue(Def, Part)) return Data.PerPartOutput[Def][Part]; + auto GetBroadcastInstrs = [this, Def](Value *V) { + bool SafeToHoist = Def->isDefinedOutsideVectorRegions(); + if (VF.isScalar()) + return V; + // Place the code for broadcasting invariant variables in the new preheader. + IRBuilder<>::InsertPointGuard Guard(Builder); + if (SafeToHoist) { + BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>( + Plan->getVectorLoopRegion()->getSinglePredecessor())]; + if (LoopVectorPreHeader) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + } + + // Place the code for broadcasting invariant variables in the new preheader. + // Broadcast the scalar into all locations in the vector. + Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); + + return Shuf; + }; + if (!hasScalarValue(Def, {Part, 0})) { Value *IRV = Def->getLiveInIRValue(); - Value *B = ILV->getBroadcastInstrs(IRV); + Value *B = GetBroadcastInstrs(IRV); set(Def, B, Part); return B; } @@ -9900,9 +9888,11 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, {Part, LastLane})) { - // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform. + // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and + // VPExpandSCEVRecipes can also be uniform. 
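The reversed-access path above computes the wide pointer for unroll part Part by stepping back Part * RunTimeVF elements and then a further RunTimeVF - 1, so the load or store covers the lowest-addressed lane of that part; the data (and any mask) is then reversed separately. A small model of the offset arithmetic, using a fixed-width VF so RunTimeVF equals VF:

#include <cstdio>

// Element offset added to the current pointer for a reversed access.
long reversePartOffset(long part, long runtimeVF) {
  long numElt = -part * runtimeVF;   // step back whole parts
  long lastLane = 1 - runtimeVF;     // back up to the lowest-addressed element
  return numElt + lastLane;
}

int main() {
  // VF = 4, two unroll parts.
  std::printf("%ld %ld\n", reversePartOffset(0, 4), reversePartOffset(1, 4));
  // -3 -7: part 0 covers p[-3..0], part 1 covers p[-7..-4].
}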
     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
-            isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
+            isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
+            isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
            "unexpected recipe found to be invariant");
     IsUniform = true;
     LastLane = 0;
@@ -9927,7 +9917,7 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
   // State, we will only generate the insertelements once.
   Value *VectorValue = nullptr;
   if (IsUniform) {
-    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
+    VectorValue = GetBroadcastInstrs(ScalarValue);
     set(Def, VectorValue, Part);
   } else {
     // Initialize packing with insertelements to start from undef.
@@ -9962,15 +9952,15 @@ static bool processLoopInVPlanNativePath(
   Function *F = L->getHeader()->getParent();
   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
-      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
+  ScalarEpilogueLowering SEL =
+      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);

   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
                                 F, &Hints, IAI);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE);

   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
@@ -10231,8 +10221,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {

   // Check the function attributes and profiles to find out if this function
   // should be optimized for size.
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
-      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
+  ScalarEpilogueLowering SEL =
+      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);

   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
@@ -10309,11 +10299,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                 F, &Hints, IAI);
-  CM.collectValuesToIgnore();
-  CM.collectElementTypesForWidening();
-
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
+                               ORE);

   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
@@ -10342,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
                                     *PSE.getSE())) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
@@ -10464,7 +10452,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {

     // Consider vectorizing the epilogue too if it's profitable.
     VectorizationFactor EpilogueVF =
-        CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+        LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
     if (EpilogueVF.Width.isVector()) {

       // The first pass vectorizes the main loop and creates a scalar epilogue
@@ -10475,8 +10463,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                          EPI, &LVL, &CM, BFI, PSI, Checks);

       VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
-      LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
-                      DT, true);
+      auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
+                                           BestMainPlan, MainILV, DT, true);
       ++LoopsVectorized;

       // Second pass vectorizes the epilogue and adjusts the control flow
@@ -10492,6 +10480,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
       Header->setName("vec.epilog.vector.body");

+      // Re-use the trip count and steps expanded for the main loop, as
+      // skeleton creation needs it as a value that dominates both the scalar
+      // and vector epilogue loops
+      // TODO: This is a workaround needed for epilogue vectorization and it
+      // should be removed once induction resume value creation is done
+      // directly in VPlan.
+      EpilogILV.setTripCount(MainILV.getTripCount());
+      for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
+        auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
+        auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
+            ExpandedSCEVs.find(ExpandR->getSCEV())->second);
+        ExpandR->replaceAllUsesWith(ExpandedVal);
+        ExpandR->eraseFromParent();
+      }
+
       // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
       // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
       // before vectorizing the epilogue loop.
@@ -10520,15 +10523,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
           }
           ResumeV = MainILV.createInductionResumeValue(
-              IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
+              IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
+              {EPI.MainLoopIterationCountCheck});
         }
         assert(ResumeV && "Must have a resume value");
-        VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
+        VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
         cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
       }

       LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
-                      DT, true);
+                      DT, true, &ExpandedSCEVs);
       ++LoopsEpilogueVectorized;

       if (!MainILV.areSafetyChecksAdded())
@@ -10581,14 +10585,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {

 LoopVectorizeResult LoopVectorizePass::runImpl(
     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
-    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+    DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
   SE = &SE_;
   LI = &LI_;
   TTI = &TTI_;
   DT = &DT_;
-  BFI = &BFI_;
+  BFI = BFI_;
   TLI = TLI_;
   AC = &AC_;
   LAIs = &LAIs_;
@@ -10604,7 +10608,7 @@ LoopVectorizeResult LoopVectorizePass::runImpl(
   // vector registers, loop vectorization may still enable scalar
   // interleaving.
   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
-      TTI->getMaxInterleaveFactor(1) < 2)
+      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
     return LoopVectorizeResult(false, false);

   bool Changed = false, CFGChanged = false;
@@ -10656,7 +10660,6 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
-    auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
    auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
    auto &AC = AM.getResult<AssumptionAnalysis>(F);
    auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
@@ -10666,12 +10669,20 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
    auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
    ProfileSummaryInfo *PSI =
        MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+    BlockFrequencyInfo *BFI = nullptr;
+    if (PSI && PSI->hasProfileSummary())
+      BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
    LoopVectorizeResult Result =
        runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
    if (!Result.MadeAnyChange)
      return PreservedAnalyses::all();
    PreservedAnalyses PA;

+    if (isAssignmentTrackingEnabled(*F.getParent())) {
+      for (auto &BB : F)
+        RemoveRedundantDbgInstrs(&BB);
+    }
+
    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
@@ -10679,6 +10690,11 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
    if (!EnableVPlanNativePath) {
      PA.preserve<LoopAnalysis>();
      PA.preserve<DominatorTreeAnalysis>();
+      PA.preserve<ScalarEvolutionAnalysis>();
+
+#ifdef EXPENSIVE_CHECKS
+      SE.verify();
+#endif
    }

    if (Result.MadeCFGChange) {
@@ -10699,8 +10715,8 @@ void LoopVectorizePass::printPipeline(
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

-  OS << "<";
+  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
-  OS << ">";
+  OS << '>';
 }
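Editorial note (not part of the commit): the CreateVecPtr hunks above switch the reverse-access pointer arithmetic from a hard-coded i32 to an IndexTy chosen via DataLayout for scalable vectors, but the offset math itself is unchanged: NumElt = -Part * RunTimeVF and LastLane = 1 - RunTimeVF, where RunTimeVF = VScale * VF.getKnownMinValue(). The standalone C++ sketch below only illustrates that arithmetic with made-up VFKnownMin and VScale values; it uses no LLVM APIs and is not the code generated by the pass.

// Editorial sketch: the per-part offsets used for reverse consecutive accesses.
#include <cstdint>
#include <iostream>

int main() {
  const std::int64_t VFKnownMin = 4; // stand-in for VF.getKnownMinValue()
  const std::int64_t VScale = 2;     // stand-in for runtime vscale (1 if fixed-width)
  const std::int64_t RunTimeVF = VScale * VFKnownMin;

  for (std::int64_t Part = 0; Part < 2; ++Part) {
    // NumElt = -Part * RunTimeVF: step back over the parts already emitted.
    const std::int64_t NumElt = -Part * RunTimeVF;
    // LastLane = 1 - RunTimeVF: rewind so the wide load/store begins at the
    // last vector element of the reversed group.
    const std::int64_t LastLane = 1 - RunTimeVF;
    std::cout << "Part " << Part << ": NumElt = " << NumElt
              << ", LastLane = " << LastLane << '\n';
  }
  return 0;
}

For a fixed-width VF of 4 (VScale = 1), Part 1 would use NumElt = -4 and LastLane = -3, which is why the GEP pair places the widened access so that it starts at the last element of that part.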