Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
| -rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2905 |
1 file changed, 1369 insertions, 1536 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index dd596c567cd4..6d28b8fabe42 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -59,7 +59,9 @@ #include "VPlan.h" #include "VPlanAnalysis.h" #include "VPlanHCFGBuilder.h" +#include "VPlanPatternMatch.h" #include "VPlanTransforms.h" +#include "VPlanVerifier.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -123,6 +125,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/VectorBuilder.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -202,6 +205,11 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold( "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks")); +static cl::opt<bool> UseLegacyCostModel( + "vectorize-use-legacy-cost-model", cl::init(false), cl::Hidden, + cl::desc("Use the legacy cost model instead of the VPlan-based cost model. " + "This option will be removed in the future.")); + // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the // vectorizer will try to fold the tail-loop (epilogue) into the vector body @@ -247,10 +255,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), - clEnumValN( - TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, - "data-and-control-without-rt-check", - "Similar to data-and-control, but remove the runtime check"))); + clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, + "data-and-control-without-rt-check", + "Similar to data-and-control, but remove the runtime check"), + clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", + "Use predicated EVL instructions for tail folding. If EVL " + "is unsupported, fallback to data-without-lane-mask."))); static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -267,11 +277,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); -static cl::opt<unsigned> TinyTripCountInterleaveThreshold( - "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, - cl::desc("We don't interleave loops with a estimated constant trip count " - "below this number")); - static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers.")); @@ -290,7 +295,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops.")); -static cl::opt<unsigned> ForceTargetInstructionCost( +cl::opt<unsigned> ForceTargetInstructionCost( "force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. 
Mostly " @@ -319,12 +324,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave( cl::desc( "Enable runtime interleaving until load/store ports are saturated")); -/// Interleave small loops with scalar reductions. -static cl::opt<bool> InterleaveSmallLoopScalarReduction( - "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, - cl::desc("Enable interleaving for loops with small iteration counts that " - "contain scalar reductions to expose ILP.")); - /// The number of stores in a loop that are allowed to need predication. static cl::opt<unsigned> NumberOfStoresToPredicate( "vectorize-num-stores-pred", cl::init(1), cl::Hidden, @@ -418,14 +417,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); } -/// A helper function that returns the reciprocal of the block probability of -/// predicated blocks. If we return X, we are assuming the predicated block -/// will execute once for every X iterations of the loop header. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -static unsigned getReciprocalPredBlockProb() { return 2; } - /// Returns "best known" trip count for the specified loop \p L as defined by /// the following procedure: /// 1) Returns exact trip count if it is known. @@ -450,37 +441,6 @@ static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, return std::nullopt; } -/// Return a vector containing interleaved elements from multiple -/// smaller input vectors. -static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, - const Twine &Name) { - unsigned Factor = Vals.size(); - assert(Factor > 1 && "Tried to interleave invalid number of vectors"); - - VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); -#ifndef NDEBUG - for (Value *Val : Vals) - assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); -#endif - - // Scalable vectors cannot use arbitrary shufflevectors (only splats), so - // must use intrinsics to interleave. - if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic( - WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, - /*FMFSource=*/nullptr, Name); - } - - // Fixed length. Start by concatenating all vectors into a wide vector. - Value *WideVec = concatenateVectors(Builder, Vals); - - // Interleave the elements into the wide vector. - const unsigned NumElts = VecTy->getElementCount().getFixedValue(); - return Builder.CreateShuffleVector( - WideVec, createInterleaveMask(NumElts, Factor), Name); -} - namespace { // Forward declare GeneratedRTChecks. class GeneratedRTChecks; @@ -552,11 +512,6 @@ public: // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } - /// A type for vectorized values in the new loop. Each value from the - /// original loop, when vectorized, is represented by UF vector values in the - /// new unrolled loop, where UF is the unroll factor. - using VectorParts = SmallVector<Value *, 2>; - /// A helper function to scalarize a single Instruction in the innermost loop. 
/// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, @@ -567,23 +522,9 @@ public: const VPIteration &Instance, VPTransformState &State); - /// Try to vectorize interleaved access group \p Group with the base address - /// given in \p Addr, optionally masking the vector operations if \p - /// BlockInMask is non-null. Use \p State to translate given VPValues to IR - /// values in the vectorized loop. - void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, - ArrayRef<VPValue *> VPDefs, - VPTransformState &State, VPValue *Addr, - ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask, bool NeedsMaskForGaps); - /// Fix the non-induction PHIs in \p Plan. void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); - /// Returns true if the reordering of FP operations is not allowed, but we are - /// able to vectorize with strict in-order reductions for the given RdxDesc. - bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); - /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop /// left off. \p Step is the SCEV-expanded induction step to use. In cases @@ -622,14 +563,6 @@ protected: BasicBlock *MiddleBlock, BasicBlock *VectorHeader, VPlan &Plan, VPTransformState &State); - /// Create the exit value of first order recurrences in the middle block and - /// update their users. - void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, - VPTransformState &State); - - /// Create code for the loop exit value of the reduction. - void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -637,11 +570,6 @@ protected: /// Returns (and creates if needed) the trip count of the widened loop. Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); - /// Returns a bitcasted value to the requested vector type. - /// Also handles bitcasts of vector<float> <-> vector<pointer> types. - Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, - const DataLayout &DL); - /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. void emitIterationCountCheck(BasicBlock *Bypass); @@ -675,17 +603,6 @@ protected: /// running the verifier. Return the preheader of the completed vector loop. BasicBlock *completeLoopSkeleton(); - /// Collect poison-generating recipes that may generate a poison value that is - /// used after vectorization, even when their operands are not poison. Those - /// recipes meet the following conditions: - /// * Contribute to the address computation of a recipe generating a widen - /// memory load/store (VPWidenMemoryInstructionRecipe or - /// VPInterleaveRecipe). - /// * Such a widen memory load/store has at least one underlying Instruction - /// that is in a basic block that needs predication and after vectorization - /// the generated instruction won't be predicated. - void collectPoisonGeneratingRecipes(VPTransformState &State); - /// Allow subclasses to override and print debug traces before/after vplan /// execution, when trace information is requested. 
virtual void printDebugTracesAtStart(){}; @@ -1028,9 +945,12 @@ void reportVectorizationFailure(const StringRef DebugMsg, << "loop not vectorized: " << OREMsg); } -void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, +/// Reports an informative message: print \p Msg for debugging purposes as well +/// as an optimization remark. Uses either \p I as location of the remark, or +/// otherwise \p TheLoop. +static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, - Instruction *I) { + Instruction *I = nullptr) { LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); ORE->emit( @@ -1057,108 +977,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, } // end namespace llvm -#ifndef NDEBUG -/// \return string containing a file name and a line # for the given loop. -static std::string getDebugLocString(const Loop *L) { - std::string Result; - if (L) { - raw_string_ostream OS(Result); - if (const DebugLoc LoopDbgLoc = L->getStartLoc()) - LoopDbgLoc.print(OS); - else - // Just print the module name. - OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); - OS.flush(); - } - return Result; -} -#endif - -void InnerLoopVectorizer::collectPoisonGeneratingRecipes( - VPTransformState &State) { - - // Collect recipes in the backward slice of `Root` that may generate a poison - // value that is used after vectorization. - SmallPtrSet<VPRecipeBase *, 16> Visited; - auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { - SmallVector<VPRecipeBase *, 16> Worklist; - Worklist.push_back(Root); - - // Traverse the backward slice of Root through its use-def chain. - while (!Worklist.empty()) { - VPRecipeBase *CurRec = Worklist.back(); - Worklist.pop_back(); - - if (!Visited.insert(CurRec).second) - continue; - - // Prune search if we find another recipe generating a widen memory - // instruction. Widen memory instructions involved in address computation - // will lead to gather/scatter instructions, which don't need to be - // handled. - if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || - isa<VPInterleaveRecipe>(CurRec) || - isa<VPScalarIVStepsRecipe>(CurRec) || - isa<VPCanonicalIVPHIRecipe>(CurRec) || - isa<VPActiveLaneMaskPHIRecipe>(CurRec)) - continue; - - // This recipe contributes to the address computation of a widen - // load/store. If the underlying instruction has poison-generating flags, - // drop them directly. - if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { - RecWithFlags->dropPoisonGeneratingFlags(); - } else { - Instruction *Instr = dyn_cast_or_null<Instruction>( - CurRec->getVPSingleValue()->getUnderlyingValue()); - (void)Instr; - assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && - "found instruction with poison generating flags not covered by " - "VPRecipeWithIRFlags"); - } - - // Add new definitions to the worklist. - for (VPValue *operand : CurRec->operands()) - if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) - Worklist.push_back(OpDef); - } - }); - - // Traverse all the recipes in the VPlan and collect the poison-generating - // recipes in the backward slice starting at the address of a VPWidenRecipe or - // VPInterleaveRecipe. 
- auto Iter = vp_depth_first_deep(State.Plan->getEntry()); - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { - for (VPRecipeBase &Recipe : *VPBB) { - if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { - Instruction &UnderlyingInstr = WidenRec->getIngredient(); - VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); - if (AddrDef && WidenRec->isConsecutive() && - Legal->blockNeedsPredication(UnderlyingInstr.getParent())) - collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); - } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { - VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); - if (AddrDef) { - // Check if any member of the interleave group needs predication. - const InterleaveGroup<Instruction> *InterGroup = - InterleaveRec->getInterleaveGroup(); - bool NeedPredication = false; - for (int I = 0, NumMembers = InterGroup->getNumMembers(); - I < NumMembers; ++I) { - Instruction *Member = InterGroup->getMember(I); - if (Member) - NeedPredication |= - Legal->blockNeedsPredication(Member->getParent()); - } - - if (NeedPredication) - collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); - } - } - } - } -} - namespace llvm { // Loop vectorization cost-model hints how the scalar epilogue loop should be @@ -1222,7 +1040,7 @@ public: bool selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); - return expectedCost(UserVF).first.isValid(); + return expectedCost(UserVF).isValid(); } /// \return The size (in bits) of the smallest and widest types in the code @@ -1298,11 +1116,9 @@ public: bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { assert(VF.isVector() && "Profitable to scalarize relevant only for VF > 1."); - - // Cost model is not run in the VPlan-native path - return conservative - // result until this changes. - if (EnableVPlanNativePath) - return false; + assert( + TheLoop->isInnermost() && + "cost-model should not be used for outer loops (in VPlan-native path)"); auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && @@ -1312,6 +1128,9 @@ public: /// Returns true if \p I is known to be uniform after vectorization. bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + assert( + TheLoop->isInnermost() && + "cost-model should not be used for outer loops (in VPlan-native path)"); // Pseudo probe needs to be duplicated for each unrolled iteration and // vector lane so that profiled loop trip count can be accurately // accumulated instead of being under counted. @@ -1321,11 +1140,6 @@ public: if (VF.isScalar()) return true; - // Cost model is not run in the VPlan-native path - return conservative - // result until this changes. - if (EnableVPlanNativePath) - return false; - auto UniformsPerVF = Uniforms.find(VF); assert(UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"); @@ -1334,14 +1148,12 @@ public: /// Returns true if \p I is known to be scalar after vectorization. bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + assert( + TheLoop->isInnermost() && + "cost-model should not be used for outer loops (in VPlan-native path)"); if (VF.isScalar()) return true; - // Cost model is not run in the VPlan-native path - return conservative - // result until this changes. 
- if (EnableVPlanNativePath) - return false; - auto ScalarsPerVF = Scalars.find(VF); assert(ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"); @@ -1399,10 +1211,9 @@ public: /// through the cost modeling. InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { assert(VF.isVector() && "Expected VF to be a vector VF"); - // Cost model is not run in the VPlan-native path - return conservative - // result until this changes. - if (EnableVPlanNativePath) - return CM_GatherScatter; + assert( + TheLoop->isInnermost() && + "cost-model should not be used for outer loops (in VPlan-native path)"); std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); @@ -1570,29 +1381,40 @@ public: /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); + bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const; /// Check if \p Instr belongs to any interleaved access group. - bool isAccessInterleaved(Instruction *Instr) { + bool isAccessInterleaved(Instruction *Instr) const { return InterleaveInfo.isInterleaved(Instr); } /// Get the interleaved access group that \p Instr belongs to. const InterleaveGroup<Instruction> * - getInterleavedAccessGroup(Instruction *Instr) { + getInterleavedAccessGroup(Instruction *Instr) const { return InterleaveInfo.getInterleaveGroup(Instr); } /// Returns true if we're required to use a scalar epilogue for at least /// the final iteration of the original loop. bool requiresScalarEpilogue(bool IsVectorizing) const { - if (!isScalarEpilogueAllowed()) + if (!isScalarEpilogueAllowed()) { + LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); return false; + } // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + LLVM_DEBUG( + dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); + return true; + } + if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { + LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " + "interleaved group requires scalar epilogue\n"); return true; - return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); + } + LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n"); + return false; } /// Returns true if we're required to use a scalar epilogue for at least @@ -1617,19 +1439,67 @@ public: } /// Returns the TailFoldingStyle that is best for the current loop. - TailFoldingStyle - getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { - if (!CanFoldTailByMasking) + TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { + if (!ChosenTailFoldingStyle) return TailFoldingStyle::None; + return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first + : ChosenTailFoldingStyle->second; + } + + /// Selects and saves TailFoldingStyle for 2 options - if IV update may + /// overflow or not. + /// \param IsScalableVF true if scalable vector factors enabled. + /// \param UserIC User specific interleave count. 
+ void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { + assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet."); + if (!Legal->canFoldTailByMasking()) { + ChosenTailFoldingStyle = + std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); + return; + } - if (ForceTailFoldingStyle.getNumOccurrences()) - return ForceTailFoldingStyle; + if (!ForceTailFoldingStyle.getNumOccurrences()) { + ChosenTailFoldingStyle = std::make_pair( + TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), + TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)); + return; + } - return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); + // Set styles when forced. + ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(), + ForceTailFoldingStyle.getValue()); + if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) + return; + // Override forced styles if needed. + // FIXME: use actual opcode/data type for analysis here. + // FIXME: Investigate opportunity for fixed vector factor. + bool EVLIsLegal = + IsScalableVF && UserIC <= 1 && + TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath && + // FIXME: implement support for max safe dependency distance. + Legal->isSafeForAnyVectorWidth(); + if (!EVLIsLegal) { + // If for some reason EVL mode is unsupported, fallback to + // DataWithoutLaneMask to try to vectorize the loop with folded tail + // in a generic way. + ChosenTailFoldingStyle = + std::make_pair(TailFoldingStyle::DataWithoutLaneMask, + TailFoldingStyle::DataWithoutLaneMask); + LLVM_DEBUG( + dbgs() + << "LV: Preference for VP intrinsics indicated. Will " + "not try to generate VP Intrinsics " + << (UserIC > 1 + ? "since interleave count specified is greater than 1.\n" + : "due to non-interleaving reasons.\n")); + } } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { + // TODO: check if it is possible to check for None style independent of + // IVUpdateMayOverflow flag in getTailFoldingStyle. return getTailFoldingStyle() != TailFoldingStyle::None; } @@ -1640,6 +1510,12 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// Returns true if VP intrinsics with explicit vector length support should + /// be generated in the tail folded loop. + bool foldTailWithEVL() const { + return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL; + } + /// Returns true if the Phi is part of an inloop reduction. bool isInLoopReduction(PHINode *Phi) const { return InLoopReductions.contains(Phi); @@ -1663,20 +1539,13 @@ public: Scalars.clear(); } - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair<InstructionCost, bool>; - /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. If \p Invalid is not nullptr, this function /// will add a pair(Instruction*, ElementCount) to \p Invalid for /// each instruction that has an Invalid cost for the given VF. 
- VectorizationCostTy + InstructionCost expectedCost(ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); @@ -1687,6 +1556,16 @@ public: /// \p VF is the vectorization factor chosen for the original loop. bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + InstructionCost getInstructionCost(Instruction *I, ElementCount VF); + + /// Return the cost of instructions in an inloop reduction pattern, if I is + /// part of that pattern. + std::optional<InstructionCost> + getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, + TTI::TargetCostKind CostKind) const; + private: unsigned NumPredStores = 0; @@ -1708,25 +1587,14 @@ private: ElementCount MaxSafeVF, bool FoldTailByMasking); + /// Checks if scalable vectorization is supported and enabled. Caches the + /// result to avoid repeated debug dumps for repeated queries. + bool isScalableVectorizationAllowed(); + /// \return the maximum legal scalable VF, based on the safe max number /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); - - /// The cost-computation logic from getInstructionCost which provides - /// the vector type as an output parameter. - InstructionCost getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy); - - /// Return the cost of instructions in an inloop reduction pattern, if I is - /// part of that pattern. - std::optional<InstructionCost> - getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, - TTI::TargetCostKind CostKind) const; - /// Calculate vectorization cost of memory instruction \p I. InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); @@ -1782,8 +1650,13 @@ private: /// iterations to execute in the scalar loop. ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool CanFoldTailByMasking = false; + /// Control finally chosen tail folding style. The first element is used if + /// the IV update may overflow, the second element - if it does not. + std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> + ChosenTailFoldingStyle; + + /// true if scalable vectorization is supported and enabled. + std::optional<bool> IsScalableVectorizationAllowed; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -2118,16 +1991,18 @@ public: BestTripCount = *EstimatedTC; } + BestTripCount = std::max(BestTripCount, 1U); InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; // Let's ensure the cost is always at least 1. NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), (InstructionCost::CostType)1); - LLVM_DEBUG(dbgs() - << "We expect runtime memory checks to be hoisted " - << "out of the outer loop. Cost reduced from " - << MemCheckCost << " to " << NewMemCheckCost << '\n'); + if (BestTripCount > 1) + LLVM_DEBUG(dbgs() + << "We expect runtime memory checks to be hoisted " + << "out of the outer loop. 
Cost reduced from " + << MemCheckCost << " to " << NewMemCheckCost << '\n'); MemCheckCost = NewMemCheckCost; } @@ -2207,7 +2082,7 @@ public: BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); if (AddBranchWeights) - setBranchWeights(BI, SCEVCheckBypassWeights); + setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); return SCEVCheckBlock; } @@ -2235,7 +2110,7 @@ public: BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); if (AddBranchWeights) { - setBranchWeights(BI, MemCheckBypassWeights); + setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false); } ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); MemCheckBlock->getTerminator()->setDebugLoc( @@ -2472,276 +2347,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { return TTI.enableMaskedInterleavedAccessVectorization(); } -// Try to vectorize the interleave group that \p Instr belongs to. -// -// E.g. Translate following interleaved load group (factor = 3): -// for (i = 0; i < N; i+=3) { -// R = Pic[i]; // Member of index 0 -// G = Pic[i+1]; // Member of index 1 -// B = Pic[i+2]; // Member of index 2 -// ... // do something to R, G, B -// } -// To: -// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B -// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements -// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements -// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements -// -// Or translate following interleaved store group (factor = 3): -// for (i = 0; i < N; i+=3) { -// ... do something to R, G, B -// Pic[i] = R; // Member of index 0 -// Pic[i+1] = G; // Member of index 1 -// Pic[i+2] = B; // Member of index 2 -// } -// To: -// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> -// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> -// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, -// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements -// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup( - const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, - VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, - VPValue *BlockInMask, bool NeedsMaskForGaps) { - Instruction *Instr = Group->getInsertPos(); - const DataLayout &DL = Instr->getModule()->getDataLayout(); - - // Prepare for the vector type of the interleaved load/store. - Type *ScalarTy = getLoadStoreType(Instr); - unsigned InterleaveFactor = Group->getFactor(); - auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); - - // Prepare for the new pointers. - SmallVector<Value *, 2> AddrParts; - unsigned Index = Group->getIndex(Instr); - - // TODO: extend the masked interleaved-group support to reversed access. - assert((!BlockInMask || !Group->isReverse()) && - "Reversed masked interleave-group not supported."); - - Value *Idx; - // If the group is reverse, adjust the index to refer to the last vector lane - // instead of the first. We adjust the index from the first vector lane, - // rather than directly getting the pointer for lane VF - 1, because the - // pointer operand of the interleaved access is supposed to be uniform. For - // uniform instructions, we're only required to generate a value for the - // first vector lane in each unroll iteration. 
- if (Group->isReverse()) { - Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); - Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); - Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); - Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); - Idx = Builder.CreateNeg(Idx); - } else - Idx = Builder.getInt32(-Index); - - for (unsigned Part = 0; Part < UF; Part++) { - Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); - if (auto *I = dyn_cast<Instruction>(AddrPart)) - State.setDebugLocFrom(I->getDebugLoc()); - - // Notice current instruction could be any index. Need to adjust the address - // to the member of index 0. - // - // E.g. a = A[i+1]; // Member of index 1 (Current instruction) - // b = A[i]; // Member of index 0 - // Current pointer is pointed to A[i+1], adjust it to A[i]. - // - // E.g. A[i+1] = a; // Member of index 1 - // A[i] = b; // Member of index 0 - // A[i+2] = c; // Member of index 2 (Current instruction) - // Current pointer is pointed to A[i+2], adjust it to A[i]. - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) - InBounds = gep->isInBounds(); - AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); - AddrParts.push_back(AddrPart); - } - - State.setDebugLocFrom(Instr->getDebugLoc()); - Value *PoisonVec = PoisonValue::get(VecTy); - - auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( - unsigned Part, Value *MaskForGaps) -> Value * { - if (VF.isScalable()) { - assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && - "Unsupported deinterleave factor for scalable vectors"); - auto *BlockInMaskPart = State.get(BlockInMask, Part); - SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; - auto *MaskTy = - VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); - return Builder.CreateIntrinsic( - MaskTy, Intrinsic::experimental_vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); - } - - if (!BlockInMask) - return MaskForGaps; - - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, - MaskForGaps) - : ShuffledMask; - }; - - // Vectorize the interleaved load group. - if (isa<LoadInst>(Instr)) { - Value *MaskForGaps = nullptr; - if (NeedsMaskForGaps) { - MaskForGaps = - createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); - assert(MaskForGaps && "Mask for Gaps is required but it is null"); - } - - // For each unroll part, create a wide load for the group. 
- SmallVector<Value *, 2> NewLoads; - for (unsigned Part = 0; Part < UF; Part++) { - Instruction *NewLoad; - if (BlockInMask || MaskForGaps) { - assert(useMaskedInterleavedAccesses(*TTI) && - "masked interleaved groups are not allowed."); - Value *GroupMask = CreateGroupMask(Part, MaskForGaps); - NewLoad = - Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), - GroupMask, PoisonVec, "wide.masked.vec"); - } - else - NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], - Group->getAlign(), "wide.vec"); - Group->addMetadata(NewLoad); - NewLoads.push_back(NewLoad); - } - - if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && - "Unsupported deinterleave factor for scalable vectors"); - - for (unsigned Part = 0; Part < UF; ++Part) { - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - Value *DI = Builder.CreateIntrinsic( - Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); - - if (!Member) - continue; - - Value *StridedVec = Builder.CreateExtractValue(DI, I); - // If this member has different type, cast the result type. - if (Member->getType() != ScalarTy) { - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); - StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); - } - - if (Group->isReverse()) - StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); - - State.set(VPDefs[J], StridedVec, Part); - ++J; - } - } - - return; - } - - // For each member in the group, shuffle out the appropriate data from the - // wide loads. - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); - - // Skip the gaps in the group. - if (!Member) - continue; - - auto StrideMask = - createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); - for (unsigned Part = 0; Part < UF; Part++) { - Value *StridedVec = Builder.CreateShuffleVector( - NewLoads[Part], StrideMask, "strided.vec"); - - // If this member has different type, cast the result type. - if (Member->getType() != ScalarTy) { - assert(!VF.isScalable() && "VF is assumed to be non scalable."); - VectorType *OtherVTy = VectorType::get(Member->getType(), VF); - StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); - } - - if (Group->isReverse()) - StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); - - State.set(VPDefs[J], StridedVec, Part); - } - ++J; - } - return; - } - - // The sub vector type for current instruction. - auto *SubVT = VectorType::get(ScalarTy, VF); - - // Vectorize the interleaved store group. - Value *MaskForGaps = - createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); - assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && - "masked interleaved groups are not allowed."); - assert((!MaskForGaps || !VF.isScalable()) && - "masking gaps for scalable vectors is not yet supported."); - for (unsigned Part = 0; Part < UF; Part++) { - // Collect the stored vector from each member. - SmallVector<Value *, 4> StoredVecs; - unsigned StoredIdx = 0; - for (unsigned i = 0; i < InterleaveFactor; i++) { - assert((Group->getMember(i) || MaskForGaps) && - "Fail to get a member from an interleaved store group"); - Instruction *Member = Group->getMember(i); - - // Skip the gaps in the group. 
- if (!Member) { - Value *Undef = PoisonValue::get(SubVT); - StoredVecs.push_back(Undef); - continue; - } - - Value *StoredVec = State.get(StoredValues[StoredIdx], Part); - ++StoredIdx; - - if (Group->isReverse()) - StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); - - // If this member has different type, cast it to a unified type. - - if (StoredVec->getType() != SubVT) - StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); - - StoredVecs.push_back(StoredVec); - } - - // Interleave all the smaller vectors into one wider vector. - Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); - Instruction *NewStoreInstr; - if (BlockInMask || MaskForGaps) { - Value *GroupMask = CreateGroupMask(Part, MaskForGaps); - NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], - Group->getAlign(), GroupMask); - } else - NewStoreInstr = - Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); - - Group->addMetadata(NewStoreInstr); - } -} - void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, @@ -2822,9 +2427,8 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); - TC = Builder.CreateAdd( - TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); + TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)), + "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2850,37 +2454,6 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { return VectorTripCount; } -Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, - const DataLayout &DL) { - // Verify that V is a vector type with same number of elements as DstVTy. - auto *DstFVTy = cast<VectorType>(DstVTy); - auto VF = DstFVTy->getElementCount(); - auto *SrcVecTy = cast<VectorType>(V->getType()); - assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); - Type *SrcElemTy = SrcVecTy->getElementType(); - Type *DstElemTy = DstFVTy->getElementType(); - assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && - "Vector elements must have same size"); - - // Do a direct cast if element types are castable. - if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { - return Builder.CreateBitOrPointerCast(V, DstFVTy); - } - // V cannot be directly casted to desired vector type. - // May happen when V is a floating point vector but DstVTy is a vector of - // pointers or vice-versa. Handle this using a two-step bitcast using an - // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
- assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && - "Only one type should be a pointer type"); - assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && - "Only one type should be a floating point type"); - Type *IntTy = - IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); - auto *VecIntTy = VectorType::get(IntTy, VF); - Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); - return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); -} - void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *Count = getTripCount(); // Reuse existing vector loop preheader for TC checks. @@ -2943,16 +2516,10 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // Update dominator for Bypass & LoopExit (if needed). DT->changeImmediateDominator(Bypass, TCCheckBlock); - if (!Cost->requiresScalarEpilogue(VF.isVector())) - // If there is an epilogue which must run, there's no edge from the - // middle block to exit blocks and thus no need to update the immediate - // dominator of the exit blocks. - DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); - BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) - setBranchWeights(BI, MinItersBypassWeights); + setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); LoopBypassBlocks.push_back(TCCheckBlock); } @@ -3034,33 +2601,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, nullptr, Twine(Prefix) + "scalar.ph"); - - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); - - // Set up the middle block terminator. Two cases: - // 1) If we know that we must execute the scalar epilogue, emit an - // unconditional branch. - // 2) Otherwise, we must have a single unique exit block (due to how we - // implement the multiple exit case). In this case, set up a conditional - // branch from the middle block to the loop scalar preheader, and the - // exit block. completeLoopSkeleton will update the condition to use an - // iteration check, if required to decide whether to execute the remainder. - BranchInst *BrInst = - Cost->requiresScalarEpilogue(VF.isVector()) - ? BranchInst::Create(LoopScalarPreHeader) - : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, - Builder.getTrue()); - BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); - ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); - - // Update dominator for loop exit. During skeleton creation, only the vector - // pre-header and the middle block are created. The vector loop is entirely - // created during VPlan exection. - if (!Cost->requiresScalarEpilogue(VF.isVector())) - // If there is an epilogue which must run, there's no edge from the - // middle block to exit blocks and thus no need to update the immediate - // dominator of the exit blocks. - DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); } PHINode *InnerLoopVectorizer::createInductionResumeValue( @@ -3100,7 +2640,7 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( // Create phi nodes to merge from the backedge-taken check block. 
PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getTerminator()); + LoopScalarPreHeader->getFirstNonPHI()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); @@ -3157,51 +2697,6 @@ void InnerLoopVectorizer::createInductionResumeValues( } } -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { - // The trip counts should be cached by now. - Value *Count = getTripCount(); - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); - - auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); - - // Add a check in the middle block to see if we have completed - // all of the iterations in the first vector loop. Three cases: - // 1) If we require a scalar epilogue, there is no conditional branch as - // we unconditionally branch to the scalar preheader. Do nothing. - // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. - // Thus if tail is to be folded, we know we don't need to run the - // remainder and we can use the previous value for the condition (true). - // 3) Otherwise, construct a runtime check. - if (!Cost->requiresScalarEpilogue(VF.isVector()) && - !Cost->foldTailByMasking()) { - // Here we use the same DebugLoc as the scalar loop latch terminator instead - // of the corresponding compare because they may have ended up with - // different line numbers and we want to avoid awkward line stepping while - // debugging. Eg. if the compare has got a line number inside the loop. - // TODO: At the moment, CreateICmpEQ will simplify conditions with constant - // operands. Perform simplification directly on VPlan once the branch is - // modeled there. - IRBuilder<> B(LoopMiddleBlock->getTerminator()); - B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); - Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); - BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); - BI.setCondition(CmpN); - if (hasBranchWeightMD(*ScalarLatchTerm)) { - // Assume that `Count % VectorTripCount` is equally distributed. - unsigned TripCount = UF * VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(BI, Weights); - } - } - -#ifdef EXPENSIVE_CHECKS - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -#endif - - return LoopVectorPreHeader; -} - std::pair<BasicBlock *, Value *> InnerLoopVectorizer::createVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) { @@ -3224,17 +2719,18 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton( | [ ]_| <-- vector loop (created during VPlan execution). | | | v - \ -[ ] <--- middle-block. + \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to + | | successors created during VPlan execution) \/ | /\ v - | ->[ ] <--- new preheader. + | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). | | (opt) v <-- edge from middle to exit iff epilogue is not required. | [ ] \ | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). \ | \ v - >[ ] <-- exit block(s). + >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) ... */ @@ -3261,7 +2757,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton( // Emit phis for the new starting index of the scalar loop. createInductionResumeValues(ExpandedSCEVs); - return {completeLoopSkeleton(), nullptr}; + return {LoopVectorPreHeader, nullptr}; } // Fix up external users of the induction variable. 
At this point, we are @@ -3447,37 +2943,12 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, TargetTransformInfo::TCK_RecipThroughput); } -static Type *smallestIntegerVectorType(Type *T1, Type *T2) { - auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); - auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); - return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; -} - -static Type *largestIntegerVectorType(Type *T1, Type *T2) { - auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); - auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); - return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; -} - void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VPlan &Plan) { // Fix widened non-induction PHIs by setting up the PHI operands. if (EnableVPlanNativePath) fixNonInductionPHIs(Plan, State); - // At this point every instruction in the original loop is widened to a - // vector form. Now we need to fix the recurrences in the loop. These PHI - // nodes are currently empty because we did not want to introduce cycles. - // This is the second stage of vectorizing recurrences. Note that fixing - // reduction phis are already modeled in VPlan. - // TODO: Also model fixing fixed-order recurrence phis in VPlan. - VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); - VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); - for (VPRecipeBase &R : HeaderVPBB->phis()) { - if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) - fixFixedOrderRecurrence(FOR, State); - } - // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); PSE.getSE()->forgetBlockAndLoopDispositions(); @@ -3491,6 +2962,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); + VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); if (Cost->requiresScalarEpilogue(VF.isVector())) { @@ -3513,10 +2985,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VectorLoop->getHeader(), Plan, State); } - // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated - // in the exit block, so update the builder. - State.Builder.SetInsertPoint(State.CFG.ExitBB, - State.CFG.ExitBB->getFirstNonPHIIt()); + // Fix live-out phis not already fixed earlier. for (const auto &KV : Plan.getLiveOuts()) KV.second->fixPhi(Plan, State); @@ -3544,125 +3013,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, VF.getKnownMinValue() * UF); } -void InnerLoopVectorizer::fixFixedOrderRecurrence( - VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { - // This is the second phase of vectorizing first-order recurrences. An - // overview of the transformation is described below. Suppose we have the - // following loop. - // - // for (int i = 0; i < n; ++i) - // b[i] = a[i] - a[i - 1]; - // - // There is a first-order recurrence on "a". For this loop, the shorthand - // scalar IR looks like: - // - // scalar.ph: - // s_init = a[-1] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init, scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, ... 
- // - // In this example, s1 is a recurrence because it's value depends on the - // previous iteration. In the first phase of vectorization, we created a - // vector phi v1 for s1. We now complete the vectorization and produce the - // shorthand vector IR shown below (for VF = 4, UF = 1). - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - // b[i, i+1, i+2, i+3] = v2 - v3 - // br cond, vector.body, middle.block - // - // middle.block: - // x = v2(3) - // br scalar.ph - // - // scalar.ph: - // s_init = phi [x, middle.block], [a[-1], otherwise] - // br scalar.body - // - // After execution completes the vector loop, we extract the next value of - // the recurrence (x) to use as the initial value in the scalar loop. - - // Extract the last vector element in the middle block. This will be the - // initial value for the recurrence when jumping to the scalar loop. - VPValue *PreviousDef = PhiR->getBackedgeValue(); - Value *Incoming = State.get(PreviousDef, UF - 1); - auto *ExtractForScalar = Incoming; - auto *IdxTy = Builder.getInt32Ty(); - Value *RuntimeVF = nullptr; - if (VF.isVector()) { - auto *One = ConstantInt::get(IdxTy, 1); - Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); - RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); - auto *LastIdx = Builder.CreateSub(RuntimeVF, One); - ExtractForScalar = - Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); - } - - auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); - assert(PhiR->getNumUsers() == 1 && - RecurSplice->getOpcode() == - VPInstruction::FirstOrderRecurrenceSplice && - "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); - SmallVector<VPLiveOut *> LiveOuts; - for (VPUser *U : RecurSplice->users()) - if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) - LiveOuts.push_back(LiveOut); - - if (!LiveOuts.empty()) { - // Extract the second last element in the middle block if the - // Phi is used outside the loop. We need to extract the phi itself - // and not the last element (the phi update in the current iteration). This - // will be the value when jumping to the exit block from the - // LoopMiddleBlock, when the scalar loop is not run at all. - Value *ExtractForPhiUsedOutsideLoop = nullptr; - if (VF.isVector()) { - auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); - ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Idx, "vector.recur.extract.for.phi"); - } else { - assert(UF > 1 && "VF and UF cannot both be 1"); - // When loop is unrolled without vectorizing, initialize - // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled - // value of `Incoming`. This is analogous to the vectorized case above: - // extracting the second last element when VF > 1. - ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); - } - - for (VPLiveOut *LiveOut : LiveOuts) { - assert(!Cost->requiresScalarEpilogue(VF.isVector())); - PHINode *LCSSAPhi = LiveOut->getPhi(); - LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - State.Plan->removeLiveOut(LCSSAPhi); - } - } - - // Fix the initial value of the original recurrence in the scalar loop. 
- Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin()); - PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); - auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); - auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); - for (auto *BB : predecessors(LoopScalarPreHeader)) { - auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; - Start->addIncoming(Incoming, BB); - } - - Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); - Phi->setName("scalar.recur"); -} - void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { // The basic block and loop containing the predicated instruction. auto *PredBB = PredInst->getParent(); @@ -3758,11 +3108,6 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, } } -bool InnerLoopVectorizer::useOrderedReductions( - const RecurrenceDescriptor &RdxDesc) { - return Cost->useOrderedReductions(RdxDesc); -} - void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -3928,6 +3273,13 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { if (!ScalarInd) continue; + // If the induction variable update is a fixed-order recurrence, neither the + // induction variable or its update should be marked scalar after + // vectorization. + auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate); + if (IndUpdatePhi && Legal->isFixedOrderRecurrence(IndUpdatePhi)) + continue; + // Determine if all users of the induction variable update instruction are // scalar after vectorization. auto ScalarIndUpdate = @@ -4100,7 +3452,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( - Instruction *I, ElementCount VF) { + Instruction *I, ElementCount VF) const { assert(isAccessInterleaved(I) && "Expecting interleaved access."); assert(getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."); @@ -4109,7 +3461,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. - auto &DL = I->getModule()->getDataLayout(); + auto &DL = I->getDataLayout(); auto *ScalarTy = getLoadStoreType(I); if (hasIrregularType(ScalarTy, DL)) return false; @@ -4187,7 +3539,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( // If the instruction's allocated size doesn't equal it's type size, it // requires padding and will be scalarized. - auto &DL = I->getModule()->getDataLayout(); + auto &DL = I->getDataLayout(); if (hasIrregularType(ScalarTy, DL)) return false; @@ -4220,34 +3572,37 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // Worklist containing uniform instructions demanding lane 0. SetVector<Instruction *> Worklist; - BasicBlock *Latch = TheLoop->getLoopLatch(); // Add uniform instructions demanding lane 0 to the worklist. Instructions - // that are scalar with predication must not be considered uniform after + // that require predication must not be considered uniform after // vectorization, because that would create an erroneous replicating region // where only a single instance out of VF should be formed. - // TODO: optimize such seldom cases if found important, see PR40816. 
auto addToWorklistIfAllowed = [&](Instruction *I) -> void { if (isOutOfScope(I)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " << *I << "\n"); return; } - if (isScalarWithPredication(I, VF)) { - LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " - << *I << "\n"); + if (isPredicatedInst(I)) { + LLVM_DEBUG( + dbgs() << "LV: Found not uniform due to requiring predication: " << *I + << "\n"); return; } LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); Worklist.insert(I); }; - // Start with the conditional branch. If the branch condition is an - // instruction contained in the loop that is only used by the branch, it is - // uniform. - auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) - addToWorklistIfAllowed(Cmp); + // Start with the conditional branches exiting the loop. If the branch + // condition is an instruction contained in the loop that is only used by the + // branch, it is uniform. + SmallVector<BasicBlock *> Exiting; + TheLoop->getExitingBlocks(Exiting); + for (BasicBlock *E : Exiting) { + auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)); + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + addToWorklistIfAllowed(Cmp); + } auto PrevVF = VF.divideCoefficientBy(2); // Return true if all lanes perform the same memory operation, and we can @@ -4388,6 +3743,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. // The code below handles both pointer and non-pointer induction variables. + BasicBlock *Latch = TheLoop->getLoopLatch(); for (const auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); @@ -4454,15 +3810,18 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { return false; } -ElementCount -LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { +bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { + if (IsScalableVectorizationAllowed) + return *IsScalableVectorizationAllowed; + + IsScalableVectorizationAllowed = false; if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) - return ElementCount::getScalable(0); + return false; if (Hints->isScalableVectorizationDisabled()) { reportVectorizationInfo("Scalable vectorization is explicitly disabled", "ScalableVectorizationDisabled", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; } LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); @@ -4482,7 +3841,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { "Scalable vectorization not supported for the reduction " "operations found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; } // Disable scalable vectorization if the loop contains any instructions @@ -4494,17 +3853,33 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { reportVectorizationInfo("Scalable vectorization is not supported " "for all element types found in this loop.", "ScalableVFUnfeasible", ORE, TheLoop); - return ElementCount::getScalable(0); + return false; + } + + if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) { + reportVectorizationInfo("The 
target does not provide maximum vscale value " + "for safe distance analysis.", + "ScalableVFUnfeasible", ORE, TheLoop); + return false; } + IsScalableVectorizationAllowed = true; + return true; +} + +ElementCount +LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { + if (!isScalableVectorizationAllowed()) + return ElementCount::getScalable(0); + + auto MaxScalableVF = ElementCount::getScalable( + std::numeric_limits<ElementCount::ScalarTy>::max()); if (Legal->isSafeForAnyVectorWidth()) return MaxScalableVF; + std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI); // Limit MaxScalableVF by the maximum safe dependence distance. - if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI)) - MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); - else - MaxScalableVF = ElementCount::getScalable(0); + MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale); if (!MaxScalableVF) reportVectorizationInfo( @@ -4738,8 +4113,22 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. - if (Legal->prepareToFoldTailByMasking()) { - CanFoldTailByMasking = true; + setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); + if (foldTailByMasking()) { + if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { + LLVM_DEBUG( + dbgs() + << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " + "try to generate VP Intrinsics with scalable vector " + "factors only.\n"); + // Tail folded loop using VP intrinsics restricts the VF to be scalable + // for now. + // TODO: extend it for fixed vectors, if required. + assert(MaxFactors.ScalableVF.isScalable() && + "Expected scalable vector factor."); + + MaxFactors.FixedVF = ElementCount::getFixed(1); + } return MaxFactors; } @@ -4860,15 +4249,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( // Select the largest VF which doesn't require more registers than existing // ones. - for (int i = RUs.size() - 1; i >= 0; --i) { - bool Selected = true; - for (auto &pair : RUs[i].MaxLocalUsers) { - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); - if (pair.second > TargetNumRegisters) - Selected = false; - } - if (Selected) { - MaxVF = VFs[i]; + for (int I = RUs.size() - 1; I >= 0; --I) { + const auto &MLU = RUs[I].MaxLocalUsers; + if (all_of(MLU, [&](decltype(MLU.front()) &LU) { + return LU.second <= TTI.getNumberOfRegisters(LU.first); + })) { + MaxVF = VFs[I]; break; } } @@ -4913,28 +4299,6 @@ bool LoopVectorizationPlanner::isMoreProfitable( unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { - // If the trip count is a known (possibly small) constant, the trip count - // will be rounded up to an integer number of iterations under - // FoldTailByMasking. The total cost in that case will be - // VecCost*ceil(TripCount/VF). When not folding the tail, the total - // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be - // some extra overheads, but for the purpose of comparing the costs of - // different VFs we can use this to compare the total loop-body cost - // expected after vectorization. 
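An illustrative aside, not part of the patch: the clamp above divides the maximum number of elements that may safely be in flight by the largest vscale the target can take, so the chosen scalable VF respects the dependence distance for every legal vscale. A quick check with hypothetical numbers:

constexpr unsigned MaxSafeElements = 32; // hypothetical safe dependence distance
constexpr unsigned MaxVScale = 16;       // hypothetical target bound on vscale
// Widest safe scalable VF: ElementCount::getScalable(32 / 16), i.e.
// <vscale x 2>, which is exactly 32 elements when vscale hits its maximum.
static_assert(MaxSafeElements / MaxVScale == 2);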
- auto GetCostForTC = [MaxTripCount, this](unsigned VF, - InstructionCost VectorCost, - InstructionCost ScalarCost) { - return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) - : VectorCost * (MaxTripCount / VF) + - ScalarCost * (MaxTripCount % VF); - }; - auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); - auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); - - return RTCostA < RTCostB; - } - // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); @@ -4948,13 +4312,39 @@ bool LoopVectorizationPlanner::isMoreProfitable( // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && + A.Width.isScalable() && !B.Width.isScalable(); + + auto CmpFn = [PreferScalable](const InstructionCost &LHS, + const InstructionCost &RHS) { + return PreferScalable ? LHS <= RHS : LHS < RHS; + }; // To avoid the need for FP division: - // (CostA / A.Width) < (CostB / B.Width) - // <=> (CostA * B.Width) < (CostB * A.Width) - return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); + // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) + // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) + if (!MaxTripCount) + return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); + + auto GetCostForTC = [MaxTripCount, this](unsigned VF, + InstructionCost VectorCost, + InstructionCost ScalarCost) { + // If the trip count is a known (possibly small) constant, the trip count + // will be rounded up to an integer number of iterations under + // FoldTailByMasking. The total cost in that case will be + // VecCost*ceil(TripCount/VF). When not folding the tail, the total + // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be + // some extra overheads, but for the purpose of comparing the costs of + // different VFs we can use this to compare the total loop-body cost + // expected after vectorization. 
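A quick numeric check of the two formulas in the comment above (illustrative only, with hypothetical costs of 10 per vector iteration and 4 per scalar iteration, MaxTripCount = 20, VF = 8):

#include <cstdint>

constexpr uint64_t costForTC(bool FoldTail, uint64_t VecCost,
                             uint64_t ScalarCost, uint64_t TC, uint64_t VF) {
  if (FoldTail)
    return VecCost * ((TC + VF - 1) / VF);             // VecCost * ceil(TC/VF)
  return VecCost * (TC / VF) + ScalarCost * (TC % VF); // plus scalar remainder
}
// Without tail folding: 2 vector iterations plus 4 scalar remainder iterations.
static_assert(costForTC(false, 10, 4, 20, 8) == 36);
// With tail folding: 3 masked vector iterations and no scalar remainder.
static_assert(costForTC(true, 10, 4, 20, 8) == 30);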
+ if (CM.foldTailByMasking()) + return VectorCost * divideCeil(MaxTripCount, VF); + return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF); + }; + + auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); + auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); + return CmpFn(RTCostA, RTCostB); } static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, @@ -4977,8 +4367,10 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { if (Numbering[A.first] != Numbering[B.first]) return Numbering[A.first] < Numbering[B.first]; - ElementCountComparator ECC; - return ECC(A.second, B.second); + const auto &LHS = A.second; + const auto &RHS = B.second; + return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < + std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); }); // For a list of ordered instruction-vf pairs: @@ -5021,13 +4413,111 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, } while (!Tail.empty()); } -VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( - const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = - CM.expectedCost(ElementCount::getFixed(1)).first; +/// Check if any recipe of \p Plan will generate a vector value, which will be +/// assigned a vector register. +static bool willGenerateVectors(VPlan &Plan, ElementCount VF, + const TargetTransformInfo &TTI) { + assert(VF.isVector() && "Checking a scalar VF?"); + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), + Plan.getCanonicalIV()->getScalarType()->getContext()); + DenseSet<VPRecipeBase *> EphemeralRecipes; + collectEphemeralRecipesForVPlan(Plan, EphemeralRecipes); + // Set of already visited types. + DenseSet<Type *> Visited; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : *VPBB) { + if (EphemeralRecipes.contains(&R)) + continue; + // Continue early if the recipe is considered to not produce a vector + // result. Note that this includes VPInstruction where some opcodes may + // produce a vector, to preserve existing behavior as VPInstructions model + // aspects not directly mapped to existing IR instructions. 
+ switch (R.getVPDefID()) { + case VPDef::VPDerivedIVSC: + case VPDef::VPScalarIVStepsSC: + case VPDef::VPScalarCastSC: + case VPDef::VPReplicateSC: + case VPDef::VPInstructionSC: + case VPDef::VPCanonicalIVPHISC: + case VPDef::VPVectorPointerSC: + case VPDef::VPExpandSCEVSC: + case VPDef::VPEVLBasedIVPHISC: + case VPDef::VPPredInstPHISC: + case VPDef::VPBranchOnMaskSC: + continue; + case VPDef::VPReductionSC: + case VPDef::VPActiveLaneMaskPHISC: + case VPDef::VPWidenCallSC: + case VPDef::VPWidenCanonicalIVSC: + case VPDef::VPWidenCastSC: + case VPDef::VPWidenGEPSC: + case VPDef::VPWidenSC: + case VPDef::VPWidenSelectSC: + case VPDef::VPBlendSC: + case VPDef::VPFirstOrderRecurrencePHISC: + case VPDef::VPWidenPHISC: + case VPDef::VPWidenIntOrFpInductionSC: + case VPDef::VPWidenPointerInductionSC: + case VPDef::VPReductionPHISC: + case VPDef::VPInterleaveSC: + case VPDef::VPWidenLoadEVLSC: + case VPDef::VPWidenLoadSC: + case VPDef::VPWidenStoreEVLSC: + case VPDef::VPWidenStoreSC: + break; + default: + llvm_unreachable("unhandled recipe"); + } + + auto WillWiden = [&TTI, VF](Type *ScalarTy) { + Type *VectorTy = ToVectorTy(ScalarTy, VF); + unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); + if (!NumLegalParts) + return false; + if (VF.isScalable()) { + // <vscale x 1 x iN> is assumed to be profitable over iN because + // scalable registers are a distinct register class from scalar + // ones. If we ever find a target which wants to lower scalable + // vectors back to scalars, we'll need to update this code to + // explicitly ask TTI about the register class uses for each part. + return NumLegalParts <= VF.getKnownMinValue(); + } + // Two or more parts that share a register - are vectorized. + return NumLegalParts < VF.getKnownMinValue(); + }; + + // If no def nor is a store, e.g., branches, continue - no value to check. + if (R.getNumDefinedValues() == 0 && + !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( + &R)) + continue; + // For multi-def recipes, currently only interleaved loads, suffice to + // check first def only. + // For stores check their stored value; for interleaved stores suffice + // the check first stored value only. In all cases this is the second + // operand. + VPValue *ToCheck = + R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1); + Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); + if (!Visited.insert({ScalarTy}).second) + continue; + if (WillWiden(ScalarTy)) + return true; + } + } + + return false; +} + +VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { + InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - assert(VFCandidates.count(ElementCount::getFixed(1)) && + assert(any_of(VPlans, + [](std::unique_ptr<VPlan> &P) { + return P->hasVF(ElementCount::getFixed(1)); + }) && "Expected Scalar VF to be a candidate"); const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, @@ -5035,7 +4525,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && VFCandidates.size() > 1) { + if (ForceVectorization && + (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) { // Ignore scalar width, because the user explicitly wants vectorization. 
// Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. @@ -5043,43 +4534,45 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( } SmallVector<InstructionVFPair> InvalidCosts; - for (const auto &i : VFCandidates) { - // The cost for scalar VF=1 is already calculated, so ignore it. - if (i.isScalar()) - continue; + for (auto &P : VPlans) { + for (ElementCount VF : P->vectorFactors()) { + // The cost for scalar VF=1 is already calculated, so ignore it. + if (VF.isScalar()) + continue; - LoopVectorizationCostModel::VectorizationCostTy C = - CM.expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); + InstructionCost C = CM.expectedCost(VF, &InvalidCosts); + VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); #ifndef NDEBUG - unsigned AssumedMinimumVscale = - getVScaleForTuning(OrigLoop, TTI).value_or(1); - unsigned Width = - Candidate.Width.isScalable() - ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale - : Candidate.Width.getFixedValue(); - LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i - << " costs: " << (Candidate.Cost / Width)); - if (i.isScalable()) - LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " - << AssumedMinimumVscale << ")"); - LLVM_DEBUG(dbgs() << ".\n"); + unsigned AssumedMinimumVscale = + getVScaleForTuning(OrigLoop, TTI).value_or(1); + unsigned Width = + Candidate.Width.isScalable() + ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale + : Candidate.Width.getFixedValue(); + LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF + << " costs: " << (Candidate.Cost / Width)); + if (VF.isScalable()) + LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " + << AssumedMinimumVscale << ")"); + LLVM_DEBUG(dbgs() << ".\n"); #endif - if (!C.second && !ForceVectorization) { - LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i - << " because it will not generate any vector instructions.\n"); - continue; - } + if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { + LLVM_DEBUG( + dbgs() + << "LV: Not considering vector loop of width " << VF + << " because it will not generate any vector instructions.\n"); + continue; + } - // If profitable add it to ProfitableVF list. - if (isMoreProfitable(Candidate, ScalarCost)) - ProfitableVFs.push_back(Candidate); + // If profitable add it to ProfitableVF list. + if (isMoreProfitable(Candidate, ScalarCost)) + ProfitableVFs.push_back(Candidate); - if (isMoreProfitable(Candidate, ChosenFactor)) - ChosenFactor = Candidate; + if (isMoreProfitable(Candidate, ChosenFactor)) + ChosenFactor = Candidate; + } } emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); @@ -5258,7 +4751,7 @@ std::pair<unsigned, unsigned> LoopVectorizationCostModel::getSmallestAndWidestTypes() { unsigned MinWidth = -1U; unsigned MaxWidth = 8; - const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + const DataLayout &DL = TheFunction->getDataLayout(); // For in-loop reductions, no element types are added to ElementTypesInLoop // if there are no loads/stores in the loop. In this case, check through the // reduction variables to determine the maximum width. @@ -5349,25 +4842,24 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (!isScalarEpilogueAllowed()) return 1; + // Do not interleave if EVL is preferred and no User IC is specified. + if (foldTailWithEVL()) { + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. 
" + "Unroll factor forced to be 1.\n"); + return 1; + } + // We used the distance for the interleave count. if (!Legal->isSafeForAnyVectorWidth()) return 1; auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); const bool HasReductions = !Legal->getReductionVars().empty(); - // Do not interleave loops with a relatively small known or estimated trip - // count. But we will interleave when InterleaveSmallLoopScalarReduction is - // enabled, and the code has scalar reductions(HasReductions && VF = 1), - // because with the above conditions interleaving can expose ILP and break - // cross iteration dependences for reductions. - if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && - !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) - return 1; // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { - LoopCost = expectedCost(VF).first; + LoopCost = expectedCost(VF); assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); // Loop body is free and there is no need for interleaving. @@ -5443,7 +4935,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (KnownTC) { + if (KnownTC > 0) { + // At least one iteration must be scalar when this constraint holds. So the + // maximum available iterations for interleaving is one less. + unsigned AvailableTC = + requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC; + // If trip count is known we select between two prospective ICs, where // 1) the aggressive IC is capped by the trip count divided by VF // 2) the conservative IC is capped by the trip count divided by (VF * 2) @@ -5453,27 +4950,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // we run the vector loop at least twice. unsigned InterleaveCountUB = bit_floor( - std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount))); + std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount))); unsigned InterleaveCountLB = bit_floor(std::max( - 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount))); + 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); MaxInterleaveCount = InterleaveCountLB; if (InterleaveCountUB != InterleaveCountLB) { - unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB)); - unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB)); + unsigned TailTripCountUB = + (AvailableTC % (EstimatedVF * InterleaveCountUB)); + unsigned TailTripCountLB = + (AvailableTC % (EstimatedVF * InterleaveCountLB)); // If both produce same scalar tail, maximize the IC to do the same work // in fewer vector loop iterations if (TailTripCountUB == TailTripCountLB) MaxInterleaveCount = InterleaveCountUB; } - } else if (BestKnownTC) { + } else if (BestKnownTC && *BestKnownTC > 0) { + // At least one iteration must be scalar when this constraint holds. So the + // maximum available iterations for interleaving is one less. + unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) + ? (*BestKnownTC) - 1 + : *BestKnownTC; + // If trip count is an estimated compile time constant, limit the // IC to be capped by the trip count divided by VF * 2, such that the vector // loop runs at least twice to make interleaving seem profitable when there // is an epilogue loop present. 
Since exact Trip count is not known we // choose to be conservative in our IC estimate. MaxInterleaveCount = bit_floor(std::max( - 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount))); + 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); } assert(MaxInterleaveCount > 0 && @@ -5577,8 +5082,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // If there are scalar reductions and TTI has enabled aggressive // interleaving for reductions, we will interleave to expose ILP. - if (InterleaveSmallLoopScalarReduction && VF.isScalar() && - AggressivelyInterleaveReductions) { + if (VF.isScalar() && AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); // Interleave no less than SmallIC but not as aggressive as the normal IC // to satisfy the rare situation when resources are too limited. @@ -5845,15 +5349,21 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { for (Instruction &I : *BB) if (isScalarWithPredication(&I, VF)) { ScalarCostsTy ScalarCosts; - // Do not apply discount if scalable, because that would lead to - // invalid scalarization costs. - // Do not apply discount logic if hacked cost is needed - // for emulated masked memrefs. - if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && + // Do not apply discount logic for: + // 1. Scalars after vectorization, as there will only be a single copy + // of the instruction. + // 2. Scalable VF, as that would lead to invalid scalarization costs. + // 3. Emulated masked memrefs, if a hacked cost is needed. + if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() && + !useEmulatedMaskMemRefHack(&I, VF) && computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. PredicatedBBsAfterVectorization[VF].insert(BB); + for (auto *Pred : predecessors(BB)) { + if (Pred->getSingleSuccessor() == BB) + PredicatedBBsAfterVectorization[VF].insert(Pred); + } } } } @@ -5920,15 +5430,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. - InstructionCost VectorCost = getInstructionCost(I, VF).first; + InstructionCost VectorCost = getInstructionCost(I, VF); // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. InstructionCost ScalarCost = - VF.getFixedValue() * - getInstructionCost(I, ElementCount::getFixed(1)).first; + VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1)); // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. @@ -5972,14 +5481,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( return Discount; } -LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost( +InstructionCost LoopVectorizationCostModel::expectedCost( ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { - VectorizationCostTy Cost; + InstructionCost Cost; // For each block. for (BasicBlock *BB : TheLoop->blocks()) { - VectorizationCostTy BlockCost; + InstructionCost BlockCost; // For each instruction in the old loop. 
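A worked example of the trip-count-based interleave clamping above (illustrative only), using a hypothetical KnownTC = 100, EstimatedVF = 8, a target MaxInterleaveCount of 8, and a required scalar epilogue reserving one iteration:

#include <algorithm>
#include <bit>

constexpr unsigned AvailableTC = 100 - 1; // one iteration kept for the epilogue
constexpr unsigned ICUpper =
    std::bit_floor(std::max(1u, std::min(AvailableTC / 8, 8u)));  // == 8
constexpr unsigned ICLower =
    std::bit_floor(std::max(1u, std::min(AvailableTC / 16, 8u))); // == 4
// The scalar tails differ (99 % 64 == 35 vs. 99 % 32 == 3), so the
// conservative lower bound of 4 is kept as MaxInterleaveCount.
static_assert(ICUpper == 8 && ICLower == 4);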
for (Instruction &I : BB->instructionsWithoutDebug()) { @@ -5988,22 +5496,19 @@ LoopVectorizationCostModel::expectedCost( (VF.isVector() && VecValuesToIgnore.count(&I))) continue; - VectorizationCostTy C = getInstructionCost(&I, VF); + InstructionCost C = getInstructionCost(&I, VF); // Check if we should override the cost. - if (C.first.isValid() && - ForceTargetInstructionCost.getNumOccurrences() > 0) - C.first = InstructionCost(ForceTargetInstructionCost); + if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) + C = InstructionCost(ForceTargetInstructionCost); // Keep a list of instructions with invalid costs. - if (Invalid && !C.first.isValid()) + if (Invalid && !C.isValid()) Invalid->emplace_back(&I, VF); - BlockCost.first += C.first; - BlockCost.second |= C.second; - LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first - << " for VF " << VF << " For instruction: " << I - << '\n'); + BlockCost += C; + LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " + << VF << " For instruction: " << I << '\n'); } // If we are vectorizing a predicated block, it will have been @@ -6014,10 +5519,9 @@ LoopVectorizationCostModel::expectedCost( // cost by the probability of executing it. blockNeedsPredication from // Legal is used so as to not include all blocks in tail folded loops. if (VF.isScalar() && Legal->blockNeedsPredication(BB)) - BlockCost.first /= getReciprocalPredBlockProb(); + BlockCost /= getReciprocalPredBlockProb(); - Cost.first += BlockCost.first; - Cost.second |= BlockCost.second; + Cost += BlockCost; } return Cost; @@ -6273,12 +5777,20 @@ LoopVectorizationCostModel::getReductionPatternCost( const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; - InstructionCost BaseCost = TTI.getArithmeticReductionCost( - RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + InstructionCost BaseCost; + RecurKind RK = RdxDesc.getRecurrenceKind(); + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { + Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK); + BaseCost = TTI.getMinMaxReductionCost(MinMaxID, VectorTy, + RdxDesc.getFastMathFlags(), CostKind); + } else { + BaseCost = TTI.getArithmeticReductionCost( + RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + } // For a call to the llvm.fmuladd intrinsic we need to add the cost of a // normal fmul instruction to the cost of the fadd reduction. - if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) + if (RK == RecurKind::FMulAdd) BaseCost += TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); @@ -6416,49 +5928,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, return getWideningCost(I, VF); } -LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::getInstructionCost(Instruction *I, - ElementCount VF) { - // If we know that this instruction will remain uniform, check the cost of - // the scalar version. - if (isUniformAfterVectorization(I, VF)) - VF = ElementCount::getFixed(1); - - if (VF.isVector() && isProfitableToScalarize(I, VF)) - return VectorizationCostTy(InstsToScalarize[VF][I], false); - - // Forced scalars do not have any scalarization overhead. 
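An illustrative aside, not part of the patch: the scaling above charges a predicated block at the assumed probability of executing it. The reciprocal-probability helper returns 2, i.e. a 50% assumption (treat the exact constant as an assumption here), so a hypothetical block cost of 10 contributes only 5 to the scalar loop cost:

constexpr unsigned ReciprocalPredBlockProb = 2; // assumed 50% block probability
constexpr unsigned PredicatedBlockCost = 10;    // hypothetical scalar block cost
static_assert(PredicatedBlockCost / ReciprocalPredBlockProb == 5);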
- auto ForcedScalar = ForcedScalars.find(VF); - if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { - auto InstSet = ForcedScalar->second; - if (InstSet.count(I)) - return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * - VF.getKnownMinValue()), - false); - } - - Type *VectorTy; - InstructionCost C = getInstructionCost(I, VF, VectorTy); - - bool TypeNotScalarized = false; - if (VF.isVector() && VectorTy->isVectorTy()) { - if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { - if (VF.isScalable()) - // <vscale x 1 x iN> is assumed to be profitable over iN because - // scalable registers are a distinct register class from scalar ones. - // If we ever find a target which wants to lower scalable vectors - // back to scalars, we'll need to update this code to explicitly - // ask TTI about the register class uses for each part. - TypeNotScalarized = NumParts <= VF.getKnownMinValue(); - else - TypeNotScalarized = NumParts < VF.getKnownMinValue(); - } else - C = InstructionCost::getInvalid(); - } - return VectorizationCostTy(C, TypeNotScalarized); -} - InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { @@ -6849,8 +6318,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { } InstructionCost -LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy) { +LoopVectorizationCostModel::getInstructionCost(Instruction *I, + ElementCount VF) { + // If we know that this instruction will remain uniform, check the cost of + // the scalar version. + if (isUniformAfterVectorization(I, VF)) + VF = ElementCount::getFixed(1); + + if (VF.isVector() && isProfitableToScalarize(I, VF)) + return InstsToScalarize[VF][I]; + + // Forced scalars do not have any scalarization overhead. + auto ForcedScalar = ForcedScalars.find(VF); + if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { + auto InstSet = ForcedScalar->second; + if (InstSet.count(I)) + return getInstructionCost(I, ElementCount::getFixed(1)) * + VF.getKnownMinValue(); + } + Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); @@ -6873,6 +6359,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, }; (void) hasSingleCopyAfterVectorization; + Type *VectorTy; if (isScalarAfterVectorization(I, VF)) { // With the exception of GEPs and PHIs, after scalarization there should // only be one copy of the instruction generated in the loop. This is @@ -6888,6 +6375,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, } else VectorTy = ToVectorTy(RetTy, VF); + if (VF.isVector() && VectorTy->isVectorTy() && + !TTI.getNumberOfParts(VectorTy)) + return InstructionCost::getInvalid(); + // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: @@ -6900,11 +6391,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // In cases of scalarized and predicated instructions, there will be VF // predicated blocks in the vectorized loop. Each branch around these // blocks requires also an extract of its vector compare i1 element. + // Note that the conditional branch from the loop latch will be replaced by + // a single branch controlling the loop, so there is no extra overhead from + // scalarization. 
bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); if (VF.isVector() && BI->isConditional() && (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || - PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) + PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) && + BI->getParent() != TheLoop->getLoopLatch()) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { @@ -6934,6 +6429,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // First-order recurrences are replaced by vector shuffles inside the loop. if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { + // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the + // penultimate value of the recurrence. + // TODO: Consider vscale_range info. + if (VF.isScalable() && VF.getKnownMinValue() == 1) + return InstructionCost::getInvalid(); SmallVector<int> Mask(VF.getKnownMinValue()); std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, @@ -6999,25 +6499,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector<const Value *, 4> Operands(I->operand_values()); - auto InstrCost = TTI.getArithmeticInstrCost( + return TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - Op2Info, Operands, I); - - // Some targets can replace frem with vector library calls. - InstructionCost VecCallCost = InstructionCost::getInvalid(); - if (I->getOpcode() == Instruction::FRem) { - LibFunc Func; - if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) && - TLI->isFunctionVectorizable(TLI->getName(Func), VF)) { - SmallVector<Type *, 4> OpTypes; - for (auto &Op : I->operands()) - OpTypes.push_back(Op->getType()); - VecCallCost = - TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind); - } - } - return std::min(InstrCost, VecCallCost); + Op2Info, Operands, I, TLI); } case Instruction::FNeg: { return TTI.getArithmeticInstrCost( @@ -7157,25 +6642,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, return *RedCost; Type *SrcScalarTy = I->getOperand(0)->getType(); + Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); + if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) + SrcScalarTy = + IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; + if (canTruncateToMinimalBitwidth(I, VF)) { - // This cast is going to be shrunk. This may remove the cast or it might - // turn it into slightly different cast. For example, if MinBW == 16, - // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". - // - // Calculate the modified src and dest types. - Type *MinVecTy = VectorTy; - if (Opcode == Instruction::Trunc) { - SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); - VectorTy = - largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); - } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { - // Leave SrcVecTy unchanged - we only shrink the destination element - // type. - VectorTy = - smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); - } + // If the result type is <= the source type, there will be no extend + // after truncating the users to the minimal required bitwidth. 
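A hypothetical instance of the simplified shrinking rule described in the comment above (not from the patch): for %w = zext i16 %x to i32, where MinBWs proves the users of %w only need 16 bits, the shrunk destination is no wider than the source, so the extend disappears entirely and is costed as free:

constexpr unsigned SrcBits = 16;       // source element type: i16
constexpr unsigned ShrunkDstBits = 16; // i32 result shrunk to MinBWs == 16
static_assert(ShrunkDstBits <= SrcBits,
              "extend becomes a no-op after bitwidth shrinking");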
+ if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() && + (I->getOpcode() == Instruction::ZExt || + I->getOpcode() == Instruction::SExt)) + return 0; } return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); @@ -7200,16 +6680,43 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore ephemeral values. CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); - // Find all stores to invariant variables. Since they are going to sink - // outside the loop we do not need calculate cost for them. + SmallVector<Value *, 4> DeadInterleavePointerOps; for (BasicBlock *BB : TheLoop->blocks()) for (Instruction &I : *BB) { + // Find all stores to invariant variables. Since they are going to sink + // outside the loop we do not need calculate cost for them. StoreInst *SI; if ((SI = dyn_cast<StoreInst>(&I)) && Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) ValuesToIgnore.insert(&I); + + // For interleave groups, we only create a pointer for the start of the + // interleave group. Queue up addresses of group members except the insert + // position for further processing. + if (isAccessInterleaved(&I)) { + auto *Group = getInterleavedAccessGroup(&I); + if (Group->getInsertPos() == &I) + continue; + Value *PointerOp = getLoadStorePointerOperand(&I); + DeadInterleavePointerOps.push_back(PointerOp); + } } + // Mark ops feeding interleave group members as free, if they are only used + // by other dead computations. + for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { + auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]); + if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) { + Instruction *UI = cast<Instruction>(U); + return !VecValuesToIgnore.contains(U) && + (!isAccessInterleaved(UI) || + getInterleavedAccessGroup(UI)->getInsertPos() == UI); + })) + continue; + VecValuesToIgnore.insert(Op); + DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); + } + // Ignore type-promoting instructions we identified during reduction // detection. for (const auto &Reduction : Legal->getReductionVars()) { @@ -7370,6 +6877,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { CM.invalidateCostModelingDecisions(); } + if (CM.foldTailByMasking()) + Legal->prepareToFoldTailByMasking(); + ElementCount MaxUserVF = UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); @@ -7395,14 +6905,14 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { "InvalidCost", ORE, OrigLoop); } - // Populate the set of Vectorization Factor Candidates. - ElementCountSet VFCandidates; + // Collect the Vectorization Factor Candidates. 
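An illustrative aside, not part of the patch, for the dead-interleave-pointer logic above: in a hypothetical loop like the one below, both loads form a single stride-2 interleave group, only the group's insert position gets a wide pointer, and the address computation that exclusively feeds the other member is now added to VecValuesToIgnore rather than costed:

// Hypothetical source loop; assumes the target chooses to widen the group.
int sumPairs(const int *A, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += A[2 * I] + A[2 * I + 1]; // one interleave group of two loads
  return Sum;
}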
+ SmallVector<ElementCount> VFCandidates; for (auto VF = ElementCount::getFixed(1); ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) - VFCandidates.insert(VF); + VFCandidates.push_back(VF); for (auto VF = ElementCount::getScalable(1); ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) - VFCandidates.insert(VF); + VFCandidates.push_back(VF); CM.collectInLoopReductions(); for (const auto &VF : VFCandidates) { @@ -7419,11 +6929,17 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); LLVM_DEBUG(printPlans(dbgs())); - if (!MaxFactors.hasVector()) + if (VPlans.empty()) + return std::nullopt; + if (all_of(VPlans, + [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); })) return VectorizationFactor::Disabled(); - // Select the optimal vectorization factor. - VectorizationFactor VF = selectVectorizationFactor(VFCandidates); + // Select the optimal vectorization factor according to the legacy cost-model. + // This is now only used to verify the decisions by the new VPlan-based + // cost-model and will be retired once the VPlan-based cost-model is + // stabilized. + VectorizationFactor VF = selectVectorizationFactor(); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); if (!hasPlanWithVF(VF.Width)) { LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width @@ -7433,6 +6949,212 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VF; } +InstructionCost VPCostContext::getLegacyCost(Instruction *UI, + ElementCount VF) const { + return CM.getInstructionCost(UI, VF); +} + +bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { + return CM.ValuesToIgnore.contains(UI) || + (IsVector && CM.VecValuesToIgnore.contains(UI)) || + SkipCostComputation.contains(UI); +} + +InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, + ElementCount VF) const { + InstructionCost Cost = 0; + LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext(); + VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM); + + // Cost modeling for inductions is inaccurate in the legacy cost model + // compared to the recipes that are generated. To match here initially during + // VPlan cost model bring up directly use the induction costs from the legacy + // cost model. Note that we do this as pre-processing; the VPlan may not have + // any recipes associated with the original induction increment instruction + // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute + // the cost of induction phis and increments (both that are represented by + // recipes and those that are not), to avoid distinguishing between them here, + // and skip all recipes that represent induction phis and increments (the + // former case) later on, if they exist, to avoid counting them twice. + // Similarly we pre-compute the cost of any optimized truncates. + // TODO: Switch to more accurate costing based on VPlan. 
+ for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { + Instruction *IVInc = cast<Instruction>( + IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + SmallVector<Instruction *> IVInsts = {IV, IVInc}; + for (User *U : IV->users()) { + auto *CI = cast<Instruction>(U); + if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF)) + continue; + IVInsts.push_back(CI); + } + for (Instruction *IVInst : IVInsts) { + if (!CostCtx.SkipCostComputation.insert(IVInst).second) + continue; + InstructionCost InductionCost = CostCtx.getLegacyCost(IVInst, VF); + LLVM_DEBUG({ + dbgs() << "Cost of " << InductionCost << " for VF " << VF + << ": induction instruction " << *IVInst << "\n"; + }); + Cost += InductionCost; + } + } + + /// Compute the cost of all exiting conditions of the loop using the legacy + /// cost model. This is to match the legacy behavior, which adds the cost of + /// all exit conditions. Note that this over-estimates the cost, as there will + /// be a single condition to control the vector loop. + SmallVector<BasicBlock *> Exiting; + CM.TheLoop->getExitingBlocks(Exiting); + SetVector<Instruction *> ExitInstrs; + // Collect all exit conditions. + for (BasicBlock *EB : Exiting) { + auto *Term = dyn_cast<BranchInst>(EB->getTerminator()); + if (!Term) + continue; + if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) { + ExitInstrs.insert(CondI); + } + } + // Compute the cost of all instructions only feeding the exit conditions. + for (unsigned I = 0; I != ExitInstrs.size(); ++I) { + Instruction *CondI = ExitInstrs[I]; + if (!OrigLoop->contains(CondI) || + !CostCtx.SkipCostComputation.insert(CondI).second) + continue; + Cost += CostCtx.getLegacyCost(CondI, VF); + for (Value *Op : CondI->operands()) { + auto *OpI = dyn_cast<Instruction>(Op); + if (!OpI || any_of(OpI->users(), [&ExitInstrs, this](User *U) { + return OrigLoop->contains(cast<Instruction>(U)->getParent()) && + !ExitInstrs.contains(cast<Instruction>(U)); + })) + continue; + ExitInstrs.insert(OpI); + } + } + + // The legacy cost model has special logic to compute the cost of in-loop + // reductions, which may be smaller than the sum of all instructions involved + // in the reduction. For AnyOf reductions, VPlan codegen may remove the select + // which the legacy cost model uses to assign cost. Pre-compute their costs + // for now. + // TODO: Switch to costing based on VPlan once the logic has been ported. + for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { + if (!CM.isInLoopReduction(RedPhi) && + !RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) + continue; + + // AnyOf reduction codegen may remove the select. To match the legacy cost + // model, pre-compute the cost for AnyOf reductions here. 
+ if (RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + auto *Select = cast<SelectInst>(*find_if( + RedPhi->users(), [](User *U) { return isa<SelectInst>(U); })); + assert(!CostCtx.SkipCostComputation.contains(Select) && + "reduction op visited multiple times"); + CostCtx.SkipCostComputation.insert(Select); + auto ReductionCost = CostCtx.getLegacyCost(Select, VF); + LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF + << ":\n any-of reduction " << *Select << "\n"); + Cost += ReductionCost; + continue; + } + + const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop); + SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(), + ChainOps.end()); + // Also include the operands of instructions in the chain, as the cost-model + // may mark extends as free. + for (auto *ChainOp : ChainOps) { + for (Value *Op : ChainOp->operands()) { + if (auto *I = dyn_cast<Instruction>(Op)) + ChainOpsAndOperands.insert(I); + } + } + + // Pre-compute the cost for I, if it has a reduction pattern cost. + for (Instruction *I : ChainOpsAndOperands) { + auto ReductionCost = CM.getReductionPatternCost( + I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); + if (!ReductionCost) + continue; + + assert(!CostCtx.SkipCostComputation.contains(I) && + "reduction op visited multiple times"); + CostCtx.SkipCostComputation.insert(I); + LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF + << ":\n in-loop reduction " << *I << "\n"); + Cost += *ReductionCost; + } + } + + // Pre-compute the costs for branches except for the backedge, as the number + // of replicate regions in a VPlan may not directly match the number of + // branches, which would lead to different decisions. + // TODO: Compute cost of branches for each replicate region in the VPlan, + // which is more accurate than the legacy cost model. + for (BasicBlock *BB : OrigLoop->blocks()) { + if (BB == OrigLoop->getLoopLatch()) + continue; + CostCtx.SkipCostComputation.insert(BB->getTerminator()); + auto BranchCost = CostCtx.getLegacyCost(BB->getTerminator(), VF); + Cost += BranchCost; + } + // Now compute and add the VPlan-based cost. + Cost += Plan.cost(VF, CostCtx); + LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n"); + return Cost; +} + +VPlan &LoopVectorizationPlanner::getBestPlan() const { + // If there is a single VPlan with a single VF, return it directly. + VPlan &FirstPlan = *VPlans[0]; + if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) + return FirstPlan; + + VPlan *BestPlan = &FirstPlan; + ElementCount ScalarVF = ElementCount::getFixed(1); + assert(hasPlanWithVF(ScalarVF) && + "More than a single plan/VF w/o any plan having scalar VF"); + + // TODO: Compute scalar cost using VPlan-based cost model. + InstructionCost ScalarCost = CM.expectedCost(ScalarVF); + VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost); + + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; + if (ForceVectorization) { + // Ignore scalar width, because the user explicitly wants vectorization. + // Initialize cost to max so that VF = 2 is, at least, chosen during cost + // evaluation. 
+ BestFactor.Cost = InstructionCost::getMax(); + } + + for (auto &P : VPlans) { + for (ElementCount VF : P->vectorFactors()) { + if (VF.isScalar()) + continue; + if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { + LLVM_DEBUG( + dbgs() + << "LV: Not considering vector loop of width " << VF + << " because it will not generate any vector instructions.\n"); + continue; + } + + InstructionCost Cost = cost(*P, VF); + VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); + if (isMoreProfitable(CurrentFactor, BestFactor)) { + BestFactor = CurrentFactor; + BestPlan = &*P; + } + } + } + BestPlan->setVF(BestFactor.Width); + return *BestPlan; +} + VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { assert(count_if(VPlans, [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == @@ -7485,7 +7207,8 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { static void createAndCollectMergePhiForReduction( VPInstruction *RedResult, DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, - VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { + VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, + bool VectorizingEpilogue) { if (!RedResult || RedResult->getOpcode() != VPInstruction::ComputeReductionResult) return; @@ -7493,19 +7216,29 @@ static void createAndCollectMergePhiForReduction( auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0)); const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Value *FinalValue = State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); auto *ResumePhi = dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); + if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue()); + assert(Cmp->getPredicate() == CmpInst::ICMP_NE); + assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue()); + ResumePhi = cast<PHINode>(Cmp->getOperand(0)); + } + assert((!VectorizingEpilogue || ResumePhi) && + "when vectorizing the epilogue loop, we need a resume phi from main " + "vector loop"); // TODO: bc.merge.rdx should not be created here, instead it should be // modeled in VPlan. BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. - auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()); + auto *BCBlockPhi = + PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()->getIterator()); // If we are fixing reductions in the epilogue loop then we should already // have created a bc.merge.rdx Phi after the main vector body. 
Ensure that @@ -7517,7 +7250,7 @@ static void createAndCollectMergePhiForReduction( BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), Incoming); else - BCBlockPhi->addIncoming(ReductionStartValue, Incoming); + BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); } auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); @@ -7549,12 +7282,14 @@ LoopVectorizationPlanner::executePlan( assert( (IsEpilogueVectorization || !ExpandedSCEVs) && "expanded SCEVs to reuse can only be used during epilogue vectorization"); + (void)IsEpilogueVectorization; - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF - << '\n'); + VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); - if (!IsEpilogueVectorization) - VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); + LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF + << ", UF=" << BestUF << '\n'); + BestVPlan.setName("Final VPlan"); + LLVM_DEBUG(BestVPlan.dump()); // Perform the actual loop transformation. VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, @@ -7579,6 +7314,9 @@ LoopVectorizationPlanner::executePlan( std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs); +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); +#endif // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. @@ -7598,8 +7336,6 @@ LoopVectorizationPlanner::executePlan( State.LVer->prepareNoAliasMetadata(); } - ILV.collectPoisonGeneratingRecipes(State); - ILV.printDebugTracesAtStart(); //===------------------------------------------------===// @@ -7622,9 +7358,9 @@ LoopVectorizationPlanner::executePlan( auto *ExitVPBB = cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); for (VPRecipeBase &R : *ExitVPBB) { - createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R), - ReductionResumeValues, State, OrigLoop, - State.CFG.VPBB2IRBB[ExitVPBB]); + createAndCollectMergePhiForReduction( + dyn_cast<VPInstruction>(&R), ReductionResumeValues, State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); } // 2.6. Maintain Loop Hints @@ -7661,6 +7397,18 @@ LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); + // 4. Adjust branch weight of the branch in the middle block. + auto *MiddleTerm = + cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = State.UF * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } + return {State.ExpandedSCEVs, ReductionResumeValues}; } @@ -7717,7 +7465,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( // inductions in the epilogue loop are created before executing the plan for // the epilogue loop. 
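An illustrative aside, not part of the patch, for the middle-block branch-weight adjustment above: with a hypothetical VF = 4 and UF = 2 the vector body covers 8 elements per iteration, and under the comment's assumption that the remainder is uniformly distributed, only one of the 8 possible remainders lets the middle block skip the scalar loop:

#include <cstdint>

constexpr unsigned VF = 4, UF = 2;               // hypothetical factors
constexpr uint32_t Weights[] = {1, VF * UF - 1}; // {1, 7}, mirroring the patch
static_assert(Weights[0] + Weights[1] == VF * UF);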
- return {completeLoopSkeleton(), nullptr}; + return {LoopVectorPreHeader, nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7772,14 +7520,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, DT->getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"); - // Update dominator for Bypass & LoopExit. + // Update dominator for Bypass. DT->changeImmediateDominator(Bypass, TCCheckBlock); - if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) - // For loops with multiple exits, there's no edge from the middle block - // to exit blocks (as the epilogue must run) and thus no need to update - // the immediate dominator of the exit blocks. - DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); - LoopBypassBlocks.push_back(TCCheckBlock); // Save the trip count so we don't have to regenerate it in the @@ -7791,7 +7533,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) - setBranchWeights(BI, MinItersBypassWeights); + setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); return TCCheckBlock; @@ -7810,11 +7552,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. - BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; - VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); - LoopVectorPreHeader = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, "vec.epilog.ph"); + LoopVectorPreHeader->setName("vec.epilog.ph"); + BasicBlock *VecEpilogueIterationCountCheck = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, + nullptr, "vec.epilog.iter.check", true); emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); @@ -7907,7 +7648,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( {VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); - return {completeLoopSkeleton(), EPResumeVal}; + return {LoopVectorPreHeader, EPResumeVal}; } BasicBlock * @@ -7949,10 +7690,9 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); const uint32_t Weights[] = {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount}; - setBranchWeights(BI, Weights); + setBranchWeights(BI, Weights, /*IsExpected=*/false); } ReplaceInstWithInst(Insert->getTerminator(), &BI); - LoopBypassBlocks.push_back(Insert); return Insert; } @@ -8000,8 +7740,19 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, } } -VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, - VPlan &Plan) { +iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> +VPRecipeBuilder::mapToVPValues(User::op_range Operands) { + std::function<VPValue *(Value *)> Fn = [this](Value *Op) { + if (auto *I = dyn_cast<Instruction>(Op)) { + if (auto *R = Ingredient2Recipe.lookup(I)) + return R->getVPSingleValue(); + } + return Plan.getOrAddLiveIn(Op); + }; + return map_range(Operands, Fn); +} + +VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(is_contained(predecessors(Dst), 
Src) && "Invalid edge"); // Look for cached value. @@ -8025,27 +7776,34 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, if (OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; - VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition()); + VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition(), Plan); assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. - // The condition is 'SrcMask && EdgeMask', which is equivalent to - // 'select i1 SrcMask, i1 EdgeMask, i1 false'. - // The select version does not introduce new UB if SrcMask is false and - // EdgeMask is poison. Using 'and' here introduces undefined behavior. - VPValue *False = Plan.getVPValueOrAddLiveIn( - ConstantInt::getFalse(BI->getCondition()->getType())); - EdgeMask = - Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); + // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask + // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' + // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. + EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc()); } return EdgeMaskCache[Edge] = EdgeMask; } -void VPRecipeBuilder::createHeaderMask(VPlan &Plan) { +VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { + assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); + + // Look for cached value. + std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); + EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge); + assert(ECEntryIt != EdgeMaskCache.end() && + "looking up mask for edge which has not been created"); + return ECEntryIt->second; +} + +void VPRecipeBuilder::createHeaderMask() { BasicBlock *Header = OrigLoop->getHeader(); // When not folding the tail, use nullptr to model all-true mask. @@ -8080,7 +7838,7 @@ VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { return BCEntryIt->second; } -void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { +void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed"); assert(OrigLoop->getHeader() != BB && @@ -8091,7 +7849,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { VPValue *BlockMask = nullptr; // This is the block mask. We OR all incoming edges. for (auto *Predecessor : predecessors(BB)) { - VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); + VPValue *EdgeMask = createEdgeMask(Predecessor, BB); if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too. 
BlockMaskCache[BB] = EdgeMask; return; @@ -8108,10 +7866,9 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) { BlockMaskCache[BB] = BlockMask; } -VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, - ArrayRef<VPValue *> Operands, - VFRange &Range, - VPlanPtr &Plan) { +VPWidenMemoryRecipe * +VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, + VFRange &Range) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Must be called with either a load or store"); @@ -8154,12 +7911,12 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Ptr = VectorPtr; } if (LoadInst *Load = dyn_cast<LoadInst>(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive, - Reverse); + return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, + I->getDebugLoc()); StoreInst *Store = cast<StoreInst>(I); - return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask, - Consecutive, Reverse); + return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, + Reverse, I->getDebugLoc()); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -8167,8 +7924,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, - VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, - VFRange &Range) { + VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { assert(IndDesc.getStartValue() == Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && @@ -8183,14 +7939,14 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc); } -VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( - PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { +VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( + PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, - *PSE.getSE(), *OrigLoop, Range); + *PSE.getSE(), *OrigLoop); // Check if this is pointer induction. If so, build the recipe for it. if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { @@ -8208,7 +7964,7 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( } VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( - TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { + TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { // Optimize the special case where the source is a constant integer // induction variable. 
Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -8228,62 +7984,46 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( auto *Phi = cast<PHINode>(I->getOperand(0)); const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); - VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue()); + VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), - *OrigLoop, Range); + *OrigLoop); } return nullptr; } -VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, - ArrayRef<VPValue *> Operands, - VPlanPtr &Plan) { - // If all incoming values are equal, the incoming VPValue can be used directly - // instead of creating a new VPBlendRecipe. - if (llvm::all_equal(Operands)) - return Operands[0]; - +VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, + ArrayRef<VPValue *> Operands) { unsigned NumIncoming = Phi->getNumIncomingValues(); - // For in-loop reductions, we do not need to create an additional select. - VPValue *InLoopVal = nullptr; - for (unsigned In = 0; In < NumIncoming; In++) { - PHINode *PhiOp = - dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); - if (PhiOp && CM.isInLoopReduction(PhiOp)) { - assert(!InLoopVal && "Found more than one in-loop reduction!"); - InLoopVal = Operands[In]; - } - } - - assert((!InLoopVal || NumIncoming == 2) && - "Found an in-loop reduction for PHI with unexpected number of " - "incoming values"); - if (InLoopVal) - return Operands[Operands[0] == InLoopVal ? 1 : 0]; // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. + // TODO: At the moment the first mask is always skipped, but it would be + // better to skip the most expensive mask. SmallVector<VPValue *, 2> OperandsWithMask; for (unsigned In = 0; In < NumIncoming; In++) { - VPValue *EdgeMask = - createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan); - assert((EdgeMask || NumIncoming == 1) && - "Multiple predecessors with one having a full mask"); OperandsWithMask.push_back(Operands[In]); - if (EdgeMask) - OperandsWithMask.push_back(EdgeMask); + VPValue *EdgeMask = + getEdgeMask(Phi->getIncomingBlock(In), Phi->getParent()); + if (!EdgeMask) { + assert(In == 0 && "Both null and non-null edge masks found"); + assert(all_equal(Operands) && + "Distinct incoming values with one having a full mask"); + break; + } + if (In == 0) + continue; + OperandsWithMask.push_back(EdgeMask); } - return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); + return new VPBlendRecipe(Phi, OperandsWithMask); } VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands, - VFRange &Range, - VPlanPtr &Plan) { + VFRange &Range) { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI, VF); @@ -8301,6 +8041,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, return nullptr; SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); + Ops.push_back(Operands.back()); // Is it beneficial to perform intrinsic call compared to lib call? 
bool ShouldUseVectorIntrinsic = @@ -8311,7 +8052,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, }, Range); if (ShouldUseVectorIntrinsic) - return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID, + return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID, CI->getDebugLoc()); Function *Variant = nullptr; @@ -8358,13 +8099,13 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, if (Legal->isMaskRequired(CI)) Mask = getBlockInMask(CI->getParent()); else - Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( + Mask = Plan.getOrAddLiveIn(ConstantInt::getTrue( IntegerType::getInt1Ty(Variant->getFunctionType()->getContext()))); Ops.insert(Ops.begin() + *MaskPos, Mask); } - return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), + return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), Intrinsic::not_intrinsic, CI->getDebugLoc(), Variant); } @@ -8386,9 +8127,9 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { Range); } -VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, - ArrayRef<VPValue *> Operands, - VPBasicBlock *VPBB, VPlanPtr &Plan) { +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, + ArrayRef<VPValue *> Operands, + VPBasicBlock *VPBB) { switch (I->getOpcode()) { default: return nullptr; @@ -8401,12 +8142,9 @@ VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, if (CM.isPredicatedInst(I)) { SmallVector<VPValue *> Ops(Operands.begin(), Operands.end()); VPValue *Mask = getBlockInMask(I->getParent()); - VPValue *One = Plan->getVPValueOrAddLiveIn( - ConstantInt::get(I->getType(), 1u, false)); - auto *SafeRHS = - new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, - I->getDebugLoc()); - VPBB->appendRecipe(SafeRHS); + VPValue *One = + Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false)); + auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); } @@ -8445,9 +8183,8 @@ void VPRecipeBuilder::fixHeaderPhis() { } } -VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, - VFRange &Range, - VPlan &Plan) { +VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, + VFRange &Range) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); @@ -8497,29 +8234,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I, BlockInMask = getBlockInMask(I->getParent()); } - auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()), + // Note that there is some custom logic to mark some intrinsics as uniform + // manually above for scalable vectors, which this assert needs to account for + // as well. + assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || + (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && + "Should not predicate a uniform recipe"); + auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()), IsUniform, BlockInMask); - return toVPRecipeResult(Recipe); + return Recipe; } -VPRecipeOrVPValueTy +VPRecipeBase * VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef<VPValue *> Operands, - VFRange &Range, VPBasicBlock *VPBB, - VPlanPtr &Plan) { + VFRange &Range, VPBasicBlock *VPBB) { // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. 
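Most of the tryTo* helpers above funnel their per-VF decisions through LoopVectorizationPlanner::getDecisionAndClampRange, which evaluates a predicate at the start of the VF range and shrinks the range so every VF left in it makes the same decision. A hedged sketch of that general pattern, using a stand-in VFRangeSketch type rather than the real VFRange/ElementCount API:

    #include <cstdio>
    #include <functional>

    struct VFRangeSketch { unsigned Start, End; };  // half-open [Start, End), powers of two

    bool decisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                               VFRangeSketch &Range) {
      bool PredicateAtRangeStart = Predicate(Range.Start);
      // Clamp End so that all VFs remaining in the range agree with Start.
      for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
        if (Predicate(VF) != PredicateAtRangeStart) {
          Range.End = VF;
          break;
        }
      return PredicateAtRangeStart;
    }

    int main() {
      VFRangeSketch Range{2, 32};
      // E.g. "should this call be widened?" might only hold up to VF = 8.
      bool Widen = decisionAndClampRange([](unsigned VF) { return VF <= 8; }, Range);
      std::printf("widen=%d, clamped range=[%u,%u)\n", Widen, Range.Start, Range.End);
      return 0;
    }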
VPRecipeBase *Recipe; if (auto Phi = dyn_cast<PHINode>(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) - return tryToBlend(Phi, Operands, Plan); + return tryToBlend(Phi, Operands); - // Always record recipes for header phis. Later first-order recurrence phis - // can have earlier phis as incoming values. - recordRecipeOf(Phi); - - if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) - return toVPRecipeResult(Recipe); + if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) + return Recipe; VPHeaderPHIRecipe *PhiRecipe = nullptr; assert((Legal->isReductionVariable(Phi) || @@ -8542,22 +8280,13 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } - // Record the incoming value from the backedge, so we can add the incoming - // value from the backedge after all recipes have been created. - auto *Inc = cast<Instruction>( - Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); - auto RecipeIter = Ingredient2Recipe.find(Inc); - if (RecipeIter == Ingredient2Recipe.end()) - recordRecipeOf(Inc); - PhisToFix.push_back(PhiRecipe); - return toVPRecipeResult(PhiRecipe); + return PhiRecipe; } - if (isa<TruncInst>(Instr) && - (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, - Range, *Plan))) - return toVPRecipeResult(Recipe); + if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( + cast<TruncInst>(Instr), Operands, Range))) + return Recipe; // All widen recipes below deal only with VF > 1. if (LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8565,29 +8294,29 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, return nullptr; if (auto *CI = dyn_cast<CallInst>(Instr)) - return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); + return tryToWidenCall(CI, Operands, Range); if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) - return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); + return tryToWidenMemory(Instr, Operands, Range); if (!shouldWiden(Instr, Range)) return nullptr; if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) - return toVPRecipeResult(new VPWidenGEPRecipe( - GEP, make_range(Operands.begin(), Operands.end()))); + return new VPWidenGEPRecipe(GEP, + make_range(Operands.begin(), Operands.end())); if (auto *SI = dyn_cast<SelectInst>(Instr)) { - return toVPRecipeResult(new VPWidenSelectRecipe( - *SI, make_range(Operands.begin(), Operands.end()))); + return new VPWidenSelectRecipe( + *SI, make_range(Operands.begin(), Operands.end())); } if (auto *CI = dyn_cast<CastInst>(Instr)) { - return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0], - CI->getType(), *CI)); + return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), + *CI); } - return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); + return tryToWiden(Instr, Operands, VPBB); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, @@ -8603,7 +8332,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, VPlanTransforms::truncateToMinimalBitwidths( *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); VPlanTransforms::optimize(*Plan, *PSE.getSE()); - assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); + // TODO: try to put it close to addActiveLaneMask(). 
+ // Discard the plan if it is not EVL-compatible + if (CM.foldTailWithEVL() && + !VPlanTransforms::tryAddExplicitVectorLength(*Plan)) + break; + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } VF = SubRange.End; @@ -8615,7 +8349,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL) { Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx); + auto *StartV = Plan.getOrAddLiveIn(StartIdx); // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); @@ -8623,27 +8357,22 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); - // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar - // IV by VF * UF. - auto *CanonicalIVIncrement = - new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, - {HasNUW, false}, DL, "index.next"); + VPBuilder Builder(TopRegion->getExitingBasicBlock()); + // Add a VPInstruction to increment the scalar canonical IV by VF * UF. + auto *CanonicalIVIncrement = Builder.createOverflowingOp( + Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL, + "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); - VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - EB->appendRecipe(CanonicalIVIncrement); - // Add the BranchOnCount VPInstruction to the latch. - VPInstruction *BranchBack = - new VPInstruction(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchBack); + Builder.createNaryOp(VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the // original exit block. static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, - VPlan &Plan) { + VPRecipeBuilder &Builder, VPlan &Plan) { BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); // Only handle single-exit loops with unique exit blocks for now. @@ -8654,17 +8383,115 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop, for (PHINode &ExitPhi : ExitBB->phis()) { Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB); - VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue); + VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan); + // Exit values for inductions are computed and updated outside of VPlan and + // independent of induction recipes. + // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update + // live-outs. + if ((isa<VPWidenIntOrFpInductionRecipe>(V) && + !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) || + isa<VPWidenPointerInductionRecipe>(V)) + continue; Plan.addLiveOut(&ExitPhi, V); } } +/// Feed a resume value for every FOR from the vector loop to the scalar loop, +/// if middle block branches to scalar preheader, by introducing ExtractFromEnd +/// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the +/// latter and corresponds to the scalar header. 
+static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + + // Start by finding out if middle block branches to scalar preheader, which is + // not a VPIRBasicBlock, unlike Exit block - the other possible successor of + // middle block. + // TODO: Should be replaced by + // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the + // scalar region is modeled as well. + VPBasicBlock *ScalarPHVPBB = nullptr; + auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor()); + for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) { + if (isa<VPIRBasicBlock>(Succ)) + continue; + assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?"); + ScalarPHVPBB = cast<VPBasicBlock>(Succ); + } + if (!ScalarPHVPBB) + return; + + VPBuilder ScalarPHBuilder(ScalarPHVPBB); + VPBuilder MiddleBuilder(MiddleVPBB); + // Reset insert point so new recipes are inserted before terminator and + // condition, if there is either the former or both. + if (auto *Terminator = MiddleVPBB->getTerminator()) { + auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0)); + assert((!Condition || Condition->getParent() == MiddleVPBB) && + "Condition expected in MiddleVPBB"); + MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator); + } + VPValue *OneVPV = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + + for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { + auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi); + if (!FOR) + continue; + + // Extract the resume value and create a new VPLiveOut for it. + auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd, + {FOR->getBackedgeValue(), OneVPV}, + {}, "vector.recur.extract"); + auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( + VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {}, + "scalar.recur.init"); + Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe); + } +} + VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); + // --------------------------------------------------------------------------- + // Build initial VPlan: Scan the body of the loop in a topological order to + // visit each basic block after having visited its predecessor basic blocks. + // --------------------------------------------------------------------------- + + // Create initial VPlan skeleton, having a basic block for the pre-header + // which contains SCEV expansions that need to happen before the CFG is + // modified; a basic block for the vector pre-header, followed by a region for + // the vector loop, followed by the middle basic block. The skeleton vector + // loop region contains a header and latch basic blocks. + + bool RequiresScalarEpilogueCheck = + LoopVectorizationPlanner::getDecisionAndClampRange( + [this](ElementCount VF) { + return !CM.requiresScalarEpilogue(VF.isVector()); + }, + Range); + VPlanPtr Plan = VPlan::createInitialVPlan( + createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), + *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(), + OrigLoop); + + // Don't use getDecisionAndClampRange here, because we don't know the UF + // so this function is better to be conservative, rather than to split + // it up into different VPlans. 
+ // TODO: Consider using getDecisionAndClampRange here to split up VPlans. + bool IVUpdateMayOverflow = false; + for (ElementCount VF : Range) + IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); + + DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); + TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); + // When not folding the tail, we know that the induction increment will not + // overflow. + bool HasNUW = Style == TailFoldingStyle::None; + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); + + VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); // --------------------------------------------------------------------------- // Pre-construction: record ingredients whose recipes we'll need to further @@ -8690,55 +8517,26 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { if (!getDecisionAndClampRange(applyIG, Range)) continue; InterleaveGroups.insert(IG); - for (unsigned i = 0; i < IG->getFactor(); i++) - if (Instruction *Member = IG->getMember(i)) - RecipeBuilder.recordRecipeOf(Member); }; // --------------------------------------------------------------------------- - // Build initial VPlan: Scan the body of the loop in a topological order to - // visit each basic block after having visited its predecessor basic blocks. + // Construct recipes for the instructions in the loop // --------------------------------------------------------------------------- - // Create initial VPlan skeleton, having a basic block for the pre-header - // which contains SCEV expansions that need to happen before the CFG is - // modified; a basic block for the vector pre-header, followed by a region for - // the vector loop, followed by the middle basic block. The skeleton vector - // loop region contains a header and latch basic blocks. - VPlanPtr Plan = VPlan::createInitialVPlan( - createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), - *PSE.getSE()); - VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); - VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); - VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); - Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); - Plan->getVectorLoopRegion()->setExiting(LatchVPBB); - - // Don't use getDecisionAndClampRange here, because we don't know the UF - // so this function is better to be conservative, rather than to split - // it up into different VPlans. - // TODO: Consider using getDecisionAndClampRange here to split up VPlans. - bool IVUpdateMayOverflow = false; - for (ElementCount VF : Range) - IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); - - DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); - // When not folding the tail, we know that the induction increment will not - // overflow. - bool HasNUW = Style == TailFoldingStyle::None; - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); - // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. 
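addCanonicalIVRecipes above plants the canonical induction: a phi starting at 0, an "index.next" increment by VF * UF (with nuw when the tail is not folded, since the increment then cannot overflow), and a BranchOnCount against the vector trip count in the latch. A scalar model of that control flow, with made-up VF, UF and trip count (a sketch, not generated code):

    #include <cstdio>

    int main() {
      unsigned VF = 4, UF = 2;                       // assumed example values
      unsigned OrigTripCount = 37;
      unsigned Step = VF * UF;                       // VFxUF
      unsigned VectorTripCount = OrigTripCount - OrigTripCount % Step;  // 32
      // The minimum-iteration check guarantees VectorTripCount > 0 on entry.
      unsigned Index = 0;                            // canonical IV phi, starts at 0
      for (;;) {
        // ... vector body processes lanes [Index, Index + Step) ...
        unsigned IndexNext = Index + Step;           // "index.next" (nuw here)
        if (IndexNext == VectorTripCount)            // BranchOnCount
          break;
        Index = IndexNext;
      }
      std::printf("scalar remainder starts at %u\n", VectorTripCount);
      return 0;
    }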
LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); + VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); VPBasicBlock *VPBB = HeaderVPBB; - bool NeedsMasks = CM.foldTailByMasking() || - any_of(OrigLoop->blocks(), [this](BasicBlock *BB) { - return Legal->blockNeedsPredication(BB); - }); + BasicBlock *HeaderBB = OrigLoop->getHeader(); + bool NeedsMasks = + CM.foldTailByMasking() || + any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) { + bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); + return Legal->blockNeedsPredication(BB) || NeedsBlends; + }); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. @@ -8747,9 +8545,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { Builder.setInsertPoint(VPBB); if (VPBB == HeaderVPBB) - RecipeBuilder.createHeaderMask(*Plan); + RecipeBuilder.createHeaderMask(); else if (NeedsMasks) - RecipeBuilder.createBlockInMask(BB, *Plan); + RecipeBuilder.createBlockInMask(BB); // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. @@ -8757,11 +8555,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { Instruction *Instr = &I; SmallVector<VPValue *, 4> Operands; auto *Phi = dyn_cast<PHINode>(Instr); - if (Phi && Phi->getParent() == OrigLoop->getHeader()) { - Operands.push_back(Plan->getVPValueOrAddLiveIn( + if (Phi && Phi->getParent() == HeaderBB) { + Operands.push_back(Plan->getOrAddLiveIn( Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); } else { - auto OpRange = Plan->mapToVPValues(Instr->operands()); + auto OpRange = RecipeBuilder.mapToVPValues(Instr->operands()); Operands = {OpRange.begin(), OpRange.end()}; } @@ -8772,26 +8570,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) continue; - auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( - Instr, Operands, Range, VPBB, Plan); - if (!RecipeOrValue) - RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); - // If Instr can be simplified to an existing VPValue, use it. - if (isa<VPValue *>(RecipeOrValue)) { - auto *VPV = cast<VPValue *>(RecipeOrValue); - Plan->addVPValue(Instr, VPV); - // If the re-used value is a recipe, register the recipe for the - // instruction, in case the recipe for Instr needs to be recorded. - if (VPRecipeBase *R = VPV->getDefiningRecipe()) - RecipeBuilder.setRecipe(Instr, R); - continue; - } - // Otherwise, add the new recipe. - VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); - for (auto *Def : Recipe->definedValues()) { - auto *UV = Def->getUnderlyingValue(); - Plan->addVPValue(UV, Def); - } + VPRecipeBase *Recipe = + RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); + if (!Recipe) + Recipe = RecipeBuilder.handleReplication(Instr, Range); RecipeBuilder.setRecipe(Instr, Recipe); if (isa<VPHeaderPHIRecipe>(Recipe)) { @@ -8823,7 +8605,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. 
} else - addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan); + addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan); assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && @@ -8831,30 +8613,33 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); + addLiveOutsForFirstOrderRecurrences(*Plan); + // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. // --------------------------------------------------------------------------- // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start); + adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. for (const auto *IG : InterleaveGroups) { - auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( - RecipeBuilder.getRecipe(IG->getInsertPos())); + auto *Recipe = + cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos())); SmallVector<VPValue *, 4> StoredValues; for (unsigned i = 0; i < IG->getFactor(); ++i) if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { - auto *StoreR = - cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); + auto *StoreR = cast<VPWidenStoreRecipe>(RecipeBuilder.getRecipe(SI)); StoredValues.push_back(StoreR->getStoredValue()); } bool NeedsMaskForGaps = IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); + assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) && + "masked interleaved groups are not allowed."); auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, Recipe->getMask(), NeedsMaskForGaps); VPIG->insertBefore(Recipe); @@ -8883,17 +8668,31 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // Only handle constant strides for now. if (!ScevStride) continue; - Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); - auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); - // The versioned value may not be used in the loop directly, so just add a - // new live-in in those cases. - Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); + auto *CI = Plan->getOrAddLiveIn( + ConstantInt::get(Stride->getType(), ScevStride->getAPInt())); + if (VPValue *StrideVPV = Plan->getLiveIn(StrideV)) + StrideVPV->replaceAllUsesWith(CI); + + // The versioned value may not be used in the loop directly but through a + // sext/zext. Add new live-ins in those cases. + for (Value *U : StrideV->users()) { + if (!isa<SExtInst, ZExtInst>(U)) + continue; + VPValue *StrideVPV = Plan->getLiveIn(U); + if (!StrideVPV) + continue; + unsigned BW = U->getType()->getScalarSizeInBits(); + APInt C = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW) + : ScevStride->getAPInt().zext(BW); + VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(U->getType(), C)); + StrideVPV->replaceAllUsesWith(CI); + } } - // From this point onwards, VPlan-to-VPlan transformations may change the plan - // in ways that accessing values using original IR values is incorrect. 
- Plan->disableValue2VPValue(); + VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) { + return Legal->blockNeedsPredication(BB); + }); // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. @@ -8923,7 +8722,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // Create new empty VPlan auto Plan = VPlan::createInitialVPlan( createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), - *PSE.getSE()); + *PSE.getSE(), true, false, OrigLoop); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -8948,6 +8747,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { bool HasNUW = true; addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DebugLoc()); + assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -8960,9 +8760,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // A ComputeReductionResult recipe is added to the middle block, also for // in-loop reductions which compute their result in-loop, because generating // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. +// +// Adjust AnyOf reductions; replace the reduction phi for the selected value +// with a boolean reduction phi node to check if the condition is true in any +// iteration. The final value is selected by the final ComputeReductionResult. void LoopVectorizationPlanner::adjustRecipesForReductions( - VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF) { + VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores @@ -9034,7 +8837,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // the phi until LoopExitValue. We keep track of the previous item // (PreviousLink) to tell which of the two operands of a Link will remain // scalar and which will be reduced. For minmax by select(cmp), Link will be - // the select instructions. + // the select instructions. Blend recipes of in-loop reduction phi's will + // get folded to their non-phi operand, as the reduction recipe handles the + // condition directly. VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 
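A scalar analogy for the blend folding described in the comment above: when an in-loop reduction phi is blended with its update under an edge mask, the blend can be replaced by its non-phi operand and the mask handled as the reduction recipe's condition, without changing the result (illustration only; the array contents and mask are made up):

    #include <cstdio>

    int main() {
      int A[] = {3, 0, 7, 0, 5};
      int AccBlend = 0, AccCond = 0;
      for (int X : A) {
        bool Mask = (X != 0);                         // block-in mask guarding the update
        AccBlend = Mask ? (AccBlend + X) : AccBlend;  // phi blended with its update
        AccCond += Mask ? X : 0;                      // update folded in, mask as CondOp
      }
      std::printf("%d == %d\n", AccBlend, AccCond);   // both sum only the masked lanes
      return 0;
    }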
for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); @@ -9065,6 +8870,20 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); VecOp = FMulRecipe; } else { + auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink); + if (PhiR->isInLoop() && Blend) { + assert(Blend->getNumIncomingValues() == 2 && + "Blend must have 2 incoming values"); + if (Blend->getIncomingValue(0) == PhiR) + Blend->replaceAllUsesWith(Blend->getIncomingValue(1)); + else { + assert(Blend->getIncomingValue(1) == PhiR && + "PhiR must be an operand of the blend"); + Blend->replaceAllUsesWith(Blend->getIncomingValue(0)); + } + continue; + } + if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { if (isa<VPWidenRecipe>(CurrentLink)) { assert(isa<CmpInst>(CurrentLinkI) && @@ -9095,14 +8914,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( BasicBlock *BB = CurrentLinkI->getParent(); VPValue *CondOp = nullptr; - if (CM.blockNeedsPredicationForAnyReason(BB)) { - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(CurrentLink); + if (CM.blockNeedsPredicationForAnyReason(BB)) CondOp = RecipeBuilder.getBlockInMask(BB); - } - VPReductionRecipe *RedRecipe = new VPReductionRecipe( - RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp); + VPReductionRecipe *RedRecipe = + new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp, + CondOp, CM.useOrderedReductions(RdxDesc)); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. // Note that this transformation may leave over dead recipes (including @@ -9112,7 +8929,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( PreviousLink = RedRecipe; } } + VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); Builder.setInsertPoint(&*LatchVPBB->begin()); + VPBasicBlock *MiddleVPBB = + cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()); + VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); @@ -9120,6 +8941,41 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( continue; const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + // Adjust AnyOf reductions; replace the reduction phi for the selected value + // with a boolean reduction phi node to check if the condition is true in + // any iteration. The final value is selected by the final + // ComputeReductionResult. + if (RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) { + return isa<VPWidenSelectRecipe>(U) || + (isa<VPReplicateRecipe>(U) && + cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() == + Instruction::Select); + })); + VPValue *Cmp = Select->getOperand(0); + // If the compare is checking the reduction PHI node, adjust it to check + // the start value. 
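A scalar analogy for the AnyOf rewrite described in the comment above: instead of carrying the selected value through the loop, carry a boolean "condition held in some iteration" reduction that starts at false, and let the final ComputeReductionResult pick between the selected value and the start value (illustration only; the data and condition are made up):

    #include <cstdio>

    int main() {
      int A[] = {1, 4, 9, 2};
      int StartValue = -1, SelectedValue = 7;

      // Original form: the reduction phi carries the selected value.
      int Red = StartValue;
      for (int X : A)
        Red = (X > 5) ? SelectedValue : Red;

      // Rewritten form: boolean reduction phi, plus one select after the loop.
      bool Any = false;
      for (int X : A)
        Any = Any | (X > 5);
      int Red2 = Any ? SelectedValue : StartValue;

      std::printf("%d == %d\n", Red, Red2);
      return 0;
    }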
+ if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { + for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) + if (CmpR->getOperand(I) == PhiR) + CmpR->setOperand(I, PhiR->getStartValue()); + } + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint(Select); + + // If the true value of the select is the reduction phi, the new value is + // selected if the negated condition is true in any iteration. + if (Select->getOperand(1) == PhiR) + Cmp = Builder.createNot(Cmp); + VPValue *Or = Builder.createOr(PhiR, Cmp); + Select->getVPSingleValue()->replaceAllUsesWith(Or); + + // Convert the reduction phi to operate on bools. + PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( + OrigLoop->getHeader()->getContext()))); + } + // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. @@ -9152,7 +9008,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); - if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { + if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && + !RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxTy = RdxDesc.getRecurrenceType(); auto *Trunc = @@ -9184,8 +9042,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // also modeled in VPlan. auto *FinalReductionResult = new VPInstruction( VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); - cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()) - ->appendRecipe(FinalReductionResult); + FinalReductionResult->insertBefore(*MiddleVPBB, IP); OrigExitingVPV->replaceUsesWithIf( FinalReductionResult, [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); @@ -9194,91 +9051,29 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPlanTransforms::clearReductionWrapFlags(*Plan); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; - IG->getInsertPos()->printAsOperand(O, false); - O << ", "; - getAddr()->printAsOperand(O, SlotTracker); - VPValue *Mask = getMask(); - if (Mask) { - O << ", "; - Mask->printAsOperand(O, SlotTracker); - } - - unsigned OpIdx = 0; - for (unsigned i = 0; i < IG->getFactor(); ++i) { - if (!IG->getMember(i)) - continue; - if (getNumStoreOperands() > 0) { - O << "\n" << Indent << " store "; - getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); - O << " to index " << i; - } else { - O << "\n" << Indent << " "; - getVPValue(OpIdx)->printAsOperand(O, SlotTracker); - O << " = load from index " << i; - } - ++OpIdx; - } -} -#endif - void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && "Not a pointer induction according to InductionDescriptor!"); assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && "Unexpected type."); + assert(!onlyScalarsGenerated(State.VF.isScalable()) && + "Recipe should have been replaced"); auto *IVR = getParent()->getPlan()->getCanonicalIV(); - PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); - - if 
(onlyScalarsGenerated(State.VF)) { - // This is the normalized GEP that starts counting at zero. - Value *PtrInd = State.Builder.CreateSExtOrTrunc( - CanonicalIV, IndDesc.getStep()->getType()); - // Determine the number of scalars we need to generate for each unroll - // iteration. If the instruction is uniform, we only need to generate the - // first lane. Otherwise, we generate all VF values. - bool IsUniform = vputils::onlyFirstLaneUsed(this); - assert((IsUniform || !State.VF.isScalable()) && - "Cannot scalarize a scalable VF"); - unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *PartStart = - createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); - - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Value *Idx = State.Builder.CreateAdd( - PartStart, ConstantInt::get(PtrInd->getType(), Lane)); - Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); - - Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); - Value *SclrGep = emitTransformedIndex( - State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, - IndDesc.getKind(), IndDesc.getInductionBinOp()); - SclrGep->setName("next.gep"); - State.set(this, SclrGep, VPIteration(Part, Lane)); - } - } - return; - } - + PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true)); Type *PhiType = IndDesc.getStep()->getType(); // Build a pointer phi Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); Type *ScStValueType = ScalarStartValue->getType(); - PHINode *NewPointerPhi = - PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); + PHINode *NewPointerPhi = PHINode::Create(ScStValueType, 2, "pointer.phi", + CanonicalIV->getIterator()); BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); // A pointer induction, performed by using a gep - Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); + BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint(); Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); @@ -9329,84 +9124,21 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); Value *Step = State.get(getStepValue(), VPIteration(0, 0)); - Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); + Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0)); Value *DerivedIV = emitTransformedIndex( State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present<BinaryOperator>(FPBinOp)); DerivedIV->setName("offset.idx"); - if (TruncResultTy) { - assert(TruncResultTy != DerivedIV->getType() && - Step->getType()->isIntegerTy() && - "Truncation requires an integer step"); - DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy); - } assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); State.set(this, DerivedIV, VPIteration(0, 0)); } -void VPInterleaveRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), - getStoredValues(), getMask(), - NeedsMaskForGaps); -} - -void VPReductionRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "Reduction being replicated."); - Value *PrevInChain = State.get(getChainOp(), 0); - RecurKind Kind = RdxDesc.getRecurrenceKind(); - bool IsOrdered = 
State.ILV->useOrderedReductions(RdxDesc); - // Propagate the fast-math flags carried by the underlying instruction. - IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); - State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *NewVecOp = State.get(getVecOp(), Part); - if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.VF.isVector() ? State.get(Cond, Part) - : State.get(Cond, {Part, 0}); - VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); - Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); - Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, - RdxDesc.getFastMathFlags()); - if (State.VF.isVector()) { - Iden = - State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); - } - - Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); - NewVecOp = Select; - } - Value *NewRed; - Value *NextInChain; - if (IsOrdered) { - if (State.VF.isVector()) - NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, - PrevInChain); - else - NewRed = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, - NewVecOp); - PrevInChain = NewRed; - } else { - PrevInChain = State.get(getChainOp(), Part); - NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); - } - if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { - NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), - NewRed, PrevInChain); - } else if (IsOrdered) - NextInChain = NewRed; - else - NextInChain = State.Builder.CreateBinOp( - (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); - State.set(this, NextInChain, Part); - } -} - void VPReplicateRecipe::execute(VPTransformState &State) { Instruction *UI = getUnderlyingInstr(); if (State.Instance) { // Generate a single instance. + assert((State.VF.isScalar() || !isUniform()) && + "uniform recipe shouldn't be predicated"); assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); // Insert scalar instance packing it into a vector. @@ -9464,98 +9196,180 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); } -void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); - StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); - - assert((LI || SI) && "Invalid Load/Store instruction"); - assert((!SI || StoredValue) && "No stored value provided for widened store"); - assert((!LI || !StoredValue) && "Stored value provided for widened load"); +void VPWidenLoadRecipe::execute(VPTransformState &State) { + auto *LI = cast<LoadInst>(&Ingredient); Type *ScalarDataTy = getLoadStoreType(&Ingredient); - auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool CreateGatherScatter = !isConsecutive(); + bool CreateGather = !isConsecutive(); auto &Builder = State.Builder; - InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); - bool isMaskRequired = getMask(); - if (isMaskRequired) { - // Mask reversal is only needed for non-all-one (null) masks, as reverse of - // a null all-one mask is a null mask. 
- for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Mask = State.get(getMask(), Part); + State.setDebugLocFrom(getDebugLoc()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *NewLI; + Value *Mask = nullptr; + if (auto *VPMask = getMask()) { + // Mask reversal is only needed for non-all-one (null) masks, as reverse + // of a null all-one mask is a null mask. + Mask = State.get(VPMask, Part); if (isReverse()) Mask = Builder.CreateVectorReverse(Mask, "reverse"); - BlockInMaskParts[Part] = Mask; } + + Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather); + if (CreateGather) { + NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr, + "wide.masked.gather"); + } else if (Mask) { + NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask, + PoisonValue::get(DataTy), + "wide.masked.load"); + } else { + NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load"); + } + // Add metadata to the load, but setVectorValue to the reverse shuffle. + State.addMetadata(NewLI, LI); + if (Reverse) + NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); + State.set(this, NewLI, Part); } +} - // Handle Stores: - if (SI) { - State.setDebugLocFrom(SI->getDebugLoc()); +/// Use all-true mask for reverse rather than actual mask, as it avoids a +/// dependence w/o affecting the result. +static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, + Value *EVL, const Twine &Name) { + VectorType *ValTy = cast<VectorType>(Operand->getType()); + Value *AllTrueMask = + Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); + return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, + {Operand, AllTrueMask, EVL}, nullptr, Name); +} - for (unsigned Part = 0; Part < State.UF; ++Part) { - Instruction *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, Part); - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(getAddr(), Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); - } else { - if (isReverse()) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). 
- } - auto *VecPtr = State.get(getAddr(), Part); - if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - BlockInMaskParts[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); - } - State.addMetadata(NewSI, SI); - } - return; +void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + auto *LI = cast<LoadInst>(&Ingredient); + + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + bool CreateGather = !isConsecutive(); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + CallInst *NewLI; + Value *EVL = State.get(getEVL(), VPIteration(0, 0)); + Value *Addr = State.get(getAddr(), 0, !CreateGather); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) { + Mask = State.get(VPMask, 0); + if (isReverse()) + Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); + } else { + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + } + + if (CreateGather) { + NewLI = + Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, + nullptr, "wide.masked.gather"); + } else { + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + NewLI = cast<CallInst>(VBuilder.createVectorInstruction( + Instruction::Load, DataTy, Addr, "vp.op.load")); } + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + State.addMetadata(NewLI, LI); + Instruction *Res = NewLI; + if (isReverse()) + Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); + State.set(this, Res, 0); +} + +void VPWidenStoreRecipe::execute(VPTransformState &State) { + auto *SI = cast<StoreInst>(&Ingredient); + + VPValue *StoredVPValue = getStoredValue(); + bool CreateScatter = !isConsecutive(); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); - // Handle loads. - assert(LI && "Must have a load instruction"); - State.setDebugLocFrom(LI->getDebugLoc()); for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *NewLI; - if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(getAddr(), Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); - State.addMetadata(NewLI, LI); - } else { - auto *VecPtr = State.get(getAddr(), Part); - if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad( - DataTy, VecPtr, Alignment, BlockInMaskParts[Part], - PoisonValue::get(DataTy), "wide.masked.load"); - else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + Instruction *NewSI = nullptr; + Value *Mask = nullptr; + if (auto *VPMask = getMask()) { + // Mask reversal is only needed for non-all-one (null) masks, as reverse + // of a null all-one mask is a null mask. + Mask = State.get(VPMask, Part); + if (isReverse()) + Mask = Builder.CreateVectorReverse(Mask, "reverse"); + } - // Add metadata to the load, but setVectorValue to the reverse shuffle. 
- State.addMetadata(NewLI, LI); - if (Reverse) - NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); + Value *StoredVal = State.get(StoredVPValue, Part); + if (isReverse()) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). } + Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter); + if (CreateScatter) + NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask); + else if (Mask) + NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask); + else + NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment); + State.addMetadata(NewSI, SI); + } +} - State.set(getVPSingleValue(), NewLI, Part); +void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + auto *SI = cast<StoreInst>(&Ingredient); + + VPValue *StoredValue = getStoredValue(); + bool CreateScatter = !isConsecutive(); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + + CallInst *NewSI = nullptr; + Value *StoredVal = State.get(StoredValue, 0); + Value *EVL = State.get(getEVL(), VPIteration(0, 0)); + if (isReverse()) + StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) { + Mask = State.get(VPMask, 0); + if (isReverse()) + Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); + } else { + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); } + Value *Addr = State.get(getAddr(), 0, !CreateScatter); + if (CreateScatter) { + NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), + Intrinsic::vp_scatter, + {StoredVal, Addr, Mask, EVL}); + } else { + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + NewSI = cast<CallInst>(VBuilder.createVectorInstruction( + Instruction::Store, Type::getVoidTy(EVL->getContext()), + {StoredVal, Addr})); + } + NewSI->addParamAttr( + 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); + State.addMetadata(NewSI, SI); } // Determine how to lower the scalar epilogue, which depends on 1) optimising @@ -9658,7 +9472,7 @@ static bool processLoopInVPlanNativePath( bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout(), AddBranchWeights); + F->getDataLayout(), AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -9741,7 +9555,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, } // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 
- double ScalarC = *VF.ScalarCost.getValue(); + uint64_t ScalarC = *VF.ScalarCost.getValue(); if (ScalarC == 0) return true; @@ -9768,7 +9582,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC // // Now we can compute the minimum required trip count TC as - // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC + // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC // // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result @@ -9780,9 +9594,9 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, AssumedMinimumVscale = *VScale; IntVF *= AssumedMinimumVscale; } - double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; - double RtC = *CheckCost.getValue(); - double MinTC1 = RtC / (ScalarC - VecCOverVF); + uint64_t RtC = *CheckCost.getValue(); + uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); + uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); // Second, compute a minimum iteration count so that the cost of the // runtime checks is only a fraction of the total scalar loop cost. This @@ -9791,12 +9605,12 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // * TC. To bound the runtime check to be a fraction 1/X of the scalar // cost, compute // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC - double MinTC2 = RtC * 10 / ScalarC; + uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC); // Now pick the larger minimum. If it is not a multiple of VF and a scalar // epilogue is allowed, choose the next closest multiple of VF. This should // partly compensate for ignoring the epilogue cost. - uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); + uint64_t MinTC = std::max(MinTC1, MinTC2); if (SEL == CM_ScalarEpilogueAllowed) MinTC = alignTo(MinTC, IntVF); VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); @@ -9831,13 +9645,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); -#ifndef NDEBUG - const std::string DebugLocStr = getDebugLocString(L); -#endif /* NDEBUG */ - LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" << L->getHeader()->getParent()->getName() << "' from " - << DebugLocStr << "\n"); + << L->getLocStr() << "\n"); LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); @@ -10006,7 +9816,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getParent()->getDataLayout(), AddBranchWeights); + F->getDataLayout(), AddBranchWeights); if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. 
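The areRuntimeChecksProfitable hunks above switch the minimum-trip-count computation from floating-point to integer arithmetic, using divideCeil so the bound still rounds up while keeping floating-point math out of the cost model. A minimal standalone sketch of that computation follows (not part of the patch: the cost values, the vectorization factor, and the local divideCeil helper, which mirrors llvm::divideCeil, are assumptions chosen purely for illustration):

// Standalone illustration of the integer MinTC computation above.
// Cost numbers, IntVF and this divideCeil helper are made up for the example.
#include <algorithm>
#include <cstdint>
#include <iostream>

static uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

int main() {
  uint64_t ScalarC = 8; // cost of one scalar iteration
  uint64_t VecC = 20;   // cost of one vector iteration (covers IntVF lanes)
  uint64_t RtC = 35;    // one-off cost of the runtime checks
  uint64_t IntVF = 4;   // effective vectorization factor

  // MinTC1: smallest TC with RtC + VecC * (TC / VF) < ScalarC * TC,
  // i.e. VF * RtC / (ScalarC * VF - VecC) < TC (epilogue cost assumed 0).
  uint64_t Div = ScalarC * IntVF - VecC;
  uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);

  // MinTC2: smallest TC so the checks cost at most 1/10 of the scalar loop,
  // i.e. RtC * 10 / ScalarC < TC.
  uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC);

  // divideCeil(140, 12) = 12 and divideCeil(350, 8) = 44, so MinTC = 44.
  std::cout << "MinTC = " << std::max(MinTC1, MinTC2) << "\n";
  return 0;
}

For these assumed costs the sketch prints MinTC = 44; the 1/10 runtime-check bound dominates the break-even bound.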
@@ -10107,7 +9917,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { }); } else if (VectorizeLoop && !InterleaveLoop) { LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width - << ") in " << DebugLocStr << '\n'); + << ") in " << L->getLocStr() << '\n'); ORE->emit([&]() { return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, L->getStartLoc(), L->getHeader()) @@ -10115,7 +9925,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { }); } else if (VectorizeLoop && InterleaveLoop) { LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width - << ") in " << DebugLocStr << '\n'); + << ") in " << L->getLocStr() << '\n'); LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } @@ -10130,7 +9940,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VPlan &BestPlan = + UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan(); + assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) && + "VPlan cost model and legacy cost model disagreed"); LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { @@ -10154,9 +9967,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); + std::unique_ptr<VPlan> BestMainPlan( + LVP.getBestPlanFor(EPI.MainLoopVF).duplicate()); const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( - EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); + EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true); ++LoopsVectorized; // Second pass vectorizes the epilogue and adjusts the control flow @@ -10181,9 +9995,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { EpilogILV.setTripCount(MainILV.getTripCount()); for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); - auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn( + auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn( ExpandedSCEVs.find(ExpandR->getSCEV())->second); ExpandR->replaceAllUsesWith(ExpandedVal); + if (BestEpiPlan.getTripCount() == ExpandR) + BestEpiPlan.resetTripCount(ExpandedVal); ExpandR->eraseFromParent(); } @@ -10197,9 +10013,19 @@ bool LoopVectorizePass::processLoop(Loop *L) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { - ResumeV = ReductionResumeValues - .find(&ReductionPhi->getRecurrenceDescriptor()) - ->second; + const RecurrenceDescriptor &RdxDesc = + ReductionPhi->getRecurrenceDescriptor(); + RecurKind RK = RdxDesc.getRecurrenceKind(); + ResumeV = ReductionResumeValues.find(&RdxDesc)->second; + if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + // VPReductionPHIRecipes for AnyOf reductions expect a boolean as + // start value; compare the final value from the main vector loop + // to the start value. 
+      IRBuilder<> Builder(
+          cast<Instruction>(ResumeV)->getParent()->getFirstNonPHI());
+      ResumeV = Builder.CreateICmpNE(ResumeV,
+                                     RdxDesc.getRecurrenceStartValue());
+        }
       } else {
         // Create induction resume values for both widened pointer and
         // integer/fp inductions and update the start value of the induction
@@ -10220,10 +10046,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
             {EPI.MainLoopIterationCountCheck});
       }
       assert(ResumeV && "Must have a resume value");
-      VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
+      VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
       cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
     }

+    assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
+           "DT not preserved correctly");
     LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                     DT, true, &ExpandedSCEVs);
     ++LoopsEpilogueVectorized;
@@ -10231,12 +10059,22 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (!MainILV.areSafetyChecksAdded())
       DisableRuntimeUnroll = true;
   } else {
-    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+    ElementCount Width = VF.Width;
+    VPlan &BestPlan =
+        UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan();
+    if (!UseLegacyCostModel) {
+      assert(size(BestPlan.vectorFactors()) == 1 &&
+             "Plan should have a single VF");
+      Width = *BestPlan.vectorFactors().begin();
+      LLVM_DEBUG(dbgs()
+                 << "VF picked by VPlan cost model: " << Width << "\n");
+      assert(VF.Width == Width &&
+             "VPlan cost model and legacy cost model disagreed");
+    }
+    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
                            VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, PSI,
                            Checks);
-
-    VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-    LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+    LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
     ++LoopsVectorized;

     // Add metadata to disable runtime unrolling a scalar loop when there
@@ -10376,15 +10214,10 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
       RemoveRedundantDbgInstrs(&BB);
   }

-  // We currently do not preserve loopinfo/dominator analyses with outer loop
-  // vectorization. Until this is addressed, mark these analyses as preserved
-  // only for non-VPlan-native path.
-  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
-  if (!EnableVPlanNativePath) {
-    PA.preserve<LoopAnalysis>();
-    PA.preserve<DominatorTreeAnalysis>();
-    PA.preserve<ScalarEvolutionAnalysis>();
-  }
+  PA.preserve<LoopAnalysis>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<ScalarEvolutionAnalysis>();
+  PA.preserve<LoopAccessAnalysis>();

   if (Result.MadeCFGChange) {
     // Making CFG changes likely means a loop got vectorized. Indicate that
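For context on the VPWidenLoadEVLRecipe and VPWidenStoreEVLRecipe hunks earlier in this diff: in the consecutive (non-gather/scatter) case they go through VectorBuilder, which attaches the mask and the explicit vector length and emits an llvm.vp.* call instead of a plain masked load or store. The standalone sketch below is illustrative only (the function name, element type, scalable VF of 4, and alignment of 4 are assumptions, not taken from the patch); it emits the same kind of EVL-predicated load with an all-true mask:

// Minimal sketch of emitting an EVL-predicated load via VectorBuilder.
// Assumptions: <vscale x 4 x i32> data, align 4, all-true mask; this is
// illustrative only and not code from the patch.
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("evl-demo", Ctx);

  // void demo(ptr %addr, i32 %evl)
  auto *FnTy = FunctionType::get(
      Type::getVoidTy(Ctx),
      {PointerType::getUnqual(Ctx), Type::getInt32Ty(Ctx)}, false);
  Function *Fn = Function::Create(FnTy, Function::ExternalLinkage, "demo", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", Fn);
  IRBuilder<> Builder(BB);

  Value *Addr = Fn->getArg(0);
  Value *EVL = Fn->getArg(1);

  // <vscale x 4 x i32> data type and an all-true mask, matching the
  // "no mask" path of the EVL load recipe.
  auto EC = ElementCount::getScalable(4);
  auto *DataTy = VectorType::get(Type::getInt32Ty(Ctx), EC);
  Value *Mask = Builder.CreateVectorSplat(EC, Builder.getTrue());

  // Consecutive (non-gather) case: VectorBuilder emits an llvm.vp.load call
  // carrying the mask and the explicit vector length.
  VectorBuilder VBuilder(Builder);
  VBuilder.setEVL(EVL).setMask(Mask);
  auto *Load = cast<CallInst>(VBuilder.createVectorInstruction(
      Instruction::Load, DataTy, Addr, "vp.op.load"));
  Load->addParamAttr(0, Attribute::getWithAlignment(Ctx, Align(4)));

  Builder.CreateRetVoid();
  M.print(outs(), nullptr);
  return 0;
}

Printing the module should show a call along the lines of llvm.vp.load.nxv4i32.p0(%addr, <all-true mask>, i32 %evl), with the EVL as the trailing operand rather than folded into the mask.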
