From e3b557809604d036af6e00c60f012c2025b59a5e Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sat, 11 Feb 2023 13:38:04 +0100 Subject: Vendor import of llvm-project main llvmorg-16-init-18548-gb0daacf58f41, the last commit before the upstream release/17.x branch was created. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1683 ++++++++++++----------- 1 file changed, 916 insertions(+), 767 deletions(-) (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp') diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 238b074089aa..a28099d8ba7d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -65,8 +65,6 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/None.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -142,6 +140,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include #include +#include #include #include #include @@ -362,10 +361,15 @@ cl::opt llvm::EnableLoopVectorization( "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); -cl::opt PrintVPlansInDotFormat( - "vplan-print-in-dot-format", cl::init(false), cl::Hidden, +static cl::opt PrintVPlansInDotFormat( + "vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans")); +static cl::opt ForceSafeDivisor( + "force-widen-divrem-via-safe-divisor", cl::Hidden, + cl::desc( + "Override cost based safe divisor widening for div/rem instructions")); + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -396,8 +400,9 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { /// 1) Returns exact trip count if it is known. /// 2) Returns expected trip count according to profile data if any. /// 3) Returns upper bound estimate if it is known. -/// 4) Returns None if all of the above failed. -static Optional getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { +/// 4) Returns std::nullopt if all of the above failed. +static std::optional getSmallBestKnownTC(ScalarEvolution &SE, + Loop *L) { // Check if exact trip count is known. if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) return ExpectedTC; @@ -405,17 +410,19 @@ static Optional getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { // Check if there is an expected trip count available from profile data. if (LoopVectorizeWithBlockFrequency) if (auto EstimatedTC = getLoopEstimatedTripCount(L)) - return EstimatedTC; + return *EstimatedTC; // Check if upper bound estimate is known. if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) return ExpectedTC; - return None; + return std::nullopt; } +namespace { // Forward declare GeneratedRTChecks. class GeneratedRTChecks; +} // namespace namespace llvm { @@ -473,10 +480,6 @@ public: /// complex control flow around the loops. virtual std::pair createVectorizedLoopSkeleton(); - /// Widen a single call instruction within the innermost loop. - void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands, - VPTransformState &State); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -493,7 +496,8 @@ public: /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p /// Instr's operands. - void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, + void scalarizeInstruction(const Instruction *Instr, + VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, VPTransformState &State); @@ -529,6 +533,17 @@ public: // generated by fixReduction. PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); + /// Create a new phi node for the induction variable \p OrigPhi to resume + /// iteration count in the scalar epilogue, from where the vectorized loop + /// left off. In cases where the loop skeleton is more complicated (eg. + /// epilogue vectorization) and the resume values can come from an additional + /// bypass block, the \p AdditionalBypass pair provides information about the + /// bypass block and the end value on the edge from bypass to this loop. + PHINode *createInductionResumeValue( + PHINode *OrigPhi, const InductionDescriptor &ID, + ArrayRef BypassBlocks, + std::pair AdditionalBypass = {nullptr, nullptr}); + protected: friend class LoopVectorizationPlanner; @@ -552,7 +567,7 @@ protected: /// Create the exit value of first order recurrences in the middle block and /// update their users. - void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, + void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State); /// Create code for the loop exit value of the reduction. @@ -611,7 +626,7 @@ protected: /// Complete the loop skeleton by adding debug MDs, creating appropriate /// conditional branches in the middle block, preparing the builder and /// running the verifier. Return the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID); + BasicBlock *completeLoopSkeleton(); /// Collect poison-generating recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those @@ -643,9 +658,6 @@ protected: /// Dominator Tree. DominatorTree *DT; - /// Alias Analysis. - AAResults *AA; - /// Target Library Info. const TargetLibraryInfo *TLI; @@ -951,6 +963,27 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { return VF.isScalable() ? B.CreateVScale(EC) : EC; } +const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) { + const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); + assert(!isa(BackedgeTakenCount) && "Invalid loop count"); + + ScalarEvolution &SE = *PSE.getSE(); + + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) > + IdxTy->getPrimitiveSizeInBits()) + BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy); + BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); + + // Get the total trip count from the count by adding 1. 
+ return SE.getAddExpr(BackedgeTakenCount, + SE.getOne(BackedgeTakenCount->getType())); +} + static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, ElementCount VF) { assert(FTy->isFloatingPointTy() && "Expected floating point type!"); @@ -1037,27 +1070,25 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( // Add new definitions to the worklist. for (VPValue *operand : CurRec->operands()) - if (VPDef *OpDef = operand->getDef()) - Worklist.push_back(cast(OpDef)); + if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) + Worklist.push_back(OpDef); } }); // Traverse all the recipes in the VPlan and collect the poison-generating // recipes in the backward slice starting at the address of a VPWidenRecipe or // VPInterleaveRecipe. - auto Iter = depth_first( - VPBlockRecursiveTraversalWrapper(State.Plan->getEntry())); + auto Iter = vp_depth_first_deep(State.Plan->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &Recipe : *VPBB) { if (auto *WidenRec = dyn_cast(&Recipe)) { Instruction &UnderlyingInstr = WidenRec->getIngredient(); - VPDef *AddrDef = WidenRec->getAddr()->getDef(); + VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); if (AddrDef && WidenRec->isConsecutive() && Legal->blockNeedsPredication(UnderlyingInstr.getParent())) - collectPoisonGeneratingInstrsInBackwardSlice( - cast(AddrDef)); + collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); } else if (auto *InterleaveRec = dyn_cast(&Recipe)) { - VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); + VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); if (AddrDef) { // Check if any member of the interleave group needs predication. const InterleaveGroup *InterGroup = @@ -1072,8 +1103,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( } if (NeedPredication) - collectPoisonGeneratingInstrsInBackwardSlice( - cast(AddrDef)); + collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); } } } @@ -1182,7 +1212,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); + unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1435,47 +1465,49 @@ public: })); } - /// Returns true if \p I is an instruction that will be scalarized with - /// predication when vectorizing \p I with vectorization factor \p VF. Such - /// instructions include conditional stores and instructions that may divide - /// by zero. - bool isScalarWithPredication(Instruction *I, ElementCount VF) const; - - // Returns true if \p I is an instruction that will be predicated either - // through scalar predication or masked load/store or masked gather/scatter. - // \p VF is the vectorization factor that will be used to vectorize \p I. - // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I, ElementCount VF) { - // When we know the load's address is loop invariant and the instruction - // in the original scalar loop was unconditionally executed then we - // don't need to mark it as a predicated instruction. 
Tail folding may - // introduce additional predication, but we're guaranteed to always have - // at least one active lane. We call Legal->blockNeedsPredication here - // because it doesn't query tail-folding. - if (Legal->isUniformMemOp(*I) && isa(I) && - !Legal->blockNeedsPredication(I->getParent())) + /// Given costs for both strategies, return true if the scalar predication + /// lowering should be used for div/rem. This incorporates an override + /// option so it is not simply a cost comparison. + bool isDivRemScalarWithPredication(InstructionCost ScalarCost, + InstructionCost SafeDivisorCost) const { + switch (ForceSafeDivisor) { + case cl::BOU_UNSET: + return ScalarCost < SafeDivisorCost; + case cl::BOU_TRUE: return false; - if (!blockNeedsPredicationForAnyReason(I->getParent())) - return false; - // Loads and stores that need some form of masked operation are predicated - // instructions. - if (isa(I) || isa(I)) - return Legal->isMaskRequired(I); - return isScalarWithPredication(I, VF); + case cl::BOU_FALSE: + return true; + }; + llvm_unreachable("impossible case value"); } + /// Returns true if \p I is an instruction which requires predication and + /// for which our chosen predication strategy is scalarization (i.e. we + /// don't have an alternate strategy such as masking available). + /// \p VF is the vectorization factor that will be used to vectorize \p I. + bool isScalarWithPredication(Instruction *I, ElementCount VF) const; + + /// Returns true if \p I is an instruction that needs to be predicated + /// at runtime. The result is independent of the predication mechanism. + /// Superset of instructions that return true for isScalarWithPredication. + bool isPredicatedInst(Instruction *I) const; + + /// Return the costs for our two available strategies for lowering a + /// div/rem operation which requires speculating at least one lane. + /// First result is for scalarization (will be invalid for scalable + /// vectors); second is for the safe-divisor strategy. + std::pair + getDivRemSpeculationCost(Instruction *I, + ElementCount VF) const; + /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool - memoryInstructionCanBeWidened(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool - interleavedAccessCanBeWidened(Instruction *I, - ElementCount VF = ElementCount::getFixed(1)); + bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1567,7 +1599,7 @@ public: /// Convenience function that returns the value of vscale_range iff /// vscale_range.min == vscale_range.max or otherwise returns the value /// returned by the corresponding TLI method. - Optional getVScaleForTuning() const; + std::optional getVScaleForTuning() const; private: unsigned NumPredStores = 0; @@ -1623,7 +1655,7 @@ private: /// Return the cost of instructions in an inloop reduction pattern, if I is /// part of that pattern. - Optional + std::optional getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind); @@ -1651,8 +1683,8 @@ private: /// Estimate the overhead of scalarizing an instruction. 
This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Instruction *I, - ElementCount VF) const; + InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, + TTI::TargetCostKind CostKind) const; /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. @@ -1719,8 +1751,9 @@ private: /// scalarize and their scalar costs are collected in \p ScalarCosts. A /// non-negative return value implies the expression will be scalarized. /// Currently, only single-use chains are considered for scalarization. - int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, - ElementCount VF); + InstructionCost computePredInstDiscount(Instruction *PredInst, + ScalarCostsTy &ScalarCosts, + ElementCount VF); /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in @@ -1835,6 +1868,7 @@ public: }; } // end namespace llvm +namespace { /// Helper struct to manage generating runtime checks for vectorization. /// /// The runtime checks are created up-front in temporary blocks to allow better @@ -1914,7 +1948,7 @@ public: if (DiffChecks) { Value *RuntimeVF = nullptr; MemRuntimeCheckCond = addDiffRuntimeChecks( - MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, + MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { if (!RuntimeVF) RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); @@ -2099,6 +2133,7 @@ public: return MemCheckBlock; } }; +} // namespace // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd @@ -2194,18 +2229,15 @@ struct LoopVectorize : public FunctionPass { auto *BFI = &getAnalysis().getBFI(); auto *TLIP = getAnalysisIfAvailable(); auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; - auto *AA = &getAnalysis().getAAResults(); auto *AC = &getAnalysis().getAssumptionCache(F); - auto *LAA = &getAnalysis(); + auto &LAIs = getAnalysis().getLAIs(); auto *DB = &getAnalysis().getDemandedBits(); auto *ORE = &getAnalysis().getORE(); auto *PSI = &getAnalysis().getPSI(); - std::function GetLAA = - [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; - - return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, - GetLAA, *ORE, PSI).MadeAnyChange; + return Impl + .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI) + .MadeAnyChange; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -2215,7 +2247,6 @@ struct LoopVectorize : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -2321,12 +2352,16 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step, const InductionDescriptor &ID, VPValue *Def, VPTransformState &State) { IRBuilderBase &Builder = State.Builder; - // We shouldn't have to build scalar steps if we aren't vectorizing. - assert(State.VF.isVector() && "VF should be greater than one"); - // Get the value type and ensure it and the step have the same integer type. + + // Ensure step has the same type as that of scalar IV. 
Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); - assert(ScalarIVTy == Step->getType() && - "Val and Step should have the same type"); + if (ScalarIVTy != Step->getType()) { + // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to + // avoid separate truncate here. + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + Step = State.Builder.CreateTrunc(Step, ScalarIVTy); + } // We build scalar steps for both integer and floating-point induction // variables. Here, we determine the kind of arithmetic we will perform. @@ -2343,7 +2378,6 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step, // Determine the number of scalars we need to generate for each unroll // iteration. bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); - unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); // Compute the scalar steps and save the results in State. Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), ScalarIVTy->getScalarSizeInBits()); @@ -2357,7 +2391,17 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step, SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); } - for (unsigned Part = 0; Part < State.UF; ++Part) { + unsigned StartPart = 0; + unsigned EndPart = State.UF; + unsigned StartLane = 0; + unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); + if (State.Instance) { + StartPart = State.Instance->Part; + EndPart = StartPart + 1; + StartLane = State.Instance->Lane.getKnownLane(); + EndLane = StartLane + 1; + } + for (unsigned Part = StartPart; Part < EndPart; ++Part) { Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); if (!FirstLaneOnly && State.VF.isScalable()) { @@ -2376,7 +2420,7 @@ static void buildScalarSteps(Value *ScalarIV, Value *Step, if (ScalarIVTy->isFloatingPointTy()) StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); - for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) { Value *StartIdx = Builder.CreateBinOp( AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value @@ -2415,8 +2459,14 @@ static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, const InductionDescriptor &ID) { - assert(Index->getType()->getScalarType() == Step->getType() && - "Index scalar type does not match StepValue type"); + Type *StepTy = Step->getType(); + Value *CastedIndex = StepTy->isIntegerTy() + ? B.CreateSExtOrTrunc(Index, StepTy) + : B.CreateCast(Instruction::SIToFP, Index, StepTy); + if (CastedIndex != Index) { + CastedIndex->setName(CastedIndex->getName() + ".cast"); + Index = CastedIndex; + } // Note: the IR at this point is broken. We cannot use SE to create any new // SCEV and then expand it, hoping that SCEV's simplification will give us @@ -2682,6 +2732,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( for (unsigned Part = 0; Part < UF; Part++) { // Collect the stored vector from each member. 
SmallVector StoredVecs; + unsigned StoredIdx = 0; for (unsigned i = 0; i < InterleaveFactor; i++) { assert((Group->getMember(i) || MaskForGaps) && "Fail to get a member from an interleaved store group"); @@ -2694,7 +2745,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( continue; } - Value *StoredVec = State.get(StoredValues[i], Part); + Value *StoredVec = State.get(StoredValues[StoredIdx], Part); + ++StoredIdx; if (Group->isReverse()) StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); @@ -2738,7 +2790,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, +void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPIteration &Instance, bool IfPredicateInstr, @@ -2772,11 +2824,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (auto &I : enumerate(RepRecipe->operands())) { + for (const auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; VPValue *Operand = I.value(); - VPReplicateRecipe *OperandR = dyn_cast(Operand); - if (OperandR && OperandR->isUniform()) + if (vputils::isUniformAfterVectorization(Operand)) InputInstance.Lane = VPLane::getFirstLane(); Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } @@ -2803,33 +2854,15 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { assert(InsertBlock); IRBuilder<> Builder(InsertBlock->getTerminator()); // Find the loop boundaries. - ScalarEvolution *SE = PSE.getSE(); - const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); - assert(!isa(BackedgeTakenCount) && - "Invalid loop count"); - Type *IdxTy = Legal->getWidestInductionType(); assert(IdxTy && "No type for induction"); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > - IdxTy->getPrimitiveSizeInBits()) - BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); - BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); - - // Get the total trip count from the count by adding 1. - const SCEV *ExitCount = SE->getAddExpr( - BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE); const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, DL, "induction"); + SCEVExpander Exp(*PSE.getSE(), DL, "induction"); // Count holds the overall loop count (N). TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), @@ -3080,7 +3113,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { // 1) If we know that we must execute the scalar epilogue, emit an // unconditional branch. // 2) Otherwise, we must have a single unique exit block (due to how we - // implement the multiple exit case). In this case, set up a conditonal + // implement the multiple exit case). 
In this case, set up a conditional // branch from the middle block to the loop scalar preheader, and the // exit block. completeLoopSkeleton will update the condition to use an // iteration check, if required to decide whether to execute the remainder. @@ -3101,88 +3134,87 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); } -void InnerLoopVectorizer::createInductionResumeValues( +PHINode *InnerLoopVectorizer::createInductionResumeValue( + PHINode *OrigPhi, const InductionDescriptor &II, + ArrayRef BypassBlocks, std::pair AdditionalBypass) { - assert(((AdditionalBypass.first && AdditionalBypass.second) || - (!AdditionalBypass.first && !AdditionalBypass.second)) && - "Inconsistent information about additional bypass."); - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); assert(VectorTripCount && "Expected valid arguments"); - // We are going to resume the execution of the scalar loop. - // Go over all of the induction variables that we found and fix the - // PHIs that are left in the scalar version of the loop. - // The starting values of PHI nodes depend on the counter of the last - // iteration in the vectorized loop. - // If we come from a bypass edge then we need to start from the original - // start value. + Instruction *OldInduction = Legal->getPrimaryInduction(); - for (auto &InductionEntry : Legal->getInductionVars()) { - PHINode *OrigPhi = InductionEntry.first; - InductionDescriptor II = InductionEntry.second; + Value *&EndValue = IVEndValues[OrigPhi]; + Value *EndValueFromAdditionalBypass = AdditionalBypass.second; + if (OrigPhi == OldInduction) { + // We know what the end value is. + EndValue = VectorTripCount; + } else { + IRBuilder<> B(LoopVectorPreHeader->getTerminator()); - Value *&EndValue = IVEndValues[OrigPhi]; - Value *EndValueFromAdditionalBypass = AdditionalBypass.second; - if (OrigPhi == OldInduction) { - // We know what the end value is. - EndValue = VectorTripCount; - } else { - IRBuilder<> B(LoopVectorPreHeader->getTerminator()); + // Fast-math-flags propagate from the original induction instruction. + if (II.getInductionBinOp() && isa(II.getInductionBinOp())) + B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - // Fast-math-flags propagate from the original induction instruction. - if (II.getInductionBinOp() && isa(II.getInductionBinOp())) - B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); + Value *Step = + CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); + EndValue = + emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II); + EndValue->setName("ind.end"); - Type *StepType = II.getStep()->getType(); - Instruction::CastOps CastOp = - CastInst::getCastOpcode(VectorTripCount, true, StepType, true); - Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); + // Compute the end value for the additional bypass (if applicable). + if (AdditionalBypass.first) { + B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); - EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); - EndValue->setName("ind.end"); - - // Compute the end value for the additional bypass (if applicable). 
- if (AdditionalBypass.first) { - B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); - CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, - StepType, true); - Value *Step = - CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); - VTC = - B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); - EndValueFromAdditionalBypass = - emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); - EndValueFromAdditionalBypass->setName("ind.end"); - } + EndValueFromAdditionalBypass = emitTransformedIndex( + B, AdditionalBypass.second, II.getStartValue(), Step, II); + EndValueFromAdditionalBypass->setName("ind.end"); } + } - // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = - PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getTerminator()); - // Copy original phi DL over to the new one. - BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); + // Create phi nodes to merge from the backedge-taken check block. + PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); + // Copy original phi DL over to the new one. + BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); - // Fix the scalar body counter (PHI node). - // The old induction's phi node in the scalar body needs the truncated - // value. - for (BasicBlock *BB : LoopBypassBlocks) - BCResumeVal->addIncoming(II.getStartValue(), BB); + // Fix the scalar body counter (PHI node). + // The old induction's phi node in the scalar body needs the truncated + // value. + for (BasicBlock *BB : BypassBlocks) + BCResumeVal->addIncoming(II.getStartValue(), BB); - if (AdditionalBypass.first) - BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, - EndValueFromAdditionalBypass); + if (AdditionalBypass.first) + BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, + EndValueFromAdditionalBypass); + return BCResumeVal; +} +void InnerLoopVectorizer::createInductionResumeValues( + std::pair AdditionalBypass) { + assert(((AdditionalBypass.first && AdditionalBypass.second) || + (!AdditionalBypass.first && !AdditionalBypass.second)) && + "Inconsistent information about additional bypass."); + // We are going to resume the execution of the scalar loop. + // Go over all of the induction variables that we found and fix the + // PHIs that are left in the scalar version of the loop. + // The starting values of PHI nodes depend on the counter of the last + // iteration in the vectorized loop. + // If we come from a bypass edge then we need to start from the original + // start value. + for (const auto &InductionEntry : Legal->getInductionVars()) { + PHINode *OrigPhi = InductionEntry.first; + const InductionDescriptor &II = InductionEntry.second; + PHINode *BCResumeVal = createInductionResumeValue( + OrigPhi, II, LoopBypassBlocks, AdditionalBypass); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } } -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { // The trip counts should be cached by now. 
Value *Count = getOrCreateTripCount(LoopVectorPreHeader); Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); @@ -3251,18 +3283,6 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { ... */ - // Get the metadata of the original loop before it gets modified. - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - // Workaround! Compute the trip count of the original loop and cache it - // before we start modifying the CFG. This code has a systemic problem - // wherein it tries to run analysis over partially constructed IR; this is - // wrong, and not simply for SCEV. The trip count of the original loop - // simply happens to be prone to hitting this in practice. In theory, we - // can hit the same issue for any SCEV, or ValueTracking query done during - // mutation. See PR49900. - getOrCreateTripCount(OrigLoop->getLoopPreheader()); - // Create an empty vector loop, and prepare basic blocks for the runtime // checks. createVectorLoopSkeleton(""); @@ -3286,7 +3306,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Emit phis for the new starting index of the scalar loop. createInductionResumeValues(); - return {completeLoopSkeleton(OrigLoopID), nullptr}; + return {completeLoopSkeleton(), nullptr}; } // Fix up external users of the induction variable. At this point, we are @@ -3334,17 +3354,11 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, Value *CountMinusOne = B.CreateSub( VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); - Value *CMO = - !II.getStep()->getType()->isIntegerTy() - ? B.CreateCast(Instruction::SIToFP, CountMinusOne, - II.getStep()->getType()) - : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); - CMO->setName("cast.cmo"); - + CountMinusOne->setName("cmo"); Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), VectorHeader->getTerminator()); Value *Escape = - emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); + emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -3429,8 +3443,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // to be vectors, so we need to extract individual elements from there, // execute VF scalar calls, and then gather the result into the vector return // value. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost ScalarCallCost = - TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); + TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind); if (VF.isScalar()) return ScalarCallCost; @@ -3441,7 +3456,8 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); + InstructionCost ScalarizationCost = + getScalarizationOverhead(CI, VF, CostKind); InstructionCost Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; @@ -3457,7 +3473,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, // If the corresponding vector cost is cheaper, return its cost. InstructionCost VectorCallCost = - TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); + TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind); if (VectorCallCost < Cost) { NeedToScalarize = false; Cost = VectorCallCost; @@ -3672,7 +3688,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // edge. 
// Fix-up external users of the induction variables. - for (auto &Entry : Legal->getInductionVars()) + for (const auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), IVEndValues[Entry.first], LoopMiddleBlock, @@ -3682,7 +3698,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated // in the exit block, so update the builder. State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI()); - for (auto &KV : Plan.getLiveOuts()) + for (const auto &KV : Plan.getLiveOuts()) KV.second->fixPhi(Plan, State); for (Instruction *PI : PredicatedInstructions) @@ -3722,11 +3738,11 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { if (auto *ReductionPhi = dyn_cast(&R)) fixReduction(ReductionPhi, State); else if (auto *FOR = dyn_cast(&R)) - fixFirstOrderRecurrence(FOR, State); + fixFixedOrderRecurrence(FOR, State); } } -void InnerLoopVectorizer::fixFirstOrderRecurrence( +void InnerLoopVectorizer::fixFixedOrderRecurrence( VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { // This is the second phase of vectorizing first-order recurrences. An // overview of the transformation is described below. Suppose we have the @@ -4019,7 +4035,7 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR, // We know that the loop is in LCSSA form. We need to update the PHI nodes // in the exit blocks. See comment on analogous loop in - // fixFirstOrderRecurrence for a more complete explaination of the logic. + // fixFixedOrderRecurrence for a more complete explaination of the logic. if (!Cost->requiresScalarEpilogue(VF)) for (PHINode &LCSSAPhi : LoopExitBlock->phis()) if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { @@ -4146,8 +4162,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, VPTransformState &State) { - auto Iter = depth_first( - VPBlockRecursiveTraversalWrapper(Plan.getEntry())); + auto Iter = vp_depth_first_deep(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &P : VPBB->phis()) { VPWidenPHIRecipe *VPPhi = dyn_cast(&P); @@ -4170,78 +4185,6 @@ bool InnerLoopVectorizer::useOrderedReductions( return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def, - VPUser &ArgOperands, - VPTransformState &State) { - assert(!isa(CI) && - "DbgInfoIntrinsic should have been dropped during VPlan construction"); - State.setDebugLocFromInst(&CI); - - SmallVector Tys; - for (Value *ArgOperand : CI.args()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); - - Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI); - - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize = false; - InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize); - InstructionCost IntrinsicCost = - ID ? 
Cost->getVectorIntrinsicCost(&CI, VF) : 0; - bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - assert((UseVectorIntrinsic || !NeedToScalarize) && - "Instruction should be scalarized elsewhere."); - assert((IntrinsicCost.isValid() || CallCost.isValid()) && - "Either the intrinsic cost or vector call cost must be valid"); - - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector TysForDecl = {CI.getType()}; - SmallVector Args; - for (auto &I : enumerate(ArgOperands.operands())) { - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - Value *Arg; - if (!UseVectorIntrinsic || - !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) - Arg = State.get(I.value(), Part); - else - Arg = State.get(I.value(), VPIteration(0, 0)); - if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) - TysForDecl.push_back(Arg->getType()); - Args.push_back(Arg); - } - - Function *VectorF; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - if (VF.isVector()) - TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF); - Module *M = State.Builder.GetInsertBlock()->getModule(); - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - assert(VectorF && "Can't retrieve vector intrinsic."); - } else { - // Use vector version of the function call. - const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/); -#ifndef NDEBUG - assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr && - "Can't create vector function."); -#endif - VectorF = VFDatabase(CI).getVectorizedFunction(Shape); - } - SmallVector OpBundles; - CI.getOperandBundlesAsDefs(OpBundles); - CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); - - if (isa(V)) - V->copyFastMathFlags(&CI); - - State.set(Def, V, Part); - State.addMetadata(V, &CI); - } -} - void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -4350,8 +4293,10 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // induction variable when the PHI user is scalarized. auto ForcedScalar = ForcedScalars.find(VF); if (ForcedScalar != ForcedScalars.end()) - for (auto *I : ForcedScalar->second) + for (auto *I : ForcedScalar->second) { + LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); Worklist.insert(I); + } // Expand the worklist by looking through any bitcasts and getelementptr // instructions we've already identified as scalar. This is similar to the @@ -4376,7 +4321,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // An induction variable will remain scalar if all users of the induction // variable and induction variable update remain scalar. - for (auto &Induction : Legal->getInductionVars()) { + for (const auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); @@ -4429,15 +4374,16 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { bool LoopVectorizationCostModel::isScalarWithPredication( Instruction *I, ElementCount VF) const { - if (!blockNeedsPredicationForAnyReason(I->getParent())) + if (!isPredicatedInst(I)) return false; + + // Do we have a non-scalar lowering for this predicated + // instruction? No - it is scalar with predication. 
switch(I->getOpcode()) { default: - break; + return true; case Instruction::Load: case Instruction::Store: { - if (!Legal->isMaskRequired(I)) - return false; auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); Type *VTy = Ty; @@ -4452,12 +4398,119 @@ bool LoopVectorizationCostModel::isScalarWithPredication( case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: + case Instruction::URem: { + // We have the option to use the safe-divisor idiom to avoid predication. + // The cost based decision here will always select safe-divisor for + // scalable vectors as scalarization isn't legal. + const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); + return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); + } + } +} + +bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { + if (!blockNeedsPredicationForAnyReason(I->getParent())) + return false; + + // Can we prove this instruction is safe to unconditionally execute? + // If not, we must use some form of predication. + switch(I->getOpcode()) { + default: + return false; + case Instruction::Load: + case Instruction::Store: { + if (!Legal->isMaskRequired(I)) + return false; + // When we know the load's address is loop invariant and the instruction + // in the original scalar loop was unconditionally executed then we + // don't need to mark it as a predicated instruction. Tail folding may + // introduce additional predication, but we're guaranteed to always have + // at least one active lane. We call Legal->blockNeedsPredication here + // because it doesn't query tail-folding. For stores, we need to prove + // both speculation safety (which follows from the same argument as loads), + // but also must prove the value being stored is correct. The easiest + // form of the later is to require that all values stored are the same. + if (Legal->isUniformMemOp(*I) && + (isa(I) || + (isa(I) && + TheLoop->isLoopInvariant(cast(I)->getValueOperand()))) && + !Legal->blockNeedsPredication(I->getParent())) + return false; + return true; + } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: case Instruction::URem: // TODO: We can use the loop-preheader as context point here and get // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); } - return false; +} + +std::pair +LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, + ElementCount VF) const { + assert(I->getOpcode() == Instruction::UDiv || + I->getOpcode() == Instruction::SDiv || + I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::URem); + assert(!isSafeToSpeculativelyExecute(I)); + + const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + // Scalarization isn't legal for scalable vector types + InstructionCost ScalarizationCost = InstructionCost::getInvalid(); + if (!VF.isScalable()) { + // Get the scalarization cost and scale this amount by the probability of + // executing the predicated block. If the instruction is not predicated, + // we fall through to the next case. + ScalarizationCost = 0; + + // These instructions have a non-void type, so account for the phi nodes + // that we will create. This cost is likely to be zero. The phi node + // cost, if any, should be scaled by the block probability because it + // models a copy at the end of each predicated block. + ScalarizationCost += VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, CostKind); + + // The cost of the non-predicated instruction. 
+ ScalarizationCost += VF.getKnownMinValue() * + TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind); + + // The cost of insertelement and extractelement instructions needed for + // scalarization. + ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); + + // Scale the cost by the probability of executing the predicated blocks. + // This assumes the predicated block for each vector lane is equally + // likely. + ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb(); + } + InstructionCost SafeDivisorCost = 0; + + auto *VecTy = ToVectorTy(I->getType(), VF); + + // The cost of the select guard to ensure all lanes are well defined + // after we speculate above any internal control flow. + SafeDivisorCost += TTI.getCmpSelInstrCost( + Instruction::Select, VecTy, + ToVectorTy(Type::getInt1Ty(I->getContext()), VF), + CmpInst::BAD_ICMP_PREDICATE, CostKind); + + // Certain instructions can be cheaper to vectorize if they have a constant + // second vector operand. One example of this are shifts on x86. + Value *Op2 = I->getOperand(1); + auto Op2Info = TTI.getOperandInfo(Op2); + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) + Op2Info.Kind = TargetTransformInfo::OK_UniformValue; + + SmallVector Operands(I->operand_values()); + SafeDivisorCost += TTI.getArithmeticInstrCost( + I->getOpcode(), VecTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + Op2Info, Operands, I); + return {ScalarizationCost, SafeDivisorCost}; } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( @@ -4610,17 +4663,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) addToWorklistIfAllowed(Cmp); + // Return true if all lanes perform the same memory operation, and we can + // thus chose to execute only one. + auto isUniformMemOpUse = [&](Instruction *I) { + if (!Legal->isUniformMemOp(*I)) + return false; + if (isa(I)) + // Loading the same address always produces the same result - at least + // assuming aliasing and ordering which have already been checked. + return true; + // Storing the same value on every iteration. + return TheLoop->isLoopInvariant(cast(I)->getValueOperand()); + }; + auto isUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); - // A uniform memory op is itself uniform. We exclude uniform stores - // here as they demand the last lane, not the first one. - if (isa(I) && Legal->isUniformMemOp(*I)) { - assert(WideningDecision == CM_Scalarize); + if (isUniformMemOpUse(I)) return true; - } return (WideningDecision == CM_Widen || WideningDecision == CM_Widen_Reverse || @@ -4674,9 +4736,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (!Ptr) continue; - // A uniform memory op is itself uniform. We exclude uniform stores - // here as they demand the last lane, not the first one. - if (isa(I) && Legal->isUniformMemOp(I)) + if (isUniformMemOpUse(&I)) addToWorklistIfAllowed(&I); if (isUniformDecision(&I, VF)) { @@ -4707,14 +4767,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { while (idx != Worklist.size()) { Instruction *I = Worklist[idx++]; - for (auto OV : I->operand_values()) { + for (auto *OV : I->operand_values()) { // isOutOfScope operands cannot be uniform instructions. 
if (isOutOfScope(OV)) continue; // First order recurrence Phi's should typically be considered // non-uniform. auto *OP = dyn_cast(OV); - if (OP && Legal->isFirstOrderRecurrence(OP)) + if (OP && Legal->isFixedOrderRecurrence(OP)) continue; // If all the users of the operand are uniform, then add the // operand into the uniform worklist. @@ -4733,7 +4793,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // nodes separately. An induction variable will remain uniform if all users // of the induction variable and induction variable update remain uniform. // The code below handles both pointer and non-pointer induction variables. - for (auto &Induction : Legal->getInductionVars()) { + for (const auto &Induction : Legal->getInductionVars()) { auto *Ind = Induction.first; auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); @@ -4846,12 +4906,12 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; // Limit MaxScalableVF by the maximum safe dependence distance. - Optional MaxVScale = TTI.getMaxVScale(); + std::optional MaxVScale = TTI.getMaxVScale(); if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) MaxVScale = TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); - MaxScalableVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0); + MaxScalableVF = + ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0); if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " @@ -4991,7 +5051,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { case CM_ScalarEpilogueAllowed: return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: - LLVM_FALLTHROUGH; + [[fallthrough]]; case CM_ScalarEpilogueNotNeededUsePredicate: LLVM_DEBUG( dbgs() << "LV: vector predicate hint/switch found.\n" @@ -5113,7 +5173,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); - TypeSize WidestRegister = TTI.getRegisterBitWidth( + const TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector); @@ -5127,7 +5187,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( // Ensure MaxVF is a power of 2; the dependence distance bound may not be. // Note that both WidestRegister and WidestType may not be a powers of 2. 
auto MaxVectorElementCount = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), + PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType), ComputeScalableMaxVF); MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " @@ -5140,9 +5200,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return ElementCount::getFixed(1); } - const auto TripCountEC = ElementCount::getFixed(ConstTripCount); - if (ConstTripCount && - ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && + unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); + if (MaxVectorElementCount.isScalable() && + TheFunction->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); + auto Min = Attr.getVScaleRangeMin(); + WidestRegisterMinEC *= Min; + } + if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC && (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { // If loop trip count (TC) is known at compile time there is no point in // choosing VF greater than TC (as done in the loop below). Select maximum @@ -5163,7 +5228,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && TTI.shouldMaximizeVectorBandwidth(RegKind))) { auto MaxVectorElementCountMaxBW = ElementCount::get( - PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), + PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType), ComputeScalableMaxVF); MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); @@ -5208,7 +5273,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return MaxVF; } -Optional LoopVectorizationCostModel::getVScaleForTuning() const { +std::optional LoopVectorizationCostModel::getVScaleForTuning() const { if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); @@ -5244,11 +5309,11 @@ bool LoopVectorizationCostModel::isMoreProfitable( // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (Optional VScale = getVScaleForTuning()) { + if (std::optional VScale = getVScaleForTuning()) { if (A.Width.isScalable()) - EstimatedWidthA *= VScale.value(); + EstimatedWidthA *= *VScale; if (B.Width.isScalable()) - EstimatedWidthB *= VScale.value(); + EstimatedWidthB *= *VScale; } // Assume vscale may be larger than 1 (or the value being tuned for), @@ -5294,7 +5359,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; - if (Optional VScale = getVScaleForTuning()) + if (std::optional VScale = getVScaleForTuning()) AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() @@ -5365,7 +5430,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( raw_string_ostream OS(OutString); assert(!Subset.empty() && "Unexpected empty range"); OS << "Instruction with invalid costs prevented vectorization at VF=("; - for (auto &Pair : Subset) + for (const auto &Pair : Subset) OS << (Pair.second == Subset.front().second ? 
"" : ", ") << Pair.second; OS << "):"; @@ -5403,12 +5468,12 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( // Cross iteration phis such as reductions need special handling and are // currently unsupported. if (any_of(L.getHeader()->phis(), - [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) + [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) return false; // Phis with uses outside of the loop require special handling and are // currently unsupported. - for (auto &Entry : Legal->getInductionVars()) { + for (const auto &Entry : Legal->getInductionVars()) { // Look for uses of the value of the induction at the last iteration. Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); for (User *U : PostInc->users()) @@ -5420,14 +5485,6 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( return false; } - // Induction variables that are widened require special handling that is - // currently not supported. - if (any_of(Legal->getInductionVars(), [&](auto &Entry) { - return !(this->isScalarAfterVectorization(Entry.first, VF) || - this->isProfitableToScalarize(Entry.first, VF)); - })) - return false; - // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. @@ -5443,6 +5500,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( // as register pressure, code size increase and cost of extra branches into // account. For now we apply a very crude heuristic and only consider loops // with vectorization factors larger than a certain value. + + // Allow the target to opt out entirely. + if (!TTI.preferEpilogueVectorization()) + return false; + // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) @@ -5512,7 +5574,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor( ElementCount EstimatedRuntimeVF = MainLoopVF; if (MainLoopVF.isScalable()) { EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (Optional VScale = getVScaleForTuning()) + if (std::optional VScale = getVScaleForTuning()) EstimatedRuntimeVF *= *VScale; } @@ -5542,7 +5604,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { // Reset MaxWidth so that we can find the smallest type used by recurrences // in the loop. MaxWidth = -1U; - for (auto &PhiDescriptorPair : Legal->getReductionVars()) { + for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; // When finding the min width used by the recurrence we need to account // for casts on the input operands of the recurrence. 
@@ -5554,9 +5616,9 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { } else { for (Type *T : ElementTypesInLoop) { MinWidth = std::min( - MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); MaxWidth = std::max( - MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); + MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); } } return {MinWidth, MaxWidth}; @@ -5605,8 +5667,9 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { } } -unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, - unsigned LoopCost) { +unsigned +LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, + InstructionCost LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict @@ -5642,9 +5705,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { - InstructionCost C = expectedCost(VF).first; - assert(C.isValid() && "Expected to have chosen a VF with valid cost"); - LoopCost = *C.getValue(); + LoopCost = expectedCost(VF).first; + assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); // Loop body is free and there is no need for interleaving. if (LoopCost == 0) @@ -5772,8 +5834,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. - unsigned SmallIC = - std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); + unsigned SmallIC = std::min( + IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue())); // Interleave until store/load ports (estimated by max interleave count) are // saturated. @@ -5888,8 +5950,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { IntervalMap EndPoint; // Saves the list of instruction indices that are used in the loop. SmallPtrSet Ends; - // Saves the list of values that are used in the loop but are - // defined outside the loop, such as arguments and constants. + // Saves the list of values that are used in the loop but are defined outside + // the loop (not including non-instruction values such as arguments and + // constants). SmallPtrSet LoopInvariants; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { @@ -5901,6 +5964,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { auto *Instr = dyn_cast(U); // Ignore non-instruction values such as arguments, constants, etc. + // FIXME: Might need some motivation why these values are ignored. If + // for example an argument is used inside the loop it will increase the + // register pressure (so shouldn't we add it to LoopInvariants). if (!Instr) continue; @@ -5956,44 +6022,44 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { - // Count the number of live intervals. + // Count the number of registers used, per register class, given all open + // intervals. + // Note that elements in this SmallMapVector will be default constructed + // as 0. 
So we can use "RegUsage[ClassID] += n" in the code below even if + // there is no previous entry for ClassID. SmallMapVector RegUsage; if (VFs[j].isScalar()) { - for (auto Inst : OpenIntervals) { - unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); - if (RegUsage.find(ClassID) == RegUsage.end()) - RegUsage[ClassID] = 1; - else - RegUsage[ClassID] += 1; + for (auto *Inst : OpenIntervals) { + unsigned ClassID = + TTI.getRegisterClassForType(false, Inst->getType()); + // FIXME: The target might use more than one register for the type + // even in the scalar case. + RegUsage[ClassID] += 1; } } else { collectUniformsAndScalars(VFs[j]); - for (auto Inst : OpenIntervals) { + for (auto *Inst : OpenIntervals) { // Skip ignored values for VF > 1. if (VecValuesToIgnore.count(Inst)) continue; if (isScalarAfterVectorization(Inst, VFs[j])) { - unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); - if (RegUsage.find(ClassID) == RegUsage.end()) - RegUsage[ClassID] = 1; - else - RegUsage[ClassID] += 1; + unsigned ClassID = + TTI.getRegisterClassForType(false, Inst->getType()); + // FIXME: The target might use more than one register for the type + // even in the scalar case. + RegUsage[ClassID] += 1; } else { - unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); - if (RegUsage.find(ClassID) == RegUsage.end()) - RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); - else - RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + unsigned ClassID = + TTI.getRegisterClassForType(true, Inst->getType()); + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); } } } for (auto& pair : RegUsage) { - if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) - MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); - else - MaxUsages[j][pair.first] = pair.second; + auto &Entry = MaxUsages[j][pair.first]; + Entry = std::max(Entry, pair.second); } } @@ -6005,17 +6071,19 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { + // Note that elements in this SmallMapVector will be default constructed + // as 0. So we can use "Invariant[ClassID] += n" in the code below even if + // there is no previous entry for ClassID. SmallMapVector Invariant; - for (auto Inst : LoopInvariants) { + for (auto *Inst : LoopInvariants) { + // FIXME: The target might use more than one register for the type + // even in the scalar case. unsigned Usage = VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); unsigned ClassID = TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); - if (Invariant.find(ClassID) == Invariant.end()) - Invariant[ClassID] = Usage; - else - Invariant[ClassID] += Usage; + Invariant[ClassID] += Usage; } LLVM_DEBUG({ @@ -6054,7 +6122,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. 
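[editor note] The register-usage hunks above drop the explicit find/else-insert dance in favour of plain RegUsage[ClassID] += N, relying on the map's operator[] value-initializing a missing entry to 0. A minimal standalone illustration of that idiom, using std::map as a stand-in for SmallMapVector (this is not LLVM code):

  #include <cstdio>
  #include <map>

  int main() {
    // Stand-in for SmallMapVector<unsigned, unsigned, 4>: operator[] on a
    // missing key value-initializes the mapped unsigned to 0 first.
    std::map<unsigned, unsigned> RegUsage;

    // Old pattern replaced by the patch:
    //   if (RegUsage.find(ClassID) == RegUsage.end()) RegUsage[ClassID] = N;
    //   else                                          RegUsage[ClassID] += N;
    // New pattern:
    RegUsage[0] += 2; // key 0 created as 0, then becomes 2
    RegUsage[0] += 3; // now 5
    RegUsage[1] += 1; // key 1 created as 0, then becomes 1

    for (const auto &P : RegUsage)
      std::printf("register class %u -> %u registers\n", P.first, P.second);
    return 0;
  }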
- assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) && + assert((isPredicatedInst(I)) && "Expecting a scalar emulated instruction"); return isa(I) || (isa(I) && @@ -6099,7 +6167,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { } } -int LoopVectorizationCostModel::computePredInstDiscount( +InstructionCost LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { assert(!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"); @@ -6173,13 +6241,14 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnes(VF.getFixedValue()), true, false); + APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, + /*Extract*/ false, CostKind); ScalarCost += - VF.getFixedValue() * - TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); + VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } // Compute the scalarization overhead of needed extractelement @@ -6195,7 +6264,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( else if (needsExtract(J, VF)) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), - APInt::getAllOnes(VF.getFixedValue()), false, true); + APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, + /*Extract*/ true, CostKind); } } @@ -6208,7 +6278,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( ScalarCosts[I] = ScalarCost; } - return *Discount.getValue(); + return Discount; } LoopVectorizationCostModel::VectorizationCostTy @@ -6324,19 +6394,20 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; const Align Alignment = getLoadStoreAlignment(I); - Cost += VF.getKnownMinValue() * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS, TTI::TCK_RecipThroughput); + Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), + ValTy->getScalarType(), + Alignment, AS, CostKind); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF); + Cost += getScalarizationOverhead(I, VF, CostKind); // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. 
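[editor note] As a rough sketch of the scalarized, predicated memory-op costing in getMemInstScalarizationCost above: the per-lane memory cost and the insert/extract overhead are scaled down by the predicated block's execution probability, while the i1 mask extracts and branches are paid unconditionally. Illustrative arithmetic only, not the TTI interface; a reciprocal block probability of 2 (i.e. "runs half the time") is assumed here.

  #include <cstdint>

  // Assumed, illustrative names; returns an approximate cost in abstract units.
  uint64_t predicatedScalarizedMemCost(uint64_t PerLaneMemCost, unsigned VF,
                                       uint64_t InsertExtractOverhead,
                                       uint64_t MaskExtractAndBranchCost,
                                       unsigned ReciprocalPredBlockProb = 2) {
    // Scalar memory op per lane plus the insert/extract overhead of scalarizing.
    uint64_t Cost = uint64_t(VF) * PerLaneMemCost + InsertExtractOverhead;
    // The predicated block only executes with some probability; scale down.
    Cost /= ReciprocalPredBlockProb;
    // Extracting the i1 mask bits and branching happens regardless.
    Cost += MaskExtractAndBranchCost;
    return Cost;
  }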
- if (isPredicatedInst(I, VF)) { + if (isPredicatedInst(I)) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -6344,8 +6415,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); Cost += TTI.getScalarizationOverhead( Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), - /*Insert=*/false, /*Extract=*/true); - Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); + /*Insert=*/false, /*Extract=*/true, CostKind); + Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); if (useEmulatedMaskMemRefHack(I, VF)) // Artificially setting to a high enough value to practically disable @@ -6370,17 +6441,19 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); - else + } else { + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, - CostKind, I); + CostKind, OpInfo, I); + } bool Reverse = ConsecutiveStride < 0; if (Reverse) - Cost += - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, + std::nullopt, CostKind, 0); return Cost; } @@ -6409,7 +6482,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - VF.getKnownMinValue() - 1)); + CostKind, VF.getKnownMinValue() - 1)); } InstructionCost @@ -6437,6 +6510,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); @@ -6456,25 +6530,26 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. 
assert(!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."); - Cost += - Group->getNumMembers() * - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); + Cost += Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, + std::nullopt, CostKind, 0); } return Cost; } -Optional LoopVectorizationCostModel::getReductionPatternCost( +std::optional +LoopVectorizationCostModel::getReductionPatternCost( Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { using namespace llvm::PatternMatch; // Early exit for no inloop reductions if (InLoopReductionChains.empty() || VF.isScalar() || !isa(Ty)) - return None; + return std::nullopt; auto *VectorTy = cast(Ty); // We are looking for a pattern of, and finding the minimal acceptable cost: @@ -6492,20 +6567,19 @@ Optional LoopVectorizationCostModel::getReductionPatternCost( Instruction *RetI = I; if (match(RetI, m_ZExtOrSExt(m_Value()))) { if (!RetI->hasOneUser()) - return None; + return std::nullopt; RetI = RetI->user_back(); } - if (match(RetI, m_Mul(m_Value(), m_Value())) && + + if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && RetI->user_back()->getOpcode() == Instruction::Add) { - if (!RetI->hasOneUser()) - return None; RetI = RetI->user_back(); } // Test if the found instruction is a reduction, and if not return an invalid // cost specifying the parent to use the original cost modelling. if (!InLoopReductionImmediateChains.count(RetI)) - return None; + return std::nullopt; // Find the reduction this chain is a part of and calculate the basic cost of // the reduction on its own. @@ -6541,7 +6615,7 @@ Optional LoopVectorizationCostModel::getReductionPatternCost( VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); Instruction *Op0, *Op1; - if (RedOp && + if (RedOp && RdxDesc.getOpcode() == Instruction::Add && match(RedOp, m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && match(Op0, m_ZExtOrSExt(m_Value())) && @@ -6550,7 +6624,7 @@ Optional LoopVectorizationCostModel::getReductionPatternCost( !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { - // Matched reduce(ext(mul(ext(A), ext(B))) + // Matched reduce.add(ext(mul(ext(A), ext(B))) // Note that the extend opcodes need to all match, or if A==B they will have // been converted to zext(mul(sext(A), sext(A))) as it is known positive, // which is equally fine. 
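[editor note] For context on the reduce.add(ext(mul(ext(A), ext(B)))) chains costed here via getMulAccReductionCost: at source level this is the classic widening dot-product shape. The loop below is only an illustrative example of such input, not code from the patch.

  #include <cstddef>
  #include <cstdint>

  // Narrow inputs, widening multiply, wide accumulator: the add-reduction chain
  // the pattern matcher above recognizes (sext -> mul -> add-reduce).
  int32_t dotProduct(const int16_t *A, const int16_t *B, size_t N) {
    int32_t Sum = 0;
    for (size_t I = 0; I < N; ++I)
      Sum += int32_t(A[I]) * int32_t(B[I]);
    return Sum;
  }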
@@ -6567,9 +6641,8 @@ Optional LoopVectorizationCostModel::getReductionPatternCost( TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, TTI::CastContextHint::None, CostKind, RedOp); - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, - CostKind); + InstructionCost RedCost = TTI.getMulAccReductionCost( + IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) @@ -6579,16 +6652,16 @@ Optional LoopVectorizationCostModel::getReductionPatternCost( // Matched reduce(ext(A)) bool IsUnsigned = isa(RedOp); auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, - CostKind); + InstructionCost RedCost = TTI.getExtendedReductionCost( + RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, + RdxDesc.getFastMathFlags(), CostKind); InstructionCost ExtCost = TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, TTI::CastContextHint::None, CostKind, RedOp); if (RedCost.isValid() && RedCost < BaseCost + ExtCost) return I == RetI ? RedCost : 0; - } else if (RedOp && + } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { if (match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && @@ -6601,7 +6674,7 @@ Optional LoopVectorizationCostModel::getReductionPatternCost( : Op0Ty; auto *ExtType = VectorType::get(LargestOpTy, VectorTy); - // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of + // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of // different sizes. We take the largest type as the ext to reduce, and add // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). InstructionCost ExtCost0 = TTI.getCastInstrCost( @@ -6613,9 +6686,8 @@ Optional LoopVectorizationCostModel::getReductionPatternCost( InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, - CostKind); + InstructionCost RedCost = TTI.getMulAccReductionCost( + IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); InstructionCost ExtraExtCost = 0; if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; @@ -6629,20 +6701,19 @@ Optional LoopVectorizationCostModel::getReductionPatternCost( (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) return I == RetI ? RedCost : 0; } else if (!match(I, m_ZExtOrSExt(m_Value()))) { - // Matched reduce(mul()) + // Matched reduce.add(mul()) InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); - InstructionCost RedCost = TTI.getExtendedAddReductionCost( - /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, - CostKind); + InstructionCost RedCost = TTI.getMulAccReductionCost( + true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); if (RedCost.isValid() && RedCost < MulCost + BaseCost) return I == RetI ? RedCost : 0; } } - return I == RetI ? Optional(BaseCost) : None; + return I == RetI ? 
std::optional(BaseCost) : std::nullopt; } InstructionCost @@ -6655,9 +6726,10 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, const Align Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, - TTI::TCK_RecipThroughput, I); + TTI::TCK_RecipThroughput, OpInfo, I); } return getWideningCost(I, VF); } @@ -6705,9 +6777,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return VectorizationCostTy(C, TypeNotScalarized); } -InstructionCost -LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - ElementCount VF) const { +InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( + Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { // There is no mechanism yet to create a scalable scalarization loop, // so this is currently Invalid. @@ -6722,8 +6793,9 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, - false); + cast(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), + /*Insert*/ true, + /*Extract*/ false, CostKind); // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6743,7 +6815,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, for (auto *V : filterExtractingOperands(Ops, VF)) Tys.push_back(MaybeVectorizeType(V->getType(), VF)); return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), Tys); + filterExtractingOperands(Ops, VF), Tys, CostKind); } void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { @@ -6765,29 +6837,47 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { NumPredStores++; if (Legal->isUniformMemOp(I)) { - // Lowering story for uniform memory ops is currently a bit complicated. - // Scalarization works for everything which isn't a store with scalable - // VF. Fixed len VFs just scalarize and then DCE later; scalarization - // knows how to handle uniform-per-part values (i.e. the first lane - // in each unrolled VF) and can thus handle scalable loads too. For - // scalable stores, we use a scatter if legal. If not, we have no way - // to lower (currently) and thus have to abort vectorization. - if (isa(&I) && VF.isScalable()) { - if (isLegalGatherOrScatter(&I, VF)) - setWideningDecision(&I, VF, CM_GatherScatter, - getGatherScatterCost(&I, VF)); - else - // Error case, abort vectorization - setWideningDecision(&I, VF, CM_Scalarize, - InstructionCost::getInvalid()); - continue; - } + auto isLegalToScalarize = [&]() { + if (!VF.isScalable()) + // Scalarization of fixed length vectors "just works". + return true; + + // We have dedicated lowering for unpredicated uniform loads and + // stores. Note that even with tail folding we know that at least + // one lane is active (i.e. generalized predication is not possible + // here), and the logic below depends on this fact. + if (!foldTailByMasking()) + return true; + + // For scalable vectors, a uniform memop load is always + // uniform-by-parts and we know how to scalarize that. 
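[editor note] The uniform-memop widening decision completed just below this point compares a possibly-invalid gather/scatter cost against a possibly-invalid scalarization cost and keeps the cheaper one, with an invalid cost losing every comparison (and a double failure aborting vectorization). A standalone sketch of that selection rule, using std::optional in place of InstructionCost; the helper is hypothetical, not the LLVM API.

  #include <cstdint>
  #include <optional>

  enum class WideningDecision { GatherScatter, Scalarize };

  WideningDecision chooseUniformMemOpWidening(
      std::optional<uint64_t> GatherScatterCost,   // empty == invalid
      std::optional<uint64_t> ScalarizationCost) { // empty == invalid
    auto AsValue = [](const std::optional<uint64_t> &C) {
      return C ? *C : UINT64_MAX; // invalid compares as maximally large
    };
    return AsValue(GatherScatterCost) < AsValue(ScalarizationCost)
               ? WideningDecision::GatherScatter
               : WideningDecision::Scalarize;
  }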
+ if (isa(I)) + return true; + + // A uniform store isn't neccessarily uniform-by-part + // and we can't assume scalarization. + auto &SI = cast(I); + return TheLoop->isLoopInvariant(SI.getValueOperand()); + }; + + const InstructionCost GatherScatterCost = + isLegalGatherOrScatter(&I, VF) ? + getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); + // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract - // TODO: Avoid replicating loads and stores instead of relying on - // instcombine to remove them. - setWideningDecision(&I, VF, CM_Scalarize, - getUniformMemOpCost(&I, VF)); + // FIXME: This cost is a significant under-estimate for tail folded + // memory ops. + const InstructionCost ScalarizationCost = isLegalToScalarize() ? + getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); + + // Choose better solution for the current VF, Note that Invalid + // costs compare as maximumal large. If both are invalid, we get + // scalable invalid which signals a failure and a vectorization abort. + if (GatherScatterCost < ScalarizationCost) + setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); + else + setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); continue; } @@ -6982,7 +7072,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); return ( TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + + Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), + /*Insert*/ false, /*Extract*/ true, CostKind) + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. @@ -6998,11 +7089,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, auto *Phi = cast(I); // First-order recurrences are replaced by vector shuffles inside the loop. - // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. - if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) - return TTI.getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, cast(VectorTy), - None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); + if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { + SmallVector Mask(VF.getKnownMinValue()); + std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); + return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, + cast(VectorTy), Mask, CostKind, + VF.getKnownMinValue() - 1); + } // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi @@ -7020,34 +7113,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - // If we have a predicated instruction, it may not be executed for each - // vector lane. Get the scalarization cost and scale this amount by the - // probability of executing the predicated block. If the instruction is not - // predicated, we fall through to the next case. - if (VF.isVector() && isScalarWithPredication(I, VF)) { - InstructionCost Cost = 0; - - // These instructions have a non-void type, so account for the phi nodes - // that we will create. This cost is likely to be zero. 
The phi node - // cost, if any, should be scaled by the block probability because it - // models a copy at the end of each predicated block. - Cost += VF.getKnownMinValue() * - TTI.getCFInstrCost(Instruction::PHI, CostKind); - - // The cost of the non-predicated instruction. - Cost += VF.getKnownMinValue() * - TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); - - // The cost of insertelement and extractelement instructions needed for - // scalarization. - Cost += getScalarizationOverhead(I, VF); - - // Scale the cost by the probability of executing the predicated blocks. - // This assumes the predicated block for each vector lane is equally - // likely. - return Cost / getReciprocalPredBlockProb(); + if (VF.isVector() && isPredicatedInst(I)) { + const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); + return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? + ScalarCost : SafeDivisorCost; } - LLVM_FALLTHROUGH; + // We've proven all lanes safe to speculate, fall through. + [[fallthrough]]; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -7073,22 +7145,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); - TargetTransformInfo::OperandValueProperties Op2VP; - TargetTransformInfo::OperandValueKind Op2VK = - TTI.getOperandInfo(Op2, Op2VP); - if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) - Op2VK = TargetTransformInfo::OK_UniformValue; + auto Op2Info = TTI.getOperandInfo(Op2); + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) + Op2Info.Kind = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); return TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); + I->getOpcode(), VectorTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + Op2Info, Operands, I); } case Instruction::FNeg: { return TTI.getArithmeticInstrCost( - I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, - TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None, I->getOperand(0), I); + I->getOpcode(), VectorTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -7101,17 +7173,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { // select x, y, false --> x & y // select x, true, y --> x | y - TTI::OperandValueProperties Op1VP = TTI::OP_None; - TTI::OperandValueProperties Op2VP = TTI::OP_None; - TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); - TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); + const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); + const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); assert(Op0->getType()->getScalarSizeInBits() == 1 && Op1->getType()->getScalarSizeInBits() == 1); SmallVector Operands{Op0, Op1}; return TTI.getArithmeticInstrCost( match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, - CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); + CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); } Type *CondTy = SI->getCondition()->getType(); @@ -7153,7 +7223,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, case Instruction::BitCast: if (I->getType()->isPointerTy()) return 0; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -7262,7 +7332,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, // the result would need to be a vector of pointers. if (VF.isScalable()) return InstructionCost::getInvalid(); - LLVM_FALLTHROUGH; + [[fallthrough]]; default: // This opcode is unknown. Assume that it is the same as 'mul'. return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); @@ -7276,7 +7346,6 @@ static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) @@ -7317,14 +7386,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore type-promoting instructions we identified during reduction // detection. - for (auto &Reduction : Legal->getReductionVars()) { + for (const auto &Reduction : Legal->getReductionVars()) { const RecurrenceDescriptor &RedDes = Reduction.second; const SmallPtrSetImpl &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. - for (auto &Induction : Legal->getInductionVars()) { + for (const auto &Induction : Legal->getInductionVars()) { const InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); @@ -7332,7 +7401,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } void LoopVectorizationCostModel::collectInLoopReductions() { - for (auto &Reduction : Legal->getReductionVars()) { + for (const auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = Reduction.second; @@ -7394,7 +7463,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { if (UserVF.isZero()) { VF = ElementCount::getFixed(determineVPlanVF( TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedSize(), + .getFixedValue(), CM)); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); @@ -7425,12 +7494,12 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } -Optional +std::optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. - return None; + return std::nullopt; // Invalidate interleave groups if all blocks of loop will be predicated. 
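[editor note] A side note on the pervasive llvm::Optional to std::optional migration visible throughout this hunk (for example, plan() now returning std::nullopt): in isolation the new idiom looks like the sketch below. Function and caller names are made up for illustration.

  #include <optional>

  // was: Optional<unsigned> ...; return None;
  std::optional<unsigned> pickVF(bool CanVectorize, unsigned ComputedVF) {
    if (!CanVectorize)
      return std::nullopt; // previously: return None;
    return ComputedVF;     // implicit wrapping is unchanged
  }

  // Callers dereference with * (previously .value() / .getValue()):
  unsigned vfOrOne(bool CanVectorize, unsigned ComputedVF) {
    auto VF = pickVF(CanVectorize, ComputedVF);
    return VF ? *VF : 1u;
  }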
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && @@ -7550,9 +7619,26 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization) { + assert(BestVPlan.hasVF(BestVF) && + "Trying to execute plan with unsupported VF"); + assert(BestVPlan.hasUF(BestUF) && + "Trying to execute plan with unsupported UF"); + LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF << '\n'); + // Workaround! Compute the trip count of the original loop and cache it + // before we start modifying the CFG. This code has a systemic problem + // wherein it tries to run analysis over partially constructed IR; this is + // wrong, and not simply for SCEV. The trip count of the original loop + // simply happens to be prone to hitting this in practice. In theory, we + // can hit the same issue for any SCEV, or ValueTracking query done during + // mutation. See PR49900. + ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader()); + + if (!IsEpilogueVectorization) + VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); + // Perform the actual loop transformation. // 1. Set up the skeleton for vectorization, including vector pre-header and @@ -7602,7 +7688,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, // replace the vectorizer-specific hints below). MDNode *OrigLoopID = OrigLoop->getLoopID(); - Optional VectorizedLoopID = + std::optional VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); @@ -7610,7 +7696,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); if (VectorizedLoopID) - L->setLoopID(VectorizedLoopID.value()); + L->setLoopID(*VectorizedLoopID); else { // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). @@ -7620,9 +7706,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); } - // Disable runtime unrolling when vectorizing the epilogue loop. - if (CanonicalIVStartValue) - AddRuntimeUnrollDisableMetaData(L); + AddRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7651,16 +7735,6 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - // Workaround! Compute the trip count of the original loop and cache it - // before we start modifying the CFG. This code has a systemic problem - // wherein it tries to run analysis over partially constructed IR; this is - // wrong, and not simply for SCEV. The trip count of the original loop - // simply happens to be prone to hitting this in practice. In theory, we - // can hit the same issue for any SCEV, or ValueTracking query done during - // mutation. See PR49900. 
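[editor note] The workaround being moved here (cache the original loop's trip count via getOrCreateTripCount before any CFG mutation, so later uses do not re-run SCEV over half-rewritten IR) boils down to a compute-before-mutate-and-cache pattern. A generic sketch with made-up names:

  #include <optional>

  // Compute a value from the still-intact IR once, cache it, and hand the
  // cached copy to every query that runs after mutation has started.
  class CachedTripCount {
    std::optional<unsigned> Cached;

  public:
    unsigned getOrCreate(unsigned (*ComputeFromIntactIR)()) {
      if (!Cached)
        Cached = ComputeFromIntactIR(); // must run before the CFG is modified
      return *Cached;
    }
  };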
- getOrCreateTripCount(OrigLoop->getLoopPreheader()); createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector @@ -7691,11 +7765,11 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); // Skip induction resume value creation here because they will be created in - // the second pass. If we created them here, they wouldn't be used anyway, - // because the vplan in the second pass still contains the inductions from the - // original loop. + // the second pass for the scalar loop. The induction resume values for the + // inductions in the epilogue loop are created before executing the plan for + // the epilogue loop. - return {completeLoopSkeleton(OrigLoopID), nullptr}; + return {completeLoopSkeleton(), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7779,7 +7853,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { - MDNode *OrigLoopID = OrigLoop->getLoopID(); createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to @@ -7825,31 +7898,40 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); - // Keep track of bypass blocks, as they feed start values to the induction - // phis in the scalar loop preheader. + // Keep track of bypass blocks, as they feed start values to the induction and + // reduction phis in the scalar loop preheader. if (EPI.SCEVSafetyCheck) LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); if (EPI.MemSafetyCheck) LoopBypassBlocks.push_back(EPI.MemSafetyCheck); LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); - // The vec.epilog.iter.check block may contain Phi nodes from reductions which - // merge control-flow from the latch block and the middle block. Update the - // incoming values here and move the Phi into the preheader. + // The vec.epilog.iter.check block may contain Phi nodes from inductions or + // reductions which merge control-flow from the latch block and the middle + // block. Update the incoming values here and move the Phi into the preheader. SmallVector PhisInBlock; for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) PhisInBlock.push_back(&Phi); for (PHINode *Phi : PhisInBlock) { + Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); Phi->replaceIncomingBlockWith( VecEpilogueIterationCountCheck->getSinglePredecessor(), VecEpilogueIterationCountCheck); + + // If the phi doesn't have an incoming value from the + // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming + // value and also those from other check blocks. This is needed for + // reduction phis only. 
+ if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { + return EPI.EpilogueIterationCountCheck == IncB; + })) + continue; Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); if (EPI.SCEVSafetyCheck) Phi->removeIncomingValue(EPI.SCEVSafetyCheck); if (EPI.MemSafetyCheck) Phi->removeIncomingValue(EPI.MemSafetyCheck); - Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); } // Generate a resume induction for the vector epilogue and put it in the @@ -7871,7 +7953,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { createInductionResumeValues({VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); - return {completeLoopSkeleton(OrigLoopID), EPResumeVal}; + return {completeLoopSkeleton(), EPResumeVal}; } BasicBlock * @@ -8149,9 +8231,18 @@ VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( *PSE.getSE(), *OrigLoop, Range); // Check if this is pointer induction. If so, build the recipe for it. - if (auto *II = Legal->getPointerInductionDescriptor(Phi)) - return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, - *PSE.getSE()); + if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(), + *PSE.getSE()); + assert(isa(II->getStep())); + return new VPWidenPointerInductionRecipe( + Phi, Operands[0], Step, *II, + LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + return CM.isScalarAfterVectorization(Phi, VF); + }, + Range)); + } return nullptr; } @@ -8188,12 +8279,8 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { // If all incoming values are equal, the incoming VPValue can be used directly // instead of creating a new VPBlendRecipe. - VPValue *FirstIncoming = Operands[0]; - if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { - return FirstIncoming == Inc; - })) { + if (llvm::all_equal(Operands)) return Operands[0]; - } unsigned NumIncoming = Phi->getNumIncomingValues(); // For in-loop reductions, we do not need to create an additional select. @@ -8252,24 +8339,42 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - auto willWiden = [&](ElementCount VF) -> bool { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - // The following case may be scalarized depending on the VF. - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize = false; - InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; - bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - return UseVectorIntrinsic || !NeedToScalarize; - }; + ArrayRef Ops = Operands.take_front(CI->arg_size()); - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) - return nullptr; + // Is it beneficial to perform intrinsic call compared to lib call? + bool ShouldUseVectorIntrinsic = + ID && LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) -> bool { + bool NeedToScalarize = false; + // Is it beneficial to perform intrinsic call compared to lib + // call? 
+ InstructionCost CallCost = + CM.getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = + CM.getVectorIntrinsicCost(CI, VF); + return IntrinsicCost <= CallCost; + }, + Range); + if (ShouldUseVectorIntrinsic) + return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); + + // Is better to call a vectorized version of the function than to to scalarize + // the call? + auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) -> bool { + // The following case may be scalarized depending on the VF. + // The flag shows whether we can use a usual Call for vectorized + // version of the instruction. + bool NeedToScalarize = false; + CM.getVectorCallCost(CI, VF, NeedToScalarize); + return !NeedToScalarize; + }, + Range); + if (ShouldUseVectorCall) + return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), + Intrinsic::not_intrinsic); - ArrayRef Ops = Operands.take_front(CI->arg_size()); - return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); + return nullptr; } bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { @@ -8286,55 +8391,65 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, - ArrayRef Operands) const { - auto IsVectorizableOpcode = [](unsigned Opcode) { - switch (Opcode) { - case Instruction::Add: - case Instruction::And: - case Instruction::AShr: - case Instruction::BitCast: - case Instruction::FAdd: - case Instruction::FCmp: - case Instruction::FDiv: - case Instruction::FMul: - case Instruction::FNeg: - case Instruction::FPExt: - case Instruction::FPToSI: - case Instruction::FPToUI: - case Instruction::FPTrunc: - case Instruction::FRem: - case Instruction::FSub: - case Instruction::ICmp: - case Instruction::IntToPtr: - case Instruction::LShr: - case Instruction::Mul: - case Instruction::Or: - case Instruction::PtrToInt: - case Instruction::SDiv: - case Instruction::Select: - case Instruction::SExt: - case Instruction::Shl: - case Instruction::SIToFP: - case Instruction::SRem: - case Instruction::Sub: - case Instruction::Trunc: - case Instruction::UDiv: - case Instruction::UIToFP: - case Instruction::URem: - case Instruction::Xor: - case Instruction::ZExt: - case Instruction::Freeze: - return true; +VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I, + ArrayRef Operands, + VPBasicBlock *VPBB, VPlanPtr &Plan) { + switch (I->getOpcode()) { + default: + return nullptr; + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: { + // If not provably safe, use a select to form a safe divisor before widening the + // div/rem operation itself. Otherwise fall through to general handling below. 
+ if (CM.isPredicatedInst(I)) { + SmallVector Ops(Operands.begin(), Operands.end()); + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *One = + Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false)); + auto *SafeRHS = + new VPInstruction(Instruction::Select, {Mask, Ops[1], One}, + I->getDebugLoc()); + VPBB->appendRecipe(SafeRHS); + Ops[1] = SafeRHS; + return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end())); } - return false; + LLVM_FALLTHROUGH; + } + case Instruction::Add: + case Instruction::And: + case Instruction::AShr: + case Instruction::BitCast: + case Instruction::FAdd: + case Instruction::FCmp: + case Instruction::FDiv: + case Instruction::FMul: + case Instruction::FNeg: + case Instruction::FPExt: + case Instruction::FPToSI: + case Instruction::FPToUI: + case Instruction::FPTrunc: + case Instruction::FRem: + case Instruction::FSub: + case Instruction::ICmp: + case Instruction::IntToPtr: + case Instruction::LShr: + case Instruction::Mul: + case Instruction::Or: + case Instruction::PtrToInt: + case Instruction::Select: + case Instruction::SExt: + case Instruction::Shl: + case Instruction::SIToFP: + case Instruction::Sub: + case Instruction::Trunc: + case Instruction::UIToFP: + case Instruction::Xor: + case Instruction::ZExt: + case Instruction::Freeze: + return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); }; - - if (!IsVectorizableOpcode(I->getOpcode())) - return nullptr; - - // Success: widen this instruction. - return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); } void VPRecipeBuilder::fixHeaderPhis() { @@ -8354,9 +8469,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); - bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, - Range); + bool IsPredicated = CM.isPredicatedInst(I); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -8396,11 +8509,12 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( // value. Avoid hoisting the insert-element which packs the scalar value into // a vector value, as that happens iff all users use the vector value. for (VPValue *Op : Recipe->operands()) { - auto *PredR = dyn_cast_or_null(Op->getDef()); + auto *PredR = + dyn_cast_or_null(Op->getDefiningRecipe()); if (!PredR) continue; - auto *RepR = - cast_or_null(PredR->getOperand(0)->getDef()); + auto *RepR = cast( + PredR->getOperand(0)->getDefiningRecipe()); assert(RepR->isPredicated() && "expected Replicate recipe to be predicated"); RepR->setAlsoPack(false); @@ -8469,20 +8583,26 @@ VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe, VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, - VFRange &Range, VPlanPtr &Plan) { + VFRange &Range, VPBasicBlock *VPBB, + VPlanPtr &Plan) { // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. VPRecipeBase *Recipe; if (auto Phi = dyn_cast(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Operands, Plan); + + // Always record recipes for header phis. Later first-order recurrence phis + // can have earlier phis as incoming values. 
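[editor note] To make the intent of the safe-divisor path added in tryToWiden above concrete: the select keeps the real divisor on active lanes and substitutes 1 on masked-off lanes, so the widened division can execute unconditionally without faulting. A scalar-level sketch of that semantics (illustrative only, not VPlan code):

  #include <cstdint>

  int32_t safeDivLane(bool LaneActive, int32_t Dividend, int32_t Divisor) {
    // select(mask, divisor, 1): masked-off lanes divide by 1, which can never
    // trap (no divide-by-zero, no INT_MIN / -1 overflow on those lanes).
    int32_t SafeDivisor = LaneActive ? Divisor : 1;
    int32_t Quotient = Dividend / SafeDivisor;
    // Results of inactive lanes are never consumed by the vectorized loop.
    return LaneActive ? Quotient : 0;
  }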
+ recordRecipeOf(Phi); + if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) return toVPRecipeResult(Recipe); VPHeaderPHIRecipe *PhiRecipe = nullptr; assert((Legal->isReductionVariable(Phi) || - Legal->isFirstOrderRecurrence(Phi)) && - "can only widen reductions and first-order recurrences here"); + Legal->isFixedOrderRecurrence(Phi)) && + "can only widen reductions and fixed-order recurrences here"); VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { const RecurrenceDescriptor &RdxDesc = @@ -8493,13 +8613,21 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, CM.isInLoopReduction(Phi), CM.useOrderedReductions(RdxDesc)); } else { + // TODO: Currently fixed-order recurrences are modeled as chains of + // first-order recurrences. If there are no users of the intermediate + // recurrences in the chain, the fixed order recurrence should be modeled + // directly, enabling more efficient codegen. PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } // Record the incoming value from the backedge, so we can add the incoming // value from the backedge after all recipes have been created. - recordRecipeOf(cast( - Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); + auto *Inc = cast( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + auto RecipeIter = Ingredient2Recipe.find(Inc); + if (RecipeIter == Ingredient2Recipe.end()) + recordRecipeOf(Inc); + PhisToFix.push_back(PhiRecipe); return toVPRecipeResult(PhiRecipe); } @@ -8534,7 +8662,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); } - return toVPRecipeResult(tryToWiden(Instr, Operands)); + return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, @@ -8564,7 +8692,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, assert( SinkTarget != FirstInst && "Must find a live instruction (at least the one feeding the " - "first-order recurrence PHI) before reaching beginning of the block"); + "fixed-order recurrence PHI) before reaching beginning of the block"); SinkTarget = SinkTarget->getPrevNode(); assert(SinkTarget != P.first && "sink source equals target, no sinking required"); @@ -8696,18 +8824,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Mark instructions we'll need to sink later and their targets as // ingredients whose recipe we'll need to record. - for (auto &Entry : SinkAfter) { + for (const auto &Entry : SinkAfter) { RecipeBuilder.recordRecipeOf(Entry.first); RecipeBuilder.recordRecipeOf(Entry.second); } - for (auto &Reduction : CM.getInLoopReductionChains()) { + for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurKind Kind = Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); const SmallVector &ReductionOperations = Reduction.second; RecipeBuilder.recordRecipeOf(Phi); - for (auto &R : ReductionOperations) { + for (const auto &R : ReductionOperations) { RecipeBuilder.recordRecipeOf(R); // For min/max reductions, where we have a pair of icmp/select, we also // need to record the ICmp recipe, so it can be removed later. 
@@ -8805,14 +8933,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( continue; if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( - Instr, Operands, Range, Plan)) { + Instr, Operands, Range, VPBB, Plan)) { // If Instr can be simplified to an existing VPValue, use it. if (RecipeOrValue.is()) { auto *VPV = RecipeOrValue.get(); Plan->addVPValue(Instr, VPV); // If the re-used value is a recipe, register the recipe for the // instruction, in case the recipe for Instr needs to be recorded. - if (auto *R = dyn_cast_or_null(VPV->getDef())) + if (VPRecipeBase *R = VPV->getDefiningRecipe()) RecipeBuilder.setRecipe(Instr, R); continue; } @@ -8854,11 +8982,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = cast(VPBB->getSingleSuccessor()); } - HeaderVPBB->setName("vector.body"); - - // Fold the last, empty block into its predecessor. - VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); - assert(VPBB && "expected to fold last (empty) block"); // After here, VPBB should not be used. VPBB = nullptr; @@ -8888,7 +9011,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } return nullptr; }; - for (auto &Entry : SinkAfter) { + for (const auto &Entry : SinkAfter) { VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); @@ -8949,14 +9072,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a - // first-order recurrence. + // fixed-order recurrence. for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *RecurPhi = dyn_cast(&R); if (!RecurPhi) continue; - VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); + VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe(); + // Fixed-order recurrences do not contain cycles, so this loop is guaranteed + // to terminate. + while (auto *PrevPhi = + dyn_cast(PrevRecipe)) + PrevRecipe = &PrevPhi->getBackedgeRecipe(); VPBasicBlock *InsertBlock = PrevRecipe->getParent(); auto *Region = GetReplicateRegion(PrevRecipe); if (Region) @@ -8983,7 +9111,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. - for (auto IG : InterleaveGroups) { + for (const auto *IG : InterleaveGroups) { auto *Recipe = cast( RecipeBuilder.getRecipe(IG->getInsertPos())); SmallVector StoredValues; @@ -9011,33 +9139,28 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } } - std::string PlanName; - raw_string_ostream RSO(PlanName); - ElementCount VF = Range.Start; - Plan->addVF(VF); - RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { + for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); + VF *= 2) Plan->addVF(VF); - RSO << "," << VF; - } - RSO << "},UF>=1"; - RSO.flush(); - Plan->setName(PlanName); + Plan->setName("Initial VPlan"); // From this point onwards, VPlan-to-VPlan transformations may change the plan // in ways that accessing values using original IR values is incorrect. 
Plan->disableValue2VPValue(); VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); - VPlanTransforms::sinkScalarOperands(*Plan); VPlanTransforms::removeDeadRecipes(*Plan); - VPlanTransforms::mergeReplicateRegions(*Plan); - VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); - // Fold Exit block into its predecessor if possible. - // TODO: Fold block earlier once all VPlan transforms properly maintain a - // VPBasicBlock as exit. - VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); + bool ShouldSimplify = true; + while (ShouldSimplify) { + ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan); + ShouldSimplify |= + VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan); + ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); + } + + VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); + VPlanTransforms::mergeBlocksIntoPredecessors(*Plan); assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; @@ -9066,7 +9189,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VPlanTransforms::VPInstructionsToVPRecipes( OrigLoop, Plan, [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, - DeadInstructions, *PSE.getSE()); + DeadInstructions, *PSE.getSE(), *TLI); // Remove the existing terminator of the exiting block of the top-most region. // A BranchOnCount will be added instead when adding the canonical IV recipes. @@ -9087,7 +9210,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { - for (auto &Reduction : CM.getInLoopReductionChains()) { + for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(Phi)->second; @@ -9127,9 +9250,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); - auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) - ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) - : nullptr; + VPValue *CondOp = nullptr; + if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { + VPBuilder::InsertPointGuard Guard(Builder); + Builder.setInsertPoint(WidenRecipe->getParent(), + WidenRecipe->getIterator()); + CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan); + } if (IsFMulAdd) { // If the instruction is a call to the llvm.fmuladd intrinsic then we @@ -9179,7 +9306,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPValue *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); VPValue *Red = PhiR->getBackedgeValue(); - assert(cast(Red->getDef())->getParent() != LatchVPBB && + assert(Red->getDefiningRecipe()->getParent() != LatchVPBB && "reduction recipe must be defined before latch"); Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); } @@ -9217,11 +9344,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -void VPWidenCallRecipe::execute(VPTransformState &State) { - State.ILV->widenCallInstruction(*cast(getUnderlyingInstr()), this, - *this, State); -} - void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); @@ -9353,8 +9475,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); - Value *Step = CreateStepValue(IndDesc.getStep(), SE, - State.CFG.PrevBB->getTerminator()); + Value *Step = State.get(getOperand(1), VPIteration(0, Part)); Value *SclrGep = emitTransformedIndex( State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); SclrGep->setName("next.gep"); @@ -9378,12 +9499,9 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); // A pointer induction, performed by using a gep - const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); - const SCEV *ScalarStep = IndDesc.getStep(); - SCEVExpander Exp(SE, DL, "induction"); - Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); + Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); Value *NumUnrolledElems = State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); @@ -9411,6 +9529,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { StartOffset = State.Builder.CreateAdd( StartOffset, State.Builder.CreateStepVector(VecPhiType)); + assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) && + "scalar step must be the same across all parts"); Value *GEP = State.Builder.CreateGEP( IndDesc.getElementType(), NewPointerPhi, State.Builder.CreateMul( @@ -9421,8 +9541,8 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { } } -void VPScalarIVStepsRecipe::execute(VPTransformState &State) { - assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); +void VPDerivedIVRecipe::execute(VPTransformState &State) { + assert(!State.Instance && "VPDerivedIVRecipe being replicated."); // Fast-math-flags propagate from the original induction instruction. 
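[editor note] Stepping back to the buildVPlanWithVPRecipes changes earlier in this hunk: the single calls to the VPlan simplifications were replaced by a loop that re-runs sinkScalarOperands, mergeReplicateRegionsIntoSuccessors and mergeBlocksIntoPredecessors until none of them reports a change. That is a standard fixed-point driver; a generic sketch with made-up template names:

  #include <functional>
  #include <vector>

  template <typename PlanT>
  void simplifyToFixedPoint(PlanT &Plan,
                            const std::vector<std::function<bool(PlanT &)>> &Transforms) {
    bool Changed = true;
    while (Changed) {
      Changed = false;
      for (const auto &T : Transforms)
        Changed |= T(Plan); // each transform returns true if it modified the plan
    }
  }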
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); @@ -9432,52 +9552,33 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { IndDesc.getInductionBinOp()->getFastMathFlags()); Value *Step = State.get(getStepValue(), VPIteration(0, 0)); - auto CreateScalarIV = [&](Value *&Step) -> Value * { - Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); - auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); - if (!isCanonical() || CanonicalIV->getType() != Ty) { - ScalarIV = - Ty->isIntegerTy() - ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) - : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); - ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, - getStartValue()->getLiveInIRValue(), Step, - IndDesc); - ScalarIV->setName("offset.idx"); - } - if (TruncToTy) { - assert(Step->getType()->isIntegerTy() && - "Truncation requires an integer step"); - ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); - Step = State.Builder.CreateTrunc(Step, TruncToTy); - } - return ScalarIV; - }; - - Value *ScalarIV = CreateScalarIV(Step); - if (State.VF.isVector()) { - buildScalarSteps(ScalarIV, Step, IndDesc, this, State); - return; + Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); + Value *DerivedIV = + emitTransformedIndex(State.Builder, CanonicalIV, + getStartValue()->getLiveInIRValue(), Step, IndDesc); + DerivedIV->setName("offset.idx"); + if (ResultTy != DerivedIV->getType()) { + assert(Step->getType()->isIntegerTy() && + "Truncation requires an integer step"); + DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy); } + assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); - for (unsigned Part = 0; Part < State.UF; ++Part) { - assert(!State.VF.isScalable() && "scalable vectors not yet supported."); - Value *EntryPart; - if (Step->getType()->isFloatingPointTy()) { - Value *StartIdx = - getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); - // Floating-point operations inherit FMF via the builder's flags. - Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); - EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), - ScalarIV, MulOp); - } else { - Value *StartIdx = - getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); - EntryPart = State.Builder.CreateAdd( - ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); - } - State.set(this, EntryPart, Part); - } + State.set(this, DerivedIV, VPIteration(0, 0)); +} + +void VPScalarIVStepsRecipe::execute(VPTransformState &State) { + // Fast-math-flags propagate from the original induction instruction. + IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); + if (IndDesc.getInductionBinOp() && + isa(IndDesc.getInductionBinOp())) + State.Builder.setFastMathFlags( + IndDesc.getInductionBinOp()->getFastMathFlags()); + + Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0)); + Value *Step = State.get(getStepValue(), VPIteration(0, 0)); + + buildScalarSteps(BaseIV, Step, IndDesc, this, State); } void VPInterleaveRecipe::execute(VPTransformState &State) { @@ -9536,9 +9637,10 @@ void VPReductionRecipe::execute(VPTransformState &State) { } void VPReplicateRecipe::execute(VPTransformState &State) { + Instruction *UI = getUnderlyingInstr(); if (State.Instance) { // Generate a single instance. 
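[editor note] The VPDerivedIVRecipe / VPScalarIVStepsRecipe split above factors induction codegen into "derive the IV value from the canonical IV" and "fan out per-lane steps". For the plain integer case the arithmetic reduces roughly to the sketch below (scalar stand-ins, not recipe code):

  #include <cstdint>

  // VPDerivedIVRecipe: offset.idx = Start + CanonicalIV * Step
  int64_t derivedIV(int64_t Start, int64_t CanonicalIV, int64_t Step) {
    return Start + CanonicalIV * Step;
  }

  // VPScalarIVStepsRecipe: value for (Part, Lane) = BaseIV + (Part * VF + Lane) * Step
  int64_t scalarIVStep(int64_t BaseIV, unsigned Part, unsigned Lane, unsigned VF,
                       int64_t Step) {
    return BaseIV + int64_t(Part * VF + Lane) * Step;
  }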
     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+    State.ILV->scalarizeInstruction(UI, this, *State.Instance,
                                     IsPredicated, State);
     // Insert scalar instance packing it into a vector.
     if (AlsoPack && State.VF.isVector()) {
@@ -9546,7 +9648,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
       if (State.Instance->Lane.isFirstLane()) {
         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
         Value *Poison = PoisonValue::get(
-            VectorType::get(getUnderlyingValue()->getType(), State.VF));
+            VectorType::get(UI->getType(), State.VF));
         State.set(this, Poison, State.Instance->Part);
       }
       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
@@ -9555,12 +9657,36 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   if (IsUniform) {
+    // If the recipe is uniform across all parts (instead of just per VF), only
+    // generate a single instance.
+    if ((isa(UI) || isa(UI)) &&
+        all_of(operands(), [](VPValue *Op) {
+          return Op->isDefinedOutsideVectorRegions();
+        })) {
+      State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
+                                      State);
+      if (user_begin() != user_end()) {
+        for (unsigned Part = 1; Part < State.UF; ++Part)
+          State.set(this, State.get(this, VPIteration(0, 0)),
+                    VPIteration(Part, 0));
+      }
+      return;
+    }
+
     // Uniform within VL means we need to generate lane 0 only for each
     // unrolled copy.
     for (unsigned Part = 0; Part < State.UF; ++Part)
-      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
-                                      VPIteration(Part, 0), IsPredicated,
-                                      State);
+      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
+                                      IsPredicated, State);
+    return;
+  }
+
+  // A store of a loop varying value to a loop invariant address only
+  // needs the last copy of the store.
+  if (isa(UI) && !getOperand(1)->hasDefiningRecipe()) {
+    auto Lane = VPLane::getLastLaneForVF(State.VF);
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), IsPredicated,
+                                    State);
     return;
   }
 
@@ -9569,9 +9695,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   const unsigned EndLane = State.VF.getKnownMinValue();
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
-                                      VPIteration(Part, Lane), IsPredicated,
-                                      State);
+      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
+                                      IsPredicated, State);
 }
 
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
@@ -9709,7 +9834,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
-    LoopVectorizationLegality &LVL) {
+    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
   // 1) OptSize takes precedence over all other options, i.e. if this is set,
   // don't look at hints or options, and don't request a scalar epilogue.
   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
@@ -9744,7 +9869,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
   };
 
   // 4) if the TTI hook indicates this is profitable, request predication.
- if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL)) + if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI)) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; @@ -9770,15 +9895,14 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) { return ScalarValue; } - auto *RepR = dyn_cast(Def); - bool IsUniform = RepR && RepR->isUniform(); + bool IsUniform = vputils::isUniformAfterVectorization(Def); unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; // Check if there is a scalar value for the selected lane. if (!hasScalarValue(Def, {Part, LastLane})) { - // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. - assert((isa(Def->getDef()) || - isa(Def->getDef())) && + // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform. + assert((isa(Def->getDefiningRecipe()) || + isa(Def->getDefiningRecipe())) && "unexpected recipe found to be invariant"); IsUniform = true; LastLane = 0; @@ -9839,7 +9963,7 @@ static bool processLoopInVPlanNativePath( InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); ScalarEpilogueLowering SEL = getScalarEpilogueLowering( - F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); @@ -9927,7 +10051,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, - Optional VScale, Loop *L, + std::optional VScale, Loop *L, ScalarEvolution &SE) { InstructionCost CheckCost = Checks.getCost(); if (!CheckCost.isValid()) @@ -10075,7 +10199,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements; - LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, + LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); @@ -10083,11 +10207,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - ScalarEpilogueLowering SEL = getScalarEpilogueLowering( - F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); - // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. Since we cannot modify @@ -10099,6 +10218,22 @@ bool LoopVectorizePass::processLoop(Loop *L) { assert(L->isInnermost() && "Inner loop expected."); + InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) + UseInterleaved = EnableInterleavedMemAccesses; + + // Analyze interleaved memory accesses. + if (UseInterleaved) + IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + + // Check the function attributes and profiles to find out if this function + // should be optimized for size. 
+  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
+
   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
@@ -10109,15 +10244,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
     else {
-      LLVM_DEBUG(dbgs() << "\n");
-      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
+        LLVM_DEBUG(dbgs() << "\n");
+        SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+      } else {
+        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
+                             "small to consider vectorizing.\n");
+        reportVectorizationFailure(
+            "The trip count is below the minimal threshold value.",
+            "loop trip count is too low, avoiding vectorization",
+            "LowTripCount", ORE, L);
+        Hints.emitRemarkWithHints();
+        return false;
+      }
     }
   }
 
-  // Check the function attributes to see if implicit floats are allowed.
-  // FIXME: This check doesn't seem possibly correct -- what if the loop is
-  // an integer loop and the vector instructions selected are purely integer
-  // vector instructions?
+  // Check the function attributes to see if implicit floats or vectors are
+  // allowed.
   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
     reportVectorizationFailure(
         "Can't vectorize when the NoImplicitFloat attribute is used",
@@ -10162,18 +10306,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
-  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
-
-  // If an override option has been passed in for interleaved accesses, use it.
-  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
-    UseInterleaved = EnableInterleavedMemAccesses;
-
-  // Analyze interleaved memory accesses.
-  if (UseInterleaved) {
-    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
-  }
-
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                 F, &Hints, IAI);
@@ -10188,7 +10320,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   unsigned UserIC = Hints.getInterleave();
 
   // Plan how to best vectorize, return the best VF and its cost.
-  Optional MaybeVF = LVP.plan(UserVF, UserIC);
+  std::optional MaybeVF = LVP.plan(UserVF, UserIC);
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
@@ -10198,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (MaybeVF) {
     VF = *MaybeVF;
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
     unsigned SelectedIC = std::max(IC, UserIC);
 
     // Optimistically generate runtime checks if they are needed. Drop them if
@@ -10360,16 +10492,39 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
       Header->setName("vec.epilog.vector.body");
 
-      // Ensure that the start values for any VPReductionPHIRecipes are
-      // updated before vectorising the epilogue loop.
+      // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
+      // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
+      // before vectorizing the epilogue loop.
for (VPRecipeBase &R : Header->phis()) { + if (isa(&R)) + continue; + + Value *ResumeV = nullptr; + // TODO: Move setting of resume values to prepareToExecute. if (auto *ReductionPhi = dyn_cast(&R)) { - if (auto *Resume = MainILV.getReductionResumeValue( - ReductionPhi->getRecurrenceDescriptor())) { - VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); - ReductionPhi->setOperand(0, StartVal); + ResumeV = MainILV.getReductionResumeValue( + ReductionPhi->getRecurrenceDescriptor()); + } else { + // Create induction resume values for both widened pointer and + // integer/fp inductions and update the start value of the induction + // recipes to use the resume value. + PHINode *IndPhi = nullptr; + const InductionDescriptor *ID; + if (auto *Ind = dyn_cast(&R)) { + IndPhi = cast(Ind->getUnderlyingValue()); + ID = &Ind->getInductionDescriptor(); + } else { + auto *WidenInd = cast(&R); + IndPhi = WidenInd->getPHINode(); + ID = &WidenInd->getInductionDescriptor(); } + + ResumeV = MainILV.createInductionResumeValue( + IndPhi, *ID, {EPI.MainLoopIterationCountCheck}); } + assert(ResumeV && "Must have a resume value"); + VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV); + cast(&R)->setStartValue(StartVal); } LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, @@ -10407,11 +10562,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { checkMixedPrecision(L, ORE); } - Optional RemainderLoopID = + std::optional RemainderLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); if (RemainderLoopID) { - L->setLoopID(RemainderLoopID.value()); + L->setLoopID(*RemainderLoopID); } else { if (DisableRuntimeUnroll) AddRuntimeUnrollDisableMetaData(L); @@ -10427,8 +10582,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LoopVectorizeResult LoopVectorizePass::runImpl( Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, - DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, - std::function &GetLAA_, + DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; LI = &LI_; @@ -10436,9 +10590,8 @@ LoopVectorizeResult LoopVectorizePass::runImpl( DT = &DT_; BFI = &BFI_; TLI = TLI_; - AA = &AA_; AC = &AC_; - GetLAA = &GetLAA_; + LAIs = &LAIs_; DB = &DB_; ORE = &ORE_; PSI = PSI_; @@ -10461,7 +10614,7 @@ LoopVectorizeResult LoopVectorizePass::runImpl( // legality and profitability checks. This means running the loop vectorizer // will simplify all loops, regardless of whether anything end up being // vectorized. - for (auto &L : *LI) + for (const auto &L : *LI) Changed |= CFGChanged |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); @@ -10484,6 +10637,9 @@ LoopVectorizeResult LoopVectorizePass::runImpl( Changed |= formLCSSARecursively(*L, *DT, LI, SE); Changed |= CFGChanged |= processLoop(L); + + if (Changed) + LAIs->clear(); } // Process each loop nest in the function. 
@@ -10502,23 +10658,16 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &DT = AM.getResult(F); auto &BFI = AM.getResult(F); auto &TLI = AM.getResult(F); - auto &AA = AM.getResult(F); auto &AC = AM.getResult(F); auto &DB = AM.getResult(F); auto &ORE = AM.getResult(F); - auto &LAM = AM.getResult(F).getManager(); - std::function GetLAA = - [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, nullptr, nullptr}; - return LAM.getResult(L, AR); - }; + LoopAccessInfoManager &LAIs = AM.getResult(F); auto &MAMProxy = AM.getResult(F); ProfileSummaryInfo *PSI = MAMProxy.getCachedResult(*F.getParent()); LoopVectorizeResult Result = - runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); + runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI); if (!Result.MadeAnyChange) return PreservedAnalyses::all(); PreservedAnalyses PA; -- cgit v1.2.3
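Note on the new-pass-manager hunks above: LoopVectorizePass::run now takes its memory-dependence results from a per-function LoopAccessInfoManager obtained directly from the FunctionAnalysisManager, instead of building a GetLAA callback over the inner loop analysis manager, and runImpl now receives that manager as a parameter. The sketch below is not part of the patch; it uses a hypothetical pass name (LAIReportPass) and illustrative output purely to show the same retrieval pattern in a standalone function pass, assuming headers from around this revision.

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

namespace {
// Hypothetical analysis-only pass; only the analysis-manager plumbing mirrors
// the pattern the hunks above introduce in LoopVectorizePass::run.
struct LAIReportPass : PassInfoMixin<LAIReportPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    // One manager per function; per-loop LoopAccessInfo is computed lazily.
    LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
    LoopInfo &LI = AM.getResult<LoopAnalysis>(F);

    for (Loop *L : LI.getLoopsInPreorder()) {
      const LoopAccessInfo &LAI = LAIs.getInfo(*L);
      errs() << "loop " << L->getName()
             << ": canVectorizeMemory=" << LAI.canVectorizeMemory()
             << " runtime-checks=" << LAI.getNumRuntimePointerChecks() << "\n";
    }
    // Nothing in the IR is modified by this reporting pass.
    return PreservedAnalyses::all();
  }
};
} // namespace

Because the manager caches per-loop results, a pass that goes on to transform loops after querying it should drop the cached entries, which is what the LAIs->clear() call added to runImpl above does once processLoop reports a change.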