author    | Dimitry Andric <dim@FreeBSD.org> | 2022-07-14 18:58:48 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2023-02-08 19:03:59 +0000
commit    | 753f127f3ace09432b2baeffd71a308760641a62 (patch)
tree      | 97694ab339c0ca6145ebb429c7505019565b9a60 /contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent    | 81ad626541db97eb356e2c1d4a20eb2a26a766ab (diff)
parent    | 1f917f69ff07f09b6dbb670971f57f8efe718b84 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 733
1 file changed, 315 insertions, 418 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b637b2d5ddae..0777a1385916 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -196,10 +196,9 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); -static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( - "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum allowed number of runtime memory checks with a " - "vectorize(enable) pragma.")); +static cl::opt<unsigned> VectorizeMemoryCheckThreshold( + "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks")); // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this lists all options. I.e., the @@ -442,6 +441,7 @@ public: const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, + ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) @@ -453,6 +453,11 @@ public: // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); + + if (MinProfitableTripCount.isZero()) + this->MinProfitableTripCount = VecWidth; + else + this->MinProfitableTripCount = MinProfitableTripCount; } virtual ~InnerLoopVectorizer() = default; @@ -656,6 +661,8 @@ protected: /// vector elements. ElementCount VF; + ElementCount MinProfitableTripCount; + /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -735,6 +742,7 @@ public: LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, + ElementCount::getFixed(1), ElementCount::getFixed(1), UnrollFactor, LVL, CM, BFI, PSI, Check) {} @@ -783,8 +791,8 @@ public: BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, - Checks), + EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, + CM, BFI, PSI, Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -1018,7 +1026,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes( if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) || isa<VPScalarIVStepsRecipe>(CurRec) || - isa<VPCanonicalIVPHIRecipe>(CurRec)) + isa<VPCanonicalIVPHIRecipe>(CurRec) || + isa<VPActiveLaneMaskPHIRecipe>(CurRec)) continue; // This recipe contributes to the address computation of a widen @@ -1503,6 +1512,13 @@ public: /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } + /// Returns true if were tail-folding and want to use the active lane mask + /// for vector loop control flow. 
+ bool useActiveLaneMaskForControlFlow() const { + return FoldTailByMasking && + TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; + } + /// Returns true if the instructions in this block requires predication /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. @@ -1551,14 +1567,14 @@ public: Scalars.clear(); } -private: - unsigned NumPredStores = 0; - /// Convenience function that returns the value of vscale_range iff /// vscale_range.min == vscale_range.max or otherwise returns the value /// returned by the corresponding TLI method. Optional<unsigned> getVScaleForTuning() const; +private: + unsigned NumPredStores = 0; + /// \return An upper bound for the vectorization factors for both /// fixed and scalable vectorization, where the minimum-known number of /// elements is a power-of-2 larger than zero. If scalable vectorization is @@ -1661,7 +1677,8 @@ private: /// A set containing all BasicBlocks that are known to present after /// vectorization as a predicated block. - SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; + DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> + PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at /// least once. This may be needed as a fallback loop in case runtime @@ -1849,14 +1866,17 @@ class GeneratedRTChecks { DominatorTree *DT; LoopInfo *LI; + TargetTransformInfo *TTI; SCEVExpander SCEVExp; SCEVExpander MemCheckExp; + bool CostTooHigh = false; + public: GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - const DataLayout &DL) - : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), + TargetTransformInfo *TTI, const DataLayout &DL) + : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), MemCheckExp(SE, DL, "scev.check") {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can @@ -1867,6 +1887,15 @@ public: void Create(Loop *L, const LoopAccessInfo &LAI, const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { + // Hard cutoff to limit compile-time increase in case a very large number of + // runtime checks needs to be generated. + // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to + // profile info. 
+ CostTooHigh = + LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; + if (CostTooHigh) + return; + BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -1938,6 +1967,44 @@ public: } } + InstructionCost getCost() { + if (SCEVCheckBlock || MemCheckBlock) + LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); + + if (CostTooHigh) { + InstructionCost Cost; + Cost.setInvalid(); + LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); + return Cost; + } + + InstructionCost RTCheckCost = 0; + if (SCEVCheckBlock) + for (Instruction &I : *SCEVCheckBlock) { + if (SCEVCheckBlock->getTerminator() == &I) + continue; + InstructionCost C = + TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); + RTCheckCost += C; + } + if (MemCheckBlock) + for (Instruction &I : *MemCheckBlock) { + if (MemCheckBlock->getTerminator() == &I) + continue; + InstructionCost C = + TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); + RTCheckCost += C; + } + + if (SCEVCheckBlock || MemCheckBlock) + LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost + << "\n"); + + return RTCheckCost; + } + /// Remove the created SCEV & memory runtime check blocks & instructions, if /// unused. ~GeneratedRTChecks() { @@ -2880,9 +2947,16 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { // If tail is to be folded, vector loop takes care of all iterations. Type *CountTy = Count->getType(); Value *CheckMinIters = Builder.getFalse(); - Value *Step = createStepForVF(Builder, CountTy, VF, UF); + auto CreateStep = [&]() { + // Create step with max(MinProTripCount, UF * VF). + if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue()) + return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); + return createStepForVF(Builder, CountTy, VF, UF); + }; + if (!Cost->foldTailByMasking()) - CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); + CheckMinIters = + Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); else if (VF.isScalable()) { // vscale is not necessarily a power-of-2, which means we cannot guarantee // an overflow to zero when updating induction variables and so an @@ -2894,8 +2968,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); // Don't execute the vector loop if (UMax - n) < (VF * UF). - CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); + CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); } + // Create new preheader for vector loop. LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, @@ -2920,7 +2995,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { } BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { - BasicBlock *const SCEVCheckBlock = RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); if (!SCEVCheckBlock) @@ -4792,7 +4866,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { MaxVScale = TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); MaxScalableVF = ElementCount::getScalable( - MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); + MaxVScale ? 
(MaxSafeElements / MaxVScale.value()) : 0); if (!MaxScalableVF) reportVectorizationInfo( "Max legal vector width too small, scalable vectorization " @@ -5187,9 +5261,9 @@ bool LoopVectorizationCostModel::isMoreProfitable( unsigned EstimatedWidthB = B.Width.getKnownMinValue(); if (Optional<unsigned> VScale = getVScaleForTuning()) { if (A.Width.isScalable()) - EstimatedWidthA *= VScale.getValue(); + EstimatedWidthA *= VScale.value(); if (B.Width.isScalable()) - EstimatedWidthB *= VScale.getValue(); + EstimatedWidthB *= VScale.value(); } // Assume vscale may be larger than 1 (or the value being tuned for), @@ -5872,10 +5946,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) return 0; - return TTI.getRegUsageForType(VectorType::get(Ty, VF)); + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -6014,6 +6089,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { // map will indicate that we've analyzed it already. ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + PredicatedBBsAfterVectorization[VF].clear(); + // Find all the instructions that are scalar with predication in the loop and // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. @@ -6031,7 +6108,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. - PredicatedBBsAfterVectorization.insert(BB); + PredicatedBBsAfterVectorization[VF].insert(BB); } } } @@ -6896,8 +6973,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, bool ScalarPredicatedBB = false; BranchInst *BI = cast<BranchInst>(I); if (VF.isVector() && BI->isConditional() && - (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || - PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) + (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || + PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) ScalarPredicatedBB = true; if (ScalarPredicatedBB) { @@ -7363,14 +7440,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { return VectorizationFactor::Disabled(); } -bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { - unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); - return (NumRuntimePointerChecks > - VectorizerParams::RuntimeMemoryCheckThreshold && - !Hints.allowReordering()) || - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; -} - Optional<VectorizationFactor> LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -7439,7 +7508,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. 
- return CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + return VF; } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -7554,7 +7625,7 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); if (VectorizedLoopID) - L->setLoopID(VectorizedLoopID.getValue()); + L->setLoopID(VectorizedLoopID.value()); else { // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). @@ -7585,51 +7656,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif -void LoopVectorizationPlanner::collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions) { - - // We create new control-flow for the vectorized loop, so the original exit - // conditions will be dead after vectorization if it's only used by the - // terminator - SmallVector<BasicBlock*> ExitingBlocks; - OrigLoop->getExitingBlocks(ExitingBlocks); - for (auto *BB : ExitingBlocks) { - auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); - if (!Cmp || !Cmp->hasOneUse()) - continue; - - // TODO: we should introduce a getUniqueExitingBlocks on Loop - if (!DeadInstructions.insert(Cmp).second) - continue; - - // The operands of the icmp is often a dead trunc, used by IndUpdate. - // TODO: can recurse through operands in general - for (Value *Op : Cmp->operands()) { - if (isa<TruncInst>(Op) && Op->hasOneUse()) - DeadInstructions.insert(cast<Instruction>(Op)); - } - } - - // We create new "steps" for induction variable updates to which the original - // induction variables map. An original update instruction will be dead if - // all its users except the induction variable are dead. - auto *Latch = OrigLoop->getLoopLatch(); - for (auto &Induction : Legal->getInductionVars()) { - PHINode *Ind = Induction.first; - auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); - - // If the tail is to be folded by masking, the primary induction variable, - // if exists, isn't dead: it will be used for masking. Don't kill it. - if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) - continue; - - if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.count(cast<Instruction>(U)); - })) - DeadInstructions.insert(IndUpdate); - } -} - Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } //===--------------------------------------------------------------------===// @@ -8001,11 +8027,19 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + assert(CM.foldTailByMasking() && "must fold the tail"); + + // If we're using the active lane mask for control flow, then we get the + // mask from the active lane mask PHI that is cached in the VPlan. + PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask(); + if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow) + return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi(); + // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. 
Start by // constructing the desired canonical IV in the header block as its first // non-phi instructions. - assert(CM.foldTailByMasking() && "must fold the tail"); + VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); @@ -8014,9 +8048,10 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - if (CM.TTI.emitGetActiveLaneMask()) { + if (EmitGetActiveLaneMask != PredicationStyle::None) { VPValue *TC = Plan->getOrCreateTripCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}, + nullptr, "active.lane.mask"); } else { VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); @@ -8409,9 +8444,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( return RegSucc; } -VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, - VPRecipeBase *PredRecipe, - VPlanPtr &Plan) { +VPRegionBlock *VPRecipeBuilder::createReplicateRegion( + Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) { // Instructions marked for predication are replicated and placed under an // if-then construct to prevent side-effects. @@ -8425,7 +8459,7 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); auto *PHIRecipe = Instr->getType()->isVoidTy() ? nullptr - : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); + : new VPPredInstPHIRecipe(PredRecipe); if (PHIRecipe) { Plan->removeVPValueFor(Instr); Plan->addVPValue(Instr, PHIRecipe); @@ -8517,19 +8551,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - // Add assume instructions we need to drop to DeadInstructions, to prevent // them from being added to the VPlan. // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. + SmallPtrSet<Instruction *, 4> DeadInstructions; auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); @@ -8565,32 +8591,84 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, } } -// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a -// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a -// BranchOnCount VPInstruction to the latch. +// Add the necessary canonical IV and branch recipes required to control the +// loop. 
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - bool HasNUW) { + bool HasNUW, + bool UseLaneMaskForLoopControlFlow) { Value *StartIdx = ConstantInt::get(IdxTy, 0); auto *StartV = Plan.getOrAddVPValue(StartIdx); + // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); + // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar + // IV by VF * UF. auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, - {CanonicalIVPHI}, DL); + {CanonicalIVPHI}, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); - auto *BranchOnCount = - new VPInstruction(VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchOnCount); + if (UseLaneMaskForLoopControlFlow) { + // Create the active lane mask instruction in the vplan preheader. + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {StartV}, DL, "index.part.next"); + Preheader->appendRecipe(CanonicalIVIncrementParts); + + // Create the ActiveLaneMask instruction using the correct start values. + VPValue *TC = Plan.getOrCreateTripCount(); + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.entry"); + Preheader->appendRecipe(EntryALM); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. + auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); + + // Create the active lane mask for the next iteration of the loop. + CanonicalIVIncrementParts = + new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW + : VPInstruction::CanonicalIVIncrementForPart, + {CanonicalIVIncrement}, DL); + EB->appendRecipe(CanonicalIVIncrementParts); + + auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.next"); + EB->appendRecipe(ALM); + LaneMaskPhi->addOperand(ALM); + + // We have to invert the mask here because a true condition means jumping + // to the exit block. + auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); + EB->appendRecipe(NotMask); + + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); + EB->appendRecipe(BranchBack); + } else { + // Add the BranchOnCount VPInstruction to the latch. + VPInstruction *BranchBack = new VPInstruction( + VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); + } } // Add exit values to \p Plan. 
VPLiveOuts are added for each LCSSA phi in the @@ -8691,7 +8769,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking()); + !CM.foldTailByMasking(), + CM.useActiveLaneMaskForControlFlow()); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -8961,8 +9040,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); VPlanTransforms::sinkScalarOperands(*Plan); - VPlanTransforms::mergeReplicateRegions(*Plan); VPlanTransforms::removeDeadRecipes(*Plan); + VPlanTransforms::mergeReplicateRegions(*Plan); VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); // Fold Exit block into its predecessor if possible. @@ -9006,7 +9085,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true); + true, CM.useActiveLaneMaskForControlFlow()); return Plan; } @@ -9078,7 +9157,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); Plan->removeVPValueFor(R); Plan->addVPValue(R, RedRecipe); - WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); + // Append the recipe to the end of the VPBasicBlock because we need to + // ensure that it comes after all of it's inputs, including CondOp. + WidenRecipe->getParent()->appendRecipe(RedRecipe); WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); WidenRecipe->eraseFromParent(); @@ -9151,229 +9232,6 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { *this, State); } -void VPWidenSelectRecipe::execute(VPTransformState &State) { - auto &I = *cast<SelectInst>(getUnderlyingInstr()); - State.setDebugLocFromInst(&I); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - auto *InvarCond = - InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); - Value *Op0 = State.get(getOperand(1), Part); - Value *Op1 = State.get(getOperand(2), Part); - Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); - State.set(this, Sel, Part); - State.addMetadata(Sel, &I); - } -} - -void VPWidenRecipe::execute(VPTransformState &State) { - auto &I = *cast<Instruction>(getUnderlyingValue()); - auto &Builder = State.Builder; - switch (I.getOpcode()) { - case Instruction::Call: - case Instruction::Br: - case Instruction::PHI: - case Instruction::GetElementPtr: - case Instruction::Select: - llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::FNeg: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen unops and binops. - State.setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - SmallVector<Value *, 2> Ops; - for (VPValue *VPOp : operands()) - Ops.push_back(State.get(VPOp, Part)); - - Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); - - if (auto *VecOp = dyn_cast<Instruction>(V)) { - VecOp->copyIRFlags(&I); - - // If the instruction is vectorized and was in a basic block that needed - // predication, we can't propagate poison-generating flags (nuw/nsw, - // exact, etc.). The control flow has been linearized and the - // instruction is no longer guarded by the predicate, which could make - // the flag properties to no longer hold. - if (State.MayGeneratePoisonRecipes.contains(this)) - VecOp->dropPoisonGeneratingFlags(); - } - - // Use this vector value for all users of the original instruction. - State.set(this, V, Part); - State.addMetadata(V, &I); - } - - break; - } - case Instruction::Freeze: { - State.setDebugLocFromInst(&I); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Op = State.get(getOperand(0), Part); - - Value *Freeze = Builder.CreateFreeze(Op); - State.set(this, Freeze, Part); - } - break; - } - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = cast<CmpInst>(&I); - State.setDebugLocFromInst(Cmp); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); - Value *C = nullptr; - if (FCmp) { - // Propagate fast math flags. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(Cmp->getFastMathFlags()); - C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - } else { - C = Builder.CreateICmp(Cmp->getPredicate(), A, B); - } - State.set(this, C, Part); - State.addMetadata(C, &I); - } - - break; - } - - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - auto *CI = cast<CastInst>(&I); - State.setDebugLocFromInst(CI); - - /// Vectorize casts. - Type *DestTy = (State.VF.isScalar()) - ? 
CI->getType() - : VectorType::get(CI->getType(), State.VF); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *A = State.get(getOperand(0), Part); - Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); - State.set(this, Cast, Part); - State.addMetadata(Cast, &I); - } - break; - } - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. -} - -void VPWidenGEPRecipe::execute(VPTransformState &State) { - auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - - if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = State.Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); - State.set(this, EntryPart, Part); - State.addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < State.UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = IsPtrLoopInvariant - ? State.get(getOperand(0), VPIteration(0, 0)) - : State.get(getOperand(0), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - if (IsIndexLoopInvariant[I - 1]) - Indices.push_back(State.get(Operand, VPIteration(0, 0))); - else - Indices.push_back(State.get(Operand, Part)); - } - - // If the GEP instruction is vectorized and was in a basic block that - // needed predication, we can't propagate the poison-generating 'inbounds' - // flag. The control flow has been linearized and the GEP is no longer - // guarded by the predicate, which could make the 'inbounds' properties to - // no longer hold. 
- bool IsInBounds = - GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, - Indices, "", IsInBounds); - assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(this, NewGEP, Part); - State.addMetadata(NewGEP, GEP); - } - } -} - void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); @@ -9632,45 +9490,6 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) { } } -void VPBlendRecipe::execute(VPTransformState &State) { - State.setDebugLocFromInst(Phi); - // We know that all PHIs in non-header blocks are converted into - // selects, so we don't have to worry about the insertion order and we - // can just use the builder. - // At this point we generate the predication tree. There may be - // duplications since this is a simple recursive scan, but future - // optimizations will clean it up. - - unsigned NumIncoming = getNumIncomingValues(); - - // Generate a sequence of selects of the form: - // SELECT(Mask3, In3, - // SELECT(Mask2, In2, - // SELECT(Mask1, In1, - // In0))) - // Note that Mask0 is never used: lanes for which no path reaches this phi and - // are essentially undef are taken from In0. - InnerLoopVectorizer::VectorParts Entry(State.UF); - for (unsigned In = 0; In < NumIncoming; ++In) { - for (unsigned Part = 0; Part < State.UF; ++Part) { - // We might have single edge PHIs (blocks) - use an identity - // 'select' for the first PHI operand. - Value *In0 = State.get(getIncomingValue(In), Part); - if (In == 0) - Entry[Part] = In0; // Initialize with the first incoming value. - else { - // Select between the current value and the previous incoming edge - // based on the incoming mask. - Value *Cond = State.get(getMask(In), Part); - Entry[Part] = - State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); - } - } - } - for (unsigned Part = 0; Part < State.UF; ++Part) - State.set(this, Entry[Part], Part); -} - void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), @@ -9758,32 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State); } -void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Branch on Mask works only on single instance."); - - unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane.getKnownLane(); - - Value *ConditionBit = nullptr; - VPValue *BlockInMask = getMask(); - if (BlockInMask) { - ConditionBit = State.get(BlockInMask, Part); - if (ConditionBit->getType()->isVectorTy()) - ConditionBit = State.Builder.CreateExtractElement( - ConditionBit, State.Builder.getInt32(Lane)); - } else // Block in mask is all-one. - ConditionBit = State.Builder.getTrue(); - - // Replace the temporary unreachable terminator with a new conditional branch, - // whose two destinations will be set later when they are created. 
- auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert(isa<UnreachableInst>(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional branch."); - auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); - CondBr->setSuccessor(0, nullptr); - ReplaceInstWithInst(CurrentTerminator, CondBr); -} - void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = @@ -10103,8 +9896,7 @@ static bool processLoopInVPlanNativePath( // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10123,10 +9915,10 @@ static bool processLoopInVPlanNativePath( VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); { - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); @@ -10183,6 +9975,105 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { } } +static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, + VectorizationFactor &VF, + Optional<unsigned> VScale, Loop *L, + ScalarEvolution &SE) { + InstructionCost CheckCost = Checks.getCost(); + if (!CheckCost.isValid()) + return false; + + // When interleaving only scalar and vector cost will be equal, which in turn + // would lead to a divide by 0. Fall back to hard threshold. + if (VF.Width.isScalar()) { + if (CheckCost > VectorizeMemoryCheckThreshold) { + LLVM_DEBUG( + dbgs() + << "LV: Interleaving only is not profitable due to runtime checks\n"); + return false; + } + return true; + } + + // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. + double ScalarC = *VF.ScalarCost.getValue(); + if (ScalarC == 0) + return true; + + // First, compute the minimum iteration count required so that the vector + // loop outperforms the scalar loop. + // The total cost of the scalar loop is + // ScalarC * TC + // where + // * TC is the actual trip count of the loop. + // * ScalarC is the cost of a single scalar iteration. + // + // The total cost of the vector loop is + // RtC + VecC * (TC / VF) + EpiC + // where + // * RtC is the cost of the generated runtime checks + // * VecC is the cost of a single vector iteration. + // * TC is the actual trip count of the loop + // * VF is the vectorization factor + // * EpiCost is the cost of the generated epilogue, including the cost + // of the remaining scalar operations. 
+ // + // Vectorization is profitable once the total vector cost is less than the + // total scalar cost: + // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC + // + // Now we can compute the minimum required trip count TC as + // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC + // + // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that + // the computations are performed on doubles, not integers and the result + // is rounded up, hence we get an upper estimate of the TC. + unsigned IntVF = VF.Width.getKnownMinValue(); + if (VF.Width.isScalable()) { + unsigned AssumedMinimumVscale = 1; + if (VScale) + AssumedMinimumVscale = *VScale; + IntVF *= AssumedMinimumVscale; + } + double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; + double RtC = *CheckCost.getValue(); + double MinTC1 = RtC / (ScalarC - VecCOverVF); + + // Second, compute a minimum iteration count so that the cost of the + // runtime checks is only a fraction of the total scalar loop cost. This + // adds a loop-dependent bound on the overhead incurred if the runtime + // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC + // * TC. To bound the runtime check to be a fraction 1/X of the scalar + // cost, compute + // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC + double MinTC2 = RtC * 10 / ScalarC; + + // Now pick the larger minimum. If it is not a multiple of VF, choose the + // next closest multiple of VF. This should partly compensate for ignoring + // the epilogue cost. + uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); + VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF)); + + LLVM_DEBUG( + dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" + << VF.MinProfitableTripCount << "\n"); + + // Skip vectorization if the expected trip count is less than the minimum + // required trip count. + if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { + if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), + VF.MinProfitableTripCount)) { + LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " + "trip count < minimum profitable VF (" + << *ExpectedTC << " < " << VF.MinProfitableTripCount + << ")\n"); + + return false; + } + } + return true; +} + LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || !EnableLoopInterleaving), @@ -10340,8 +10231,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectElementTypesForWidening(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, - Requirements, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); @@ -10353,10 +10243,25 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); if (MaybeVF) { - if (LVP.requiresTooManyRuntimeChecks()) { + VF = *MaybeVF; + // Select the interleave count. + IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); + + unsigned SelectedIC = std::max(IC, UserIC); + // Optimistically generate runtime checks if they are needed. Drop them if + // they turn out to not be profitable. 
+ if (VF.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + + // Check if it is profitable to vectorize with runtime checks. + bool ForceVectorization = + Hints.getForce() == LoopVectorizeHints::FK_Enabled; + if (!ForceVectorization && + !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, + *PSE.getSE())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), @@ -10368,15 +10273,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.emitRemarkWithHints(); return false; } - VF = *MaybeVF; - // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); - - unsigned SelectedIC = std::max(IC, UserIC); - // Optimistically generate runtime checks if they are needed. Drop them if - // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); } // Identify the diagnostic messages that should be produced. @@ -10533,8 +10429,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, + VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, + PSI, Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); @@ -10564,7 +10461,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupEpilogue}); if (RemainderLoopID) { - L->setLoopID(RemainderLoopID.getValue()); + L->setLoopID(RemainderLoopID.value()); } else { if (DisableRuntimeUnroll) AddRuntimeUnrollDisableMetaData(L); |
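
The new areRuntimeChecksProfitable helper added in this diff derives a minimum trip count from the scalar iteration cost, the vector iteration cost, and the runtime-check cost before committing to vectorization with runtime checks. Below is a minimal standalone sketch (plain C++, not LLVM code) of that arithmetic, following the formulas in the patch's comments; the function name, the use of plain doubles instead of InstructionCost, and the example numbers are illustrative assumptions, not the actual implementation.

// Standalone sketch of the minimum-trip-count computation described in the
// diff's areRuntimeChecksProfitable comments. Assumes the vector body is
// cheaper per element than the scalar body (VecC / VF < ScalarC), and
// ignores the epilogue cost, as the patch itself does.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Returns the smallest trip count for which vectorizing with runtime checks
// is expected to pay off, rounded up to a multiple of VF.
uint64_t minProfitableTripCount(double ScalarC, // cost of one scalar iteration
                                double VecC,    // cost of one vector iteration
                                double RtC,     // cost of the runtime checks
                                unsigned VF) {  // vectorization factor
  // Vector loop beats scalar loop once
  //   RtC + VecC * (TC / VF) < ScalarC * TC
  // i.e. TC > RtC / (ScalarC - VecC / VF).
  double MinTC1 = RtC / (ScalarC - VecC / VF);
  // Also require the checks to cost at most 1/10 of the scalar loop:
  //   RtC < ScalarC * TC / 10  ==>  TC > RtC * 10 / ScalarC.
  double MinTC2 = RtC * 10 / ScalarC;
  uint64_t MinTC = static_cast<uint64_t>(std::ceil(std::max(MinTC1, MinTC2)));
  // Round up to the next multiple of VF, partly compensating for the
  // ignored epilogue cost.
  return (MinTC + VF - 1) / VF * VF;
}

int main() {
  // Hypothetical example: scalar iteration costs 4, a VF=4 vector iteration
  // costs 6, and the generated runtime checks cost 20.
  std::cout << minProfitableTripCount(4.0, 6.0, 20.0, 4) << "\n"; // prints 52
  return 0;
}

In this example the break-even bound (MinTC1 = 8) is dominated by the 1/10 overhead bound (MinTC2 = 50), which is then rounded up to the next multiple of VF, matching how the patch stores the result in VF.MinProfitableTripCount and later compares it against the expected trip count.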