Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 738
1 file changed, 479 insertions(+), 259 deletions(-)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 46265e3f3e13..8f0bf70f873c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -177,6 +177,14 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); +// Indicates that an epilogue is undesired, predication is preferred. +// This means that the vectorizer will try to fold the loop-tail (epilogue) +// into the loop and predicate the loop body accordingly. +static cl::opt<bool> PreferPredicateOverEpilog( + "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, + cl::desc("Indicate that an epilogue is undesired, predication should be " + "used instead.")); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -347,6 +355,29 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } +/// Returns "best known" trip count for the specified loop \p L as defined by +/// the following procedure: +/// 1) Returns exact trip count if it is known. +/// 2) Returns expected trip count according to profile data if any. +/// 3) Returns upper bound estimate if it is known. +/// 4) Returns None if all of the above failed. +static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { + // Check if exact trip count is known. + if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + return ExpectedTC; + + // Check if there is an expected trip count available from profile data. + if (LoopVectorizeWithBlockFrequency) + if (auto EstimatedTC = getLoopEstimatedTripCount(L)) + return EstimatedTC; + + // Check if upper bound estimate is known. + if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + return ExpectedTC; + + return None; +} + namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -795,6 +826,59 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) B.SetCurrentDebugLocation(DebugLoc()); } +/// Write a record \p DebugMsg about vectorization failure to the debug +/// output stream. If \p I is passed, it is an instruction that prevents +/// vectorization. +#ifndef NDEBUG +static void debugVectorizationFailure(const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: Not vectorizing: " << DebugMsg; + if (I != nullptr) + dbgs() << " " << *I; + else + dbgs() << '.'; + dbgs() << '\n'; +} +#endif + +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. +static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, + StringRef RemarkName, Loop *TheLoop, Instruction *I) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. 
+ if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + +namespace llvm { + +void reportVectorizationFailure(const StringRef DebugMsg, + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { + LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); + ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), + ORETag, TheLoop, I) << OREMsg); +} + +} // end namespace llvm + #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. static std::string getDebugLocString(const Loop *L) { @@ -836,6 +920,26 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, namespace llvm { +// Loop vectorization cost-model hints how the scalar epilogue loop should be +// lowered. +enum ScalarEpilogueLowering { + + // The default: allowing scalar epilogues. + CM_ScalarEpilogueAllowed, + + // Vectorization with OptForSize: don't allow epilogues. + CM_ScalarEpilogueNotAllowedOptSize, + + // A special case of vectorisation with OptForSize: loops with a very small + // trip count are considered for vectorization under OptForSize, thereby + // making sure the cost of their loop body is dominant, free of runtime + // guards and scalar iteration overheads. + CM_ScalarEpilogueNotAllowedLowTripLoop, + + // Loop hint predicate indicating an epilogue is undesired. + CM_ScalarEpilogueNotNeededUsePredicate +}; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -845,20 +949,26 @@ namespace llvm { /// different operations. class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, - LoopInfo *LI, LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, + PredicatedScalarEvolution &PSE, LoopInfo *LI, + LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} + : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), + TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), + Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(bool OptForSize); + Optional<unsigned> computeMaxVF(); + + /// \return True if runtime checks are required for vectorization, and false + /// otherwise. + bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO @@ -881,8 +991,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. 
- unsigned selectInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -897,10 +1006,11 @@ public: /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. - unsigned LoopInvariantRegs; - + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. - unsigned MaxLocalUsers; + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the @@ -1080,14 +1190,16 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType); + bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType); + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine supports masked scatter operation @@ -1157,11 +1269,14 @@ public: /// to handle accesses with gaps, and there is nothing preventing us from /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); + return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); } - /// Returns true if a scalar epilogue is not allowed due to optsize. - bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + /// Returns true if a scalar epilogue is not allowed due to optsize or a + /// loop hint annotation. + bool isScalarEpilogueAllowed() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1187,7 +1302,7 @@ private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1246,15 +1361,6 @@ private: /// should be used. bool useEmulatedMaskMemRefHack(Instruction *I); - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. \return the remark object - /// that can be streamed to. 
- OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop); - } - /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. @@ -1270,13 +1376,13 @@ private: SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at - /// least once. This may be needed as a fallback loop in case runtime + /// least once. This may be needed as a fallback loop in case runtime /// aliasing/dependence checks fail, or to handle the tail/remainder /// iterations when the trip count is unknown or doesn't divide by the VF, /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. - bool IsScalarEpilogueAllowed = true; + ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -1496,7 +1602,7 @@ struct LoopVectorize : public FunctionPass { auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); @@ -2253,12 +2359,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = getLoadStorePointerOperand(Instr); - unsigned Alignment = getLoadStoreAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); - if (!Alignment) - Alignment = DL.getABITypeAlignment(ScalarDataTy); + const Align Alignment = + DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); unsigned AddressSpace = getLoadStoreAddressSpace(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2322,8 +2427,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, + Alignment.value(), MaskPart); } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -2334,10 +2439,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, } auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - Mask[Part]); + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, + Alignment.value(), Mask[Part]); else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + NewSI = + Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); } addMetadata(NewSI, SI); } @@ -2352,18 +2458,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], + NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], UndefValue::get(DataTy), "wide.masked.load"); else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), + "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -2615,8 +2721,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; - assert(!Cost->foldTailByMasking() && - "Cannot SCEV check stride or overflow when folding tail"); + assert(!BB->getParent()->hasOptSize() && + "Cannot SCEV check stride or overflow when optimizing for size"); + // Create a new block containing the stride check. BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2649,7 +2756,20 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (!MemRuntimeCheck) return; - assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); + if (BB->getParent()->hasOptSize()) { + assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && + "Cannot emit memory checks when optimizing for size, unless forced " + "to vectorize."); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", + L->getStartLoc(), L->getHeader()) + << "Code-size may be reduced by not forcing " + "vectorization, or by source-code modifications " + "eliminating the need for runtime checks " + "(e.g., adding 'restrict')."; + }); + } + // Create a new block containing the memory check. BB->setName("vector.memcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2666,7 +2786,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. 
- LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, + LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -3598,6 +3718,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { setDebugLocFromInst(Builder, LoopExitInst); + // If tail is folded by masking, the vector value to leave the loop should be + // a Select choosing between the vectorized LoopExitInst and vectorized Phi, + // instead of the former. + if (Cost->foldTailByMasking()) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecLoopExitInst = + VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Sel = nullptr; + for (User *U : VecLoopExitInst->users()) { + if (isa<SelectInst>(U)) { + assert(!Sel && "Reduction exit feeding two selects"); + Sel = U; + } else + assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); + } + assert(Sel && "Reduction exit feeds no select"); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + } + } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -4064,7 +4204,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast<CmpInst>(&I); + auto *Cmp = cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); @@ -4097,7 +4237,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto *CI = dyn_cast<CastInst>(&I); + auto *CI = cast<CastInst>(&I); setDebugLocFromInst(Builder, CI); /// Vectorize casts. @@ -4421,9 +4561,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne "Widening decision should be ready at this moment"); return WideningDecision == CM_Scalarize; } + const MaybeAlign Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? - !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); + !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4452,10 +4593,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps. - bool PredicatedAccessRequiresMasking = + bool PredicatedAccessRequiresMasking = Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) return true; @@ -4466,8 +4607,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - return isa<LoadInst>(I) ? 
TTI.isLegalMaskedLoad(Ty) - : TTI.isLegalMaskedStore(Ty); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) + : TTI.isLegalMaskedStore(Ty, Alignment); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, @@ -4675,82 +4817,96 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically - // uniform if the target can skip. - LLVM_DEBUG( - dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - - ORE->emit( - createMissedAnalysis("CantVersionLoopWithDivergentTarget") - << "runtime pointer checks needed. Not enabled for divergent target"); - - return None; - } - - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize, TC); +bool LoopVectorizationCostModel::runtimeChecksRequired() { + LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime pointer checks needed. Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); - return None; + reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", + "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; } if (!PSE.getUnionPredicate().getPredicates().empty()) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime SCEV checks needed. Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); - return None; + reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", + "runtime SCEV checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; } // FIXME: Avoid specializing for stride==1 instead of bailing out. if (!Legal->getLAI()->getSymbolicStrides().empty()) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime stride == 1 checks needed. Enable vectorization of " - "this loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", + "runtime stride == 1 checks needed. 
Enable vectorization of " + "this loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; + } + + return false; +} + +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { + if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { + // TODO: It may by useful to do since it's still likely to be dynamically + // uniform if the target can skip. + reportVectorizationFailure( + "Not inserting runtime ptr check for divergent target", + "runtime pointer checks needed. Not enabled for divergent target", + "CantVersionLoopWithDivergentTarget", ORE, TheLoop); return None; } - // If we optimize the program for size, avoid creating the tail loop. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - if (TC == 1) { - ORE->emit(createMissedAnalysis("SingleIterationLoop") - << "loop trip count is one, irrelevant for vectorization"); - LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n"); + reportVectorizationFailure("Single iteration (non) loop", + "loop trip count is one, irrelevant for vectorization", + "SingleIterationLoop", ORE, TheLoop); return None; } - // Record that scalar epilogue is not allowed. - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + switch (ScalarEpilogueStatus) { + case CM_ScalarEpilogueAllowed: + return computeFeasibleMaxVF(TC); + case CM_ScalarEpilogueNotNeededUsePredicate: + LLVM_DEBUG( + dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating predicated " + << "vector loop.\n"); + break; + case CM_ScalarEpilogueNotAllowedLowTripLoop: + // fallthrough as a special case of OptForSize + case CM_ScalarEpilogueNotAllowedOptSize: + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) + LLVM_DEBUG( + dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + else + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); + + // Bail if runtime checks are required, which are not good when optimising + // for size. + if (runtimeChecksRequired()) + return None; + break; + } - IsScalarEpilogueAllowed = !OptForSize; + // Now try the tail folding - // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. - if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); - + unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { + // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; } @@ -4759,28 +4915,30 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 
- if (Legal->canFoldTailByMasking()) { + if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; return MaxVF; } if (TC == 0) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); + reportVectorizationFailure( + "Unable to calculate the loop count due to complex control flow", + "unable to calculate the loop count due to complex control flow", + "UnknownLoopCountComplexCFG", ORE, TheLoop); return None; } - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the same time. " - "Enable vectorization of this loop with '#pragma clang loop " - "vectorize(enable)' when compiling with -Os/-Oz"); + reportVectorizationFailure( + "Cannot optimize for size and vectorize at the same time.", + "cannot optimize for size and vectorize at the same time. " + "Enable vectorization of this loop with '#pragma clang loop " + "vectorize(enable)' when compiling with -Os/-Oz", + "NoTailLoopWithOptForSize", ORE, TheLoop); return None; } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, - unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4818,8 +4976,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, } unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || - (MaximizeBandwidth && !OptForSize)) { + if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || + (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector<unsigned, 8> VFs; @@ -4832,9 +4990,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, // Select the largest VF which doesn't require more registers than existing // ones. - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { - if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + bool Selected = true; + for (auto& pair : RUs[i].MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + if (pair.second > TargetNumRegisters) + Selected = false; + } + if (Selected) { MaxVF = VFs[i]; break; } @@ -4886,10 +5049,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { } if (!EnableCondStoresVectorization && NumPredStores) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - LLVM_DEBUG( - dbgs() << "LV: No vectorization. There are conditional stores.\n"); + reportVectorizationFailure("There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, TheLoop); Width = 1; Cost = ScalarCost; } @@ -4958,8 +5120,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, - unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4975,8 +5136,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. 
We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // When we optimize for size, we don't interleave. - if (OptForSize) + if (!isScalarEpilogueAllowed()) return 1; // We used the distance for the interleave count. @@ -4988,22 +5148,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); - - if (VF == 1) { - if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumScalarRegs; - } else { - if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumVectorRegs; - } - RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + for (auto& pair : R.MaxLocalUsers) { + pair.second = std::max(pair.second, 1U); + } // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -5016,13 +5166,35 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. - unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / - R.MaxLocalUsers); + unsigned IC = UINT_MAX; - // Don't count the induction variable as interleaved. - if (EnableIndVarRegisterHeur) - IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / - std::max(1U, (R.MaxLocalUsers - 1))); + for (auto& pair : R.MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers of " + << TTI.getRegisterClassName(pair.first) << " register class\n"); + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } + unsigned MaxLocalUsers = pair.second; + unsigned LoopInvariantRegs = 0; + if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) + LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; + + unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + // Don't count the induction variable as interleaved. + if (EnableIndVarRegisterHeur) { + TmpIC = + PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); + } + + IC = std::min(IC, TmpIC); + } // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); @@ -5036,6 +5208,14 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } + // If the trip count is constant, limit the interleave count to be less than + // the trip count divided by VF. 
+ if (TC > 0) { + assert(TC >= VF && "VF exceeds trip count?"); + if ((TC / VF) < MaxInterleaveCount) + MaxInterleaveCount = (TC / VF); + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) @@ -5044,7 +5224,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, assert(LoopCost && "Non-zero loop cost expected"); // Clamp the calculated IC to be between the 1 and the max interleave count - // that the target allows. + // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; else if (IC < 1) @@ -5196,7 +5376,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { const DataLayout &DL = TheFunction->getParent()->getDataLayout(); SmallVector<RegisterUsage, 8> RUs(VFs.size()); - SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); + SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); @@ -5226,21 +5406,45 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + // Count the number of live intervals. + SmallMapVector<unsigned, unsigned, 4> RegUsage; + if (VFs[j] == 1) { - MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); - continue; + for (auto Inst : OpenIntervals) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } + } else { + collectUniformsAndScalars(VFs[j]); + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + continue; + if (isScalarAfterVectorization(Inst, VFs[j])) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } else { + unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); + else + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + } + } } - collectUniformsAndScalars(VFs[j]); - // Count the number of live intervals. - unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || - isScalarAfterVectorization(Inst, VFs[j])) - continue; - RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + + for (auto& pair : RegUsage) { + if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) + MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); + else + MaxUsages[j][pair.first] = pair.second; } - MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " @@ -5251,18 +5455,34 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - unsigned Invariant = 0; - if (VFs[i] == 1) - Invariant = LoopInvariants.size(); - else { - for (auto Inst : LoopInvariants) - Invariant += GetRegUsage(Inst->getType(), VFs[i]); + SmallMapVector<unsigned, unsigned, 4> Invariant; + + for (auto Inst : LoopInvariants) { + unsigned Usage = VFs[i] == 1 ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + if (Invariant.find(ClassID) == Invariant.end()) + Invariant[ClassID] = Usage; + else + Invariant[ClassID] += Usage; } - LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant - << '\n'); + LLVM_DEBUG({ + dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() + << " item\n"; + for (const auto &pair : MaxUsages[i]) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() + << " item\n"; + for (const auto &pair : Invariant) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + }); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -5511,7 +5731,6 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); - unsigned Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -5525,9 +5744,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. - Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + Alignment ? Alignment->value() : 0, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5552,18 +5771,20 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned Cost = 0; if (Legal->isMaskRequired(I)) - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? Alignment->value() : 0, AS); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? 
Alignment->value() : 0, AS, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5575,33 +5796,37 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, + Alignment ? Alignment->value() : 0, AS) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast<StoreInst>(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF - 1)); + TTI.getMemoryOpCost(Instruction::Store, ValTy, + Alignment ? Alignment->value() : 0, AS) + + (isLoopInvariantStoreValue + ? 0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment); + Legal->isMaskRequired(I), + Alignment ? Alignment->value() : 0); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, @@ -5626,8 +5851,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); @@ -5648,11 +5873,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF == 1) { Type *ValTy = getMemInstValueType(I); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, + Alignment ? Alignment->value() : 0, AS, I); } return getWideningCost(I, VF); } @@ -6167,8 +6393,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, - unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. 
@@ -6207,10 +6432,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize, - unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6840,8 +7064,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, // If the tail is to be folded by masking, the primary induction variable // needs to be represented in VPlan for it to model early-exit masking. - if (CM.foldTailByMasking()) + // Also, both the Phi and the live-out instruction of each reduction are + // required in order to introduce a select between them in VPlan. + if (CM.foldTailByMasking()) { NeedDef.insert(Legal->getPrimaryInduction()); + for (auto &Reduction : *Legal->getReductionVars()) { + NeedDef.insert(Reduction.first); + NeedDef.insert(Reduction.second.getLoopExitInstr()); + } + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -6873,7 +7104,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = llvm::make_unique<VPlan>(VPBB); + auto Plan = std::make_unique<VPlan>(VPBB); VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. @@ -6968,6 +7199,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // Finally, if tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the end of the latch. 
+ if (CM.foldTailByMasking()) { + Builder.setInsertPoint(VPBB); + auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + for (auto &Reduction : *Legal->getReductionVars()) { + VPValue *Phi = Plan->getVPValue(Reduction.first); + VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); + } + } + std::string PlanName; raw_string_ostream RSO(PlanName); unsigned VF = Range.Start; @@ -6993,7 +7236,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = llvm::make_unique<VPlan>(); + auto Plan = std::make_unique<VPlan>(); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -7199,6 +7442,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } +static ScalarEpilogueLowering +getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { + ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->hasOptSize() || + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + SEL = CM_ScalarEpilogueNotAllowedOptSize; + else if (PreferPredicateOverEpilog || Hints.getPredicate()) + SEL = CM_ScalarEpilogueNotNeededUsePredicate; + + return SEL; +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -7213,7 +7470,9 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an @@ -7223,15 +7482,8 @@ static bool processLoopInVPlanNativePath( // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); - // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -7310,10 +7562,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. 
- bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7325,36 +7574,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { ORE, BFI, PSI, Hints); assert(L->empty() && "Inner loop expected."); + // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. - // Prefer constant trip counts over profile data, over upper bound estimate. - unsigned ExpectedTC = 0; - bool HasExpectedTC = false; - if (const SCEVConstant *ConstExits = - dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { - const APInt &ExitsCount = ConstExits->getAPInt(); - // We are interested in small values for ExpectedTC. Skip over those that - // can't fit an unsigned. - if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { - ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; - HasExpectedTC = true; - } - } - // ExpectedTC may be large because it's bound by a variable. Check - // profiling information to validate we should vectorize. - if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { - auto EstimatedTC = getLoopEstimatedTripCount(L); - if (EstimatedTC) { - ExpectedTC = *EstimatedTC; - HasExpectedTC = true; - } - } - if (!HasExpectedTC) { - ExpectedTC = SE->getSmallConstantMaxTripCount(L); - HasExpectedTC = (ExpectedTC > 0); - } - - if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { + auto ExpectedTC = getSmallBestKnownTC(*SE, L); + if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " << "iteration overheads are incurred."); @@ -7362,10 +7586,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { LLVM_DEBUG(dbgs() << "\n"); - // Loops with a very small trip count are considered for vectorization - // under OptForSize, thereby making sure the cost of their loop body is - // dominant, free of runtime guards and scalar iteration overheads. - OptForSize = true; + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; } } @@ -7374,11 +7595,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // an integer loop and the vector instructions selected are purely integer // vector instructions? if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { - LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" - "attribute is used.\n"); - ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NoImplicitFloat", L) - << "loop not vectorized due to NoImplicitFloat attribute"); + reportVectorizationFailure( + "Can't vectorize when the NoImplicitFloat attribute is used", + "loop not vectorized due to NoImplicitFloat attribute", + "NoImplicitFloat", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7389,11 +7609,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // additional fp-math flags can help. 
if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { - LLVM_DEBUG( - dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); - ORE->emit( - createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) - << "loop not vectorized due to unsafe FP support."); + reportVectorizationFailure( + "Potentially unsafe FP op prevents vectorization", + "loop not vectorized due to unsafe FP support.", + "UnsafeFP", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7411,8 +7630,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, + F, &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. @@ -7422,7 +7641,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -7431,7 +7650,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced. @@ -7609,7 +7828,8 @@ bool LoopVectorizePass::runImpl( // The second condition is necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. - if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && + TTI->getMaxInterleaveFactor(1) < 2) return false; bool Changed = false; |
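The getSmallBestKnownTC helper added near the top of the patch replaces the open-coded trip-count probing that processLoop used to do, with a single fallback chain: exact trip count, then profile-based estimate, then upper-bound estimate, then None. Below is a minimal standalone sketch of that chain in plain C++17, with the ScalarEvolution and profile queries flattened into a stub struct; the stub names are illustrative, not LLVM API.

#include <cstdio>
#include <optional>

// Stand-ins for the queries the patch makes: SE.getSmallConstantTripCount,
// getLoopEstimatedTripCount and SE.getSmallConstantMaxTripCount. A value of
// 0 (or nullopt) means "not known", matching the LLVM conventions.
struct LoopQueries {
  unsigned ExactTripCount = 0;             // 0 => no exact count
  std::optional<unsigned> ProfileEstimate; // from branch-weight metadata
  unsigned MaxTripCountBound = 0;          // 0 => no usable upper bound
};

// Mirrors the priority order documented on getSmallBestKnownTC:
// 1) exact count, 2) profile estimate, 3) upper bound, 4) none.
static std::optional<unsigned> smallBestKnownTC(const LoopQueries &L,
                                                bool UseProfileData) {
  if (unsigned TC = L.ExactTripCount)
    return TC;
  if (UseProfileData) // gated by LoopVectorizeWithBlockFrequency in the patch
    if (auto TC = L.ProfileEstimate)
      return TC;
  if (unsigned TC = L.MaxTripCountBound)
    return TC;
  return std::nullopt;
}

int main() {
  // Exact count unknown; profiling says ~8 iterations, SCEV bounds it by 100.
  LoopQueries L{0, 8, 100};
  if (auto TC = smallBestKnownTC(L, /*UseProfileData=*/true))
    std::printf("best known trip count: %u\n", *TC); // prints 8, not 100
}

Keeping the priority order in one place is what lets computeMaxVF and the tiny-trip-count check in processLoop agree on the same notion of a "best known" trip count.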
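The patch also threads a ScalarEpilogueLowering mode through the cost model in place of the old OptForSize flag. Here is a compilable sketch of the selection order in getScalarEpilogueLowering, with the Hints.getForce(), hasOptSize()/shouldOptimizeForSize() and predication queries flattened to three booleans for illustration:

#include <cstdio>

// The lowering modes introduced by the patch; the LowTripLoop variant is
// selected later, when processLoop spots a tiny trip count.
enum ScalarEpilogueLowering {
  CM_ScalarEpilogueAllowed,
  CM_ScalarEpilogueNotAllowedOptSize,
  CM_ScalarEpilogueNotAllowedLowTripLoop,
  CM_ScalarEpilogueNotNeededUsePredicate
};

// Same decision order as getScalarEpilogueLowering: optimizing for size wins
// unless vectorization was explicitly forced; only then is the predication
// preference (-prefer-predicate-over-epilog or the loop hint) consulted.
static ScalarEpilogueLowering pickLowering(bool ForceEnabled, bool OptForSize,
                                           bool PreferPredicate) {
  if (!ForceEnabled && OptForSize)
    return CM_ScalarEpilogueNotAllowedOptSize;
  if (PreferPredicate)
    return CM_ScalarEpilogueNotNeededUsePredicate;
  return CM_ScalarEpilogueAllowed;
}

int main() {
  // -Os without a force-enable hint: the epilogue is disallowed outright.
  std::printf("%d\n", pickLowering(false, true, true)); // 1 (NotAllowedOptSize)
  // Forced vectorization under -Os falls through to the predication check.
  std::printf("%d\n", pickLowering(true, true, true));  // 3 (UsePredicate)
}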
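Lastly, selectInterleaveCount now takes the minimum interleave count over every target register class instead of assuming a single register file. A self-contained worked example of the per-class formula, PowerOf2Floor((available - invariant) / max live values), using made-up register counts and ignoring the EnableIndVarRegisterHeur variant:

#include <algorithm>
#include <climits>
#include <cstdio>
#include <map>

// Round down to a power of two, as llvm::PowerOf2Floor does.
static unsigned powerOf2Floor(unsigned V) {
  unsigned P = 1;
  while (P * 2 <= V)
    P *= 2;
  return V ? P : 0;
}

int main() {
  // Hypothetical per-register-class figures (class id -> count), shaped like
  // the patch's SmallMapVector-based RegisterUsage maps.
  std::map<unsigned, unsigned> TargetNumRegisters = {{0, 32}, {1, 32}};
  std::map<unsigned, unsigned> LoopInvariantRegs  = {{0, 2},  {1, 4}};
  std::map<unsigned, unsigned> MaxLocalUsers      = {{0, 6},  {1, 10}};

  // The interleave count is the minimum over all register classes of
  // PowerOf2Floor((available - invariant) / max live values), matching the
  // loop over R.MaxLocalUsers in selectInterleaveCount.
  unsigned IC = UINT_MAX;
  for (auto &[ClassID, Users] : MaxLocalUsers) {
    unsigned Avail = TargetNumRegisters[ClassID] - LoopInvariantRegs[ClassID];
    IC = std::min(IC, powerOf2Floor(Avail / std::max(Users, 1u)));
  }
  // min(floor2(30/6) = 4, floor2(28/10) = 2) == 2
  std::printf("interleave count: %u\n", IC);
}

The class with the highest register pressure caps the interleave count, which is why the patch computes a TmpIC per class and keeps the minimum.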