Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 738
1 file changed, 479 insertions(+), 259 deletions(-)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 46265e3f3e13..8f0bf70f873c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -177,6 +177,14 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
"value are vectorized only if no scalar iteration overheads "
"are incurred."));
+// Indicates that an epilogue is undesired and that predication is preferred.
+// This means that the vectorizer will try to fold the loop-tail (epilogue)
+// into the loop and predicate the loop body accordingly.
+static cl::opt<bool> PreferPredicateOverEpilog(
+ "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
+ cl::desc("Indicate that an epilogue is undesired, predication should be "
+ "used instead."));
+
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -347,6 +355,29 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
: ConstantFP::get(Ty, C);
}
+/// Returns "best known" trip count for the specified loop \p L as defined by
+/// the following procedure:
+/// 1) Returns exact trip count if it is known.
+/// 2) Returns expected trip count according to profile data if any.
+/// 3) Returns upper bound estimate if it is known.
+/// 4) Returns None if all of the above failed.
+static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+ // Check if exact trip count is known.
+ if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
+ return ExpectedTC;
+
+ // Check if there is an expected trip count available from profile data.
+ if (LoopVectorizeWithBlockFrequency)
+ if (auto EstimatedTC = getLoopEstimatedTripCount(L))
+ return EstimatedTC;
+
+ // Check if upper bound estimate is known.
+ if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
+ return ExpectedTC;
+
+ return None;
+}
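For context, a minimal sketch of how this helper is consumed by the processLoop change near the end of this patch (SE, L, SEL and TinyTripCountVectorThreshold all come from elsewhere in the file):

    // Prefer the exact count, then profile data, then the upper-bound estimate;
    // only when the best known count is tiny is the low-trip-count lowering forced.
    if (auto ExpectedTC = getSmallBestKnownTC(*SE, L))
      if (*ExpectedTC < TinyTripCountVectorThreshold)
        SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;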
+
namespace llvm {
/// InnerLoopVectorizer vectorizes loops which contain only one basic
@@ -795,6 +826,59 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
B.SetCurrentDebugLocation(DebugLoc());
}
+/// Write a record \p DebugMsg about vectorization failure to the debug
+/// output stream. If \p I is passed, it is an instruction that prevents
+/// vectorization.
+#ifndef NDEBUG
+static void debugVectorizationFailure(const StringRef DebugMsg,
+ Instruction *I) {
+ dbgs() << "LV: Not vectorizing: " << DebugMsg;
+ if (I != nullptr)
+ dbgs() << " " << *I;
+ else
+ dbgs() << '.';
+ dbgs() << '\n';
+}
+#endif
+
+/// Create an analysis remark that explains why vectorization failed
+///
+/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
+/// RemarkName is the identifier for the remark. If \p I is passed it is an
+/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
+/// the location of the remark. \return the remark object that can be
+/// streamed to.
+static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
+ StringRef RemarkName, Loop *TheLoop, Instruction *I) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+ // If there is no debug location attached to the instruction, fall back to
+ // using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
+
+namespace llvm {
+
+void reportVectorizationFailure(const StringRef DebugMsg,
+ const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
+ LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
+ LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
+ ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
+ ORETag, TheLoop, I) << OREMsg);
+}
+
+} // end namespace llvm
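A typical call, mirroring the call sites converted later in this patch; the first string feeds the -debug stream, the second the optimization remark, and the third is the remark identifier:

    reportVectorizationFailure(
        "Runtime ptr check is required with -Os/-Oz",            // LLVM_DEBUG text
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",                                 // remark message
        "CantVersionLoopWithOptForSize", ORE, TheLoop);           // tag, emitter, loop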
+
#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
@@ -836,6 +920,26 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
namespace llvm {
+// Hints for the loop vectorization cost model describing how the scalar
+// epilogue loop should be lowered.
+enum ScalarEpilogueLowering {
+
+ // The default: allowing scalar epilogues.
+ CM_ScalarEpilogueAllowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
+ CM_ScalarEpilogueNotAllowedOptSize,
+
+ // A special case of vectorization with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ CM_ScalarEpilogueNotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
+ CM_ScalarEpilogueNotNeededUsePredicate
+};
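As a rough summary of how the cost model reacts to each value (this condenses the computeMaxVF changes further down, not new behaviour):

    switch (ScalarEpilogueStatus) {
    case CM_ScalarEpilogueAllowed:                 // normal path: a scalar remainder is fine
      return computeFeasibleMaxVF(TC);
    case CM_ScalarEpilogueNotNeededUsePredicate:   // hint/switch: fold the tail by masking
      break;
    case CM_ScalarEpilogueNotAllowedLowTripLoop:   // tiny trip count, treated like OptForSize
    case CM_ScalarEpilogueNotAllowedOptSize:       // -Os/-Oz: no epilogue, no runtime checks
      if (runtimeChecksRequired())
        return None;
      break;
    }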
+
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
@@ -845,20 +949,26 @@ namespace llvm {
/// different operations.
class LoopVectorizationCostModel {
public:
- LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
- LoopInfo *LI, LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
+ PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints,
InterleavedAccessInfo &IAI)
- : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
- AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
+ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
+ Hints(Hints), InterleaveInfo(IAI) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<unsigned> computeMaxVF(bool OptForSize);
+ Optional<unsigned> computeMaxVF();
+
+ /// \return True if runtime checks are required for vectorization, and false
+ /// otherwise.
+ bool runtimeChecksRequired();
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
@@ -881,8 +991,7 @@ public:
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
- unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
- unsigned LoopCost);
+ unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
@@ -897,10 +1006,11 @@ public:
/// of a loop.
struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.
- unsigned LoopInvariantRegs;
-
+ /// The key is the ClassID of the target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
/// Holds the maximum number of concurrent live intervals in the loop.
- unsigned MaxLocalUsers;
+ /// The key is the ClassID of the target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
};
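Since usage is now keyed by register class rather than tracked as a single counter, consumers query TTI per class. A sketch of the lookup pattern used by calculateRegisterUsage and selectInterleaveCount below (Inst, VF, RegUsage and GetRegUsage follow the surrounding code):

    unsigned ClassID = TTI.getRegisterClassForType(/*Vector=*/VF > 1, Inst->getType());
    RegUsage[ClassID] += GetRegUsage(Inst->getType(), VF);   // live values in this class
    unsigned Budget = TTI.getNumberOfRegisters(ClassID);     // per-class register budget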
/// \return Returns information about the register usages of the loop for the
@@ -1080,14 +1190,16 @@ public:
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
- return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
+ return Legal->isConsecutivePtr(Ptr) &&
+ TTI.isLegalMaskedStore(DataType, Alignment);
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
- return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
+ return Legal->isConsecutivePtr(Ptr) &&
+ TTI.isLegalMaskedLoad(DataType, Alignment);
}
/// Returns true if the target machine supports masked scatter operation
@@ -1157,11 +1269,14 @@ public:
/// to handle accesses with gaps, and there is nothing preventing us from
/// creating a scalar epilogue.
bool requiresScalarEpilogue() const {
- return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
+ return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
}
- /// Returns true if a scalar epilogue is not allowed due to optsize.
- bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
+ /// Returns true if a scalar epilogue is allowed, i.e. not disallowed by
+ /// optsize or by a loop hint annotation.
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+ }
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const { return FoldTailByMasking; }
@@ -1187,7 +1302,7 @@ private:
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
- unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
+ unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
@@ -1246,15 +1361,6 @@ private:
/// should be used.
bool useEmulatedMaskMemRefHack(Instruction *I);
- /// Create an analysis remark that explains why vectorization failed
- ///
- /// \p RemarkName is the identifier for the remark. \return the remark object
- /// that can be streamed to.
- OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
- return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
- RemarkName, TheLoop);
- }
-
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
@@ -1270,13 +1376,13 @@ private:
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
/// Records whether it is allowed to have the original scalar loop execute at
- /// least once. This may be needed as a fallback loop in case runtime
+ /// least once. This may be needed as a fallback loop in case runtime
/// aliasing/dependence checks fail, or to handle the tail/remainder
/// iterations when the trip count is unknown or doesn't divide by the VF,
/// or as a peel-loop to handle gaps in interleave-groups.
/// Under optsize and when the trip count is very small we don't allow any
/// iterations to execute in the scalar loop.
- bool IsScalarEpilogueAllowed = true;
+ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
/// All blocks of loop are to be masked to fold tail of scalar iterations.
bool FoldTailByMasking = false;
@@ -1496,7 +1602,7 @@ struct LoopVectorize : public FunctionPass {
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
@@ -2253,12 +2359,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = getLoadStorePointerOperand(Instr);
- unsigned Alignment = getLoadStoreAlignment(Instr);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();
- if (!Alignment)
- Alignment = DL.getABITypeAlignment(ScalarDataTy);
+ const Align Alignment =
+ DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
// Determine if the pointer operand of the access is either consecutive or
@@ -2322,8 +2427,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
- MaskPart);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
+ Alignment.value(), MaskPart);
} else {
if (Reverse) {
// If we store to reverse consecutive memory locations, then we need
@@ -2334,10 +2439,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
}
auto *VecPtr = CreateVecPtr(Part, Ptr);
if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
- Mask[Part]);
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
+ Alignment.value(), Mask[Part]);
else
- NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+ NewSI =
+ Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
}
addMetadata(NewSI, SI);
}
@@ -2352,18 +2458,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
- NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
nullptr, "wide.masked.gather");
addMetadata(NewLI, LI);
} else {
auto *VecPtr = CreateVecPtr(Part, Ptr);
if (isMaskRequired)
- NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
+ NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
UndefValue::get(DataTy),
"wide.masked.load");
else
- NewLI =
- Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
+ NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
+ "wide.load");
// Add metadata to the load, but setVectorValue to the reverse shuffle.
addMetadata(NewLI, LI);
@@ -2615,8 +2721,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
if (C->isZero())
return;
- assert(!Cost->foldTailByMasking() &&
- "Cannot SCEV check stride or overflow when folding tail");
+ assert(!BB->getParent()->hasOptSize() &&
+ "Cannot SCEV check stride or overflow when optimizing for size");
+
// Create a new block containing the stride check.
BB->setName("vector.scevcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2649,7 +2756,20 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
if (!MemRuntimeCheck)
return;
- assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
+ if (BB->getParent()->hasOptSize()) {
+ assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
+ "Cannot emit memory checks when optimizing for size, unless forced "
+ "to vectorize.");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
+ L->getStartLoc(), L->getHeader())
+ << "Code-size may be reduced by not forcing "
+ "vectorization, or by source-code modifications "
+ "eliminating the need for runtime checks "
+ "(e.g., adding 'restrict').";
+ });
+ }
+
// Create a new block containing the memory check.
BB->setName("vector.memcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2666,7 +2786,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
// We currently don't use LoopVersioning for the actual loop cloning but we
// still use it to add the noalias metadata.
- LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
+ LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
PSE.getSE());
LVer->prepareNoAliasMetadata();
}
@@ -3598,6 +3718,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
setDebugLocFromInst(Builder, LoopExitInst);
+ // If tail is folded by masking, the vector value to leave the loop should be
+ // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
+ // instead of the former.
+ if (Cost->foldTailByMasking()) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecLoopExitInst =
+ VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Sel = nullptr;
+ for (User *U : VecLoopExitInst->users()) {
+ if (isa<SelectInst>(U)) {
+ assert(!Sel && "Reduction exit feeding two selects");
+ Sel = U;
+ } else
+ assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
+ }
+ assert(Sel && "Reduction exit feeds no select");
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+ }
+ }
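Conceptually, for a 4-wide integer add reduction, the shape being arranged is roughly the following (value names are made up for illustration; the mask is the header's block-in mask created in the VPlan changes near the end of this patch):

    //   %red.next = add <4 x i32> %red.phi, %x                               ; vectorized LoopExitInst
    //   %red.sel  = select <4 x i1> %mask, <4 x i32> %red.next, <4 x i32> %red.phi
    // %red.sel, not %red.next, is recorded as the vector value leaving the loop,
    // so masked-off lanes keep the value of the reduction phi.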
+
// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
@@ -4064,7 +4204,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = dyn_cast<CmpInst>(&I);
+ auto *Cmp = cast<CmpInst>(&I);
setDebugLocFromInst(Builder, Cmp);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
@@ -4097,7 +4237,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- auto *CI = dyn_cast<CastInst>(&I);
+ auto *CI = cast<CastInst>(&I);
setDebugLocFromInst(Builder, CI);
/// Vectorize casts.
@@ -4421,9 +4561,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
"Widening decision should be ready at this moment");
return WideningDecision == CM_Scalarize;
}
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ?
- !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
- : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
+ !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
+ : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -4452,10 +4593,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
// Check if masking is required.
// A Group may need masking for one of two reasons: it resides in a block that
// needs predication, or it was decided to use masking to deal with gaps.
- bool PredicatedAccessRequiresMasking =
+ bool PredicatedAccessRequiresMasking =
Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
- bool AccessWithGapsRequiresMasking =
- Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+ bool AccessWithGapsRequiresMasking =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
return true;
@@ -4466,8 +4607,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
"Masked interleave-groups for predicated accesses are not enabled.");
auto *Ty = getMemInstValueType(I);
- return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
- : TTI.isLegalMaskedStore(Ty);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
+ : TTI.isLegalMaskedStore(Ty, Alignment);
}
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
@@ -4675,82 +4817,96 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
- if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
- // TODO: It may by useful to do since it's still likely to be dynamically
- // uniform if the target can skip.
- LLVM_DEBUG(
- dbgs() << "LV: Not inserting runtime ptr check for divergent target");
-
- ORE->emit(
- createMissedAnalysis("CantVersionLoopWithDivergentTarget")
- << "runtime pointer checks needed. Not enabled for divergent target");
-
- return None;
- }
-
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
- return computeFeasibleMaxVF(OptForSize, TC);
+bool LoopVectorizationCostModel::runtimeChecksRequired() {
+ LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
if (Legal->getRuntimePointerChecking()->Need) {
- ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
- << "runtime pointer checks needed. Enable vectorization of this "
- "loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz");
- LLVM_DEBUG(
- dbgs()
- << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
- return None;
+ reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
+ "runtime pointer checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
}
if (!PSE.getUnionPredicate().getPredicates().empty()) {
- ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
- << "runtime SCEV checks needed. Enable vectorization of this "
- "loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz");
- LLVM_DEBUG(
- dbgs()
- << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
- return None;
+ reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
+ "runtime SCEV checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
}
// FIXME: Avoid specializing for stride==1 instead of bailing out.
if (!Legal->getLAI()->getSymbolicStrides().empty()) {
- ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
- << "runtime stride == 1 checks needed. Enable vectorization of "
- "this loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz");
- LLVM_DEBUG(
- dbgs()
- << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+ reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
+ "runtime stride == 1 checks needed. Enable vectorization of "
+ "this loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ return false;
+}
+
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+ if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+ // TODO: It may be useful to do this since it's still likely to be dynamically
+ // uniform if the target can skip.
+ reportVectorizationFailure(
+ "Not inserting runtime ptr check for divergent target",
+ "runtime pointer checks needed. Not enabled for divergent target",
+ "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
return None;
}
- // If we optimize the program for size, avoid creating the tail loop.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
if (TC == 1) {
- ORE->emit(createMissedAnalysis("SingleIterationLoop")
- << "loop trip count is one, irrelevant for vectorization");
- LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
+ reportVectorizationFailure("Single iteration (non) loop",
+ "loop trip count is one, irrelevant for vectorization",
+ "SingleIterationLoop", ORE, TheLoop);
return None;
}
- // Record that scalar epilogue is not allowed.
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ switch (ScalarEpilogueStatus) {
+ case CM_ScalarEpilogueAllowed:
+ return computeFeasibleMaxVF(TC);
+ case CM_ScalarEpilogueNotNeededUsePredicate:
+ LLVM_DEBUG(
+ dbgs() << "LV: vector predicate hint/switch found.\n"
+ << "LV: Not allowing scalar epilogue, creating predicated "
+ << "vector loop.\n");
+ break;
+ case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ // fallthrough as a special case of OptForSize
+ case CM_ScalarEpilogueNotAllowedOptSize:
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ LLVM_DEBUG(
+ dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ else
+ LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
+ << "count.\n");
+
+ // Bail if runtime checks are required, which are not good when optimizing
+ // for size.
+ if (runtimeChecksRequired())
+ return None;
+ break;
+ }
- IsScalarEpilogueAllowed = !OptForSize;
+ // Now try the tail folding
- // We don't create an epilogue when optimizing for size.
// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
- if (!useMaskedInterleavedAccesses(TTI))
+ if (!useMaskedInterleavedAccesses(TTI))
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
- unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
-
+ unsigned MaxVF = computeFeasibleMaxVF(TC);
if (TC > 0 && TC % MaxVF == 0) {
+ // Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
}
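To make the no-tail shortcut concrete, with illustrative numbers:

    //   TC = 64,  MaxVF = 8  ->  64 % 8 == 0: no tail, MaxVF is accepted as is.
    //   TC = 100, MaxVF = 8  ->  tail of 100 % 8 == 4 iterations: try to fold it
    //                            by masking below, otherwise vectorization is
    //                            rejected when no epilogue is allowed.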
@@ -4759,28 +4915,30 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- if (Legal->canFoldTailByMasking()) {
+ if (Legal->prepareToFoldTailByMasking()) {
FoldTailByMasking = true;
return MaxVF;
}
if (TC == 0) {
- ORE->emit(
- createMissedAnalysis("UnknownLoopCountComplexCFG")
- << "unable to calculate the loop count due to complex control flow");
+ reportVectorizationFailure(
+ "Unable to calculate the loop count due to complex control flow",
+ "unable to calculate the loop count due to complex control flow",
+ "UnknownLoopCountComplexCFG", ORE, TheLoop);
return None;
}
- ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
- << "cannot optimize for size and vectorize at the same time. "
- "Enable vectorization of this loop with '#pragma clang loop "
- "vectorize(enable)' when compiling with -Os/-Oz");
+ reportVectorizationFailure(
+ "Cannot optimize for size and vectorize at the same time.",
+ "cannot optimize for size and vectorize at the same time. "
+ "Enable vectorization of this loop with '#pragma clang loop "
+ "vectorize(enable)' when compiling with -Os/-Oz",
+ "NoTailLoopWithOptForSize", ORE, TheLoop);
return None;
}
unsigned
-LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
- unsigned ConstTripCount) {
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -4818,8 +4976,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
}
unsigned MaxVF = MaxVectorSize;
- if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
- (MaximizeBandwidth && !OptForSize)) {
+ if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
+ (MaximizeBandwidth && isScalarEpilogueAllowed())) {
// Collect all viable vectorization factors larger than the default MaxVF
// (i.e. MaxVectorSize).
SmallVector<unsigned, 8> VFs;
@@ -4832,9 +4990,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
// Select the largest VF which doesn't require more registers than existing
// ones.
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
for (int i = RUs.size() - 1; i >= 0; --i) {
- if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
+ bool Selected = true;
+ for (auto& pair : RUs[i].MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ if (pair.second > TargetNumRegisters)
+ Selected = false;
+ }
+ if (Selected) {
MaxVF = VFs[i];
break;
}
@@ -4886,10 +5049,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
}
if (!EnableCondStoresVectorization && NumPredStores) {
- ORE->emit(createMissedAnalysis("ConditionalStore")
- << "store that is conditionally executed prevents vectorization");
- LLVM_DEBUG(
- dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ reportVectorizationFailure("There are conditional stores.",
+ "store that is conditionally executed prevents vectorization",
+ "ConditionalStore", ORE, TheLoop);
Width = 1;
Cost = ScalarCost;
}
@@ -4958,8 +5120,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
return {MinWidth, MaxWidth};
}
-unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
- unsigned VF,
+unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
unsigned LoopCost) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4975,8 +5136,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.
- // When we optimize for size, we don't interleave.
- if (OptForSize)
+ if (!isScalarEpilogueAllowed())
return 1;
// We used the distance for the interleave count.
@@ -4988,22 +5148,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1;
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
- LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
- << " registers\n");
-
- if (VF == 1) {
- if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
- TargetNumRegisters = ForceTargetNumScalarRegs;
- } else {
- if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
- TargetNumRegisters = ForceTargetNumVectorRegs;
- }
-
RegisterUsage R = calculateRegisterUsage({VF})[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
- R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+ for (auto& pair : R.MaxLocalUsers) {
+ pair.second = std::max(pair.second, 1U);
+ }
// We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
@@ -5016,13 +5166,35 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
// We also want power of two interleave counts to ensure that the induction
// variable of the vector loop wraps to zero, when tail is folded by masking;
// this currently happens when OptForSize, in which case IC is set to 1 above.
- unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
- R.MaxLocalUsers);
+ unsigned IC = UINT_MAX;
- // Don't count the induction variable as interleaved.
- if (EnableIndVarRegisterHeur)
- IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
- std::max(1U, (R.MaxLocalUsers - 1)));
+ for (auto& pair : R.MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers of "
+ << TTI.getRegisterClassName(pair.first) << " register class\n");
+ if (VF == 1) {
+ if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumScalarRegs;
+ } else {
+ if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumVectorRegs;
+ }
+ unsigned MaxLocalUsers = pair.second;
+ unsigned LoopInvariantRegs = 0;
+ if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
+ LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
+
+ unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+ // Don't count the induction variable as interleaved.
+ if (EnableIndVarRegisterHeur) {
+ TmpIC =
+ PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+ std::max(1U, (MaxLocalUsers - 1)));
+ }
+
+ IC = std::min(IC, TmpIC);
+ }
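The interleave count is now the minimum over all register classes of the old formula. For illustration, assume a single class with 32 registers, 2 loop-invariant values and at most 5 concurrent live values:

    //   PowerOf2Floor((32 - 2) / 5)     == PowerOf2Floor(6) == 4   // default heuristic
    //   PowerOf2Floor((32 - 2 - 1) / 4) == PowerOf2Floor(7) == 4   // EnableIndVarRegisterHeur
    //   IC = min(IC, 4)                   // then clamped by MaxInterleaveCount and TC/VF below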
// Clamp the interleave ranges to reasonable counts.
unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
@@ -5036,6 +5208,14 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
+ // If the trip count is constant, limit the interleave count to be less than
+ // the trip count divided by VF.
+ if (TC > 0) {
+ assert(TC >= VF && "VF exceeds trip count?");
+ if ((TC / VF) < MaxInterleaveCount)
+ MaxInterleaveCount = (TC / VF);
+ }
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0)
@@ -5044,7 +5224,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
assert(LoopCost && "Non-zero loop cost expected");
// Clamp the calculated IC to be between the 1 and the max interleave count
- // that the target allows.
+ // that the target and trip count allow.
if (IC > MaxInterleaveCount)
IC = MaxInterleaveCount;
else if (IC < 1)
@@ -5196,7 +5376,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
SmallVector<RegisterUsage, 8> RUs(VFs.size());
- SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
+ SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
@@ -5226,21 +5406,45 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ // Count the number of live intervals.
+ SmallMapVector<unsigned, unsigned, 4> RegUsage;
+
if (VFs[j] == 1) {
- MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
- continue;
+ for (auto Inst : OpenIntervals) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ }
+ } else {
+ collectUniformsAndScalars(VFs[j]);
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
+ continue;
+ if (isScalarAfterVectorization(Inst, VFs[j])) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ } else {
+ unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
+ else
+ RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
+ }
+ }
}
- collectUniformsAndScalars(VFs[j]);
- // Count the number of live intervals.
- unsigned RegUsage = 0;
- for (auto Inst : OpenIntervals) {
- // Skip ignored values for VF > 1.
- if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
- isScalarAfterVectorization(Inst, VFs[j]))
- continue;
- RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+
+ for (auto& pair : RegUsage) {
+ if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
+ MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
+ else
+ MaxUsages[j][pair.first] = pair.second;
}
- MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
}
LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
@@ -5251,18 +5455,34 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
- unsigned Invariant = 0;
- if (VFs[i] == 1)
- Invariant = LoopInvariants.size();
- else {
- for (auto Inst : LoopInvariants)
- Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+ SmallMapVector<unsigned, unsigned, 4> Invariant;
+
+ for (auto Inst : LoopInvariants) {
+ unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+ unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
+ if (Invariant.find(ClassID) == Invariant.end())
+ Invariant[ClassID] = Usage;
+ else
+ Invariant[ClassID] += Usage;
}
- LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
- LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
- LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
- << '\n');
+ LLVM_DEBUG({
+ dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
+ dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
+ << " item\n";
+ for (const auto &pair : MaxUsages[i]) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+ << " registers\n";
+ }
+ dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
+ << " item\n";
+ for (const auto &pair : Invariant) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+ << " registers\n";
+ }
+ });
RU.LoopInvariantRegs = Invariant;
RU.MaxLocalUsers = MaxUsages[i];
@@ -5511,7 +5731,6 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
Type *ValTy = getMemInstValueType(I);
auto SE = PSE.getSE();
- unsigned Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
@@ -5525,9 +5744,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
- Cost += VF *
- TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
- AS);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+ Alignment ? Alignment->value() : 0, AS);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5552,18 +5771,20 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
- unsigned Alignment = getLoadStoreAlignment(I);
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
unsigned Cost = 0;
if (Legal->isMaskRequired(I))
- Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
+ Alignment ? Alignment->value() : 0, AS);
else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
+ Alignment ? Alignment->value() : 0, AS, I);
bool Reverse = ConsecutiveStride < 0;
if (Reverse)
@@ -5575,33 +5796,37 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
- unsigned Alignment = getLoadStoreAlignment(I);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy,
+ Alignment ? Alignment->value() : 0, AS) +
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
}
StoreInst *SI = cast<StoreInst>(I);
bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
- (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
- Instruction::ExtractElement,
- VectorTy, VF - 1));
+ TTI.getMemoryOpCost(Instruction::Store, ValTy,
+ Alignment ? Alignment->value() : 0, AS) +
+ (isLoopInvariantStoreValue
+ ? 0
+ : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
+ VF - 1));
}
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
- unsigned Alignment = getLoadStoreAlignment(I);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
- Legal->isMaskRequired(I), Alignment);
+ Legal->isMaskRequired(I),
+ Alignment ? Alignment->value() : 0);
}
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
@@ -5626,8 +5851,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
}
// Calculate the cost of the whole interleaved group.
- bool UseMaskForGaps =
- Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+ bool UseMaskForGaps =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
unsigned Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
@@ -5648,11 +5873,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
// moment.
if (VF == 1) {
Type *ValTy = getMemInstValueType(I);
- unsigned Alignment = getLoadStoreAlignment(I);
+ const MaybeAlign Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy,
+ Alignment ? Alignment->value() : 0, AS, I);
}
return getWideningCost(I, VF);
}
@@ -6167,8 +6393,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
}
VectorizationFactor
-LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
- unsigned UserVF) {
+LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
unsigned VF = UserVF;
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
@@ -6207,10 +6432,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
return VectorizationFactor::Disabled();
}
-Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
- unsigned UserVF) {
+Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
assert(OrigLoop->empty() && "Inner loop expected.");
- Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
return None;
@@ -6840,8 +7064,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
// If the tail is to be folded by masking, the primary induction variable
// needs to be represented in VPlan for it to model early-exit masking.
- if (CM.foldTailByMasking())
+ // Also, both the Phi and the live-out instruction of each reduction are
+ // required in order to introduce a select between them in VPlan.
+ if (CM.foldTailByMasking()) {
NeedDef.insert(Legal->getPrimaryInduction());
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ NeedDef.insert(Reduction.first);
+ NeedDef.insert(Reduction.second.getLoopExitInstr());
+ }
+ }
// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
@@ -6873,7 +7104,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
- auto Plan = llvm::make_unique<VPlan>(VPBB);
+ auto Plan = std::make_unique<VPlan>(VPBB);
VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
// Represent values that will have defs inside VPlan.
@@ -6968,6 +7199,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBlockUtils::disconnectBlocks(PreEntry, Entry);
delete PreEntry;
+ // Finally, if tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the end of the latch.
+ if (CM.foldTailByMasking()) {
+ Builder.setInsertPoint(VPBB);
+ auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+ for (auto &Reduction : *Legal->getReductionVars()) {
+ VPValue *Phi = Plan->getVPValue(Reduction.first);
+ VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
+ Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
+ }
+ }
+
std::string PlanName;
raw_string_ostream RSO(PlanName);
unsigned VF = Range.Start;
@@ -6993,7 +7236,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
// Create new empty VPlan
- auto Plan = llvm::make_unique<VPlan>();
+ auto Plan = std::make_unique<VPlan>();
// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -7199,6 +7442,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
+static ScalarEpilogueLowering
+getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+ ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+ if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ (F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
+ SEL = CM_ScalarEpilogueNotAllowedOptSize;
+ else if (PreferPredicateOverEpilog || Hints.getPredicate())
+ SEL = CM_ScalarEpilogueNotNeededUsePredicate;
+
+ return SEL;
+}
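For reference, the predicate preference can be requested globally via the cl::opt added at the top of this patch, or per loop through the hint that Hints.getPredicate() reads; the pragma spelling below is the usual clang one and is an assumption, not part of this diff:

    //   opt -loop-vectorize -prefer-predicate-over-epilog ...      // global switch
    //   #pragma clang loop vectorize_predicate(enable)             // per-loop hint (assumed)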
+
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -7213,7 +7470,9 @@ static bool processLoopInVPlanNativePath(
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
@@ -7223,15 +7482,8 @@ static bool processLoopInVPlanNativePath(
// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();
- // Check the function attributes and profiles to find out if this function
- // should be optimized for size.
- bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
-
// Plan how to best vectorize, return the best VF and its cost.
- const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
+ const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
// If we are stress testing VPlan builds, do not attempt to generate vector
// code. Masked vector code generation support will follow soon.
@@ -7310,10 +7562,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
@@ -7325,36 +7574,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
ORE, BFI, PSI, Hints);
assert(L->empty() && "Inner loop expected.");
+
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
- // Prefer constant trip counts over profile data, over upper bound estimate.
- unsigned ExpectedTC = 0;
- bool HasExpectedTC = false;
- if (const SCEVConstant *ConstExits =
- dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
- const APInt &ExitsCount = ConstExits->getAPInt();
- // We are interested in small values for ExpectedTC. Skip over those that
- // can't fit an unsigned.
- if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
- ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
- HasExpectedTC = true;
- }
- }
- // ExpectedTC may be large because it's bound by a variable. Check
- // profiling information to validate we should vectorize.
- if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
- auto EstimatedTC = getLoopEstimatedTripCount(L);
- if (EstimatedTC) {
- ExpectedTC = *EstimatedTC;
- HasExpectedTC = true;
- }
- }
- if (!HasExpectedTC) {
- ExpectedTC = SE->getSmallConstantMaxTripCount(L);
- HasExpectedTC = (ExpectedTC > 0);
- }
-
- if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
+ auto ExpectedTC = getSmallBestKnownTC(*SE, L);
+ if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is worth vectorizing only if no scalar "
<< "iteration overheads are incurred.");
@@ -7362,10 +7586,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
LLVM_DEBUG(dbgs() << "\n");
- // Loops with a very small trip count are considered for vectorization
- // under OptForSize, thereby making sure the cost of their loop body is
- // dominant, free of runtime guards and scalar iteration overheads.
- OptForSize = true;
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
}
}
@@ -7374,11 +7595,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// an integer loop and the vector instructions selected are purely integer
// vector instructions?
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
- "attribute is used.\n");
- ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
- "NoImplicitFloat", L)
- << "loop not vectorized due to NoImplicitFloat attribute");
+ reportVectorizationFailure(
+ "Can't vectorize when the NoImplicitFloat attribute is used",
+ "loop not vectorized due to NoImplicitFloat attribute",
+ "NoImplicitFloat", ORE, L);
Hints.emitRemarkWithHints();
return false;
}
@@ -7389,11 +7609,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// additional fp-math flags can help.
if (Hints.isPotentiallyUnsafe() &&
TTI->isFPVectorizationPotentiallyUnsafe()) {
- LLVM_DEBUG(
- dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
- ORE->emit(
- createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
- << "loop not vectorized due to unsafe FP support.");
+ reportVectorizationFailure(
+ "Potentially unsafe FP op prevents vectorization",
+ "loop not vectorized due to unsafe FP support.",
+ "UnsafeFP", ORE, L);
Hints.emitRemarkWithHints();
return false;
}
@@ -7411,8 +7630,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI);
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
+ F, &Hints, IAI);
CM.collectValuesToIgnore();
// Use the planner for vectorization.
@@ -7422,7 +7641,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned UserVF = Hints.getWidth();
// Plan how to best vectorize, return the best VF and its cost.
- Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF);
+ Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
@@ -7431,7 +7650,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (MaybeVF) {
VF = *MaybeVF;
// Select the interleave count.
- IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
+ IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
}
// Identify the diagnostic messages that should be produced.
@@ -7609,7 +7828,8 @@ bool LoopVectorizePass::runImpl(
// The second condition is necessary because, even if the target has no
// vector registers, loop vectorization may still enable scalar
// interleaving.
- if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+ TTI->getMaxInterleaveFactor(1) < 2)
return false;
bool Changed = false;