diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 190 |
1 file changed, 99 insertions(+), 91 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 2ab0848193f6..3c484fb0d28a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -37,11 +37,15 @@ static cl::opt<bool> EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); -static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( - "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum allowed number of runtime memory checks with a " - "vectorize(enable) pragma.")); +namespace llvm { +cl::opt<bool> + HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden, + cl::desc("Allow enabling loop hints to reorder " + "FP operations during vectorization.")); +} +// TODO: Move size-based thresholds out of legality checking, make cost based +// decisions instead of hard thresholds. static cl::opt<unsigned> VectorizeSCEVCheckThreshold( "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed.")); @@ -51,6 +55,23 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); +// FIXME: When scalable vectorization is stable enough, change the default +// to SK_PreferFixedWidth. 
+static cl::opt<LoopVectorizeHints::ScalableForceKind> ScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::SK_FixedWidthOnly), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", + "Scalable vectorization is disabled."), + clEnumValN(LoopVectorizeHints::SK_PreferFixedWidth, "on", + "Scalable vectorization is available, but favor fixed-width " + "vectorization when the cost is inconclusive."), + clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."))); + /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -60,7 +81,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { switch (Kind) { case HK_WIDTH: return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; - case HK_UNROLL: + case HK_INTERLEAVE: return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; case HK_FORCE: return (Val <= 1); @@ -76,12 +97,12 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, OptimizationRemarkEmitter &ORE) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), - Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), + Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE), Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), - Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L), - ORE(ORE) { + Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE), + TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. 
getHintsFromMetadata(); @@ -89,13 +110,23 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, if (VectorizerParams::isInterleaveForced()) Interleave.Value = VectorizerParams::VectorizationInterleave; + if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) + // If the width is set, but the metadata says nothing about the scalable + // property, then assume it concerns only a fixed-width UserVF. + // If width is not set, the flag takes precedence. + Scalable.Value = Width.Value ? SK_FixedWidthOnly : ScalableVectorization; + else if (ScalableVectorization == SK_FixedWidthOnly) + // If the flag is set to disable any use of scalable vectors, override the + // loop hint. + Scalable.Value = SK_FixedWidthOnly; + if (IsVectorized.Value != 1) // If the vectorization width and interleaving count are both 1 then // consider the loop to have been already vectorized because there's // nothing more that we can do. IsVectorized.Value = - getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1; - LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs() + getWidth() == ElementCount::getFixed(1) && getInterleave() == 1; + LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"); } @@ -168,8 +199,8 @@ void LoopVectorizeHints::emitRemarkWithHints() const { R << " (Force=" << NV("Force", true); if (Width.Value != 0) R << ", Vector Width=" << NV("VectorWidth", getWidth()); - if (Interleave.Value != 0) - R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); + if (getInterleave() != 0) + R << ", Interleave Count=" << NV("InterleaveCount", getInterleave()); R << ")"; } return R; @@ -187,6 +218,15 @@ const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { return OptimizationRemarkAnalysis::AlwaysPrint; } +bool LoopVectorizeHints::allowReordering() const { + // Allow the vectorizer to change the order of operations if enabling + // loop hints 
are provided + ElementCount EC = getWidth(); + return HintsAllowReordering && + (getForce() == LoopVectorizeHints::FK_Enabled || + EC.getKnownMinValue() > 1); +} + void LoopVectorizeHints::getHintsFromMetadata() { MDNode *LoopID = TheLoop->getLoopID(); if (!LoopID) @@ -246,42 +286,6 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { } } -bool LoopVectorizationRequirements::doesNotMeet( - Function *F, Loop *L, const LoopVectorizeHints &Hints) { - const char *PassName = Hints.vectorizeAnalysisPassName(); - bool Failed = false; - if (UnsafeAlgebraInst && !Hints.allowReordering()) { - ORE.emit([&]() { - return OptimizationRemarkAnalysisFPCommute( - PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(), - UnsafeAlgebraInst->getParent()) - << "loop not vectorized: cannot prove it is safe to reorder " - "floating-point operations"; - }); - Failed = true; - } - - // Test if runtime memcheck thresholds are exceeded. - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE.emit([&]() { - return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", - L->getStartLoc(), - L->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Failed = true; - } - - return Failed; -} - // Return true if the inner loop \p Lp is uniform with regard to the outer loop // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes // executing the inner loop will execute the same iterations). 
This check is @@ -415,7 +419,7 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, return false; } -int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { +int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) const { const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap(); @@ -582,7 +586,7 @@ bool LoopVectorizationLegality::setupOuterLoopInductions() { /// Checks if a function is scalarizable according to the TLI, in /// the sense that it should be vectorized and then expanded in -/// multiple scalarcalls. This is represented in the +/// multiple scalar calls. This is represented in the /// TLI via mappings that do not specify a vector name, as in the /// following example: /// @@ -594,22 +598,24 @@ static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) { bool Scalarize = TLI.isFunctionVectorizable(ScalarName); // Check that all known VFs are not associated to a vector // function, i.e. the vector name is emty. - if (Scalarize) - for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); - VF <= WidestVF; VF *= 2) { + if (Scalarize) { + ElementCount WidestFixedVF, WidestScalableVF; + TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF); + for (ElementCount VF = ElementCount::getFixed(2); + ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2) Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF); - } + for (ElementCount VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2) + Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF); + assert((WidestScalableVF.isZero() || !Scalarize) && + "Caller may decide to scalarize a variant using a scalable VF"); + } return Scalarize; } bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *Header = TheLoop->getHeader(); - // Look for the attribute signaling the absence of NaNs. 
- Function &F = *Header->getParent(); - HasFunNoNaNAttr = - F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; - // For each block in the loop. for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for hazards. @@ -649,8 +655,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { RecurrenceDescriptor RedDes; if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT)) { - if (RedDes.hasUnsafeAlgebra()) - Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); + Requirements->addExactFPMathInst(RedDes.getExactFPMathInst()); AllowedExit.insert(RedDes.getLoopExitInstr()); Reductions[Phi] = RedDes; continue; @@ -673,8 +678,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { InductionDescriptor ID; if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { addInductionPhi(Phi, ID, AllowedExit); - if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) - Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); + Requirements->addExactFPMathInst(ID.getExactFPMathInst()); continue; } @@ -881,6 +885,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { "loop not vectorized: ", *LAR); }); } + if (!LAI->canVectorizeMemory()) return false; @@ -890,12 +895,38 @@ bool LoopVectorizationLegality::canVectorizeMemory() { "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); return false; } + Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); PSE.addPredicate(LAI->getPSE().getUnionPredicate()); - return true; } +bool LoopVectorizationLegality::canVectorizeFPMath( + bool EnableStrictReductions) { + + // First check if there is any ExactFP math or if we allow reassociations + if (!Requirements->getExactFPInst() || Hints->allowReordering()) + return true; + + // If the above is false, we have ExactFPMath & do not allow reordering. + // If the EnableStrictReductions flag is set, first check if we have any + // Exact FP induction vars, which we cannot vectorize. 
+ if (!EnableStrictReductions || + any_of(getInductionVars(), [&](auto &Induction) -> bool { + InductionDescriptor IndDesc = Induction.second; + return IndDesc.getExactFPMathInst(); + })) + return false; + + // We can now only vectorize if all reductions with Exact FP math also + // have the isOrdered flag set, which indicates that we can move the + // reduction operations in-loop. + return (all_of(getReductionVars(), [&](auto &Reduction) -> bool { + const RecurrenceDescriptor &RdxDesc = Reduction.second; + return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered(); + })); +} + bool LoopVectorizationLegality::isInductionPhi(const Value *V) { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); @@ -918,17 +949,14 @@ bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { return FirstOrderRecurrences.count(Phi); } -bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { +bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } bool LoopVectorizationLegality::blockCanBePredicated( BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, SmallPtrSetImpl<const Instruction *> &MaskedOp, - SmallPtrSetImpl<Instruction *> &ConditionalAssumes, - bool PreserveGuards) const { - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - + SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const { for (Instruction &I : *BB) { // Check that we don't have a constant expression that can trap as operand. for (Value *Operand : I.operands()) { @@ -956,11 +984,7 @@ bool LoopVectorizationLegality::blockCanBePredicated( if (!LI) return false; if (!SafePtrs.count(LI->getPointerOperand())) { - // !llvm.mem.parallel_loop_access implies if-conversion safety. - // Otherwise, record that the load needs (real or emulated) masking - // and let the cost model decide. 
- if (!IsAnnotatedParallel || PreserveGuards) - MaskedOp.insert(LI); + MaskedOp.insert(LI); continue; } } @@ -1101,21 +1125,6 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, return false; } - // We currently must have a single "exit block" after the loop. Note that - // multiple "exiting blocks" inside the loop are allowed, provided they all - // reach the single exit block. - // TODO: This restriction can be relaxed in the near future, it's here solely - // to allow separation of changes for review. We need to generalize the phi - // update logic in a number of places. - if (!Lp->getUniqueExitBlock()) { - reportVectorizationFailure("The loop must have a unique exit block", - "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", ORE, TheLoop); - if (DoExtraAnalysis) - Result = false; - else - return false; - } return Result; } @@ -1276,8 +1285,7 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() { // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes, - /* MaskAllLoads= */ true)) { + TmpConditionalAssumes)) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } |
