Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 2806
1 file changed, 817 insertions(+), 1989 deletions(-)
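The headline addition in this diff is the VPlan-native vectorization path, guarded by the new command-line flags defined below (EnableVPlanNativePath and VPlanBuildStressTest). As a rough illustrative sketch only (the opt invocation and the input file names are assumptions, not part of this diff), the new flags could be exercised along these lines:

    # Run the loop vectorizer with the VPlan-native path enabled (legacy pass manager assumed).
    opt -S -loop-vectorize -enable-vplan-native-path input.ll -o vectorized.ll

    # Stress-test only the VPlan H-CFG construction; per the comment added below,
    # -vplan-build-stress-test must be combined with -enable-vplan-native-path.
    opt -S -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test input.ll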
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 52f32cda2609..3c693f5d5ee0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -26,6 +26,14 @@
 // of vectorization. It decides on the optimal vector width, which
 // can be one, if vectorization is not profitable.
 //
+// There is a development effort going on to migrate loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
 //===----------------------------------------------------------------------===//
 //
 // The reduction-variable vectorization is based on the paper:
@@ -47,8 +55,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "VPlan.h"
-#include "VPlanBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "VPRecipeBuilder.h"
+#include "VPlanHCFGBuilder.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -57,11 +66,9 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
@@ -70,6 +77,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
@@ -124,6 +132,7 @@
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -145,10 +154,6 @@ using namespace llvm;
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 
-static cl::opt<bool>
-    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
-                       cl::desc("Enable if-conversion during vectorization."));
-
 /// Loops with a known constant trip count below this number are vectorized only
 /// if no scalar iteration overheads are incurred.
 static cl::opt<unsigned> TinyTripCountVectorThreshold(
@@ -184,9 +189,6 @@ static cl::opt<unsigned> ForceTargetNumVectorRegs(
     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of vector registers."));
 
-/// Maximum vectorization interleave count.
-static const unsigned MaxInterleaveFactor = 16;
-
 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's max interleave factor for "
@@ -209,7 +211,7 @@ static cl::opt<unsigned> SmallLoopCost(
         "The cost of a loop that is considered 'small' by the interleaver."));
 
 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
-    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
     cl::desc("Enable the use of the block frequency analysis to access PGO "
              "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));
@@ -238,71 +240,21 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
     cl::desc("The maximum interleave count to use when interleaving a scalar "
              "reduction in a nested loop."));
 
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
-    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
-    cl::desc("The maximum allowed number of runtime memory checks with a "
-             "vectorize(enable) pragma."));
-
-static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
-    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
-    cl::desc("The maximum number of SCEV checks allowed."));
-
-static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
-    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
-    cl::desc("The maximum number of SCEV checks allowed with a "
-             "vectorize(enable) pragma"));
-
-/// Create an analysis remark that explains why vectorization failed
-///
-/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
-/// RemarkName is the identifier for the remark. If \p I is passed it is an
-/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
-/// the location of the remark. \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis
-createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
-                     Instruction *I = nullptr) {
-  Value *CodeRegion = TheLoop->getHeader();
-  DebugLoc DL = TheLoop->getStartLoc();
-
-  if (I) {
-    CodeRegion = I->getParent();
-    // If there is no debug location attached to the instruction, revert back to
-    // using the loop's.
-    if (I->getDebugLoc())
-      DL = I->getDebugLoc();
-  }
-
-  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
-  R << "loop not vectorized: ";
-  return R;
-}
-
-namespace {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class LoopVectorizationRequirements;
-
-} // end anonymous namespace
-
-/// Returns true if the given loop body has a cycle, excluding the loop
-/// itself.
-static bool hasCyclesInLoopBody(const Loop &L) {
-  if (!L.empty())
-    return true;
-
-  for (const auto &SCC :
-       make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
-                  scc_iterator<Loop, LoopBodyTraits>::end(L))) {
-    if (SCC.size() > 1) {
-      DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
-      DEBUG(L.dump());
-      return true;
-    }
-  }
-  return false;
-}
+static cl::opt<bool> EnableVPlanNativePath(
+    "enable-vplan-native-path", cl::init(false), cl::Hidden,
+    cl::desc("Enable VPlan-native vectorization path with "
+             "support for outer loop vectorization."));
+
+// This flag enables the stress testing of the VPlan H-CFG construction in the
+// VPlan-native vectorization path. It must be used in conjunction with
+// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
+// verification of the H-CFGs built.
+static cl::opt<bool> VPlanBuildStressTest(
+    "vplan-build-stress-test", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Build VPlan for every supported loop nest in the function and bail "
+        "out right after the build (stress test the VPlan H-CFG construction "
+        "in the VPlan-native vectorization path)."));
 
 /// A helper function for converting Scalar types to vector types.
 /// If the incoming type is void, we return void. If the VF is 1, we return
@@ -317,16 +269,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
 // in the project. They can be effectively organized in a common Load/Store
 // utilities unit.
 
-/// A helper function that returns the pointer operand of a load or store
-/// instruction.
-static Value *getPointerOperand(Value *I) {
-  if (auto *LI = dyn_cast<LoadInst>(I))
-    return LI->getPointerOperand();
-  if (auto *SI = dyn_cast<StoreInst>(I))
-    return SI->getPointerOperand();
-  return nullptr;
-}
-
 /// A helper function that returns the type of loaded or stored value.
 static Type *getMemInstValueType(Value *I) {
   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -373,7 +315,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
 
 /// A helper function that returns the reciprocal of the block probability of
 /// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for for every X iterations of the loop header.
+/// will execute once for every X iterations of the loop header.
 ///
 /// TODO: We should use actual block probability here, if available. Currently,
 /// we always assume predicated blocks have a 50% chance of executing.
@@ -502,7 +444,7 @@ public:
   void vectorizeMemoryInstruction(Instruction *Instr,
                                   VectorParts *BlockInMask = nullptr);
 
-  /// \brief Set the debug location in the builder using the debug location in
+  /// Set the debug location in the builder using the debug location in
   /// the instruction.
   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
 
@@ -538,7 +480,7 @@ protected:
   /// vectorizing this phi node.
   void fixReduction(PHINode *Phi);
 
-  /// \brief The Loop exit block may have single value PHI nodes with some
+  /// The Loop exit block may have single value PHI nodes with some
   /// incoming value. While vectorizing we only handled real values
   /// that were defined inside the loop and we should have one value for
   /// each predecessor of its parent basic block. See PR14725.
@@ -573,9 +515,9 @@ protected:
   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
   /// variable on which to base the steps, \p Step is the size of the step, and
   /// \p EntryVal is the value from the original loop that maps to the steps.
-  /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
-  /// can be a truncate instruction).
-  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+  /// Note that \p EntryVal doesn't have to be an induction variable - it
+  /// can also be a truncate instruction.
+  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                         const InductionDescriptor &ID);
 
   /// Create a vector induction phi node based on an existing scalar one. \p
@@ -602,10 +544,20 @@ protected:
   /// vector loop for both the Phi and the cast.
   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID, - Value *VectorLoopValue, - unsigned Part, - unsigned Lane = UINT_MAX); + /// + /// \p EntryVal is the value from the original loop that maps to the vector + /// phi node and is used to distinguish what is the IV currently being + /// processed - original one (if \p EntryVal is a phi corresponding to the + /// original IV) or the "newly-created" one based on the proof mentioned above + /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the + /// latter case \p EntryVal is a TruncInst and we must not record anything for + /// that IV, but it's error-prone to expect callers of this routine to care + /// about that, hence this explicit parameter. + void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID, + const Instruction *EntryVal, + Value *VectorLoopValue, + unsigned Part, + unsigned Lane = UINT_MAX); /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -646,7 +598,7 @@ protected: /// loop. void addMetadata(Instruction *To, Instruction *From); - /// \brief Similar to the previous function but it adds the metadata to a + /// Similar to the previous function but it adds the metadata to a /// vector of instructions. void addMetadata(ArrayRef<Value *> To, Instruction *From); @@ -679,7 +631,7 @@ protected: /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; - /// \brief LoopVersioning. It's only set up (non-null) if memchecks were + /// LoopVersioning. It's only set up (non-null) if memchecks were /// used. /// /// This is currently only used to add no-alias metadata based on the @@ -777,7 +729,7 @@ private: } // end namespace llvm -/// \brief Look for a meaningful debug location on the instruction or it's +/// Look for a meaningful debug location on the instruction or it's /// operands. static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { if (!I) @@ -849,7 +801,7 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, namespace llvm { -/// \brief The group of interleaved loads/stores sharing the same stride and +/// The group of interleaved loads/stores sharing the same stride and /// close to each other. /// /// Each member in this group has an index starting from 0, and the largest @@ -893,7 +845,7 @@ public: unsigned getAlignment() const { return Align; } unsigned getNumMembers() const { return Members.size(); } - /// \brief Try to insert a new member \p Instr with index \p Index and + /// Try to insert a new member \p Instr with index \p Index and /// alignment \p NewAlign. The index is related to the leader and it could be /// negative if it is the new leader. /// @@ -927,7 +879,7 @@ public: return true; } - /// \brief Get the member with the given index \p Index + /// Get the member with the given index \p Index /// /// \returns nullptr if contains no such member. Instruction *getMember(unsigned Index) const { @@ -938,7 +890,7 @@ public: return Members.find(Key)->second; } - /// \brief Get the index for the given member. Unlike the key in the member + /// Get the index for the given member. Unlike the key in the member /// map, the index starts from 0. unsigned getIndex(Instruction *Instr) const { for (auto I : Members) @@ -989,7 +941,7 @@ private: namespace { -/// \brief Drive the analysis of interleaved memory accesses in the loop. +/// Drive the analysis of interleaved memory accesses in the loop. 
/// /// Use this class to analyze interleaved accesses only when we can vectorize /// a loop. Otherwise it's meaningless to do analysis as the vectorization @@ -1000,11 +952,12 @@ namespace { class InterleavedAccessInfo { public: InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, - DominatorTree *DT, LoopInfo *LI) - : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {} + DominatorTree *DT, LoopInfo *LI, + const LoopAccessInfo *LAI) + : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} ~InterleavedAccessInfo() { - SmallSet<InterleaveGroup *, 4> DelSet; + SmallPtrSet<InterleaveGroup *, 4> DelSet; // Avoid releasing a pointer twice. for (auto &I : InterleaveGroupMap) DelSet.insert(I.second); @@ -1012,16 +965,16 @@ public: delete Ptr; } - /// \brief Analyze the interleaved accesses and collect them in interleave + /// Analyze the interleaved accesses and collect them in interleave /// groups. Substitute symbolic strides using \p Strides. - void analyzeInterleaving(const ValueToValueMap &Strides); + void analyzeInterleaving(); - /// \brief Check if \p Instr belongs to any interleave group. + /// Check if \p Instr belongs to any interleave group. bool isInterleaved(Instruction *Instr) const { return InterleaveGroupMap.count(Instr); } - /// \brief Get the interleave group that \p Instr belongs to. + /// Get the interleave group that \p Instr belongs to. /// /// \returns nullptr if doesn't have such group. InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { @@ -1030,13 +983,10 @@ public: return nullptr; } - /// \brief Returns true if an interleaved group that may access memory + /// Returns true if an interleaved group that may access memory /// out-of-bounds requires a scalar epilogue iteration for correctness. bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; } - /// \brief Initialize the LoopAccessInfo used for dependence checking. - void setLAI(const LoopAccessInfo *Info) { LAI = Info; } - private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. /// Simplifies SCEV expressions in the context of existing SCEV assumptions. @@ -1047,7 +997,7 @@ private: Loop *TheLoop; DominatorTree *DT; LoopInfo *LI; - const LoopAccessInfo *LAI = nullptr; + const LoopAccessInfo *LAI; /// True if the loop may contain non-reversed interleaved groups with /// out-of-bounds accesses. We ensure we don't speculatively access memory @@ -1061,7 +1011,7 @@ private: /// access to a set of dependent sink accesses. DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences; - /// \brief The descriptor for a strided memory access. + /// The descriptor for a strided memory access. struct StrideDescriptor { StrideDescriptor() = default; StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size, @@ -1081,10 +1031,10 @@ private: unsigned Align = 0; }; - /// \brief A type for holding instructions and their stride descriptors. + /// A type for holding instructions and their stride descriptors. using StrideEntry = std::pair<Instruction *, StrideDescriptor>; - /// \brief Create a new interleave group with the given instruction \p Instr, + /// Create a new interleave group with the given instruction \p Instr, /// stride \p Stride and alignment \p Align. /// /// \returns the newly created interleave group. @@ -1096,7 +1046,7 @@ private: return InterleaveGroupMap[Instr]; } - /// \brief Release the group and remove all the relationships. + /// Release the group and remove all the relationships. 
void releaseGroup(InterleaveGroup *Group) { for (unsigned i = 0; i < Group->getFactor(); i++) if (Instruction *Member = Group->getMember(i)) @@ -1105,28 +1055,28 @@ private: delete Group; } - /// \brief Collect all the accesses with a constant stride in program order. + /// Collect all the accesses with a constant stride in program order. void collectConstStrideAccesses( MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo, const ValueToValueMap &Strides); - /// \brief Returns true if \p Stride is allowed in an interleaved group. + /// Returns true if \p Stride is allowed in an interleaved group. static bool isStrided(int Stride) { unsigned Factor = std::abs(Stride); return Factor >= 2 && Factor <= MaxInterleaveGroupFactor; } - /// \brief Returns true if \p BB is a predicated block. + /// Returns true if \p BB is a predicated block. bool isPredicated(BasicBlock *BB) const { return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } - /// \brief Returns true if LoopAccessInfo can be used for dependence queries. + /// Returns true if LoopAccessInfo can be used for dependence queries. bool areDependencesValid() const { return LAI && LAI->getDepChecker().getDependences(); } - /// \brief Returns true if memory accesses \p A and \p B can be reordered, if + /// Returns true if memory accesses \p A and \p B can be reordered, if /// necessary, when constructing interleaved groups. /// /// \p A must precede \p B in program order. We return false if reordering is @@ -1174,7 +1124,7 @@ private: return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink); } - /// \brief Collect the dependences from LoopAccessInfo. + /// Collect the dependences from LoopAccessInfo. /// /// We process the dependences once during the interleaved access analysis to /// enable constant-time dependence queries. @@ -1187,315 +1137,6 @@ private: } }; -/// Utility class for getting and setting loop vectorizer hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -/// We cannot write all values to metadata, as the mere presence of some info, -/// for example 'force', means a decision has been made. So, we need to be -/// careful NOT to add them if the user hasn't specifically asked so. -class LoopVectorizeHints { - enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED }; - - /// Hint - associates name and validation with the hint value. - struct Hint { - const char *Name; - unsigned Value; // This may have to change for non-numeric values. - HintKind Kind; - - Hint(const char *Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) {} - - bool validate(unsigned Val) { - switch (Kind) { - case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; - case HK_UNROLL: - return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; - case HK_FORCE: - return (Val <= 1); - case HK_ISVECTORIZED: - return (Val==0 || Val==1); - } - return false; - } - }; - - /// Vectorization width. - Hint Width; - - /// Vectorization interleave factor. - Hint Interleave; - - /// Vectorization forced - Hint Force; - - /// Already Vectorized - Hint IsVectorized; - - /// Return the loop metadata prefix. - static StringRef Prefix() { return "llvm.loop."; } - - /// True if there is any unsafe math in the loop. 
- bool PotentiallyUnsafe = false; - -public: - enum ForceKind { - FK_Undefined = -1, ///< Not selected. - FK_Disabled = 0, ///< Forcing disabled. - FK_Enabled = 1, ///< Forcing enabled. - }; - - LoopVectorizeHints(const Loop *L, bool DisableInterleaving, - OptimizationRemarkEmitter &ORE) - : Width("vectorize.width", VectorizerParams::VectorizationFactor, - HK_WIDTH), - Interleave("interleave.count", DisableInterleaving, HK_UNROLL), - Force("vectorize.enable", FK_Undefined, HK_FORCE), - IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { - // Populate values with existing loop metadata. - getHintsFromMetadata(); - - // force-vector-interleave overrides DisableInterleaving. - if (VectorizerParams::isInterleaveForced()) - Interleave.Value = VectorizerParams::VectorizationInterleave; - - if (IsVectorized.Value != 1) - // If the vectorization width and interleaving count are both 1 then - // consider the loop to have been already vectorized because there's - // nothing more that we can do. - IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; - DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() - << "LV: Interleaving disabled by the pass manager\n"); - } - - /// Mark the loop L as already vectorized by setting the width to 1. - void setAlreadyVectorized() { - IsVectorized.Value = 1; - Hint Hints[] = {IsVectorized}; - writeHintsToMetadata(Hints); - } - - bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { - if (getForce() == LoopVectorizeHints::FK_Disabled) { - DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitRemarkWithHints(); - return false; - } - - if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { - DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitRemarkWithHints(); - return false; - } - - if (getIsVectorized() == 1) { - DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); - // FIXME: Add interleave.disable metadata. This will allow - // vectorize.disable to be used without disabling the pass and errors - // to differentiate between disabled vectorization and a width of 1. - ORE.emit([&]() { - return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), - "AllDisabled", L->getStartLoc(), - L->getHeader()) - << "loop not vectorized: vectorization and interleaving are " - "explicitly disabled, or the loop has already been " - "vectorized"; - }); - return false; - } - - return true; - } - - /// Dumps all the hint information. 
- void emitRemarkWithHints() const { - using namespace ore; - - ORE.emit([&]() { - if (Force.Value == LoopVectorizeHints::FK_Disabled) - return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "loop not vectorized: vectorization is explicitly disabled"; - else { - OptimizationRemarkMissed R(LV_NAME, "MissedDetails", - TheLoop->getStartLoc(), - TheLoop->getHeader()); - R << "loop not vectorized"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=" << NV("Force", true); - if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", Width.Value); - if (Interleave.Value != 0) - R << ", Interleave Count=" - << NV("InterleaveCount", Interleave.Value); - R << ")"; - } - return R; - } - }); - } - - unsigned getWidth() const { return Width.Value; } - unsigned getInterleave() const { return Interleave.Value; } - unsigned getIsVectorized() const { return IsVectorized.Value; } - enum ForceKind getForce() const { return (ForceKind)Force.Value; } - - /// \brief If hints are provided that force vectorization, use the AlwaysPrint - /// pass name to force the frontend to print the diagnostic. - const char *vectorizeAnalysisPassName() const { - if (getWidth() == 1) - return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Disabled) - return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) - return LV_NAME; - return OptimizationRemarkAnalysis::AlwaysPrint; - } - - bool allowReordering() const { - // When enabling loop hints are provided we allow the vectorizer to change - // the order of operations that is given by the scalar loop. This is not - // enabled by default because can be unsafe or inefficient. For example, - // reordering floating-point operations will change the way round-off - // error accumulates in the loop. - return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; - } - - bool isPotentiallyUnsafe() const { - // Avoid FP vectorization if the target is unsure about proper support. - // This may be related to the SIMD unit in the target not handling - // IEEE 754 FP ops properly, or bad single-to-double promotions. - // Otherwise, a sequence of vectorized loops, even without reduction, - // could lead to different end results on the destination vectors. - return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe; - } - - void setPotentiallyUnsafe() { PotentiallyUnsafe = true; } - -private: - /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector<Metadata *, 4> Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString. 
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast<MDString>(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast<MDString>(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); - } - } - - /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) - return; - Name = Name.substr(Prefix().size(), StringRef::npos); - - const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg); - if (!C) - return; - unsigned Val = C->getZExtValue(); - - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; - for (auto H : Hints) { - if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); - break; - } - } - } - - /// Create a new hint from name / value pair. - MDNode *createHintMetadata(StringRef Name, unsigned V) const { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = {MDString::get(Context, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); - } - - /// Matches metadata with hint name. - bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) { - MDString *Name = dyn_cast<MDString>(Node->getOperand(0)); - if (!Name) - return false; - - for (auto H : HintTypes) - if (Name->getString().endswith(H.Name)) - return true; - return false; - } - - /// Sets current hints into loop metadata, keeping other values intact. - void writeHintsToMetadata(ArrayRef<Hint> HintTypes) { - if (HintTypes.empty()) - return; - - // Reserve the first element to LoopID (see below). - SmallVector<Metadata *, 4> MDs(1); - // If the loop already has metadata, then ignore the existing operands. - MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); - // If node in update list, ignore old value. - if (!matchesHintMetadataName(Node, HintTypes)) - MDs.push_back(Node); - } - } - - // Now, add the missing hints. - for (auto H : HintTypes) - MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); - - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - - TheLoop->setLoopID(NewLoopID); - } - - /// The loop these hints belong to. - const Loop *TheLoop; - - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter &ORE; -}; - } // end anonymous namespace static void emitMissedWarning(Function *F, Loop *L, @@ -1519,324 +1160,7 @@ static void emitMissedWarning(Function *F, Loop *L, } } -namespace { - -/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and -/// to what vectorization factor. -/// This class does not look at the profitability of vectorization, only the -/// legality. 
This class has two main kinds of checks: -/// * Memory checks - The code in canVectorizeMemory checks if vectorization -/// will change the order of memory accesses in a way that will change the -/// correctness of the program. -/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory -/// checks for a number of different conditions, such as the availability of a -/// single induction variable, that all types are supported and vectorize-able, -/// etc. This code reflects the capabilities of InnerLoopVectorizer. -/// This class is also used by InnerLoopVectorizer for identifying -/// induction variable and the different reduction variables. -class LoopVectorizationLegality { -public: - LoopVectorizationLegality( - Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, - const TargetTransformInfo *TTI, - std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI, - OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, - LoopVectorizeHints *H) - : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA), - ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {} - - /// ReductionList contains the reduction descriptors for all - /// of the reductions that were found in the loop. - using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>; - - /// InductionList saves induction variables and maps them to the - /// induction descriptor. - using InductionList = MapVector<PHINode *, InductionDescriptor>; - - /// RecurrenceSet contains the phi nodes that are recurrences other than - /// inductions and reductions. - using RecurrenceSet = SmallPtrSet<const PHINode *, 8>; - - /// Returns true if it is legal to vectorize this loop. - /// This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. - bool canVectorize(); - - /// Returns the primary induction variable. - PHINode *getPrimaryInduction() { return PrimaryInduction; } - - /// Returns the reduction variables found in the loop. - ReductionList *getReductionVars() { return &Reductions; } - - /// Returns the induction variables found in the loop. - InductionList *getInductionVars() { return &Inductions; } - - /// Return the first-order recurrences found in the loop. - RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } - - /// Return the set of instructions to sink to handle first-order recurrences. - DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; } - - /// Returns the widest induction type. - Type *getWidestInductionType() { return WidestIndTy; } - - /// Returns True if V is a Phi node of an induction variable in this loop. - bool isInductionPhi(const Value *V); - - /// Returns True if V is a cast that is part of an induction def-use chain, - /// and had been proven to be redundant under a runtime guard (in other - /// words, the cast has the same SCEV expression as the induction phi). - bool isCastedInductionVariable(const Value *V); - - /// Returns True if V can be considered as an induction variable in this - /// loop. V can be the induction phi, or some redundant cast in the def-use - /// chain of the inducion phi. - bool isInductionVariable(const Value *V); - - /// Returns True if PN is a reduction variable in this loop. - bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } - - /// Returns True if Phi is a first-order recurrence in this loop. 
- bool isFirstOrderRecurrence(const PHINode *Phi); - - /// Return true if the block BB needs to be predicated in order for the loop - /// to be vectorized. - bool blockNeedsPredication(BasicBlock *BB); - - /// Check if this pointer is consecutive when vectorizing. This happens - /// when the last index of the GEP is the induction variable, or that the - /// pointer itself is an induction variable. - /// This check allows us to vectorize A[idx] into a wide load/store. - /// Returns: - /// 0 - Stride is unknown or non-consecutive. - /// 1 - Address is consecutive. - /// -1 - Address is consecutive, and decreasing. - /// NOTE: This method must only be used before modifying the original scalar - /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). - int isConsecutivePtr(Value *Ptr); - - /// Returns true if the value V is uniform within the loop. - bool isUniform(Value *V); - - /// Returns the information that we collected about runtime memory check. - const RuntimePointerChecking *getRuntimePointerChecking() const { - return LAI->getRuntimePointerChecking(); - } - - const LoopAccessInfo *getLAI() const { return LAI; } - - /// \brief Check if \p Instr belongs to any interleaved access group. - bool isAccessInterleaved(Instruction *Instr) { - return InterleaveInfo.isInterleaved(Instr); - } - - /// \brief Get the interleaved access group that \p Instr belongs to. - const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { - return InterleaveInfo.getInterleaveGroup(Instr); - } - - /// \brief Returns true if an interleaved group requires a scalar iteration - /// to handle accesses with gaps. - bool requiresScalarEpilogue() const { - return InterleaveInfo.requiresScalarEpilogue(); - } - - unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } - - uint64_t getMaxSafeRegisterWidth() const { - return LAI->getDepChecker().getMaxSafeRegisterWidth(); - } - - bool hasStride(Value *V) { return LAI->hasStride(V); } - - /// Returns true if the target machine supports masked store operation - /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); - } - - /// Returns true if the target machine supports masked load operation - /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); - } - - /// Returns true if the target machine supports masked scatter operation - /// for the given \p DataType. - bool isLegalMaskedScatter(Type *DataType) { - return TTI->isLegalMaskedScatter(DataType); - } - - /// Returns true if the target machine supports masked gather operation - /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType) { - return TTI->isLegalMaskedGather(DataType); - } - - /// Returns true if the target machine can represent \p V as a masked gather - /// or scatter operation. - bool isLegalGatherOrScatter(Value *V) { - auto *LI = dyn_cast<LoadInst>(V); - auto *SI = dyn_cast<StoreInst>(V); - if (!LI && !SI) - return false; - auto *Ptr = getPointerOperand(V); - auto *Ty = cast<PointerType>(Ptr->getType())->getElementType(); - return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); - } - - /// Returns true if vector representation of the instruction \p I - /// requires mask. 
- bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); } - - unsigned getNumStores() const { return LAI->getNumStores(); } - unsigned getNumLoads() const { return LAI->getNumLoads(); } - unsigned getNumPredStores() const { return NumPredStores; } - - /// Returns true if \p I is an instruction that will be scalarized with - /// predication. Such instructions include conditional stores and - /// instructions that may divide by zero. - bool isScalarWithPredication(Instruction *I); - - /// Returns true if \p I is a memory instruction with consecutive memory - /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); - - // Returns true if the NoNaN attribute is set on the function. - bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } - -private: - /// Check if a single basic block loop is vectorizable. - /// At this point we know that this is a loop with a constant trip count - /// and we only need to check individual instructions. - bool canVectorizeInstrs(); - - /// When we vectorize loops we may change the order in which - /// we read and write from memory. This method checks if it is - /// legal to vectorize the code, considering only memory constrains. - /// Returns true if the loop is vectorizable - bool canVectorizeMemory(); - - /// Return true if we can vectorize this loop using the IF-conversion - /// transformation. - bool canVectorizeWithIfConvert(); - - /// Return true if all of the instructions in the block can be speculatively - /// executed. \p SafePtrs is a list of addresses that are known to be legal - /// and we know that we can read from them without segfault. - bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs); - - /// Updates the vectorization state by adding \p Phi to the inductions list. - /// This can set \p Phi as the main induction of the loop if \p Phi is a - /// better choice for the main induction than the existing one. - void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID, - SmallPtrSetImpl<Value *> &AllowedExit); - - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. If \p I is passed it is - /// an instruction that prevents vectorization. Otherwise the loop is used - /// for the location of the remark. \return the remark object that can be - /// streamed to. - OptimizationRemarkAnalysis - createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const { - return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop, I); - } - - /// \brief If an access has a symbolic strides, this maps the pointer value to - /// the stride symbol. - const ValueToValueMap *getSymbolicStrides() { - // FIXME: Currently, the set of symbolic strides is sometimes queried before - // it's collected. This happens from canVectorizeWithIfConvert, when the - // pointer is checked to reference consecutive elements suitable for a - // masked access. - return LAI ? &LAI->getSymbolicStrides() : nullptr; - } - - unsigned NumPredStores = 0; - - /// The loop that we evaluate. - Loop *TheLoop; - - /// A wrapper around ScalarEvolution used to add runtime SCEV checks. - /// Applies dynamic knowledge to simplify SCEV expressions in the context - /// of existing SCEV assumptions. The analysis will also add a minimal set - /// of new predicates if this is required to enable vectorization and - /// unrolling. - PredicatedScalarEvolution &PSE; - - /// Target Library Info. 
- TargetLibraryInfo *TLI; - - /// Target Transform Info - const TargetTransformInfo *TTI; - - /// Dominator Tree. - DominatorTree *DT; - - // LoopAccess analysis. - std::function<const LoopAccessInfo &(Loop &)> *GetLAA; - - // And the loop-accesses info corresponding to this loop. This pointer is - // null until canVectorizeMemory sets it up. - const LoopAccessInfo *LAI = nullptr; - - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter *ORE; - - /// The interleave access information contains groups of interleaved accesses - /// with the same stride and close to each other. - InterleavedAccessInfo InterleaveInfo; - - // --- vectorization state --- // - - /// Holds the primary induction variable. This is the counter of the - /// loop. - PHINode *PrimaryInduction = nullptr; - - /// Holds the reduction variables. - ReductionList Reductions; - - /// Holds all of the induction variables that we found in the loop. - /// Notice that inductions don't need to start at zero and that induction - /// variables can be pointers. - InductionList Inductions; - - /// Holds all the casts that participate in the update chain of the induction - /// variables, and that have been proven to be redundant (possibly under a - /// runtime guard). These casts can be ignored when creating the vectorized - /// loop body. - SmallPtrSet<Instruction *, 4> InductionCastsToIgnore; - - /// Holds the phi nodes that are first-order recurrences. - RecurrenceSet FirstOrderRecurrences; - - /// Holds instructions that need to sink past other instructions to handle - /// first-order recurrences. - DenseMap<Instruction *, Instruction *> SinkAfter; - - /// Holds the widest induction type encountered. - Type *WidestIndTy = nullptr; - - /// Allowed outside users. This holds the induction and reduction - /// vars which can be accessed from outside the loop. - SmallPtrSet<Value *, 4> AllowedExit; - - /// Can we assume the absence of NaNs. - bool HasFunNoNaNAttr = false; - - /// Vectorization requirements that will go through late-evaluation. - LoopVectorizationRequirements *Requirements; - - /// Used to emit an analysis of any legality issues. - LoopVectorizeHints *Hints; - - /// While vectorizing these instructions we have to generate a - /// call to the appropriate masked intrinsic - SmallPtrSet<const Instruction *, 8> MaskedOp; -}; +namespace llvm { /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. @@ -1853,23 +1177,15 @@ public: const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, - const LoopVectorizeHints *Hints) + const LoopVectorizeHints *Hints, + InterleavedAccessInfo &IAI) : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {} + AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization should be avoided up front. Optional<unsigned> computeMaxVF(bool OptForSize); - /// Information about vectorization costs - struct VectorizationFactor { - // Vector width with best cost - unsigned Width; - - // Cost of the loop with that width - unsigned Cost; - }; - /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is @@ -1903,7 +1219,7 @@ public: /// avoid redundant calculations. 
void setCostBasedWideningDecision(unsigned VF); - /// \brief A struct that represents some properties of the register usage + /// A struct that represents some properties of the register usage /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. @@ -1911,9 +1227,6 @@ public: /// Holds the maximum number of concurrent live intervals in the loop. unsigned MaxLocalUsers; - - /// Holds the number of instructions in the loop. - unsigned NumInstructions; }; /// \return Returns information about the register usages of the loop for the @@ -2063,7 +1376,69 @@ public: collectLoopScalars(VF); } + /// Returns true if the target machine supports masked store operation + /// for the given \p DataType and kind of access to \p Ptr. + bool isLegalMaskedStore(Type *DataType, Value *Ptr) { + return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType); + } + + /// Returns true if the target machine supports masked load operation + /// for the given \p DataType and kind of access to \p Ptr. + bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { + return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType); + } + + /// Returns true if the target machine supports masked scatter operation + /// for the given \p DataType. + bool isLegalMaskedScatter(Type *DataType) { + return TTI.isLegalMaskedScatter(DataType); + } + + /// Returns true if the target machine supports masked gather operation + /// for the given \p DataType. + bool isLegalMaskedGather(Type *DataType) { + return TTI.isLegalMaskedGather(DataType); + } + + /// Returns true if the target machine can represent \p V as a masked gather + /// or scatter operation. + bool isLegalGatherOrScatter(Value *V) { + bool LI = isa<LoadInst>(V); + bool SI = isa<StoreInst>(V); + if (!LI && !SI) + return false; + auto *Ty = getMemInstValueType(V); + return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + } + + /// Returns true if \p I is an instruction that will be scalarized with + /// predication. Such instructions include conditional stores and + /// instructions that may divide by zero. + bool isScalarWithPredication(Instruction *I); + + /// Returns true if \p I is a memory instruction with consecutive memory + /// access that can be widened. + bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + + /// Check if \p Instr belongs to any interleaved access group. + bool isAccessInterleaved(Instruction *Instr) { + return InterleaveInfo.isInterleaved(Instr); + } + + /// Get the interleaved access group that \p Instr belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + return InterleaveInfo.getInterleaveGroup(Instr); + } + + /// Returns true if an interleaved group requires a scalar iteration + /// to handle accesses with gaps. + bool requiresScalarEpilogue() const { + return InterleaveInfo.requiresScalarEpilogue(); + } + private: + unsigned NumPredStores = 0; + /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); @@ -2115,12 +1490,16 @@ private: /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); + /// Returns true if an artificially high cost for emulated masked memrefs + /// should be used. 
+ bool useEmulatedMaskMemRefHack(Instruction *I); + /// Create an analysis remark that explains why vectorization failed /// /// \p RemarkName is the identifier for the remark. \return the remark object /// that can be streamed to. OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), + return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), RemarkName, TheLoop); } @@ -2222,6 +1601,10 @@ public: /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; + /// The interleave access information contains groups of interleaved accesses + /// with the same stride and close to each other. + InterleavedAccessInfo &InterleaveInfo; + /// Values to ignore in the cost model. SmallPtrSet<const Value *, 16> ValuesToIgnore; @@ -2229,271 +1612,78 @@ public: SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; -} // end anonymous namespace - -namespace llvm { - -/// InnerLoopVectorizer vectorizes loops which contain only one basic -/// LoopVectorizationPlanner - drives the vectorization process after having -/// passed Legality checks. -/// The planner builds and optimizes the Vectorization Plans which record the -/// decisions how to vectorize the given loop. In particular, represent the -/// control-flow of the vectorized version, the replication of instructions that -/// are to be scalarized, and interleave access groups. -class LoopVectorizationPlanner { - /// The loop that we evaluate. - Loop *OrigLoop; - - /// Loop Info analysis. - LoopInfo *LI; - - /// Target Library Info. - const TargetLibraryInfo *TLI; - - /// Target Transform Info. - const TargetTransformInfo *TTI; - - /// The legality analysis. - LoopVectorizationLegality *Legal; - - /// The profitablity analysis. - LoopVectorizationCostModel &CM; - - using VPlanPtr = std::unique_ptr<VPlan>; - - SmallVector<VPlanPtr, 4> VPlans; - - /// This class is used to enable the VPlan to invoke a method of ILV. This is - /// needed until the method is refactored out of ILV and becomes reusable. - struct VPCallbackILV : public VPCallback { - InnerLoopVectorizer &ILV; - - VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {} - - Value *getOrCreateVectorValues(Value *V, unsigned Part) override { - return ILV.getOrCreateVectorValue(V, Part); - } - }; - - /// A builder used to construct the current plan. - VPBuilder Builder; - - /// When we if-convert we need to create edge masks. We have to cache values - /// so that we don't end up with exponential recursion/IR. Note that - /// if-conversion currently takes place during VPlan-construction, so these - /// caches are only used at that stage. - using EdgeMaskCacheTy = - DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>; - using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>; - EdgeMaskCacheTy EdgeMaskCache; - BlockMaskCacheTy BlockMaskCache; - - unsigned BestVF = 0; - unsigned BestUF = 0; - -public: - LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, - LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {} - - /// Plan how to best vectorize, return the best VF and its cost. - LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, - unsigned UserVF); - - /// Finalize the best decision and dispose of all other VPlans. 
- void setBestPlan(unsigned VF, unsigned UF); - - /// Generate the IR code for the body of the vectorized loop according to the - /// best selected VPlan. - void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); - - void printPlans(raw_ostream &O) { - for (const auto &Plan : VPlans) - O << *Plan; - } - -protected: - /// Collect the instructions from the original loop that would be trivially - /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions( - SmallPtrSetImpl<Instruction *> &DeadInstructions); - - /// A range of powers-of-2 vectorization factors with fixed start and - /// adjustable end. The range includes start and excludes end, e.g.,: - /// [1, 9) = {1, 2, 4, 8} - struct VFRange { - // A power of 2. - const unsigned Start; - - // Need not be a power of 2. If End <= Start range is empty. - unsigned End; - }; - - /// Test a \p Predicate on a \p Range of VF's. Return the value of applying - /// \p Predicate on Range.Start, possibly decreasing Range.End such that the - /// returned value holds for the entire \p Range. - bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate, - VFRange &Range); - - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, - /// according to the information gathered by Legal when it checked if it is - /// legal to vectorize the loop. - void buildVPlans(unsigned MinVF, unsigned MaxVF); - -private: - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. - VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan); - - /// A helper function that computes the predicate of the edge between SRC - /// and DST. - VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); - - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. - /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. - /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is <true, nullptr>, as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); - - // Check if \I is a memory instruction to be widened for \p Range.Start and - // potentially masked. Such instructions are handled by a recipe that takes an - // additional VPInstruction for the mask. - VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I, - VFRange &Range, - VPlanPtr &Plan); - - /// Check if an induction recipe should be constructed for \I within the given - /// VF \p Range. If so build and return it. If not, return null. \p Range.End - /// may be decreased to ensure same decision from \p Range.Start to - /// \p Range.End. - VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, - VFRange &Range); - - /// Handle non-loop phi nodes. Currently all such phi nodes are turned into - /// a sequence of select instructions as the vectorizer currently performs - /// full if-conversion. - VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan); - - /// Check if \p I can be widened within the given VF \p Range. 
If \p I can be - /// widened for \p Range.Start, check if the last recipe of \p VPBB can be - /// extended to include \p I or else build a new VPWidenRecipe for it and - /// append it to \p VPBB. Return true if \p I can be widened for Range.Start, - /// false otherwise. Range.End may be decreased to ensure same decision from - /// \p Range.Start to \p Range.End. - bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range); - - /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it - /// is predicated. \return \p VPBB augmented with this new recipe if \p I is - /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new - /// Region. Update the packing decision of predicated instructions if they - /// feed \p I. Range.End may be decreased to ensure same recipe behavior from - /// \p Range.Start to \p Range.End. - VPBasicBlock *handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, - VPlanPtr &Plan); - - /// Create a replicating region for instruction \p I that requires - /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. - VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe, - VPlanPtr &Plan); - - /// Build a VPlan according to the information gathered by Legal. \return a - /// VPlan for vectorization factors \p Range.Start and up to \p Range.End - /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range, - const SmallPtrSetImpl<Value *> &NeedDef); -}; - } // end namespace llvm -namespace { - -/// \brief This holds vectorization requirements that must be verified late in -/// the process. The requirements are set by legalize and costmodel. Once -/// vectorization has been determined to be possible and profitable the -/// requirements can be verified by looking for metadata or compiler options. -/// For example, some loops require FP commutativity which is only allowed if -/// vectorization is explicitly specified or if the fast-math compiler option -/// has been provided. -/// Late evaluation of these requirements allows helpful diagnostics to be -/// composed that tells the user what need to be done to vectorize the loop. For -/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late -/// evaluation should be used only when diagnostics can generated that can be -/// followed by a non-expert user. -class LoopVectorizationRequirements { -public: - LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {} - - void addUnsafeAlgebraInst(Instruction *I) { - // First unsafe algebra instruction. - if (!UnsafeAlgebraInst) - UnsafeAlgebraInst = I; - } - - void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } - - bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { - const char *PassName = Hints.vectorizeAnalysisPassName(); - bool Failed = false; - if (UnsafeAlgebraInst && !Hints.allowReordering()) { - ORE.emit([&]() { - return OptimizationRemarkAnalysisFPCommute( - PassName, "CantReorderFPOps", - UnsafeAlgebraInst->getDebugLoc(), - UnsafeAlgebraInst->getParent()) - << "loop not vectorized: cannot prove it is safe to reorder " - "floating-point operations"; - }); - Failed = true; - } - - // Test if runtime memcheck thresholds are exceeded. 
- bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE.emit([&]() { - return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", - L->getStartLoc(), - L->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Failed = true; - } +// Return true if \p OuterLp is an outer loop annotated with hints for explicit +// vectorization. The loop needs to be annotated with #pragma omp simd +// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the +// vector length information is not provided, vectorization is not considered +// explicit. Interleave hints are not allowed either. These limitations will be +// relaxed in the future. +// Please, note that we are currently forced to abuse the pragma 'clang +// vectorize' semantics. This pragma provides *auto-vectorization hints* +// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' +// provides *explicit vectorization hints* (LV can bypass legal checks and +// assume that vectorization is legal). However, both hints are implemented +// using the same metadata (llvm.loop.vectorize, processed by +// LoopVectorizeHints). This will be fixed in the future when the native IR +// representation for pragma 'omp simd' is introduced. +static bool isExplicitVecOuterLoop(Loop *OuterLp, + OptimizationRemarkEmitter *ORE) { + assert(!OuterLp->empty() && "This is not an outer loop"); + LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); + + // Only outer loops with an explicit vectorization hint are supported. + // Unannotated outer loops are ignored. + if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) + return false; - return Failed; + Function *Fn = OuterLp->getHeader()->getParent(); + if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) { + LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); + return false; } -private: - unsigned NumRuntimePointerChecks = 0; - Instruction *UnsafeAlgebraInst = nullptr; + if (!Hints.getWidth()) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n"); + emitMissedWarning(Fn, OuterLp, Hints, ORE); + return false; + } - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter &ORE; -}; + if (Hints.getInterleave() > 1) { + // TODO: Interleave support is future work. + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " + "outer loops.\n"); + emitMissedWarning(Fn, OuterLp, Hints, ORE); + return false; + } -} // end anonymous namespace + return true; +} -static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) { - if (L.empty()) { - if (!hasCyclesInLoopBody(L)) +static void collectSupportedLoops(Loop &L, LoopInfo *LI, + OptimizationRemarkEmitter *ORE, + SmallVectorImpl<Loop *> &V) { + // Collect inner loops and outer loops without irreducible control flow. For + // now, only collect outer loops that have explicit vectorization hints. If we + // are stress testing the VPlan H-CFG construction, we collect the outermost + // loop of every loop nest. 
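// Illustrative source-level example (not part of this patch) of the annotation
// isExplicitVecOuterLoop looks for: the outer loop carries an explicit
// vectorization hint with a width and no interleave hint, so it is collected
// for the VPlan-native path when that path is enabled.
void scaleMatrix(float **A, int M, int N) {
#pragma clang loop vectorize(enable) vectorize_width(4)
  for (int I = 0; I < M; ++I)     // annotated outer loop
    for (int J = 0; J < N; ++J)   // inner loop
      A[I][J] *= 2.0f;
}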
+ if (L.empty() || VPlanBuildStressTest || + (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { + LoopBlocksRPO RPOT(&L); + RPOT.perform(LI); + if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { V.push_back(&L); - return; + // TODO: Collect inner loops inside marked outer loops in case + // vectorization fails for the outer loop. Do not invoke + // 'containsIrreducibleCFG' again for inner loops when the outer loop is + // already known to be reducible. We can use an inherited attribute for + // that. + return; + } } for (Loop *InnerL : L) - addAcyclicInnerLoop(*InnerL, V); + collectSupportedLoops(*InnerL, LI, ORE, V); } namespace { @@ -2562,14 +1752,16 @@ struct LoopVectorize : public FunctionPass { //===----------------------------------------------------------------------===// Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { - // We need to place the broadcast of invariant variables outside the loop. + // We need to place the broadcast of invariant variables outside the loop, + // but only if it's proven safe to do so. Else, broadcast will be inside + // vector loop body. Instruction *Instr = dyn_cast<Instruction>(V); - bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); - bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; - + bool SafeToHoist = OrigLoop->isLoopInvariant(V) && + (!Instr || + DT->dominates(Instr->getParent(), LoopVectorPreHeader)); // Place the code for broadcasting invariant variables in the new preheader. IRBuilder<>::InsertPointGuard Guard(Builder); - if (Invariant) + if (SafeToHoist) Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); // Broadcast the scalar into all locations in the vector. @@ -2580,6 +1772,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); Value *Start = II.getStartValue(); // Construct the initial value of the vector IV in the vector loop preheader @@ -2627,14 +1821,18 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // factor. The last of those goes into the PHI. PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", &*LoopVectorBody->getFirstInsertionPt()); + VecInd->setDebugLoc(EntryVal->getDebugLoc()); Instruction *LastInduction = VecInd; for (unsigned Part = 0; Part < UF; ++Part) { VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); - recordVectorLoopValueForInductionCast(II, LastInduction, Part); + if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); + recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); + LastInduction = cast<Instruction>(addFastMathFlag( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); + LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } // Move the last step to the end of the latch block. 
This ensures consistent @@ -2665,8 +1863,20 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { } void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part, - unsigned Lane) { + const InductionDescriptor &ID, const Instruction *EntryVal, + Value *VectorLoopVal, unsigned Part, unsigned Lane) { + assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && + "Expected either an induction phi-node or a truncate of it!"); + + // This induction variable is not the phi from the original loop but the + // newly-created IV based on the proof that casted Phi is equal to the + // uncasted Phi in the vectorized loop (under a runtime guard possibly). It + // re-uses the same InductionDescriptor that original IV uses but we don't + // have to do any recording in this case - that is done when original IV is + // processed. + if (isa<TruncInst>(EntryVal)) + return; + const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); if (Casts.empty()) return; @@ -2754,15 +1964,16 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { // If we haven't yet vectorized the induction variable, splat the scalar // induction variable, and build the necessary step vectors. + // TODO: Don't do it unless the vectorized IV is really required. if (!VectorizedIV) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { Value *EntryPart = getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); - recordVectorLoopValueForInductionCast(ID, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); + recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); } } @@ -2833,7 +2044,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, } void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, - Value *EntryVal, + Instruction *EntryVal, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF > 1 && "VF should be greater than one"); @@ -2868,25 +2079,11 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); - recordVectorLoopValueForInductionCast(ID, Add, Part, Lane); + recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); } } } -int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { - const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : - ValueToValueMap(); - - int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); - if (Stride == 1 || Stride == -1) - return Stride; - return 0; -} - -bool LoopVectorizationLegality::isUniform(Value *V) { - return LAI->isUniform(V); -} - Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); @@ -3046,7 +2243,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { - const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); + const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); // Skip if current instruction is not the insert position. @@ -3054,7 +2251,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { return; const DataLayout &DL = Instr->getModule()->getDataLayout(); - Value *Ptr = getPointerOperand(Instr); + Value *Ptr = getLoadStorePointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); @@ -3076,6 +2273,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { if (Group->isReverse()) Index += (VF - 1) * Group->getFactor(); + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + for (unsigned Part = 0; Part < UF; Part++) { Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); @@ -3091,6 +2292,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // A[i+2] = c; // Member of index 2 (Current instruction) // Current pointer is pointed to A[i+2], adjust it to A[i]. NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); + if (InBounds) + cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); // Cast to the vector pointer type. NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); @@ -3196,7 +2399,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = getPointerOperand(Instr); + Value *Ptr = getLoadStorePointerOperand(Instr); unsigned Alignment = getMemInstAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. @@ -3227,10 +2430,37 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (isMaskRequired) Mask = *BlockInMask; + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>( + getLoadStorePointerOperand(Instr)->stripPointerCasts())) + InBounds = gep->isInBounds(); + + const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { + // Calculate the pointer for the specific unroll-part. + GetElementPtrInst *PartPtr = nullptr; + + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. 
+ PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF))); + PartPtr->setIsInBounds(InBounds); + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF))); + PartPtr->setIsInBounds(InBounds); + if (isMaskRequired) // Reverse of a null all-one mask is a null mask. + Mask[Part] = reverseVector(Mask[Part]); + } else { + PartPtr = cast<GetElementPtrInst>( + Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF))); + PartPtr->setIsInBounds(InBounds); + } + + return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + }; + // Handle Stores: if (SI) { - assert(!Legal->isUniform(SI->getPointerOperand()) && - "We do not allow storing to uniform addresses"); setDebugLocFromInst(Builder, SI); for (unsigned Part = 0; Part < UF; ++Part) { @@ -3242,30 +2472,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, MaskPart); } else { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = - Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - if (Reverse) { // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal = reverseVector(StoredVal); // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). - - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = - Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); - PartPtr = - Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); } - - Value *VecPtr = - Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - + auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask[Part]); @@ -3289,21 +2503,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = - Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide load needs to start at the last vector element. - PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); - } - - Value *VecPtr = - Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); + auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], UndefValue::get(DataTy), @@ -3457,7 +2657,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations // check ensures that N >= Step. 
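// Minimal sketch (not part of the patch) of the index arithmetic the
// CreateVecPtr lambda above performs: the element offset of the per-part wide
// pointer, for forward and reversed consecutive accesses.
int partPointerOffset(int Part, int VF, bool Reverse) {
  if (Reverse)
    return -Part * VF + (1 - VF); // e.g. VF=4: Part 0 -> -3, Part 1 -> -7
  return Part * VF;               // e.g. VF=4: Part 0 ->  0, Part 1 ->  4
}
// And an illustrative source pattern for the interleaved-access hunks further
// above: three stores with stride 3 form one interleave group and, for VF=4,
// are combined into a single wide store of 12 elements after shuffling.
void fillRGB(int *Pixels, int N, int R, int G, int B) {
  for (int I = 0; I < N; ++I) {
    Pixels[3 * I + 0] = R; // member 0 of the group
    Pixels[3 * I + 1] = G; // member 1
    Pixels[3 * I + 2] = B; // member 2
  }
}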
- if (VF > 1 && Legal->requiresScalarEpilogue()) { + if (VF > 1 && Cost->requiresScalarEpilogue()) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -3508,8 +2708,8 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE + : ICmpInst::ICMP_ULT; Value *CheckMinIters = Builder.CreateICmp( P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); @@ -3714,6 +2914,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create( OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); + // Copy original phi DL over to the new one. + BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; if (OrigPhi == OldInduction) { // We know what the end value is. @@ -3871,7 +3073,7 @@ struct CSEDenseMapInfo { } // end anonymous namespace -///\brief Perform cse of induction variable instructions. +///Perform cse of induction variable instructions. static void cse(BasicBlock *BB) { // Perform simple cse. SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; @@ -3893,7 +3095,7 @@ static void cse(BasicBlock *BB) { } } -/// \brief Estimate the overhead of scalarizing an instruction. This is a +/// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, const TargetTransformInfo &TTI) { @@ -4074,7 +3276,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); - } else if (isa<LoadInst>(I)) { + } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { // Don't do anything with the operands, just extend the result. continue; } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { @@ -4089,7 +3291,8 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); NewI = B.CreateExtractElement(O0, EE->getOperand(2)); } else { - llvm_unreachable("Unhandled instruction type!"); + // If we don't know what to do, be conservative and don't do anything. + continue; } // Lastly, extend the result. @@ -4164,15 +3367,12 @@ void InnerLoopVectorizer::fixCrossIterationPHIs() { // the currently empty PHI nodes. At this point every instruction in the // original loop is widened to a vector form so we can use them to construct // the incoming edges. - for (Instruction &I : *OrigLoop->getHeader()) { - PHINode *Phi = dyn_cast<PHINode>(&I); - if (!Phi) - break; + for (PHINode &Phi : OrigLoop->getHeader()->phis()) { // Handle first-order recurrences and reductions that need to be fixed. 
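// Minimal sketch (illustrative only, unsigned arithmetic assumed) of the
// vector trip-count adjustment in the hunk above: when a scalar epilogue is
// required, a remainder of zero is bumped to one full vector step so the
// epilogue always runs at least once.
unsigned vectorTripCount(unsigned N, unsigned VF, unsigned UF,
                         bool RequiresScalarEpilogue) {
  unsigned Step = VF * UF;
  unsigned R = N % Step;
  if (VF > 1 && RequiresScalarEpilogue && R == 0)
    R = Step;
  return N - R; // e.g. N=16, VF=4, UF=2 with an epilogue -> 8, not 16
}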
- if (Legal->isFirstOrderRecurrence(Phi)) - fixFirstOrderRecurrence(Phi); - else if (Legal->isReductionVariable(Phi)) - fixReduction(Phi); + if (Legal->isFirstOrderRecurrence(&Phi)) + fixFirstOrderRecurrence(&Phi); + else if (Legal->isReductionVariable(&Phi)) + fixReduction(&Phi); } } @@ -4335,15 +3535,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Finally, fix users of the recurrence outside the loop. The users will need // either the last value of the scalar recurrence or the last value of the // vector recurrence we extracted in the middle block. Since the loop is in - // LCSSA form, we just need to find the phi node for the original scalar + // LCSSA form, we just need to find all the phi nodes for the original scalar // recurrence in the exit block, and then add an edge for the middle block. - for (auto &I : *LoopExitBlock) { - auto *LCSSAPhi = dyn_cast<PHINode>(&I); - if (!LCSSAPhi) - break; - if (LCSSAPhi->getIncomingValue(0) == Phi) { - LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); - break; + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (LCSSAPhi.getIncomingValue(0) == Phi) { + LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); } } } @@ -4499,21 +3695,15 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the // PHI nodes in the exit blocks. - for (BasicBlock::iterator LEI = LoopExitBlock->begin(), - LEE = LoopExitBlock->end(); - LEI != LEE; ++LEI) { - PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); - if (!LCSSAPhi) - break; - + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { // All PHINodes need to have a single entry edge, or two if // we already fixed them. - assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); // We found a reduction value exit-PHI. Update it with the // incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) - LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) + LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); } // end of the LCSSA phi scan. // Fix the scalar loop reduction variable with the incoming reduction sum @@ -4528,14 +3718,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } void InnerLoopVectorizer::fixLCSSAPHIs() { - for (Instruction &LEI : *LoopExitBlock) { - auto *LCSSAPhi = dyn_cast<PHINode>(&LEI); - if (!LCSSAPhi) - break; - if (LCSSAPhi->getNumIncomingValues() == 1) { - assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) && + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (LCSSAPhi.getNumIncomingValues() == 1) { + assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) && "Incoming value isn't loop invariant"); - LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock); + LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock); } } } @@ -4955,7 +4142,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { default: // This instruction is not vectorized by simple widening. - DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); llvm_unreachable("Unhandled instruction!"); } // end of switch. 
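// Illustrative example (not part of this patch) of a first-order recurrence
// handled by fixFirstOrderRecurrence above: each iteration uses the value
// produced by the previous one, so the vectorizer must carry the last element
// of the previous vector iteration into the next and fix up the exit-block
// LCSSA phis accordingly.
void diffAdjacent(const int *A, int *Out, int N) {
  int Prev = 0;
  for (int I = 0; I < N; ++I) {
    Out[I] = A[I] - Prev; // uses the previous iteration's A value
    Prev = A[I];
  }
}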
} @@ -4973,467 +4160,7 @@ void InnerLoopVectorizer::updateAnalysis() { DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - DEBUG(DT->verifyDomTree()); -} - -/// \brief Check whether it is safe to if-convert this phi node. -/// -/// Phi nodes with constant expressions that can trap are not safe to if -/// convert. -static bool canIfConvertPHINodes(BasicBlock *BB) { - for (Instruction &I : *BB) { - auto *Phi = dyn_cast<PHINode>(&I); - if (!Phi) - return true; - for (Value *V : Phi->incoming_values()) - if (auto *C = dyn_cast<Constant>(V)) - if (C->canTrap()) - return false; - } - return true; -} - -bool LoopVectorizationLegality::canVectorizeWithIfConvert() { - if (!EnableIfConversion) { - ORE->emit(createMissedAnalysis("IfConversionDisabled") - << "if-conversion is disabled"); - return false; - } - - assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - - // A list of pointers that we can safely read and write to. - SmallPtrSet<Value *, 8> SafePointes; - - // Collect safe addresses. - for (BasicBlock *BB : TheLoop->blocks()) { - if (blockNeedsPredication(BB)) - continue; - - for (Instruction &I : *BB) - if (auto *Ptr = getPointerOperand(&I)) - SafePointes.insert(Ptr); - } - - // Collect the blocks that need predication. - BasicBlock *Header = TheLoop->getHeader(); - for (BasicBlock *BB : TheLoop->blocks()) { - // We don't support switch statements inside loops. - if (!isa<BranchInst>(BB->getTerminator())) { - ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator()) - << "loop contains a switch statement"); - return false; - } - - // We must be able to predicate all blocks that need to be predicated. - if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointes)) { - ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) - << "control flow cannot be substituted for a select"); - return false; - } - } else if (BB != Header && !canIfConvertPHINodes(BB)) { - ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) - << "control flow cannot be substituted for a select"); - return false; - } - } - - // We can if-convert this loop. - return true; -} - -bool LoopVectorizationLegality::canVectorize() { - // Store the result and return it at the end instead of exiting early, in case - // allowExtraAnalysis is used to report multiple reasons for not vectorizing. - bool Result = true; - - bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); - // We must have a loop in canonical form. Loops with indirectbr in them cannot - // be canonicalized. - if (!TheLoop->getLoopPreheader()) { - DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n"); - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // FIXME: The code is currently dead, since the loop gets sent to - // LoopVectorizationLegality is already an innermost loop. - // - // We can only vectorize innermost loops. - if (!TheLoop->empty()) { - ORE->emit(createMissedAnalysis("NotInnermostLoop") - << "loop is not the innermost loop"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We must have a single backedge. 
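// Illustrative example (not part of this patch) for the if-conversion
// legality code above, which this patch removes from this file: the predicated
// store becomes a select or a masked store, so the loop body collapses into a
// single block that can be widened.
void clampNegatives(int *A, int N) {
  for (int I = 0; I < N; ++I)
    if (A[I] < 0)
      A[I] = 0; // conditional store: if-converted before vectorization
}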
- if (TheLoop->getNumBackEdges() != 1) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We must have a single exiting block. - if (!TheLoop->getExitingBlock()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We only handle bottom-tested loops, i.e. loop in which the condition is - // checked at the end of each iteration. With that we can assume that all - // instructions in the loop are executed the same number of times. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We need to have a loop header. - DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() - << '\n'); - - // Check if we can if-convert non-single-bb loops. - unsigned NumBlocks = TheLoop->getNumBlocks(); - if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { - DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Check if we can vectorize the instructions and CFG in this loop. - if (!canVectorizeInstrs()) { - DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Go over each instruction and look at memory deps. - if (!canVectorizeMemory()) { - DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - DEBUG(dbgs() << "LV: We can vectorize this loop" - << (LAI->getRuntimePointerChecking()->Need - ? " (with a runtime bound check)" - : "") - << "!\n"); - - bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); - - // If an override option has been passed in for interleaved accesses, use it. - if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) - UseInterleaved = EnableInterleavedMemAccesses; - - // Analyze interleaved memory accesses. - if (UseInterleaved) - InterleaveInfo.analyzeInterleaving(*getSymbolicStrides()); - - unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; - if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) - SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; - - if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { - ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks") - << "Too many SCEV assumptions need to be made and checked " - << "at runtime"); - DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Okay! We've done all the tests. If any have failed, return false. Otherwise - // we can vectorize, and at this point we don't have any other mem analysis - // which may limit our maximum vectorization factor, so just return true with - // no restrictions. - return Result; -} - -static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { - if (Ty->isPointerTy()) - return DL.getIntPtrType(Ty); - - // It is possible that char's or short's overflow when we ask for the loop's - // trip count, work around this by changing the type size. 
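// Illustrative example (not part of this patch) of a loop shape these
// structural checks reject: the early return adds a second exiting block, so
// the loop is neither single-exit nor bottom-tested and its control flow is
// reported as not understood by the vectorizer.
int findFirstNegative(const int *A, int N) {
  for (int I = 0; I < N; ++I)
    if (A[I] < 0)
      return I; // second exit out of the loop
  return -1;
}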
- if (Ty->getScalarSizeInBits() < 32) - return Type::getInt32Ty(Ty->getContext()); - - return Ty; -} - -static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { - Ty0 = convertPointerToIntegerType(DL, Ty0); - Ty1 = convertPointerToIntegerType(DL, Ty1); - if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) - return Ty0; - return Ty1; -} - -/// \brief Check that the instruction has outside loop users and is not an -/// identified reduction variable. -static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, - SmallPtrSetImpl<Value *> &AllowedExit) { - // Reduction and Induction instructions are allowed to have exit users. All - // other instructions must not have external users. - if (!AllowedExit.count(Inst)) - // Check that all of the users of the loop are inside the BB. - for (User *U : Inst->users()) { - Instruction *UI = cast<Instruction>(U); - // This user may be a reduction exit value. - if (!TheLoop->contains(UI)) { - DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); - return true; - } - } - return false; -} - -void LoopVectorizationLegality::addInductionPhi( - PHINode *Phi, const InductionDescriptor &ID, - SmallPtrSetImpl<Value *> &AllowedExit) { - Inductions[Phi] = ID; - - // In case this induction also comes with casts that we know we can ignore - // in the vectorized loop body, record them here. All casts could be recorded - // here for ignoring, but suffices to record only the first (as it is the - // only one that may bw used outside the cast sequence). - const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); - if (!Casts.empty()) - InductionCastsToIgnore.insert(*Casts.begin()); - - Type *PhiTy = Phi->getType(); - const DataLayout &DL = Phi->getModule()->getDataLayout(); - - // Get the widest type. - if (!PhiTy->isFloatingPointTy()) { - if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(DL, PhiTy); - else - WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); - } - - // Int inductions are special because we only allow one IV. - if (ID.getKind() == InductionDescriptor::IK_IntInduction && - ID.getConstIntStepValue() && - ID.getConstIntStepValue()->isOne() && - isa<Constant>(ID.getStartValue()) && - cast<Constant>(ID.getStartValue())->isNullValue()) { - - // Use the phi node with the widest type as induction. Use the last - // one if there are multiple (no good reason for doing this other - // than it is expedient). We've checked that it begins at zero and - // steps by one, so this is a canonical induction variable. - if (!PrimaryInduction || PhiTy == WidestIndTy) - PrimaryInduction = Phi; - } - - // Both the PHI node itself, and the "post-increment" value feeding - // back into the PHI node may have external users. - // We can allow those uses, except if the SCEVs we have for them rely - // on predicates that only hold within the loop, since allowing the exit - // currently means re-using this SCEV outside the loop. - if (PSE.getUnionPredicate().isAlwaysTrue()) { - AllowedExit.insert(Phi); - AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); - } - - DEBUG(dbgs() << "LV: Found an induction variable.\n"); -} - -bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *Header = TheLoop->getHeader(); - - // Look for the attribute signaling the absence of NaNs. - Function &F = *Header->getParent(); - HasFunNoNaNAttr = - F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; - - // For each block in the loop. 
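// Illustrative example (not part of this patch) for addInductionPhi above:
// both I and J are integer inductions. I starts at zero with unit step, so it
// is the canonical (primary) candidate, and because its type matches the
// widest induction type it is kept rather than recreated by
// InnerLoopVectorizer.
void scaleByIndex(long *A, long N) {
  long J = 0;
  for (long I = 0; I < N; ++I) { // primary, canonical induction
    A[I] += J;                   // J is a secondary induction (step 2)
    J += 2;
  }
}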
- for (BasicBlock *BB : TheLoop->blocks()) { - // Scan the instructions in the block and look for hazards. - for (Instruction &I : *BB) { - if (auto *Phi = dyn_cast<PHINode>(&I)) { - Type *PhiTy = Phi->getType(); - // Check that this PHI type is allowed. - if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && - !PhiTy->isPointerTy()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) - << "loop control flow is not understood by vectorizer"); - DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); - return false; - } - - // If this PHINode is not in the header block, then we know that we - // can convert it to select during if-conversion. No need to check if - // the PHIs in this block are induction or reduction variables. - if (BB != Header) { - // Check that this instruction has no outside users or is an - // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit)) - continue; - ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi) - << "value could not be identified as " - "an induction or reduction variable"); - return false; - } - - // We only allow if-converted PHIs with exactly two incoming values. - if (Phi->getNumIncomingValues() != 2) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) - << "control flow not understood by vectorizer"); - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - - RecurrenceDescriptor RedDes; - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) { - if (RedDes.hasUnsafeAlgebra()) - Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); - AllowedExit.insert(RedDes.getLoopExitInstr()); - Reductions[Phi] = RedDes; - continue; - } - - InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { - addInductionPhi(Phi, ID, AllowedExit); - if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) - Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); - continue; - } - - if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, - SinkAfter, DT)) { - FirstOrderRecurrences.insert(Phi); - continue; - } - - // As a last resort, coerce the PHI to a AddRec expression - // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { - addInductionPhi(Phi, ID, AllowedExit); - continue; - } - - ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi) - << "value that could not be identified as " - "reduction is used outside the loop"); - DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n"); - return false; - } // end of PHI handling - - // We handle calls that: - // * Are debug info intrinsics. - // * Have a mapping to an IR intrinsic. - // * Have a vector version available. - auto *CI = dyn_cast<CallInst>(&I); - if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && - !isa<DbgInfoIntrinsic>(CI) && - !(CI->getCalledFunction() && TLI && - TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) - << "call instruction cannot be vectorized"); - DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); - return false; - } - - // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the - // second argument is the same (i.e. 
loop invariant) - if (CI && hasVectorInstrinsicScalarOpd( - getVectorIntrinsicIDForCall(CI, TLI), 1)) { - auto *SE = PSE.getSE(); - if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { - ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI) - << "intrinsic instruction cannot be vectorized"); - DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); - return false; - } - } - - // Check that the instruction return type is vectorizable. - // Also, we can't vectorize extractelement instructions. - if ((!VectorType::isValidElementType(I.getType()) && - !I.getType()->isVoidTy()) || - isa<ExtractElementInst>(I)) { - ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I) - << "instruction return type cannot be vectorized"); - DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); - return false; - } - - // Check that the stored type is vectorizable. - if (auto *ST = dyn_cast<StoreInst>(&I)) { - Type *T = ST->getValueOperand()->getType(); - if (!VectorType::isValidElementType(T)) { - ORE->emit(createMissedAnalysis("CantVectorizeStore", ST) - << "store instruction cannot be vectorized"); - return false; - } - - // FP instructions can allow unsafe algebra, thus vectorizable by - // non-IEEE-754 compliant SIMD units. - // This applies to floating-point math operations and calls, not memory - // operations, shuffles, or casts, as they don't change precision or - // semantics. - } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && - !I.isFast()) { - DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); - Hints->setPotentiallyUnsafe(); - } - - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { - ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I) - << "value cannot be used outside the loop"); - return false; - } - } // next instr. - } - - if (!PrimaryInduction) { - DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); - if (Inductions.empty()) { - ORE->emit(createMissedAnalysis("NoInductionVariable") - << "loop induction variable could not be identified"); - return false; - } - } - - // Now we know the widest induction type, check if our found induction - // is the same size. If it's not, unset it here and InnerLoopVectorizer - // will create another. 
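// Illustrative examples (not part of this patch) for the call-handling rules
// above: the sqrt call is typically recognized through its intrinsic or a
// vector-library mapping and can be widened, while powi (Clang's builtin for
// llvm.powi) carries a mandatory scalar exponent operand and is only accepted
// while that operand stays loop-invariant.
#include <cmath>
void applyMath(float *A, double *B, int N, int Exp) {
  for (int I = 0; I < N; ++I)
    A[I] = std::sqrt(A[I]);           // has an intrinsic / veclib mapping
  for (int I = 0; I < N; ++I)
    B[I] = __builtin_powi(B[I], Exp); // Exp must be loop-invariant
}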
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) - PrimaryInduction = nullptr; - - return true; + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { @@ -5461,7 +4188,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { if (auto *Store = dyn_cast<StoreInst>(MemAccess)) if (Ptr == Store->getValueOperand()) return WideningDecision == CM_Scalarize; - assert(Ptr == getPointerOperand(MemAccess) && + assert(Ptr == getLoadStorePointerOperand(MemAccess) && "Ptr is neither a value or pointer operand"); return WideningDecision != CM_GatherScatter; }; @@ -5527,7 +4254,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { } for (auto *I : ScalarPtrs) if (!PossibleNonScalarPtrs.count(I)) { - DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); Worklist.insert(I); } @@ -5544,8 +4271,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { continue; Worklist.insert(Ind); Worklist.insert(IndUpdate); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate + << "\n"); } // Insert the forced scalars. @@ -5572,7 +4300,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { isScalarUse(J, Src)); })) { Worklist.insert(Src); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); } } @@ -5612,21 +4340,30 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // The induction variable and its update instruction will remain scalar. Worklist.insert(Ind); Worklist.insert(IndUpdate); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); - DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate + << "\n"); } Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { - if (!blockNeedsPredication(I->getParent())) +bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) { + if (!Legal->blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { default: break; - case Instruction::Store: - return !isMaskRequired(I); + case Instruction::Load: + case Instruction::Store: { + if (!Legal->isMaskRequired(I)) + return false; + auto *Ptr = getLoadStorePointerOperand(I); + auto *Ty = getMemInstValueType(I); + return isa<LoadInst>(I) ? + !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) + : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); + } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: @@ -5636,17 +4373,17 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) { return false; } -bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I, - unsigned VF) { +bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, + unsigned VF) { // Get and ensure we have a valid memory instruction. 
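// Illustrative example (not part of this patch) for isScalarWithPredication
// above: the load and store only execute when Flag[I] is set, so with this
// change they stay widened only if the target has legal masked (or
// gather/scatter) forms for their types; otherwise they are scalarized and
// executed under per-lane predicates.
void addIfFlagged(const int *A, const unsigned char *Flag, int *Out, int N) {
  for (int I = 0; I < N; ++I)
    if (Flag[I])
      Out[I] += A[I]; // predicated load and store
}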
LoadInst *LI = dyn_cast<LoadInst>(I); StoreInst *SI = dyn_cast<StoreInst>(I); assert((LI || SI) && "Invalid memory instruction"); - auto *Ptr = getPointerOperand(I); + auto *Ptr = getLoadStorePointerOperand(I); // In order to be widened, the pointer should be consecutive, first of all. - if (!isConsecutivePtr(Ptr)) + if (!Legal->isConsecutivePtr(Ptr)) return false; // If the instruction is a store located in a predicated block, it will be @@ -5697,7 +4434,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { Worklist.insert(Cmp); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); } // Holds consecutive and consecutive-like pointers. Consecutive-like pointers @@ -5729,7 +4466,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { // If there's no pointer operand, there's nothing to do. - auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I)); + auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); if (!Ptr) continue; @@ -5737,7 +4474,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // pointer operand. auto UsersAreMemAccesses = llvm::all_of(Ptr->users(), [&](User *U) -> bool { - return getPointerOperand(U) == Ptr; + return getLoadStorePointerOperand(U) == Ptr; }); // Ensure the memory instruction will not be scalarized or used by @@ -5758,7 +4495,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) if (!PossibleNonUniformPtrs.count(V)) { - DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); Worklist.insert(V); } @@ -5777,10 +4514,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); return !TheLoop->contains(J) || Worklist.count(J) || - (OI == getPointerOperand(J) && isUniformDecision(J, VF)); + (OI == getLoadStorePointerOperand(J) && + isUniformDecision(J, VF)); })) { Worklist.insert(OI); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); } } } @@ -5788,7 +4526,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Returns true if Ptr is the pointer operand of a memory access instruction // I, and I is known to not require scalarization. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - return getPointerOperand(I) == Ptr && isUniformDecision(I, VF); + return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); }; // For an instruction to be added into Worklist above, all its users inside @@ -5825,123 +4563,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // The induction variable and its update instruction will remain uniform. 
Worklist.insert(Ind); Worklist.insert(IndUpdate); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate + << "\n"); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationLegality::canVectorizeMemory() { - LAI = &(*GetLAA)(*TheLoop); - InterleaveInfo.setLAI(LAI); - const OptimizationRemarkAnalysis *LAR = LAI->getReport(); - if (LAR) { - ORE->emit([&]() { - return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(), - "loop not vectorized: ", *LAR); - }); - } - if (!LAI->canVectorizeMemory()) - return false; - - if (LAI->hasStoreToLoopInvariantAddress()) { - ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") - << "write to a loop invariant address could not be vectorized"); - DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); - return false; - } - - Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); - PSE.addPredicate(LAI->getPSE().getUnionPredicate()); - - return true; -} - -bool LoopVectorizationLegality::isInductionPhi(const Value *V) { - Value *In0 = const_cast<Value *>(V); - PHINode *PN = dyn_cast_or_null<PHINode>(In0); - if (!PN) - return false; - - return Inductions.count(PN); -} - -bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { - auto *Inst = dyn_cast<Instruction>(V); - return (Inst && InductionCastsToIgnore.count(Inst)); -} - -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { - return isInductionPhi(V) || isCastedInductionVariable(V); -} - -bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { - return FirstOrderRecurrences.count(Phi); -} - -bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { - return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); -} - -bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - - for (Instruction &I : *BB) { - // Check that we don't have a constant expression that can trap as operand. - for (Value *Operand : I.operands()) { - if (auto *C = dyn_cast<Constant>(Operand)) - if (C->canTrap()) - return false; - } - // We might be able to hoist the load. - if (I.mayReadFromMemory()) { - auto *LI = dyn_cast<LoadInst>(&I); - if (!LI) - return false; - if (!SafePtrs.count(LI->getPointerOperand())) { - if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) || - isLegalMaskedGather(LI->getType())) { - MaskedOp.insert(LI); - continue; - } - // !llvm.mem.parallel_loop_access implies if-conversion safety. - if (IsAnnotatedParallel) - continue; - return false; - } - } - - if (I.mayWriteToMemory()) { - auto *SI = dyn_cast<StoreInst>(&I); - // We only support predication of stores in basic blocks with one - // predecessor. - if (!SI) - return false; - - // Build a masked store if it is legal for the target. 
- if (isLegalMaskedStore(SI->getValueOperand()->getType(), - SI->getPointerOperand()) || - isLegalMaskedScatter(SI->getValueOperand()->getType())) { - MaskedOp.insert(SI); - continue; - } - - bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); - bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); - - if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || - !isSinglePredecessor) - return false; - } - if (I.mayThrow()) - return false; - } - - return true; -} - void InterleavedAccessInfo::collectConstStrideAccesses( MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo, const ValueToValueMap &Strides) { @@ -5962,7 +4591,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses( if (!LI && !SI) continue; - Value *Ptr = getPointerOperand(&I); + Value *Ptr = getLoadStorePointerOperand(&I); // We don't check wrapping here because we don't know yet if Ptr will be // part of a full group or a group with gaps. Checking wrapping for all // pointers (even those that end up in groups with no gaps) will be overly @@ -6022,9 +4651,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // this group because it and (2) are dependent. However, (1) can be grouped // with other accesses that may precede it in program order. Note that a // bottom-up order does not imply that WAW dependences should not be checked. -void InterleavedAccessInfo::analyzeInterleaving( - const ValueToValueMap &Strides) { - DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); +void InterleavedAccessInfo::analyzeInterleaving() { + LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); + const ValueToValueMap &Strides = LAI->getSymbolicStrides(); // Holds all accesses with a constant stride. MapVector<Instruction *, StrideDescriptor> AccessStrideInfo; @@ -6065,7 +4694,8 @@ void InterleavedAccessInfo::analyzeInterleaving( if (isStrided(DesB.Stride)) { Group = getInterleaveGroup(B); if (!Group) { - DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n'); + LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B + << '\n'); Group = createInterleaveGroup(B, DesB.Stride, DesB.Align); } if (B->mayWriteToMemory()) @@ -6124,7 +4754,12 @@ void InterleavedAccessInfo::analyzeInterleaving( // Ignore A if it's already in a group or isn't the same kind of memory // operation as B. - if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory()) + // Note that mayReadFromMemory() isn't mutually exclusive to mayWriteToMemory + // in the case of atomic loads. We shouldn't see those here, canVectorizeMemory() + // should have returned false - except for the case we asked for optimization + // remarks. + if (isInterleaved(A) || (A->mayReadFromMemory() != B->mayReadFromMemory()) + || (A->mayWriteToMemory() != B->mayWriteToMemory())) continue; // Check rules 1 and 2. Ignore A if its stride or size is different from @@ -6163,8 +4798,9 @@ void InterleavedAccessInfo::analyzeInterleaving( // Try to insert A into B's group. if (Group->insertMember(A, IndexA, DesA.Align)) { - DEBUG(dbgs() << "LV: Inserted:" << *A << '\n' - << " into the interleave group with" << *B << '\n'); + LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n' + << " into the interleave group with" << *B + << '\n'); InterleaveGroupMap[A] = Group; // Set the first load in program order as the insert position. @@ -6177,8 +4813,9 @@ void InterleavedAccessInfo::analyzeInterleaving( // Remove interleaved store groups with gaps. 
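// Illustrative example (not part of this patch) for the group-invalidation
// logic just below: a factor-3 interleaved load group whose last member is
// missing. The wide load may touch elements past the final group, so the
// analysis must either prove the pointers cannot wrap or fall back to a scalar
// epilogue iteration; a store group with such a gap is simply invalidated.
void sumTwoChannels(const int *Pixels, int *Out, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = Pixels[3 * I] + Pixels[3 * I + 1]; // members 0 and 1; member 2 missing
}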
for (InterleaveGroup *Group : StoreGroups) if (Group->getNumMembers() != Group->getFactor()) { - DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due " - "to gaps.\n"); + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved store group due " + "to gaps.\n"); releaseGroup(Group); } // Remove interleaved groups with gaps (currently only loads) whose memory @@ -6207,21 +4844,23 @@ void InterleavedAccessInfo::analyzeInterleaving( // So we check only group member 0 (which is always guaranteed to exist), // and group member Factor - 1; If the latter doesn't exist we rely on // peeling (if it is a non-reveresed accsess -- see Case 3). - Value *FirstMemberPtr = getPointerOperand(Group->getMember(0)); + Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0)); if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, /*ShouldCheckWrap=*/true)) { - DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " - "first group member potentially pointer-wrapping.\n"); + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved group due to " + "first group member potentially pointer-wrapping.\n"); releaseGroup(Group); continue; } Instruction *LastMember = Group->getMember(Group->getFactor() - 1); if (LastMember) { - Value *LastMemberPtr = getPointerOperand(LastMember); + Value *LastMemberPtr = getLoadStorePointerOperand(LastMember); if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, /*ShouldCheckWrap=*/true)) { - DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " - "last group member potentially pointer-wrapping.\n"); + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved group due to " + "last group member potentially pointer-wrapping.\n"); releaseGroup(Group); } } else { @@ -6231,29 +4870,25 @@ void InterleavedAccessInfo::analyzeInterleaving( // to look for a member at index factor - 1, since every group must have // a member at index zero. if (Group->isReverse()) { - DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " - "a reverse access with gaps.\n"); + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved group due to " + "a reverse access with gaps.\n"); releaseGroup(Group); continue; } - DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n"); + LLVM_DEBUG( + dbgs() << "LV: Interleaved group requires epilogue iteration.\n"); RequiresScalarEpilogue = true; } } } Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { - if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); - return None; - } - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { // TODO: It may by useful to do since it's still likely to be dynamically // uniform if the target can skip. - DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target"); + LLVM_DEBUG( + dbgs() << "LV: Not inserting runtime ptr check for divergent target"); ORE->emit( createMissedAnalysis("CantVersionLoopWithDivergentTarget") @@ -6271,20 +4906,22 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " "compiling with -Os/-Oz"); - DEBUG(dbgs() - << "LV: Aborting. 
Runtime ptr check is required with -Os/-Oz.\n"); + LLVM_DEBUG( + dbgs() + << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); return None; } // If we optimize the program for size, avoid creating the tail loop. - DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); // If we don't know the precise trip count, don't try to vectorize. if (TC < 2) { ORE->emit( createMissedAnalysis("UnknownLoopCountComplexCFG") << "unable to calculate the loop count due to complex control flow"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + LLVM_DEBUG( + dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return None; } @@ -6302,7 +4939,8 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { "same time. Enable vectorization of this loop " "with '#pragma clang loop vectorize(enable)' " "when compiling with -Os/-Oz"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); + LLVM_DEBUG( + dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return None; } @@ -6327,29 +4965,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " - << WidestType << " bits.\n"); - DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister - << " bits.\n"); + LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType + << " / " << WidestType << " bits.\n"); + LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " + << WidestRegister << " bits.\n"); - assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" - " into one vector!"); + assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" + " into one vector!"); if (MaxVectorSize == 0) { - DEBUG(dbgs() << "LV: The target has no vector registers.\n"); + LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); MaxVectorSize = 1; return MaxVectorSize; } else if (ConstTripCount && ConstTripCount < MaxVectorSize && isPowerOf2_32(ConstTripCount)) { // We need to clamp the VF to be the ConstTripCount. There is no point in // choosing a higher viable VF as done in the loop below. - DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " - << ConstTripCount << "\n"); + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " + << ConstTripCount << "\n"); MaxVectorSize = ConstTripCount; return MaxVectorSize; } unsigned MaxVF = MaxVectorSize; - if (MaximizeBandwidth && !OptForSize) { + if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || + (MaximizeBandwidth && !OptForSize)) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). 
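// Worked example (illustrative numbers, not from the patch) of the MaxVF
// computation above: a 256-bit widest register and a 32-bit widest scalar type
// allow at most 8 lanes, and a constant trip count of 4 (a power of two below
// that limit) clamps the result down to 4.
constexpr unsigned WidestRegister = 256, WidestType = 32;
constexpr unsigned MaxVectorSize = WidestRegister / WidestType; // 8 lanes
constexpr unsigned ConstTripCount = 4;
constexpr unsigned MaxVF =
    (ConstTripCount && ConstTripCount < MaxVectorSize) ? ConstTripCount
                                                       : MaxVectorSize;
static_assert(MaxVF == 4, "clamped to the constant trip count");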
SmallVector<unsigned, 8> VFs; @@ -6369,24 +5008,30 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, break; } } + if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { + if (MaxVF < MinVF) { + LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF + << ") with target's minimum: " << MinVF << '\n'); + MaxVF = MinVF; + } + } } return MaxVF; } -LoopVectorizationCostModel::VectorizationFactor +VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { float Cost = expectedCost(1).first; -#ifndef NDEBUG const float ScalarCost = Cost; -#endif /* NDEBUG */ unsigned Width = 1; - DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - // Ignore scalar width, because the user explicitly wants vectorization. if (ForceVectorization && MaxVF > 1) { - Width = 2; - Cost = expectedCost(Width).first / (float)Width; + // Ignore scalar width, because the user explicitly wants vectorization. + // Initialize cost to max so that VF = 2 is, at least, chosen during cost + // evaluation. + Cost = std::numeric_limits<float>::max(); } for (unsigned i = 2; i <= MaxVF; i *= 2) { @@ -6395,10 +5040,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { // the vector elements. VectorizationCostTy C = expectedCost(i); float VectorCost = C.first / (float)i; - DEBUG(dbgs() << "LV: Vector loop of width " << i - << " costs: " << (int)VectorCost << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i + << " costs: " << (int)VectorCost << ".\n"); if (!C.second && !ForceVectorization) { - DEBUG( + LLVM_DEBUG( dbgs() << "LV: Not considering vector loop of width " << i << " because it will not generate any vector instructions.\n"); continue; @@ -6409,10 +5054,19 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { } } - DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); - DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); + if (!EnableCondStoresVectorization && NumPredStores) { + ORE->emit(createMissedAnalysis("ConditionalStore") + << "store that is conditionally executed prevents vectorization"); + LLVM_DEBUG( + dbgs() << "LV: No vectorization. There are conditional stores.\n"); + Width = 1; + Cost = ScalarCost; + } + + LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() + << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"); + LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; return Factor; } @@ -6460,7 +5114,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { // optimization to non-pointer types. 
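The VF selection loop above compares costs per lane. A self-contained sketch with made-up cost numbers (the VectorLoopCost table is not real data) shows how the cheapest per-lane cost picks the width, and how forcing vectorization seeds the comparison with a maximal cost so some VF greater than one is chosen.

#include <cstdio>
#include <limits>

int main() {
  float Cost = 10.0f;          // scalar loop cost (VF = 1), example value
  unsigned Width = 1;
  const unsigned MaxVF = 8;
  // Illustrative total vector-loop costs indexed by VF (only powers of two used).
  const float VectorLoopCost[] = {0, 0, 12.0f, 0, 16.0f, 0, 0, 0, 36.0f};
  bool ForceVectorization = false;
  if (ForceVectorization && MaxVF > 1)
    Cost = std::numeric_limits<float>::max(); // guarantees some VF > 1 wins

  for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
    float PerLane = VectorLoopCost[VF] / (float)VF; // 6.0, 4.0, 4.5 here
    if (PerLane < Cost) {
      Cost = PerLane;
      Width = VF;
    }
  }
  std::printf("Selecting VF: %u\n", Width); // VF = 4 wins in this example
}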
// if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && - !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I)) + !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) continue; MinWidth = std::min(MinWidth, @@ -6504,8 +5158,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers\n"); if (VF == 1) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) @@ -6519,7 +5173,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); - R.NumInstructions = std::max(R.NumInstructions, 1U); // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -6564,7 +5217,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. if (VF > 1 && !Legal->getReductionVars()->empty()) { - DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); + LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -6575,7 +5228,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. - DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); + LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the @@ -6603,11 +5256,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, if (EnableLoadStoreRuntimeInterleave && std::max(StoresIC, LoadsIC) > SmallIC) { - DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n"); + LLVM_DEBUG( + dbgs() << "LV: Interleaving to saturate store or load ports.\n"); return std::max(StoresIC, LoadsIC); } - DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); return SmallIC; } @@ -6615,11 +5269,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // this point) that could benefit from interleaving. bool HasReductions = !Legal->getReductionVars()->empty(); if (TTI.enableAggressiveInterleaving(HasReductions)) { - DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); + LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); return IC; } - DEBUG(dbgs() << "LV: Not Interleaving.\n"); + LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); return 1; } @@ -6646,7 +5300,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { DFS.perform(LI); RegisterUsage RU; - RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the @@ -6658,14 +5311,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // Marks the end of each interval. 
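The interleave-count heuristic above reasons about register pressure. A rough sketch, with invented register counts and a hypothetical helper name, of the "subtract loop invariants, divide by per-iteration register users, round down to a power of two" calculation it describes:

#include <cstdio>

static unsigned interleaveFromRegPressure(unsigned TargetNumRegisters,
                                          unsigned LoopInvariantRegs,
                                          unsigned MaxLocalUsers) {
  if (MaxLocalUsers == 0)
    MaxLocalUsers = 1; // assume at least one register is used per iteration
  // Invariants are shared by all interleaved copies, so subtract them once.
  unsigned IC = (TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers;
  // Round down to a power of two: only whole copies can be interleaved.
  while (IC & (IC - 1))
    IC &= IC - 1;
  return IC ? IC : 1;
}

int main() {
  // e.g. 16 vector registers, 2 invariant values, 3 live values per iteration
  std::printf("IC = %u\n", interleaveFromRegPressure(16, 2, 3)); // IC = 4
}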
IntervalMap EndPoint; // Saves the list of instruction indices that are used in the loop. - SmallSet<Instruction *, 8> Ends; + SmallPtrSet<Instruction *, 8> Ends; // Saves the list of values that are used in the loop but are // defined outside the loop, such as arguments and constants. SmallPtrSet<Value *, 8> LoopInvariants; unsigned Index = 0; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - RU.NumInstructions += BB->size(); for (Instruction &I : *BB) { IdxToInstr[Index++] = &I; @@ -6698,7 +5350,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { for (auto &Interval : EndPoint) TransposeEnds[Interval.second].push_back(Interval.first); - SmallSet<Instruction *, 8> OpenIntervals; + SmallPtrSet<Instruction *, 8> OpenIntervals; // Get the size of the widest register. unsigned MaxSafeDepDist = -1U; @@ -6711,7 +5363,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { SmallVector<RegisterUsage, 8> RUs(VFs.size()); SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); - DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); // A lambda that gets the register usage for the given type and VF. auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { @@ -6756,8 +5408,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " - << OpenIntervals.size() << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); @@ -6772,10 +5424,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { Invariant += GetRegUsage(Inst->getType(), VFs[i]); } - DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant + << '\n'); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -6785,6 +5437,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { return RUs; } +bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ + // TODO: Cost model for emulated masked load/store is completely + // broken. This hack guides the cost model to use an artificially + // high enough value to practically disable vectorization with such + // operations, except where previously deployed legality hack allowed + // using very low cost values. This is to avoid regressions coming simply + // from moving "masked load/store" check from legality to cost model. + // Masked Load/Gather emulation was previously never allowed. + // Limited number of Masked Store/Scatter emulation was allowed. 
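calculateRegisterUsage above is essentially a maximum-overlap count over live intervals of instruction indices. A tiny self-contained illustration of that counting principle, with made-up intervals (this is not the pass's data structure, only the idea behind it):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (definition index, last-use index) pairs for a handful of values.
  std::vector<std::pair<unsigned, unsigned>> Intervals = {
      {0, 3}, {1, 5}, {2, 2}, {4, 6}, {5, 6}};
  unsigned MaxOpen = 0;
  for (unsigned Idx = 0; Idx <= 6; ++Idx) {
    unsigned Open = 0;
    for (auto &IV : Intervals)
      if (IV.first <= Idx && Idx <= IV.second)
        ++Open; // interval is still "open" at this index
    MaxOpen = std::max(MaxOpen, Open);
  }
  std::printf("max simultaneously live values: %u\n", MaxOpen); // prints 3
}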
+ assert(isScalarWithPredication(I) && + "Expecting a scalar emulated instruction"); + return isa<LoadInst>(I) || + (isa<StoreInst>(I) && + NumPredStores > NumberOfStoresToPredicate); +} + void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already @@ -6805,11 +5473,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { if (!Legal->blockNeedsPredication(BB)) continue; for (Instruction &I : *BB) - if (Legal->isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I)) { ScalarCostsTy ScalarCosts; - if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0) + // Do not apply discount logic if hacked cost is needed + // for emulated masked memrefs. + if (!useEmulatedMaskMemRefHack(&I) && + computePredInstDiscount(&I, ScalarCosts, VF) >= 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); - // Remember that BB will remain after vectorization. PredicatedBBsAfterVectorization.insert(BB); } @@ -6844,7 +5514,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. - if (Legal->isScalarWithPredication(I)) + if (isScalarWithPredication(I)) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6898,7 +5568,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), true, false); ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); @@ -6940,11 +5610,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy BlockCost; // For each instruction in the old loop. - for (Instruction &I : *BB) { - // Skip dbg intrinsics. - if (isa<DbgInfoIntrinsic>(I)) - continue; - + for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I))) @@ -6958,8 +5624,9 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { BlockCost.first += C.first; BlockCost.second |= C.second; - DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF " - << VF << " For instruction: " << I << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first + << " for VF " << VF << " For instruction: " << I + << '\n'); } // If we are vectorizing a predicated block, it will have been @@ -6978,7 +5645,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } -/// \brief Gets Address Access SCEV after verifying that the access pattern +/// Gets Address Access SCEV after verifying that the access pattern /// is loop invariant except the induction variable dependence. 
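For context on isScalarWithPredication and the emulated-masked-memref hack above, a hypothetical source loop whose store executes only under a condition; on a target without native masked stores, each vector lane needs its own guarded scalar store, which is the case being penalized here.

// Illustrative only: a conditionally executed ("predicated") store.
void conditionalStore(int *A, const int *B, int N) {
  for (int i = 0; i < N; ++i)
    if (B[i] > 0)
      A[i] = B[i]; // scalar-with-predication when vectorized without masked stores
}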
/// /// This SCEV can be sent to the Target in order to estimate the address @@ -7020,7 +5687,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, unsigned Alignment = getMemInstAlignment(I); unsigned AS = getMemInstAddressSpace(I); - Value *Ptr = getPointerOperand(I); + Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); // Figure out whether the access is strided and get the stride value @@ -7041,9 +5708,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // If we have a predicated store, it may not be executed for each vector // lane. Scale the cost by the probability of executing the predicated // block. - if (Legal->isScalarWithPredication(I)) + if (isScalarWithPredication(I)) { Cost /= getReciprocalPredBlockProb(); + if (useEmulatedMaskMemRefHack(I)) + // Artificially setting to a high enough value to practically disable + // vectorization with such operations. + Cost = 3000000; + } + return Cost; } @@ -7052,7 +5725,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); unsigned Alignment = getMemInstAlignment(I); - Value *Ptr = getPointerOperand(I); + Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getMemInstAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -7088,7 +5761,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); unsigned Alignment = getMemInstAlignment(I); - Value *Ptr = getPointerOperand(I); + Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, @@ -7101,7 +5774,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Type *VectorTy = ToVectorTy(ValTy, VF); unsigned AS = getMemInstAddressSpace(I); - auto Group = Legal->getInterleavedAccessGroup(I); + auto Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); @@ -7168,13 +5841,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { if (VF == 1) return; + NumPredStores = 0; for (BasicBlock *BB : TheLoop->blocks()) { // For each instruction in the old loop. for (Instruction &I : *BB) { - Value *Ptr = getPointerOperand(&I); + Value *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) continue; + if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + NumPredStores++; if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) { // Scalar load + broadcast unsigned Cost = getUniformMemOpCost(&I, VF); @@ -7183,9 +5859,10 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } // We assume that widening is the best solution when possible. 
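A small sketch of the predicated-memory cost adjustment above, assuming the reciprocal predicated-block probability of 2 that the surrounding code divides by, and the fixed 3000000 override; the helper name and the input cost are illustrative.

#include <cstdio>

static unsigned predicatedMemCost(unsigned ScalarizationCost,
                                  bool NeedsEmulatedMaskHack) {
  const unsigned ReciprocalPredBlockProb = 2; // block runs ~half the iterations
  unsigned Cost = ScalarizationCost / ReciprocalPredBlockProb;
  if (NeedsEmulatedMaskHack)
    Cost = 3000000; // artificially high: practically disables vectorization
  return Cost;
}

int main() {
  std::printf("%u\n", predicatedMemCost(40, false)); // 20
  std::printf("%u\n", predicatedMemCost(40, true));  // 3000000
}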
- if (Legal->memoryInstructionCanBeWidened(&I, VF)) { + if (memoryInstructionCanBeWidened(&I, VF)) { unsigned Cost = getConsecutiveMemOpCost(&I, VF); - int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I)); + int ConsecutiveStride = + Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Expected consecutive stride."); InstWidening Decision = @@ -7197,8 +5874,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // Choose between Interleaving, Gather/Scatter or Scalarization. unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); unsigned NumAccesses = 1; - if (Legal->isAccessInterleaved(&I)) { - auto Group = Legal->getInterleavedAccessGroup(&I); + if (isAccessInterleaved(&I)) { + auto Group = getInterleavedAccessGroup(&I); assert(Group && "Fail to get an interleaved access group."); // Make one decision for the whole group. @@ -7210,7 +5887,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { } unsigned GatherScatterCost = - Legal->isLegalGatherOrScatter(&I) + isLegalGatherOrScatter(&I) ? getGatherScatterCost(&I, VF) * NumAccesses : std::numeric_limits<unsigned>::max(); @@ -7235,7 +5912,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // If the instructions belongs to an interleave group, the whole group // receives the same decision. The whole group receives the cost, but // the cost will actually be assigned to one instruction. - if (auto Group = Legal->getInterleavedAccessGroup(&I)) + if (auto Group = getInterleavedAccessGroup(&I)) setWideningDecision(Group, VF, Decision, Cost); else setWideningDecision(&I, VF, Decision, Cost); @@ -7255,7 +5932,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { for (BasicBlock *BB : TheLoop->blocks()) for (Instruction &I : *BB) { Instruction *PtrDef = - dyn_cast_or_null<Instruction>(getPointerOperand(&I)); + dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); if (PtrDef && TheLoop->contains(PtrDef) && getWideningDecision(&I, VF) != CM_GatherScatter) AddrDefs.insert(PtrDef); @@ -7285,7 +5962,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // Scalarize a widened load of address. setWideningDecision(I, VF, CM_Scalarize, (VF * getMemoryInstructionCost(I, 1))); - else if (auto Group = Legal->getInterleavedAccessGroup(I)) { + else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) @@ -7371,7 +6048,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. - if (VF > 1 && Legal->isScalarWithPredication(I)) { + if (VF > 1 && isScalarWithPredication(I)) { unsigned Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -7569,7 +6246,7 @@ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { // Check if the pointer operand of a load or store instruction is // consecutive. 
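The widening decision above boils down to taking the minimum of three candidate costs and remembering which strategy produced it. A trivial standalone sketch with example numbers (the enum and the cost values are illustrative only):

#include <algorithm>
#include <cstdio>

enum class Decision { Interleave, GatherScatter, Scalarize };

int main() {
  unsigned InterleaveCost = 12, GatherScatterCost = 20, ScalarizationCost = 16;
  unsigned Cost =
      std::min({InterleaveCost, GatherScatterCost, ScalarizationCost});
  // Ties prefer interleaving, then gather/scatter, mirroring the order above.
  Decision D = Cost == InterleaveCost      ? Decision::Interleave
               : Cost == GatherScatterCost ? Decision::GatherScatter
                                           : Decision::Scalarize;
  std::printf("chosen cost: %u, decision: %d\n", Cost, (int)D); // 12, 0
}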
- if (auto *Ptr = getPointerOperand(Inst)) + if (auto *Ptr = getLoadStorePointerOperand(Inst)) return Legal->isConsecutivePtr(Ptr); return false; } @@ -7594,23 +6271,59 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } } -LoopVectorizationCostModel::VectorizationFactor +VectorizationFactor +LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, + unsigned UserVF) { + // Width 1 means no vectorization, cost 0 means uncomputed cost. + const VectorizationFactor NoVectorization = {1U, 0U}; + + // Outer loop handling: They may require CFG and instruction level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline. + if (!OrigLoop->empty()) { + // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing. + // This won't be necessary when UserVF is not required in the VPlan-native + // path. + if (VPlanBuildStressTest && !UserVF) + UserVF = 4; + + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + assert(UserVF && "Expected UserVF for outer loop vectorization."); + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + buildVPlans(UserVF, UserVF); + + // For VPlan build stress testing, we bail out after VPlan construction. + if (VPlanBuildStressTest) + return NoVectorization; + + return {UserVF, 0}; + } + + LLVM_DEBUG( + dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " + "VPlan-native path.\n"); + return NoVectorization; +} + +VectorizationFactor LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { - // Width 1 means no vectorize, cost 0 means uncomputed cost. - const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U, - 0U}; + assert(OrigLoop->empty() && "Inner loop expected."); + // Width 1 means no vectorization, cost 0 means uncomputed cost. + const VectorizationFactor NoVectorization = {1U, 0U}; Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize. return NoVectorization; if (UserVF) { - DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); - buildVPlans(UserVF, UserVF); - DEBUG(printPlans(dbgs())); + buildVPlansWithVPRecipes(UserVF, UserVF); + LLVM_DEBUG(printPlans(dbgs())); return {UserVF, 0}; } @@ -7627,8 +6340,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { CM.collectInstsToScalarize(VF); } - buildVPlans(1, MaxVF); - DEBUG(printPlans(dbgs())); + buildVPlansWithVPRecipes(1, MaxVF); + LLVM_DEBUG(printPlans(dbgs())); if (MaxVF == 1) return NoVectorization; @@ -7637,7 +6350,8 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { } void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { - DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); + LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF + << '\n'); BestVF = VF; BestUF = UF; @@ -7787,30 +6501,15 @@ bool LoopVectorizationPlanner::getDecisionAndClampRange( /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). 
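As a concrete, hypothetical illustration of what planInVPlanNativePath targets: an outer loop that is the vectorization candidate while its inner loop must be modeled inside the plan. With this patch such loops are only planned, not code-generated, and only when -enable-vplan-native-path is set and an explicit width is provided (a pragma along the lines of "#pragma clang loop vectorize(enable) vectorize_width(4)" is one way such a width is normally requested).

void outerLoop(float **A, const float *B, int N, int M) {
  for (int i = 0; i < N; ++i) {   // outer loop: candidate for the VPlan-native path
    float Sum = 0.0f;
    for (int j = 0; j < M; ++j)   // inner loop is modeled inside the plan
      Sum += A[i][j];
    A[i][0] = Sum + B[i];
  }
}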
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { - - // Collect conditions feeding internal conditional branches; they need to be - // represented in VPlan for it to model masking. - SmallPtrSet<Value *, 1> NeedDef; - - auto *Latch = OrigLoop->getLoopLatch(); - for (BasicBlock *BB : OrigLoop->blocks()) { - if (BB == Latch) - continue; - BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); - if (Branch && Branch->isConditional()) - NeedDef.insert(Branch->getCondition()); - } - for (unsigned VF = MinVF; VF < MaxVF + 1;) { VFRange SubRange = {VF, MaxVF + 1}; - VPlans.push_back(buildVPlan(SubRange, NeedDef)); + VPlans.push_back(buildVPlan(SubRange)); VF = SubRange.End; } } -VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src, - BasicBlock *Dst, - VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, + VPlanPtr &Plan) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); // Look for cached value. @@ -7840,8 +6539,7 @@ VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src, return EdgeMaskCache[Edge] = EdgeMask; } -VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB, - VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); // Look for cached value. @@ -7874,10 +6572,9 @@ VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB, return BlockMaskCache[BB] = BlockMask; } -VPInterleaveRecipe * -LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { - const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I); +VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, + VFRange &Range) { + const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -7889,7 +6586,7 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, LoopVectorizationCostModel::CM_Interleave); }; }; - if (!getDecisionAndClampRange(isIGMember(I), Range)) + if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) return nullptr; // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) @@ -7902,8 +6599,8 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, } VPWidenMemoryInstructionRecipe * -LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) return nullptr; @@ -7922,7 +6619,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range, return Decision != LoopVectorizationCostModel::CM_Scalarize; }; - if (!getDecisionAndClampRange(willWiden, Range)) + if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return nullptr; VPValue *Mask = nullptr; @@ -7933,8 +6630,7 @@ LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range, } VPWidenIntOrFpInductionRecipe * -LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, - VFRange &Range) { +VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { if (PHINode *Phi = dyn_cast<PHINode>(I)) { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. 
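A conceptual sketch of the mask construction in createEdgeMask and createBlockInMask above: an edge mask is the source block's mask ANDed with the branch condition (negated for the false successor), and a block's mask is the OR of its incoming edge masks. Plain bool vectors stand in for VPValues here; none of these helpers exist in the patch.

#include <cstddef>
#include <cstdio>
#include <vector>

using Mask = std::vector<bool>;

static Mask andMask(const Mask &A, const Mask &B) {
  Mask R(A.size());
  for (std::size_t i = 0; i < A.size(); ++i) R[i] = A[i] && B[i];
  return R;
}
static Mask orMask(const Mask &A, const Mask &B) {
  Mask R(A.size());
  for (std::size_t i = 0; i < A.size(); ++i) R[i] = A[i] || B[i];
  return R;
}
static Mask notMask(const Mask &A) {
  Mask R(A.size());
  for (std::size_t i = 0; i < A.size(); ++i) R[i] = !A[i];
  return R;
}

int main() {
  Mask SrcBlockMask = {true, true, true, true};   // all lanes reach Src
  Mask Cond         = {true, false, true, false}; // per-lane branch condition
  Mask TrueEdge  = andMask(SrcBlockMask, Cond);          // taken edge
  Mask FalseEdge = andMask(SrcBlockMask, notMask(Cond)); // not-taken edge
  Mask DstBlockMask = orMask(TrueEdge, FalseEdge);        // join block's mask
  std::printf("lane 1 reaches Dst: %d\n", (int)DstBlockMask[1]); // prints 1
}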
@@ -7959,15 +6655,14 @@ LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; }; - if (isa<TruncInst>(I) && - getDecisionAndClampRange(isOptimizableIVTruncate(I), Range)) + if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( + isOptimizableIVTruncate(I), Range)) return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), cast<TruncInst>(I)); return nullptr; } -VPBlendRecipe * -LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) { +VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { PHINode *Phi = dyn_cast<PHINode>(I); if (!Phi || Phi->getParent() == OrigLoop->getHeader()) return nullptr; @@ -7991,9 +6686,9 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) { return new VPBlendRecipe(Phi, Masks); } -bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, - VFRange &Range) { - if (Legal->isScalarWithPredication(I)) +bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, + VFRange &Range) { + if (CM.isScalarWithPredication(I)) return false; auto IsVectorizableOpcode = [](unsigned Opcode) { @@ -8077,7 +6772,7 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, return true; }; - if (!getDecisionAndClampRange(willWiden, Range)) + if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; // Success: widen this instruction. We optimize the common case where @@ -8092,15 +6787,15 @@ bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB, return true; } -VPBasicBlock *LoopVectorizationPlanner::handleReplication( +VPBasicBlock *VPRecipeBuilder::handleReplication( Instruction *I, VFRange &Range, VPBasicBlock *VPBB, DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, VPlanPtr &Plan) { - bool IsUniform = getDecisionAndClampRange( + bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); - bool IsPredicated = Legal->isScalarWithPredication(I); + bool IsPredicated = CM.isScalarWithPredication(I); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); // Find if I uses a predicated instruction. If so, it will use its scalar @@ -8113,24 +6808,25 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication( // Finalize the recipe for Instr, first if it is not predicated. if (!IsPredicated) { - DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); + LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); VPBB->appendRecipe(Recipe); return VPBB; } - DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); + LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); assert(VPBB->getSuccessors().empty() && "VPBB has successors when handling predicated replication."); // Record predicated instructions for above packing optimizations. 
PredInst2Recipe[I] = Recipe; - VPBlockBase *Region = - VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan)); - return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock())); + VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); + VPBlockUtils::insertBlockAfter(Region, VPBB); + auto *RegSucc = new VPBasicBlock(); + VPBlockUtils::insertBlockAfter(RegSucc, Region); + return RegSucc; } -VPRegionBlock * -LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, - VPRecipeBase *PredRecipe, - VPlanPtr &Plan) { +VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, + VPRecipeBase *PredRecipe, + VPlanPtr &Plan) { // Instructions marked for predication are replicated and placed under an // if-then construct to prevent side-effects. @@ -8150,19 +6846,67 @@ LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. - Entry->setTwoSuccessors(Pred, Exit); - Pred->setOneSuccessor(Exit); + VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); + VPBlockUtils::connectBlocks(Pred, Exit); return Region; } -LoopVectorizationPlanner::VPlanPtr -LoopVectorizationPlanner::buildVPlan(VFRange &Range, - const SmallPtrSetImpl<Value *> &NeedDef) { - EdgeMaskCache.clear(); - BlockMaskCache.clear(); - DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; +bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, + VPlanPtr &Plan, VPBasicBlock *VPBB) { + VPRecipeBase *Recipe = nullptr; + // Check if Instr should belong to an interleave memory recipe, or already + // does. In the latter case Instr is irrelevant. + if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + VPBB->appendRecipe(Recipe); + return true; + } + + // Check if Instr is a memory operation that should be widened. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + VPBB->appendRecipe(Recipe); + return true; + } + + // Check if Instr should form some PHI recipe. + if ((Recipe = tryToOptimizeInduction(Instr, Range))) { + VPBB->appendRecipe(Recipe); + return true; + } + if ((Recipe = tryToBlend(Instr, Plan))) { + VPBB->appendRecipe(Recipe); + return true; + } + if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { + VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); + return true; + } + + // Check if Instr is to be widened by a general VPWidenRecipe, after + // having first checked for specific widening recipes that deal with + // Interleave Groups, Inductions and Phi nodes. + if (tryToWiden(Instr, VPBB, Range)) + return true; + + return false; +} + +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, + unsigned MaxVF) { + assert(OrigLoop->empty() && "Inner loop expected."); + + // Collect conditions feeding internal conditional branches; they need to be + // represented in VPlan for it to model masking. + SmallPtrSet<Value *, 1> NeedDef; + + auto *Latch = OrigLoop->getLoopLatch(); + for (BasicBlock *BB : OrigLoop->blocks()) { + if (BB == Latch) + continue; + BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); + if (Branch && Branch->isConditional()) + NeedDef.insert(Branch->getCondition()); + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. 
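For a single lane, the replicate region built by createReplicateRegion above corresponds to the if-then shape sketched below: an entry that tests the lane's mask bit, a predicated block holding the replicated scalar instruction, and an exit whose PHI merges the result. The function and the division example are hypothetical.

#include <cstdio>

static int laneBody(bool MaskBit, int A, int B) {
  int Result = 0;   // value reaching the exit block along the false edge
  if (MaskBit)      // entry block branching on this lane's mask bit
    Result = A / B; // replicated scalar instruction, guarded for side effects
  return Result;    // exit-block PHI of {0, A / B}
}

int main() {
  std::printf("%d %d\n", laneBody(true, 10, 2), laneBody(false, 10, 0)); // 5 0
}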
For @@ -8173,15 +6917,31 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, SmallPtrSet<Instruction *, 4> DeadInstructions; collectTriviallyDeadInstructions(DeadInstructions); + for (unsigned VF = MinVF; VF < MaxVF + 1;) { + VFRange SubRange = {VF, MaxVF + 1}; + VPlans.push_back( + buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); + VF = SubRange.End; + } +} + +LoopVectorizationPlanner::VPlanPtr +LoopVectorizationPlanner::buildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, + SmallPtrSetImpl<Instruction *> &DeadInstructions) { // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; + DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + DenseMap<Instruction *, Instruction *> SinkAfterInverse; + // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = llvm::make_unique<VPlan>(VPBB); + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -8196,7 +6956,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - VPBB->setOneSuccessor(FirstVPBBForBB); + VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); @@ -8204,18 +6964,17 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, // Organize the ingredients to vectorize from current basic block in the // right order. - for (Instruction &I : *BB) { + for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; // First filter out irrelevant instructions, to ensure no recipes are // built for them. - if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) || - DeadInstructions.count(Instr)) + if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) continue; // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct // member of the IG, do not construct any Recipe for it. - const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr); + const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr); if (IG && Instr != IG->getInsertPos() && Range.Start >= 2 && // Query is illegal for VF == 1 CM.getWideningDecision(Instr, Range.Start) == @@ -8230,8 +6989,9 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, // should follow. auto SAIt = SinkAfter.find(Instr); if (SAIt != SinkAfter.end()) { - DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); + LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" + << *SAIt->second + << " to vectorize a 1st order recurrence.\n"); SinkAfterInverse[SAIt->second] = Instr; continue; } @@ -8247,45 +7007,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, // Introduce each ingredient into VPlan. for (Instruction *Instr : Ingredients) { - VPRecipeBase *Recipe = nullptr; - - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. 
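The sink-after handling above exists for first-order recurrences. A hypothetical source loop with such a recurrence, where a value computed in the previous iteration is used before it is redefined, is what triggers the "Sinking ... to vectorize a 1st order recurrence" path.

void firstOrderRecurrence(int *Out, const int *In, int N) {
  int Last = 0;
  for (int i = 0; i < N; ++i) {
    Out[i] = In[i] + Last; // uses the previous iteration's In[i - 1]
    Last = In[i];          // recurring value redefined after its use
  }
}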
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) { - VPBB->appendRecipe(Recipe); - continue; - } - - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - continue; - } - - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - continue; - } - if ((Recipe = tryToBlend(Instr, Plan))) { - VPBB->appendRecipe(Recipe); - continue; - } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - continue; - } - - // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. - if (tryToWiden(Instr, VPBB, Range)) + if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; // Otherwise, if all widening options failed, Instruction is to be // replicated. This may create a successor for VPBB. - VPBasicBlock *NextVPBB = - handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan); + VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( + Instr, Range, VPBB, PredInst2Recipe, Plan); if (NextVPBB != VPBB) { VPBB = NextVPBB; VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) @@ -8300,7 +7028,7 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); assert(PreEntry->empty() && "Expecting empty pre-entry block."); VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); - PreEntry->disconnectSuccessor(Entry); + VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; std::string PlanName; @@ -8319,6 +7047,30 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range, return Plan; } +LoopVectorizationPlanner::VPlanPtr +LoopVectorizationPlanner::buildVPlan(VFRange &Range) { + // Outer loop handling: They may require CFG and instruction level + // transformations before even evaluating whether vectorization is profitable. + // Since we cannot modify the incoming IR, we need to build VPlan upfront in + // the vectorization pipeline. + assert(!OrigLoop->empty()); + assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); + + // Create new empty VPlan + auto Plan = llvm::make_unique<VPlan>(); + + // Build hierarchical CFG + VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI); + HCFGBuilder.buildHierarchicalCFG(*Plan.get()); + + return Plan; +} + +Value* LoopVectorizationPlanner::VPCallbackILV:: +getOrCreateVectorValues(Value *V, unsigned Part) { + return ILV.getOrCreateVectorValue(V, Part); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; @@ -8483,28 +7235,66 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } +// Process the loop in the VPlan-native vectorization path. This path builds +// VPlan upfront in the vectorization pipeline, which allows to apply +// VPlan-to-VPlan transformations from the very beginning without modifying the +// input LLVM IR. 
+static bool processLoopInVPlanNativePath( + Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, + LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, + TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) { + + assert(EnableVPlanNativePath && "VPlan-native path is disabled."); + Function *F = L->getHeader()->getParent(); + InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); + LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + &Hints, IAI); + // Use the planner for outer loop vectorization. + // TODO: CM is not used at this point inside the planner. Turn CM into an + // optional argument if we don't need it in the future. + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); + + // Get user vectorization factor. + unsigned UserVF = Hints.getWidth(); + + // Check the function attributes to find out if this function should be + // optimized for size. + bool OptForSize = + Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + + // Plan how to best vectorize, return the best VF and its cost. + LVP.planInVPlanNativePath(OptForSize, UserVF); + + // Returning false. We are currently not generating vector code in the VPlan + // native path. + return false; +} + bool LoopVectorizePass::processLoop(Loop *L) { - assert(L->empty() && "Only process inner loops."); + assert((EnableVPlanNativePath || L->empty()) && + "VPlan-native path is not enabled. Only process inner loops."); #ifndef NDEBUG const std::string DebugLocStr = getDebugLocString(L); #endif /* NDEBUG */ - DEBUG(dbgs() << "\nLV: Checking a loop in \"" - << L->getHeader()->getParent()->getName() << "\" from " - << DebugLocStr << "\n"); + LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" + << L->getHeader()->getParent()->getName() << "\" from " + << DebugLocStr << "\n"); LoopVectorizeHints Hints(L, DisableUnrolling, *ORE); - DEBUG(dbgs() << "LV: Loop hints:" - << " force=" - << (Hints.getForce() == LoopVectorizeHints::FK_Disabled - ? "disabled" - : (Hints.getForce() == LoopVectorizeHints::FK_Enabled - ? "enabled" - : "?")) - << " width=" << Hints.getWidth() - << " unroll=" << Hints.getInterleave() << "\n"); + LLVM_DEBUG( + dbgs() << "LV: Loop hints:" + << " force=" + << (Hints.getForce() == LoopVectorizeHints::FK_Disabled + ? "disabled" + : (Hints.getForce() == LoopVectorizeHints::FK_Enabled + ? "enabled" + : "?")) + << " width=" << Hints.getWidth() + << " unroll=" << Hints.getInterleave() << "\n"); // Function containing loop Function *F = L->getHeader()->getParent(); @@ -8518,7 +7308,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // benefit from vectorization, respectively. if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { - DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); + LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } @@ -8526,10 +7316,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. 
LoopVectorizationRequirements Requirements(*ORE); - LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, - &Requirements, &Hints); - if (!LVL.canVectorize()) { - DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE, + &Requirements, &Hints, DB, AC); + if (!LVL.canVectorize(EnableVPlanNativePath)) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints, ORE); return false; } @@ -8539,11 +7329,33 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + // Entrance to the VPlan-native vectorization path. Outer loops are processed + // here. They may require CFG and instruction level transformations before + // even evaluating whether vectorization is profitable. Since we cannot modify + // the incoming IR, we need to build VPlan upfront in the vectorization + // pipeline. + if (!L->empty()) + return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, + ORE, Hints); + + assert(L->empty() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. - unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L); - bool HasExpectedTC = (ExpectedTC > 0); - + // Prefer constant trip counts over profile data, over upper bound estimate. + unsigned ExpectedTC = 0; + bool HasExpectedTC = false; + if (const SCEVConstant *ConstExits = + dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { + const APInt &ExitsCount = ConstExits->getAPInt(); + // We are interested in small values for ExpectedTC. Skip over those that + // can't fit an unsigned. + if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { + ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; + HasExpectedTC = true; + } + } + // ExpectedTC may be large because it's bound by a variable. Check + // profiling information to validate we should vectorize. if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { auto EstimatedTC = getLoopEstimatedTripCount(L); if (EstimatedTC) { @@ -8551,15 +7363,19 @@ bool LoopVectorizePass::processLoop(Loop *L) { HasExpectedTC = true; } } + if (!HasExpectedTC) { + ExpectedTC = SE->getSmallConstantMaxTripCount(L); + HasExpectedTC = (ExpectedTC > 0); + } if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { - DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is worth vectorizing only if no scalar " - << "iteration overheads are incurred."); + LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) - DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "\n"); // Loops with a very small trip count are considered for vectorization // under OptForSize, thereby making sure the cost of their loop body is // dominant, free of runtime guards and scalar iteration overheads. @@ -8572,10 +7388,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // an integer loop and the vector instructions selected are purely integer // vector instructions? 
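A small sketch of the trip-count preference order introduced above: a constant backedge-taken count plus one is trusted first, then (not shown here) a profile-based estimate, then SCEV's small constant maximum; tiny counts are compared against the TinyTripCountVectorThreshold cutoff. The threshold value and the numbers below are illustrative.

#include <cstdio>
#include <limits>

int main() {
  const unsigned TinyTripCountThreshold = 16; // assumed example cutoff
  unsigned long long BackedgeTakenCount = 7;  // known constant exit count
  unsigned ExpectedTC = 0;
  bool HasExpectedTC = false;
  // Skip values too large to fit an unsigned; they are not "small" anyway.
  if (BackedgeTakenCount < std::numeric_limits<unsigned>::max()) {
    ExpectedTC = (unsigned)BackedgeTakenCount + 1; // trip count = exits + 1
    HasExpectedTC = true;
  }
  if (HasExpectedTC && ExpectedTC < TinyTripCountThreshold)
    std::printf("tiny trip count %u: vectorize only if no scalar overheads\n",
                ExpectedTC);
}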
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { - DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" - "attribute is used.\n"); - ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NoImplicitFloat", L) + LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" + "attribute is used.\n"); + ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), + "NoImplicitFloat", L) << "loop not vectorized due to NoImplicitFloat attribute"); emitMissedWarning(F, L, Hints, ORE); return false; @@ -8587,17 +7403,30 @@ bool LoopVectorizePass::processLoop(Loop *L) { // additional fp-math flags can help. if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { - DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); + LLVM_DEBUG( + dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); ORE->emit( - createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) + createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) << "loop not vectorized due to unsafe FP support."); emitMissedWarning(F, L, Hints, ORE); return false; } + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); + + // If an override option has been passed in for interleaved accesses, use it. + if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) + UseInterleaved = EnableInterleavedMemAccesses; + + // Analyze interleaved memory accesses. + if (UseInterleaved) { + IAI.analyzeInterleaving(); + } + // Use the cost model. LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints); + &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. @@ -8607,8 +7436,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - LoopVectorizationCostModel::VectorizationFactor VF = - LVP.plan(OptForSize, UserVF); + VectorizationFactor VF = LVP.plan(OptForSize, UserVF); // Select the interleave count. unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); @@ -8620,14 +7448,14 @@ bool LoopVectorizePass::processLoop(Loop *L) { std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; if (Requirements.doesNotMeet(F, L, Hints)) { - DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " - "requirements.\n"); + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); emitMissedWarning(F, L, Hints, ORE); return false; } if (VF.Width == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", "the cost-model indicates that vectorization is not beneficial"); @@ -8636,7 +7464,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (IC == 1 && UserIC <= 1) { // Tell the user interleaving is not beneficial. - DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); + LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); IntDiagMsg = std::make_pair( "InterleavingNotBeneficial", "the cost-model indicates that interleaving is not beneficial"); @@ -8648,8 +7476,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } } else if (IC > 1 && UserIC == 1) { // Tell the user interleaving is beneficial, but it explicitly disabled. 
- DEBUG(dbgs() - << "LV: Interleaving is beneficial but is explicitly disabled."); + LLVM_DEBUG( + dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); IntDiagMsg = std::make_pair( "InterleavingBeneficialButDisabled", "the cost-model indicates that interleaving is beneficial " @@ -8676,24 +7504,24 @@ bool LoopVectorizePass::processLoop(Loop *L) { }); return false; } else if (!VectorizeLoop && InterleaveLoop) { - DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); ORE->emit([&]() { return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, L->getStartLoc(), L->getHeader()) << VecDiagMsg.second; }); } else if (VectorizeLoop && !InterleaveLoop) { - DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " - << DebugLocStr << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + << ") in " << DebugLocStr << '\n'); ORE->emit([&]() { return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, L->getStartLoc(), L->getHeader()) << IntDiagMsg.second; }); } else if (VectorizeLoop && InterleaveLoop) { - DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " - << DebugLocStr << '\n'); - DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + << ") in " << DebugLocStr << '\n'); + LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } LVP.setBestPlan(VF.Width, IC); @@ -8740,7 +7568,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - DEBUG(verifyFunction(*L->getHeader()->getParent())); + LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; } @@ -8788,7 +7616,7 @@ bool LoopVectorizePass::runImpl( SmallVector<Loop *, 8> Worklist; for (Loop *L : *LI) - addAcyclicInnerLoop(*L, Worklist); + collectSupportedLoops(*L, LI, ORE, Worklist); LoopsAnalyzed += Worklist.size(); |